From 4d0f9a995cb4b050e717b3030c6fcd900234ba22 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Tue, 17 Jan 2023 16:42:41 -0800
Subject: [PATCH] Consider TTL compaction file cutting earlier to prevent small output file (#11075)

Summary:
In `CompactionOutputs::ShouldStopBefore()`, the TTL-related states
`cur_files_to_cut_for_ttl_` and `next_files_to_cut_for_ttl_` are not updated
if the function returns early. This can cause unnecessary compaction output
file cuts and hence produce smaller output files, which may hurt write amp.
See the example in the unit test for how this "unnecessary file cutting" can
happen. This PR fixes the issue by moving the code that updates the TTL states
earlier in `CompactionOutputs::ShouldStopBefore()` so that the states are
updated for each key.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11075

Test Plan:
- Added new unit test.

Reviewed By: hx235

Differential Revision: D42398739

Pulled By: cbi42

fbshipit-source-id: 09fab66679c1a734abcfc31bcea33dd9aeb9dbc7
---
 HISTORY.md                          |   1 +
 db/compaction/compaction_outputs.cc | 100 ++++++++++++++++------------
 db/compaction/compaction_outputs.h  |   7 ++
 db/db_compaction_test.cc            |  72 ++++++++++++++++++++
 4 files changed, 137 insertions(+), 43 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index e458fdfc7..6d3dd94b2 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -15,6 +15,7 @@
 * Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
 * Fixed a heap use after free in async scan prefetching if dictionary compression is enabled, in which case sync read of the compression dictionary gets mixed with async prefetching
 * Fixed a data race bug of `CompactRange()` under `change_level=true` acts on overlapping range with an ongoing file ingestion for level compaction. This will either result in overlapping file ranges corruption at a certain level caught by `force_consistency_checks=true` or potentially two same keys both with seqno 0 in two different levels (i.e, new data ends up in lower/older level). The latter will be caught by assertion in debug build but go silently and result in read returning wrong result in release build. This fix is general so it also replaced previous fixes to a similar problem for `CompactFiles()` (#4665), general `CompactRange()` and auto compaction (commit 5c64fb6 and 87dfc1d).
+* Fixed a bug in compaction output cutting where small output files were produced because TTL file cutting states were not being updated (#11075).
 
 ### New Features
 * When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.
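[Editor's illustration, not part of the patch] The summary above describes an ordering problem: if boundary-tracking state is only advanced on the path that reaches the TTL check, an early return for another cut reason leaves the state stale and the very next key forces an extra, tiny output file. Below is a minimal, self-contained C++ sketch of that idea using made-up names (`OutputCutter`, integer keys, a plain boundary list) rather than RocksDB's actual types; it only shows the "update state for every key, before any early return" pattern the fix establishes.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Hypothetical, simplified stand-in for the output-cutting decision: keys are
// plain integers and "TTL boundaries" are a sorted list of keys after which a
// new output file should start.
class OutputCutter {
 public:
  explicit OutputCutter(std::vector<int64_t> ttl_boundaries)
      : ttl_boundaries_(std::move(ttl_boundaries)) {}

  // Returns true if the current output file should be cut before `key`.
  bool ShouldStopBefore(int64_t key, bool other_reason_to_cut) {
    // Advance the TTL boundary state for *every* key, before any early
    // return -- this mirrors the ordering the fix establishes.
    bool crossed_ttl_boundary = false;
    while (next_boundary_ < ttl_boundaries_.size() &&
           key > ttl_boundaries_[next_boundary_]) {
      ++next_boundary_;
      crossed_ttl_boundary = true;
    }
    if (crossed_ttl_boundary) {
      return true;
    }
    // Other cut reasons (e.g. output file size) are evaluated afterwards.
    return other_reason_to_cut;
  }

 private:
  std::vector<int64_t> ttl_boundaries_;
  std::size_t next_boundary_ = 0;
};

int main() {
  // One boundary at key 1: keys <= 1 and keys > 1 belong in different files.
  OutputCutter cutter({1});
  // Key 2 triggers a cut anyway (say, for file size). Because the boundary
  // state was still advanced, key 3 does not force a second, tiny file.
  std::cout << cutter.ShouldStopBefore(2, /*other_reason_to_cut=*/true)
            << "\n";  // prints 1 (cut before key 2)
  std::cout << cutter.ShouldStopBefore(3, /*other_reason_to_cut=*/false)
            << "\n";  // prints 0 (key 3 stays in the same file)
  return 0;
}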
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index eba3208ac..01333d774 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -76,6 +76,46 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
   return io_s;
 }
 
+bool CompactionOutputs::UpdateFilesToCutForTTLStates(
+    const Slice& internal_key) {
+  if (!files_to_cut_for_ttl_.empty()) {
+    const InternalKeyComparator* icmp =
+        &compaction_->column_family_data()->internal_comparator();
+    if (cur_files_to_cut_for_ttl_ != -1) {
+      // Previous key is inside the range of a file
+      if (icmp->Compare(internal_key,
+                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+                            ->largest.Encode()) > 0) {
+        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+        cur_files_to_cut_for_ttl_ = -1;
+        return true;
+      }
+    } else {
+      // Look for the key position
+      while (next_files_to_cut_for_ttl_ <
+             static_cast<int>(files_to_cut_for_ttl_.size())) {
+        if (icmp->Compare(internal_key,
+                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                              ->smallest.Encode()) >= 0) {
+          if (icmp->Compare(internal_key,
+                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                                ->largest.Encode()) <= 0) {
+            // Within the current file
+            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+            return true;
+          }
+          // Beyond the current file
+          next_files_to_cut_for_ttl_++;
+        } else {
+          // Still fall into the gap
+          break;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
     const Slice& internal_key) {
   size_t curr_key_boundary_switched_num = 0;
@@ -185,18 +225,30 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
 
 bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
   assert(c_iter.Valid());
-
-  // always update grandparent information like overlapped file number, size
-  // etc.
   const Slice& internal_key = c_iter.key();
   const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
-  size_t num_grandparent_boundaries_crossed =
-      UpdateGrandparentBoundaryInfo(internal_key);
+  const InternalKeyComparator* icmp =
+      &compaction_->column_family_data()->internal_comparator();
+  size_t num_grandparent_boundaries_crossed = 0;
+  bool should_stop_for_ttl = false;
+  // Always update grandparent information like overlapped file number, size
+  // etc., and TTL states.
+  // If compaction_->output_level() == 0, there is no need to update grandparent
+  // info, and that `grandparent` should be empty.
+  if (compaction_->output_level() > 0) {
+    num_grandparent_boundaries_crossed =
+        UpdateGrandparentBoundaryInfo(internal_key);
+    should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
+  }
 
   if (!HasBuilder()) {
     return false;
   }
 
+  if (should_stop_for_ttl) {
+    return true;
+  }
+
   // If there's user defined partitioner, check that first
   if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
                           last_key_for_partitioner_, c_iter.user_key(),
@@ -214,9 +266,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
     return true;
   }
 
-  const InternalKeyComparator* icmp =
-      &compaction_->column_family_data()->internal_comparator();
-
   // Check if it needs to split for RoundRobin
   // Invalid local_output_split_key indicates that we do not need to split
   if (local_output_split_key_ != nullptr && !is_split_) {
@@ -290,41 +339,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
     }
   }
 
-  // check ttl file boundaries if there's any
-  if (!files_to_cut_for_ttl_.empty()) {
-    if (cur_files_to_cut_for_ttl_ != -1) {
-      // Previous key is inside the range of a file
-      if (icmp->Compare(internal_key,
-                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
-                            ->largest.Encode()) > 0) {
-        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
-        cur_files_to_cut_for_ttl_ = -1;
-        return true;
-      }
-    } else {
-      // Look for the key position
-      while (next_files_to_cut_for_ttl_ <
-             static_cast<int>(files_to_cut_for_ttl_.size())) {
-        if (icmp->Compare(internal_key,
-                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
-                              ->smallest.Encode()) >= 0) {
-          if (icmp->Compare(internal_key,
-                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
-                                ->largest.Encode()) <= 0) {
-            // With in the current file
-            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
-            return true;
-          }
-          // Beyond the current file
-          next_files_to_cut_for_ttl_++;
-        } else {
-          // Still fall into the gap
-          break;
-        }
-      }
-    }
-  }
-
   return false;
 }
 
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index f40aa8215..52233917f 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -221,6 +221,13 @@ class CompactionOutputs {
     }
   }
 
+  // Updates states related to file cutting for TTL.
+  // Returns a boolean value indicating whether the current
+  // compaction output file should be cut before `internal_key`.
+  //
+  // @param internal_key the current key to be added to output.
+  bool UpdateFilesToCutForTTLStates(const Slice& internal_key);
+
   // update tracked grandparents information like grandparent index, if it's
   // in the gap between 2 grandparent files, accumulated grandparent files size
   // etc.
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 2fffcae60..6591ac41f 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -4219,6 +4219,78 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
+TEST_F(DBCompactionTest, LevelTtlCompactionOutputCuttingIteractingWithOther) {
+  // This test is for a bug fix in CompactionOutputs::ShouldStopBefore() where
+  // TTL states were not being updated for keys that ShouldStopBefore() would
+  // return true for reasons other than TTL.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.max_open_files = -1;
+  options.compaction_pri = kMinOverlappingRatio;
+  env_->SetMockSleep();
+  options.env = env_;
+  options.target_file_size_base = 4 << 10;
+  options.disable_auto_compactions = true;
+  options.level_compaction_dynamic_file_size = false;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // This makes sure the manual compaction below
+  // is not a bottommost compaction as TTL is only
+  // for non-bottommost compactions.
+  ASSERT_OK(Put(Key(3), rnd.RandomString(1 << 10)));
+  ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+
+  // L2:
+  ASSERT_OK(Put(Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  // L1, overlaps in range with the file in L2 so
+  // that they compact together.
+  ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Put(Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  ASSERT_EQ("0,1,1,0,0,0,1", FilesPerLevel());
+  // 36 hours so that the file in L2 is eligible for TTL
+  env_->MockSleepForSeconds(36 * 60 * 60);
+
+  CompactRangeOptions compact_range_opts;
+
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd(),
+      1 /* input_level */, 2 /* output_level */, compact_range_opts,
+      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/));
+
+  // L2 should have 2 files:
+  // file 1: Key(0), Key(1)
+  //   ShouldStopBefore(Key(2)) returns true due to TTL or output file size.
+  // file 2: Key(2), Key(3)
+  //
+  // Before the fix in this PR, L2 would have 3 files:
+  // file 1: Key(0), Key(1)
+  //   CompactionOutputs::ShouldStopBefore(Key(2)) returns true due to output
+  //   file size.
+  // file 2: Key(2)
+  //   CompactionOutputs::ShouldStopBefore(Key(3)) returns true due to TTL
+  //   cutting, because TTL states were not updated for Key(2).
+  // file 3: Key(3)
+  ASSERT_EQ("0,0,2,0,0,0,1", FilesPerLevel());
+}
+
 TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
   env_->SetMockSleep();
   const int kValueSize = 100;