From 4aa52d89cf9cc94da6800396e06a21b2cd74a7b5 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Mon, 5 Jun 2023 10:26:40 -0700 Subject: [PATCH] Drop range tombstone during non-bottommost compaction (#11459) Summary: Similar to point tombstones, we can drop a range tombstone during compaction when we know its range does not exist in any higher level. This PR adds this optimization. Some existing test in db_range_del_test is fixed to work under this optimization. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11459 Test Plan: * Add unit test `DBRangeDelTest, NonBottommostCompactionDropRangetombstone`. * Ran crash test that issues range deletion for a few hours: `python3 tools/db_crashtest.py blackbox --simple --write_buffer_size=1048576 --delrangepercent=10 --writepercent=31 --readpercent=40` Reviewed By: ajkr Differential Revision: D46007904 Pulled By: cbi42 fbshipit-source-id: 3f37205b6778b7d55ed106369ca41b0632a6d0fd --- db/compaction/compaction.cc | 43 ++++++++++++ db/compaction/compaction.h | 10 ++- db/compaction/compaction_outputs.cc | 29 +++++--- db/compaction/compaction_outputs.h | 9 +++ db/db_properties_test.cc | 34 --------- db/db_range_del_test.cc | 70 +++++++++++++++++++ db/db_test_util.h | 34 +++++++++ .../behavior_changes/drop_range_tombstone.md | 1 + 8 files changed, 186 insertions(+), 44 deletions(-) create mode 100644 unreleased_history/behavior_changes/drop_range_tombstone.md diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index e3e084e89..55201ec42 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -562,6 +562,49 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( return false; } +bool Compaction::KeyRangeNotExistsBeyondOutputLevel( + const Slice& begin_key, const Slice& end_key, + std::vector* level_ptrs) const { + assert(input_version_ != nullptr); + assert(level_ptrs != nullptr); + assert(level_ptrs->size() == static_cast(number_levels_)); + assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) < + 0); + if (bottommost_level_) { + return true /* does not overlap */; + } else if (output_level_ != 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + const Comparator* user_cmp = cfd_->user_comparator(); + for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { + const std::vector& files = + input_vstorage_->LevelFiles(lvl); + for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) { + auto* f = files[level_ptrs->at(lvl)]; + // Advance until the first file with begin_key <= f->largest.user_key() + if (user_cmp->CompareWithoutTimestamp(begin_key, + f->largest.user_key()) > 0) { + continue; + } + // We know that the previous file prev_f, if exists, has + // prev_f->largest.user_key() < begin_key. + if (user_cmp->CompareWithoutTimestamp(end_key, + f->smallest.user_key()) <= 0) { + // not overlapping with this level + break; + } else { + // We have: + // - begin_key < end_key, + // - begin_key <= f->largest.user_key(), and + // - end_key > f->smallest.user_key() + return false /* overlap */; + } + } + } + return true /* does not overlap */; + } + return false /* overlaps */; +}; + // Mark (or clear) each file that is being compacted void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { for (size_t i = 0; i < num_input_levels(); i++) { diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 4fa1dce71..391af3a61 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -203,10 +203,18 @@ class Compaction { void AddInputDeletions(VersionEdit* edit); // Returns true if the available information we have guarantees that - // the input "user_key" does not exist in any level beyond "output_level()". + // the input "user_key" does not exist in any level beyond `output_level()`. bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, std::vector* level_ptrs) const; + // Returns true if the user key range [begin_key, end_key) does not exist + // in any level beyond `output_level()`. + // Used for checking range tombstones, so we assume begin_key < end_key. + // begin_key and end_key should include timestamp if enabled. + bool KeyRangeNotExistsBeyondOutputLevel( + const Slice& begin_key, const Slice& end_key, + std::vector* level_ptrs) const; + // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. // diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index f6e0dbd7d..81323aa3b 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -598,10 +598,12 @@ Status CompactionOutputs::AddRangeDels( // in any snapshot. trim_ts_ is passed to the constructor for // range_del_agg_, and range_del_agg_ internally drops tombstones above // trim_ts_. - if (bottommost_level && tombstone.seq_ <= earliest_snapshot && + bool consider_drop = + tombstone.seq_ <= earliest_snapshot && (ts_sz == 0 || (!full_history_ts_low.empty() && - ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) { + ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0)); + if (consider_drop && bottommost_level) { // TODO(andrewkr): tombstones that span multiple output files are // counted for each compaction output file, so lots of double // counting. @@ -635,6 +637,20 @@ Status CompactionOutputs::AddRangeDels( icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) { break; } + if (lower_bound && + icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) { + tombstone_start.DecodeFrom(*lower_bound); + } + if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) { + tombstone_end.DecodeFrom(*upper_bound); + } + if (consider_drop && compaction_->KeyRangeNotExistsBeyondOutputLevel( + tombstone_start.user_key(), + tombstone_end.user_key(), &level_ptrs_)) { + range_del_out_stats.num_range_del_drop_obsolete++; + range_del_out_stats.num_record_drop_obsolete++; + continue; + } // Here we show that *only* range tombstones that overlap with // [lower_bound, upper_bound] are added to the current file, and // sanity checking invariants that should hold: @@ -688,13 +704,6 @@ Status CompactionOutputs::AddRangeDels( // Range tombstone is not supported by output validator yet. builder_->Add(kv.first.Encode(), kv.second); - if (lower_bound && - icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) { - tombstone_start.DecodeFrom(*lower_bound); - } - if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) { - tombstone_end.DecodeFrom(*upper_bound); - } assert(icmp.Compare(tombstone_start, tombstone_end) <= 0); meta.UpdateBoundariesForRange(tombstone_start, tombstone_end, tombstone.seq_, icmp); @@ -779,6 +788,8 @@ CompactionOutputs::CompactionOutputs(const Compaction* compaction, if (compaction->output_level() != 0) { FillFilesToCutForTtl(); } + + level_ptrs_ = std::vector(compaction_->number_levels(), 0); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 1b65ee662..86f6da303 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -356,6 +356,15 @@ class CompactionOutputs { // The smallest key of the current output file, this is set when current // output file's smallest key is a range tombstone start key. InternalKey range_tombstone_lower_bound_; + + // Used for calls to compaction->KeyRangeNotExistsBeyondOutputLevel() in + // CompactionOutputs::AddRangeDels(). + // level_ptrs_[i] holds index of the file that was checked during the last + // call to compaction->KeyRangeNotExistsBeyondOutputLevel(). This allows + // future calls to the function to pick up where it left off, since each + // range tombstone added to output file within each subcompaction is in + // increasing key range. + std::vector level_ptrs_; }; // helper struct to concatenate the last level and penultimate level outputs diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 085ee064c..1fcb7c3b4 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -188,40 +188,6 @@ TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) { } namespace { -void ResetTableProperties(TableProperties* tp) { - tp->data_size = 0; - tp->index_size = 0; - tp->filter_size = 0; - tp->raw_key_size = 0; - tp->raw_value_size = 0; - tp->num_data_blocks = 0; - tp->num_entries = 0; - tp->num_deletions = 0; - tp->num_merge_operands = 0; - tp->num_range_deletions = 0; -} - -void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { - double dummy_double; - std::replace(tp_string.begin(), tp_string.end(), ';', ' '); - std::replace(tp_string.begin(), tp_string.end(), '=', ' '); - ResetTableProperties(tp); - sscanf(tp_string.c_str(), - "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64 - " # merge operands %" SCNu64 " # range deletions %" SCNu64 - " raw key size %" SCNu64 - " raw average key size %lf " - " raw value size %" SCNu64 - " raw average value size %lf " - " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, - &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, - &tp->index_size, &tp->filter_size); -} - void VerifySimilar(uint64_t a, uint64_t b, double bias) { ASSERT_EQ(a == 0U, b == 0U); if (a == 0) { diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 08bd3af04..ac283157c 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -1690,6 +1690,8 @@ TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) { ASSERT_EQ("0,1,0,1", FilesPerLevel()); env_->MockSleepForSeconds(20 * 60 * 60); + // Prevent range tombstone from being dropped during compaction. + const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(11), Key(12))); ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10))); @@ -1703,6 +1705,7 @@ TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) { // File 1: (qualified for TTL): Key(5) - Key(10) // File 1: DeleteRange [11, 12) ASSERT_EQ("0,3,0,1", FilesPerLevel()); + db_->ReleaseSnapshot(snapshot); } // Test SST partitioner cut after every single key @@ -3405,6 +3408,73 @@ TEST_F(DBRangeDelTest, AddRangeDelsSingleUserKeyTombstoneOnlyFile) { db_->ReleaseSnapshot(snapshot1); } +TEST_F(DBRangeDelTest, NonBottommostCompactionDropRangetombstone) { + // L0: file 1: [DeleteRange[4, 5)], file 2: [3, 6, DeleteRange[8, 9)] + // L6 file 1: [2, 3], file 2: [7, 8] + // When compacting the two L0 files to L1, the compaction is non-bottommost + // since the compaction key range overlaps with L6 file 1. The range tombstone + // [4, 5) should be dropped since it does not overlap with any file in lower + // levels. The range tombstone [8, 9) should not be dropped. + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.num_levels = 7; + DestroyAndReopen(opts); + + Random rnd(301); + // L6 file 1 + ASSERT_OK(Put(Key(2), rnd.RandomString(100))); + ASSERT_OK(Put(Key(3), rnd.RandomString(100))); + ASSERT_OK(Flush()); + // L6 file 2 + ASSERT_OK(Put(Key(7), rnd.RandomString(100))); + ASSERT_OK(Put(Key(8), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + ASSERT_EQ(NumTableFilesAtLevel(6), 2); + // L0 file 1 + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), + Key(5))); + ASSERT_OK(Flush()); + // L0 file 2 + ASSERT_OK(Put(Key(3), rnd.RandomString(100))); + ASSERT_OK(Put(Key(6), rnd.RandomString(100))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(8), + Key(9))); + ASSERT_OK(Flush()); + // nothing is dropped during flush + std::string property; + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + TableProperties output_tp; + ParseTablePropertiesString(property, &output_tp); + ASSERT_EQ(output_tp.num_range_deletions, 2); + // Compact two L0s into L1 + std::string begin_str = Key(4); + std::string end_str = Key(6); + Slice begin_slice = begin_str; + Slice end_slice = end_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_slice, &end_slice)); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + ParseTablePropertiesString(property, &output_tp); + ASSERT_EQ(output_tp.num_range_deletions, 1); + + // Now create a snapshot protected range tombstone [4, 5), it should not + // be dropped. + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), + Key(5))); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + end_str = Key(5); + end_slice = end_str; + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + ParseTablePropertiesString(property, &output_tp); + ASSERT_EQ(output_tp.num_range_deletions, 2); + db_->ReleaseSnapshot(snapshot); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_test_util.h b/db/db_test_util.h index 5b35c9907..28dd23160 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -1327,6 +1327,40 @@ class DBTestBase : public testing::Test { // supported void SetTimeElapseOnlySleepOnReopen(DBOptions* options); + void ResetTableProperties(TableProperties* tp) { + tp->data_size = 0; + tp->index_size = 0; + tp->filter_size = 0; + tp->raw_key_size = 0; + tp->raw_value_size = 0; + tp->num_data_blocks = 0; + tp->num_entries = 0; + tp->num_deletions = 0; + tp->num_merge_operands = 0; + tp->num_range_deletions = 0; + } + + void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { + double dummy_double; + std::replace(tp_string.begin(), tp_string.end(), ';', ' '); + std::replace(tp_string.begin(), tp_string.end(), '=', ' '); + ResetTableProperties(tp); + sscanf(tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 + " # deletions %" SCNu64 " # merge operands %" SCNu64 + " # range deletions %" SCNu64 " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " index block size (user-key? %" SCNu64 + ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, + &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, + &tp->index_size, &tp->filter_size); + } + private: // Prone to error on direct use void MaybeInstallTimeElapseOnlySleep(const DBOptions& options); diff --git a/unreleased_history/behavior_changes/drop_range_tombstone.md b/unreleased_history/behavior_changes/drop_range_tombstone.md new file mode 100644 index 000000000..2577802c9 --- /dev/null +++ b/unreleased_history/behavior_changes/drop_range_tombstone.md @@ -0,0 +1 @@ +RocksDB will try to drop range tombstones during non-bottommost compaction when it is safe to do so. (#11459) \ No newline at end of file