Drop range tombstone during non-bottommost compaction (#11459)

Summary:
Similar to point tombstones, we can drop a range tombstone during compaction when we know the keys it covers do not exist in any level beyond the compaction output level. This PR adds this optimization. Some existing tests in db_range_del_test.cc are fixed to work under this optimization.
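For intuition, here is a minimal standalone sketch of the drop condition. It is not RocksDB code: `FileRange`, `LevelFiles`, and `CanDropRangeTombstone` are hypothetical stand-ins for RocksDB's `FileMetaData`, `VersionStorageInfo::LevelFiles()`, and the new `Compaction::KeyRangeNotExistsBeyondOutputLevel()` plus the snapshot check in `CompactionOutputs::AddRangeDels()`:

```cpp
// Simplified sketch of when a range tombstone can be dropped during a
// non-bottommost compaction. All names here are hypothetical stand-ins
// for the real RocksDB machinery.
#include <cstdint>
#include <string>
#include <vector>

struct FileRange {
  std::string smallest;
  std::string largest;  // inclusive user-key bounds of one SST file
};

// levels[i] holds the (sorted, non-overlapping) files of level i.
using LevelFiles = std::vector<std::vector<FileRange>>;

bool CanDropRangeTombstone(const std::string& begin, const std::string& end,
                           uint64_t tombstone_seq, uint64_t earliest_snapshot,
                           int output_level, const LevelFiles& levels) {
  // Like a point tombstone, it must be invisible to every snapshot.
  if (tombstone_seq > earliest_snapshot) {
    return false;
  }
  // No file in any level beyond the output level may overlap [begin, end).
  for (size_t lvl = output_level + 1; lvl < levels.size(); lvl++) {
    for (const FileRange& f : levels[lvl]) {
      // [begin, end) overlaps [f.smallest, f.largest] iff
      // begin <= f.largest and end > f.smallest.
      if (begin <= f.largest && end > f.smallest) {
        return false;  // the tombstone may still cover live keys below
      }
    }
  }
  return true;  // safe to drop
}

int main() {
  // Mirrors the new unit test: L6 holds files [2, 3] and [7, 8].
  LevelFiles levels(7);
  levels[6] = {{"2", "3"}, {"7", "8"}};
  // DeleteRange [4, 5) overlaps nothing beyond L1 -> droppable.
  bool drop45 = CanDropRangeTombstone("4", "5", /*tombstone_seq=*/10,
                                      /*earliest_snapshot=*/100,
                                      /*output_level=*/1, levels);
  // DeleteRange [8, 9) overlaps the L6 file [7, 8] -> must be kept.
  bool drop89 = CanDropRangeTombstone("8", "9", 10, 100, 1, levels);
  return (drop45 && !drop89) ? 0 : 1;
}
```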

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11459

Test Plan:
* Add unit test `DBRangeDelTest, NonBottommostCompactionDropRangetombstone`.
* Ran a crash test that issues range deletions for a few hours: `python3 tools/db_crashtest.py blackbox --simple --write_buffer_size=1048576 --delrangepercent=10 --writepercent=31 --readpercent=40`

Reviewed By: ajkr

Differential Revision: D46007904

Pulled By: cbi42

fbshipit-source-id: 3f37205b6778b7d55ed106369ca41b0632a6d0fd
Changyu Bi (author), committed by Facebook GitHub Bot 1 year ago
parent 687a2a0d9a
commit 4aa52d89cf
8 changed files (lines changed):

  db/compaction/compaction.cc                                  | 43
  db/compaction/compaction.h                                   | 10
  db/compaction/compaction_outputs.cc                          | 29
  db/compaction/compaction_outputs.h                           |  9
  db/db_properties_test.cc                                     | 34
  db/db_range_del_test.cc                                      | 70
  db/db_test_util.h                                            | 34
  unreleased_history/behavior_changes/drop_range_tombstone.md |  1

db/compaction/compaction.cc
@@ -562,6 +562,49 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
   return false;
 }
 
+bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
+    const Slice& begin_key, const Slice& end_key,
+    std::vector<size_t>* level_ptrs) const {
+  assert(input_version_ != nullptr);
+  assert(level_ptrs != nullptr);
+  assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+  assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) <
+         0);
+  if (bottommost_level_) {
+    return true /* does not overlap */;
+  } else if (output_level_ != 0 &&
+             cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    const Comparator* user_cmp = cfd_->user_comparator();
+    for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+      const std::vector<FileMetaData*>& files =
+          input_vstorage_->LevelFiles(lvl);
+      for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+        auto* f = files[level_ptrs->at(lvl)];
+        // Advance until the first file with begin_key <= f->largest.user_key()
+        if (user_cmp->CompareWithoutTimestamp(begin_key,
+                                              f->largest.user_key()) > 0) {
+          continue;
+        }
+        // We know that the previous file prev_f, if exists, has
+        // prev_f->largest.user_key() < begin_key.
+        if (user_cmp->CompareWithoutTimestamp(end_key,
+                                              f->smallest.user_key()) <= 0) {
+          // not overlapping with this level
+          break;
+        } else {
+          // We have:
+          // - begin_key < end_key,
+          // - begin_key <= f->largest.user_key(), and
+          // - end_key > f->smallest.user_key()
+          return false /* overlap */;
+        }
+      }
+    }
+    return true /* does not overlap */;
+  }
+  return false /* overlaps */;
+};
+
 // Mark (or clear) each file that is being compacted
 void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
   for (size_t i = 0; i < num_input_levels(); i++) {

db/compaction/compaction.h
@@ -203,10 +203,18 @@ class Compaction {
   void AddInputDeletions(VersionEdit* edit);
 
   // Returns true if the available information we have guarantees that
-  // the input "user_key" does not exist in any level beyond "output_level()".
+  // the input "user_key" does not exist in any level beyond `output_level()`.
   bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
                                      std::vector<size_t>* level_ptrs) const;
 
+  // Returns true if the user key range [begin_key, end_key) does not exist
+  // in any level beyond `output_level()`.
+  // Used for checking range tombstones, so we assume begin_key < end_key.
+  // begin_key and end_key should include timestamp if enabled.
+  bool KeyRangeNotExistsBeyondOutputLevel(
+      const Slice& begin_key, const Slice& end_key,
+      std::vector<size_t>* level_ptrs) const;
+
   // Clear all files to indicate that they are not being compacted
   // Delete this compaction from the list of running compactions.
   //

db/compaction/compaction_outputs.cc
@@ -598,10 +598,12 @@ Status CompactionOutputs::AddRangeDels(
       // in any snapshot. trim_ts_ is passed to the constructor for
       // range_del_agg_, and range_del_agg_ internally drops tombstones above
       // trim_ts_.
-      if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
-          (ts_sz == 0 ||
-           (!full_history_ts_low.empty() &&
-            ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+      bool consider_drop =
+          tombstone.seq_ <= earliest_snapshot &&
+          (ts_sz == 0 ||
+           (!full_history_ts_low.empty() &&
+            ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0));
+      if (consider_drop && bottommost_level) {
         // TODO(andrewkr): tombstones that span multiple output files are
         // counted for each compaction output file, so lots of double
         // counting.
@@ -635,6 +637,20 @@ Status CompactionOutputs::AddRangeDels(
         icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
       break;
     }
+    if (lower_bound &&
+        icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
+      tombstone_start.DecodeFrom(*lower_bound);
+    }
+    if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
+      tombstone_end.DecodeFrom(*upper_bound);
+    }
+    if (consider_drop && compaction_->KeyRangeNotExistsBeyondOutputLevel(
+                             tombstone_start.user_key(),
+                             tombstone_end.user_key(), &level_ptrs_)) {
+      range_del_out_stats.num_range_del_drop_obsolete++;
+      range_del_out_stats.num_record_drop_obsolete++;
+      continue;
+    }
     // Here we show that *only* range tombstones that overlap with
     // [lower_bound, upper_bound] are added to the current file, and
     // sanity checking invariants that should hold:
@@ -688,13 +704,6 @@ Status CompactionOutputs::AddRangeDels(
     // Range tombstone is not supported by output validator yet.
     builder_->Add(kv.first.Encode(), kv.second);
 
-    if (lower_bound &&
-        icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
-      tombstone_start.DecodeFrom(*lower_bound);
-    }
-    if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
-      tombstone_end.DecodeFrom(*upper_bound);
-    }
     assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
     meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
                                   tombstone.seq_, icmp);
@@ -779,6 +788,8 @@ CompactionOutputs::CompactionOutputs(const Compaction* compaction,
     if (compaction->output_level() != 0) {
       FillFilesToCutForTtl();
     }
+
+    level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
   }
 }
 
 }  // namespace ROCKSDB_NAMESPACE

db/compaction/compaction_outputs.h
@@ -356,6 +356,15 @@ class CompactionOutputs {
   // The smallest key of the current output file, this is set when current
   // output file's smallest key is a range tombstone start key.
   InternalKey range_tombstone_lower_bound_;
+
+  // Used for calls to compaction->KeyRangeNotExistsBeyondOutputLevel() in
+  // CompactionOutputs::AddRangeDels().
+  // level_ptrs_[i] holds index of the file that was checked during the last
+  // call to compaction->KeyRangeNotExistsBeyondOutputLevel(). This allows
+  // future calls to the function to pick up where it left off, since each
+  // range tombstone added to output file within each subcompaction is in
+  // increasing key range.
+  std::vector<size_t> level_ptrs_;
 };
 
 // helper struct to concatenate the last level and penultimate level outputs

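The `level_ptrs_` member added above enables a resumable scan: because a subcompaction emits range tombstones in increasing key order, each level's file cursor only ever moves forward across successive overlap checks, so the whole sequence of checks stays linear in the number of files per level. A small standalone illustration of that idea follows; `FileRange` and `RangeNotInLevel` are hypothetical names, not RocksDB code:

```cpp
// Standalone illustration of the resumable per-level cursor; hypothetical
// names, not RocksDB code.
#include <cassert>
#include <string>
#include <vector>

struct FileRange {
  std::string smallest;
  std::string largest;  // inclusive; files are sorted and non-overlapping
};

// Returns true if [begin, end) overlaps no file in `files`. `*ptr` persists
// across calls and is only ever advanced, which is valid because successive
// queries arrive in increasing key order.
bool RangeNotInLevel(const std::vector<FileRange>& files, size_t* ptr,
                     const std::string& begin, const std::string& end) {
  assert(begin < end);
  for (; *ptr < files.size(); ++(*ptr)) {
    const FileRange& f = files[*ptr];
    if (begin > f.largest) {
      continue;  // file lies entirely before the query; skip it permanently
    }
    // First file with begin <= f.largest: overlap iff end > f.smallest.
    // Do not advance *ptr here: a later, larger query may overlap this file.
    return end <= f.smallest;
  }
  return true;  // ran past the last file in this level
}

int main() {
  std::vector<FileRange> level = {{"b", "c"}, {"g", "h"}};
  size_t ptr = 0;
  assert(RangeNotInLevel(level, &ptr, "d", "e"));    // falls in the gap
  assert(!RangeNotInLevel(level, &ptr, "f", "g1"));  // overlaps [g, h]
  return 0;
}
```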
db/db_properties_test.cc
@@ -188,40 +188,6 @@ TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) {
 }
 
 namespace {
-void ResetTableProperties(TableProperties* tp) {
-  tp->data_size = 0;
-  tp->index_size = 0;
-  tp->filter_size = 0;
-  tp->raw_key_size = 0;
-  tp->raw_value_size = 0;
-  tp->num_data_blocks = 0;
-  tp->num_entries = 0;
-  tp->num_deletions = 0;
-  tp->num_merge_operands = 0;
-  tp->num_range_deletions = 0;
-}
-
-void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
-  double dummy_double;
-  std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
-  std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
-  ResetTableProperties(tp);
-
-  sscanf(tp_string.c_str(),
-         "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
-         " # merge operands %" SCNu64 " # range deletions %" SCNu64
-         " raw key size %" SCNu64
-         " raw average key size %lf "
-         " raw value size %" SCNu64
-         " raw average value size %lf "
-         " data block size %" SCNu64 " index block size (user-key? %" SCNu64
-         ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
-         &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
-         &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
-         &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
-         &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
-         &tp->index_size, &tp->filter_size);
-}
-
 void VerifySimilar(uint64_t a, uint64_t b, double bias) {
   ASSERT_EQ(a == 0U, b == 0U);
   if (a == 0) {

db/db_range_del_test.cc
@@ -1690,6 +1690,8 @@ TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) {
   ASSERT_EQ("0,1,0,1", FilesPerLevel());
 
   env_->MockSleepForSeconds(20 * 60 * 60);
+  // Prevent range tombstone from being dropped during compaction.
+  const Snapshot* snapshot = db_->GetSnapshot();
   ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                              Key(11), Key(12)));
   ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
@@ -1703,6 +1705,7 @@ TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) {
   // File 1: (qualified for TTL): Key(5) - Key(10)
   // File 1: DeleteRange [11, 12)
   ASSERT_EQ("0,3,0,1", FilesPerLevel());
+  db_->ReleaseSnapshot(snapshot);
 }
 
 // Test SST partitioner cut after every single key
@@ -3405,6 +3408,73 @@ TEST_F(DBRangeDelTest, AddRangeDelsSingleUserKeyTombstoneOnlyFile) {
   db_->ReleaseSnapshot(snapshot1);
 }
 
+TEST_F(DBRangeDelTest, NonBottommostCompactionDropRangetombstone) {
+  // L0: file 1: [DeleteRange[4, 5)], file 2: [3, 6, DeleteRange[8, 9)]
+  // L6 file 1: [2, 3], file 2: [7, 8]
+  // When compacting the two L0 files to L1, the compaction is non-bottommost
+  // since the compaction key range overlaps with L6 file 1. The range
+  // tombstone [4, 5) should be dropped since it does not overlap with any
+  // file in lower levels. The range tombstone [8, 9) should not be dropped.
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.num_levels = 7;
+  DestroyAndReopen(opts);
+  Random rnd(301);
+  // L6 file 1
+  ASSERT_OK(Put(Key(2), rnd.RandomString(100)));
+  ASSERT_OK(Put(Key(3), rnd.RandomString(100)));
+  ASSERT_OK(Flush());
+  // L6 file 2
+  ASSERT_OK(Put(Key(7), rnd.RandomString(100)));
+  ASSERT_OK(Put(Key(8), rnd.RandomString(100)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+  ASSERT_EQ(NumTableFilesAtLevel(6), 2);
+  // L0 file 1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(5)));
+  ASSERT_OK(Flush());
+  // L0 file 2
+  ASSERT_OK(Put(Key(3), rnd.RandomString(100)));
+  ASSERT_OK(Put(Key(6), rnd.RandomString(100)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(8),
+                             Key(9)));
+  ASSERT_OK(Flush());
+  // nothing is dropped during flush
+  std::string property;
+  db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+  TableProperties output_tp;
+  ParseTablePropertiesString(property, &output_tp);
+  ASSERT_EQ(output_tp.num_range_deletions, 2);
+  // Compact two L0s into L1
+  std::string begin_str = Key(4);
+  std::string end_str = Key(6);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_slice, &end_slice));
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+  ParseTablePropertiesString(property, &output_tp);
+  ASSERT_EQ(output_tp.num_range_deletions, 1);
+  // Now create a snapshot protected range tombstone [4, 5), it should not
+  // be dropped.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(5)));
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  end_str = Key(5);
+  end_slice = end_str;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+  ParseTablePropertiesString(property, &output_tp);
+  ASSERT_EQ(output_tp.num_range_deletions, 2);
+  db_->ReleaseSnapshot(snapshot);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

db/db_test_util.h
@@ -1327,6 +1327,40 @@ class DBTestBase : public testing::Test {
   // supported
   void SetTimeElapseOnlySleepOnReopen(DBOptions* options);
 
+  void ResetTableProperties(TableProperties* tp) {
+    tp->data_size = 0;
+    tp->index_size = 0;
+    tp->filter_size = 0;
+    tp->raw_key_size = 0;
+    tp->raw_value_size = 0;
+    tp->num_data_blocks = 0;
+    tp->num_entries = 0;
+    tp->num_deletions = 0;
+    tp->num_merge_operands = 0;
+    tp->num_range_deletions = 0;
+  }
+
+  void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+    double dummy_double;
+    std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+    std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+    ResetTableProperties(tp);
+    sscanf(tp_string.c_str(),
+           "# data blocks %" SCNu64 " # entries %" SCNu64
+           " # deletions %" SCNu64 " # merge operands %" SCNu64
+           " # range deletions %" SCNu64 " raw key size %" SCNu64
+           " raw average key size %lf "
+           " raw value size %" SCNu64
+           " raw average value size %lf "
+           " data block size %" SCNu64 " index block size (user-key? %" SCNu64
+           ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
+           &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+           &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+           &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+           &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
+           &tp->index_size, &tp->filter_size);
+  }
+
  private:  // Prone to error on direct use
   void MaybeInstallTimeElapseOnlySleep(const DBOptions& options);

unreleased_history/behavior_changes/drop_range_tombstone.md
@@ -0,0 +1 @@
+RocksDB will try to drop range tombstones during non-bottommost compaction when it is safe to do so. (#11459)
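As the new unit test does with `kAggregatedTableProperties`, the effect of this change can be observed by counting the range deletions that survive in table properties. A minimal sketch, assuming an open `rocksdb::DB* db`; the `TotalRangeDeletions` helper is hypothetical, while `GetPropertiesOfAllTables()` and `num_range_deletions` are existing RocksDB APIs:

```cpp
// Sketch: sum num_range_deletions over all live SST files in the default
// column family. A tombstone dropped by a non-bottommost compaction no
// longer appears in this count after the compaction finishes, as exercised
// by DBRangeDelTest.NonBottommostCompactionDropRangetombstone above.
#include <cstdint>

#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

uint64_t TotalRangeDeletions(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection props;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&props);
  if (!s.ok()) {
    return 0;  // error handling elided for brevity
  }
  uint64_t total = 0;
  for (const auto& file_and_props : props) {
    total += file_and_props.second->num_range_deletions;
  }
  return total;
}
```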