Refactor AddRangeDels() + consider range tombstone during compaction file cutting (#11113)

Summary:
A second attempt after https://github.com/facebook/rocksdb/issues/10802, with bug fixes and refactoring. This PR updates the compaction logic to take range tombstones into account when determining whether to cut the current compaction output file (https://github.com/facebook/rocksdb/issues/4811). Before this change, only point keys were considered, and range tombstones could cause large compactions. For example, if the current compaction output consists of a range tombstone [a, b) and two point keys y and z, they would all be added to the same file, which may overlap with too many files in the next level and cause a large compaction in the future. This PR also includes ajkr's effort to simplify the logic for adding range tombstones to compaction output files in `AddRangeDels()` (https://github.com/facebook/rocksdb/pull/11078#issuecomment-1386078861).

The main change is for `CompactionIterator` to emit range tombstone start keys to be processed by `CompactionOutputs`. A new class, `CompactionMergingIterator`, is introduced to replace `MergingIterator` under `CompactionIterator` and enable emitting range tombstone start keys. Further improvements after this PR include cutting compaction output at some grandparent boundary key (instead of the next output key) when cutting within a range tombstone, to reduce overlap with grandparents.
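To make the motivating example concrete, here is a toy model (plain C++, not RocksDB code; all names are hypothetical) of the cutting decision: once range tombstone start keys flow through it, the output can be cut between the tombstone and later point keys instead of packing everything into one file.

#include <iostream>
#include <string>
#include <vector>

// Toy model only. A "sentinel" entry marks a range tombstone start key;
// before this PR such entries never reached the cutting decision.
struct CompactionKey {
  std::string user_key;
  bool is_range_del_start;
};

int main() {
  // One compaction's output stream: tombstone [b, y) plus point keys y, z.
  std::vector<CompactionKey> stream = {
      {"b", true}, {"y", false}, {"z", false}};
  // Hypothetical grandparent boundary; cutting at it limits the size of
  // future compactions involving these output files.
  std::vector<std::string> cut_points = {"m"};

  std::vector<std::vector<std::string>> files(1);
  size_t cut = 0;
  for (const auto& k : stream) {
    // With this PR, tombstone start keys also reach this decision point,
    // so a cut can happen between [b, y) and the point keys y, z.
    while (cut < cut_points.size() && k.user_key >= cut_points[cut]) {
      files.emplace_back();
      ++cut;
    }
    files.back().push_back(k.user_key + (k.is_range_del_start ? "(del)" : ""));
  }
  for (size_t i = 0; i < files.size(); ++i) {
    std::cout << "file " << i << ":";
    for (const auto& s : files[i]) std::cout << " " << s;
    std::cout << "\n";  // prints: file 0: b(del) / file 1: y z
  }
  return 0;
}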

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11113

Test Plan:
* Added unit tests in db_range_del_test
* crash test with a small key range: `python3 tools/db_crashtest.py blackbox --simple --max_key=100 --interval=600 --write_buffer_size=262144 --target_file_size_base=256 --max_bytes_for_level_base=262144 --block_size=128 --value_size_mult=33 --subcompactions=10 --use_multiget=1 --delpercent=3 --delrangepercent=2 --verify_iterator_with_expected_state_one_in=2 --num_iterations=10`

Reviewed By: ajkr

Differential Revision: D42655709

Pulled By: cbi42

fbshipit-source-id: 8367e36ef5640e8f21c14a3855d4a8d6e360a34c
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent 9fa9becf53
commit 229297d1b8
CMakeLists.txt                        |   1
HISTORY.md                            |   2
TARGETS                               |   2
db/blob/blob_counting_iterator.h      |   4
db/compaction/clipping_iterator.h     |   5
db/compaction/compaction_iterator.cc  |  31
db/compaction/compaction_iterator.h   |  18
db/compaction/compaction_job.cc       |  30
db/compaction/compaction_job.h        |   4
db/compaction/compaction_outputs.cc   | 547
db/compaction/compaction_outputs.h    |  15
db/compaction/subcompaction_state.h   |   5
db/db_range_del_test.cc               | 393
db/dbformat.h                         |   4
db/history_trimming_iterator.h        |   4
db/merge_helper.cc                    |   4
db/range_del_aggregator.cc            |  41
db/range_del_aggregator.h             |  15
db/range_del_aggregator_test.cc       | 184
db/range_tombstone_fragmenter.h       |   3
db/version_set.cc                     |  26
src.mk                                |   1
table/compaction_merging_iterator.cc  | 370
table/compaction_merging_iterator.h   |  44
table/merging_iterator.cc             | 234
table/merging_iterator.h              |   1

26 files changed

@ -838,6 +838,7 @@ set(SOURCES
table/get_context.cc
table/iterator.cc
table/merging_iterator.cc
table/compaction_merging_iterator.cc
table/meta_blocks.cc
table/persistent_cache_helper.cc
table/plain/plain_table_bloom.cc

@ -1,5 +1,7 @@
# Rocksdb Change Log
## Unreleased
### Behavior changes
* Compaction output file cutting logic now considers range tombstone start keys. For example, the SST partitioner may now receive a PartitionerRequest for a range tombstone start key.
## 8.0.0 (02/19/2023)
### Behavior changes

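For users, the behavior change above means a custom SstPartitioner may now be consulted at a range tombstone start key's user key, not only at point keys. A minimal sketch (hypothetical SizeThresholdPartitioner; uses only the public SstPartitioner API, which the new test in db_range_del_test.cc below also exercises):

#include <cstdint>
#include "rocksdb/sst_partitioner.h"

// Hypothetical partitioner: cut whenever the current output file exceeds a
// size threshold. With this change, ShouldPartition() may also be invoked
// for a range tombstone start key's user key.
class SizeThresholdPartitioner : public rocksdb::SstPartitioner {
 public:
  explicit SizeThresholdPartitioner(uint64_t threshold)
      : threshold_(threshold) {}
  const char* Name() const override { return "SizeThresholdPartitioner"; }
  rocksdb::PartitionerResult ShouldPartition(
      const rocksdb::PartitionerRequest& request) override {
    // request.current_user_key may now belong to a range tombstone start.
    return request.current_output_file_size > threshold_
               ? rocksdb::kRequired
               : rocksdb::kNotRequired;
  }
  bool CanDoTrivialMove(const rocksdb::Slice& /*smallest_user_key*/,
                        const rocksdb::Slice& /*largest_user_key*/) override {
    return false;
  }

 private:
  uint64_t threshold_;
};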
@ -200,6 +200,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"table/block_based/reader_common.cc",
"table/block_based/uncompression_dict_reader.cc",
"table/block_fetcher.cc",
"table/compaction_merging_iterator.cc",
"table/cuckoo/cuckoo_table_builder.cc",
"table/cuckoo/cuckoo_table_factory.cc",
"table/cuckoo/cuckoo_table_reader.cc",
@ -543,6 +544,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"table/block_based/reader_common.cc",
"table/block_based/uncompression_dict_reader.cc",
"table/block_fetcher.cc",
"table/compaction_merging_iterator.cc",
"table/cuckoo/cuckoo_table_builder.cc",
"table/cuckoo/cuckoo_table_factory.cc",
"table/cuckoo/cuckoo_table_reader.cc",

@ -123,6 +123,10 @@ class BlobCountingIterator : public InternalIterator {
return iter_->GetProperty(prop_name, prop);
}
bool IsDeleteRangeSentinelKey() const override {
return iter_->IsDeleteRangeSentinelKey();
}
private:
void UpdateAndCountBlobIfNeeded() {
assert(!iter_->Valid() || iter_->status().ok());

@ -188,6 +188,11 @@ class ClippingIterator : public InternalIterator {
return iter_->GetProperty(prop_name, prop);
}
bool IsDeleteRangeSentinelKey() const override {
assert(valid_);
return iter_->IsDeleteRangeSentinelKey();
}
private:
void UpdateValid() {
assert(!iter_->Valid() || iter_->status().ok());

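The two hunks above follow one pattern: each wrapper iterator forwards the new IsDeleteRangeSentinelKey() query to its inner iterator so the flag survives iterator stacking. A generic sketch of the pattern (illustrative only; IterBase and WrapperIterator are not RocksDB classes):

class IterBase {
 public:
  virtual ~IterBase() = default;
  virtual bool Valid() const = 0;
  // Default: an ordinary key. Range tombstone sources override this.
  virtual bool IsDeleteRangeSentinelKey() const { return false; }
};

class WrapperIterator : public IterBase {
 public:
  explicit WrapperIterator(IterBase* inner) : inner_(inner) {}
  bool Valid() const override { return inner_->Valid(); }
  bool IsDeleteRangeSentinelKey() const override {
    // Forward the query; a wrapper that omits this override would fall
    // back to the default `false` and hide sentinel keys from consumers.
    return inner_->IsDeleteRangeSentinelKey();
  }

 private:
  IterBase* inner_;
};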
@ -464,6 +464,7 @@ void CompactionIterator::NextFromInput() {
value_ = input_.value();
blob_value_.Reset();
iter_stats_.num_input_records++;
is_range_del_ = input_.IsDeleteRangeSentinelKey();
Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
if (!pik_status.ok()) {
@ -483,7 +484,10 @@ void CompactionIterator::NextFromInput() {
break;
}
TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
if (is_range_del_) {
validity_info_.SetValid(kRangeDeletion);
break;
}
// Update input statistics
if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
ikey_.type == kTypeDeletionWithTimestamp) {
@ -705,6 +709,14 @@ void CompactionIterator::NextFromInput() {
ParsedInternalKey next_ikey;
AdvanceInputIter();
while (input_.Valid() && input_.IsDeleteRangeSentinelKey() &&
ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
.ok() &&
cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
// skip range tombstone start keys with the same user key
// since they are not "real" point keys.
AdvanceInputIter();
}
// Check whether the next key exists, is not corrupt, and is the same key
// as the single delete.
@ -712,6 +724,7 @@ void CompactionIterator::NextFromInput() {
ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
.ok() &&
cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
assert(!input_.IsDeleteRangeSentinelKey());
#ifndef NDEBUG
const Compaction* c =
compaction_ ? compaction_->real_compaction() : nullptr;
@ -936,12 +949,14 @@ void CompactionIterator::NextFromInput() {
// Note that a deletion marker of type kTypeDeletionWithTimestamp will be
// considered to have a different user key unless the timestamp is older
// than *full_history_ts_low_.
//
// Range tombstone start keys are skipped as they are not "real" keys.
while (!IsPausingManualCompaction() && !IsShuttingDown() &&
input_.Valid() &&
(ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
.ok()) &&
cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
(prev_snapshot == 0 ||
(prev_snapshot == 0 || input_.IsDeleteRangeSentinelKey() ||
DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
AdvanceInputIter();
}
@ -1235,10 +1250,12 @@ void CompactionIterator::DecideOutputLevel() {
void CompactionIterator::PrepareOutput() {
if (Valid()) {
if (ikey_.type == kTypeValue) {
ExtractLargeValueIfNeeded();
} else if (ikey_.type == kTypeBlobIndex) {
GarbageCollectBlobIfNeeded();
if (LIKELY(!is_range_del_)) {
if (ikey_.type == kTypeValue) {
ExtractLargeValueIfNeeded();
} else if (ikey_.type == kTypeBlobIndex) {
GarbageCollectBlobIfNeeded();
}
}
if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
@ -1261,7 +1278,7 @@ void CompactionIterator::PrepareOutput() {
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
ikey_.type != kTypeMerge && current_key_committed_ &&
!output_to_penultimate_level_ &&
ikey_.sequence < preserve_time_min_seqno_) {
ikey_.sequence < preserve_time_min_seqno_ && !is_range_del_) {
if (ikey_.type == kTypeDeletion ||
(ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
ROCKS_LOG_FATAL(

@ -63,6 +63,10 @@ class SequenceIterWrapper : public InternalIterator {
void SeekToLast() override { assert(false); }
uint64_t num_itered() const { return num_itered_; }
bool IsDeleteRangeSentinelKey() const override {
assert(Valid());
return inner_iter_->IsDeleteRangeSentinelKey();
}
private:
InternalKeyComparator icmp_;
@ -242,7 +246,12 @@ class CompactionIterator {
const Status& status() const { return status_; }
const ParsedInternalKey& ikey() const { return ikey_; }
inline bool Valid() const { return validity_info_.IsValid(); }
const Slice& user_key() const { return current_user_key_; }
const Slice& user_key() const {
if (UNLIKELY(is_range_del_)) {
return ikey_.user_key;
}
return current_user_key_;
}
const CompactionIterationStats& iter_stats() const { return iter_stats_; }
uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
// If the current key should be placed on penultimate level, only valid if
@ -252,6 +261,8 @@ class CompactionIterator {
}
Status InputStatus() const { return input_.status(); }
bool IsDeleteRangeSentinelKey() const { return is_range_del_; }
private:
// Processes the input stream to find the next output
void NextFromInput();
@ -385,6 +396,7 @@ class CompactionIterator {
kKeepSD = 8,
kKeepDel = 9,
kNewUserKey = 10,
kRangeDeletion = 11,
};
struct ValidityInfo {
@ -493,6 +505,10 @@ class CompactionIterator {
// This is a best-effort facility, so memory_order_relaxed is sufficient.
return manual_compaction_canceled_.load(std::memory_order_relaxed);
}
// Stores whether the current compaction iterator output
// is a range tombstone start key.
bool is_range_del_{false};
};
inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,

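With the accessors above, the shape of a consumer distinguishing the two kinds of CompactionIterator output looks roughly like this (sketch only; MiniCompactionIter is a hypothetical stand-in, and the real consumption happens in CompactionOutputs::AddToOutput shown later in this diff):

// Hypothetical stand-in for the CompactionIterator surface used below.
struct MiniCompactionIter {
  bool Valid() const;
  void Next();
  bool IsDeleteRangeSentinelKey() const;
};

void ConsumeSketch(MiniCompactionIter& it) {
  while (it.Valid()) {
    if (it.IsDeleteRangeSentinelKey()) {
      // Range tombstone start key: carries no value and must not be added
      // as a point key. It only feeds the file-cutting decision (and the
      // SST partitioner); tombstone bodies are added by AddRangeDels().
    } else {
      // Ordinary point key: goes to the table builder as before.
    }
    it.Next();
  }
}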
@ -1118,6 +1118,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
IterKey end_ikey;
Slice start_slice;
Slice end_slice;
Slice start_user_key{};
Slice end_user_key{};
static constexpr char kMaxTs[] =
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
@ -1140,6 +1142,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
&ts_slice);
}
start_slice = start_ikey.GetInternalKey();
start_user_key = start_ikey.GetUserKey();
}
if (end.has_value()) {
end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
@ -1148,6 +1151,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
&ts_slice);
}
end_slice = end_ikey.GetInternalKey();
end_user_key = end_ikey.GetUserKey();
}
std::unique_ptr<InternalIterator> clip;
@ -1263,11 +1267,15 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
[this, sub_compact](CompactionOutputs& outputs) {
return this->OpenCompactionOutputFile(sub_compact, outputs);
};
const CompactionFileCloseFunc close_file_func =
[this, sub_compact](CompactionOutputs& outputs, const Status& status,
const Slice& next_table_min_key) {
return this->FinishCompactionOutputFile(status, sub_compact, outputs,
next_table_min_key);
[this, sub_compact, start_user_key, end_user_key](
CompactionOutputs& outputs, const Status& status,
const Slice& next_table_min_key) {
return this->FinishCompactionOutputFile(
status, sub_compact, outputs, next_table_min_key,
sub_compact->start.has_value() ? &start_user_key : nullptr,
sub_compact->end.has_value() ? &end_user_key : nullptr);
};
Status status;
@ -1278,7 +1286,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
// Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
// returns true.
assert(!end.has_value() || cfd->user_comparator()->Compare(
c_iter->user_key(), end.value()) < 0);
@ -1458,7 +1465,8 @@ void CompactionJob::RecordDroppedKeys(
Status CompactionJob::FinishCompactionOutputFile(
const Status& input_status, SubcompactionState* sub_compact,
CompactionOutputs& outputs, const Slice& next_table_min_key) {
CompactionOutputs& outputs, const Slice& next_table_min_key,
const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
assert(sub_compact != nullptr);
@ -1488,12 +1496,10 @@ Status CompactionJob::FinishCompactionOutputFile(
// output_to_penultimate_level compaction here, as it's only used to decide
// if range dels could be dropped.
if (outputs.HasRangeDel()) {
s = outputs.AddRangeDels(
sub_compact->start.has_value() ? &(sub_compact->start.value())
: nullptr,
sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
earliest_snapshot, next_table_min_key, full_history_ts_low_);
s = outputs.AddRangeDels(comp_start_user_key, comp_end_user_key,
range_del_out_stats, bottommost_level_,
cfd->internal_comparator(), earliest_snapshot,
next_table_min_key, full_history_ts_low_);
}
RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");

@ -256,7 +256,9 @@ class CompactionJob {
Status FinishCompactionOutputFile(const Status& input_status,
SubcompactionState* sub_compact,
CompactionOutputs& outputs,
const Slice& next_table_min_key);
const Slice& next_table_min_key,
const Slice* comp_start_user_key,
const Slice* comp_end_user_key);
Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
CompactionOutputs& outputs);

@ -226,6 +226,15 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
assert(c_iter.Valid());
const Slice& internal_key = c_iter.key();
#ifndef NDEBUG
bool should_stop = false;
std::pair<bool*, const Slice> p{&should_stop, internal_key};
TEST_SYNC_POINT_CALLBACK(
"CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
if (should_stop) {
return true;
}
#endif // NDEBUG
const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
@ -347,8 +356,14 @@ Status CompactionOutputs::AddToOutput(
const CompactionFileOpenFunc& open_file_func,
const CompactionFileCloseFunc& close_file_func) {
Status s;
bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
if (is_range_del && compaction_->bottommost_level()) {
// We don't consider range tombstones for the bottommost level since:
// 1. there is no grandparent and hence no overlap to consider
// 2. range tombstones may be dropped at the bottommost level.
return s;
}
const Slice& key = c_iter.key();
if (ShouldStopBefore(c_iter) && HasBuilder()) {
s = close_file_func(*this, c_iter.InputStatus(), key);
if (!s.ok()) {
@ -358,6 +373,13 @@ Status CompactionOutputs::AddToOutput(
grandparent_boundary_switched_num_ = 0;
grandparent_overlapped_bytes_ =
GetCurrentKeyGrandparentOverlappedBytes(key);
if (UNLIKELY(is_range_del)) {
// Lower bound for this new output file; this is needed since the lower
// bound does not come from the smallest point key in this case.
range_tombstone_lower_bound_.DecodeFrom(key);
} else {
range_tombstone_lower_bound_.Clear();
}
}
// Open output file if necessary
@ -368,6 +390,17 @@ Status CompactionOutputs::AddToOutput(
}
}
// c_iter may emit range deletion keys, so update `last_key_for_partitioner_`
// here before returning below when `is_range_del` is true
if (partitioner_) {
last_key_for_partitioner_.assign(c_iter.user_key().data_,
c_iter.user_key().size_);
}
if (UNLIKELY(is_range_del)) {
return s;
}
assert(builder_ != nullptr);
const Slice& value = c_iter.value();
s = current_output().validator.Add(key, value);
@ -391,28 +424,33 @@ Status CompactionOutputs::AddToOutput(
s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
ikey.type);
if (partitioner_) {
last_key_for_partitioner_.assign(c_iter.user_key().data_,
c_iter.user_key().size_);
}
return s;
}
namespace {
void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
const size_t ts_sz) {
if (ts_sz) {
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz <= strlen(kTsMax)) {
internal_key = InternalKey(user_key, kMaxSequenceNumber,
kTypeRangeDeletion, Slice(kTsMax, ts_sz));
} else {
internal_key =
InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
std::string(ts_sz, '\xff'));
}
} else {
internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
}
}
} // namespace
Status CompactionOutputs::AddRangeDels(
const Slice* comp_start_user_key, const Slice* comp_end_user_key,
CompactionIterationStats& range_del_out_stats, bool bottommost_level,
const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
const Slice& next_table_min_key, const std::string& full_history_ts_low) {
assert(HasRangeDel());
FileMetaData& meta = current_output().meta;
const Comparator* ucmp = icmp.user_comparator();
Slice lower_bound_guard, upper_bound_guard;
std::string smallest_user_key;
const Slice *lower_bound, *upper_bound;
bool lower_bound_from_sub_compact = false;
// The following example does not happen since
// CompactionOutputs::ShouldStopBefore() always returns false for the first
// point key. But we should consider removing this dependency. Suppose for the
@ -424,98 +462,134 @@ Status CompactionOutputs::AddRangeDels(
// Then meta.smallest will be set to comp_start_user_key@seqno
// and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
// which violates the assumption that meta.smallest should be <= meta.largest.
assert(HasRangeDel());
FileMetaData& meta = current_output().meta;
const Comparator* ucmp = icmp.user_comparator();
InternalKey lower_bound_buf, upper_bound_buf;
Slice lower_bound_guard, upper_bound_guard;
std::string smallest_user_key;
const Slice *lower_bound, *upper_bound;
// We first determine the internal key lower_bound and upper_bound for
// this output file. All and only range tombstones that overlap with
// [lower_bound, upper_bound] should be added to this file. File
// boundaries (meta.smallest/largest) should be updated accordingly when
// extended by range tombstones.
size_t output_size = outputs_.size();
if (output_size == 1) {
// For the first output table, include range tombstones before the min
// key but after the subcompaction boundary.
lower_bound = comp_start_user_key;
lower_bound_from_sub_compact = true;
} else if (meta.smallest.size() > 0) {
// This is the first file in the subcompaction.
//
// When outputting a range tombstone that spans a subcompaction boundary,
// the files on either side of that boundary need to include that
// boundary's user key. Otherwise, the spanning range tombstone would lose
// coverage.
//
// To achieve this while preventing files from overlapping in internal key
// (an LSM invariant violation), we allow the earlier file to include the
// boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
// later file can begin at the boundary user key at the newest key version
// it contains. At this point that version number is unknown since we have
// not processed the range tombstones yet, so permit any version. Same story
// applies to timestamp, and a non-nullptr `comp_start_user_key` should have
// `kMaxTs` here, which similarly permits any timestamp.
if (comp_start_user_key) {
lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
kTypeRangeDeletion);
lower_bound_guard = lower_bound_buf.Encode();
lower_bound = &lower_bound_guard;
} else {
lower_bound = nullptr;
}
} else {
// For subsequent output tables, only include range tombstones from min
// key onwards since the previous file was extended to contain range
// tombstones falling before min key.
smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
lower_bound_guard = Slice(smallest_user_key);
lower_bound = &lower_bound_guard;
} else {
lower_bound = nullptr;
}
if (!next_table_min_key.empty()) {
// This may be the last file in the subcompaction in some cases, so we
// need to compare the end key of subcompaction with the next file start
// key. When the end key is chosen by the subcompaction, we know that
// it must be the biggest key in output file. Therefore, it is safe to
// use the smaller key as the upper bound of the output file, to ensure
// that there is no overlapping between different output files.
upper_bound_guard = ExtractUserKey(next_table_min_key);
if (comp_end_user_key != nullptr &&
ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
0) {
upper_bound = comp_end_user_key;
if (range_tombstone_lower_bound_.size() > 0) {
assert(meta.smallest.size() == 0 ||
icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
lower_bound_guard = range_tombstone_lower_bound_.Encode();
} else {
assert(meta.smallest.size() > 0);
lower_bound_guard = meta.smallest.Encode();
}
lower_bound = &lower_bound_guard;
}
const size_t ts_sz = ucmp->timestamp_size();
if (next_table_min_key.empty()) {
// Last file of the subcompaction.
if (comp_end_user_key) {
upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
kTypeRangeDeletion);
upper_bound_guard = upper_bound_buf.Encode();
upper_bound = &upper_bound_guard;
} else {
upper_bound = nullptr;
}
} else {
// This is the last file in the subcompaction, so extend until the
// subcompaction ends.
upper_bound = comp_end_user_key;
}
bool has_overlapping_endpoints;
if (upper_bound != nullptr && meta.largest.size() > 0) {
has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
meta.largest.user_key(), *upper_bound) == 0;
} else {
has_overlapping_endpoints = false;
// There is another file coming whose coverage will begin at
// `next_table_min_key`. The current file needs to extend range tombstone
// coverage through its own keys (through `meta.largest`) and through user
// keys preceding `next_table_min_key`'s user key.
ParsedInternalKey next_table_min_key_parsed;
ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
false /* log_err_key */)
.PermitUncheckedError();
assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
assert(meta.largest.size() == 0 ||
icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
if (meta.largest.size() > 0 &&
ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
next_table_min_key_parsed.user_key)) {
// Caution: this assumes meta.largest.Encode() lives longer than
// upper_bound, which is only true if meta.largest is never updated.
// This just happens to be the case here since meta.largest serves
// as the upper_bound.
upper_bound_guard = meta.largest.Encode();
} else {
SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
ts_sz);
upper_bound_guard = upper_bound_buf.Encode();
}
upper_bound = &upper_bound_guard;
}
if (lower_bound && upper_bound &&
icmp.Compare(*lower_bound, *upper_bound) > 0) {
assert(meta.smallest.size() == 0 &&
ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
ExtractUserKey(*upper_bound)));
// This can only happen when lower_bound has the same user key as
// next_table_min_key and there is no point key in the current
// compaction output file.
return Status::OK();
}
// The end key of the subcompaction must be greater than or equal to the
// upper bound. If the end of the subcompaction or the upper bound is null,
// it means that this file is the last file in the compaction, so there
// will be no overlap between this file and others.
assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
has_overlapping_endpoints);
// Position the range tombstone output iterator. There may be tombstone
// fragments that are entirely out of range, so make sure that we do not
// include those.
if (lower_bound != nullptr) {
it->Seek(*lower_bound);
} else {
it->SeekToFirst();
}
ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
*comp_end_user_key) <= 0);
auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
Slice last_tombstone_start_user_key{};
for (; it->Valid(); it->Next()) {
bool reached_lower_bound = false;
for (it->SeekToFirst(); it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
if (upper_bound != nullptr) {
int cmp =
ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
// Tombstones starting after upper_bound only need to be included in
// the next table.
// If the current SST ends before upper_bound, i.e.,
// `has_overlapping_endpoints == false`, we can also skip over range
// tombstones that start exactly at upper_bound. Such range
// tombstones will be included in the next file and are not relevant
// to the point keys or endpoints of the current file.
// If the current SST ends at the same user key at upper_bound,
// i.e., `has_overlapping_endpoints == true`, AND the tombstone has
// the same start key as upper_bound, i.e., cmp == 0, then
// the tombstone is relevant only if the tombstone's sequence number
// is no larger than this file's largest key's sequence number. This
// is because the upper bound to truncate this file's range tombstone
// will be meta.largest in this case, and any tombstone that starts after
// it will not be relevant.
if (cmp < 0) {
break;
} else if (cmp == 0) {
if (!has_overlapping_endpoints ||
tombstone.seq_ < GetInternalKeySeqno(meta.largest.Encode())) {
break;
}
}
auto kv = tombstone.Serialize();
InternalKey tombstone_end = tombstone.SerializeEndKey();
// TODO: the underlying iterator should support clamping the bounds.
// tombstone_end.Encode() is of the form user_key@kMaxSeqno; if it is
// equal to lower_bound, there is no need to include such a range
// tombstone.
if (!reached_lower_bound && lower_bound &&
icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
continue;
}
assert(!lower_bound ||
icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
reached_lower_bound = true;
const size_t ts_sz = ucmp->timestamp_size();
// Garbage collection for range tombstones.
// If user-defined timestamp is enabled, range tombstones are dropped if
// they are at bottommost_level, below full_history_ts_low and not visible
@ -534,83 +608,93 @@ Status CompactionOutputs::AddRangeDels(
continue;
}
auto kv = tombstone.Serialize();
assert(lower_bound == nullptr ||
ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
kv.second) < 0);
InternalKey tombstone_start = kv.first;
if (lower_bound &&
ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
ExtractUserKey(*lower_bound)) < 0) {
// This just updates the non-timestamp portion of `tombstone_start`'s user
// key. Ideally there would be a simpler API usage
ParsedInternalKey tombstone_start_parsed;
ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
false /* log_err_key */)
.PermitUncheckedError();
// The timestamp should come from the same source as the sequence number,
// which is the tombstone in this case.
std::string ts =
tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
.ToString();
tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
tombstone_start.SetFrom(tombstone_start_parsed, ts);
}
if (upper_bound != nullptr &&
icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
break;
}
// Here we show that *only* range tombstones that overlap with
// [lower_bound, upper_bound] are added to the current file, and
// sanity checking invariants that should hold:
// - [tombstone_start, tombstone_end] overlaps with [lower_bound,
// upper_bound]
// - meta.smallest <= meta.largest
// Corresponding assertions are made; the proof is broken if any of them
// fails.
// TODO: show that *all* range tombstones that overlap with
// [lower_bound, upper_bound] are added.
// TODO: some invariant about boundaries are correctly updated.
//
// Note that `tombstone_start` is updated in the if condition above, we use
// tombstone_start to refer to its initial value, i.e.,
// it->Tombstone().first, and use tombstone_start* to refer to its value
// after the update.
//
// To show [lower_bound, upper_bound] overlaps with [tombstone_start,
// tombstone_end]:
// lower_bound <= upper_bound from the if condition right after all
// bounds are initialized. We assume each tombstone fragment has
// start_key.user_key < end_key.user_key, so
// tombstone_start < tombstone_end by
// FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
// non-empty. The flag `reached_lower_bound` and the if logic before it
// ensure lower_bound <= tombstone_end. tombstone_start is only updated
// if it has a smaller user_key than lower_bound user_key, so
// tombstone_start <= tombstone_start*. The above if condition implies
// tombstone_start* <= upper_bound. So we have
// tombstone_start <= upper_bound and lower_bound <= tombstone_end
// and the two ranges overlap.
//
// To show meta.smallest <= meta.largest:
// From the implementation of UpdateBoundariesForRange(), it suffices to
// prove that when it is first called in this function, its parameters
// satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
// and end = min(tombstone_end, upper_bound). From the above proof we have
// lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need
// to show that tombstone_start* <= min(tombstone_end, upper_bound).
// Note that tombstone_start*.user_key = max(tombstone_start.user_key,
// lower_bound.user_key). Assuming tombstone_end always has
// kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
// Since lower_bound <= tombstone_end and lower_bound.seqno <
// tombstone_end.seqno (in absolute number order, not internal key order),
// lower_bound.user_key < tombstone_end.user_key.
// Since lower_bound.user_key < tombstone_end.user_key and
// tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
// tombstone_end. Since tombstone_start* <= upper_bound from the above proof
// and tombstone_start* < tombstone_end, tombstone_start* <=
// min(tombstone_end, upper_bound), so the two ranges overlap.
// Range tombstones are not supported by the output validator yet.
builder_->Add(kv.first.Encode(), kv.second);
InternalKey tombstone_start = std::move(kv.first);
InternalKey smallest_candidate{tombstone_start};
if (lower_bound != nullptr &&
ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
*lower_bound) <= 0) {
// Pretend the smallest key has the same user key as lower_bound
// (the max key in the previous table or subcompaction) in order for
// files to appear key-space partitioned.
if (lower_bound_from_sub_compact) {
// When lower_bound is chosen by a subcompaction
// (lower_bound_from_sub_compact), we know that subcompactions over
// smaller keys cannot contain any keys at lower_bound. We also know
// that smaller subcompactions exist, because otherwise the
// subcompaction would be unbounded on the left. As a result, we know
// that no other files on the output level will contain actual keys at
// lower_bound (an output file may have a largest key of
// lower_bound@kMaxSequenceNumber, but this only indicates a large range
// tombstone was truncated). Therefore, it is safe to use the
// tombstone's sequence number, to ensure that keys at lower_bound at
// lower levels are covered by truncated tombstones.
if (ts_sz) {
assert(tombstone.ts_.size() == ts_sz);
smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
kTypeRangeDeletion, tombstone.ts_);
} else {
smallest_candidate =
InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
}
} else {
// If lower_bound was chosen by the smallest data key in the file,
// choose lowest seqnum so this file's smallest internal key comes
// after the previous file's largest. The fake seqnum is OK because
// the read path's file-picking code only considers user key.
smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
}
if (lower_bound &&
icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
tombstone_start.DecodeFrom(*lower_bound);
}
InternalKey tombstone_end = tombstone.SerializeEndKey();
InternalKey largest_candidate{tombstone_end};
if (upper_bound != nullptr &&
ucmp->CompareWithoutTimestamp(*upper_bound,
largest_candidate.user_key()) <= 0) {
// Pretend the largest key has the same user key as upper_bound (the
// min key in the following table or subcompaction) in order for files
// to appear key-space partitioned.
//
// Choose highest seqnum so this file's largest internal key comes
// before the next file's/subcompaction's smallest. The fake seqnum is
// OK because the read path's file-picking code only considers the
// user key portion.
//
// Note Seek() also creates InternalKey with (user_key,
// kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
// kTypeRangeDeletion (0xF), so the range tombstone comes before the
// Seek() key in InternalKey's ordering. So Seek() will look in the
// next file for the user key
if (ts_sz) {
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz <= strlen(kTsMax)) {
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
Slice(kTsMax, ts_sz));
} else {
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
std::string(ts_sz, '\xff'));
}
} else {
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
}
if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
tombstone_end.DecodeFrom(*upper_bound);
}
meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
tombstone.seq_, icmp);
if (!bottommost_level) {
bool start_user_key_changed =
@ -618,17 +702,8 @@ Status CompactionOutputs::AddRangeDels(
ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
it->start_key()) < 0;
last_tombstone_start_user_key = it->start_key();
// Range tombstones are truncated at file boundaries
if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
tombstone_start = meta.smallest;
}
if (icmp.Compare(tombstone_end, meta.largest) > 0) {
tombstone_end = meta.largest;
}
// this assertion validates invariant (2) in the comment below.
assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
if (start_user_key_changed) {
// if tombstone_start >= tombstone_end, then either no key range is
// If tombstone_start >= tombstone_end, then either no key range is
// covered, or that they have the same user key. If they have the same
// user key, then the internal key range should only be within this
// level, and no keys from older levels is covered.
@ -646,138 +721,6 @@ Status CompactionOutputs::AddRangeDels(
}
}
}
// TODO: show invariants that ensure all necessary range tombstones are
// added and that file boundaries ensure no coverage is lost.
// Each range tombstone with internal key range [tombstone_start,
// tombstone_end] is being added to the current compaction output file here.
// The range tombstone is going to be truncated at range [meta.smallest,
// meta.largest] during reading/scanning. We should maintain invariants
// (1) meta.smallest <= meta.largest and,
// (2) [tombstone_start, tombstone_end] and [meta.smallest, meta.largest]
// overlaps, as there is no point adding range tombstone with a range
// outside the file's range.
// Since `tombstone_end` is always some user_key@kMaxSeqno, it is okay to
// use either open or closed range. Using closed range here to make
// reasoning easier, and it is more consistent with ongoing work that
// tries to simplify this method.
//
// There are two cases:
// Case 1. Output file has no point key:
// First we show this case only happens when the entire compaction output
// is range tombstone only. This is true if CompactionIterator does not
// emit any point key. Suppose CompactionIterator emits some point key.
// Based on the assumption that CompactionOutputs::ShouldStopBefore()
// always returns false for the first point key, the first compaction
// output file always contains a point key. Each new compaction output
// file is created if there is a point key for which ShouldStopBefore()
// returns true, and the point key would be added to the new compaction
// output file. So each new compaction file always contains a point key.
// So Case 1 only happens when CompactionIterator does not emit any
// point key.
//
// To show (1) meta.smallest <= meta.largest:
// Since the compaction output is range tombstone only, `lower_bound` and
// `upper_bound` are either null or comp_start/end_user_key respectively.
// According to how UpdateBoundariesForRange() is implemented, it blindly
// updates meta.smallest and meta.largest to smallest_candidate and
// largest_candidate the first time it is called. Subsequently, it
// compares input parameter with meta.smallest and meta.largest and only
// updates them when input is smaller/larger. So we only need to show
// smallest_candidate <= largest_candidate the first time
// UpdateBoundariesForRange() is called. Here we show something stronger
// that smallest_candidate.user_key < largest_candidate.user_key always
// hold for Case 1.
// We assume comp_start_user_key < comp_end_user_key, if provided. We
// assume that tombstone_start < tombstone_end. This assumption is based
// on that each fragment in FragmentedTombstoneList has
// start_key < end_key (user_key) and that
// FragmentedTombstoneIterator::Tombstone() returns the pair
// (start_key@tombstone_seqno with op_type kTypeRangeDeletion, end_key).
// The logic in this loop sets smallest_candidate to
// max(tombstone_start.user_key, comp_start_user_key)@tombstone.seq_ with
// op_type kTypeRangeDeletion, largest_candidate to
// min(tombstone_end.user_key, comp_end_user_key)@kMaxSequenceNumber with
// op_type kTypeRangeDeletion. When a bound is null, there is no
// truncation on that end. To show that smallest_candidate.user_key <
// largest_candidate.user_key, it suffices to show
// tombstone_start.user_key < comp_end_user_key (if not null) AND
// comp_start_user_key (if not null) < tombstone_end.user_key.
// Since the file has no point key, `has_overlapping_endpoints` is false.
// In the first sanity check of this for-loop, we compare
// tombstone_start.user_key against upper_bound = comp_end_user_key,
// and only proceed if tombstone_start.user_key < comp_end_user_key.
// We assume FragmentedTombstoneIterator::Seek(k) lands
// on a tombstone with end_key > k. So the call it->Seek(*lower_bound)
// above implies compact_start_user_key < tombstone_end.user_key.
//
// To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
// meta.largest] overlaps (after the call to UpdateBoundariesForRange()):
// In the proof for (1) we have shown that
// smallest_candidate <= largest_candidate. Since tombstone_start <=
// smallest_candidate <= largest_candidate <= tombstone_end, for (2) to
// hold, it suffices to show that [smallest_candidate, largest_candidate]
// overlaps with [meta.smallest, meta.largest] too.
// Given meta.smallest <= meta.largest shown above, we need to show
// that it is impossible to have largest_candidate < meta.smallest or
// meta.largest < smallest_candidate. If the above
// meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate)
// updates meta.largest or meta.smallest, then the two ranges overlap.
// So we assume meta.UpdateBoundariesForRange(smallest_candidate,
// largest_candidate) did not update meta.smallest nor meta.largest, which
// means meta.smallest < smallest_candidate and largest_candidate <
// meta.largest.
//
// Case 2. Output file has >= 1 point key. This means meta.smallest and
// meta.largest are not empty when AddRangeDels() is called.
// To show (1) meta.smallest <= meta.largest:
// Assume meta.smallest <= meta.largest when AddRangeDels() is called,
// this follow from how UpdateBoundariesForRange() is implemented where it
// takes min or max to update meta.smallest or meta.largest.
//
// To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
// meta.largest] overlaps (after the call to UpdateBoundariesForRange()):
// When smallest_candidate <= largest_candidate, the proof in Case 1
// applies, so we only need to show (2) holds when smallest_candidate >
// largest_candidate. When both bounds are either null or from
// subcompaction boundary, the proof in Case 1 applies, so we only need to
// show (2) holds when at least one bound is from a point key (either
// meta.smallest for lower bound or next_table_min_key for upper bound).
//
// Suppose lower bound is meta.smallest.user_key. The call
// it->Seek(*lower_bound) implies tombstone_end.user_key >
// meta.smallest.user_key. We have smallest_candidate.user_key =
// max(tombstone_start.user_key, meta.smallest.user_key). For
// smallest_candidate to be > largest_candidate, we need
// largest_candidate.user_key = upper_bound = smallest_candidate.user_key,
// where tombstone_end is truncated to largest_candidate.
// Subcase 1:
// Suppose largest_candidate.user_key = comp_end_user_key (there is no
// next point key). Subcompaction ensures any point key from this
// subcompaction has a user_key < comp_end_user_key, so 1)
// meta.smallest.user_key < comp_end_user_key, 2)
// `has_overlapping_endpoints` is false, and the first if condition in
// this for-loop ensures tombstone_start.user_key < comp_end_user_key. So
// smallest_candidate.user_key < largest_candidate.user_key. This case
// cannot happen when smallest_candidate > largest_candidate.
// Subcase 2:
// Suppose largest_candidate.user_key = next_table_min_key.user_key.
// The first if condition in this for-loop together with
// smallest_candidate.user_key = next_table_min_key.user_key =
// upper_bound implies `has_overlapping_endpoints` is true (so meta
// largest.user_key = upper_bound) and
// tombstone.seq_ < meta.largest.seqno. So
// tombstone_start < meta.largest < tombstone_end.
//
// Suppose lower bound is comp_start_user_key and upper_bound is
// next_table_min_key. The call it->Seek(*lower_bound) implies we have
// tombstone_end.user_key > comp_start_user_key. So
// tombstone_end.user_key > smallest_candidate.user_key. For
// smallest_candidate to be > largest_candidate, we need
// tombstone_start.user_key = largest_candidate.user_key = upper_bound =
// next_table_min_key.user_key. This means `has_overlapping_endpoints` is
// true (so meta.largest.user_key = upper_bound) and tombstone.seq_ <
// meta.largest.seqno. So tombstone_start < meta.largest < tombstone_end.
}
return Status::OK();
}

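A concrete illustration (hypothetical keys and sequence numbers) of the boundary convention that the comments in AddRangeDels() above describe, for a range tombstone [a, e) spanning a cut at user key c:

// File 1: smallest = a@12, largest = c@kMaxSequenceNumber (kTypeRangeDeletion)
// File 2: smallest = c@7 (kTypeRangeDeletion), largest = e@5
//
// The two files share the user key c but do not overlap in internal key
// space: higher sequence numbers sort first, so c@kMaxSequenceNumber (the
// end of File 1) orders before c@7 (the start of File 2). At read time the
// tombstone is truncated to each file's [smallest, largest], so File 1
// covers [a, c) while coverage at and after c comes from File 2.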
@ -167,9 +167,15 @@ class CompactionOutputs {
current_output_file_size_ = 0;
}
// Add range-dels from the aggregator to the current output file
// Add range deletions from range_del_agg_ to the current output file.
// The input parameters, `range_tombstone_lower_bound_`, and the current
// output's metadata determine the bounds on the range deletions to add.
// Updates the output file's metadata boundaries if they are extended by
// range tombstones.
//
// @param comp_start_user_key and comp_end_user_key include timestamp if
// user-defined timestamp is enabled.
// user-defined timestamp is enabled. Their timestamp should be max timestamp.
// @param next_table_min_key internal key lower bound for the next compaction
// output.
// @param full_history_ts_low used for range tombstone garbage collection.
Status AddRangeDels(const Slice* comp_start_user_key,
const Slice* comp_end_user_key,
@ -314,6 +320,7 @@ class CompactionOutputs {
std::unique_ptr<SstPartitioner> partitioner_;
// A flag determines if this subcompaction has been split by the cursor
// for RoundRobin compaction
bool is_split_ = false;
// We also maintain the output split key for each subcompaction to avoid
@ -345,6 +352,10 @@ class CompactionOutputs {
// for the current output file, how many file boundaries has it crossed,
// basically number of files overlapped * 2
size_t grandparent_boundary_switched_num_ = 0;
// The smallest key of the current output file; set when the current
// output file's smallest key is a range tombstone start key.
InternalKey range_tombstone_lower_bound_;
};
// helper struct to concatenate the last level and penultimate level outputs

@ -84,6 +84,11 @@ class SubcompactionState {
// Assign range dels aggregator, for each range_del, it can only be assigned
// to one output level, for per_key_placement, it's going to be the
// penultimate level.
// TODO: This does not work for the per_key_placement + user-defined
// timestamp + DeleteRange() combo. If user-defined timestamp is enabled,
// it is possible for a range tombstone to belong to the bottommost level
// (seqno < earliest snapshot) without being dropped (garbage collection
// for user-defined timestamp).
void AssignRangeDelAggregator(
std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
if (compaction->SupportsPerKeyPlacement()) {

@ -1661,6 +1661,217 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
ASSERT_EQ(1, num_range_deletions);
}
TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) {
Options options = CurrentOptions();
options.compression = kNoCompression;
options.compaction_pri = kMinOverlappingRatio;
options.disable_auto_compactions = true;
options.ttl = 24 * 60 * 60; // 24 hours
options.target_file_size_base = 8 << 10;
env_->SetMockSleep();
options.env = env_;
DestroyAndReopen(options);
Random rnd(301);
// Fill some data so that future compactions are not bottommost-level
// compactions, and hence they will try to cut around files for TTL
for (int i = 5; i < 10; ++i) {
ASSERT_OK(Put(Key(i), rnd.RandomString(1 << 10)));
}
ASSERT_OK(Flush());
MoveFilesToLevel(3);
ASSERT_EQ("0,0,0,1", FilesPerLevel());
for (int i = 5; i < 10; ++i) {
ASSERT_OK(Put(Key(i), rnd.RandomString(1 << 10)));
}
ASSERT_OK(Flush());
MoveFilesToLevel(1);
ASSERT_EQ("0,1,0,1", FilesPerLevel());
env_->MockSleepForSeconds(20 * 60 * 60);
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(11), Key(12)));
ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
ASSERT_OK(Flush());
ASSERT_EQ("1,1,0,1", FilesPerLevel());
// The L0 file is new; the L1 and L3 files are old and qualified for TTL
env_->MockSleepForSeconds(10 * 60 * 60);
MoveFilesToLevel(1);
// L1 output should be cut into 3 files:
// File 0: Key(0)
// File 1 (qualified for TTL): Key(5) - Key(10)
// File 2: DeleteRange [11, 12)
ASSERT_EQ("0,3,0,1", FilesPerLevel());
}
// Test an SST partitioner that cuts after every single key
class SingleKeySstPartitioner : public SstPartitioner {
public:
const char* Name() const override { return "SingleKeySstPartitioner"; }
PartitionerResult ShouldPartition(
const PartitionerRequest& /*request*/) override {
return kRequired;
}
bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
const Slice& /*largest_user_key*/) override {
return false;
}
};
class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
public:
static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
const char* Name() const override { return kClassName(); }
std::unique_ptr<SstPartitioner> CreatePartitioner(
const SstPartitioner::Context& /* context */) const override {
return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
}
};
TEST_F(DBRangeDelTest, CompactionEmitRangeTombstoneToSSTPartitioner) {
Options options = CurrentOptions();
auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
options.sst_partitioner_factory = factory;
options.disable_auto_compactions = true;
DestroyAndReopen(options);
Random rnd(301);
// Range deletion keys are not processed when compacting to the bottommost
// level, so create a file at an older level to make the next compaction
// non-bottommost.
ASSERT_OK(db_->Put(WriteOptions(), Key(4), rnd.RandomString(10)));
ASSERT_OK(Flush());
MoveFilesToLevel(5);
ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(10)));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
Key(5)));
ASSERT_OK(Flush());
ASSERT_EQ(1, NumTableFilesAtLevel(0));
MoveFilesToLevel(1);
// The SST partitioner decides to cut when a range tombstone start key is
// passed to it. Note that the range tombstone [2, 5) itself spans multiple
// keys, but we are not able to partition within its range yet.
ASSERT_EQ(2, NumTableFilesAtLevel(1));
}
TEST_F(DBRangeDelTest, OversizeCompactionGapBetweenPointKeyAndTombstone) {
// L2 has 2 files
// L2_0: 0, 1, 2, 3, 4
// L2_1: 5, 6, 7
// L0 has 1 file
// L0: 0, [5, 6), 8
// max_compaction_bytes is less than the size of L2_0 and L2_1.
// When compacting L0 into L1, the output should be split into 3 files:
// the compaction output should be cut before key 5 and key 8 to
// limit future compaction size.
const int kNumPerFile = 4, kNumFiles = 2;
Options options = CurrentOptions();
options.disable_auto_compactions = true;
options.target_file_size_base = 9 * 1024;
options.max_compaction_bytes = 9 * 1024;
DestroyAndReopen(options);
Random rnd(301);
for (int i = 0; i < kNumFiles; ++i) {
std::vector<std::string> values;
for (int j = 0; j < kNumPerFile; j++) {
values.push_back(rnd.RandomString(3 << 10));
ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
}
}
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(1, NumTableFilesAtLevel(0));
MoveFilesToLevel(2);
ASSERT_EQ(2, NumTableFilesAtLevel(2));
ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(5),
Key(6)));
ASSERT_OK(Put(Key(8), rnd.RandomString(1 << 10)));
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(1, NumTableFilesAtLevel(0));
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
true /* disallow_trivial_move */));
ASSERT_EQ(3, NumTableFilesAtLevel(1));
}
TEST_F(DBRangeDelTest, OversizeCompactionGapBetweenTombstone) {
// L2 has two files
// L2_0: 0, 1, 2, 3, 4. L2_1: 5, 6, 7
// L0 has two range tombstones [0, 1), [7, 8).
// max_compaction_bytes is less than the size of L2_0.
// When compacting L0 into L1, the two range tombstones should be
// split into two files.
const int kNumPerFile = 4, kNumFiles = 2;
Options options = CurrentOptions();
options.disable_auto_compactions = true;
options.target_file_size_base = 9 * 1024;
options.max_compaction_bytes = 9 * 1024;
DestroyAndReopen(options);
Random rnd(301);
for (int i = 0; i < kNumFiles; ++i) {
std::vector<std::string> values;
// Write 12K (4 values, each 3K)
for (int j = 0; j < kNumPerFile; j++) {
values.push_back(rnd.RandomString(3 << 10));
ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
}
}
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(1, NumTableFilesAtLevel(0));
MoveFilesToLevel(2);
ASSERT_EQ(2, NumTableFilesAtLevel(2));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
Key(1)));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
Key(8)));
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(1, NumTableFilesAtLevel(0));
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
true /* disallow_trivial_move */));
// This is an L0 -> L1 compaction.
// The two range tombstones are broken up into two output files
// to limit compaction size.
ASSERT_EQ(2, NumTableFilesAtLevel(1));
}
TEST_F(DBRangeDelTest, OversizeCompactionPointKeyWithinRangetombstone) {
// L2 has two files
// L2_0: 0, 1, 2, 3, 4. L2_1: 6, 7, 8
// L0 has [0, 9) and point key 5
// max_compaction_bytes is less than the size of L2_0.
// When compacting L0 into L1, the compaction should cut at point key 5.
Options options = CurrentOptions();
options.disable_auto_compactions = true;
options.target_file_size_base = 9 * 1024;
options.max_compaction_bytes = 9 * 1024;
DestroyAndReopen(options);
Random rnd(301);
for (int i = 0; i < 9; ++i) {
if (i == 5) {
++i;
}
ASSERT_OK(Put(Key(i), rnd.RandomString(3 << 10)));
}
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(1, NumTableFilesAtLevel(0));
MoveFilesToLevel(2);
ASSERT_EQ(2, NumTableFilesAtLevel(2));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
Key(9)));
ASSERT_OK(Put(Key(5), rnd.RandomString(1 << 10)));
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(1, NumTableFilesAtLevel(0));
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
true /* disallow_trivial_move */));
ASSERT_EQ(2, NumTableFilesAtLevel(1));
}
TEST_F(DBRangeDelTest, OverlappedTombstones) {
const int kNumPerFile = 4, kNumFiles = 2;
Options options = CurrentOptions();
@ -2093,6 +2304,7 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) {
options.compression = kNoCompression;
options.disable_auto_compactions = true;
options.target_file_size_base = 2 * 1024;
options.level_compaction_dynamic_file_size = false;
DestroyAndReopen(options);
Random rnd(301);
@ -2508,7 +2720,7 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTest) {
options.compression = kNoCompression;
options.disable_auto_compactions = true;
options.target_file_size_base = 3 * 1024;
options.max_compaction_bytes = 1024;
options.max_compaction_bytes = 2048;
DestroyAndReopen(options);
// L2
@ -2554,7 +2766,7 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) {
options.compression = kNoCompression;
options.disable_auto_compactions = true;
options.target_file_size_base = 3 * 1024;
options.max_compaction_bytes = 1024;
options.max_compaction_bytes = 3 * 1024;
DestroyAndReopen(options);
// L2
@ -3015,6 +3227,183 @@ TEST_F(DBRangeDelTest, DoubleCountRangeTombstoneCompensatedSize) {
db_->ReleaseSnapshot(snapshot);
}
TEST_F(DBRangeDelTest, AddRangeDelsSameLowerAndUpperBound) {
// Test for an edge case where CompactionOutputs::AddRangeDels()
// is called with an empty range: `range_tombstone_lower_bound_` is not empty
// and has the same user_key and sequence number as `next_table_min_key`.
// This used to cause the file's smallest and largest keys to be incorrectly
// set such that smallest > largest, failing assertions in iterators and/or
// the assertion in VersionSet::ApproximateSize().
Options opts = CurrentOptions();
opts.disable_auto_compactions = true;
opts.target_file_size_base = 1 << 10;
opts.level_compaction_dynamic_file_size = false;
DestroyAndReopen(opts);
Random rnd(301);
// Create a file at the bottommost level so the manual compaction below is
// non-bottommost and goes through code paths like compensating range
// tombstone size.
ASSERT_OK(Put(Key(1), "v1"));
ASSERT_OK(Put(Key(4), "v2"));
ASSERT_OK(Flush());
MoveFilesToLevel(6);
ASSERT_OK(Put(Key(1), rnd.RandomString(4 << 10)));
ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
// So Key(3) does not get dropped.
const Snapshot* snapshot = db_->GetSnapshot();
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
Key(4)));
ASSERT_OK(Flush());
ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
ASSERT_OK(Put(Key(4), rnd.RandomString(4 << 10)));
ASSERT_OK(Flush());
MoveFilesToLevel(1);
// Each file will have two keys, with Key(3) straddling the two files.
// File 1: Key(1)@1, Key(3)@6, DeleteRange ends at Key(3)@6
// File 2: Key(3)@4, Key(4)@7, DeleteRange start from Key(3)@4
ASSERT_EQ(NumTableFilesAtLevel(1), 2);
// Manually update compaction output file cutting decisions
// to cut before range tombstone sentinel Key(3)@4
// and the point key Key(3)@4 itself
SyncPoint::GetInstance()->SetCallBack(
"CompactionOutputs::ShouldStopBefore::manual_decision", [opts](void* p) {
auto* pair = (std::pair<bool*, const Slice>*)p;
if ((opts.comparator->Compare(ExtractUserKey(pair->second), Key(3)) ==
0) &&
(GetInternalKeySeqno(pair->second) <= 4)) {
*(pair->first) = true;
}
});
SyncPoint::GetInstance()->EnableProcessing();
std::string begin_key = Key(0);
std::string end_key = Key(5);
Slice begin_slice{begin_key};
Slice end_slice{end_key};
ASSERT_OK(dbfull()->RunManualCompaction(
static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
->cfd(),
1, 2, CompactRangeOptions(), &begin_slice, &end_slice, true,
true /* disallow_trivial_move */,
std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
"" /*trim_ts*/));
// iterate through to check if any assertion breaks
std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
iter->SeekToFirst();
std::vector<int> expected{1, 3, 4};
for (auto i : expected) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->key(), Key(i));
iter->Next();
}
ASSERT_TRUE(iter->status().ok() && !iter->Valid());
db_->ReleaseSnapshot(snapshot);
}
TEST_F(DBRangeDelTest, AddRangeDelsSingleUserKeyTombstoneOnlyFile) {
// Test for an edge case where CompactionOutputs::AddRangeDels()
// is called with an SST file that has no point keys, and where
// the lower bound and upper bound have the same user key.
// This could cause a file's smallest and largest keys to be incorrectly set
// such that smallest > largest, failing assertions in iterators and/or
// the assertion in VersionSet::ApproximateSize().
Options opts = CurrentOptions();
opts.disable_auto_compactions = true;
opts.target_file_size_base = 1 << 10;
opts.level_compaction_dynamic_file_size = false;
DestroyAndReopen(opts);
Random rnd(301);
// Create a file at the bottommost level so that the manual compaction below
// is non-bottommost and exercises code paths like range tombstone size
// compensation.
ASSERT_OK(Put(Key(1), "v1"));
ASSERT_OK(Put(Key(4), "v2"));
ASSERT_OK(Flush());
MoveFilesToLevel(6);
ASSERT_OK(Put(Key(1), rnd.RandomString(10)));
// Key(3)@4
ASSERT_OK(Put(Key(3), rnd.RandomString(10)));
const Snapshot* snapshot1 = db_->GetSnapshot();
// Key(3)@5
ASSERT_OK(Put(Key(3), rnd.RandomString(10)));
const Snapshot* snapshot2 = db_->GetSnapshot();
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
Key(4)));
// Key(3)@7
ASSERT_OK(Put(Key(3), rnd.RandomString(10)));
ASSERT_OK(Flush());
// L0 -> L1 compaction: cut output into two files:
// File 1: Key(1), Key(3)@7, Range tombstone ends at Key(3)@7
// File 2: Key(3)@5, Key(3)@4, Range tombstone starts from Key(3)@5
SyncPoint::GetInstance()->SetCallBack(
"CompactionOutputs::ShouldStopBefore::manual_decision", [opts](void* p) {
auto* pair = (std::pair<bool*, const Slice>*)p;
if ((opts.comparator->Compare(ExtractUserKey(pair->second), Key(3)) ==
0) &&
(GetInternalKeySeqno(pair->second) <= 6)) {
*(pair->first) = true;
SyncPoint::GetInstance()->DisableProcessing();
}
});
SyncPoint::GetInstance()->EnableProcessing();
std::string begin_key = Key(0);
std::string end_key = Key(5);
Slice begin_slice{begin_key};
Slice end_slice{end_key};
ASSERT_OK(dbfull()->RunManualCompaction(
static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
->cfd(),
0, 1, CompactRangeOptions(), &begin_slice, &end_slice, true,
true /* disallow_trivial_move */,
std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
"" /*trim_ts*/));
ASSERT_EQ(NumTableFilesAtLevel(1), 2);
// L1 -> L2 compaction, drop the snapshot protecting Key(3)@5.
// Let ShouldStopBefore() return true for Key(3)@5 (delete range sentinel)
// and Key(3)@4.
// Output should have two files:
// File 1: Key(1), Key(3)@7, range tombstone ends at Key(3)@7
// File dropped: range tombstone only file (from Key(3)@5 to Key(3)@4)
// File 2: Range tombstone starting from Key(3)@4, Key(3)@4
db_->ReleaseSnapshot(snapshot2);
SyncPoint::GetInstance()->SetCallBack(
"CompactionOutputs::ShouldStopBefore::manual_decision", [opts](void* p) {
auto* pair = (std::pair<bool*, const Slice>*)p;
if ((opts.comparator->Compare(ExtractUserKey(pair->second), Key(3)) ==
0) &&
(GetInternalKeySeqno(pair->second) <= 6)) {
*(pair->first) = true;
}
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(dbfull()->RunManualCompaction(
static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
->cfd(),
1, 2, CompactRangeOptions(), &begin_slice, &end_slice, true,
true /* disallow_trivial_move */,
std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
"" /*trim_ts*/));
ASSERT_EQ(NumTableFilesAtLevel(2), 2);
// iterate through to check if any assertion breaks
std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
iter->SeekToFirst();
std::vector<int> expected{1, 3, 4};
for (auto i : expected) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->key(), Key(i));
iter->Next();
}
ASSERT_TRUE(iter->status().ok() && !iter->Valid());
db_->ReleaseSnapshot(snapshot1);
}
} // namespace ROCKSDB_NAMESPACE

@ -86,8 +86,10 @@ inline bool IsValueType(ValueType t) {
// Checks whether a type is from a user operation.
// kTypeRangeDeletion is in the meta block, so this API is separated from the
// one above. kTypeMaxValid can come from keys generated by
// TruncatedRangeDelIterator::start_key()
inline bool IsExtendedValueType(ValueType t) {
return IsValueType(t) || t == kTypeRangeDeletion;
return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid;
}
// We leave eight bits empty at the bottom so a type and sequence#
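To make the relaxed check concrete, here is a minimal sketch (assuming RocksDB's internal header db/dbformat.h; the helper function is illustrative and not part of this diff): a key bound carrying kTypeMaxValid, as produced by TruncatedRangeDelIterator, now passes the check.

#include "db/dbformat.h"  // ParsedInternalKey, IsExtendedValueType

namespace ROCKSDB_NAMESPACE {
// Illustrative only: a truncated range tombstone bound carries kTypeMaxValid
// and must still be accepted as a valid extended value type.
bool AcceptsTruncatedTombstoneBound() {
  ParsedInternalKey pik(Slice("k"), /*seq=*/7, kTypeMaxValid);
  return IsExtendedValueType(pik.type);  // true after this change
}
}  // namespace ROCKSDB_NAMESPACE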

@ -82,6 +82,10 @@ class HistoryTrimmingIterator : public InternalIterator {
bool IsValuePinned() const override { return input_->IsValuePinned(); }
bool IsDeleteRangeSentinelKey() const override {
return input_->IsDeleteRangeSentinelKey();
}
private:
InternalIterator* input_;
const std::string filter_ts_;

@ -231,6 +231,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
s = Status::ShutdownInProgress();
return s;
}
// Skip range tombstones emitted by the compaction iterator.
if (iter->IsDeleteRangeSentinelKey()) {
continue;
}
ParsedInternalKey ikey;
assert(keys_.size() == merge_context_.GetNumOperands());

@ -36,6 +36,7 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
false /* log_err_key */); // TODO
pik_status.PermitUncheckedError();
parsed_smallest.type = kTypeMaxValid;
assert(pik_status.ok());
smallest_ = &parsed_smallest;
}
@ -70,7 +71,7 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
parsed_largest.sequence -= 1;
// This line is not needed for correctness, but it ensures that the
// truncated end key is not covering keys from the next SST file.
parsed_largest.type = kValueTypeForSeek;
parsed_largest.type = kTypeMaxValid;
}
largest_ = &parsed_largest;
}
@ -393,21 +394,20 @@ bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
namespace {
// Produce a sorted (by start internal key) stream of range tombstones from
// `children`. lower_bound and upper_bound on user key can be
// `children`. lower_bound and upper_bound on internal key can be
// optionally specified. Range tombstones that end before lower_bound or start
// after upper_bound are excluded.
// If user-defined timestamp is enabled, lower_bound and upper_bound should
// contain timestamp, but comparison is done ignoring timestamps.
// contain timestamp.
class TruncatedRangeDelMergingIter : public InternalIterator {
public:
TruncatedRangeDelMergingIter(
const InternalKeyComparator* icmp, const Slice* lower_bound,
const Slice* upper_bound, bool upper_bound_inclusive,
const Slice* upper_bound,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
: icmp_(icmp),
lower_bound_(lower_bound),
upper_bound_(upper_bound),
upper_bound_inclusive_(upper_bound_inclusive),
heap_(StartKeyMinComparator(icmp)),
ts_sz_(icmp_->user_comparator()->timestamp_size()) {
for (auto& child : children) {
@ -420,7 +420,7 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
}
bool Valid() const override {
return !heap_.empty() && BeforeEndKey(heap_.top());
return !heap_.empty() && !AfterEndKey(heap_.top());
}
Status status() const override { return Status::OK(); }
@ -428,7 +428,13 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
heap_.clear();
for (auto& child : children_) {
if (lower_bound_ != nullptr) {
child->Seek(*lower_bound_);
child->Seek(ExtractUserKey(*lower_bound_));
// Since the above `Seek()` operates on a user key while `lower_bound_`
// is an internal key, we may need to advance `child` farther for it to
// be in bounds.
while (child->Valid() && BeforeStartKey(child)) {
child->InternalNext();
}
} else {
child->SeekToFirst();
}
@ -481,19 +487,23 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
void SeekToLast() override { assert(false); }
private:
bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
bool BeforeStartKey(const TruncatedRangeDelIterator* iter) const {
if (lower_bound_ == nullptr) {
return false;
}
return icmp_->Compare(iter->end_key(), *lower_bound_) <= 0;
}
bool AfterEndKey(const TruncatedRangeDelIterator* iter) const {
if (upper_bound_ == nullptr) {
return true;
return false;
}
int cmp = icmp_->user_comparator()->CompareWithoutTimestamp(
iter->start_key().user_key, *upper_bound_);
return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
return icmp_->Compare(iter->start_key(), *upper_bound_) > 0;
}
const InternalKeyComparator* icmp_;
const Slice* lower_bound_;
const Slice* upper_bound_;
bool upper_bound_inclusive_;
BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
std::vector<TruncatedRangeDelIterator*> children_;
@ -506,11 +516,10 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
std::unique_ptr<FragmentedRangeTombstoneIterator>
CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
const Slice* upper_bound,
bool upper_bound_inclusive) {
const Slice* upper_bound) {
InvalidateRangeDelMapPositions();
auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_);
icmp_, lower_bound, upper_bound, parent_iters_);
auto fragmented_tombstone_list =
std::make_shared<FragmentedRangeTombstoneList>(

@ -452,16 +452,15 @@ class CompactionRangeDelAggregator : public RangeDelAggregator {
}
// Creates an iterator over all the range tombstones in the aggregator, for
// use in compaction. Nullptr arguments indicate that the iterator range is
// unbounded.
// NOTE: the boundaries are used for optimization purposes to reduce the
// number of tombstones that are passed to the fragmenter; they do not
// guarantee that the resulting iterator only contains range tombstones that
// cover keys in the provided range. If required, these bounds must be
// use in compaction.
//
// NOTE: the internal key boundaries are used for optimization purposes to
// reduce the number of tombstones that are passed to the fragmenter; they do
// not guarantee that the resulting iterator only contains range tombstones
// that cover keys in the provided range. If required, these bounds must be
// enforced during iteration.
std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
bool upper_bound_inclusive = false);
const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr);
private:
std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
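As a usage sketch (mirroring the updated tests further below; agg is an assumed, already-populated CompactionRangeDelAggregator), callers now encode full internal keys for the bounds:

InternalKey start_buf("bb", 0, kTypeRangeDeletion);
InternalKey end_buf("e", 9, kTypeRangeDeletion);
Slice start = start_buf.Encode();
Slice end = end_buf.Encode();
// The bounds only prune tombstones as an optimization; enforce them during
// iteration if strict bounds are required.
std::unique_ptr<FragmentedRangeTombstoneIterator> iter =
    agg.NewIterator(&start, &end);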

@ -224,26 +224,32 @@ TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
nullptr);
VerifyIterator(&iter, bytewise_icmp,
{{UncutEndpoint("a"), UncutEndpoint("e"), 10},
{UncutEndpoint("e"), UncutEndpoint("g"), 8},
{UncutEndpoint("j"), UncutEndpoint("n"), 4}});
VerifyIterator(
&iter, bytewise_icmp,
{{InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"), 10},
{InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
{{"d", InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"),
10},
{"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"ia", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4},
{"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */},
{"", InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"),
10}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
{{"d", InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"),
10},
{"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"n", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4},
{"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */}});
}
TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
@ -258,25 +264,29 @@ TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
nullptr);
VerifyIterator(&iter, bytewise_icmp,
{{UncutEndpoint("e"), UncutEndpoint("g"), 8},
{UncutEndpoint("j"), UncutEndpoint("n"), 4}});
VerifyIterator(
&iter, bytewise_icmp,
{{InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
{{"d", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"ia", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4},
{"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */},
{"", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
{{"d", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */},
{"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"n", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4},
{"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */}});
}
TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
@ -295,27 +305,30 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
VerifyIterator(
&iter, bytewise_icmp,
{{InternalValue("d", 7), UncutEndpoint("e"), 10},
{UncutEndpoint("e"), UncutEndpoint("g"), 8},
{UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4}});
{{InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
{InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{InternalValue("j", 4, kTypeRangeDeletion),
InternalValue("m", 8, kTypeMaxValid), 4}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
false /* invalid */},
{"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
{{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
{"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"ia", InternalValue("j", 4, kTypeRangeDeletion),
InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */},
{"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */},
{"", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"n", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
false /* invalid */},
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
{{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
{"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
{"n", InternalValue("j", 4, kTypeRangeDeletion),
InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */},
{"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */}});
}
TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
@ -332,20 +345,23 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
&smallest, &largest);
VerifyIterator(&iter, bytewise_icmp,
{{InternalValue("f", 7), UncutEndpoint("g"), 8}});
VerifyIterator(
&iter, bytewise_icmp,
{{InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
{"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
{"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
{{"d", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
{"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
{"j", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
{"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
{{"d", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
true /* invalid */},
{"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
{"j", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}});
}
TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
@ -627,15 +643,12 @@ TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
range_del_agg.AddTombstones(std::move(input_iter));
}
Slice start("p");
Slice end("q");
auto range_del_compaction_iter1 =
range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
auto range_del_compaction_iter2 =
range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
InternalKey start_buf("p", 0, kTypeRangeDeletion);
InternalKey end_buf("q", 0, kTypeRangeDeletion);
Slice start = start_buf.Encode();
Slice end = end_buf.Encode();
auto range_del_compaction_iter = range_del_agg.NewIterator(&start, &end);
VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {});
}
TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
@ -652,18 +665,13 @@ TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
range_del_agg.AddTombstones(std::move(input_iter));
}
Slice start("bb");
Slice end("e");
auto range_del_compaction_iter1 =
range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
InternalKey start_buf("bb", 0, kTypeRangeDeletion);
InternalKey end_buf("e", 9, kTypeRangeDeletion);
Slice start = start_buf.Encode();
Slice end = end_buf.Encode();
auto range_del_compaction_iter = range_del_agg.NewIterator(&start, &end);
VerifyFragmentedRangeDels(range_del_compaction_iter.get(),
{{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
auto range_del_compaction_iter2 =
range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
VerifyFragmentedRangeDels(
range_del_compaction_iter2.get(),
{{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
}
TEST_F(RangeDelAggregatorTest,
@ -681,29 +689,19 @@ TEST_F(RangeDelAggregatorTest,
range_del_agg.AddTombstones(std::move(input_iter));
}
Slice start("bb");
Slice end("e");
auto range_del_compaction_iter1 =
range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
{"b", "c", 20},
{"b", "c", 10},
{"c", "d", 10},
{"c", "d", 8},
{"d", "f", 30},
{"d", "f", 8},
{"f", "g", 8}});
auto range_del_compaction_iter2 =
range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
{"b", "c", 20},
{"b", "c", 10},
{"c", "d", 10},
{"c", "d", 8},
{"d", "f", 30},
{"d", "f", 8},
{"f", "g", 8}});
InternalKey start_buf("bb", 0, kTypeRangeDeletion);
InternalKey end_buf("e", 0, kTypeRangeDeletion);
Slice start = start_buf.Encode();
Slice end = end_buf.Encode();
auto range_del_compaction_iter = range_del_agg.NewIterator(&start, &end);
VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 10},
{"b", "c", 20},
{"b", "c", 10},
{"c", "d", 10},
{"c", "d", 8},
{"d", "f", 30},
{"d", "f", 8},
{"f", "g", 8}});
}
} // namespace ROCKSDB_NAMESPACE

@ -218,8 +218,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
}
ParsedInternalKey parsed_start_key() const {
return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
kTypeRangeDeletion);
return ParsedInternalKey(pos_->start_key, seq(), kTypeRangeDeletion);
}
ParsedInternalKey parsed_end_key() const {
return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
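To illustrate the change with assumed values (not from the diff): for a fragment ["c", "f") at seqno 8 with the iterator positioned on it,

// parsed_start_key() == ("c", 8, kTypeRangeDeletion)
//   (previously ("c", kMaxSequenceNumber, kTypeRangeDeletion))
// parsed_end_key()   == ("f", kMaxSequenceNumber, kTypeRangeDeletion)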

@ -38,6 +38,8 @@
#include "db/table_cache.h"
#include "db/version_builder.h"
#include "db/version_edit_handler.h"
#include "table/compaction_merging_iterator.h"
#if USE_COROUTINES
#include "folly/experimental/coro/BlockingWait.h"
#include "folly/experimental/coro/Collect.h"
@ -6635,6 +6637,14 @@ InternalIterator* VersionSet::MakeInputIterator(
c->num_input_levels() - 1
: c->num_input_levels());
InternalIterator** list = new InternalIterator*[space];
// The first item in each pair is a pointer to a range tombstone iterator.
// The second item is a pointer to a member of a LevelIterator that will be
// initialized to where CompactionMergingIterator stores the pointer to its
// range tombstone iterator. This is used by LevelIterator to update the
// pointer to range tombstones as it traverses different SST files.
std::vector<
std::pair<TruncatedRangeDelIterator*, TruncatedRangeDelIterator***>>
range_tombstones;
size_t num = 0;
for (size_t which = 0; which < c->num_input_levels(); which++) {
if (c->input_levels(which)->num_files != 0) {
@ -6655,7 +6665,7 @@ InternalIterator* VersionSet::MakeInputIterator(
end.value(), fmd.smallest.user_key()) < 0) {
continue;
}
TruncatedRangeDelIterator* range_tombstone_iter = nullptr;
list[num++] = cfd->table_cache()->NewIterator(
read_options, file_options_compactions,
cfd->internal_comparator(), fmd, range_del_agg,
@ -6668,10 +6678,13 @@ InternalIterator* VersionSet::MakeInputIterator(
MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false);
/*allow_unprepared_value=*/false,
/*range_del_iter=*/&range_tombstone_iter);
range_tombstones.emplace_back(range_tombstone_iter, nullptr);
}
} else {
// Create concatenating iterator for the files from this level
TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
list[num++] = new LevelIterator(
cfd->table_cache(), read_options, file_options_compactions,
cfd->internal_comparator(), c->input_levels(which),
@ -6680,14 +6693,15 @@ InternalIterator* VersionSet::MakeInputIterator(
/*no per level latency histogram=*/nullptr,
TableReaderCaller::kCompaction, /*skip_filters=*/false,
/*level=*/static_cast<int>(c->level(which)), range_del_agg,
c->boundaries(which));
c->boundaries(which), false, &tombstone_iter_ptr);
range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
}
}
}
assert(num <= space);
InternalIterator* result =
NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
static_cast<int>(num));
InternalIterator* result = NewCompactionMergingIterator(
&c->column_family_data()->internal_comparator(), list,
static_cast<int>(num), range_tombstones);
delete[] list;
return result;
}
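A simplified sketch of the hand-off described in the comment above (the actual wiring is in the CompactionMergingIterator constructor added below):

// Inside CompactionMergingIterator's constructor, for each LevelIterator
// input i:
//   *range_tombstones[i].second = &range_tombstone_iters_[i];
// i.e. the LevelIterator learns the address of slot i, so whenever it
// switches to a new SST file it can write that file's tombstone iterator
// into the slot the merging iterator reads from.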

@ -198,6 +198,7 @@ LIB_SOURCES = \
table/get_context.cc \
table/iterator.cc \
table/merging_iterator.cc \
table/compaction_merging_iterator.cc \
table/meta_blocks.cc \
table/persistent_cache_helper.cc \
table/plain/plain_table_bloom.cc \

@ -0,0 +1,370 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "table/compaction_merging_iterator.h"
namespace ROCKSDB_NAMESPACE {
class CompactionMergingIterator : public InternalIterator {
public:
CompactionMergingIterator(
const InternalKeyComparator* comparator, InternalIterator** children,
int n, bool is_arena_mode,
std::vector<
std::pair<TruncatedRangeDelIterator*, TruncatedRangeDelIterator***>>
range_tombstones)
: is_arena_mode_(is_arena_mode),
comparator_(comparator),
current_(nullptr),
minHeap_(CompactionHeapItemComparator(comparator_)),
pinned_iters_mgr_(nullptr) {
children_.resize(n);
for (int i = 0; i < n; i++) {
children_[i].level = i;
children_[i].iter.Set(children[i]);
assert(children_[i].type == HeapItem::ITERATOR);
}
assert(range_tombstones.size() == static_cast<size_t>(n));
for (auto& p : range_tombstones) {
range_tombstone_iters_.push_back(p.first);
}
pinned_heap_item_.resize(n);
for (int i = 0; i < n; ++i) {
if (range_tombstones[i].second) {
// for LevelIterator
*range_tombstones[i].second = &range_tombstone_iters_[i];
}
pinned_heap_item_[i].level = i;
pinned_heap_item_[i].type = HeapItem::DELETE_RANGE_START;
}
}
void considerStatus(const Status& s) {
if (!s.ok() && status_.ok()) {
status_ = s;
}
}
~CompactionMergingIterator() override {
// TODO: use unique_ptr for range_tombstone_iters_
for (auto child : range_tombstone_iters_) {
delete child;
}
for (auto& child : children_) {
child.iter.DeleteIter(is_arena_mode_);
}
status_.PermitUncheckedError();
}
bool Valid() const override { return current_ != nullptr && status_.ok(); }
Status status() const override { return status_; }
void SeekToFirst() override;
void Seek(const Slice& target) override;
void Next() override;
Slice key() const override {
assert(Valid());
return current_->key();
}
Slice value() const override {
assert(Valid());
if (LIKELY(current_->type == HeapItem::ITERATOR)) {
return current_->iter.value();
} else {
return dummy_tombstone_val;
}
}
// Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result
// from the current child iterator. As long as the current child iterator
// reports that being out of bound is impossible, we know the current key is
// within bound.
bool MayBeOutOfLowerBound() override {
assert(Valid());
return current_->type == HeapItem::DELETE_RANGE_START ||
current_->iter.MayBeOutOfLowerBound();
}
IterBoundCheck UpperBoundCheckResult() override {
assert(Valid());
return current_->type == HeapItem::DELETE_RANGE_START
? IterBoundCheck::kUnknown
: current_->iter.UpperBoundCheckResult();
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
for (auto& child : children_) {
child.iter.SetPinnedItersMgr(pinned_iters_mgr);
}
}
bool IsDeleteRangeSentinelKey() const override {
assert(Valid());
return current_->type == HeapItem::DELETE_RANGE_START;
}
// Compaction uses the above subset of InternalIterator interface.
void SeekToLast() override { assert(false); }
void SeekForPrev(const Slice&) override { assert(false); }
void Prev() override { assert(false); }
bool NextAndGetResult(IterateResult*) override {
assert(false);
return false;
}
bool IsKeyPinned() const override {
assert(false);
return false;
}
bool IsValuePinned() const override {
assert(false);
return false;
}
bool PrepareValue() override {
assert(false);
return false;
}
private:
struct HeapItem {
HeapItem() = default;
IteratorWrapper iter;
size_t level = 0;
std::string tombstone_str;
enum Type { ITERATOR, DELETE_RANGE_START };
Type type = ITERATOR;
explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
: level(_level), type(Type::ITERATOR) {
iter.Set(_iter);
}
void SetTombstoneForCompaction(const ParsedInternalKey&& pik) {
tombstone_str.clear();
AppendInternalKey(&tombstone_str, pik);
}
[[nodiscard]] Slice key() const {
return type == ITERATOR ? iter.key() : tombstone_str;
}
};
class CompactionHeapItemComparator {
public:
explicit CompactionHeapItemComparator(
const InternalKeyComparator* comparator)
: comparator_(comparator) {}
bool operator()(HeapItem* a, HeapItem* b) const {
int r = comparator_->Compare(a->key(), b->key());
// For each file, we assume all range tombstone start keys come before
// its file boundary sentinel key (file's meta.largest key).
// In the case when meta.smallest = meta.largest and range tombstone start
// key is truncated at meta.smallest, the start key will have op_type =
// kTypeMaxValid to make it smaller (see the TruncatedRangeDelIterator
// constructor). The following assertion validates this assumption.
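// Illustrative example (values assumed, not from the source): internal
// keys sort by user key asc, then seqno desc, then type desc, and
// kTypeMaxValid is the numerically largest valid type, so at equal user
// key and seqno it forms the smaller internal key:
//   ("k", 5, kTypeMaxValid) < ("k", 5, kTypeRangeDeletion)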
assert(a->type == b->type || r != 0);
return r > 0;
}
private:
const InternalKeyComparator* comparator_;
};
using CompactionMinHeap = BinaryHeap<HeapItem*, CompactionHeapItemComparator>;
bool is_arena_mode_;
const InternalKeyComparator* comparator_;
// HeapItem for all child point iterators.
std::vector<HeapItem> children_;
// HeapItem for range tombstones. pinned_heap_item_[i] corresponds to the
// current range tombstone from range_tombstone_iters_[i].
std::vector<HeapItem> pinned_heap_item_;
// range_tombstone_iters_[i] contains range tombstones in the sorted run that
// corresponds to children_[i]. range_tombstone_iters_[i] ==
// nullptr means the sorted run of children_[i] does not have range
// tombstones (or the current SSTable does not have range tombstones in the
// case of LevelIterator).
std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
// Used as value for range tombstone keys
std::string dummy_tombstone_val{};
// Skip file boundary sentinel keys.
void FindNextVisibleKey();
// top of minHeap_
HeapItem* current_;
// If any of the children have non-ok status, this is one of them.
Status status_;
CompactionMinHeap minHeap_;
PinnedIteratorsManager* pinned_iters_mgr_;
// Process a child that is not in the min heap.
// If valid, add to the min heap. Otherwise, check status.
void AddToMinHeapOrCheckStatus(HeapItem*);
HeapItem* CurrentForward() const {
return !minHeap_.empty() ? minHeap_.top() : nullptr;
}
void InsertRangeTombstoneAtLevel(size_t level) {
if (range_tombstone_iters_[level]->Valid()) {
pinned_heap_item_[level].SetTombstoneForCompaction(
range_tombstone_iters_[level]->start_key());
minHeap_.push(&pinned_heap_item_[level]);
}
}
};
void CompactionMergingIterator::SeekToFirst() {
minHeap_.clear();
status_ = Status::OK();
for (auto& child : children_) {
child.iter.SeekToFirst();
AddToMinHeapOrCheckStatus(&child);
}
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
if (range_tombstone_iters_[i]) {
range_tombstone_iters_[i]->SeekToFirst();
InsertRangeTombstoneAtLevel(i);
}
}
FindNextVisibleKey();
current_ = CurrentForward();
}
void CompactionMergingIterator::Seek(const Slice& target) {
minHeap_.clear();
status_ = Status::OK();
for (auto& child : children_) {
child.iter.Seek(target);
AddToMinHeapOrCheckStatus(&child);
}
ParsedInternalKey pik;
ParseInternalKey(target, &pik, false /* log_err_key */)
.PermitUncheckedError();
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
if (range_tombstone_iters_[i]) {
range_tombstone_iters_[i]->Seek(pik.user_key);
// For compaction, output keys should all be after seek target.
while (range_tombstone_iters_[i]->Valid() &&
comparator_->Compare(range_tombstone_iters_[i]->start_key(), pik) <
0) {
range_tombstone_iters_[i]->Next();
}
InsertRangeTombstoneAtLevel(i);
}
}
FindNextVisibleKey();
current_ = CurrentForward();
}
void CompactionMergingIterator::Next() {
assert(Valid());
// For the heap modifications below to be correct, current_ must be the
// current top of the heap.
assert(current_ == CurrentForward());
// `current_` points to the current record; move that child iterator forward.
if (current_->type == HeapItem::ITERATOR) {
current_->iter.Next();
if (current_->iter.Valid()) {
// current is still valid after the Next() call above. Call
// replace_top() to restore the heap property. When the same child
// iterator yields a sequence of keys, this is cheap.
assert(current_->iter.status().ok());
minHeap_.replace_top(current_);
} else {
// current stopped being valid, remove it from the heap.
considerStatus(current_->iter.status());
minHeap_.pop();
}
} else {
assert(current_->type == HeapItem::DELETE_RANGE_START);
size_t level = current_->level;
assert(range_tombstone_iters_[level]);
range_tombstone_iters_[level]->Next();
if (range_tombstone_iters_[level]->Valid()) {
pinned_heap_item_[level].SetTombstoneForCompaction(
range_tombstone_iters_[level]->start_key());
minHeap_.replace_top(&pinned_heap_item_[level]);
} else {
minHeap_.pop();
}
}
FindNextVisibleKey();
current_ = CurrentForward();
}
void CompactionMergingIterator::FindNextVisibleKey() {
while (!minHeap_.empty()) {
HeapItem* current = minHeap_.top();
// IsDeleteRangeSentinelKey() here means file boundary sentinel keys.
if (current->type != HeapItem::ITERATOR ||
!current->iter.IsDeleteRangeSentinelKey()) {
return;
}
// range tombstone start keys from the same SSTable should have been
// exhausted
assert(!range_tombstone_iters_[current->level] ||
!range_tombstone_iters_[current->level]->Valid());
// current->iter is a LevelIterator, and it enters a new SST file in the
// Next() call here.
current->iter.Next();
if (current->iter.Valid()) {
assert(current->iter.status().ok());
minHeap_.replace_top(current);
} else {
minHeap_.pop();
}
if (range_tombstone_iters_[current->level]) {
InsertRangeTombstoneAtLevel(current->level);
}
}
}
void CompactionMergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
if (child->iter.Valid()) {
assert(child->iter.status().ok());
minHeap_.push(child);
} else {
considerStatus(child->iter.status());
}
}
InternalIterator* NewCompactionMergingIterator(
const InternalKeyComparator* comparator, InternalIterator** children, int n,
std::vector<std::pair<TruncatedRangeDelIterator*,
TruncatedRangeDelIterator***>>& range_tombstone_iters,
Arena* arena) {
assert(n >= 0);
if (n == 0) {
return NewEmptyInternalIterator<Slice>(arena);
} else {
if (arena == nullptr) {
return new CompactionMergingIterator(comparator, children, n,
false /* is_arena_mode */,
range_tombstone_iters);
} else {
auto mem = arena->AllocateAligned(sizeof(CompactionMergingIterator));
return new (mem) CompactionMergingIterator(comparator, children, n,
true /* is_arena_mode */,
range_tombstone_iters);
}
}
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,44 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include "db/range_del_aggregator.h"
#include "rocksdb/slice.h"
#include "rocksdb/types.h"
#include "table/merging_iterator.h"
namespace ROCKSDB_NAMESPACE {
/*
* This is a simplified version of MergingIterator and is specifically used for
* compaction. It merges the input `children` iterators into a sorted stream of
* keys. Range tombstone start keys are also emitted to prevent oversize
* compactions. For example, consider an L1 file with content [a, b), y, z,
* where [a, b) is a range tombstone and y and z are point keys. This could
* cause an oversize compaction as it can overlap with a wide range of key space
* in L2.
*
* CompactionMergingIterator emits range tombstone start keys from each LSM
* level's range tombstone iterator, and for each range tombstone
* [start,end)@seqno, the key will be start@seqno with op_type
* kTypeRangeDeletion unless truncated at a file boundary (see details in
* TruncatedRangeDelIterator::start_key()).
*
* Caller should use CompactionMergingIterator::IsDeleteRangeSentinelKey() to
* check if the current key is a range tombstone key.
* TODO(cbi): IsDeleteRangeSentinelKey() is used for two kinds of keys at
* different layers: file boundary and range tombstone keys. Separate them into
* two APIs for clarity.
*/
class CompactionMergingIterator;
InternalIterator* NewCompactionMergingIterator(
const InternalKeyComparator* comparator, InternalIterator** children, int n,
std::vector<std::pair<TruncatedRangeDelIterator*,
TruncatedRangeDelIterator***>>& range_tombstone_iters,
Arena* arena = nullptr);
} // namespace ROCKSDB_NAMESPACE
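A consumption sketch (handler names are hypothetical; in the real code CompactionIterator and CompactionOutputs play this role, and icmp, children, n, and range_tombstones are assumed to be set up as in VersionSet::MakeInputIterator). For the L1 file above, the merged stream would be a@seqno with op_type kTypeRangeDeletion, then y, then z:

InternalIterator* iter =
    NewCompactionMergingIterator(&icmp, children, n, range_tombstones);
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  if (iter->IsDeleteRangeSentinelKey()) {
    // A range tombstone start key: start@seqno with op_type
    // kTypeRangeDeletion (kTypeMaxValid if truncated); value() is a dummy.
    HandleRangeTombstoneStart(iter->key());  // hypothetical handler
  } else {
    HandlePointKey(iter->key(), iter->value());  // hypothetical handler
  }
}
Status s = iter->status();  // check before relying on the output
delete iter;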

@ -10,121 +10,8 @@
#include "table/merging_iterator.h"
#include "db/arena_wrapped_db_iter.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "memory/arena.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/internal_iterator.h"
#include "table/iter_heap.h"
#include "table/iterator_wrapper.h"
#include "test_util/sync_point.h"
#include "util/autovector.h"
#include "util/heap.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
// For merging iterator to process range tombstones, we treat the start and end
// keys of a range tombstone as point keys and put them into the minHeap/maxHeap
// used in merging iterator. Take minHeap for example, we are able to keep track
// of currently "active" range tombstones (the ones whose start keys are popped
// but end keys are still in the heap) in `active_`. This `active_` set of range
// tombstones is then used to quickly determine whether the point key at heap
// top is deleted (by heap property, the point key at heap top must be within
// internal key range of active range tombstones).
//
// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
// point key and the start and end keys of a range tombstone.
struct HeapItem {
HeapItem() = default;
enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
IteratorWrapper iter;
size_t level = 0;
ParsedInternalKey parsed_ikey;
// Will be overwritten before use, initialize here so compiler does not
// complain.
Type type = ITERATOR;
explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
: level(_level), type(Type::ITERATOR) {
iter.Set(_iter);
}
void SetTombstoneKey(ParsedInternalKey&& pik) {
// op_type is already initialized in MergingIterator::Finish().
parsed_ikey.user_key = pik.user_key;
parsed_ikey.sequence = pik.sequence;
}
Slice key() const {
assert(type == ITERATOR);
return iter.key();
}
bool IsDeleteRangeSentinelKey() const {
if (type == Type::ITERATOR) {
return iter.IsDeleteRangeSentinelKey();
}
return false;
}
};
class MinHeapItemComparator {
public:
MinHeapItemComparator(const InternalKeyComparator* comparator)
: comparator_(comparator) {}
bool operator()(HeapItem* a, HeapItem* b) const {
if (LIKELY(a->type == HeapItem::ITERATOR)) {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->key(), b->key()) > 0;
} else {
return comparator_->Compare(a->key(), b->parsed_ikey) > 0;
}
} else {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->parsed_ikey, b->key()) > 0;
} else {
return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0;
}
}
}
private:
const InternalKeyComparator* comparator_;
};
class MaxHeapItemComparator {
public:
MaxHeapItemComparator(const InternalKeyComparator* comparator)
: comparator_(comparator) {}
bool operator()(HeapItem* a, HeapItem* b) const {
if (LIKELY(a->type == HeapItem::ITERATOR)) {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->key(), b->key()) < 0;
} else {
return comparator_->Compare(a->key(), b->parsed_ikey) < 0;
}
} else {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->parsed_ikey, b->key()) < 0;
} else {
return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0;
}
}
}
private:
const InternalKeyComparator* comparator_;
};
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
namespace {
using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
} // namespace
class MergingIterator : public InternalIterator {
public:
MergingIterator(const InternalKeyComparator* comparator,
@ -136,7 +23,7 @@ class MergingIterator : public InternalIterator {
direction_(kForward),
comparator_(comparator),
current_(nullptr),
minHeap_(comparator_),
minHeap_(MinHeapItemComparator(comparator_)),
pinned_iters_mgr_(nullptr),
iterate_upper_bound_(iterate_upper_bound) {
children_.resize(n);
@ -199,7 +86,7 @@ class MergingIterator : public InternalIterator {
// TruncatedRangeDelIterator since untruncated tombstone end points
// always have kMaxSequenceNumber and kTypeRangeDeletion (see
// TruncatedRangeDelIterator::start_key()/end_key()).
pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid;
pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid;
}
}
}
@ -549,6 +436,92 @@ class MergingIterator : public InternalIterator {
}
private:
// For merging iterator to process range tombstones, we treat the start and
// end keys of a range tombstone as point keys and put them into the
// minHeap/maxHeap used in merging iterator. Take minHeap for example, we are
// able to keep track of currently "active" range tombstones (the ones whose
// start keys are popped but end keys are still in the heap) in `active_`.
// This `active_` set of range tombstones is then used to quickly determine
// whether the point key at heap top is deleted (by heap property, the point
// key at heap top must be within internal key range of active range
// tombstones).
//
// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
// point key and the start and end keys of a range tombstone.
struct HeapItem {
HeapItem() = default;
IteratorWrapper iter;
size_t level = 0;
ParsedInternalKey tombstone_pik;
// Will be overwritten before use, initialize here so compiler does not
// complain.
enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
Type type = ITERATOR;
explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
: level(_level), type(Type::ITERATOR) {
iter.Set(_iter);
}
void SetTombstoneKey(ParsedInternalKey&& pik) {
// op_type is already initialized in MergingIterator::Finish().
tombstone_pik.user_key = pik.user_key;
tombstone_pik.sequence = pik.sequence;
}
};
class MinHeapItemComparator {
public:
explicit MinHeapItemComparator(const InternalKeyComparator* comparator)
: comparator_(comparator) {}
bool operator()(HeapItem* a, HeapItem* b) const {
if (LIKELY(a->type == HeapItem::ITERATOR)) {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->iter.key(), b->iter.key()) > 0;
} else {
return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0;
}
} else {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0;
} else {
return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0;
}
}
}
private:
const InternalKeyComparator* comparator_;
};
class MaxHeapItemComparator {
public:
explicit MaxHeapItemComparator(const InternalKeyComparator* comparator)
: comparator_(comparator) {}
bool operator()(HeapItem* a, HeapItem* b) const {
if (LIKELY(a->type == HeapItem::ITERATOR)) {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
} else {
return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0;
}
} else {
if (LIKELY(b->type == HeapItem::ITERATOR)) {
return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0;
} else {
return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0;
}
}
}
private:
const InternalKeyComparator* comparator_;
};
using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
friend class MergeIteratorBuilder;
// Clears heaps for both directions, used when changing direction or seeking
void ClearHeaps(bool clear_active = true);
@ -1177,7 +1150,7 @@ void MergingIterator::SwitchToForward() {
if (child.iter.status() == Status::TryAgain()) {
continue;
}
if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
assert(child.iter.status().ok());
child.iter.Next();
}
@ -1188,7 +1161,7 @@ void MergingIterator::SwitchToForward() {
for (auto& child : children_) {
if (child.iter.status() == Status::TryAgain()) {
child.iter.Seek(target);
if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
assert(child.iter.status().ok());
child.iter.Next();
}
@ -1239,7 +1212,7 @@ void MergingIterator::SwitchToBackward() {
if (&child.iter != current_) {
child.iter.SeekForPrev(target);
TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
assert(child.iter.status().ok());
child.iter.Prev();
}
@ -1297,7 +1270,8 @@ void MergingIterator::ClearHeaps(bool clear_active) {
void MergingIterator::InitMaxHeap() {
if (!maxHeap_) {
maxHeap_ = std::make_unique<MergerMaxIterHeap>(comparator_);
maxHeap_ =
std::make_unique<MergerMaxIterHeap>(MaxHeapItemComparator(comparator_));
}
}
@ -1308,21 +1282,27 @@ void MergingIterator::InitMaxHeap() {
// key's level, then the current child iterator is simply advanced to its next
// key without reseeking.
inline void MergingIterator::FindNextVisibleKey() {
// When active_ is empty, we know heap top cannot be a range tombstone end
// key. It cannot be a range tombstone start key per PopDeleteRangeStart().
PopDeleteRangeStart();
while (!minHeap_.empty() &&
(!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) &&
SkipNextDeleted()) {
// PopDeleteRangeStart() implies the heap top is not DELETE_RANGE_START;
// active_ being empty implies no DELETE_RANGE_END is in the heap.
// So minHeap_.top() must be of type ITERATOR.
while (
!minHeap_.empty() &&
(!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) &&
SkipNextDeleted()) {
PopDeleteRangeStart();
}
}
inline void MergingIterator::FindPrevVisibleKey() {
PopDeleteRangeEnd();
while (!maxHeap_->empty() &&
(!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) &&
SkipPrevDeleted()) {
// PopDeleteRangeEnd() implies the heap top is not DELETE_RANGE_END;
// active_ being empty implies no DELETE_RANGE_START is in the heap.
// So maxHeap_->top() must be of type ITERATOR.
while (
!maxHeap_->empty() &&
(!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) &&
SkipPrevDeleted()) {
PopDeleteRangeEnd();
}
}

@ -12,6 +12,7 @@
#include "db/range_del_aggregator.h"
#include "rocksdb/slice.h"
#include "rocksdb/types.h"
#include "table/iterator_wrapper.h"
namespace ROCKSDB_NAMESPACE {
