You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/table/compaction_merging_iterator.cc

143 lines
4.8 KiB

Consider range tombstone in compaction output file cutting (#10802) Summary: This PR is the first step for Issue https://github.com/facebook/rocksdb/issues/4811. Currently compaction output files are cut at point keys, and the decision is made mainly in `CompactionOutputs::ShouldStopBefore()`. This makes it possible for range tombstones to cause large compactions that does not respect `max_compaction_bytes`. For example, we can have a large range tombstone that overlaps with too many files from the next level. Another example is when there is a gap between a range tombstone and another key. The first issue may be more acceptable, as a lot of data is deleted. This PR address the second issue by calling `ShouldStopBefore()` for range tombstone start keys. The main change is for `CompactionIterator` to emit range tombstone start keys to be processed by `CompactionOutputs`. A new `CompactionMergingIterator` is introduced and only used under `CompactionIterator` for this purpose. Further improvement after this PR include 1) cut compaction output at some grandparent boundary key instead of at the next point key or range tombstone start key and 2) cut compaction output file within a large range tombstone (it may be easier and reasonable to only do it for range tombstones at the end of a compaction output). Pull Request resolved: https://github.com/facebook/rocksdb/pull/10802 Test Plan: - added unit tests in db_range_del_test. - stress test: `python3 tools/db_crashtest.py whitebox --[simple|enable_ts] --verify_iterator_with_expected_state_one_in=5 --delrangepercent=5 --prefixpercent=2 --writepercent=58 --readpercen=21 --duration=36000 --range_deletion_width=1000000` Reviewed By: ajkr, jay-zhuang Differential Revision: D40308827 Pulled By: cbi42 fbshipit-source-id: a8fd6f70a3f09d0ef7a40e006f6c964bba8c00df
2 years ago
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "table/compaction_merging_iterator.h"
namespace ROCKSDB_NAMESPACE {
void CompactionMergingIterator::SeekToFirst() {
minHeap_.clear();
status_ = Status::OK();
for (auto& child : children_) {
child.iter.SeekToFirst();
AddToMinHeapOrCheckStatus(&child);
}
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
if (range_tombstone_iters_[i]) {
range_tombstone_iters_[i]->SeekToFirst();
InsertRangeTombstoneAtLevel(i);
}
}
FindNextVisibleKey();
current_ = CurrentForward();
}
void CompactionMergingIterator::Seek(const Slice& target) {
minHeap_.clear();
status_ = Status::OK();
for (auto& child : children_) {
child.iter.Seek(target);
AddToMinHeapOrCheckStatus(&child);
}
ParsedInternalKey pik;
ParseInternalKey(target, &pik, false /* log_err_key */)
.PermitUncheckedError();
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
if (range_tombstone_iters_[i]) {
range_tombstone_iters_[i]->Seek(pik.user_key);
// For compaction, output keys should all be after seek target.
while (range_tombstone_iters_[i]->Valid() &&
comparator_->Compare(range_tombstone_iters_[i]->start_key(), pik) <
0) {
range_tombstone_iters_[i]->Next();
}
InsertRangeTombstoneAtLevel(i);
}
}
FindNextVisibleKey();
current_ = CurrentForward();
}
void CompactionMergingIterator::Next() {
assert(Valid());
// For the heap modifications below to be correct, current_ must be the
// current top of the heap.
assert(current_ == CurrentForward());
// as the current points to the current record. move the iterator forward.
if (current_->type == HeapItem::ITERATOR) {
current_->iter.Next();
if (current_->iter.Valid()) {
// current is still valid after the Next() call above. Call
// replace_top() to restore the heap property. When the same child
// iterator yields a sequence of keys, this is cheap.
assert(current_->iter.status().ok());
minHeap_.replace_top(current_);
} else {
// current stopped being valid, remove it from the heap.
considerStatus(current_->iter.status());
minHeap_.pop();
}
} else {
assert(current_->type == HeapItem::DELETE_RANGE_START);
size_t level = current_->level;
assert(range_tombstone_iters_[level]);
range_tombstone_iters_[level]->Next();
if (range_tombstone_iters_[level]->Valid()) {
pinned_heap_item_[level].SetTombstoneForCompaction(
range_tombstone_iters_[level]->start_key());
minHeap_.replace_top(&pinned_heap_item_[level]);
} else {
minHeap_.pop();
}
}
FindNextVisibleKey();
current_ = CurrentForward();
}
void CompactionMergingIterator::FindNextVisibleKey() {
// IsDeleteRangeSentinelKey() here means file boundary sentinel keys.
while (!minHeap_.empty() && minHeap_.top()->IsDeleteRangeSentinelKey()) {
HeapItem* current = minHeap_.top();
// range tombstone start keys from the same SSTable should have been
// exhausted
assert(!range_tombstone_iters_[current->level] ||
!range_tombstone_iters_[current->level]->Valid());
// iter is a LevelIterator, and it enters a new SST file in the Next()
// call here.
current->iter.Next();
if (current->iter.Valid()) {
assert(current->iter.status().ok());
minHeap_.replace_top(current);
} else {
minHeap_.pop();
}
if (range_tombstone_iters_[current->level]) {
InsertRangeTombstoneAtLevel(current->level);
}
}
}
void CompactionMergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
if (child->iter.Valid()) {
assert(child->iter.status().ok());
minHeap_.push(child);
} else {
considerStatus(child->iter.status());
}
}
InternalIterator* NewCompactionMergingIterator(
const InternalKeyComparator* comparator, InternalIterator** children, int n,
std::vector<std::pair<TruncatedRangeDelIterator*,
TruncatedRangeDelIterator***>>& range_tombstone_iters,
Arena* arena) {
assert(n >= 0);
if (n == 0) {
return NewEmptyInternalIterator<Slice>(arena);
} else {
if (arena == nullptr) {
return new CompactionMergingIterator(comparator, children, n, false,
range_tombstone_iters);
} else {
auto mem = arena->AllocateAligned(sizeof(CompactionMergingIterator));
return new (mem) CompactionMergingIterator(comparator, children, n, true,
range_tombstone_iters);
}
}
}
} // namespace ROCKSDB_NAMESPACE