Prevent iterating over range tombstones beyond `iterate_upper_bound` (#10966)

Summary:
Currently, `iterate_upper_bound` is not checked for range tombstone keys in MergingIterator, so the iterator can process range tombstones that lie entirely beyond the bound. This can hurt performance when there is a large number of range tombstones right after `iterate_upper_bound`. This PR fixes the issue by checking range tombstone start keys against `iterate_upper_bound` in MergingIterator.
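
For illustration, here is a minimal sketch of the access pattern this change targets, written against the public RocksDB C++ API. The database path, key names, and tombstone count are made up for the example and are not taken from the PR:

```cpp
#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

// Sketch: a forward scan bounded by iterate_upper_bound while many range
// tombstones live entirely beyond the bound. Before this fix, MergingIterator
// could still pull those tombstones into its heap; with the fix they are
// skipped because their start keys compare >= iterate_upper_bound.
int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/upper_bound_demo", &db);
  assert(s.ok());

  // One point key inside the bound, range tombstones entirely after it.
  s = db->Put(rocksdb::WriteOptions(), "a", "value");
  assert(s.ok());
  for (int i = 0; i < 100; i++) {
    std::string start = "b" + std::to_string(i);
    std::string end = start + "z";
    s = db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                        start, end);
    assert(s.ok());
  }

  // Scan with an exclusive upper bound that excludes every tombstone above.
  std::string upper = "az";
  rocksdb::Slice upper_slice(upper);
  rocksdb::ReadOptions read_opts;
  read_opts.iterate_upper_bound = &upper_slice;
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_opts));
  for (it->Seek("a"); it->Valid(); it->Next()) {
    // Only "a" is visited; tombstones starting at or past "az" should not
    // be added to MergingIterator's min heap.
  }
  assert(it->status().ok());

  delete db;
  return 0;
}
```

With the fix, a scan like the one above should no longer push out-of-bound range tombstone start keys into MergingIterator's min heap, which is what the unit test in the diff verifies via a sync point.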

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10966

Test Plan:
- added unit test
- stress test: `python3 tools/db_crashtest.py whitebox --simple --verify_iterator_with_expected_state_one_in=5 --delrangepercent=5 --prefixpercent=18 --writepercent=48 --readpercent=15 --duration=36000 --range_deletion_width=100`
- ran different stress tests over sandcastle
- The Falcon team ran some test traffic and saw reduced CPU usage when processing range tombstones.

Reviewed By: ajkr

Differential Revision: D41414172

Pulled By: cbi42

fbshipit-source-id: 9b2c29eb3abb99327c6a649bdc412e70d863f981
Commit 534fb06dd3 (parent 54c2542df2) on main, authored by Changyu Bi and committed by Facebook GitHub Bot.
Files changed (lines changed):
- HISTORY.md (3)
- db/db_impl/db_impl.cc (3)
- db/db_range_del_test.cc (40)
- include/rocksdb/options.h (2)
- table/merging_iterator.cc (53)
- table/merging_iterator.h (3)

HISTORY.md
@@ -3,6 +3,9 @@
 ### Behavior changes
 * Make best-efforts recovery verify SST unique ID before Version construction (#10962)
+### Bug Fixes
+* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed.
 
 ## 7.9.0 (11/21/2022)
 ### Performance Improvements
 * Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).

db/db_impl/db_impl.cc
@@ -1823,7 +1823,8 @@ InternalIterator* DBImpl::NewInternalIterator(
   MergeIteratorBuilder merge_iter_builder(
       &cfd->internal_comparator(), arena,
       !read_options.total_order_seek &&
-          super_version->mutable_cf_options.prefix_extractor != nullptr);
+          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      read_options.iterate_upper_bound);
   // Collect iterator for mutable memtable
   auto mem_iter = super_version->mem->NewIterator(read_options, arena);
   Status s;

db/db_range_del_test.cc
@@ -2756,6 +2756,46 @@ TEST_F(DBRangeDelTest, RefreshMemtableIter) {
   ASSERT_OK(iter->Refresh());
 }
 
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+  // Memtable: a, [b, bz)
+  // Do a Seek on `a` with iterate_upper_bound being az
+  // range tombstone [b, bz) should not be processed (added to and
+  // popped from the min_heap in MergingIterator).
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "bar"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+  // I could not find a cleaner way to test this without relying on
+  // implementation detail. Tried to test the value of
+  // `internal_range_del_reseek_count` but that did not work
+  // since BlockBasedTable iterator becomes !Valid() when point key
+  // is out of bound and that reseek only happens when a point key
+  // is covered by some range tombstone.
+  SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+                                        [](void*) {
+                                          // there should not be any range
+                                          // tombstone in the heap.
+                                          FAIL();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions read_opts;
+  std::string upper_bound = "az";
+  Slice upper_bound_slice = upper_bound;
+  read_opts.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "a");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
 #endif  // ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE

include/rocksdb/options.h
@@ -1487,7 +1487,7 @@ struct ReadOptions {
   const Slice* iterate_lower_bound;
 
   // "iterate_upper_bound" defines the extent up to which the forward iterator
-  // can returns entries. Once the bound is reached, Valid() will be false.
+  // can return entries. Once the bound is reached, Valid() will be false.
   // "iterate_upper_bound" is exclusive ie the bound value is
   // not a valid entry. If prefix_extractor is not null:
   // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used

table/merging_iterator.cc
@@ -117,14 +117,16 @@ class MergingIterator : public InternalIterator {
  public:
   MergingIterator(const InternalKeyComparator* comparator,
                   InternalIterator** children, int n, bool is_arena_mode,
-                  bool prefix_seek_mode)
+                  bool prefix_seek_mode,
+                  const Slice* iterate_upper_bound = nullptr)
       : is_arena_mode_(is_arena_mode),
         prefix_seek_mode_(prefix_seek_mode),
         direction_(kForward),
         comparator_(comparator),
         current_(nullptr),
         minHeap_(comparator_),
-        pinned_iters_mgr_(nullptr) {
+        pinned_iters_mgr_(nullptr),
+        iterate_upper_bound_(iterate_upper_bound) {
     children_.resize(n);
     for (int i = 0; i < n; i++) {
       children_[i].level = i;
@@ -202,11 +204,26 @@ class MergingIterator : public InternalIterator {
     assert(!range_tombstone_iters_.empty() &&
            range_tombstone_iters_[level]->Valid());
     if (start_key) {
-      pinned_heap_item_[level].SetTombstoneKey(
-          range_tombstone_iters_[level]->start_key());
+      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
+      // iterate_upper_bound does not have timestamp
+      if (iterate_upper_bound_ &&
+          comparator_->user_comparator()->CompareWithoutTimestamp(
+              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
+              false /* b_has_ts */) >= 0) {
+        if (replace_top) {
+          // replace_top implies this range tombstone iterator is still in
+          // minHeap_ and at the top.
+          minHeap_.pop();
+        }
+        return;
+      }
+      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
       pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
       assert(active_.count(level) == 0);
     } else {
+      // allow end key to go over upper bound (if present) since start key is
+      // before upper bound and the range tombstone could still cover a
+      // range before upper bound.
       pinned_heap_item_[level].SetTombstoneKey(
           range_tombstone_iters_[level]->end_key());
       pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
@@ -251,6 +268,7 @@ class MergingIterator : public InternalIterator {
   void PopDeleteRangeStart() {
     while (!minHeap_.empty() &&
           minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
+      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
       // insert end key of this range tombstone and updates active_
       InsertRangeTombstoneToMinHeap(
           minHeap_.top()->level, false /* start_key */, true /* replace_top */);
@@ -573,6 +591,10 @@ class MergingIterator : public InternalIterator {
   std::unique_ptr<MergerMaxIterHeap> maxHeap_;
   PinnedIteratorsManager* pinned_iters_mgr_;
 
+  // Used to bound range tombstones. For point keys, DBIter and SSTable
+  // iterator take care of boundary checking.
+  const Slice* iterate_upper_bound_;
+
   // In forward direction, process a child that is not in the min heap.
   // If valid, add to the min heap. Otherwise, check status.
   void AddToMinHeapOrCheckStatus(HeapItem*);
@@ -634,9 +656,19 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
   for (size_t level = 0; level < starting_level; ++level) {
     if (range_tombstone_iters_[level] &&
         range_tombstone_iters_[level]->Valid()) {
-      assert(static_cast<bool>(active_.count(level)) ==
-             (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END));
-      minHeap_.push(&pinned_heap_item_[level]);
+      // use an iterator on active_ if performance becomes an issue here
+      if (active_.count(level) > 0) {
+        assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
+        // if it was active, then start key must be within upper_bound,
+        // so we can add to minHeap_ directly.
+        minHeap_.push(&pinned_heap_item_[level]);
+      } else {
+        // this takes care of checking iterate_upper_bound, but with an extra
+        // key comparison if range_tombstone_iters_[level] was already out of
+        // bound. Consider using a new HeapItem type or some flag to remember
+        // boundary checking result.
+        InsertRangeTombstoneToMinHeap(level);
+      }
     } else {
       assert(!active_.count(level));
     }
@@ -1280,11 +1312,12 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
 }
 
 MergeIteratorBuilder::MergeIteratorBuilder(
-    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode)
+    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
+    const Slice* iterate_upper_bound)
     : first_iter(nullptr), use_merging_iter(false), arena(a) {
   auto mem = arena->AllocateAligned(sizeof(MergingIterator));
-  merge_iter =
-      new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode);
+  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
+                                         prefix_seek_mode, iterate_upper_bound);
 }
 
 MergeIteratorBuilder::~MergeIteratorBuilder() {

table/merging_iterator.h
@@ -45,7 +45,8 @@ class MergeIteratorBuilder {
   // comparator: the comparator used in merging comparator
   // arena: where the merging iterator needs to be allocated from.
   explicit MergeIteratorBuilder(const InternalKeyComparator* comparator,
-                                Arena* arena, bool prefix_seek_mode = false);
+                                Arena* arena, bool prefix_seek_mode = false,
+                                const Slice* iterate_upper_bound = nullptr);
   ~MergeIteratorBuilder();
 
   // Add iter to the merging iterator.
