From 534fb06dd38234a2d6234d17366a7b0ac5272f48 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Wed, 23 Nov 2022 14:27:14 -0800
Subject: [PATCH] Prevent iterating over range tombstones beyond
 `iterate_upper_bound` (#10966)

Summary:
Currently, `iterate_upper_bound` is not checked for range tombstone keys in
MergingIterator. This may impact performance when there is a large number of
range tombstones right after `iterate_upper_bound`. This PR fixes this issue
by checking `iterate_upper_bound` in MergingIterator for range tombstone keys.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10966

Test Plan:
- added unit test
- stress test: `python3 tools/db_crashtest.py whitebox --simple --verify_iterator_with_expected_state_one_in=5 --delrangepercent=5 --prefixpercent=18 --writepercent=48 --readpercent=15 --duration=36000 --range_deletion_width=100`
- ran different stress tests over sandcastle
- Falcon team ran some test traffic and saw reduced CPU usage on processing range tombstones.

Reviewed By: ajkr

Differential Revision: D41414172

Pulled By: cbi42

fbshipit-source-id: 9b2c29eb3abb99327c6a649bdc412e70d863f981
---
 HISTORY.md                |  3 +++
 db/db_impl/db_impl.cc     |  3 ++-
 db/db_range_del_test.cc   | 40 +++++++++++++++++++++++++++++
 include/rocksdb/options.h |  2 +-
 table/merging_iterator.cc | 53 +++++++++++++++++++++++++++++++--------
 table/merging_iterator.h  |  3 ++-
 6 files changed, 91 insertions(+), 13 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 571bc565a..316276e18 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,9 @@
 ### Behavior changes
 * Make best-efforts recovery verify SST unique ID before Version construction (#10962)
 
+### Bug Fixes
+* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed.
+
 ## 7.9.0 (11/21/2022)
 ### Performance Improvements
 * Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index d34d7cfb2..a431111d4 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1823,7 +1823,8 @@ InternalIterator* DBImpl::NewInternalIterator(
   MergeIteratorBuilder merge_iter_builder(
       &cfd->internal_comparator(), arena,
       !read_options.total_order_seek &&
-          super_version->mutable_cf_options.prefix_extractor != nullptr);
+          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      read_options.iterate_upper_bound);
   // Collect iterator for mutable memtable
   auto mem_iter = super_version->mem->NewIterator(read_options, arena);
   Status s;
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index abd9162bd..d576f2217 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -2756,6 +2756,46 @@ TEST_F(DBRangeDelTest, RefreshMemtableIter) {
   ASSERT_OK(iter->Refresh());
 }
 
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+  // Memtable: a, [b, bz)
+  // Do a Seek on `a` with iterate_upper_bound being az
+  // range tombstone [b, bz) should not be processed (added to and
+  // popped from the min_heap in MergingIterator).
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "bar"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+  // I could not find a cleaner way to test this without relying on
+  // implementation detail. Tried to test the value of
+  // `internal_range_del_reseek_count` but that did not work
+  // since BlockBasedTable iterator becomes !Valid() when point key
+  // is out of bound and that reseek only happens when a point key
+  // is covered by some range tombstone.
+  SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+                                        [](void*) {
+                                          // there should not be any range
+                                          // tombstone in the heap.
+                                          FAIL();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions read_opts;
+  std::string upper_bound = "az";
+  Slice upper_bound_slice = upper_bound;
+  read_opts.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "a");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
 #endif  // ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index cd6644c00..7a4d8b5a6 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1487,7 +1487,7 @@ struct ReadOptions {
   const Slice* iterate_lower_bound;
 
   // "iterate_upper_bound" defines the extent up to which the forward iterator
-  // can returns entries. Once the bound is reached, Valid() will be false.
+  // can return entries. Once the bound is reached, Valid() will be false.
   // "iterate_upper_bound" is exclusive ie the bound value is
   // not a valid entry. If prefix_extractor is not null:
   // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index 7300a630a..beb35ea9a 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -117,14 +117,16 @@ class MergingIterator : public InternalIterator {
  public:
   MergingIterator(const InternalKeyComparator* comparator,
                   InternalIterator** children, int n, bool is_arena_mode,
-                  bool prefix_seek_mode)
+                  bool prefix_seek_mode,
+                  const Slice* iterate_upper_bound = nullptr)
       : is_arena_mode_(is_arena_mode),
         prefix_seek_mode_(prefix_seek_mode),
         direction_(kForward),
         comparator_(comparator),
         current_(nullptr),
         minHeap_(comparator_),
-        pinned_iters_mgr_(nullptr) {
+        pinned_iters_mgr_(nullptr),
+        iterate_upper_bound_(iterate_upper_bound) {
     children_.resize(n);
     for (int i = 0; i < n; i++) {
       children_[i].level = i;
@@ -202,11 +204,26 @@ class MergingIterator : public InternalIterator {
     assert(!range_tombstone_iters_.empty() &&
            range_tombstone_iters_[level]->Valid());
     if (start_key) {
-      pinned_heap_item_[level].SetTombstoneKey(
-          range_tombstone_iters_[level]->start_key());
+      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
+      // iterate_upper_bound does not have timestamp
+      if (iterate_upper_bound_ &&
+          comparator_->user_comparator()->CompareWithoutTimestamp(
+              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
+              false /* b_has_ts */) >= 0) {
+        if (replace_top) {
+          // replace_top implies this range tombstone iterator is still in
+          // minHeap_ and at the top.
+          minHeap_.pop();
+        }
+        return;
+      }
+      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
       pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
       assert(active_.count(level) == 0);
     } else {
+      // allow end key to go over upper bound (if present) since start key is
+      // before upper bound and the range tombstone could still cover a
+      // range before upper bound.
       pinned_heap_item_[level].SetTombstoneKey(
           range_tombstone_iters_[level]->end_key());
       pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
@@ -251,6 +268,7 @@ class MergingIterator : public InternalIterator {
   void PopDeleteRangeStart() {
     while (!minHeap_.empty() &&
            minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
+      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
       // insert end key of this range tombstone and updates active_
       InsertRangeTombstoneToMinHeap(
           minHeap_.top()->level, false /* start_key */, true /* replace_top */);
@@ -573,6 +591,10 @@ class MergingIterator : public InternalIterator {
   std::unique_ptr<MergerMaxIterHeap> maxHeap_;
   PinnedIteratorsManager* pinned_iters_mgr_;
 
+  // Used to bound range tombstones. For point keys, DBIter and SSTable
+  // iterator take care of boundary checking.
+  const Slice* iterate_upper_bound_;
+
   // In forward direction, process a child that is not in the min heap.
   // If valid, add to the min heap. Otherwise, check status.
   void AddToMinHeapOrCheckStatus(HeapItem*);
@@ -634,9 +656,19 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
   for (size_t level = 0; level < starting_level; ++level) {
     if (range_tombstone_iters_[level] &&
         range_tombstone_iters_[level]->Valid()) {
-      assert(static_cast<bool>(active_.count(level)) ==
-             (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END));
-      minHeap_.push(&pinned_heap_item_[level]);
+      // use an iterator on active_ if performance becomes an issue here
+      if (active_.count(level) > 0) {
+        assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
+        // if it was active, then start key must be within upper_bound,
+        // so we can add to minHeap_ directly.
+        minHeap_.push(&pinned_heap_item_[level]);
+      } else {
+        // this takes care of checking iterate_upper_bound, but with an extra
+        // key comparison if range_tombstone_iters_[level] was already out of
+        // bound. Consider using a new HeapItem type or some flag to remember
+        // boundary checking result.
+        InsertRangeTombstoneToMinHeap(level);
+      }
     } else {
       assert(!active_.count(level));
     }
@@ -1280,11 +1312,12 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
 }
 
 MergeIteratorBuilder::MergeIteratorBuilder(
-    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode)
+    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
+    const Slice* iterate_upper_bound)
     : first_iter(nullptr), use_merging_iter(false), arena(a) {
   auto mem = arena->AllocateAligned(sizeof(MergingIterator));
-  merge_iter =
-      new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode);
+  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
+                                         prefix_seek_mode, iterate_upper_bound);
 }
 
 MergeIteratorBuilder::~MergeIteratorBuilder() {
diff --git a/table/merging_iterator.h b/table/merging_iterator.h
index 0d66a76fe..16fc0877e 100644
--- a/table/merging_iterator.h
+++ b/table/merging_iterator.h
@@ -45,7 +45,8 @@ class MergeIteratorBuilder {
   // comparator: the comparator used in merging comparator
   // arena: where the merging iterator needs to be allocated from.
   explicit MergeIteratorBuilder(const InternalKeyComparator* comparator,
-                                Arena* arena, bool prefix_seek_mode = false);
+                                Arena* arena, bool prefix_seek_mode = false,
+                                const Slice* iterate_upper_bound = nullptr);
   ~MergeIteratorBuilder();
 
   // Add iter to the merging iterator.
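
Editor's note: the standalone sketch below (not part of the patch) illustrates
the bound check that InsertRangeTombstoneToMinHeap() now performs on range
tombstone start keys. The RangeTombstone struct, the StartKeyWithinUpperBound()
helper, and the plain std::string keys are simplified stand-ins for RocksDB's
ParsedInternalKey and comparator machinery; only the comparison logic is meant
to mirror the patch.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Simplified stand-in for a range deletion covering [start_key, end_key).
struct RangeTombstone {
  std::string start_key;  // inclusive
  std::string end_key;    // exclusive
};

// Returns true if the tombstone's start key should be pushed into the
// min-heap. Mirrors the new check: when the start key is at or past the
// exclusive iterate_upper_bound, a forward iterator can never reach any key
// the tombstone covers, so the tombstone is skipped entirely.
bool StartKeyWithinUpperBound(const RangeTombstone& t,
                              const std::optional<std::string>& upper_bound) {
  if (!upper_bound.has_value()) {
    return true;  // no bound configured: always process the tombstone
  }
  // iterate_upper_bound is exclusive, so start_key == bound is also out.
  return t.start_key < *upper_bound;
}

int main() {
  std::optional<std::string> upper_bound = "az";
  std::vector<RangeTombstone> tombstones = {
      {"a", "ab"},   // starts before the bound: pushed to the heap
      {"az", "bz"},  // starts exactly at the (exclusive) bound: skipped
      {"b", "bz"},   // starts past the bound: skipped (the unit test's case)
  };
  for (const auto& t : tombstones) {
    std::cout << '[' << t.start_key << ", " << t.end_key << ") -> "
              << (StartKeyWithinUpperBound(t, upper_bound) ? "push" : "skip")
              << '\n';
  }
  return 0;
}

Note that once a start key passes this check, the matching end key is pushed
unconditionally: the end key may lie past the bound while the tombstone still
covers keys before it, which is why only start keys are checked (see the
"allow end key to go over upper bound" comment in the patch).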