From 56715350d9ec660298e39f7aa7ff26b6172d707c Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Thu, 27 Oct 2022 14:28:50 -0700 Subject: [PATCH] Reduce heap operations for range tombstone keys in iterator (#10877) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Right now in MergingIterator, for each range tombstone start and end key, we pop one end from heap and push the other end into the heap. This involves extra downheap and upheap cost. In the likely cases when a range tombstone iterator emits relatively adjacent keys, these keys should have similar order within all keys in the heap. This can happen when there is a burst of consecutive range tombstones, and most of the keys covered by them are dropped already. This PR uses `replace_top()` when inserting new range tombstone keys, which is more efficient in these common cases. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10877 Test Plan: - existing UT - ran all flavors of stress test through sandcastle - benchmark: ``` # Set up: --writes_per_range_tombstone=1 means one point write and one delete range TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=fillseq,levelstats --writes_per_range_tombstone=1 --max_num_range_tombstones=1000000 --range_tombstone_width=2 --num=100000000 --writes=800000 --max_bytes_for_level_base=4194304 --disable_auto_compactions --write_buffer_size=33554432 --key_size=64 Level Files Size(MB) -------------------- 0 8 152 1 0 0 2 0 0 3 0 0 4 0 0 5 0 0 6 0 0 # Benchmark TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone/ ./db_bench --benchmarks=readseq[-W1][-X5],levelstats --use_existing_db=true --cache_size=3221225472 --num=100000000 --reads=1000000 --disable_auto_compactions=true --avoid_flush_during_recovery=true # Pre PR readseq [AVG 5 runs] : 1432116 (± 59664) ops/sec; 224.0 (± 9.3) MB/sec readseq [MEDIAN 5 runs] : 1454886 ops/sec; 227.5 MB/sec # Post PR readseq [AVG 5 runs] : 1944425 (± 29521) ops/sec; 304.1 (± 4.6) MB/sec readseq [MEDIAN 5 runs] : 1959430 ops/sec; 306.5 MB/sec ``` Reviewed By: ajkr Differential Revision: D40710936 Pulled By: cbi42 fbshipit-source-id: cb782fb9cdcd26c0c3eb9443215a4ef4d2f79022 --- HISTORY.md | 2 ++ table/merging_iterator.cc | 40 +++++++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 8a690db9d..308954c2f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased +### Performance Improvements +* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877). ## 7.8.0 (10/22/2022) ### New Features diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index b712e935e..e681860e5 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -197,7 +197,8 @@ class MergingIterator : public InternalIterator { // Add range_tombstone_iters_[level] into min heap. // Updates active_ if the end key of a range tombstone is inserted. // @param start_key specifies which end point of the range tombstone to add. - void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true) { + void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true, + bool replace_top = false) { assert(!range_tombstone_iters_.empty() && range_tombstone_iters_[level]->Valid()); if (start_key) { @@ -211,13 +212,18 @@ class MergingIterator : public InternalIterator { pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END; active_.insert(level); } - minHeap_.push(&pinned_heap_item_[level]); + if (replace_top) { + minHeap_.replace_top(&pinned_heap_item_[level]); + } else { + minHeap_.push(&pinned_heap_item_[level]); + } } // Add range_tombstone_iters_[level] into max heap. // Updates active_ if the start key of a range tombstone is inserted. // @param end_key specifies which end point of the range tombstone to add. - void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true) { + void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true, + bool replace_top = false) { assert(!range_tombstone_iters_.empty() && range_tombstone_iters_[level]->Valid()); if (end_key) { @@ -231,7 +237,11 @@ class MergingIterator : public InternalIterator { pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START; active_.insert(level); } - maxHeap_->push(&pinned_heap_item_[level]); + if (replace_top) { + maxHeap_->replace_top(&pinned_heap_item_[level]); + } else { + maxHeap_->push(&pinned_heap_item_[level]); + } } // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START @@ -241,10 +251,9 @@ class MergingIterator : public InternalIterator { void PopDeleteRangeStart() { while (!minHeap_.empty() && minHeap_.top()->type == HeapItem::DELETE_RANGE_START) { - auto level = minHeap_.top()->level; - minHeap_.pop(); // insert end key of this range tombstone and updates active_ - InsertRangeTombstoneToMinHeap(level, false /* start_key */); + InsertRangeTombstoneToMinHeap( + minHeap_.top()->level, false /* start_key */, true /* replace_top */); } } @@ -255,10 +264,9 @@ class MergingIterator : public InternalIterator { void PopDeleteRangeEnd() { while (!maxHeap_->empty() && maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) { - auto level = maxHeap_->top()->level; - maxHeap_->pop(); // insert start key of this range tombstone and updates active_ - InsertRangeTombstoneToMaxHeap(level, false /* end_key */); + InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */, + true /* replace_top */); } } @@ -766,13 +774,15 @@ bool MergingIterator::SkipNextDeleted() { // - range deletion end key auto current = minHeap_.top(); if (current->type == HeapItem::DELETE_RANGE_END) { - minHeap_.pop(); active_.erase(current->level); assert(range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()); range_tombstone_iters_[current->level]->Next(); if (range_tombstone_iters_[current->level]->Valid()) { - InsertRangeTombstoneToMinHeap(current->level); + InsertRangeTombstoneToMinHeap(current->level, true /* start_key */, + true /* replace_top */); + } else { + minHeap_.pop(); } return true /* current key deleted */; } @@ -981,13 +991,15 @@ bool MergingIterator::SkipPrevDeleted() { // - range deletion start key auto current = maxHeap_->top(); if (current->type == HeapItem::DELETE_RANGE_START) { - maxHeap_->pop(); active_.erase(current->level); assert(range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()); range_tombstone_iters_[current->level]->Prev(); if (range_tombstone_iters_[current->level]->Valid()) { - InsertRangeTombstoneToMaxHeap(current->level); + InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */, + true /* replace_top */); + } else { + maxHeap_->pop(); } return true /* current key deleted */; }