Reduce heap operations for range tombstone keys in iterator (#10877)

Summary:
Right now in MergingIterator, for each range tombstone start and end key, we pop one end from heap and push the other end into the heap. This involves extra downheap and upheap cost. In the likely cases when a range tombstone iterator emits relatively adjacent keys, these keys should have similar order within all keys in the heap. This can happen when there is a burst of consecutive range tombstones, and most of the keys covered by them are dropped already. This PR uses `replace_top()` when inserting new range tombstone keys, which is more efficient in these common cases.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10877

Test Plan:
- existing UT
- ran all flavors of stress test through sandcastle
- benchmark:
```
# Set up: --writes_per_range_tombstone=1 means one point write and one delete range

TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=fillseq,levelstats --writes_per_range_tombstone=1 --max_num_range_tombstones=1000000 --range_tombstone_width=2 --num=100000000 --writes=800000 --max_bytes_for_level_base=4194304 --disable_auto_compactions --write_buffer_size=33554432 --key_size=64

Level Files Size(MB)
--------------------
  0        8      152
  1        0        0
  2        0        0
  3        0        0
  4        0        0
  5        0        0
  6        0        0

# Benchmark
TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone/ ./db_bench --benchmarks=readseq[-W1][-X5],levelstats --use_existing_db=true --cache_size=3221225472 --num=100000000 --reads=1000000 --disable_auto_compactions=true --avoid_flush_during_recovery=true

# Pre PR
readseq [AVG    5 runs] : 1432116 (± 59664) ops/sec;  224.0 (± 9.3) MB/sec
readseq [MEDIAN 5 runs] : 1454886 ops/sec;  227.5 MB/sec

# Post PR
readseq [AVG    5 runs] : 1944425 (± 29521) ops/sec;  304.1 (± 4.6) MB/sec
readseq [MEDIAN 5 runs] : 1959430 ops/sec;  306.5 MB/sec
```

Reviewed By: ajkr

Differential Revision: D40710936

Pulled By: cbi42

fbshipit-source-id: cb782fb9cdcd26c0c3eb9443215a4ef4d2f79022
main
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent 3e686c7cbe
commit 56715350d9
  1. 2
      HISTORY.md
  2. 40
      table/merging_iterator.cc

@ -1,5 +1,7 @@
# Rocksdb Change Log # Rocksdb Change Log
## Unreleased ## Unreleased
### Performance Improvements
* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
## 7.8.0 (10/22/2022) ## 7.8.0 (10/22/2022)
### New Features ### New Features

@ -197,7 +197,8 @@ class MergingIterator : public InternalIterator {
// Add range_tombstone_iters_[level] into min heap. // Add range_tombstone_iters_[level] into min heap.
// Updates active_ if the end key of a range tombstone is inserted. // Updates active_ if the end key of a range tombstone is inserted.
// @param start_key specifies which end point of the range tombstone to add. // @param start_key specifies which end point of the range tombstone to add.
void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true) { void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
bool replace_top = false) {
assert(!range_tombstone_iters_.empty() && assert(!range_tombstone_iters_.empty() &&
range_tombstone_iters_[level]->Valid()); range_tombstone_iters_[level]->Valid());
if (start_key) { if (start_key) {
@ -211,13 +212,18 @@ class MergingIterator : public InternalIterator {
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END; pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
active_.insert(level); active_.insert(level);
} }
minHeap_.push(&pinned_heap_item_[level]); if (replace_top) {
minHeap_.replace_top(&pinned_heap_item_[level]);
} else {
minHeap_.push(&pinned_heap_item_[level]);
}
} }
// Add range_tombstone_iters_[level] into max heap. // Add range_tombstone_iters_[level] into max heap.
// Updates active_ if the start key of a range tombstone is inserted. // Updates active_ if the start key of a range tombstone is inserted.
// @param end_key specifies which end point of the range tombstone to add. // @param end_key specifies which end point of the range tombstone to add.
void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true) { void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true,
bool replace_top = false) {
assert(!range_tombstone_iters_.empty() && assert(!range_tombstone_iters_.empty() &&
range_tombstone_iters_[level]->Valid()); range_tombstone_iters_[level]->Valid());
if (end_key) { if (end_key) {
@ -231,7 +237,11 @@ class MergingIterator : public InternalIterator {
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START; pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
active_.insert(level); active_.insert(level);
} }
maxHeap_->push(&pinned_heap_item_[level]); if (replace_top) {
maxHeap_->replace_top(&pinned_heap_item_[level]);
} else {
maxHeap_->push(&pinned_heap_item_[level]);
}
} }
// Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START
@ -241,10 +251,9 @@ class MergingIterator : public InternalIterator {
void PopDeleteRangeStart() { void PopDeleteRangeStart() {
while (!minHeap_.empty() && while (!minHeap_.empty() &&
minHeap_.top()->type == HeapItem::DELETE_RANGE_START) { minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
auto level = minHeap_.top()->level;
minHeap_.pop();
// insert end key of this range tombstone and updates active_ // insert end key of this range tombstone and updates active_
InsertRangeTombstoneToMinHeap(level, false /* start_key */); InsertRangeTombstoneToMinHeap(
minHeap_.top()->level, false /* start_key */, true /* replace_top */);
} }
} }
@ -255,10 +264,9 @@ class MergingIterator : public InternalIterator {
void PopDeleteRangeEnd() { void PopDeleteRangeEnd() {
while (!maxHeap_->empty() && while (!maxHeap_->empty() &&
maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) { maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
auto level = maxHeap_->top()->level;
maxHeap_->pop();
// insert start key of this range tombstone and updates active_ // insert start key of this range tombstone and updates active_
InsertRangeTombstoneToMaxHeap(level, false /* end_key */); InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
true /* replace_top */);
} }
} }
@ -766,13 +774,15 @@ bool MergingIterator::SkipNextDeleted() {
// - range deletion end key // - range deletion end key
auto current = minHeap_.top(); auto current = minHeap_.top();
if (current->type == HeapItem::DELETE_RANGE_END) { if (current->type == HeapItem::DELETE_RANGE_END) {
minHeap_.pop();
active_.erase(current->level); active_.erase(current->level);
assert(range_tombstone_iters_[current->level] && assert(range_tombstone_iters_[current->level] &&
range_tombstone_iters_[current->level]->Valid()); range_tombstone_iters_[current->level]->Valid());
range_tombstone_iters_[current->level]->Next(); range_tombstone_iters_[current->level]->Next();
if (range_tombstone_iters_[current->level]->Valid()) { if (range_tombstone_iters_[current->level]->Valid()) {
InsertRangeTombstoneToMinHeap(current->level); InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
true /* replace_top */);
} else {
minHeap_.pop();
} }
return true /* current key deleted */; return true /* current key deleted */;
} }
@ -981,13 +991,15 @@ bool MergingIterator::SkipPrevDeleted() {
// - range deletion start key // - range deletion start key
auto current = maxHeap_->top(); auto current = maxHeap_->top();
if (current->type == HeapItem::DELETE_RANGE_START) { if (current->type == HeapItem::DELETE_RANGE_START) {
maxHeap_->pop();
active_.erase(current->level); active_.erase(current->level);
assert(range_tombstone_iters_[current->level] && assert(range_tombstone_iters_[current->level] &&
range_tombstone_iters_[current->level]->Valid()); range_tombstone_iters_[current->level]->Valid());
range_tombstone_iters_[current->level]->Prev(); range_tombstone_iters_[current->level]->Prev();
if (range_tombstone_iters_[current->level]->Valid()) { if (range_tombstone_iters_[current->level]->Valid()) {
InsertRangeTombstoneToMaxHeap(current->level); InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
true /* replace_top */);
} else {
maxHeap_->pop();
} }
return true /* current key deleted */; return true /* current key deleted */;
} }

Loading…
Cancel
Save