From 6cdb7af9f894ebb0dba2bae3da8c686b24539c07 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Mon, 28 Nov 2022 19:27:22 -0800
Subject: [PATCH] Remove copying of range tombstone keys in iterator (#10878)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
In MergingIterator, when a range tombstone's start or end key is added to
minHeap/maxHeap, the key is copied. This PR removes the copying of range
tombstone keys by adding `InternalKeyComparator` overloads that compare an
internal key `Slice` against a `ParsedInternalKey` directly.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10878

Test Plan:
- existing UT
- ran all flavors of stress test through sandcastle
- benchmarks: I did not see an improvement when compiling with
  DEBUG_LEVEL=0, and the results were noisy. With
  `OPTIMIZE_LEVEL="-O3" USE_LTO=1` I do see an improvement.
```
# Favorable setup: half of the writes are DeleteRange.
TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=fillseq,levelstats --writes_per_range_tombstone=1 --max_num_range_tombstones=1000000 --range_tombstone_width=2 --num=1000000 --max_bytes_for_level_base=4194304 --disable_auto_compactions --write_buffer_size=33554432 --key_size=50

# benchmark command
TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=readseq[-W1][-X5],levelstats --use_existing_db=true --cache_size=3221225472 --disable_auto_compactions=true --avoid_flush_during_recovery=true --seek_nexts=100 --reads=1000000 --num=1000000 --threads=25

# main
readseq [AVG    5 runs] : 26017977 (± 371077) ops/sec; 3721.9 (± 53.1) MB/sec
readseq [MEDIAN 5 runs] : 26096905 ops/sec; 3733.2 MB/sec

# this PR
readseq [AVG    5 runs] : 27481724 (± 568758) ops/sec; 3931.3 (± 81.4) MB/sec
readseq [MEDIAN 5 runs] : 27323957 ops/sec; 3908.7 MB/sec
```

Reviewed By: ajkr

Differential Revision: D40711170

Pulled By: cbi42

fbshipit-source-id: 708cb584e2bd085a9ce0d2ef6a420489f721717f
---
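Note on the technique: an encoded internal key is the user key followed by an
8-byte footer packing `(sequence << 8) | type`, and internal ordering is by
increasing user key, then decreasing footer. An encoded key can therefore be
compared against a `ParsedInternalKey` in place, with no re-encoding into a
`std::string`. Below is a minimal standalone sketch of that comparison;
`Slice` is approximated by `std::string_view` and the user comparator by
bytewise comparison, so the types and values are simplified stand-ins, not
RocksDB's actual definitions.

```
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <string_view>

constexpr size_t kNumInternalBytes = 8;  // footer: (sequence << 8) | type

enum ValueType : uint8_t { kTypeValue = 0x1 };

struct ParsedInternalKey {
  std::string_view user_key;
  uint64_t sequence;
  ValueType type;
};

uint64_t DecodeFixed64(const char* ptr) {  // little-endian host assumed
  uint64_t v;
  std::memcpy(&v, ptr, sizeof(v));
  return v;
}

// Compare an encoded internal key `a` against a parsed key `b` without
// re-encoding `b` or parsing `a`: user key ascending, then the packed
// (sequence << 8) | type footer *descending*.
int Compare(std::string_view a, const ParsedInternalKey& b) {
  int r = a.substr(0, a.size() - kNumInternalBytes).compare(b.user_key);
  if (r == 0) {
    const uint64_t anum =
        DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
    const uint64_t bnum = (b.sequence << 8) | b.type;
    if (anum > bnum) {
      r = -1;  // larger footer (newer key) sorts first
    } else if (anum < bnum) {
      r = +1;
    }
  }
  return r;
}

int main() {
  // Encode "foo" at sequence 42 as kTypeValue (little-endian footer).
  std::string encoded("foo");
  const uint64_t footer = (uint64_t{42} << 8) | kTypeValue;
  encoded.append(reinterpret_cast<const char*>(&footer), sizeof(footer));

  assert(Compare(encoded, {"foo", 42, kTypeValue}) == 0);
  assert(Compare(encoded, {"foo", 43, kTypeValue}) > 0);  // newer sorts first
  assert(Compare(encoded, {"bar", 42, kTypeValue}) > 0);  // "foo" > "bar"
  return 0;
}
```
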
 db/dbformat.cc            |  25 +++++++++
 db/dbformat.h             |   2 +
 table/merging_iterator.cc | 104 +++++++++++++++++++++++++-------------
 3 files changed, 96 insertions(+), 35 deletions(-)

diff --git a/db/dbformat.cc b/db/dbformat.cc
index b0ac6c339..2c3581ca0 100644
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -150,6 +150,31 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
   return r;
 }
 
+int InternalKeyComparator::Compare(const Slice& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key);
+  if (r == 0) {
+    const uint64_t anum =
+        DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
+    const uint64_t bnum = (b.sequence << 8) | b.type;
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const Slice& b) const {
+  return -Compare(b, a);
+}
+
 LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
                      const Slice* ts) {
   size_t usize = _user_key.size();
diff --git a/db/dbformat.h b/db/dbformat.h
index 8c1fc7055..d9fadea1c 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -283,6 +283,8 @@ class InternalKeyComparator
   int Compare(const InternalKey& a, const InternalKey& b) const;
   int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+  int Compare(const Slice& a, const ParsedInternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const Slice& b) const;
   // In this `Compare()` overload, the sequence numbers provided in
   // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
   // and `b`, respectively. To disable sequence number override(s), provide the
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index beb35ea9a..309ae69c5 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -43,7 +43,7 @@ struct HeapItem {
   enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
   IteratorWrapper iter;
   size_t level = 0;
-  std::string pinned_key;
+  ParsedInternalKey parsed_ikey;
   // Will be overwritten before use, initialize here so compiler does not
   // complain.
   Type type = ITERATOR;
@@ -54,26 +54,14 @@ struct HeapItem {
   }
 
   void SetTombstoneKey(ParsedInternalKey&& pik) {
-    pinned_key.clear();
-    // Range tombstone end key is exclusive. If a point internal key has the
-    // same user key and sequence number as the start or end key of a range
-    // tombstone, the order will be start < end key < internal key with the
-    // following op_type change. This is helpful to ensure keys popped from
-    // heap are in expected order since range tombstone start/end keys will
-    // be distinct from point internal keys. Strictly speaking, this is only
-    // needed for tombstone end points that are truncated in
-    // TruncatedRangeDelIterator since untruncated tombstone end points always
-    // have kMaxSequenceNumber and kTypeRangeDeletion (see
-    // TruncatedRangeDelIterator::start_key()/end_key()).
-    ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid);
-    AppendInternalKey(&pinned_key, p);
+    // op_type is already initialized in MergingIterator::Finish().
+    parsed_ikey.user_key = pik.user_key;
+    parsed_ikey.sequence = pik.sequence;
   }
 
   Slice key() const {
-    if (type == Type::ITERATOR) {
-      return iter.key();
-    }
-    return pinned_key;
+    assert(type == ITERATOR);
+    return iter.key();
   }
 
   bool IsDeleteRangeSentinelKey() const {
@@ -89,7 +77,19 @@ class MinHeapItemComparator {
   MinHeapItemComparator(const InternalKeyComparator* comparator)
       : comparator_(comparator) {}
   bool operator()(HeapItem* a, HeapItem* b) const {
-    return comparator_->Compare(a->key(), b->key()) > 0;
+    if (LIKELY(a->type == HeapItem::ITERATOR)) {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->key(), b->key()) > 0;
+      } else {
+        return comparator_->Compare(a->key(), b->parsed_ikey) > 0;
+      }
+    } else {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->parsed_ikey, b->key()) > 0;
+      } else {
+        return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0;
+      }
+    }
   }
 
  private:
@@ -101,7 +101,19 @@ class MaxHeapItemComparator {
   MaxHeapItemComparator(const InternalKeyComparator* comparator)
       : comparator_(comparator) {}
   bool operator()(HeapItem* a, HeapItem* b) const {
-    return comparator_->Compare(a->key(), b->key()) < 0;
+    if (LIKELY(a->type == HeapItem::ITERATOR)) {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->key(), b->key()) < 0;
+      } else {
+        return comparator_->Compare(a->key(), b->parsed_ikey) < 0;
+      }
+    } else {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->parsed_ikey, b->key()) < 0;
+      } else {
+        return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0;
+      }
+    }
   }
 
  private:
@@ -177,6 +189,17 @@ class MergingIterator : public InternalIterator {
       pinned_heap_item_.resize(range_tombstone_iters_.size());
       for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
        pinned_heap_item_[i].level = i;
+        // Range tombstone end key is exclusive. If a point internal key has the
+        // same user key and sequence number as the start or end key of a range
+        // tombstone, the order will be start < end key < internal key with the
+        // following op_type change. This is helpful to ensure keys popped from
+        // heap are in expected order since range tombstone start/end keys will
+        // be distinct from point internal keys. Strictly speaking, this is only
+        // needed for tombstone end points that are truncated in
+        // TruncatedRangeDelIterator since untruncated tombstone end points
+        // always have kMaxSequenceNumber and kTypeRangeDeletion (see
+        // TruncatedRangeDelIterator::start_key()/end_key()).
+        pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid;
       }
     }
   }
@@ -824,14 +847,18 @@ bool MergingIterator::SkipNextDeleted() {
     // SetTombstoneKey()).
     assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
            active_.count(current->level) == 0);
-    // LevelIterator enters a new SST file
-    current->iter.Next();
-    if (current->iter.Valid()) {
-      assert(current->iter.status().ok());
-      minHeap_.replace_top(current);
-    } else {
-      minHeap_.pop();
-    }
+    // When entering a new file, old range tombstone iter is freed,
+    // but the last key from that range tombstone iter may still be in the heap.
+    // We need to ensure the data underlying its corresponding key Slice is
+    // still alive. We do so by popping the range tombstone key from heap before
+    // calling iter->Next(). Technically, this change is not needed: if there is
+    // a range tombstone end key that is after file boundary sentinel key in
+    // minHeap_, the range tombstone end key must have been truncated at file
+    // boundary. The underlying data of the range tombstone end key Slice is the
+    // SST file's largest internal key stored as file metadata in Version.
+    // However, since there are too many implicit assumptions made, it is safer
+    // to just ensure range tombstone iter is still alive.
+    minHeap_.pop();
     // Remove last SST file's range tombstone end key if there is one.
     // This means file boundary is before range tombstone end key,
     // which could happen when a range tombstone and a user key
@@ -842,6 +869,12 @@ bool MergingIterator::SkipNextDeleted() {
       minHeap_.pop();
       active_.erase(current->level);
     }
+    // LevelIterator enters a new SST file
+    current->iter.Next();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      minHeap_.push(current);
+    }
     if (range_tombstone_iters_[current->level] &&
         range_tombstone_iters_[current->level]->Valid()) {
       InsertRangeTombstoneToMinHeap(current->level);
@@ -1038,18 +1071,19 @@ bool MergingIterator::SkipPrevDeleted() {
   }
   if (current->iter.IsDeleteRangeSentinelKey()) {
     // LevelIterator enters a new SST file
-    current->iter.Prev();
-    if (current->iter.Valid()) {
-      assert(current->iter.status().ok());
-      maxHeap_->replace_top(current);
-    } else {
-      maxHeap_->pop();
-    }
+    maxHeap_->pop();
+    // Remove last SST file's range tombstone key if there is one.
     if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
         maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
       maxHeap_->pop();
       active_.erase(current->level);
     }
+    current->iter.Prev();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      maxHeap_->push(current);
+    }
+
     if (range_tombstone_iters_[current->level] &&
         range_tombstone_iters_[current->level]->Valid()) {
       InsertRangeTombstoneToMaxHeap(current->level);
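
A note on the `kTypeMaxValid` initialization in the constructor hunk above:
with the internal-key footer packed as `(sequence << 8) | type` and compared
in decreasing order, pre-setting the pinned heap items' type to the largest
valid entry type makes a range tombstone start/end key order strictly before
any point key with the same user key and sequence number, so tombstone
boundaries pop from the heap first. A small self-contained check of that
ordering; the numeric type values here are illustrative assumptions, not
RocksDB's actual enum values.

```
#include <cassert>
#include <cstdint>

// Illustrative type values; RocksDB's ValueType enum differs. The only
// property used here is kTypeValue < kTypeRangeDeletion < kTypeMaxValid.
enum ValueType : uint8_t {
  kTypeValue = 0x1,
  kTypeRangeDeletion = 0xF,
  kTypeMaxValid = 0x7E,
};

// Footer packing used by encoded internal keys.
uint64_t Pack(uint64_t sequence, ValueType type) {
  return (sequence << 8) | type;
}

// For equal user keys, internal keys sort by *decreasing* packed footer,
// so the larger footer orders (and pops from a min-heap) first.
bool OrdersBefore(uint64_t a_seq, ValueType a_type, uint64_t b_seq,
                  ValueType b_type) {
  return Pack(a_seq, a_type) > Pack(b_seq, b_type);
}

int main() {
  // A tombstone boundary stored with kTypeMaxValid pops before a point key
  // with the same user key and sequence number.
  assert(OrdersBefore(42, kTypeMaxValid, 42, kTypeValue));
  // The sequence number still dominates the type byte.
  assert(OrdersBefore(43, kTypeValue, 42, kTypeMaxValid));
  return 0;
}
```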