Remove copying of range tombstone keys in iterator (#10878)

Summary:
In MergingIterator, if a range tombstone's start or end key is added to minHeap/maxHeap, the key is copied. This PR removes the copying of range tombstone keys by adding `InternalKeyComparator::Compare()` overloads that compare an internal key `Slice` and a `ParsedInternalKey` directly.
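For context, a minimal standalone sketch of the comparison the new overload performs is below (not RocksDB code): `ParsedKey`, `DecodeFixed64LE`, and `CompareEncodedToParsed` are illustrative stand-ins, and the sketch assumes a bytewise user comparator and a little-endian fixed64 encoding. An encoded internal key is the user key followed by an 8-byte trailer packing `(sequence << 8) | type`, so the encoded `Slice` can be compared against a `ParsedInternalKey` without re-encoding or copying either side:

```
#include <cstdint>
#include <cstring>
#include <string_view>

// Illustrative stand-in for ParsedInternalKey.
struct ParsedKey {
  std::string_view user_key;
  uint64_t sequence;
  uint8_t type;
};

constexpr size_t kNumTrailerBytes = 8;

// Decodes the 8-byte trailer; assumes little-endian encoding.
uint64_t DecodeFixed64LE(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// Mirrors the idea of Compare(Slice, ParsedInternalKey): order by increasing
// user key, then by decreasing (sequence, type). Requires a.size() >= 8.
int CompareEncodedToParsed(std::string_view a, const ParsedKey& b) {
  std::string_view a_user = a.substr(0, a.size() - kNumTrailerBytes);
  int r = a_user.compare(b.user_key);  // assumes bytewise user comparator
  if (r == 0) {
    const uint64_t anum =
        DecodeFixed64LE(a.data() + a.size() - kNumTrailerBytes);
    const uint64_t bnum = (b.sequence << 8) | b.type;
    if (anum > bnum) {
      r = -1;  // larger sequence/type sorts first
    } else if (anum < bnum) {
      r = +1;
    }
  }
  return r;
}
```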

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10878

Test Plan:
- existing UT
- ran all flavors of stress test through sandcastle
- benchmarks: I did not see an improvement when compiling with DEBUG_LEVEL=0, and the results were noisy. With `OPTIMIZE_LEVEL="-O3" USE_LTO=1` I do see an improvement.
```
# Favorable setup: half of the writes are DeleteRange.
TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=fillseq,levelstats --writes_per_range_tombstone=1 --max_num_range_tombstones=1000000 --range_tombstone_width=2 --num=1000000 --max_bytes_for_level_base=4194304 --disable_auto_compactions --write_buffer_size=33554432 --key_size=50

# benchmark command
TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=readseq[-W1][-X5],levelstats --use_existing_db=true --cache_size=3221225472  --disable_auto_compactions=true --avoid_flush_during_recovery=true --seek_nexts=100 --reads=1000000 --num=1000000 --threads=25

# main
readseq [AVG    5 runs] : 26017977 (± 371077) ops/sec; 3721.9 (± 53.1) MB/sec
readseq [MEDIAN 5 runs] : 26096905 ops/sec; 3733.2 MB/sec

# this PR
readseq [AVG    5 runs] : 27481724 (± 568758) ops/sec; 3931.3 (± 81.4) MB/sec
readseq [MEDIAN 5 runs] : 27323957 ops/sec; 3908.7 MB/sec
```
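For reference, the averaged numbers above work out to roughly a 5.6% readseq throughput improvement (27,481,724 / 26,017,977 ≈ 1.056).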

Reviewed By: ajkr

Differential Revision: D40711170

Pulled By: cbi42

fbshipit-source-id: 708cb584e2bd085a9ce0d2ef6a420489f721717f
parent d8c043f7ad
commit 6cdb7af9f8

Files changed:
- db/dbformat.cc (25 lines changed)
- db/dbformat.h (2 lines changed)
- table/merging_iterator.cc (94 lines changed)

@@ -150,6 +150,31 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
   return r;
 }
 
+int InternalKeyComparator::Compare(const Slice& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key);
+  if (r == 0) {
+    const uint64_t anum =
+        DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
+    const uint64_t bnum = (b.sequence << 8) | b.type;
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const Slice& b) const {
+  return -Compare(b, a);
+}
+
 LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
                      const Slice* ts) {
   size_t usize = _user_key.size();

@@ -283,6 +283,8 @@ class InternalKeyComparator
   int Compare(const InternalKey& a, const InternalKey& b) const;
   int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+  int Compare(const Slice& a, const ParsedInternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const Slice& b) const;
   // In this `Compare()` overload, the sequence numbers provided in
   // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
   // and `b`, respectively. To disable sequence number override(s), provide the

@@ -43,7 +43,7 @@ struct HeapItem {
   enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
   IteratorWrapper iter;
   size_t level = 0;
-  std::string pinned_key;
+  ParsedInternalKey parsed_ikey;
   // Will be overwritten before use, initialize here so compiler does not
   // complain.
   Type type = ITERATOR;
@@ -54,27 +54,15 @@ struct HeapItem {
   }
 
   void SetTombstoneKey(ParsedInternalKey&& pik) {
-    pinned_key.clear();
-    // Range tombstone end key is exclusive. If a point internal key has the
-    // same user key and sequence number as the start or end key of a range
-    // tombstone, the order will be start < end key < internal key with the
-    // following op_type change. This is helpful to ensure keys popped from
-    // heap are in expected order since range tombstone start/end keys will
-    // be distinct from point internal keys. Strictly speaking, this is only
-    // needed for tombstone end points that are truncated in
-    // TruncatedRangeDelIterator since untruncated tombstone end points always
-    // have kMaxSequenceNumber and kTypeRangeDeletion (see
-    // TruncatedRangeDelIterator::start_key()/end_key()).
-    ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid);
-    AppendInternalKey(&pinned_key, p);
+    // op_type is already initialized in MergingIterator::Finish().
+    parsed_ikey.user_key = pik.user_key;
+    parsed_ikey.sequence = pik.sequence;
   }
 
   Slice key() const {
-    if (type == Type::ITERATOR) {
-      return iter.key();
-    }
-    return pinned_key;
+    assert(type == ITERATOR);
+    return iter.key();
   }
 
   bool IsDeleteRangeSentinelKey() const {
     if (type == Type::ITERATOR) {
@@ -89,7 +77,19 @@ class MinHeapItemComparator {
   MinHeapItemComparator(const InternalKeyComparator* comparator)
       : comparator_(comparator) {}
   bool operator()(HeapItem* a, HeapItem* b) const {
-    return comparator_->Compare(a->key(), b->key()) > 0;
+    if (LIKELY(a->type == HeapItem::ITERATOR)) {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->key(), b->key()) > 0;
+      } else {
+        return comparator_->Compare(a->key(), b->parsed_ikey) > 0;
+      }
+    } else {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->parsed_ikey, b->key()) > 0;
+      } else {
+        return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0;
+      }
+    }
   }
 
  private:
@@ -101,7 +101,19 @@ class MaxHeapItemComparator {
   MaxHeapItemComparator(const InternalKeyComparator* comparator)
       : comparator_(comparator) {}
   bool operator()(HeapItem* a, HeapItem* b) const {
-    return comparator_->Compare(a->key(), b->key()) < 0;
+    if (LIKELY(a->type == HeapItem::ITERATOR)) {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->key(), b->key()) < 0;
+      } else {
+        return comparator_->Compare(a->key(), b->parsed_ikey) < 0;
+      }
+    } else {
+      if (LIKELY(b->type == HeapItem::ITERATOR)) {
+        return comparator_->Compare(a->parsed_ikey, b->key()) < 0;
+      } else {
+        return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0;
+      }
+    }
   }
 
  private:
@@ -177,6 +189,17 @@ class MergingIterator : public InternalIterator {
       pinned_heap_item_.resize(range_tombstone_iters_.size());
       for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
         pinned_heap_item_[i].level = i;
+        // Range tombstone end key is exclusive. If a point internal key has the
+        // same user key and sequence number as the start or end key of a range
+        // tombstone, the order will be start < end key < internal key with the
+        // following op_type change. This is helpful to ensure keys popped from
+        // heap are in expected order since range tombstone start/end keys will
+        // be distinct from point internal keys. Strictly speaking, this is only
+        // needed for tombstone end points that are truncated in
+        // TruncatedRangeDelIterator since untruncated tombstone end points
+        // always have kMaxSequenceNumber and kTypeRangeDeletion (see
+        // TruncatedRangeDelIterator::start_key()/end_key()).
+        pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid;
       }
     }
   }
@@ -824,14 +847,18 @@ bool MergingIterator::SkipNextDeleted() {
     //  SetTombstoneKey()).
     assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
            active_.count(current->level) == 0);
-    // LevelIterator enters a new SST file
-    current->iter.Next();
-    if (current->iter.Valid()) {
-      assert(current->iter.status().ok());
-      minHeap_.replace_top(current);
-    } else {
-      minHeap_.pop();
-    }
+    // When entering a new file, old range tombstone iter is freed,
+    // but the last key from that range tombstone iter may still be in the heap.
+    // We need to ensure the data underlying its corresponding key Slice is
+    // still alive. We do so by popping the range tombstone key from heap before
+    // calling iter->Next(). Technically, this change is not needed: if there is
+    // a range tombstone end key that is after file boundary sentinel key in
+    // minHeap_, the range tombstone end key must have been truncated at file
+    // boundary. The underlying data of the range tombstone end key Slice is the
+    // SST file's largest internal key stored as file metadata in Version.
+    // However, since there are too many implicit assumptions made, it is safer
+    // to just ensure range tombstone iter is still alive.
+    minHeap_.pop();
     // Remove last SST file's range tombstone end key if there is one.
     // This means file boundary is before range tombstone end key,
     // which could happen when a range tombstone and a user key
@@ -842,6 +869,12 @@ bool MergingIterator::SkipNextDeleted() {
       minHeap_.pop();
       active_.erase(current->level);
     }
+    // LevelIterator enters a new SST file
+    current->iter.Next();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      minHeap_.push(current);
+    }
     if (range_tombstone_iters_[current->level] &&
         range_tombstone_iters_[current->level]->Valid()) {
       InsertRangeTombstoneToMinHeap(current->level);
@@ -1038,18 +1071,19 @@ bool MergingIterator::SkipPrevDeleted() {
     }
     if (current->iter.IsDeleteRangeSentinelKey()) {
       // LevelIterator enters a new SST file
-      current->iter.Prev();
-      if (current->iter.Valid()) {
-        assert(current->iter.status().ok());
-        maxHeap_->replace_top(current);
-      } else {
-        maxHeap_->pop();
-      }
+      maxHeap_->pop();
+      // Remove last SST file's range tombstone key if there is one.
       if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
           maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
         maxHeap_->pop();
         active_.erase(current->level);
       }
+      current->iter.Prev();
+      if (current->iter.Valid()) {
+        assert(current->iter.status().ok());
+        maxHeap_->push(current);
+      }
       if (range_tombstone_iters_[current->level] &&
           range_tombstone_iters_[current->level]->Valid()) {
         InsertRangeTombstoneToMaxHeap(current->level);
