Cache fragmented range tombstones in BlockBasedTableReader (#4493)

Summary:
This change performs tombstone fragmentation only once, when the table is opened, and caches the result for subsequent accesses.
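The shape of the change, as a minimal self-contained sketch (simplified stand-in types, not the actual RocksDB classes): the expensive fragmentation step runs once at open time, and each read wraps the shared, immutable result in a cheap iterator.
```
// Minimal sketch of the caching pattern this commit introduces
// (simplified stand-in types, not the RocksDB implementation).
#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

struct Fragment {
  std::string start, end;
  uint64_t seq;
};

// Immutable after construction; safe to share across reader threads.
struct FragmentedList {
  std::vector<Fragment> fragments;  // sorted and non-overlapping
};

// Runs once, when the table is opened.
std::shared_ptr<const FragmentedList> FragmentOnce(std::vector<Fragment> raw) {
  // Stand-in for the real fragmentation algorithm: sort by start key.
  std::sort(raw.begin(), raw.end(), [](const Fragment& a, const Fragment& b) {
    return a.start < b.start;
  });
  auto list = std::make_shared<FragmentedList>();
  list->fragments = std::move(raw);
  return list;
}

// Constructed per read: a pointer copy instead of re-fragmentation.
class FragmentIterator {
 public:
  explicit FragmentIterator(std::shared_ptr<const FragmentedList> list)
      : list_(std::move(list)) {}
  bool Valid() const { return pos_ < list_->fragments.size(); }
  const Fragment& Get() const { return list_->fragments[pos_]; }
  void Next() { ++pos_; }

 private:
  std::shared_ptr<const FragmentedList> list_;  // keeps the list alive
  size_t pos_ = 0;
};
```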

On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom   :       0.983 micros/op 1017076 ops/sec;   78.3 MB/s (63103 of 100000 found)
```

Now that Get performance in the presence of range tombstones is reasonable, I also compared performance across three kinds of DBs: one with range tombstones, one with "expanded" range tombstones (several point tombstones covering the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and one with no range tombstones. Each DB had 5 million keys, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark, chosen to give somewhat more realistic results; a sketch of the two deletion styles follows the table:
```
   Tombstones?    | avg micros/op | stddev micros/op |  avg ops/s   | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None              |        0.6186 |          0.04637 | 1,625,252.90 | 124,679.41
500 Expanded      |        0.6019 |          0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded    |        0.6435 |          0.03994 | 1,559,979.40 | 104,090.52
1k Expanded       |        0.6034 |          0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded     |        0.6261 |          0.03093 | 1,600,457.50 |  79,024.94
5k Expanded       |        0.6163 |          0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded     |        0.6402 |          0.04002 | 1,567,804.70 | 100,965.55
10k Expanded      |        0.6036 |          0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded    |        0.6128 |          0.02598 | 1,634,633.40 |  72,161.82
25k Expanded      |        0.6198 |          0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded    |        0.5478 |          0.0362  | 1,833,059.10 | 121,233.81
50k Expanded      |        0.5104 |          0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded    |        0.4528 |          0.03387 | 2,219,034.50 | 170,984.32
```
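For reference, a sketch of the two deletion styles compared above, using the public RocksDB API (the helper and its key formatting are illustrative, not from the benchmark code):
```
#include <string>
#include "rocksdb/db.h"

// Deletes the keys in [begin, end), either as point tombstones
// ("expanded") or as a single range tombstone ("unexpanded").
void DeleteKeyRange(rocksdb::DB* db, int begin, int end, bool expanded) {
  rocksdb::WriteOptions wo;
  if (expanded) {
    // The common pre-DeleteRange workaround: one Delete per covered key.
    for (int i = begin; i < end; i++) {
      db->Delete(wo, std::to_string(i));
    }
  } else {
    // A single range tombstone covering the same keys (assuming the keys
    // are formatted so that numeric and lexicographic order agree).
    db->DeleteRange(wo, db->DefaultColumnFamily(), std::to_string(begin),
                    std::to_string(end));
  }
}
```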

Once a large enough number of range tombstones has been written, range tombstone Gets can become faster than reads against an equivalent DB with expanded point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493

Differential Revision: D10842844

Pulled By: abhimadan

fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
Branch: main
Author: Abhishek Madan (committed by Facebook Github Bot)
Parent: fe0d23059d
Commit: 7528130e38
Files changed (lines changed per file):
  1. db/memtable.cc (11)
  2. db/memtable_list.cc (2)
  3. db/range_tombstone_fragmenter.cc (110)
  4. db/range_tombstone_fragmenter.h (51)
  5. db/range_tombstone_fragmenter_test.cc (90)
  6. db/table_cache.cc (13)
  7. db/version_set.cc (5)
  8. table/block_based_table_reader.cc (37)
  9. table/block_based_table_reader.h (5)
  10. table/table_test.cc (16)

@@ -729,13 +729,20 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// Avoiding recording stats for speed.
return false;
}
if (*max_covering_tombstone_seq > 0) {
*s = Status::NotFound();
return true;
}
PERF_TIMER_GUARD(get_from_memtable_time);
std::unique_ptr<InternalIterator> range_del_iter(
NewRangeTombstoneIterator(read_opts));
SequenceNumber snapshot = GetInternalKeySeqno(key.internal_key());
FragmentedRangeTombstoneIterator fragment_iter(
std::move(range_del_iter), comparator_.comparator, snapshot);
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
comparator_.comparator,
true /* one_time_use */, snapshot);
FragmentedRangeTombstoneIterator fragment_iter(&fragment_list,
comparator_.comparator);
*max_covering_tombstone_seq = std::max(
*max_covering_tombstone_seq,
MaxCoveringTombstoneSeqnum(&fragment_iter, key.internal_key(),

@@ -146,7 +146,7 @@ bool MemTableListVersion::GetFromList(
}
if (done) {
assert(*seq != kMaxSequenceNumber);
assert(*seq != kMaxSequenceNumber || s->IsNotFound());
return true;
}
if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {

@@ -12,19 +12,17 @@
#include <inttypes.h>
#include <stdio.h>
#include "util/autovector.h"
#include "util/kv_map.h"
#include "util/vector_iterator.h"
namespace rocksdb {
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, SequenceNumber snapshot)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()) {
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot) {
if (unfragmented_tombstones == nullptr) {
pos_ = tombstones_.end();
return;
}
bool is_sorted = true;
@@ -34,7 +32,7 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next(), num_tombstones++) {
if (num_tombstones > 0 &&
icmp_->Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
is_sorted = false;
break;
}
@@ -46,7 +44,8 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
}
}
if (is_sorted) {
FragmentTombstones(std::move(unfragmented_tombstones), snapshot);
FragmentTombstones(std::move(unfragmented_tombstones), icmp, one_time_use,
snapshot);
return;
}
@@ -63,15 +62,16 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
}
// VectorIterator implicitly sorts by key during construction.
auto iter = std::unique_ptr<VectorIterator>(
new VectorIterator(std::move(keys), std::move(values), icmp_));
FragmentTombstones(std::move(iter), snapshot);
new VectorIterator(std::move(keys), std::move(values), &icmp));
FragmentTombstones(std::move(iter), icmp, one_time_use, snapshot);
}
void FragmentedRangeTombstoneIterator::FragmentTombstones(
void FragmentedRangeTombstoneList::FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot) {
Slice cur_start_key(nullptr, 0);
auto cmp = ParsedInternalKeyComparator(icmp_);
auto cmp = ParsedInternalKeyComparator(&icmp);
// Stores the end keys and sequence numbers of range tombstones with a start
// key less than or equal to cur_start_key. Provides an ordering by end key
@@ -87,11 +87,11 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
bool reached_next_start_key = false;
for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
Slice cur_end_key = it->user_key;
if (icmp_->user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
// Empty tombstone.
continue;
}
if (icmp_->user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
// All of the end keys in [it, cur_end_keys.end()) are after
// next_start_key, so the tombstones they represent can be used in
// fragments that start with keys greater than or equal to
@@ -109,17 +109,32 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
// Flush a range tombstone fragment [cur_start_key, cur_end_key), which
// should not overlap with the last-flushed tombstone fragment.
assert(tombstones_.empty() ||
icmp_->user_comparator()->Compare(tombstones_.back().end_key_,
icmp.user_comparator()->Compare(tombstones_.back().end_key_,
cur_start_key) <= 0);
SequenceNumber max_seqnum = 0;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
max_seqnum = std::max(max_seqnum, flush_it->sequence);
if (one_time_use) {
SequenceNumber max_seqnum = 0;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
max_seqnum = std::max(max_seqnum, flush_it->sequence);
}
// Flush only the tombstone fragment with the highest sequence number.
tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, max_seqnum));
} else {
// Sort the sequence numbers of the tombstones being fragmented in
// descending order, and then flush them in that order.
autovector<SequenceNumber> seqnums_to_flush;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
seqnums_to_flush.push_back(flush_it->sequence);
}
std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
std::greater<SequenceNumber>());
for (const auto seq : seqnums_to_flush) {
tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, seq));
}
}
// Flush only the tombstone fragment with the highest sequence
// number.
tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, max_seqnum));
cur_start_key = cur_end_key;
}
if (!reached_next_start_key) {
@@ -140,7 +155,7 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
const Slice& ikey = unfragmented_tombstones->key();
Slice tombstone_start_key = ExtractUserKey(ikey);
SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
if (tombstone_seq > snapshot) {
if (one_time_use && tombstone_seq > snapshot) {
// The tombstone is not visible by this snapshot.
continue;
}
@@ -152,7 +167,7 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
tombstone_end_key.size());
tombstone_end_key = pinned_slices_.back();
}
if (!cur_end_keys.empty() && icmp_->user_comparator()->Compare(
if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
cur_start_key, tombstone_start_key) != 0) {
// The start key has changed. Flush all tombstones that start before
// this new start key.
@@ -177,29 +192,50 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
false /* arena */);
}
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const FragmentedRangeTombstoneList* tombstones,
const InternalKeyComparator& icmp)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_(tombstones) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
}
// With this, the caller must Seek before the iterator is valid.
pos_ = tombstones_.end();
pinned_pos_ = tombstones_.end();
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_ref_(tombstones),
tombstones_(tombstones_ref_.get()) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
}
void FragmentedRangeTombstoneIterator::SeekToFirst() {
pos_ = tombstones_.begin();
pos_ = tombstones_->begin();
}
void FragmentedRangeTombstoneIterator::SeekToLast() {
pos_ = tombstones_.end();
pos_ = tombstones_->end();
Prev();
}
void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
if (tombstones_.empty()) {
pos_ = tombstones_.end();
if (tombstones_->empty()) {
pos_ = tombstones_->end();
return;
}
RangeTombstone search(ExtractUserKey(target), ExtractUserKey(target),
GetInternalKeySeqno(target));
pos_ = std::lower_bound(tombstones_.begin(), tombstones_.end(), search,
pos_ = std::lower_bound(tombstones_->begin(), tombstones_->end(), search,
tombstone_cmp_);
}
@@ -223,20 +259,24 @@ void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
void FragmentedRangeTombstoneIterator::Next() { ++pos_; }
void FragmentedRangeTombstoneIterator::Prev() {
if (pos_ == tombstones_.begin()) {
pos_ = tombstones_.end();
if (pos_ == tombstones_->begin()) {
pos_ = tombstones_->end();
return;
}
--pos_;
}
bool FragmentedRangeTombstoneIterator::Valid() const {
return pos_ != tombstones_.end();
return tombstones_ != nullptr && pos_ != tombstones_->end();
}
SequenceNumber MaxCoveringTombstoneSeqnum(
FragmentedRangeTombstoneIterator* tombstone_iter, const Slice& lookup_key,
const Comparator* ucmp) {
if (tombstone_iter == nullptr) {
return 0;
}
SequenceNumber snapshot = GetInternalKeySeqno(lookup_key);
Slice user_key = ExtractUserKey(lookup_key);

@@ -17,6 +17,37 @@
namespace rocksdb {
struct FragmentedRangeTombstoneList {
public:
FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot = kMaxSequenceNumber);
std::vector<RangeTombstone>::const_iterator begin() const {
return tombstones_.begin();
}
std::vector<RangeTombstone>::const_iterator end() const {
return tombstones_.end();
}
bool empty() const { return tombstones_.size() == 0; }
private:
// Given an ordered range tombstone iterator unfragmented_tombstones,
// "fragment" the tombstones into non-overlapping pieces, and store them in
// tombstones_.
void FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot = kMaxSequenceNumber);
std::vector<RangeTombstone> tombstones_;
std::list<std::string> pinned_slices_;
PinnedIteratorsManager pinned_iters_mgr_;
};
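The one_time_use flag controls what survives fragmentation. A small worked example in the style of the unit tests further down (MakeRangeDelIter and bytewise_icmp are helpers from the test file):
```
// Input tombstones [a,e)@10 and [c,g)@8 overlap on [c,e), so the
// fragments are [a,c), [c,e), and [e,g).
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});

// With one_time_use=false, every covering seqnum is kept per fragment:
//   [a,c)@10, [c,e)@10, [c,e)@8, [e,g)@8
// With one_time_use=true, only the max seqnum per fragment survives:
//   [a,c)@10, [c,e)@10, [e,g)@8
FragmentedRangeTombstoneList fragment_list(
    std::move(range_del_iter), bytewise_icmp, false /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
```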
// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
// meta block into an iterator over non-overlapping tombstone fragments. The
// tombstone fragmentation process should be more efficient than the range
@@ -29,8 +60,11 @@ namespace rocksdb {
class FragmentedRangeTombstoneIterator : public InternalIterator {
public:
FragmentedRangeTombstoneIterator(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, SequenceNumber snapshot);
const FragmentedRangeTombstoneList* tombstones,
const InternalKeyComparator& icmp);
FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp);
void SeekToFirst() override;
void SeekToLast() override;
void Seek(const Slice& target) override;
@@ -66,7 +100,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
};
void MaybePinKey() const {
if (pos_ != tombstones_.end() && pinned_pos_ != pos_) {
if (pos_ != tombstones_->end() && pinned_pos_ != pos_) {
current_start_key_.Set(pos_->start_key_, pos_->seq_, kTypeRangeDeletion);
pinned_pos_ = pos_;
}
@@ -78,18 +112,11 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
parsed->type = kTypeRangeDeletion;
}
// Given an ordered range tombstone iterator unfragmented_tombstones,
// "fragment" the tombstones into non-overlapping pieces, and store them in
// tombstones_.
void FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
SequenceNumber snapshot);
const FragmentedRangeTombstoneComparator tombstone_cmp_;
const InternalKeyComparator* icmp_;
const Comparator* ucmp_;
std::vector<RangeTombstone> tombstones_;
std::list<std::string> pinned_slices_;
std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
const FragmentedRangeTombstoneList* tombstones_;
std::vector<RangeTombstone>::const_iterator pos_;
mutable std::vector<RangeTombstone>::const_iterator pinned_pos_;
mutable InternalKey current_start_key_;

@@ -87,8 +87,9 @@ void VerifyMaxCoveringTombstoneSeqnum(
TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
{{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
@@ -97,8 +98,9 @@ TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"c", "e", 15}, {"e", "g", 15}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -109,8 +111,9 @@ TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
auto range_del_iter = MakeRangeDelIter(
{{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"c", "e", 20}, {"e", "g", 15}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -121,8 +124,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
auto range_del_iter =
MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
{{"a", 10}, {"b", 10}, {"c", 0}});
@@ -132,8 +136,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
auto range_del_iter =
MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"c", "e", 10}, {"e", "g", 7}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -147,8 +152,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
{"a", "g", 7},
{"a", "c", 3}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter,
{{"a", "c", 30}, {"c", "e", 20}, {"e", "g", 20}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -162,8 +168,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10},
{"e", "g", 8},
@@ -182,8 +189,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyWithSnapshot) {
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, 9);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */, 9);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(
&iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum(
@@ -198,8 +206,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
{"c", "g", 8},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, 9);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */, 9);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(
&iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum(
@@ -207,6 +216,31 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
{{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
}
TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyMultiUse) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
{"c", "g", 8},
{"c", "i", 6},
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, false /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10},
{"c", "e", 8},
{"c", "e", 6},
{"e", "g", 8},
{"e", "g", 6},
{"g", "i", 6},
{"j", "l", 4},
{"j", "l", 2},
{"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum(
&iter, bytewise_icmp.user_comparator(),
{{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
}
TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) {
// Same tombstones as OverlapAndRepeatedStartKey.
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
@@ -215,8 +249,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) {
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev(
&iter,
{{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
@@ -230,8 +265,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevCovered) {
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev(
&iter,
{{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
@@ -245,8 +281,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevEndKey) {
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev(&iter, {{"c", {"c", "e", 10}},
{"g", {"g", "i", 6}},
{"i", {"g", "i", 6}},
@@ -261,8 +298,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevOutOfBounds) {
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
bytewise_icmp, kMaxSequenceNumber);
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev(&iter,
{{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
}

@@ -379,13 +379,12 @@ Status TableCache::Get(const ReadOptions& options,
!options.ignore_range_deletions) {
std::unique_ptr<InternalIterator> range_del_iter(
t->NewRangeTombstoneIterator(options));
FragmentedRangeTombstoneIterator fragment_iter(std::move(range_del_iter),
internal_comparator,
GetInternalKeySeqno(k));
*max_covering_tombstone_seq = std::max(
*max_covering_tombstone_seq,
MaxCoveringTombstoneSeqnum(&fragment_iter, k,
internal_comparator.user_comparator()));
*max_covering_tombstone_seq =
std::max(*max_covering_tombstone_seq,
MaxCoveringTombstoneSeqnum(
static_cast<FragmentedRangeTombstoneIterator*>(
range_del_iter.get()),
k, internal_comparator.user_comparator()));
}
if (s.ok()) {
get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.

@@ -1209,6 +1209,11 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
FdWithKeyRange* f = fp.GetNextFile();
while (f != nullptr) {
if (*max_covering_tombstone_seq > 0) {
// Use empty error message for speed
*status = Status::NotFound();
return;
}
if (get_context.sample()) {
sample_file_read_inc(f->file_metadata);
}

@@ -972,20 +972,22 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
rep->ioptions.info_log,
"Error when seeking to range delete tombstones block from file: %s",
s.ToString().c_str());
} else {
if (found_range_del_block && !rep->range_del_handle.IsNull()) {
ReadOptions read_options;
s = MaybeLoadDataBlockToCache(
prefetch_buffer.get(), rep, read_options, rep->range_del_handle,
Slice() /* compression_dict */, &rep->range_del_entry,
false /* is_index */, nullptr /* get_context */);
if (!s.ok()) {
ROCKS_LOG_WARN(
rep->ioptions.info_log,
"Encountered error while reading data from range del block %s",
s.ToString().c_str());
}
} else if (found_range_del_block && !rep->range_del_handle.IsNull()) {
ReadOptions read_options;
s = MaybeLoadDataBlockToCache(
prefetch_buffer.get(), rep, read_options, rep->range_del_handle,
Slice() /* compression_dict */, &rep->range_del_entry,
false /* is_index */, nullptr /* get_context */);
if (!s.ok()) {
ROCKS_LOG_WARN(
rep->ioptions.info_log,
"Encountered error while reading data from range del block %s",
s.ToString().c_str());
}
auto iter = std::unique_ptr<InternalIterator>(
new_table->NewUnfragmentedRangeTombstoneIterator(read_options));
rep->fragmented_range_dels = std::make_shared<FragmentedRangeTombstoneList>(
std::move(iter), internal_comparator, false /* one_time_use */);
}
bool need_upper_bound_check =
@@ -2263,6 +2265,15 @@ InternalIterator* BlockBasedTable::NewIterator(
}
InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
const ReadOptions& /* read_options */) {
if (rep_->fragmented_range_dels == nullptr) {
return nullptr;
}
return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels,
rep_->internal_comparator);
}
InternalIterator* BlockBasedTable::NewUnfragmentedRangeTombstoneIterator(
const ReadOptions& read_options) {
if (rep_->range_del_handle.IsNull()) {
// The block didn't exist, nullptr indicates no range tombstones.
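The caller-side contract this enables, sketched after the TableCache::Get change above: NewRangeTombstoneIterator now returns either nullptr or a FragmentedRangeTombstoneIterator, and MaxCoveringTombstoneSeqnum treats nullptr as "no covering tombstone" (seqnum 0). Names here (t, options, k, internal_comparator) come from the surrounding TableCache::Get context.
```
// Sketch (cf. the TableCache::Get hunk above): the static_cast is safe
// because the table reader only ever returns this iterator type, and a
// null iterator simply yields a max covering seqnum of 0.
std::unique_ptr<InternalIterator> range_del_iter(
    t->NewRangeTombstoneIterator(options));
SequenceNumber max_seq = MaxCoveringTombstoneSeqnum(
    static_cast<FragmentedRangeTombstoneIterator*>(range_del_iter.get()),
    k, internal_comparator.user_comparator());
```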

@@ -16,6 +16,7 @@
#include <utility>
#include <vector>
#include "db/range_tombstone_fragmenter.h"
#include "options/cf_options.h"
#include "rocksdb/options.h"
#include "rocksdb/persistent_cache.h"
@@ -384,6 +385,9 @@ class BlockBasedTable : public TableReader {
friend class PartitionedFilterBlockReader;
friend class PartitionedFilterBlockTest;
InternalIterator* NewUnfragmentedRangeTombstoneIterator(
const ReadOptions& read_options);
};
// Maintaining state of a two-level iteration on a partitioned index structure
@@ -511,6 +515,7 @@ struct BlockBasedTable::Rep {
// cache is enabled.
CachableEntry<Block> range_del_entry;
BlockHandle range_del_handle;
std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;
// If global_seqno is used, all Keys in this file will have the same
// seqno with value `global_seqno`.

@@ -1278,6 +1278,13 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) {
std::vector<std::string> keys = {"1pika", "2chu"};
std::vector<std::string> vals = {"p", "c"};
std::vector<RangeTombstone> expected_tombstones = {
{"1pika", "2chu", 0},
{"2chu", "c", 1},
{"2chu", "c", 0},
{"c", "p", 0},
};
for (int i = 0; i < 2; i++) {
RangeTombstone t(keys[i], vals[i], i);
std::pair<InternalKey, Slice> p = t.Serialize();
@@ -1310,14 +1317,15 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) {
ASSERT_FALSE(iter->Valid());
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
for (int i = 0; i < 2; i++) {
for (size_t i = 0; i < expected_tombstones.size(); i++) {
ASSERT_TRUE(iter->Valid());
ParsedInternalKey parsed_key;
ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
RangeTombstone t(parsed_key, iter->value());
ASSERT_EQ(t.start_key_, keys[i]);
ASSERT_EQ(t.end_key_, vals[i]);
ASSERT_EQ(t.seq_, i);
const auto& expected_t = expected_tombstones[i];
ASSERT_EQ(t.start_key_, expected_t.start_key_);
ASSERT_EQ(t.end_key_, expected_t.end_key_);
ASSERT_EQ(t.seq_, expected_t.seq_);
iter->Next();
}
ASSERT_TRUE(!iter->Valid());
