diff --git a/db/memtable.cc b/db/memtable.cc
index cbd8acedc..964993dc8 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -729,13 +729,20 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
     // Avoiding recording stats for speed.
     return false;
   }
+  if (*max_covering_tombstone_seq > 0) {
+    *s = Status::NotFound();
+    return true;
+  }
   PERF_TIMER_GUARD(get_from_memtable_time);
 
   std::unique_ptr<InternalIterator> range_del_iter(
       NewRangeTombstoneIterator(read_opts));
   SequenceNumber snapshot = GetInternalKeySeqno(key.internal_key());
-  FragmentedRangeTombstoneIterator fragment_iter(
-      std::move(range_del_iter), comparator_.comparator, snapshot);
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             comparator_.comparator,
+                                             true /* one_time_use */, snapshot);
+  FragmentedRangeTombstoneIterator fragment_iter(&fragment_list,
+                                                 comparator_.comparator);
   *max_covering_tombstone_seq = std::max(
       *max_covering_tombstone_seq,
       MaxCoveringTombstoneSeqnum(&fragment_iter, key.internal_key(),
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index e9842c823..069487f54 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -146,7 +146,7 @@ bool MemTableListVersion::GetFromList(
     }
 
     if (done) {
-      assert(*seq != kMaxSequenceNumber);
+      assert(*seq != kMaxSequenceNumber || s->IsNotFound());
       return true;
     }
     if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc
index 78090d93d..d6bdac1c3 100644
--- a/db/range_tombstone_fragmenter.cc
+++ b/db/range_tombstone_fragmenter.cc
@@ -12,19 +12,17 @@
 #include
 #include
 
+#include "util/autovector.h"
 #include "util/kv_map.h"
 #include "util/vector_iterator.h"
 
 namespace rocksdb {
 
-FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
     std::unique_ptr<InternalIterator> unfragmented_tombstones,
-    const InternalKeyComparator& icmp, SequenceNumber snapshot)
-    : tombstone_cmp_(icmp.user_comparator()),
-      icmp_(&icmp),
-      ucmp_(icmp.user_comparator()) {
+    const InternalKeyComparator& icmp, bool one_time_use,
+    SequenceNumber snapshot) {
   if (unfragmented_tombstones == nullptr) {
-    pos_ = tombstones_.end();
     return;
   }
   bool is_sorted = true;
@@ -34,7 +32,7 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
   for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
        unfragmented_tombstones->Next(), num_tombstones++) {
     if (num_tombstones > 0 &&
-        icmp_->Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+        icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
       is_sorted = false;
       break;
     }
@@ -46,7 +44,8 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
     }
   }
   if (is_sorted) {
-    FragmentTombstones(std::move(unfragmented_tombstones), snapshot);
+    FragmentTombstones(std::move(unfragmented_tombstones), icmp, one_time_use,
+                       snapshot);
     return;
   }
 
@@ -63,15 +62,16 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
   }
   // VectorIterator implicitly sorts by key during construction.
   auto iter = std::unique_ptr<VectorIterator>(
-      new VectorIterator(std::move(keys), std::move(values), icmp_));
-  FragmentTombstones(std::move(iter), snapshot);
+      new VectorIterator(std::move(keys), std::move(values), &icmp));
+  FragmentTombstones(std::move(iter), icmp, one_time_use, snapshot);
 }
 
-void FragmentedRangeTombstoneIterator::FragmentTombstones(
+void FragmentedRangeTombstoneList::FragmentTombstones(
     std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool one_time_use,
     SequenceNumber snapshot) {
   Slice cur_start_key(nullptr, 0);
-  auto cmp = ParsedInternalKeyComparator(icmp_);
+  auto cmp = ParsedInternalKeyComparator(&icmp);
 
   // Stores the end keys and sequence numbers of range tombstones with a start
   // key less than or equal to cur_start_key. Provides an ordering by end key
@@ -87,11 +87,11 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
     bool reached_next_start_key = false;
     for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
       Slice cur_end_key = it->user_key;
-      if (icmp_->user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
+      if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
         // Empty tombstone.
         continue;
       }
-      if (icmp_->user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
+      if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
         // All of the end keys in [it, cur_end_keys.end()) are after
         // next_start_key, so the tombstones they represent can be used in
         // fragments that start with keys greater than or equal to
@@ -109,17 +109,32 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
       // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
       // should not overlap with the last-flushed tombstone fragment.
       assert(tombstones_.empty() ||
-             icmp_->user_comparator()->Compare(tombstones_.back().end_key_,
+             icmp.user_comparator()->Compare(tombstones_.back().end_key_,
                                                cur_start_key) <= 0);
-      SequenceNumber max_seqnum = 0;
-      for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
-        max_seqnum = std::max(max_seqnum, flush_it->sequence);
+      if (one_time_use) {
+        SequenceNumber max_seqnum = 0;
+        for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+          max_seqnum = std::max(max_seqnum, flush_it->sequence);
+        }
+
+        // Flush only the tombstone fragment with the highest sequence number.
+        tombstones_.push_back(
+            RangeTombstone(cur_start_key, cur_end_key, max_seqnum));
+      } else {
+        // Sort the sequence numbers of the tombstones being fragmented in
+        // descending order, and then flush them in that order.
+        autovector<SequenceNumber> seqnums_to_flush;
+        for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+          seqnums_to_flush.push_back(flush_it->sequence);
+        }
+        std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+                  std::greater<SequenceNumber>());
+        for (const auto seq : seqnums_to_flush) {
+          tombstones_.push_back(
+              RangeTombstone(cur_start_key, cur_end_key, seq));
+        }
       }
-      // Flush only the tombstone fragment with the highest sequence
-      // number.
-      tombstones_.push_back(
-          RangeTombstone(cur_start_key, cur_end_key, max_seqnum));
       cur_start_key = cur_end_key;
     }
     if (!reached_next_start_key) {
@@ -140,7 +155,7 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
     const Slice& ikey = unfragmented_tombstones->key();
     Slice tombstone_start_key = ExtractUserKey(ikey);
     SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
-    if (tombstone_seq > snapshot) {
+    if (one_time_use && tombstone_seq > snapshot) {
      // The tombstone is not visible by this snapshot.
      continue;
    }
@@ -152,7 +167,7 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
                                   tombstone_end_key.size());
       tombstone_end_key = pinned_slices_.back();
     }
-    if (!cur_end_keys.empty() && icmp_->user_comparator()->Compare(
+    if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
                                      cur_start_key, tombstone_start_key) != 0) {
       // The start key has changed. Flush all tombstones that start before
       // this new start key.
@@ -177,29 +192,50 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
     pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
                                   false /* arena */);
   }
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const FragmentedRangeTombstoneList* tombstones,
+    const InternalKeyComparator& icmp)
+    : tombstone_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_(tombstones) {
+  assert(tombstones_ != nullptr);
+  pos_ = tombstones_->end();
+  pinned_pos_ = tombstones_->end();
+}
-  // With this, the caller must Seek before the iterator is valid.
-  pos_ = tombstones_.end();
-  pinned_pos_ = tombstones_.end();
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+    const InternalKeyComparator& icmp)
+    : tombstone_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_ref_(tombstones),
+      tombstones_(tombstones_ref_.get()) {
+  assert(tombstones_ != nullptr);
+  pos_ = tombstones_->end();
+  pinned_pos_ = tombstones_->end();
 }
 
 void FragmentedRangeTombstoneIterator::SeekToFirst() {
-  pos_ = tombstones_.begin();
+  pos_ = tombstones_->begin();
 }
 
 void FragmentedRangeTombstoneIterator::SeekToLast() {
-  pos_ = tombstones_.end();
+  pos_ = tombstones_->end();
   Prev();
 }
 
 void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
-  if (tombstones_.empty()) {
-    pos_ = tombstones_.end();
+  if (tombstones_->empty()) {
+    pos_ = tombstones_->end();
     return;
   }
   RangeTombstone search(ExtractUserKey(target), ExtractUserKey(target),
                         GetInternalKeySeqno(target));
-  pos_ = std::lower_bound(tombstones_.begin(), tombstones_.end(), search,
+  pos_ = std::lower_bound(tombstones_->begin(), tombstones_->end(), search,
                           tombstone_cmp_);
 }
 
@@ -223,20 +259,24 @@ void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
 void FragmentedRangeTombstoneIterator::Next() { ++pos_; }
 
 void FragmentedRangeTombstoneIterator::Prev() {
-  if (pos_ == tombstones_.begin()) {
-    pos_ = tombstones_.end();
+  if (pos_ == tombstones_->begin()) {
+    pos_ = tombstones_->end();
     return;
   }
   --pos_;
 }
 
 bool FragmentedRangeTombstoneIterator::Valid() const {
-  return pos_ != tombstones_.end();
+  return tombstones_ != nullptr && pos_ != tombstones_->end();
 }
 
 SequenceNumber MaxCoveringTombstoneSeqnum(
     FragmentedRangeTombstoneIterator* tombstone_iter, const Slice& lookup_key,
     const Comparator* ucmp) {
+  if (tombstone_iter == nullptr) {
+    return 0;
+  }
+
   SequenceNumber snapshot = GetInternalKeySeqno(lookup_key);
   Slice user_key = ExtractUserKey(lookup_key);
diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h
index e82171787..e7b2aa573 100644
--- a/db/range_tombstone_fragmenter.h
+++ b/db/range_tombstone_fragmenter.h
@@ -17,6 +17,37 @@
 namespace rocksdb {
 
+struct FragmentedRangeTombstoneList {
+ public:
+  FragmentedRangeTombstoneList(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool one_time_use,
+      SequenceNumber snapshot = kMaxSequenceNumber);
+
+  std::vector<RangeTombstone>::const_iterator begin() const {
+    return tombstones_.begin();
+  }
+
+  std::vector<RangeTombstone>::const_iterator end() const {
+    return tombstones_.end();
+  }
+
+  bool empty() const { return tombstones_.size() == 0; }
+
+ private:
+  // Given an ordered range tombstone iterator unfragmented_tombstones,
+  // "fragment" the tombstones into non-overlapping pieces, and store them in
+  // tombstones_.
+  void FragmentTombstones(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool one_time_use,
+      SequenceNumber snapshot = kMaxSequenceNumber);
+
+  std::vector<RangeTombstone> tombstones_;
+  std::list<std::string> pinned_slices_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+};
+
 // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
 // meta block into an iterator over non-overlapping tombstone fragments. The
 // tombstone fragmentation process should be more efficient than the range
@@ -29,8 +60,11 @@ namespace rocksdb {
 class FragmentedRangeTombstoneIterator : public InternalIterator {
  public:
   FragmentedRangeTombstoneIterator(
-      std::unique_ptr<InternalIterator> unfragmented_tombstones,
-      const InternalKeyComparator& icmp, SequenceNumber snapshot);
+      const FragmentedRangeTombstoneList* tombstones,
+      const InternalKeyComparator& icmp);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+      const InternalKeyComparator& icmp);
   void SeekToFirst() override;
   void SeekToLast() override;
   void Seek(const Slice& target) override;
@@ -66,7 +100,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
   };
 
   void MaybePinKey() const {
-    if (pos_ != tombstones_.end() && pinned_pos_ != pos_) {
+    if (pos_ != tombstones_->end() && pinned_pos_ != pos_) {
       current_start_key_.Set(pos_->start_key_, pos_->seq_, kTypeRangeDeletion);
       pinned_pos_ = pos_;
     }
@@ -78,18 +112,11 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
     parsed->type = kTypeRangeDeletion;
   }
 
-  // Given an ordered range tombstone iterator unfragmented_tombstones,
-  // "fragment" the tombstones into non-overlapping pieces, and store them in
-  // tombstones_.
-  void FragmentTombstones(
-      std::unique_ptr<InternalIterator> unfragmented_tombstones,
-      SequenceNumber snapshot);
-
   const FragmentedRangeTombstoneComparator tombstone_cmp_;
   const InternalKeyComparator* icmp_;
   const Comparator* ucmp_;
-  std::vector<RangeTombstone> tombstones_;
-  std::list<std::string> pinned_slices_;
+  std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
+  const FragmentedRangeTombstoneList* tombstones_;
   std::vector<RangeTombstone>::const_iterator pos_;
   mutable std::vector<RangeTombstone>::const_iterator pinned_pos_;
   mutable InternalKey current_start_key_;
diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc
index 6f0036cb4..4bea5b4c1 100644
--- a/db/range_tombstone_fragmenter_test.cc
+++ b/db/range_tombstone_fragmenter_test.cc
@@ -87,8 +87,9 @@ void VerifyMaxCoveringTombstoneSeqnum(
 TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
   auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
   VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
                                    {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
@@ -97,8 +98,9 @@ TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
 TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
   auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter,
                             {{"a", "c", 10}, {"c", "e", 15}, {"e", "g", 15}});
   VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -109,8 +111,9 @@ TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
   auto range_del_iter = MakeRangeDelIter(
       {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter,
                             {{"a", "c", 10}, {"c", "e", 20}, {"e", "g", 15}});
   VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -121,8 +124,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
   auto range_del_iter =
       MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}});
   VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
                                    {{"a", 10}, {"b", 10}, {"c", 0}});
@@ -132,8 +136,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
   auto range_del_iter =
       MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter,
                             {{"a", "c", 10}, {"c", "e", 10}, {"e", "g", 7}});
   VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -147,8 +152,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
                                           {"a", "g", 7},
                                           {"a", "c", 3}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter,
                             {{"a", "c", 30}, {"c", "e", 20}, {"e", "g", 20}});
   VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@@ -162,8 +168,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
                                           {"j", "n", 4},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
                                     {"c", "e", 10},
                                     {"e", "g", 8},
@@ -182,8 +189,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyWithSnapshot) {
                                           {"j", "n", 4},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, 9);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */, 9);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(
       &iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}});
   VerifyMaxCoveringTombstoneSeqnum(
@@ -198,8 +206,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
                                           {"c", "g", 8},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, 9);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */, 9);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifyFragmentedRangeDels(
       &iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}});
   VerifyMaxCoveringTombstoneSeqnum(
@@ -207,6 +216,31 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
       {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
 }
 
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyMultiUse) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, false /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
+  VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+                                    {"c", "e", 10},
+                                    {"c", "e", 8},
+                                    {"c", "e", 6},
+                                    {"e", "g", 8},
+                                    {"e", "g", 6},
+                                    {"g", "i", 6},
+                                    {"j", "l", 4},
+                                    {"j", "l", 2},
+                                    {"l", "n", 4}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter, bytewise_icmp.user_comparator(),
+      {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+}
+
 TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) {
   // Same tombstones as OverlapAndRepeatedStartKey.
   auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
@@ -215,8 +249,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) {
                                           {"j", "n", 4},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifySeekForPrev(
       &iter,
       {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
@@ -230,8 +265,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevCovered) {
                                           {"j", "n", 4},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifySeekForPrev(
       &iter,
       {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
@@ -245,8 +281,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevEndKey) {
                                           {"j", "n", 4},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifySeekForPrev(&iter, {{"c", {"c", "e", 10}},
                             {"g", {"g", "i", 6}},
                             {"i", {"g", "i", 6}},
@@ -261,8 +298,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevOutOfBounds) {
                                           {"j", "n", 4},
                                           {"j", "l", 2}});
 
-  FragmentedRangeTombstoneIterator iter(std::move(range_del_iter),
-                                        bytewise_icmp, kMaxSequenceNumber);
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
   VerifySeekForPrev(&iter,
                     {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
 }
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 94901a75d..fa5e86e7f 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -379,13 +379,12 @@ Status TableCache::Get(const ReadOptions& options,
         !options.ignore_range_deletions) {
       std::unique_ptr<InternalIterator> range_del_iter(
           t->NewRangeTombstoneIterator(options));
-      FragmentedRangeTombstoneIterator fragment_iter(std::move(range_del_iter),
-                                                     internal_comparator,
-                                                     GetInternalKeySeqno(k));
-      *max_covering_tombstone_seq = std::max(
-          *max_covering_tombstone_seq,
-          MaxCoveringTombstoneSeqnum(&fragment_iter, k,
-                                     internal_comparator.user_comparator()));
+      *max_covering_tombstone_seq =
+          std::max(*max_covering_tombstone_seq,
+                   MaxCoveringTombstoneSeqnum(
+                       static_cast<FragmentedRangeTombstoneIterator*>(
+                           range_del_iter.get()),
+                       k, internal_comparator.user_comparator()));
     }
     if (s.ok()) {
       get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
diff --git a/db/version_set.cc b/db/version_set.cc
index e782a438e..0782a64a5 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1209,6 +1209,11 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
   FdWithKeyRange* f = fp.GetNextFile();
 
   while (f != nullptr) {
+    if (*max_covering_tombstone_seq > 0) {
+      // Use empty error message for speed
+      *status = Status::NotFound();
+      return;
+    }
     if (get_context.sample()) {
       sample_file_read_inc(f->file_metadata);
     }
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index b116c7220..6595462d0 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -972,20 +972,22 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
           rep->ioptions.info_log,
           "Error when seeking to range delete tombstones block from file: %s",
           s.ToString().c_str());
-    } else {
-      if (found_range_del_block && !rep->range_del_handle.IsNull()) {
-        ReadOptions read_options;
-        s = MaybeLoadDataBlockToCache(
-            prefetch_buffer.get(), rep, read_options, rep->range_del_handle,
-            Slice() /* compression_dict */, &rep->range_del_entry,
-            false /* is_index */, nullptr /* get_context */);
-        if (!s.ok()) {
-          ROCKS_LOG_WARN(
-              rep->ioptions.info_log,
-              "Encountered error while reading data from range del block %s",
-              s.ToString().c_str());
-        }
+    } else if (found_range_del_block && !rep->range_del_handle.IsNull()) {
+      ReadOptions read_options;
+      s = MaybeLoadDataBlockToCache(
+          prefetch_buffer.get(), rep, read_options, rep->range_del_handle,
+          Slice() /* compression_dict */, &rep->range_del_entry,
+          false /* is_index */, nullptr /* get_context */);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            rep->ioptions.info_log,
+            "Encountered error while reading data from range del block %s",
+            s.ToString().c_str());
      }
+      auto iter = std::unique_ptr<InternalIterator>(
+          new_table->NewUnfragmentedRangeTombstoneIterator(read_options));
+      rep->fragmented_range_dels = std::make_shared<FragmentedRangeTombstoneList>(
+          std::move(iter), internal_comparator, false /* one_time_use */);
     }
 
   bool need_upper_bound_check =
@@ -2263,6 +2265,15 @@ InternalIterator* BlockBasedTable::NewIterator(
 }
 
 InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
+    const ReadOptions& /* read_options */) {
+  if (rep_->fragmented_range_dels == nullptr) {
+    return nullptr;
+  }
+  return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels,
+                                              rep_->internal_comparator);
+}
+
+InternalIterator* BlockBasedTable::NewUnfragmentedRangeTombstoneIterator(
     const ReadOptions& read_options) {
   if (rep_->range_del_handle.IsNull()) {
     // The block didn't exist, nullptr indicates no range tombstones.
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 24a7ef24b..c3dd3d2f1 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -16,6 +16,7 @@
 #include
 #include
 
+#include "db/range_tombstone_fragmenter.h"
 #include "options/cf_options.h"
 #include "rocksdb/options.h"
 #include "rocksdb/persistent_cache.h"
@@ -384,6 +385,9 @@ class BlockBasedTable : public TableReader {
 
   friend class PartitionedFilterBlockReader;
   friend class PartitionedFilterBlockTest;
+
+  InternalIterator* NewUnfragmentedRangeTombstoneIterator(
+      const ReadOptions& read_options);
 };
 
 // Maitaning state of a two-level iteration on a partitioned index structure
@@ -511,6 +515,7 @@ struct BlockBasedTable::Rep {
   // cache is enabled.
   CachableEntry<Block> range_del_entry;
   BlockHandle range_del_handle;
+  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;
 
   // If global_seqno is used, all Keys in this file will have the same
   // seqno with value `global_seqno`.
diff --git a/table/table_test.cc b/table/table_test.cc
index 2ff62640b..f4fc82b5c 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1278,6 +1278,13 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) {
   std::vector<std::string> keys = {"1pika", "2chu"};
   std::vector<std::string> vals = {"p", "c"};
 
+  std::vector<RangeTombstone> expected_tombstones = {
+      {"1pika", "2chu", 0},
+      {"2chu", "c", 1},
+      {"2chu", "c", 0},
+      {"c", "p", 0},
+  };
+
   for (int i = 0; i < 2; i++) {
     RangeTombstone t(keys[i], vals[i], i);
     std::pair<InternalKey, Slice> p = t.Serialize();
@@ -1310,14 +1317,15 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) {
     ASSERT_FALSE(iter->Valid());
     iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
-    for (int i = 0; i < 2; i++) {
+    for (size_t i = 0; i < expected_tombstones.size(); i++) {
       ASSERT_TRUE(iter->Valid());
       ParsedInternalKey parsed_key;
       ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
       RangeTombstone t(parsed_key, iter->value());
-      ASSERT_EQ(t.start_key_, keys[i]);
-      ASSERT_EQ(t.end_key_, vals[i]);
-      ASSERT_EQ(t.seq_, i);
+      const auto& expected_t = expected_tombstones[i];
+      ASSERT_EQ(t.start_key_, expected_t.start_key_);
+      ASSERT_EQ(t.end_key_, expected_t.end_key_);
+      ASSERT_EQ(t.seq_, expected_t.seq_);
       iter->Next();
     }
     ASSERT_TRUE(!iter->Valid());
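
Usage sketch (editor's illustration, not part of the diff): the change splits one-time fragmentation, now done by FragmentedRangeTombstoneList, from iteration, done by FragmentedRangeTombstoneIterator, which only references a list. The snippet below mirrors the test code above; MakeRangeDelIter is the helper from db/range_tombstone_fragmenter_test.cc and the comparator setup is assumed to match the test fixture's bytewise_icmp.

// Sketch only: fragment once in multi-use mode, then iterate many times.
void FragmentOnceIterateMany() {
  const InternalKeyComparator bytewise_icmp(BytewiseComparator());
  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});

  // false /* one_time_use */ keeps every fragment/seqnum pair instead of
  // collapsing each fragment to its highest sequence number.
  auto fragment_list = std::make_shared<FragmentedRangeTombstoneList>(
      std::move(range_del_iter), bytewise_icmp, false /* one_time_use */);

  // The shared_ptr overload keeps the cached list alive for as long as the
  // iterator lives; this is the pattern BlockBasedTable::NewRangeTombstoneIterator
  // relies on above.
  FragmentedRangeTombstoneIterator iter(fragment_list, bytewise_icmp);
  for (iter.SeekToFirst(); iter.Valid(); iter.Next()) {
    // iter.key() is the fragment's start internal key; iter.value() is its
    // end user key.
  }
}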