From f3a7847598d89ef8f9f531b10fabb7ce044a38f8 Mon Sep 17 00:00:00 2001 From: yiwu-arbug Date: Fri, 17 May 2019 10:23:38 -0700 Subject: [PATCH] Reduce iterator key comparison for upper/lower bound check (#5111) Summary: Previously if iterator upper/lower bound presents, `DBIter` will check the bound for every key. This patch turns the check into per-file or per-data block check when applicable, by checking against either file largest/smallest key or block index key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5111 Differential Revision: D15330061 Pulled By: siying fbshipit-source-id: 8a653fe3cd50d94d81eb2d13b087326c58ee2024 --- HISTORY.md | 1 + db/db_iter.cc | 4 ++-- db/version_set.cc | 40 +++++++++++++++++++++++-------- table/block_based_table_reader.cc | 20 +++++++++------- table/block_based_table_reader.h | 9 ++++++- table/internal_iterator.h | 25 +++++++++++++++++-- table/iterator_wrapper.h | 22 +++++++++++++---- table/merging_iterator.cc | 24 +++++++++++++++++++ 8 files changed, 117 insertions(+), 28 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9cf8a88da..d45e94bb6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases +* Reduce iterator key comparision for upper/lower bound check. ### Bug Fixes diff --git a/db/db_iter.cc b/db/db_iter.cc index 1d8ccf9ad..a606e3acd 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,7 +467,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); - if (iterate_upper_bound_ != nullptr && + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -859,7 +859,7 @@ void DBIter::PrevInternal() { return; } - if (iterate_lower_bound_ != nullptr && + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. diff --git a/db/version_set.cc b/db/version_set.cc index f0dfe7658..03c590272 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -954,12 +969,16 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + may_be_out_of_lower_bound_ = + read_options_.iterate_lower_bound != nullptr && + user_comparator_.Compare(ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, + for_compaction_, nullptr /* arena */, skip_filters_, level_, + smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -975,6 +994,7 @@ class LevelIterator final : public InternalIterator { bool should_sample_; bool for_compaction_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1043,11 +1063,12 @@ void LevelIterator::SeekToLast() { void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -4278,10 +4299,9 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 576117f0d..34e409792 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2446,11 +2446,12 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { + IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -2531,6 +2532,11 @@ void BlockBasedTableIterator::InitDataBlock() { key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; + if (read_options_.iterate_upper_bound != nullptr) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } } } @@ -2543,13 +2549,9 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - } + bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 385e50ab7..8274f0cf9 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -588,7 +588,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && block_iter_points_to_real_block_ && @@ -619,6 +619,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -680,6 +685,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { TBlockIter block_iter_; bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; + // Whether current data block being fully within iterate upper bound. + bool data_block_within_upper_bound_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8f1cc9dd6..1f57399c7 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,6 +17,11 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: @@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -94,6 +108,13 @@ class InternalIteratorBase : public Cleanable { // upper bound virtual bool IsOutOfBound() { return false; } + // Keys return from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys return from this iterator can be larger or equal to + // iterate_upper_bound. + virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a570e53c1..a5aa5c49e 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,7 +56,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); @@ -65,7 +68,7 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -83,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -100,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index e5df6bdf6..244b5e82c 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from current child iterator. Potentially as long as one of child iterator + // report out of bound is not possible, we know current key is within bound. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) {