diff --git a/CMakeLists.txt b/CMakeLists.txt index 47c4405ef..0fc2cce08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -625,8 +625,10 @@ set(SOURCES table/block_based/block_based_filter_block.cc table/block_based/block_based_table_builder.cc table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_iterator.cc table/block_based/block_based_table_reader.cc table/block_based/block_builder.cc + table/block_based/block_prefetcher.cc table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc @@ -639,6 +641,7 @@ set(SOURCES table/block_based/index_reader_common.cc table/block_based/parsed_full_filter_block.cc table/block_based/partitioned_filter_block.cc + table/block_based/partitioned_index_iterator.cc table/block_based/partitioned_index_reader.cc table/block_based/reader_common.cc table/block_based/uncompression_dict_reader.cc diff --git a/TARGETS b/TARGETS index 99780e3e0..17cb66804 100644 --- a/TARGETS +++ b/TARGETS @@ -236,8 +236,10 @@ cpp_library( "table/block_based/block_based_filter_block.cc", "table/block_based/block_based_table_builder.cc", "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_iterator.cc", "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", + "table/block_based/block_prefetcher.cc", "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", "table/block_based/data_block_hash_index.cc", @@ -250,6 +252,7 @@ cpp_library( "table/block_based/index_reader_common.cc", "table/block_based/parsed_full_filter_block.cc", "table/block_based/partitioned_filter_block.cc", + "table/block_based/partitioned_index_iterator.cc", "table/block_based/partitioned_index_reader.cc", "table/block_based/reader_common.cc", "table/block_based/uncompression_dict_reader.cc", diff --git a/src.mk b/src.mk index 80e6a62e6..628848e73 100644 --- a/src.mk +++ b/src.mk @@ -124,8 +124,10 @@ LIB_SOURCES = \ table/block_based/block_based_filter_block.cc \ table/block_based/block_based_table_builder.cc \ table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_iterator.cc \ table/block_based/block_based_table_reader.cc \ table/block_based/block_builder.cc \ + table/block_based/block_prefetcher.cc \ table/block_based/block_prefix_index.cc \ table/block_based/data_block_hash_index.cc \ table/block_based/data_block_footer.cc \ @@ -138,6 +140,7 @@ LIB_SOURCES = \ table/block_based/index_reader_common.cc \ table/block_based/parsed_full_filter_block.cc \ table/block_based/partitioned_filter_block.cc \ + table/block_based/partitioned_index_iterator.cc \ table/block_based/partitioned_index_reader.cc \ table/block_based/reader_common.cc \ table/block_based/uncompression_dict_reader.cc \ diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc new file mode 100644 index 000000000..603c5af38 --- /dev/null +++ b/table/block_based/block_based_table_iterator.cc @@ -0,0 +1,377 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); } + +void BlockBasedTableIterator::SeekImpl(const Slice* target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + ResetDataIter(); + return; + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } + } + } + + if (need_seek_index) { + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + // ResetDataIter() will invalidate block_iter_. Thus, there is no need to + // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound + // as that will be done later when the data block is actually read. + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } else { + // When the user does a reseek, the iterate_upper_bound might have + // changed. CheckDataBlockWithinUpperBound() needs to be called + // explicitly if the reseek ends up in the same data block. + // If the reseek ends up in a different block, InitDataBlock() will do + // the iterator upper bound check. + CheckDataBlockWithinUpperBound(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || icomp_.Compare(*target, key()) <= 0); + } +} + +void BlockBasedTableIterator::SeekForPrev(const Slice& target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + // For now totally disable prefix seek in auto prefix mode because we don't + // have logic + if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely to contain the position for `target`, the + // same as Seek(), rather than than before. 
+ // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is difference is when they seek to a position + // in the boundary. For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. In this case, we'll + // end up with reading two blocks. + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; +} + +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. 
+    block_prefetcher_.PrefetchIfNeeded(rep, data_block_handle,
+                                       read_options_.readahead_size,
+                                       is_for_compaction);
+
+    Status s;
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_, data_block_handle, &block_iter_, BlockType::kData,
+        /*get_context=*/nullptr, &lookup_context_, s,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction);
+    block_iter_points_to_real_block_ = true;
+    CheckDataBlockWithinUpperBound();
+  }
+}
+
+bool BlockBasedTableIterator::MaterializeCurrentBlock() {
+  assert(is_at_first_key_from_index_);
+  assert(!block_iter_points_to_real_block_);
+  assert(index_iter_->Valid());
+
+  is_at_first_key_from_index_ = false;
+  InitDataBlock();
+  assert(block_iter_points_to_real_block_);
+  block_iter_.SeekToFirst();
+
+  if (!block_iter_.Valid() ||
+      icomp_.Compare(block_iter_.key(),
+                     index_iter_->value().first_internal_key) != 0) {
+    // Uh oh.
+    block_iter_.Invalidate(Status::Corruption(
+        "first key in index doesn't match first key in block"));
+    return false;
+  }
+
+  return true;
+}
+
+void BlockBasedTableIterator::FindKeyForward() {
+  // This method's code is kept short to make it likely to be inlined.
+
+  assert(!is_out_of_bound_);
+  assert(block_iter_points_to_real_block_);
+
+  if (!block_iter_.Valid()) {
+    // This is the only call site of FindBlockForward(), but it's extracted
+    // into a separate method to keep FindKeyForward() short and likely to be
+    // inlined. When transitioning to a different block, we call
+    // FindBlockForward(), which is much longer and is probably not inlined.
+    FindBlockForward();
+  } else {
+    // This is the fast path that avoids a function call.
+  }
+}
+
+void BlockBasedTableIterator::FindBlockForward() {
+  // TODO: the while loop is inherited from two-level-iterator. We don't know
+  // whether a block can be empty; if it cannot, the loop can be replaced by
+  // an "if".
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+    // Whether the next data block is out of the upper bound, if there is one.
+    const bool next_block_is_out_of_bound =
+        read_options_.iterate_upper_bound != nullptr &&
+        block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
+    assert(!next_block_is_out_of_bound ||
+           user_comparator_.CompareWithoutTimestamp(
+               *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
+               index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
+    ResetDataIter();
+    index_iter_->Next();
+    if (next_block_is_out_of_bound) {
+      // The next block is out of bound. No need to read it.
+      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
+      // We need to make sure this is not the last data block before setting
+      // is_out_of_bound_, since the index key for the last data block can be
+      // larger than the smallest key of the next file on the same level.
+      if (index_iter_->Valid()) {
+        is_out_of_bound_ = true;
+      }
+      return;
+    }
+
+    if (!index_iter_->Valid()) {
+      return;
+    }
+
+    IndexValue v = index_iter_->value();
+
+    // TODO(kolmike): Remove the != kBlockCacheTier condition.
+    if (!v.first_internal_key.empty() &&
+        read_options_.read_tier != kBlockCacheTier) {
+      // Index contains the first key of the block. Defer reading the block.
+      is_at_first_key_from_index_ = true;
+      return;
+    }
+
+    InitDataBlock();
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
+
+void BlockBasedTableIterator::FindKeyBackward() {
+  while (!block_iter_.Valid()) {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    ResetDataIter();
+    index_iter_->Prev();
+
+    if (index_iter_->Valid()) {
+      InitDataBlock();
+      block_iter_.SeekToLast();
+    } else {
+      return;
+    }
+  }
+
+  // We could have checked the lower bound here too, but we opt not to do it
+  // for code simplicity.
+}
+
+void BlockBasedTableIterator::CheckOutOfBound() {
+  if (read_options_.iterate_upper_bound != nullptr && Valid()) {
+    is_out_of_bound_ =
+        user_comparator_.CompareWithoutTimestamp(
+            *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
+            /*b_has_ts=*/true) <= 0;
+  }
+}
+
+void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
+  if (read_options_.iterate_upper_bound != nullptr &&
+      block_iter_points_to_real_block_) {
+    data_block_within_upper_bound_ =
+        (user_comparator_.CompareWithoutTimestamp(
+             *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
+             index_iter_->user_key(),
+             /*b_has_ts=*/true) > 0);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 1eadc5fc8..0ad8cb99f 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -10,38 +10,36 @@
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/block_based_table_reader_impl.h"
+#include "table/block_based/block_prefetcher.h"
 #include "table/block_based/reader_common.h"
 
 namespace ROCKSDB_NAMESPACE {
 // Iterates over the contents of BlockBasedTable.
-template <class TBlockIter, typename TValue = Slice>
-class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
+class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // compaction_readahead_size: its value will only be used if for_compaction =
   // true
  public:
-  BlockBasedTableIterator(const BlockBasedTable* table,
-                          const ReadOptions& read_options,
-                          const InternalKeyComparator& icomp,
-                          InternalIteratorBase<IndexValue>* index_iter,
-                          bool check_filter, bool need_upper_bound_check,
-                          const SliceTransform* prefix_extractor,
-                          BlockType block_type, TableReaderCaller caller,
-                          size_t compaction_readahead_size = 0)
+  BlockBasedTableIterator(
+      const BlockBasedTable* table, const ReadOptions& read_options,
+      const InternalKeyComparator& icomp,
+      std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
+      bool check_filter, bool need_upper_bound_check,
+      const SliceTransform* prefix_extractor, TableReaderCaller caller,
+      size_t compaction_readahead_size = 0)
       : table_(table),
        read_options_(read_options),
        icomp_(icomp),
        user_comparator_(icomp.user_comparator()),
-        index_iter_(index_iter),
+        index_iter_(std::move(index_iter)),
        pinned_iters_mgr_(nullptr),
        block_iter_points_to_real_block_(false),
        check_filter_(check_filter),
        need_upper_bound_check_(need_upper_bound_check),
        prefix_extractor_(prefix_extractor),
-        block_type_(block_type),
        lookup_context_(caller),
-        compaction_readahead_size_(compaction_readahead_size) {}
+        block_prefetcher_(compaction_readahead_size) {}
 
-  ~BlockBasedTableIterator() { delete index_iter_; }
+  ~BlockBasedTableIterator() {}
 
   void Seek(const Slice& target) override;
   void SeekForPrev(const Slice& target) override;
@@ -71,7 +69,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
       return block_iter_.user_key();
     }
   }
-  TValue value() const override {
+  Slice value() const override {
     assert(Valid());
 
     // Load current block if not loaded.
@@ -80,7 +78,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
             ->MaterializeCurrentBlock()) {
       // Oops, index is not consistent with block contents, but we have
       // no good way to report error at this point. Let's return empty value.
-      return TValue();
+      return Slice();
    }
 
    return block_iter_.value();
@@ -152,9 +150,9 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
   const ReadOptions read_options_;
   const InternalKeyComparator& icomp_;
   UserComparatorWrapper user_comparator_;
-  InternalIteratorBase<IndexValue>* index_iter_;
+  std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
   PinnedIteratorsManager* pinned_iters_mgr_;
-  TBlockIter block_iter_;
+  DataBlockIter block_iter_;
 
   // True if block_iter_ is initialized and points to the same block
   // as index iterator.
@@ -170,17 +168,10 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
   // TODO(Zhongyi): pick a better name
   bool need_upper_bound_check_;
   const SliceTransform* prefix_extractor_;
-  BlockType block_type_;
   uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
   BlockCacheLookupContext lookup_context_;
-  // Readahead size used in compaction, its value is used only if
-  // lookup_context_.caller = kCompaction.
-  size_t compaction_readahead_size_;
-  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
-  size_t readahead_limit_ = 0;
-  int64_t num_file_reads_ = 0;
-  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+  BlockPrefetcher block_prefetcher_;
 
   // If `target` is null, seek to first.
   void SeekImpl(const Slice* target);
@@ -218,440 +209,4 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
     return true;
   }
 };
-
-// Functions below cannot be moved to .cc file because the class is a template
-// The template is in place so that block based table iterator can be served
-// partitioned index too. However, the logic is kind of different between the
-// two. So we may think of de-template them by having a separate iterator
-// for partitioned index.
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
-  SeekImpl(&target);
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
-  SeekImpl(nullptr);
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
-    const Slice* target) {
-  is_out_of_bound_ = false;
-  is_at_first_key_from_index_ = false;
-  if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
-    ResetDataIter();
-    return;
-  }
-
-  bool need_seek_index = true;
-  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
-    // Reseek.
-    prev_block_offset_ = index_iter_->value().handle.offset();
-
-    if (target) {
-      // We can avoid an index seek if:
-      // 1. The new seek key is larger than the current key
-      // 2. The new seek key is within the upper bound of the block
-      // Since we don't necessarily know the internal key for either
-      // the current key or the upper bound, we check user keys and
-      // exclude the equality case. Considering internal keys can
-      // improve for the boundary cases, but it would complicate the
-      // code.
-      if (user_comparator_.Compare(ExtractUserKey(*target),
-                                   block_iter_.user_key()) > 0 &&
-          user_comparator_.Compare(ExtractUserKey(*target),
-                                   index_iter_->user_key()) < 0) {
-        need_seek_index = false;
-      }
-    }
-  }
-
-  if (need_seek_index) {
-    if (target) {
-      index_iter_->Seek(*target);
-    } else {
-      index_iter_->SeekToFirst();
-    }
-
-    if (!index_iter_->Valid()) {
-      ResetDataIter();
-      return;
-    }
-  }
-
-  IndexValue v = index_iter_->value();
-  const bool same_block = block_iter_points_to_real_block_ &&
-                          v.handle.offset() == prev_block_offset_;
-
-  // TODO(kolmike): Remove the != kBlockCacheTier condition.
-  if (!v.first_internal_key.empty() && !same_block &&
-      (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
-      read_options_.read_tier != kBlockCacheTier) {
-    // Index contains the first key of the block, and it's >= target.
-    // We can defer reading the block.
-    is_at_first_key_from_index_ = true;
-    // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
-    // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
-    // as that will be done later when the data block is actually read.
-    ResetDataIter();
-  } else {
-    // Need to use the data block.
-    if (!same_block) {
-      InitDataBlock();
-    } else {
-      // When the user does a reseek, the iterate_upper_bound might have
-      // changed. CheckDataBlockWithinUpperBound() needs to be called
-      // explicitly if the reseek ends up in the same data block.
-      // If the reseek ends up in a different block, InitDataBlock() will do
-      // the iterator upper bound check.
-      CheckDataBlockWithinUpperBound();
-    }
-
-    if (target) {
-      block_iter_.Seek(*target);
-    } else {
-      block_iter_.SeekToFirst();
-    }
-    FindKeyForward();
-  }
-
-  CheckOutOfBound();
-
-  if (target) {
-    assert(!Valid() || ((block_type_ == BlockType::kIndex &&
-                         !table_->get_rep()->index_key_includes_seq)
-                            ? (user_comparator_.Compare(ExtractUserKey(*target),
-                                                        key()) <= 0)
-                            : (icomp_.Compare(*target, key()) <= 0)));
-  }
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
-    const Slice& target) {
-  is_out_of_bound_ = false;
-  is_at_first_key_from_index_ = false;
-  // For now totally disable prefix seek in auto prefix mode because we don't
-  // have logic
-  if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
-    ResetDataIter();
-    return;
-  }
-
-  SavePrevIndexValue();
-
-  // Call Seek() rather than SeekForPrev() in the index block, because the
-  // target data block will likely to contain the position for `target`, the
-  // same as Seek(), rather than than before.
-  // For example, if we have three data blocks, each containing two keys:
-  //   [2, 4]  [6, 8] [10, 12]
-  //  (the keys in the index block would be [4, 8, 12])
-  // and the user calls SeekForPrev(7), we need to go to the second block,
-  // just like if they call Seek(7).
-  // The only case where the block is difference is when they seek to a position
-  // in the boundary. For example, if they SeekForPrev(5), we should go to the
-  // first block, rather than the second. However, we don't have the information
-  // to distinguish the two unless we read the second block. In this case, we'll
-  // end up with reading two blocks.
-  index_iter_->Seek(target);
-
-  if (!index_iter_->Valid()) {
-    auto seek_status = index_iter_->status();
-    // Check for IO error
-    if (!seek_status.IsNotFound() && !seek_status.ok()) {
-      ResetDataIter();
-      return;
-    }
-
-    // With prefix index, Seek() returns NotFound if the prefix doesn't exist
-    if (seek_status.IsNotFound()) {
-      // Any key less than the target is fine for prefix seek
-      ResetDataIter();
-      return;
-    } else {
-      index_iter_->SeekToLast();
-    }
-    // Check for IO error
-    if (!index_iter_->Valid()) {
-      ResetDataIter();
-      return;
-    }
-  }
-
-  InitDataBlock();
-
-  block_iter_.SeekForPrev(target);
-
-  FindKeyBackward();
-  CheckDataBlockWithinUpperBound();
-  assert(!block_iter_.Valid() ||
-         icomp_.Compare(target, block_iter_.key()) >= 0);
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
-  is_out_of_bound_ = false;
-  is_at_first_key_from_index_ = false;
-  SavePrevIndexValue();
-  index_iter_->SeekToLast();
-  if (!index_iter_->Valid()) {
-    ResetDataIter();
-    return;
-  }
-  InitDataBlock();
-  block_iter_.SeekToLast();
-  FindKeyBackward();
-  CheckDataBlockWithinUpperBound();
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
-  if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
-    return;
-  }
-  assert(block_iter_points_to_real_block_);
-  block_iter_.Next();
-  FindKeyForward();
-  CheckOutOfBound();
-}
-
-template <class TBlockIter, typename TValue>
-bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
-    IterateResult* result) {
-  Next();
-  bool is_valid = Valid();
-  if (is_valid) {
-    result->key = key();
-    result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
-  }
-  return is_valid;
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
-  if (is_at_first_key_from_index_) {
-    is_at_first_key_from_index_ = false;
-
-    index_iter_->Prev();
-    if (!index_iter_->Valid()) {
-      return;
-    }
-
-    InitDataBlock();
-    block_iter_.SeekToLast();
-  } else {
-    assert(block_iter_points_to_real_block_);
-    block_iter_.Prev();
-  }
-
-  FindKeyBackward();
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
-  BlockHandle data_block_handle = index_iter_->value().handle;
-  if (!block_iter_points_to_real_block_ ||
-      data_block_handle.offset() != prev_block_offset_ ||
-      // if previous attempt of reading the block missed cache, try again
-      block_iter_.status().IsIncomplete()) {
-    if (block_iter_points_to_real_block_) {
-      ResetDataIter();
-    }
-    auto* rep = table_->get_rep();
-
-    // Prefetch additional data for range scans (iterators). Enabled only for
-    // user reads.
-    // Implicit auto readahead:
-    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
-    // Explicit user requested readahead:
-    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
-    if (lookup_context_.caller != TableReaderCaller::kCompaction) {
-      if (read_options_.readahead_size == 0) {
-        // Implicit auto readahead
-        num_file_reads_++;
-        if (num_file_reads_ >
-            BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
-          if (!rep->file->use_direct_io() &&
-              (data_block_handle.offset() +
-                   static_cast<size_t>(block_size(data_block_handle)) >
-               readahead_limit_)) {
-            // Buffered I/O
-            // Discarding the return status of Prefetch calls intentionally, as
-            // we can fallback to reading from disk if Prefetch fails.
-            rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
-            readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
-                                                   readahead_size_);
-            // Keep exponentially increasing readahead size until
-            // kMaxAutoReadaheadSize.
-            readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
-                                       readahead_size_ * 2);
-          } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
-            // Direct I/O
-            // Let FilePrefetchBuffer take care of the readahead.
-            rep->CreateFilePrefetchBuffer(
-                BlockBasedTable::kInitAutoReadaheadSize,
-                BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
-          }
-        }
-      } else if (!prefetch_buffer_) {
-        // Explicit user requested readahead
-        // The actual condition is:
-        // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
-        rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
-                                      read_options_.readahead_size,
-                                      &prefetch_buffer_);
-      }
-    } else if (!prefetch_buffer_) {
-      rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
-                                    compaction_readahead_size_,
-                                    &prefetch_buffer_);
-    }
-
-    Status s;
-    table_->NewDataBlockIterator<TBlockIter>(
-        read_options_, data_block_handle, &block_iter_, block_type_,
-        /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
-        /*for_compaction=*/lookup_context_.caller ==
-            TableReaderCaller::kCompaction);
-    block_iter_points_to_real_block_ = true;
-    CheckDataBlockWithinUpperBound();
-  }
-}
-
-template <class TBlockIter, typename TValue>
-bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
-  assert(is_at_first_key_from_index_);
-  assert(!block_iter_points_to_real_block_);
-  assert(index_iter_->Valid());
-
-  is_at_first_key_from_index_ = false;
-  InitDataBlock();
-  assert(block_iter_points_to_real_block_);
-  block_iter_.SeekToFirst();
-
-  if (!block_iter_.Valid() ||
-      icomp_.Compare(block_iter_.key(),
-                     index_iter_->value().first_internal_key) != 0) {
-    // Uh oh.
-    block_iter_.Invalidate(Status::Corruption(
-        "first key in index doesn't match first key in block"));
-    return false;
-  }
-
-  return true;
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
-  // This method's code is kept short to make it likely to be inlined.
-
-  assert(!is_out_of_bound_);
-  assert(block_iter_points_to_real_block_);
-
-  if (!block_iter_.Valid()) {
-    // This is the only call site of FindBlockForward(), but it's extracted into
-    // a separate method to keep FindKeyForward() short and likely to be
-    // inlined. When transitioning to a different block, we call
-    // FindBlockForward(), which is much longer and is probably not inlined.
-    FindBlockForward();
-  } else {
-    // This is the fast path that avoids a function call.
-  }
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
-  // TODO the while loop inherits from two-level-iterator. We don't know
-  // whether a block can be empty so it can be replaced by an "if".
-  do {
-    if (!block_iter_.status().ok()) {
-      return;
-    }
-    // Whether next data block is out of upper bound, if there is one.
-    const bool next_block_is_out_of_bound =
-        read_options_.iterate_upper_bound != nullptr &&
-        block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
-    assert(!next_block_is_out_of_bound ||
-           user_comparator_.CompareWithoutTimestamp(
-               *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
-               index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
-    ResetDataIter();
-    index_iter_->Next();
-    if (next_block_is_out_of_bound) {
-      // The next block is out of bound. No need to read it.
-      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
-      // We need to make sure this is not the last data block before setting
-      // is_out_of_bound_, since the index key for the last data block can be
-      // larger than smallest key of the next file on the same level.
-      if (index_iter_->Valid()) {
-        is_out_of_bound_ = true;
-      }
-      return;
-    }
-
-    if (!index_iter_->Valid()) {
-      return;
-    }
-
-    IndexValue v = index_iter_->value();
-
-    // TODO(kolmike): Remove the != kBlockCacheTier condition.
-    if (!v.first_internal_key.empty() &&
-        read_options_.read_tier != kBlockCacheTier) {
-      // Index contains the first key of the block. Defer reading the block.
-      is_at_first_key_from_index_ = true;
-      return;
-    }
-
-    InitDataBlock();
-    block_iter_.SeekToFirst();
-  } while (!block_iter_.Valid());
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
-  while (!block_iter_.Valid()) {
-    if (!block_iter_.status().ok()) {
-      return;
-    }
-
-    ResetDataIter();
-    index_iter_->Prev();
-
-    if (index_iter_->Valid()) {
-      InitDataBlock();
-      block_iter_.SeekToLast();
-    } else {
-      return;
-    }
-  }
-
-  // We could have check lower bound here too, but we opt not to do it for
-  // code simplicity.
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
-  if (read_options_.iterate_upper_bound != nullptr && Valid()) {
-    is_out_of_bound_ =
-        user_comparator_.CompareWithoutTimestamp(
-            *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
-            /*b_has_ts=*/true) <= 0;
-  }
-}
-
-template <class TBlockIter, typename TValue>
-void BlockBasedTableIterator<TBlockIter,
-                             TValue>::CheckDataBlockWithinUpperBound() {
-  if (read_options_.iterate_upper_bound != nullptr &&
-      block_iter_points_to_real_block_) {
-    data_block_within_upper_bound_ =
-        (user_comparator_.CompareWithoutTimestamp(
-             *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
-             index_iter_->user_key(),
-             /*b_has_ts=*/true) > 0);
-  }
-}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 3c619d9ea..3206b258c 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2025,31 +2025,25 @@ InternalIterator* BlockBasedTable::NewIterator(
   bool need_upper_bound_check =
       read_options.auto_prefix_mode ||
       PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator(
+      read_options,
+      need_upper_bound_check &&
+          rep_->index_type == BlockBasedTableOptions::kHashSearch,
+      /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context));
   if (arena == nullptr) {
-    return new BlockBasedTableIterator<DataBlockIter>(
-        this, read_options, rep_->internal_comparator,
-        NewIndexIterator(
-            read_options,
-            need_upper_bound_check &&
-                rep_->index_type == BlockBasedTableOptions::kHashSearch,
-            /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
+    return new BlockBasedTableIterator(
+        this, read_options, rep_->internal_comparator, std::move(index_iter),
         !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
-        need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
+        need_upper_bound_check, prefix_extractor, caller,
        compaction_readahead_size);
   } else {
-    auto* mem =
-        arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
-    return new (mem) BlockBasedTableIterator<DataBlockIter>(
-        this, read_options, rep_->internal_comparator,
-        NewIndexIterator(
-            read_options,
-            need_upper_bound_check &&
-                rep_->index_type == BlockBasedTableOptions::kHashSearch,
-            /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
+    auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
+    return new (mem) BlockBasedTableIterator(
+        this, read_options, rep_->internal_comparator, std::move(index_iter),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
-        need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
+        need_upper_bound_check, prefix_extractor, caller,
        compaction_readahead_size);
   }
 }
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 536fad2c0..de914d337 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -11,9 +11,9 @@
 #include "db/range_tombstone_fragmenter.h"
 #include "file/filename.h"
-#include "table/block_based/cachable_entry.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
 #include "table/block_based/filter_block.h"
 #include "table/block_based/uncompression_dict_reader.h"
 #include "table/table_properties_internal.h"
diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc
new file mode 100644
index 000000000..087a96864
--- /dev/null
+++ b/table/block_based/block_prefetcher.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/block_prefetcher.h"
+
+namespace ROCKSDB_NAMESPACE {
+void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
+                                       const BlockHandle& handle,
+                                       size_t readahead_size,
+                                       bool is_for_compaction) {
+  if (!is_for_compaction) {
+    if (readahead_size == 0) {
+      // Implicit auto readahead
+      num_file_reads_++;
+      if (num_file_reads_ >
+          BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
+        if (!rep->file->use_direct_io() &&
+            (handle.offset() + static_cast<size_t>(block_size(handle)) >
+             readahead_limit_)) {
+          // Buffered I/O
+          // Discarding the return status of Prefetch calls intentionally, as
+          // we can fall back to reading from disk if Prefetch fails.
+          rep->file->Prefetch(handle.offset(), readahead_size_);
+          readahead_limit_ =
+              static_cast<size_t>(handle.offset() + readahead_size_);
+          // Keep exponentially increasing readahead size until
+          // kMaxAutoReadaheadSize.
+          readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
+                                     readahead_size_ * 2);
+        } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
+          // Direct I/O
+          // Let FilePrefetchBuffer take care of the readahead.
+          rep->CreateFilePrefetchBuffer(BlockBasedTable::kInitAutoReadaheadSize,
+                                        BlockBasedTable::kMaxAutoReadaheadSize,
+                                        &prefetch_buffer_);
+        }
+      }
+    } else if (!prefetch_buffer_) {
+      // Explicit user requested readahead
+      // The actual condition is:
+      // if (readahead_size != 0 && !prefetch_buffer_)
+      rep->CreateFilePrefetchBuffer(readahead_size, readahead_size,
+                                    &prefetch_buffer_);
+    }
+  } else if (!prefetch_buffer_) {
+    rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
+                                  compaction_readahead_size_,
+                                  &prefetch_buffer_);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h
new file mode 100644
index 000000000..ee3b61f5c
--- /dev/null
+++ b/table/block_based/block_prefetcher.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+class BlockPrefetcher {
+ public:
+  explicit BlockPrefetcher(size_t compaction_readahead_size)
+      : compaction_readahead_size_(compaction_readahead_size) {}
+  void PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
+                        const BlockHandle& handle, size_t readahead_size,
+                        bool is_for_compaction);
+  FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); }
+
+ private:
+  // Readahead size used in compaction; its value is used only when the
+  // caller is TableReaderCaller::kCompaction.
+  size_t compaction_readahead_size_;
+
+  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
+  size_t readahead_limit_ = 0;
+  int64_t num_file_reads_ = 0;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_index_iterator.cc b/table/block_based/partitioned_index_iterator.cc
new file mode 100644
index 000000000..2325ea7bd
--- /dev/null
+++ b/table/block_based/partitioned_index_iterator.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/partitioned_index_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+void ParititionedIndexIterator::Seek(const Slice& target) { SeekImpl(&target); }
+
+void ParititionedIndexIterator::SeekToFirst() { SeekImpl(nullptr); }
+
+void ParititionedIndexIterator::SeekImpl(const Slice* target) {
+  if (target) {
+    index_iter_->Seek(*target);
+  } else {
+    index_iter_->SeekToFirst();
+  }
+
+  if (!index_iter_->Valid()) {
+    ResetPartitionedIndexIter();
+    return;
+  }
+
+  IndexValue v = index_iter_->value();
+  const bool same_block = block_iter_points_to_real_block_ &&
+                          v.handle.offset() == prev_block_offset_;
+
+  if (!same_block) {
+    InitPartitionedIndexBlock();
+  }
+
+  if (target) {
+    block_iter_.Seek(*target);
+  } else {
+    block_iter_.SeekToFirst();
+  }
+  FindKeyForward();
+
+  // We could check the upper bound here, but that would be too complicated,
+  // and checking the index upper bound is less useful than for data blocks.
+
+  if (target) {
+    assert(!Valid() || (table_->get_rep()->index_key_includes_seq
+                            ? (icomp_.Compare(*target, key()) <= 0)
+                            : (user_comparator_.Compare(ExtractUserKey(*target),
+                                                        key()) <= 0)));
+  }
+}
+
+void ParititionedIndexIterator::SeekToLast() {
+  SavePrevIndexValue();
+  index_iter_->SeekToLast();
+  if (!index_iter_->Valid()) {
+    ResetPartitionedIndexIter();
+    return;
+  }
+  InitPartitionedIndexBlock();
+  block_iter_.SeekToLast();
+  FindKeyBackward();
+}
+
+void ParititionedIndexIterator::Next() {
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Next();
+  FindKeyForward();
+}
+
+void ParititionedIndexIterator::Prev() {
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Prev();
+
+  FindKeyBackward();
+}
+
+void ParititionedIndexIterator::InitPartitionedIndexBlock() {
+  BlockHandle partitioned_index_handle = index_iter_->value().handle;
+  if (!block_iter_points_to_real_block_ ||
+      partitioned_index_handle.offset() != prev_block_offset_ ||
+      // if previous attempt of reading the block missed cache, try again
+      block_iter_.status().IsIncomplete()) {
+    if (block_iter_points_to_real_block_) {
+      ResetPartitionedIndexIter();
+    }
+    auto* rep = table_->get_rep();
+    bool is_for_compaction =
+        lookup_context_.caller == TableReaderCaller::kCompaction;
+    // Prefetch additional data for range scans (iterators).
+    // Implicit auto readahead:
+    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+    // Explicit user requested readahead:
+    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
+    block_prefetcher_.PrefetchIfNeeded(rep, partitioned_index_handle,
+                                       read_options_.readahead_size,
+                                       is_for_compaction);
+
+    Status s;
+    table_->NewDataBlockIterator<IndexBlockIter>(
+        read_options_, partitioned_index_handle, &block_iter_,
+        BlockType::kIndex,
+        /*get_context=*/nullptr, &lookup_context_, s,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction);
+    block_iter_points_to_real_block_ = true;
+    // We could check the upper bound here, but it is complicated to reason
+    // about the upper bound in an index iterator. On the other hand, in large
+    // scans, index iterators are moved much less frequently than data block
+    // iterators, so the upper bound check is skipped for simplicity.
+  }
+}
+
+void ParititionedIndexIterator::FindKeyForward() {
+  // This method's code is kept short to make it likely to be inlined.
+
+  assert(block_iter_points_to_real_block_);
+
+  if (!block_iter_.Valid()) {
+    // This is the only call site of FindBlockForward(), but it's extracted
+    // into a separate method to keep FindKeyForward() short and likely to be
+    // inlined. When transitioning to a different block, we call
+    // FindBlockForward(), which is much longer and is probably not inlined.
+    FindBlockForward();
+  } else {
+    // This is the fast path that avoids a function call.
+  }
+}
+
+void ParititionedIndexIterator::FindBlockForward() {
+  // TODO: the while loop is inherited from two-level-iterator. We don't know
+  // whether a block can be empty; if it cannot, the loop can be replaced by
+  // an "if".
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+    ResetPartitionedIndexIter();
+    index_iter_->Next();
+
+    if (!index_iter_->Valid()) {
+      return;
+    }
+
+    InitPartitionedIndexBlock();
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
+
+void ParititionedIndexIterator::FindKeyBackward() {
+  while (!block_iter_.Valid()) {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    ResetPartitionedIndexIter();
+    index_iter_->Prev();
+
+    if (index_iter_->Valid()) {
+      InitPartitionedIndexBlock();
+      block_iter_.SeekToLast();
+    } else {
+      return;
+    }
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_index_iterator.h b/table/block_based/partitioned_index_iterator.h
new file mode 100644
index 000000000..a8f75d5ec
--- /dev/null
+++ b/table/block_based/partitioned_index_iterator.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+
+#include "table/block_based/block_based_table_reader_impl.h"
+#include "table/block_based/block_prefetcher.h"
+#include "table/block_based/reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Iterator that iterates over partitioned index.
+// Some upper and lower bound tricks played in block based table iterators
+// could be played here, but it's too complicated to reason about index
+// keys with upper or lower bound, so we skip it for simplicity.
+class ParititionedIndexIterator : public InternalIteratorBase<IndexValue> {
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+ public:
+  ParititionedIndexIterator(
+      const BlockBasedTable* table, const ReadOptions& read_options,
+      const InternalKeyComparator& icomp,
+      std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
+      TableReaderCaller caller, size_t compaction_readahead_size = 0)
+      : table_(table),
+        read_options_(read_options),
+        icomp_(icomp),
+        user_comparator_(icomp.user_comparator()),
+        index_iter_(std::move(index_iter)),
+        block_iter_points_to_real_block_(false),
+        lookup_context_(caller),
+        block_prefetcher_(compaction_readahead_size) {}
+
+  ~ParititionedIndexIterator() {}
+
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice&) override {
+    // Shouldn't be called.
+    assert(false);
+  }
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult*) override {
+    assert(false);
+    return false;
+  }
+  void Prev() override;
+  bool Valid() const override {
+    return block_iter_points_to_real_block_ && block_iter_.Valid();
+  }
+  Slice key() const override {
+    assert(Valid());
+    return block_iter_.key();
+  }
+  Slice user_key() const override {
+    assert(Valid());
+    return block_iter_.user_key();
+  }
+  IndexValue value() const override {
+    assert(Valid());
+    return block_iter_.value();
+  }
+  Status status() const override {
+    // Prefix index sets status to NotFound when the prefix does not exist
+    if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
+      return index_iter_->status();
+    } else if (block_iter_points_to_real_block_) {
+      return block_iter_.status();
+    } else {
+      return Status::OK();
+    }
+  }
+
+  // Whether the iterator was invalidated for being out of bound.
+  bool IsOutOfBound() override {
+    // Shouldn't be called.
+    assert(false);
+    return false;
+  }
+
+  inline bool MayBeOutOfUpperBound() override {
+    // Shouldn't be called.
+    assert(false);
+    return true;
+  }
+  void SetPinnedItersMgr(PinnedIteratorsManager*) override {
+    // Shouldn't be called.
+    assert(false);
+  }
+  bool IsKeyPinned() const override {
+    // Shouldn't be called.
+    assert(false);
+    return false;
+  }
+  bool IsValuePinned() const override {
+    // Shouldn't be called.
+    assert(false);
+    return false;
+  }
+
+  void ResetPartitionedIndexIter() {
+    if (block_iter_points_to_real_block_) {
+      block_iter_.Invalidate(Status::OK());
+      block_iter_points_to_real_block_ = false;
+    }
+  }
+
+  void SavePrevIndexValue() {
+    if (block_iter_points_to_real_block_) {
+      // Reseek. If they end up with the same data block, we shouldn't re-fetch
+      // the same data block.
+      prev_block_offset_ = index_iter_->value().handle.offset();
+    }
+  }
+
+ private:
+  const BlockBasedTable* table_;
+  const ReadOptions read_options_;
+  const InternalKeyComparator& icomp_;
+  UserComparatorWrapper user_comparator_;
+  std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
+  IndexBlockIter block_iter_;
+
+  // True if block_iter_ is initialized and points to the same block
+  // as index iterator.
+  bool block_iter_points_to_real_block_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+  BlockCacheLookupContext lookup_context_;
+  BlockPrefetcher block_prefetcher_;
+
+  // If `target` is null, seek to first.
+  void SeekImpl(const Slice* target);
+
+  void InitPartitionedIndexBlock();
+  void FindKeyForward();
+  void FindBlockForward();
+  void FindKeyBackward();
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc
index aacbb6053..c22ec4364 100644
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@@ -7,7 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "table/block_based/partitioned_index_reader.h"
-#include "table/block_based/block_based_table_iterator.h"
+#include "table/block_based/partitioned_index_iterator.h"
 
 namespace ROCKSDB_NAMESPACE {
 Status PartitionIndexReader::Create(
@@ -77,14 +77,15 @@ InternalIteratorBase<IndexValue>* PartitionIndexReader::NewIterator(
     ro.fill_cache = read_options.fill_cache;
     // We don't return pinned data from index blocks, so no need
     // to set `block_contents_pinned`.
-    it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
-        table(), ro, *internal_comparator(),
+    std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(
         index_block.GetValue()->NewIndexIterator(
             internal_comparator(), internal_comparator()->user_comparator(),
             rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats,
             true, index_has_first_key(), index_key_includes_seq(),
-            index_value_is_full()),
-        false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
+            index_value_is_full()));
+
+    it = new ParititionedIndexIterator(
+        table(), ro, *internal_comparator(), std::move(index_iter),
         lookup_context ? lookup_context->caller
                        : TableReaderCaller::kUncategorized);
   }
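
Note for reviewers on the readahead policy this change moves into BlockPrefetcher: for buffered user reads, prefetching starts only after kMinNumFileReadsToStartAutoReadahead sequential block reads, and each prefetch doubles readahead_size_ until it reaches BlockBasedTable::kMaxAutoReadaheadSize. The standalone sketch below models that growth; it is not RocksDB source, and the constants (8 KB initial size, 256 KB cap, 2-read trigger) are illustrative assumptions, not the library's values.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Illustrative model (not RocksDB source) of BlockPrefetcher's implicit
// readahead for buffered I/O: nothing is prefetched until a few sequential
// reads have been observed, then the readahead window doubles per prefetch
// up to a fixed cap. All constants are assumptions chosen for the demo.
struct ReadaheadModel {
  static constexpr size_t kInit = 8 * 1024;       // assumed initial size
  static constexpr size_t kMax = 256 * 1024;      // assumed cap
  static constexpr int64_t kMinReadsToStart = 2;  // assumed trigger

  size_t readahead_size = kInit;
  size_t readahead_limit = 0;  // end offset covered by the last prefetch
  int64_t num_file_reads = 0;

  // Returns how many bytes would be prefetched for a read of `len` bytes
  // at `offset`, or 0 if no prefetch would be issued.
  size_t OnRead(uint64_t offset, size_t len) {
    ++num_file_reads;
    if (num_file_reads <= kMinReadsToStart) return 0;   // not sequential yet
    if (offset + len <= readahead_limit) return 0;      // already covered
    size_t issued = readahead_size;
    readahead_limit = static_cast<size_t>(offset + readahead_size);
    // Keep exponentially increasing the readahead size until the cap.
    readahead_size = std::min(kMax, readahead_size * 2);
    return issued;
  }
};

int main() {
  ReadaheadModel m;
  for (uint64_t block = 0; block < 8; ++block) {
    size_t n = m.OnRead(block * 16 * 1024, 16 * 1024);
    std::printf("read %llu: prefetch %zu bytes\n",
                static_cast<unsigned long long>(block), n);
  }
  return 0;
}

Running the model shows the first two reads issue no prefetch, after which the prefetched window grows 8K, 16K, 32K, ... until it saturates at the cap; this mirrors why sequential scans get cheap readahead while point-lookup-heavy workloads never pay for it.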
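The other behavior worth understanding in this diff is the is_at_first_key_from_index_ path that SeekImpl() and FindBlockForward() set: when the index stores each data block's first key, the iterator can answer key() straight from the index entry and defer the data block read until the block's contents are actually needed (MaterializeCurrentBlock()). The toy model below illustrates only that idea under simplified assumptions: plain string user keys, no internal keys or bounds, and a hypothetical LazyIter type. It is not the RocksDB implementation.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Simplified, self-contained model (not RocksDB source) of deferring a
// block read when the index carries each block's first key.
struct IndexEntry {
  std::string first_key;          // first key of the block, stored in index
  std::vector<std::string> keys;  // stands in for the on-disk data block
};

class LazyIter {
 public:
  explicit LazyIter(std::vector<IndexEntry> index) : index_(std::move(index)) {}

  void Seek(const std::string& target) {
    for (size_t i = 0; i < index_.size(); ++i) {
      // Keep the model simple: defer only when the block's first key itself
      // is >= target, which is the case SeekImpl() checks before deferring.
      if (index_[i].first_key >= target) {
        block_ = i;
        at_first_key_from_index_ = true;  // block not read yet
        return;
      }
    }
    block_ = index_.size();  // invalid
  }

  // key() is answered from the index entry alone, with no block I/O.
  const std::string& key() const {
    assert(Valid());
    return index_[block_].first_key;
  }

  bool Valid() const { return block_ < index_.size(); }
  bool BlockLoaded() const { return !at_first_key_from_index_; }

  // value() would invoke a MaterializeCurrentBlock() analogue here, loading
  // the block and verifying its first key matches the index entry.

 private:
  std::vector<IndexEntry> index_;
  size_t block_ = 0;
  bool at_first_key_from_index_ = false;
};

int main() {
  LazyIter it({{"d", {"d", "e"}}, {"k", {"k", "m"}}});
  it.Seek("f");
  // The iterator is positioned and can report its key without having
  // touched the second block.
  assert(it.Valid() && it.key() == "k" && !it.BlockLoaded());
  return 0;
}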