fork of https://github.com/rust-rocksdb/rust-rocksdb for nextgraph
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
311 lines
11 KiB
311 lines
11 KiB
1 year ago
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||
|
// This source code is licensed under both the GPLv2 (found in the
|
||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||
|
// (found in the LICENSE.Apache file in the root directory).
|
||
|
//
|
||
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||
|
#pragma once
|
||
|
#include "table/block_based/block_based_table_reader.h"
|
||
|
#include "table/block_based/block_based_table_reader_impl.h"
|
||
|
#include "table/block_based/block_prefetcher.h"
|
||
|
#include "table/block_based/reader_common.h"
|
||
|
|
||
|
namespace ROCKSDB_NAMESPACE {
|
||
|
// Iterates over the contents of BlockBasedTable.
|
||
|
class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
|
||
|
// compaction_readahead_size: its value will only be used if for_compaction =
|
||
|
// true
|
||
|
// @param read_options Must outlive this iterator.
|
||
|
public:
|
||
|
BlockBasedTableIterator(
|
||
|
const BlockBasedTable* table, const ReadOptions& read_options,
|
||
|
const InternalKeyComparator& icomp,
|
||
|
std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
|
||
|
bool check_filter, bool need_upper_bound_check,
|
||
|
const SliceTransform* prefix_extractor, TableReaderCaller caller,
|
||
|
size_t compaction_readahead_size = 0, bool allow_unprepared_value = false)
|
||
|
: index_iter_(std::move(index_iter)),
|
||
|
table_(table),
|
||
|
read_options_(read_options),
|
||
|
icomp_(icomp),
|
||
|
user_comparator_(icomp.user_comparator()),
|
||
|
pinned_iters_mgr_(nullptr),
|
||
|
prefix_extractor_(prefix_extractor),
|
||
|
lookup_context_(caller),
|
||
|
block_prefetcher_(
|
||
|
compaction_readahead_size,
|
||
|
table_->get_rep()->table_options.initial_auto_readahead_size),
|
||
|
allow_unprepared_value_(allow_unprepared_value),
|
||
|
block_iter_points_to_real_block_(false),
|
||
|
check_filter_(check_filter),
|
||
|
need_upper_bound_check_(need_upper_bound_check),
|
||
|
async_read_in_progress_(false),
|
||
|
is_last_level_(table->IsLastLevel()) {}
|
||
|
|
||
|
~BlockBasedTableIterator() {}
|
||
|
|
||
|
void Seek(const Slice& target) override;
|
||
|
void SeekForPrev(const Slice& target) override;
|
||
|
void SeekToFirst() override;
|
||
|
void SeekToLast() override;
|
||
|
void Next() final override;
|
||
|
bool NextAndGetResult(IterateResult* result) override;
|
||
|
void Prev() override;
|
||
|
bool Valid() const override {
|
||
|
return !is_out_of_bound_ &&
|
||
|
(is_at_first_key_from_index_ ||
|
||
|
(block_iter_points_to_real_block_ && block_iter_.Valid()));
|
||
|
}
|
||
|
Slice key() const override {
|
||
|
assert(Valid());
|
||
|
if (is_at_first_key_from_index_) {
|
||
|
return index_iter_->value().first_internal_key;
|
||
|
} else {
|
||
|
return block_iter_.key();
|
||
|
}
|
||
|
}
|
||
|
Slice user_key() const override {
|
||
|
assert(Valid());
|
||
|
if (is_at_first_key_from_index_) {
|
||
|
return ExtractUserKey(index_iter_->value().first_internal_key);
|
||
|
} else {
|
||
|
return block_iter_.user_key();
|
||
|
}
|
||
|
}
|
||
|
bool PrepareValue() override {
|
||
|
assert(Valid());
|
||
|
|
||
|
if (!is_at_first_key_from_index_) {
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
return const_cast<BlockBasedTableIterator*>(this)
|
||
|
->MaterializeCurrentBlock();
|
||
|
}
|
||
|
Slice value() const override {
|
||
|
// PrepareValue() must have been called.
|
||
|
assert(!is_at_first_key_from_index_);
|
||
|
assert(Valid());
|
||
|
|
||
|
if (seek_stat_state_ & kReportOnUseful) {
|
||
|
bool filter_used = (seek_stat_state_ & kFilterUsed) != 0;
|
||
|
RecordTick(
|
||
|
table_->GetStatistics(),
|
||
|
filter_used
|
||
|
? (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH
|
||
|
: NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH)
|
||
|
: (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER
|
||
|
: NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER));
|
||
|
seek_stat_state_ = kDataBlockReadSinceLastSeek;
|
||
|
}
|
||
|
|
||
|
return block_iter_.value();
|
||
|
}
|
||
|
Status status() const override {
|
||
|
// Prefix index set status to NotFound when the prefix does not exist
|
||
|
if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
|
||
|
return index_iter_->status();
|
||
|
} else if (block_iter_points_to_real_block_) {
|
||
|
return block_iter_.status();
|
||
|
} else if (async_read_in_progress_) {
|
||
|
return Status::TryAgain();
|
||
|
} else {
|
||
|
return Status::OK();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
inline IterBoundCheck UpperBoundCheckResult() override {
|
||
|
if (is_out_of_bound_) {
|
||
|
return IterBoundCheck::kOutOfBound;
|
||
|
} else if (block_upper_bound_check_ ==
|
||
|
BlockUpperBound::kUpperBoundBeyondCurBlock) {
|
||
|
assert(!is_out_of_bound_);
|
||
|
return IterBoundCheck::kInbound;
|
||
|
} else {
|
||
|
return IterBoundCheck::kUnknown;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
|
||
|
pinned_iters_mgr_ = pinned_iters_mgr;
|
||
|
}
|
||
|
bool IsKeyPinned() const override {
|
||
|
// Our key comes either from block_iter_'s current key
|
||
|
// or index_iter_'s current *value*.
|
||
|
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
|
||
|
((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
|
||
|
(block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
|
||
|
}
|
||
|
bool IsValuePinned() const override {
|
||
|
assert(!is_at_first_key_from_index_);
|
||
|
assert(Valid());
|
||
|
|
||
|
// BlockIter::IsValuePinned() is always true. No need to check
|
||
|
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
|
||
|
block_iter_points_to_real_block_;
|
||
|
}
|
||
|
|
||
|
void ResetDataIter() {
|
||
|
if (block_iter_points_to_real_block_) {
|
||
|
if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
|
||
|
block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
|
||
|
}
|
||
|
block_iter_.Invalidate(Status::OK());
|
||
|
block_iter_points_to_real_block_ = false;
|
||
|
}
|
||
|
block_upper_bound_check_ = BlockUpperBound::kUnknown;
|
||
|
}
|
||
|
|
||
|
void SavePrevIndexValue() {
|
||
|
if (block_iter_points_to_real_block_) {
|
||
|
// Reseek. If they end up with the same data block, we shouldn't re-fetch
|
||
|
// the same data block.
|
||
|
prev_block_offset_ = index_iter_->value().handle.offset();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
|
||
|
if (block_prefetcher_.prefetch_buffer() != nullptr &&
|
||
|
read_options_.adaptive_readahead) {
|
||
|
block_prefetcher_.prefetch_buffer()->GetReadaheadState(
|
||
|
&(readahead_file_info->data_block_readahead_info));
|
||
|
if (index_iter_) {
|
||
|
index_iter_->GetReadaheadState(readahead_file_info);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
|
||
|
if (read_options_.adaptive_readahead) {
|
||
|
block_prefetcher_.SetReadaheadState(
|
||
|
&(readahead_file_info->data_block_readahead_info));
|
||
|
if (index_iter_) {
|
||
|
index_iter_->SetReadaheadState(readahead_file_info);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
|
||
|
|
||
|
private:
|
||
|
enum class IterDirection {
|
||
|
kForward,
|
||
|
kBackward,
|
||
|
};
|
||
|
// This enum indicates whether the upper bound falls into current block
|
||
|
// or beyond.
|
||
|
// +-------------+
|
||
|
// | cur block | <-- (1)
|
||
|
// +-------------+
|
||
|
// <-- (2)
|
||
|
// --- <boundary key> ---
|
||
|
// <-- (3)
|
||
|
// +-------------+
|
||
|
// | next block | <-- (4)
|
||
|
// ......
|
||
|
//
|
||
|
// When the block is smaller than <boundary key>, kUpperBoundInCurBlock
|
||
|
// is the value to use. The examples are (1) or (2) in the graph. It means
|
||
|
// all keys in the next block or beyond will be out of bound. Keys within
|
||
|
// the current block may or may not be out of bound.
|
||
|
// When the block is larger or equal to <boundary key>,
|
||
|
// kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4)
|
||
|
// in the graph. It means that all keys in the current block is within the
|
||
|
// upper bound and keys in the next block may or may not be within the uppder
|
||
|
// bound.
|
||
|
// If the boundary key hasn't been checked against the upper bound,
|
||
|
// kUnknown can be used.
|
||
|
enum class BlockUpperBound : uint8_t {
|
||
|
kUpperBoundInCurBlock,
|
||
|
kUpperBoundBeyondCurBlock,
|
||
|
kUnknown,
|
||
|
};
|
||
|
|
||
|
// State bits for collecting stats on seeks and whether they returned useful
|
||
|
// results.
|
||
|
enum SeekStatState : uint8_t {
|
||
|
kNone = 0,
|
||
|
// Most recent seek checked prefix filter (or similar future feature)
|
||
|
kFilterUsed = 1 << 0,
|
||
|
// Already recorded that a data block was accessed since the last seek.
|
||
|
kDataBlockReadSinceLastSeek = 1 << 1,
|
||
|
// Have not yet recorded that a value() was accessed.
|
||
|
kReportOnUseful = 1 << 2,
|
||
|
};
|
||
|
|
||
|
const BlockBasedTable* table_;
|
||
|
const ReadOptions& read_options_;
|
||
|
const InternalKeyComparator& icomp_;
|
||
|
UserComparatorWrapper user_comparator_;
|
||
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
||
|
DataBlockIter block_iter_;
|
||
|
const SliceTransform* prefix_extractor_;
|
||
|
uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
|
||
|
BlockCacheLookupContext lookup_context_;
|
||
|
|
||
|
BlockPrefetcher block_prefetcher_;
|
||
|
|
||
|
const bool allow_unprepared_value_;
|
||
|
// True if block_iter_ is initialized and points to the same block
|
||
|
// as index iterator.
|
||
|
bool block_iter_points_to_real_block_;
|
||
|
// See InternalIteratorBase::IsOutOfBound().
|
||
|
bool is_out_of_bound_ = false;
|
||
|
// How current data block's boundary key with the next block is compared with
|
||
|
// iterate upper bound.
|
||
|
BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
|
||
|
// True if we're standing at the first key of a block, and we haven't loaded
|
||
|
// that block yet. A call to PrepareValue() will trigger loading the block.
|
||
|
bool is_at_first_key_from_index_ = false;
|
||
|
bool check_filter_;
|
||
|
// TODO(Zhongyi): pick a better name
|
||
|
bool need_upper_bound_check_;
|
||
|
|
||
|
bool async_read_in_progress_;
|
||
|
|
||
|
mutable SeekStatState seek_stat_state_ = SeekStatState::kNone;
|
||
|
bool is_last_level_;
|
||
|
|
||
|
// If `target` is null, seek to first.
|
||
|
void SeekImpl(const Slice* target, bool async_prefetch);
|
||
|
|
||
|
void InitDataBlock();
|
||
|
void AsyncInitDataBlock(bool is_first_pass);
|
||
|
bool MaterializeCurrentBlock();
|
||
|
void FindKeyForward();
|
||
|
void FindBlockForward();
|
||
|
void FindKeyBackward();
|
||
|
void CheckOutOfBound();
|
||
|
|
||
|
// Check if data block is fully within iterate_upper_bound.
|
||
|
//
|
||
|
// Note MyRocks may update iterate bounds between seek. To workaround it,
|
||
|
// we need to check and update data_block_within_upper_bound_ accordingly.
|
||
|
void CheckDataBlockWithinUpperBound();
|
||
|
|
||
|
bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction,
|
||
|
bool* filter_checked) {
|
||
|
if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
|
||
|
// Upper bound check isn't sufficient for backward direction to
|
||
|
// guarantee the same result as total order, so disable prefix
|
||
|
// check.
|
||
|
return true;
|
||
|
}
|
||
|
if (check_filter_ &&
|
||
|
!table_->PrefixRangeMayMatch(ikey, read_options_, prefix_extractor_,
|
||
|
need_upper_bound_check_, &lookup_context_,
|
||
|
filter_checked)) {
|
||
|
// TODO remember the iterator is invalidated because of prefix
|
||
|
// match. This can avoid the upper level file iterator to falsely
|
||
|
// believe the position is the end of the SST file and move to
|
||
|
// the first key of the next file.
|
||
|
ResetDataIter();
|
||
|
return false;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
};
|
||
|
} // namespace ROCKSDB_NAMESPACE
|