// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "table/block_based/block_based_table_iterator.h" namespace ROCKSDB_NAMESPACE { void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); } void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); } void BlockBasedTableIterator::SeekImpl(const Slice* target) { is_out_of_bound_ = false; is_at_first_key_from_index_ = false; if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { ResetDataIter(); return; } bool need_seek_index = true; if (block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. prev_block_offset_ = index_iter_->value().handle.offset(); if (target) { // We can avoid an index seek if: // 1. The new seek key is larger than the current key // 2. The new seek key is within the upper bound of the block // Since we don't necessarily know the internal key for either // the current key or the upper bound, we check user keys and // exclude the equality case. Considering internal keys can // improve for the boundary cases, but it would complicate the // code. if (user_comparator_.Compare(ExtractUserKey(*target), block_iter_.user_key()) > 0 && user_comparator_.Compare(ExtractUserKey(*target), index_iter_->user_key()) < 0) { need_seek_index = false; } } } if (need_seek_index) { if (target) { index_iter_->Seek(*target); } else { index_iter_->SeekToFirst(); } if (!index_iter_->Valid()) { ResetDataIter(); return; } } IndexValue v = index_iter_->value(); const bool same_block = block_iter_points_to_real_block_ && v.handle.offset() == prev_block_offset_; if (!v.first_internal_key.empty() && !same_block && (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && allow_unprepared_value_) { // Index contains the first key of the block, and it's >= target. // We can defer reading the block. is_at_first_key_from_index_ = true; // ResetDataIter() will invalidate block_iter_. Thus, there is no need to // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound // as that will be done later when the data block is actually read. ResetDataIter(); } else { // Need to use the data block. if (!same_block) { InitDataBlock(); } else { // When the user does a reseek, the iterate_upper_bound might have // changed. CheckDataBlockWithinUpperBound() needs to be called // explicitly if the reseek ends up in the same data block. // If the reseek ends up in a different block, InitDataBlock() will do // the iterator upper bound check. CheckDataBlockWithinUpperBound(); } if (target) { block_iter_.Seek(*target); } else { block_iter_.SeekToFirst(); } FindKeyForward(); } CheckOutOfBound(); if (target) { assert(!Valid() || icomp_.Compare(*target, key()) <= 0); } } void BlockBasedTableIterator::SeekForPrev(const Slice& target) { is_out_of_bound_ = false; is_at_first_key_from_index_ = false; // For now totally disable prefix seek in auto prefix mode because we don't // have logic if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { ResetDataIter(); return; } SavePrevIndexValue(); // Call Seek() rather than SeekForPrev() in the index block, because the // target data block will likely to contain the position for `target`, the // same as Seek(), rather than than before. // For example, if we have three data blocks, each containing two keys: // [2, 4] [6, 8] [10, 12] // (the keys in the index block would be [4, 8, 12]) // and the user calls SeekForPrev(7), we need to go to the second block, // just like if they call Seek(7). // The only case where the block is difference is when they seek to a position // in the boundary. For example, if they SeekForPrev(5), we should go to the // first block, rather than the second. However, we don't have the information // to distinguish the two unless we read the second block. In this case, we'll // end up with reading two blocks. index_iter_->Seek(target); if (!index_iter_->Valid()) { auto seek_status = index_iter_->status(); // Check for IO error if (!seek_status.IsNotFound() && !seek_status.ok()) { ResetDataIter(); return; } // With prefix index, Seek() returns NotFound if the prefix doesn't exist if (seek_status.IsNotFound()) { // Any key less than the target is fine for prefix seek ResetDataIter(); return; } else { index_iter_->SeekToLast(); } // Check for IO error if (!index_iter_->Valid()) { ResetDataIter(); return; } } InitDataBlock(); block_iter_.SeekForPrev(target); FindKeyBackward(); CheckDataBlockWithinUpperBound(); assert(!block_iter_.Valid() || icomp_.Compare(target, block_iter_.key()) >= 0); } void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; is_at_first_key_from_index_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); if (!index_iter_->Valid()) { ResetDataIter(); return; } InitDataBlock(); block_iter_.SeekToLast(); FindKeyBackward(); CheckDataBlockWithinUpperBound(); } void BlockBasedTableIterator::Next() { if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { return; } assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); CheckOutOfBound(); } bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { result->key = key(); result->bound_check_result = UpperBoundCheckResult(); result->value_prepared = !is_at_first_key_from_index_; } return is_valid; } void BlockBasedTableIterator::Prev() { if (is_at_first_key_from_index_) { is_at_first_key_from_index_ = false; index_iter_->Prev(); if (!index_iter_->Valid()) { return; } InitDataBlock(); block_iter_.SeekToLast(); } else { assert(block_iter_points_to_real_block_); block_iter_.Prev(); } FindKeyBackward(); } void BlockBasedTableIterator::InitDataBlock() { BlockHandle data_block_handle = index_iter_->value().handle; if (!block_iter_points_to_real_block_ || data_block_handle.offset() != prev_block_offset_ || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { ResetDataIter(); } auto* rep = table_->get_rep(); bool is_for_compaction = lookup_context_.caller == TableReaderCaller::kCompaction; // Prefetch additional data for range scans (iterators). // Implicit auto readahead: // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. // Explicit user requested readahead: // Enabled from the very first IO when ReadOptions.readahead_size is set. block_prefetcher_.PrefetchIfNeeded( rep, data_block_handle, read_options_.readahead_size, is_for_compaction, read_options_.async_io, read_options_.rate_limiter_priority); Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, BlockType::kData, /*get_context=*/nullptr, &lookup_context_, s, block_prefetcher_.prefetch_buffer(), /*for_compaction=*/is_for_compaction); block_iter_points_to_real_block_ = true; CheckDataBlockWithinUpperBound(); } } bool BlockBasedTableIterator::MaterializeCurrentBlock() { assert(is_at_first_key_from_index_); assert(!block_iter_points_to_real_block_); assert(index_iter_->Valid()); is_at_first_key_from_index_ = false; InitDataBlock(); assert(block_iter_points_to_real_block_); if (!block_iter_.status().ok()) { return false; } block_iter_.SeekToFirst(); if (!block_iter_.Valid() || icomp_.Compare(block_iter_.key(), index_iter_->value().first_internal_key) != 0) { block_iter_.Invalidate(Status::Corruption( "first key in index doesn't match first key in block")); return false; } return true; } void BlockBasedTableIterator::FindKeyForward() { // This method's code is kept short to make it likely to be inlined. assert(!is_out_of_bound_); assert(block_iter_points_to_real_block_); if (!block_iter_.Valid()) { // This is the only call site of FindBlockForward(), but it's extracted into // a separate method to keep FindKeyForward() short and likely to be // inlined. When transitioning to a different block, we call // FindBlockForward(), which is much longer and is probably not inlined. FindBlockForward(); } else { // This is the fast path that avoids a function call. } } void BlockBasedTableIterator::FindBlockForward() { // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". do { if (!block_iter_.status().ok()) { return; } // Whether next data block is out of upper bound, if there is one. const bool next_block_is_out_of_bound = read_options_.iterate_upper_bound != nullptr && block_iter_points_to_real_block_ && block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock; assert(!next_block_is_out_of_bound || user_comparator_.CompareWithoutTimestamp( *read_options_.iterate_upper_bound, /*a_has_ts=*/false, index_iter_->user_key(), /*b_has_ts=*/true) <= 0); ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { // The next block is out of bound. No need to read it. TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); // We need to make sure this is not the last data block before setting // is_out_of_bound_, since the index key for the last data block can be // larger than smallest key of the next file on the same level. if (index_iter_->Valid()) { is_out_of_bound_ = true; } return; } if (!index_iter_->Valid()) { return; } IndexValue v = index_iter_->value(); if (!v.first_internal_key.empty() && allow_unprepared_value_) { // Index contains the first key of the block. Defer reading the block. is_at_first_key_from_index_ = true; return; } InitDataBlock(); block_iter_.SeekToFirst(); } while (!block_iter_.Valid()); } void BlockBasedTableIterator::FindKeyBackward() { while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { return; } ResetDataIter(); index_iter_->Prev(); if (index_iter_->Valid()) { InitDataBlock(); block_iter_.SeekToLast(); } else { return; } } // We could have check lower bound here too, but we opt not to do it for // code simplicity. } void BlockBasedTableIterator::CheckOutOfBound() { if (read_options_.iterate_upper_bound != nullptr && block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock && Valid()) { is_out_of_bound_ = user_comparator_.CompareWithoutTimestamp( *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(), /*b_has_ts=*/true) <= 0; } } void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { if (read_options_.iterate_upper_bound != nullptr && block_iter_points_to_real_block_) { block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp( *read_options_.iterate_upper_bound, /*a_has_ts=*/false, index_iter_->user_key(), /*b_has_ts=*/true) > 0) ? BlockUpperBound::kUpperBoundBeyondCurBlock : BlockUpperBound::kUpperBoundInCurBlock; } } } // namespace ROCKSDB_NAMESPACE