From b4d72094280e1e0220ec321779902aba6662db25 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Mon, 24 Jun 2019 20:50:35 -0700 Subject: [PATCH] Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a --- HISTORY.md | 1 + db/db_iterator_test.cc | 142 ++++ include/rocksdb/table.h | 24 +- java/rocksjni/portal.h | 7 +- options/options_helper.cc | 4 +- table/block_based/block.cc | 77 ++- table/block_based/block.h | 103 +-- table/block_based/block_based_table_reader.cc | 653 ++++++++++-------- table/block_based/block_based_table_reader.h | 107 ++- table/block_based/block_test.cc | 253 +++---- .../block_based/data_block_hash_index_test.cc | 8 +- table/block_based/index_builder.cc | 10 +- table/block_based/index_builder.h | 48 +- table/block_based/partitioned_filter_block.cc | 25 +- table/block_fetcher.cc | 1 - table/format.cc | 52 ++ table/format.h | 29 + table/internal_iterator.h | 7 +- table/iterator.cc | 8 +- table/meta_blocks.cc | 17 +- table/table_test.cc | 319 ++++++++- table/two_level_iterator.cc | 26 +- table/two_level_iterator.h | 7 +- test_util/testutil.cc | 9 +- util/coding.h | 13 + 25 files changed, 1362 insertions(+), 588 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 975ece580..07eb27597 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -41,6 +41,7 @@ * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior. * When reading from option file/string/map, customized envs can be filled according to object registry. * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator. +* Add index type BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It significantly reduces read amplification in some setups, especially for iterator seeks. It's not fully implemented yet: IO errors are not handled right. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index e2b9f503f..d514e7683 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -1049,6 +1049,148 @@ TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { ASSERT_EQ(upper_bound_hits, 1); } } + +// Enable kBinarySearchWithFirstKey, do some iterator operations and check that +// they don't do unnecessary block reads. +TEST_P(DBIteratorTest, IndexWithFirstKey) { + for (int tailing = 0; tailing < 2; ++tailing) { + SCOPED_TRACE("tailing = " + std::to_string(tailing)); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a1", "x1")); + ASSERT_OK(Merge("b1", "y1")); + ASSERT_OK(Merge("c0", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a2", "x2")); + ASSERT_OK(Merge("b2", "y2")); + ASSERT_OK(Merge("c0", "z2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a3", "x3")); + ASSERT_OK(Merge("b3", "y3")); + ASSERT_OK(Merge("c3", "z3")); + ASSERT_OK(Flush()); + + // Block cache is not important for this test. + // We use BLOCK_CACHE_DATA_* counters just because they're the most readily + // available way of counting block accesses. + + ReadOptions ropt; + ropt.tailing = tailing; + std::unique_ptr iter(NewIterator(ropt)); + + iter->Seek("b10"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b3", iter->key().ToString()); + EXPECT_EQ("y3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Seek("c0"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c0", iter->key().ToString()); + EXPECT_EQ("z1,z2", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c3", iter->key().ToString()); + EXPECT_EQ("z3", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter.reset(); + + // Enable iterate_upper_bound and check that iterator is not trying to read + // blocks that are fully above upper bound. + std::string ub = "b3"; + Slice ub_slice(ub); + ropt.iterate_upper_bound = &ub_slice; + iter.reset(NewIterator(ropt)); + + iter->Seek("b2"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + } +} + +TEST_P(DBIteratorTest, IndexWithFirstKeyGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a", "x1")); + ASSERT_OK(Merge("c", "y1")); + ASSERT_OK(Merge("e", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("c", "y2")); + ASSERT_OK(Merge("e", "z2")); + ASSERT_OK(Flush()); + + // Get() between blocks shouldn't read any blocks. + ASSERT_EQ("NOT_FOUND", Get("b")); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Get() of an existing key shouldn't read any unnecessary blocks when there's + // only one key per block. + + ASSERT_EQ("y1,y2", Get("c")); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ("x1", Get("a")); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(std::vector({"NOT_FOUND", "z1,z2"}), + MultiGet({"b", "e"})); +} + // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 88fcc78ed..929239100 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -93,14 +93,32 @@ struct BlockBasedTableOptions { enum IndexType : char { // A space efficient index block that is optimized for // binary-search-based index. - kBinarySearch, + kBinarySearch = 0x00, // The hash index, if enabled, will do the hash lookup when // `Options.prefix_extractor` is provided. - kHashSearch, + kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. - kTwoLevelIndexSearch, + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. + // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. + // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + // + // IO errors are not handled correctly in this mode right now: if an error + // happens when lazily reading a block in value(), value() returns empty + // slice, and you need to call Valid()/status() afterwards. + // TODO(kolmike): Fix it. + kBinarySearchWithFirstKey = 0x03, }; IndexType index_type = kBinarySearch; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index d1585fcfa..667af809b 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5902,8 +5902,10 @@ class IndexTypeJni { return 0x0; case rocksdb::BlockBasedTableOptions::IndexType::kHashSearch: return 0x1; - case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: + case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: return 0x2; + case rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey: + return 0x3; default: return 0x7F; // undefined } @@ -5920,6 +5922,9 @@ class IndexTypeJni { return rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; case 0x2: return rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + case 0x3: + return rocksdb::BlockBasedTableOptions::IndexType:: + kBinarySearchWithFirstKey; default: // undefined/default return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; diff --git a/options/options_helper.cc b/options/options_helper.cc index 71a7f9b2f..47aba7ad0 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -1671,7 +1671,9 @@ std::unordered_map {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; std::unordered_map OptionsHelper::block_base_table_data_block_index_type_string_map = { diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 6c7e46d59..8fa3ff9b9 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -608,8 +608,7 @@ bool IndexBlockIter::ParseNextIndexKey() { } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed - if (value_delta_encoded_) { - assert(value_length == 0); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { DecodeCurrentValue(shared); } return true; @@ -627,24 +626,32 @@ bool IndexBlockIter::ParseNextIndexKey() { // Otherwise the format is delta-size = block handle size - size of last block // handle. void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { - assert(value_delta_encoded_); - const char* limit = data_ + restarts_; - if (shared == 0) { - uint64_t o, s; - const char* newp = GetVarint64Ptr(value_.data(), limit, &o); - assert(newp); - newp = GetVarint64Ptr(newp, limit, &s); - assert(newp); - decoded_value_ = BlockHandle(o, s); - value_ = Slice(value_.data(), newp - value_.data()); - } else { - uint64_t next_value_base = - decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; - int64_t delta; - const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); - decoded_value_ = - BlockHandle(next_value_base, decoded_value_.size() + delta); - value_ = Slice(value_.data(), newp - value_.data()); + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. + + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); } } @@ -875,14 +882,10 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, } } -template <> -DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, - DataBlockIter* iter, Statistics* stats, - bool /*total_order_seek*/, - bool /*key_includes_seq*/, - bool /*value_is_full*/, - bool block_contents_pinned, - BlockPrefixIndex* /*prefix_index*/) { +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -913,13 +916,11 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, return ret_iter; } -template <> -IndexBlockIter* Block::NewIterator(const Comparator* cmp, - const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -938,9 +939,9 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, value_is_full, - block_contents_pinned, - nullptr /* data_block_hash_index */); + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); } return ret_iter; diff --git a/table/block_based/block.h b/table/block_based/block.h index 2bb577d33..3af92b6a2 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -165,17 +165,7 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // - // key_includes_seq, default true, means that the keys are in internal key - // format. - // value_is_full, default true, means that no delta encoding is - // applied to values. - // - // NewIterator - // Same as above but also updates read_amp_bitmap_ if it is not nullptr. - // - // NewIterator - // If `prefix_index` is not nullptr this block will do hash lookup for the key - // prefix. If total_order_seek is true, prefix_index_ is ignored. + // Updates read_amp_bitmap_ if it is not nullptr. // // If `block_contents_pinned` is true, the caller will guarantee that when // the cleanup functions are transferred from the iterator to other @@ -188,13 +178,32 @@ class Block { // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. - template - TBlockIter* NewIterator( - const Comparator* comparator, const Comparator* user_comparator, - TBlockIter* iter = nullptr, Statistics* stats = nullptr, - bool total_order_seek = true, bool key_includes_seq = true, - bool value_is_full = true, bool block_contents_pinned = false, - BlockPrefixIndex* prefix_index = nullptr); + + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -471,7 +480,7 @@ class DataBlockIter final : public BlockIter { bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -483,23 +492,12 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. - IndexBlockIter(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) - : IndexBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned, - value_is_full, nullptr /* data_block_hash_index */); - } - void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned, - DataBlockHashIndex* /*data_block_hash_index*/) { + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { InitializeBase(key_includes_seq ? comparator : user_comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); @@ -507,6 +505,12 @@ class IndexBlockIter final : public BlockIter { key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } } Slice user_key() const override { @@ -516,16 +520,17 @@ class IndexBlockIter final : public BlockIter { return key(); } - virtual BlockHandle value() const override { + virtual IndexValue value() const override { assert(Valid()); - if (value_delta_encoded_) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { return decoded_value_; } else { - BlockHandle handle; + IndexValue entry; Slice v = value_; - Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); assert(decode_s.ok()); - return handle; + return entry; } } @@ -552,10 +557,15 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + private: // Key is in InternalKey format bool key_includes_seq_; bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; // Whether the value is delta encoded. In that case the value is assumed to be // BlockHandle. The first value in each restart interval is the full encoded @@ -563,7 +573,22 @@ class IndexBlockIter final : public BlockIter { // offset of delta encoded BlockHandles is computed by adding the size of // previous delta encoded values in the same restart interval to the offset of // the first value in that restart interval. - BlockHandle decoded_value_; + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. + IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr global_seqno_state_; bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 5b2f51500..5344625ec 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -191,24 +191,22 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { return &table_->get_rep()->internal_comparator; } - bool index_key_includes_seq() const { + bool index_has_first_key() const { assert(table_ != nullptr); assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } - const TableProperties* const properties = - table_->get_rep()->table_properties.get(); - - return properties == nullptr || !properties->index_key_is_user_key; + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; } bool index_value_is_full() const { assert(table_ != nullptr); assert(table_->get_rep() != nullptr); - - const TableProperties* const properties = - table_->get_rep()->table_properties.get(); - - return properties == nullptr || !properties->index_value_is_delta_encoded; + return table_->get_rep()->index_value_is_full; } Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, @@ -305,7 +303,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -319,10 +317,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } - InternalIteratorBase* it = nullptr; + InternalIteratorBase* it = nullptr; Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index @@ -330,26 +328,24 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. it = NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState( - table(), &partition_map_, index_key_includes_seq(), - index_value_is_full()), - index_block.GetValue()->NewIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq(), - index_value_is_full())); + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); } else { ReadOptions ro; ro.fill_cache = read_options.fill_cache; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - it = new BlockBasedTableIterator( + it = new BlockBasedTableIterator( table(), ro, *internal_comparator(), - index_block.GetValue()->NewIterator( + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq(), - index_value_is_full()), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - index_key_includes_seq(), index_value_is_full(), lookup_context ? lookup_context->caller : TableReaderCaller::kUncategorized); } @@ -368,7 +364,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; - auto rep = table()->rep_; + const BlockBasedTable::Rep* rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; @@ -386,9 +382,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - index_block.GetValue()->NewIterator( + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), &biter, - kNullStats, true, index_key_includes_seq(), index_value_is_full()); + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -396,7 +393,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // Empty index. return; } - handle = biter.value(); + handle = biter.value().handle; uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -405,7 +402,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // Empty index. return; } - handle = biter.value(); + handle = biter.value().handle; uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; @@ -418,7 +415,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { biter.SeekToFirst(); auto ro = ReadOptions(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; CachableEntry block; // TODO: Support counter batch update for partitioned index and // filter blocks @@ -493,7 +490,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { return Status::OK(); } - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -507,15 +504,16 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - auto it = index_block.GetValue()->NewIterator( + auto it = index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, true, index_key_includes_seq(), index_value_is_full()); + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); assert(it != nullptr); index_block.TransferTo(it); @@ -552,7 +550,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { assert(index_reader != nullptr); assert(!pin || prefetch); - auto rep = table->get_rep(); + const BlockBasedTable::Rep* rep = table->get_rep(); assert(rep != nullptr); CachableEntry index_block; @@ -636,7 +634,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { return Status::OK(); } - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -650,7 +648,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } Statistics* kNullStats = nullptr; @@ -658,11 +656,11 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { read_options.total_order_seek || disable_prefix_seek; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - auto it = index_block.GetValue()->NewIterator( + auto it = index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, total_order_seek, index_key_includes_seq(), - index_value_is_full(), false /* block_contents_pinned */, - prefix_index_.get()); + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); assert(it != nullptr); index_block.TransferTo(it); @@ -1083,7 +1081,6 @@ Status BlockBasedTable::Open( immortal_table); rep->file = std::move(file); rep->footer = footer; - rep->index_type = table_options.index_type; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. @@ -1113,6 +1110,8 @@ Status BlockBasedTable::Open( return s; } + // Populates table_properties and some fields that depend on it, + // such as index_type. s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(), largest_seqno); if (!s.ok()) { @@ -1317,6 +1316,24 @@ Status BlockBasedTable::ReadPropertiesBlock( BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.info_log); + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { @@ -1344,7 +1361,6 @@ Status BlockBasedTable::ReadRangeDelBlock( std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, /*input_iter=*/nullptr, BlockType::kRangeDeletion, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); assert(iter != nullptr); s = iter->status(); @@ -1436,7 +1452,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( &rep_->compression_dict_handle); } - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); + BlockBasedTableOptions::IndexType index_type = rep_->index_type; const bool use_cache = table_options.cache_index_and_filter_blocks; @@ -1602,8 +1618,8 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, *meta_block = std::move(meta); // meta block uses bytewise comparator. - iter->reset(meta_block->get()->NewIterator( - BytewiseComparator(), BytewiseComparator())); + iter->reset(meta_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return Status::OK(); } @@ -1846,10 +1862,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter( rep->prefix_filtering ? prefix_extractor : nullptr, rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->index_key_includes_seq, rep_->index_value_is_full); } case Rep::FilterType::kBlockFilter: @@ -2055,7 +2068,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIteratorBase* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { @@ -2076,8 +2089,8 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - BlockType block_type, bool key_includes_seq, bool index_key_is_full, - GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -2106,7 +2119,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } assert(block.GetValue() != nullptr); - constexpr bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator // is destroyed as long as cleanup functions are moved to another object, // when: @@ -2117,10 +2130,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool block_contents_pinned = block.IsCached() || (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = block.GetValue()->NewIterator( - &rep_->internal_comparator, rep_->internal_comparator.user_comparator(), - iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); if (!block.IsCached()) { if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { @@ -2162,6 +2173,26 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( return iter; } +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, DataBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewDataIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, block_contents_pinned); +} + +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, IndexBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, /* total_order_seek */ true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full, block_contents_pinned); +} + Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, @@ -2360,14 +2391,10 @@ Status BlockBasedTable::RetrieveBlock( BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, - std::unordered_map>* block_map, - bool index_key_includes_seq, bool index_key_is_full) - : table_(table), - block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq), - index_key_is_full_(index_key_is_full) {} - -InternalIteratorBase* + std::unordered_map>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( const BlockHandle& handle) { // Return a block iterator on the index partition @@ -2375,15 +2402,16 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( // This is a possible scenario since block cache might not have had space // for the partition if (block != block_map_->end()) { - auto rep = table_->get_rep(); + const Rep* rep = table_->get_rep(); assert(rep); Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - return block->second.GetValue()->NewIterator( + return block->second.GetValue()->NewIndexIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); } // Create an empty iterator return new IndexBlockIter(); @@ -2459,10 +2487,10 @@ bool BlockBasedTable::PrefixMayMatch( // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - std::unique_ptr> iiter(NewIndexIterator( + std::unique_ptr> iiter(NewIndexIterator( no_io_read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, - /*need_upper_bound_check=*/nullptr, lookup_context)); + /*get_context=*/nullptr, lookup_context)); iiter->Seek(internal_prefix); if (!iiter->Valid()) { @@ -2471,10 +2499,8 @@ bool BlockBasedTable::PrefixMayMatch( // and we're not really sure that we're past the end // of the file may_match = iiter->status().IsIncomplete(); - } else if ((rep_->table_properties && - rep_->table_properties->index_key_is_user_key - ? iiter->key() - : ExtractUserKey(iiter->key())) + } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key()) + : iiter->key()) .starts_with(ExtractUserKey(internal_prefix))) { // we need to check for this subtle case because our only // guarantee is that "the key is a string >= last key in that data @@ -2493,7 +2519,7 @@ bool BlockBasedTable::PrefixMayMatch( // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. - BlockHandle handle = iiter->value(); + BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( prefix, prefix_extractor, handle.offset(), /*no_io=*/false, /*const_key_ptr=*/nullptr, lookup_context); @@ -2514,8 +2540,20 @@ bool BlockBasedTable::PrefixMayMatch( template void BlockBasedTableIterator::Seek(const Slice& target) { + SeekImpl(&target); +} + +template +void BlockBasedTableIterator::SeekToFirst() { + SeekImpl(nullptr); +} + +template +void BlockBasedTableIterator::SeekImpl( + const Slice* target) { is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target)) { ResetDataIter(); return; } @@ -2523,47 +2561,82 @@ void BlockBasedTableIterator::Seek(const Slice& target) { bool need_seek_index = true; if (block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. - prev_index_value_ = index_iter_->value(); - // We can avoid an index seek if: - // 1. The new seek key is larger than the current key - // 2. The new seek key is within the upper bound of the block - // Since we don't necessarily know the internal key for either - // the current key or the upper bound, we check user keys and - // exclude the equality case. Considering internal keys can - // improve for the boundary cases, but it would complicate the - // code. - if (user_comparator_.Compare(ExtractUserKey(target), - block_iter_.user_key()) > 0 && - user_comparator_.Compare(ExtractUserKey(target), - index_iter_->user_key()) < 0) { - need_seek_index = false; + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } } } if (need_seek_index) { - index_iter_->Seek(target); + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + if (!index_iter_->Valid()) { ResetDataIter(); return; } - InitDataBlock(); } - block_iter_.Seek(target); + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } - FindKeyForward(); CheckOutOfBound(); - assert( - !block_iter_.Valid() || - (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || - (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), - block_iter_.key()) <= 0)); + + if (target) { + assert( + !Valid() || + ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? (user_comparator_.Compare(ExtractUserKey(*target), key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); + } } template void BlockBasedTableIterator::SeekForPrev( const Slice& target) { is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; @@ -2587,10 +2660,14 @@ void BlockBasedTableIterator::SeekForPrev( index_iter_->Seek(target); if (!index_iter_->Valid()) { + if (!index_iter_->status().ok()) { + ResetDataIter(); + return; + } + index_iter_->SeekToLast(); if (!index_iter_->Valid()) { ResetDataIter(); - block_iter_points_to_real_block_ = false; return; } } @@ -2604,24 +2681,10 @@ void BlockBasedTableIterator::SeekForPrev( icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToFirst(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToFirst(); - FindKeyForward(); - CheckOutOfBound(); -} - template void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); if (!index_iter_->Valid()) { @@ -2635,9 +2698,13 @@ void BlockBasedTableIterator::SeekToLast() { template void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); + CheckOutOfBound(); } template @@ -2653,8 +2720,21 @@ bool BlockBasedTableIterator::NextAndGetResult( template void BlockBasedTableIterator::Prev() { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + FindKeyBackward(); } @@ -2667,9 +2747,9 @@ const size_t template void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value(); + BlockHandle data_block_handle = index_iter_->value().handle; if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_index_value_.offset() || + data_block_handle.offset() != prev_block_offset_ || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { @@ -2728,7 +2808,6 @@ void BlockBasedTableIterator::InitDataBlock() { Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, - key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), /*for_compaction=*/lookup_context_.caller == TableReaderCaller::kCompaction); @@ -2736,6 +2815,47 @@ void BlockBasedTableIterator::InitDataBlock() { } } +template +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + template void BlockBasedTableIterator::FindBlockForward() { // TODO the while loop inherits from two-level-iterator. We don't know @@ -2766,22 +2886,23 @@ void BlockBasedTableIterator::FindBlockForward() { return; } - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToFirst(); - } else { + if (!index_iter_->Valid()) { return; } - } while (!block_iter_.Valid()); -} -template -void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); + IndexValue v = index_iter_->value(); - if (!block_iter_.Valid()) { - FindBlockForward(); - } + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); } template @@ -2808,8 +2929,7 @@ void BlockBasedTableIterator::FindKeyBackward() { template void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid()) { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { is_out_of_bound_ = user_comparator_.Compare( *read_options_.iterate_upper_bound, user_key()) <= 0; } @@ -2832,8 +2952,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, - compaction_readahead_size); + caller, compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2845,7 +2964,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); + caller, compaction_readahead_size); } } @@ -2961,7 +3080,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, get_context, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2971,12 +3090,12 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - BlockHandle handle = iiter->value(); + IndexValue v = iiter->value(); bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), - prefix_extractor, handle.offset(), no_io, + prefix_extractor, v.handle.offset(), no_io, /*const_ikey_ptr=*/nullptr, &lookup_context); if (not_exist_in_filter) { @@ -2986,78 +3105,85 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; - } else { - BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet}; - bool does_referenced_key_exist = false; - DataBlockIter biter; - uint64_t referenced_data_size = 0; - NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, - /*key_includes_seq=*/true, - /*index_key_is_full=*/true, get_context, &lookup_data_block_context, - /*s=*/Status(), /*prefetch_buffer*/ nullptr); + } - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } - bool may_exist = biter.SeekForGet(key); - // If user-specified timestamp is supported, we cannot end the search - // just because hash index lookup indicates the key+ts does not exist. - if (!may_exist && ts_sz == 0) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. - done = true; - } else { - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + NewDataBlockIterator( + read_options, v.handle, &biter, BlockType::kData, + get_context, &lookup_data_block_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); - done = true; - break; - } + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. + if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; } - s = biter.status(); - } - // Write the block cache access record. - if (block_cache_tracer_) { - // Avoid making copy of block_key, cf_name, and referenced_key when - // constructing the access record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", lookup_data_block_context.block_type, - lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_data_block_context.caller, - lookup_data_block_context.is_cache_hit, - lookup_data_block_context.no_insert, - /*referenced_key=*/"", referenced_data_size, - lookup_data_block_context.num_keys_in_block, - does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } if (done) { @@ -3115,7 +3241,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, sst_file_range.begin()->get_context, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -3130,21 +3256,30 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + bool reusing_block = true; uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); - if (iiter->value().offset() != offset) { - offset = iiter->value().offset(); + if (iiter->value().handle.offset() != offset) { + offset = iiter->value().handle.offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, - /*key_includes_seq=*/false, - /*index_key_is_full=*/true, get_context, - &lookup_data_block_context, Status(), nullptr); + read_options, v.handle, &biter, BlockType::kData, + get_context, &lookup_data_block_context, Status(), nullptr); reusing_block = false; } + if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { // couldn't get block from block_cache @@ -3238,7 +3373,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, Status BlockBasedTable::Prefetch(const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; - auto user_comparator = comparator.user_comparator(); + UserComparatorWrapper user_comparator(comparator.user_comparator()); // pre-condition if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); @@ -3248,10 +3383,9 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); + iiter_unique_ptr = std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -3264,13 +3398,12 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - BlockHandle block_handle = iiter->value(); - const bool is_user_key = rep_->table_properties && - rep_->table_properties->index_key_is_user_key > 0; + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; if (end && ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || (is_user_key && - user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { if (prefetching_boundary_page) { break; } @@ -3285,7 +3418,6 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, NewDataBlockIterator( ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, /*get_context=*/nullptr, &lookup_context, Status(), /*prefetch_buffer=*/nullptr); @@ -3315,13 +3447,12 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { // Check Data blocks IndexBlockIter iiter_on_stack; BlockCacheLookupContext context{caller}; - InternalIteratorBase* iiter = NewIndexIterator( + InternalIteratorBase* iiter = NewIndexIterator( ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); + iiter_unique_ptr = std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -3332,14 +3463,14 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { } Status BlockBasedTable::VerifyChecksumInBlocks( - InternalIteratorBase* index_iter) { + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle = index_iter->value(); + BlockHandle handle = index_iter->value().handle; BlockContents contents; BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, @@ -3445,31 +3576,13 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr> iiter(NewIndexIterator( + std::unique_ptr> iiter(NewIndexIterator( options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); iiter->Seek(key); assert(iiter->Valid()); - return TEST_BlockInCache(iiter->value()); -} - -BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { - // Some old version of block-based tables don't have index type present in - // table properties. If that's the case we can safely use the kBinarySearch. - BlockBasedTableOptions::IndexType index_type_on_file = - BlockBasedTableOptions::kBinarySearch; - if (rep_->table_properties) { - auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - // update index_type with the true type - rep_->index_type = index_type_on_file; - } - } - return index_type_on_file; + return TEST_BlockInCache(iiter->value().handle); } // REQUIRES: The following fields of rep_ should have already been populated: @@ -3483,21 +3596,20 @@ Status BlockBasedTable::CreateIndexReader( InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader, BlockCacheLookupContext* lookup_context) { - auto index_type_on_file = rep_->index_type; - // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. // If prefix_extractor does not match prefix_extractor_name from table // properties, turn off Hash Index by setting total_order_seek to true - switch (index_type_on_file) { + switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, index_reader, lookup_context); } - case BlockBasedTableOptions::kBinarySearch: { + case BlockBasedTableOptions::kBinarySearch: + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, index_reader, lookup_context); @@ -3527,7 +3639,7 @@ Status BlockBasedTable::CreateIndexReader( } default: { std::string error_message = - "Unrecognized index type: " + ToString(index_type_on_file); + "Unrecognized index type: " + ToString(rep_->index_type); return Status::InvalidArgument(error_message.c_str()); } } @@ -3536,7 +3648,7 @@ Status BlockBasedTable::CreateIndexReader( uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { BlockCacheLookupContext context(caller); - std::unique_ptr> index_iter( + std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/&context)); @@ -3544,7 +3656,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle = index_iter->value(); + BlockHandle handle = index_iter->value().handle; result = handle.offset(); } else { // key is past the last key in the file. If table_properties is not @@ -3574,7 +3686,7 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3595,9 +3707,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr, - /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr)); s = datablock_iter->status(); @@ -3806,7 +3917,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3827,8 +3938,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { Slice key = blockhandles_iter->key(); Slice user_key; InternalKey ikey; - if (rep_->table_properties && - rep_->table_properties->index_key_is_user_key != 0) { + if (!rep_->index_key_includes_seq) { user_key = key; } else { ikey.DecodeFrom(key); @@ -3838,7 +3948,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append(" HEX "); out_file->Append(user_key.ToString(true).c_str()); out_file->Append(": "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); out_file->Append("\n"); std::string str_key = user_key.ToString(); @@ -3857,7 +3969,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3879,7 +3991,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { break; } - BlockHandle bh = blockhandles_iter->value(); + BlockHandle bh = blockhandles_iter->value().handle; uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); @@ -3888,15 +4000,14 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { out_file->Append("Data Block # "); out_file->Append(rocksdb::ToString(block_id)); out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); out_file->Append("\n"); out_file->Append("--------------------------------------\n"); std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr, - /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr)); s = datablock_iter->status(); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index b03e67128..9300fb36a 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -43,7 +43,6 @@ namespace rocksdb { -class BlockHandle; class Cache; class FilterBlockReader; class BlockBasedFilterBlockReader; @@ -198,7 +197,7 @@ class BlockBasedTable : public TableReader { // wraps the passed iter. In the latter case the return value points // to a different object then iter, and the callee has the ownership of the // returned object. - virtual InternalIteratorBase* NewIterator( + virtual InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) = 0; @@ -230,8 +229,7 @@ class BlockBasedTable : public TableReader { template TBlockIter* NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& block_handle, - TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; @@ -259,6 +257,12 @@ class BlockBasedTable : public TableReader { BlockType block_type, GetContext* get_context) const; + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); + // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and // then (3) file. If found, inserts into the cache(s) that were searched @@ -312,7 +316,7 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIteratorBase* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -355,9 +359,6 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - // Figure the index type, update it in rep_, and also return it. - BlockBasedTableOptions::IndexType UpdateIndexType(); - // Create a index reader based on the index type stored in the table. // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter @@ -410,7 +411,7 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); - Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. virtual FilterBlockReader* ReadFilter( @@ -446,17 +447,14 @@ class BlockBasedTable::PartitionedIndexIteratorState public: PartitionedIndexIteratorState( const BlockBasedTable* table, - std::unordered_map>* block_map, - const bool index_key_includes_seq, const bool index_key_is_full); - InternalIteratorBase* NewSecondaryIterator( + std::unordered_map>* block_map); + InternalIteratorBase* NewSecondaryIterator( const BlockHandle& index_value) override; private: // Don't own table_ const BlockBasedTable* table_; std::unordered_map>* block_map_; - bool index_key_includes_seq_; - bool index_key_is_full_; }; // Stores all the properties associated with a BlockBasedTable. @@ -564,12 +562,16 @@ struct BlockBasedTable::Rep { // still work, just not as quickly. bool blocks_definitely_zstd_compressed = false; + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + bool closed = false; const bool immortal_table; SequenceNumber get_global_seqno(BlockType block_type) const { return (block_type == BlockType::kFilter || - block_type == BlockType::kIndex || block_type == BlockType::kCompressionDictionary) ? kDisableGlobalSequenceNumber : global_seqno; @@ -602,11 +604,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, + InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, - BlockType block_type, bool key_includes_seq, - bool index_key_is_full, TableReaderCaller caller, + BlockType block_type, TableReaderCaller caller, size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), @@ -620,8 +621,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), block_type_(block_type), - key_includes_seq_(key_includes_seq), - index_key_is_full_(index_key_is_full), lookup_context_(caller), compaction_readahead_size_(compaction_readahead_size) {} @@ -635,19 +634,38 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { - return !is_out_of_bound_ && block_iter_points_to_real_block_ && - block_iter_.Valid(); + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); } Slice key() const override { assert(Valid()); - return block_iter_.key(); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } } Slice user_key() const override { assert(Valid()); - return block_iter_.user_key(); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } } TValue value() const override { assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. + return TValue(); + } + return block_iter_.value(); } Status status() const override { @@ -667,10 +685,17 @@ class BlockBasedTableIterator : public InternalIteratorBase { pinned_iters_mgr_ = pinned_iters_mgr; } bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); } bool IsValuePinned() const override { + // Load current block if not loaded. + if (is_at_first_key_from_index_) { + const_cast(this)->MaterializeCurrentBlock(); + } // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; @@ -704,35 +729,33 @@ class BlockBasedTableIterator : public InternalIteratorBase { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. - prev_index_value_ = index_iter_->value(); + prev_block_offset_ = index_iter_->value().handle.offset(); } } - void InitDataBlock(); - inline void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - private: const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to value() will trigger loading the block. + bool is_at_first_key_from_index_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; BlockType block_type_; - // If the keys in the blocks over which we iterate include 8 byte sequence - bool key_includes_seq_; - bool index_key_is_full_; - BlockHandle prev_index_value_; + uint64_t prev_block_offset_; BlockCacheLookupContext lookup_context_; // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. @@ -748,6 +771,16 @@ class BlockBasedTableIterator : public InternalIteratorBase { size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; std::unique_ptr prefetch_buffer_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); }; } // namespace rocksdb diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 2dab4627c..e0ca24bf4 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -69,37 +69,12 @@ void GenerateRandomKVs(std::vector *keys, } } -// Same as GenerateRandomKVs but the values are BlockHandle -void GenerateRandomKBHs(std::vector *keys, - std::vector *values, const int from, - const int len, const int step = 1, - const int padding_size = 0, - const int keys_share_prefix = 1) { - Random rnd(302); - uint64_t offset = 0; - - // generate different prefix - for (int i = from; i < from + len; i += step) { - // generate keys that shares the prefix - for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); - - uint64_t size = rnd.Uniform(1024 * 16); - BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; - values->emplace_back(handle); - } - } -} - class BlockTest : public testing::Test {}; // block test TEST_F(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -123,7 +98,7 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, options.comparator); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -136,8 +111,7 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = - reader.NewIterator(options.comparator, options.comparator); + iter = reader.NewDataIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -152,83 +126,6 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } -TEST_F(BlockTest, ValueDeltaEncodingTest) { - Random rnd(301); - Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); - - std::vector keys; - std::vector values; - const bool kUseDeltaEncoding = true; - const bool kUseValueDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); - int num_records = 100; - - GenerateRandomKBHs(&keys, &values, 0, num_records); - // add a bunch of records to a block - BlockHandle last_encoded_handle; - for (int i = 0; i < num_records; i++) { - auto block_handle = values[i]; - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle.size()); - last_encoded_handle = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); - } - - // read serialized contents of the block - Slice rawblock = builder.Finish(); - - // create block reader - BlockContents contents; - contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); - - const bool kTotalOrderSeek = true; - const bool kIncludesSeq = true; - const bool kValueIsFull = !kUseValueDeltaEncoding; - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; - // read contents of block sequentially - int count = 0; - InternalIteratorBase *iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { - // read kv from block - Slice k = iter->key(); - BlockHandle handle = iter->value(); - - // compare with lookaside array - ASSERT_EQ(k.ToString().compare(keys[count]), 0); - - ASSERT_EQ(values[count].offset(), handle.offset()); - ASSERT_EQ(values[count].size(), handle.size()); - } - delete iter; - - // read block contents randomly - iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (int i = 0; i < num_records; i++) { - // find a random key in the lookaside array - int index = rnd.Uniform(num_records); - Slice k(keys[index]); - - // search in block for this key - iter->Seek(k); - ASSERT_TRUE(iter->Valid()); - BlockHandle handle = iter->value(); - ASSERT_EQ(values[index].offset(), handle.offset()); - ASSERT_EQ(values[index].size(), handle.size()); - } - delete iter; -} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, @@ -261,8 +158,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator(), - BytewiseComparator())); + reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -457,8 +353,6 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { TEST_F(BlockTest, BlockWithReadAmpBitmap) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -486,9 +380,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -519,9 +412,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -555,9 +447,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -602,6 +493,132 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) { ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32); } +class IndexBlockTest + : public testing::Test, + public testing::WithParamInterface> { + public: + IndexBlockTest() = default; + + bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } + bool includeFirstKey() const { return std::get<1>(GetParam()); } +}; + +// Similar to GenerateRandomKVs but for index block contents. +void GenerateRandomIndexEntries(std::vector *separators, + std::vector *block_handles, + std::vector *first_keys, + const int len) { + Random rnd(42); + + // For each of `len` blocks, we need to generate a first and last key. + // Let's generate n*2 random keys, sort them, group into consecutive pairs. + std::set keys; + while ((int)keys.size() < len * 2) { + // Keys need to be at least 8 bytes long to look like internal keys. + keys.insert(test::RandomKey(&rnd, 12)); + } + + uint64_t offset = 0; + for (auto it = keys.begin(); it != keys.end();) { + first_keys->emplace_back(*it++); + separators->emplace_back(*it++); + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + block_handles->emplace_back(handle); + } +} + +TEST_P(IndexBlockTest, IndexValueEncodingTest) { + Random rnd(301); + Options options = Options(); + + std::vector separators; + std::vector block_handles; + std::vector first_keys; + const bool kUseDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + int num_records = 100; + + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + num_records); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); + if (useValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), + &last_encoded_handle); + } + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !useValueDeltaEncoding(); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + InternalIteratorBase *iter = reader.NewIndexIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + iter->SeekToFirst(); + for (int index = 0; index < num_records; ++index) { + ASSERT_TRUE(iter->Valid()); + + Slice k = iter->key(); + IndexValue v = iter->value(); + + EXPECT_EQ(separators[index], k.ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + } // namespace rocksdb int main(int argc, char **argv) { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 5ec093871..484617d7e 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -391,7 +391,7 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); bool may_exist; // search in block for the key just inserted { @@ -474,8 +474,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -512,8 +511,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // C true false for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index c1ce541ae..f3a4b10e0 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -36,7 +36,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening); + table_opt.index_shortening, /* include_first_key */ false); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder( @@ -48,6 +48,12 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); } break; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + table_opt.index_shortening, /* include_first_key */ true); + } break; default: { assert(!"Do not recognize the index type "); } break; @@ -94,7 +100,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, - table_opt_.index_shortening); + table_opt_.index_shortening, /* include_first_key */ false); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index 6baa9891b..47348b31f 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -58,6 +58,7 @@ class IndexBuilder { // To allow further optimization, we provide `last_key_in_current_block` and // `first_key_in_next_block`, based on which the specific implementation can // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. // @last_key_in_current_block: this parameter maybe overridden with the value // "substitute key". // @first_key_in_next_block: it will be nullptr if the entry being added is @@ -123,7 +124,8 @@ class ShortenedIndexBuilder : public IndexBuilder { const InternalKeyComparator* comparator, const int index_block_restart_interval, const uint32_t format_version, const bool use_value_delta_encoding, - BlockBasedTableOptions::IndexShorteningMode shortening_mode) + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) : IndexBuilder(comparator), index_block_builder_(index_block_restart_interval, true /*use_delta_encoding*/, @@ -131,11 +133,19 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_without_seq_(index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { @@ -159,20 +169,27 @@ class ShortenedIndexBuilder : public IndexBuilder { } auto sep = Slice(*last_key_in_current_block); - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle_.size()); - assert(handle_delta_encoding.size() != 0); + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // BlockBuilder::Add() below won't use delta-encoded slice. + } last_encoded_handle_ = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_block_builder_.Add(sep, handle_encoding, - &handle_delta_encoding_slice); + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, - &handle_delta_encoding_slice); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); } + + current_block_first_internal_key_.clear(); } using IndexBuilder::Finish; @@ -200,9 +217,12 @@ class ShortenedIndexBuilder : public IndexBuilder { private: BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; bool seperator_is_key_plus_seq_; + const bool include_first_key_; BlockBasedTableOptions::IndexShorteningMode shortening_mode_; - BlockHandle last_encoded_handle_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -243,7 +263,7 @@ class HashIndexBuilder : public IndexBuilder { : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, format_version, use_value_delta_encoding, - shortening_mode), + shortening_mode, /* include_first_key */ false), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index cce674415..dcd985152 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -147,12 +147,13 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -221,15 +222,16 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return BlockHandle(0, 0); } assert(iter.Valid()); - BlockHandle fltr_blk_handle = iter.value(); + BlockHandle fltr_blk_handle = iter.value().handle; return fltr_blk_handle; } @@ -280,18 +282,19 @@ void PartitionedFilterBlockReader::CacheDependencies( BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); - BlockHandle handle = biter.value(); + BlockHandle handle = biter.value().handle; uint64_t prefetch_off = handle.offset(); // Read the last block's offset biter.SeekToLast(); - handle = biter.value(); + handle = biter.value().handle; uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; @@ -304,7 +307,7 @@ void PartitionedFilterBlockReader::CacheDependencies( // After prefetch, read the partitions one by one biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; const bool no_io = true; const bool is_a_filter_partition = true; auto filter = table_->GetFilter( diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 6fdddc37e..81e1345d9 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -15,7 +15,6 @@ #include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" #include "rocksdb/env.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" diff --git a/table/format.cc b/table/format.cc index 2046903a7..b3eb281a2 100644 --- a/table/format.cc +++ b/table/format.cc @@ -91,6 +91,58 @@ std::string BlockHandle::ToString(bool hex) const { const BlockHandle BlockHandle::kNullBlockHandle(0, 0); +void IndexValue::EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const { + if (previous_handle) { + assert(handle.offset() == previous_handle->offset() + + previous_handle->size() + kBlockTrailerSize); + PutVarsignedint64(dst, handle.size() - previous_handle->size()); + } else { + handle.EncodeTo(dst); + } + assert(dst->size() != 0); + + if (have_first_key) { + PutLengthPrefixedSlice(dst, first_internal_key); + } +} + +Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle) { + if (previous_handle) { + int64_t delta; + if (!GetVarsignedint64(input, &delta)) { + return Status::Corruption("bad delta-encoded index value"); + } + handle = BlockHandle( + previous_handle->offset() + previous_handle->size() + kBlockTrailerSize, + previous_handle->size() + delta); + } else { + Status s = handle.DecodeFrom(input); + if (!s.ok()) { + return s; + } + } + + if (!have_first_key) { + first_internal_key = Slice(); + } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { + return Status::Corruption("bad first key in block info"); + } + + return Status::OK(); +} + +std::string IndexValue::ToString(bool hex, bool have_first_key) const { + std::string s; + EncodeTo(&s, have_first_key, nullptr); + if (hex) { + return Slice(s).ToString(true); + } else { + return s; + } +} + namespace { inline bool IsLegacyFooterFormat(uint64_t magic_number) { return magic_number == kLegacyBlockBasedTableMagicNumber || diff --git a/table/format.h b/table/format.h index baad78070..539ca8880 100644 --- a/table/format.h +++ b/table/format.h @@ -76,6 +76,35 @@ class BlockHandle { static const BlockHandle kNullBlockHandle; }; +// Value in block-based table file index. +// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { #ifdef NDEBUG diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8f1cc9dd6..696e66135 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -90,8 +90,11 @@ class InternalIteratorBase : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it is out of the iterator - // upper bound + // True if the iterator is invalidated because it reached a key that is above + // the iterator upper bound. Used by LevelIterator to decide whether it should + // stop or move on to the next file. + // Important: if iterator reached the end of the file without encountering any + // keys above the upper bound, IsOutOfBound() must return false. virtual bool IsOutOfBound() { return false; } // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont diff --git a/table/iterator.cc b/table/iterator.cc index 97a0cef5e..f6c7f9cec 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -167,7 +167,7 @@ template InternalIteratorBase* NewErrorInternalIterator(const Status& status) { return new EmptyInternalIterator(status); } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status); template InternalIteratorBase* NewErrorInternalIterator( const Status& status); @@ -182,7 +182,7 @@ InternalIteratorBase* NewErrorInternalIterator(const Status& status, return new (mem) EmptyInternalIterator(status); } } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); @@ -191,7 +191,7 @@ template InternalIteratorBase* NewEmptyInternalIterator() { return new EmptyInternalIterator(Status::OK()); } -template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); template InternalIteratorBase* NewEmptyInternalIterator(); template @@ -203,7 +203,7 @@ InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { return new (mem) EmptyInternalIterator(Status::OK()); } } -template InternalIteratorBase* NewEmptyInternalIterator( +template InternalIteratorBase* NewEmptyInternalIterator( Arena* arena); template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 4205d298b..3bbc6d870 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -229,8 +229,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); DataBlockIter iter; - properties_block.NewIterator(BytewiseComparator(), - BytewiseComparator(), &iter); + properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), + &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -386,9 +386,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, // are to compress it. Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -459,8 +458,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -504,8 +503,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); diff --git a/table/table_test.cc b/table/table_test.cc index 2e2286efa..418ecf004 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -236,7 +236,7 @@ class BlockConstructor: public Constructor { } InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewIterator(comparator_, comparator_); + return block_->NewDataIterator(comparator_, comparator_); } private: @@ -308,8 +308,9 @@ class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, - int level = -1) + int level = -1, SequenceNumber largest_seqno = 0) : Constructor(cmp), + largest_seqno_(largest_seqno), convert_to_internal_key_(convert_to_internal_key), level_(level) {} ~TableConstructor() override { Reset(); } @@ -326,6 +327,14 @@ class TableConstructor: public Constructor { std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; + + if (largest_seqno_ != 0) { + // Pretend that it's an external file written by SstFileWriter. + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + } + std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, @@ -362,7 +371,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_), + level_, largest_seqno_, nullptr), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -428,6 +437,7 @@ class TableConstructor: public Constructor { std::unique_ptr file_writer_; std::unique_ptr file_reader_; std::unique_ptr table_reader_; + SequenceNumber largest_seqno_; bool convert_to_internal_key_; int level_; @@ -1484,7 +1494,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - for (int i = 0; i < 4; ++i) { + for (int i = 0; i <= 5; ++i) { Options options; // Make each key/value an individual block table_options.block_size = 64; @@ -1515,11 +1525,16 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { options.prefix_extractor.reset(NewFixedPrefixTransform(4)); break; case 4: - default: - // Binary search index + // Two-level index table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.table_factory.reset(new BlockBasedTableFactory(table_options)); break; + case 5: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; } TableConstructor c(BytewiseComparator(), @@ -1663,10 +1678,10 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int /*suffix_len*/ = 800) { + std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); - c->Add(k.Encode().ToString(), "v"); + c->Add(k.Encode().ToString(), value); } void TableTest::IndexTest(BlockBasedTableOptions table_options) { @@ -1845,6 +1860,286 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { ASSERT_TRUE(iter->status().IsIncomplete()); } +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + IndexTest(table_options); +} + +class CustomFlushBlockPolicy : public FlushBlockPolicyFactory, + public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(std::vector keys_per_block) + : keys_per_block_(keys_per_block) {} + + const char* Name() const override { return "table_test"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + + bool Update(const Slice&, const Slice&) override { + if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) { + ++current_block_idx_; + keys_in_current_block_ = 1; + return true; + } + + ++keys_in_current_block_; + return false; + } + + std::vector keys_per_block_; + + int current_block_idx_ = 0; + int keys_in_current_block_ = 0; +}; + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { + for (int use_first_key = 0; use_first_key < 2; ++use_first_key) { + SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = + use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey + : BlockBasedTableOptions::kBinarySearch; + table_options.block_cache = NewLRUCache(10000); // fits all blocks + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(std::vector{2, 1, 3, 2}); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + + // Block 0. + AddInternalKey(&c, "aaaa", "v0"); + AddInternalKey(&c, "aaac", "v1"); + + // Block 1. + AddInternalKey(&c, "aaca", "v2"); + + // Block 2. + AddInternalKey(&c, "caaa", "v3"); + AddInternalKey(&c, "caac", "v4"); + AddInternalKey(&c, "caae", "v5"); + + // Block 3. + AddInternalKey(&c, "ccaa", "v6"); + AddInternalKey(&c, "ccac", "v7"); + + // Write the file. + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(8, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(4u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Shouldn't have read data blocks before iterator is seeked. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + // Seek to a key between blocks. If index contains first key, we shouldn't + // read any data blocks until value is requested. + iter->Seek(ikey("aaba")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the middle of a block. The block should be read right away. + iter->Seek(ikey("caab")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to just before the same block and don't access value. + // The iterator should keep pinning the block contents. + iter->Seek(ikey("baaa")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the same block again to check that the block is still pinned. + iter->Seek(ikey("caae")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[5], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v5", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and fall through to the next block. Don't access value. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[6], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward again. Block should be read. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and reach the end. + iter->Next(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to a single-key block and step forward without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 1 : 2, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Seek between blocks and step back without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[1], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + // All blocks are in cache now, there'll be no more misses ever. + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v1", iter->value().ToString()); + + // Next into the next block again. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 4, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to first and step back without accessing value. + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[0], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Prev(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Do some SeekForPrev() and SeekToLast() just to cover all methods. + iter->SeekForPrev(ikey("caad")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + table_options.block_cache = NewLRUCache(10000); + Options options; + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, + /* level */ -1, /* largest_seqno */ 42); + + c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x"); + c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(2, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(1u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + EXPECT_NE(keys[0], iter->key().ToString()); + // Key should have been served from index, without reading data blocks. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + EXPECT_EQ("x", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + + c.ResetTableReader(); +} + // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. @@ -3606,9 +3901,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); bool found_properties_block = true; ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); ASSERT_TRUE(found_properties_block); @@ -3688,8 +3982,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewIterator(options.comparator, - options.comparator)}; + metaindex_block.NewDataIterator(options.comparator, options.comparator)}; uint64_t max_offset = 0; std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 7ff73cd4e..1cb00b639 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -19,11 +19,11 @@ namespace rocksdb { namespace { -class TwoLevelIndexIterator : public InternalIteratorBase { +class TwoLevelIndexIterator : public InternalIteratorBase { public: explicit TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); ~TwoLevelIndexIterator() override { first_level_iter_.DeleteIter(false /* is_arena_mode */); @@ -43,7 +43,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { assert(Valid()); return second_level_iter_.key(); } - BlockHandle value() const override { + IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -69,12 +69,12 @@ class TwoLevelIndexIterator : public InternalIteratorBase { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIteratorBase* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapperBase first_level_iter_; - IteratorWrapperBase second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. @@ -83,7 +83,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { TwoLevelIndexIterator::TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} void TwoLevelIndexIterator::Seek(const Slice& target) { @@ -177,8 +177,8 @@ void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { } void TwoLevelIndexIterator::SetSecondLevelIterator( - InternalIteratorBase* iter) { - InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } @@ -186,14 +186,14 @@ void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - BlockHandle handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value().handle; if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIteratorBase* iter = + InternalIteratorBase* iter = state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); @@ -203,9 +203,9 @@ void TwoLevelIndexIterator::InitDataBlock() { } // namespace -InternalIteratorBase* NewTwoLevelIterator( +InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) { + InternalIteratorBase* first_level_iter) { return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 55d5c01a4..545c29f49 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,11 +22,10 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIteratorBase* NewSecondaryIterator( + virtual InternalIteratorBase* NewSecondaryIterator( const BlockHandle& handle) = 0; }; - // Return a new two level iterator. A two-level iterator contains an // index iterator whose values point to a sequence of blocks where // each block is itself a sequence of key,value pairs. The returned @@ -37,8 +36,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIteratorBase* NewTwoLevelIterator( +extern InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 4e37cde40..61a49d88a 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -9,6 +9,7 @@ #include "test_util/testutil.h" +#include #include #include @@ -197,8 +198,12 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { opt.cache_index_and_filter_blocks = rnd->Uniform(2); opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); opt.pin_top_level_index_and_filter = rnd->Uniform(2); - opt.index_type = rnd->Uniform(2) ? BlockBasedTableOptions::kBinarySearch - : BlockBasedTableOptions::kHashSearch; + using IndexType = BlockBasedTableOptions::IndexType; + const std::array index_types = { + {IndexType::kBinarySearch, IndexType::kHashSearch, + IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}}; + opt.index_type = + index_types[rnd->Uniform(static_cast(index_types.size()))]; opt.hash_index_allow_collision = rnd->Uniform(2); opt.checksum = static_cast(rnd->Uniform(3)); opt.block_size = rnd->Uniform(10000000); diff --git a/util/coding.h b/util/coding.h index 4046a2b60..9427d5261 100644 --- a/util/coding.h +++ b/util/coding.h @@ -58,6 +58,7 @@ extern bool GetFixed32(Slice* input, uint32_t* value); extern bool GetFixed16(Slice* input, uint16_t* value); extern bool GetVarint32(Slice* input, uint32_t* value); extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetVarsignedint64(Slice* input, int64_t* value); extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); // This function assumes data is well-formed. extern Slice GetLengthPrefixedSlice(const char* data); @@ -377,6 +378,18 @@ inline bool GetVarint64(Slice* input, uint64_t* value) { } } +inline bool GetVarsignedint64(Slice* input, int64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarsignedint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + // Provide an interface for platform independent endianness transformation inline uint64_t EndianTransform(uint64_t input, size_t size) { char* pos = reinterpret_cast(&input);