diff --git a/db/db_test.cc b/db/db_test.cc
index 1367238d7..e5720007f 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3443,6 +3443,27 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
   ASSERT_EQ("v1", Get("k1"));
   ASSERT_EQ("v2", Get("k2"));
 }
+TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewCappedPrefixTransform(2));
+
+  Reopen(options);
+  ASSERT_OK(Put("kk1", "v1"));
+  ASSERT_OK(Put("kk2", "v2"));
+  ASSERT_OK(Put("kk", "v3"));
+  ASSERT_OK(Put("k", "v4"));
+  Flush();
+
+  ASSERT_EQ("v1", Get("kk1"));
+  ASSERT_EQ("v2", Get("kk2"));
+
+  ASSERT_EQ("v3", Get("kk"));
+  ASSERT_EQ("v4", Get("k"));
+}
 
 TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
   // create a DB with block prefix index
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index e4146858e..488af7b46 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -772,11 +772,13 @@ TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
   Close();
 }
 
-TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLongerThanKey) {
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) {
   Options options = CurrentOptions();
   options.env = env_;
   options.create_if_missing = true;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(20));
+  // All of the keys in this test must be longer than 3 characters
+  constexpr int kMinKeyLen = 3;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen));
   options.memtable_whole_key_filtering = true;
   options.memtable_prefix_bloom_size_ratio = 0.1;
   BlockBasedTableOptions bbto;
@@ -1494,7 +1496,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) {
   Options options = CurrentOptions();
   options.env = env_;
   options.create_if_missing = true;
-  options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
   BlockBasedTableOptions bbto;
   bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
   bbto.cache_index_and_filter_blocks = true;
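Not part of the patch: the two tests above switch from a fixed-size to a capped prefix extractor, so keys shorter than the cap still yield a prefix. Below is a minimal standalone sketch of the same kHashSearch + capped-prefix configuration against the public API; the DB path and the keys are illustrative only.

```cpp
// Minimal sketch (assumptions: public RocksDB API only; "/tmp/prefix_demo"
// and the keys are illustrative).
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Hash-based index over key prefixes inside each SST file.
  rocksdb::BlockBasedTableOptions table_options;
  table_options.index_type = rocksdb::BlockBasedTableOptions::kHashSearch;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  // Capped (rather than fixed) prefix: keys shorter than 2 bytes still
  // produce a prefix, so "k" and "kk" below remain reachable via Get().
  options.prefix_extractor.reset(rocksdb::NewCappedPrefixTransform(2));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/prefix_demo", &db);
  assert(s.ok());

  db->Put(rocksdb::WriteOptions(), "kk1", "v1");
  db->Put(rocksdb::WriteOptions(), "k", "v4");

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "k", &value);
  assert(s.ok() && value == "v4");

  delete db;
  return 0;
}
```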
diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index 9c2a60844..7acc25997 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -126,10 +126,24 @@ struct DecodeKeyV4 {
   }
 };
 
-void DataBlockIter::NextImpl() { ParseNextDataKey<DecodeEntry>(); }
+struct DecodeEntryV4 {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    assert(value_length);
+
+    *value_length = 0;
+    return DecodeKeyV4()(p, limit, shared, non_shared);
+  }
+};
+void DataBlockIter::NextImpl() {
+  bool is_shared = false;
+  ParseNextDataKey(&is_shared);
+}
 
-void DataBlockIter::NextOrReportImpl() {
-  ParseNextDataKey<CheckAndDecodeEntry>();
+void MetaBlockIter::NextImpl() {
+  bool is_shared = false;
+  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
 }
 
 void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
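For context on what DecodeEntry/DecodeEntryV4 (and the templated ParseNextKey introduced further down) parse: a block entry is a varint32 triple (shared, non_shared, value_length) followed by the non-shared key bytes and then the value bytes; the V4 index key format omits value_length, which is why DecodeEntryV4 pins it to zero. A simplified standalone decoder sketch, not the actual RocksDB implementation:

```cpp
// Simplified illustration of the prefix-compressed entry layout; the real
// decoding lives in DecodeEntry/DecodeKeyV4 and includes fast paths and
// corruption reporting that are omitted here.
#include <cstdint>
#include <string>

// Minimal varint32 reader (RocksDB uses GetVarint32Ptr internally).
static const char* ReadVarint32(const char* p, const char* limit,
                                uint32_t* value) {
  uint32_t result = 0;
  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*p++);
    if (byte & 128) {
      result |= ((byte & 127) << shift);
    } else {
      result |= (byte << shift);
      *value = result;
      return p;
    }
  }
  return nullptr;  // truncated or corrupt varint
}

// Decodes one entry and rebuilds the full key from the previous key.
// Returns the start of the next entry, or nullptr on corruption.
static const char* DecodeOneEntry(const char* p, const char* limit,
                                  std::string* key /* in: prev, out: cur */,
                                  std::string* value) {
  uint32_t shared = 0, non_shared = 0, value_length = 0;
  p = ReadVarint32(p, limit, &shared);
  if (p != nullptr) p = ReadVarint32(p, limit, &non_shared);
  if (p != nullptr) p = ReadVarint32(p, limit, &value_length);
  if (p == nullptr || key->size() < shared) return nullptr;
  key->resize(shared);         // keep the shared prefix of the previous key
  key->append(p, non_shared);  // append the non-shared suffix
  value->assign(p + non_shared, value_length);
  return p + non_shared + value_length;
}
```

The `is_shared` out-parameter threaded through this patch simply records whether `shared != 0`, so IndexBlockIter can tell restart entries apart from delta-encoded ones without re-deriving it.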
@@ -153,6 +167,27 @@ void IndexBlockIter::PrevImpl() {
   }
 }
 
+void MetaBlockIter::PrevImpl() {
+  assert(Valid());
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+  SeekToRestartPoint(restart_index_);
+  bool is_shared = false;
+  // Loop until end of current entry hits the start of original entry
+  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
+         NextEntryOffset() < original) {
+  }
+}
+
 // Similar to IndexBlockIter::PrevImpl but also caches the prev entries
 void DataBlockIter::PrevImpl() {
   assert(Valid());
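MetaBlockIter::PrevImpl follows the same pattern as the existing index/data iterators: back up to the nearest restart point before the current entry, then re-parse forward until the next entry would be the one we started from. A toy sketch of that two-phase walk (names and data layout are illustrative, not RocksDB internals):

```cpp
// Toy illustration of "step back to a restart point, then re-scan forward".
// entry_offsets lists where each entry starts, in order; restart_points is a
// sorted subset of those offsets.
#include <cstdint>
#include <vector>

uint32_t PrevEntryOffset(const std::vector<uint32_t>& entry_offsets,
                         const std::vector<uint32_t>& restart_points,
                         uint32_t current) {
  // Phase 1: find the last restart point strictly before `current`.
  size_t r = restart_points.size();
  while (r > 0 && restart_points[r - 1] >= current) {
    --r;
  }
  if (r == 0) {
    return UINT32_MAX;  // `current` was the first entry; nothing before it
  }
  const uint32_t restart = restart_points[r - 1];

  // Phase 2: scan forward from the restart point; the previous entry is the
  // one whose successor starts at (or after) `current`.
  for (size_t i = 0; i + 1 < entry_offsets.size(); ++i) {
    if (entry_offsets[i] >= restart && entry_offsets[i + 1] >= current) {
      return entry_offsets[i];
    }
  }
  return restart;
}
```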
@@ -212,7 +247,8 @@
   SeekToRestartPoint(restart_index_);
 
   do {
-    if (!ParseNextDataKey<DecodeEntry>()) {
+    bool is_shared = false;
+    if (!ParseNextDataKey(&is_shared)) {
       break;
     }
     Slice current_key = raw_key_.GetKey();
@@ -250,6 +286,22 @@ void DataBlockIter::SeekImpl(const Slice& target) {
   FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
 }
 
+void MetaBlockIter::SeekImpl(const Slice& target) {
+  Slice seek_key = target;
+  PERF_TIMER_GUARD(block_seek_nanos);
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = BinarySeek(seek_key, &index, &skip_linear_scan);
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+}
+
 // Optimized Seek for point lookup for an internal key `target`
 // target = "seek_user_key @ type | seqno".
 //
@@ -309,23 +361,21 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
   // check if the key is in the restart_interval
   assert(restart_index < num_restarts_);
   SeekToRestartPoint(restart_index);
+  current_ = GetRestartPoint(restart_index);
 
-  const char* limit = nullptr;
-  if (restart_index_ + 1 < num_restarts_) {
-    limit = data_ + GetRestartPoint(restart_index_ + 1);
-  } else {
-    limit = data_ + restarts_;
+  uint32_t limit = restarts_;
+  if (restart_index + 1 < num_restarts_) {
+    limit = GetRestartPoint(restart_index + 1);
   }
-
-  while (true) {
+  while (current_ < limit) {
+    bool shared;
     // Here we only linear seek the target key inside the restart interval.
     // If a key does not exist inside a restart interval, we avoid
-    // further searching the block content accross restart interval boundary.
+    // further searching the block content across restart interval boundary.
     //
-    // TODO(fwu): check the left and write boundary of the restart interval
+    // TODO(fwu): check the left and right boundary of the restart interval
     // to avoid linear seek a target key that is out of range.
-    if (!ParseNextDataKey<DecodeEntry>(limit) ||
-        CompareCurrentKey(target) >= 0) {
+    if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) {
       // we stop at the first potential matching user key.
       break;
     }
@@ -336,7 +386,7 @@
   // 1) there is only one user_key match in the block (otherwise collsion).
   //    the matching user_key resides in the last restart interval, and it
   //    is the last key of the restart interval and of the block as well.
-  //    ParseNextDataKey() skiped it as its [ type | seqno ] is smaller.
+  //    ParseNextKey() skipped it as its [ type | seqno ] is smaller.
   //
   // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
   //    AND all existing user_keys in the restart interval are smaller than
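The SeekForGetImpl path modified above is only exercised when the table was built with the optional in-block hash index; with the default binary-search-only data block index, point lookups never reach it. A hedged sketch of enabling that index through the public options (values are illustrative):

```cpp
// Sketch using only public options; 0.75 is just the documented default
// utilization ratio, not a recommendation made by this patch.
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeHashIndexedDataBlockOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // Build a small hash index inside each data block, in addition to the
  // regular restart-point binary search structure.
  table_options.data_block_index_type =
      rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
  // Hash table utilization; smaller values use more space but collide less.
  table_options.data_block_hash_table_util_ratio = 0.75;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```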
@@ -432,20 +482,46 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
   }
 }
 
+void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
+  PERF_TIMER_GUARD(block_seek_nanos);
+  Slice seek_key = target;
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = BinarySeek(seek_key, &index, &skip_linear_scan);
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+
+  if (!Valid()) {
+    SeekToLastImpl();
+  } else {
+    while (Valid() && CompareCurrentKey(seek_key) > 0) {
+      PrevImpl();
+    }
+  }
+}
+
 void DataBlockIter::SeekToFirstImpl() {
   if (data_ == nullptr) {  // Not init yet
     return;
   }
   SeekToRestartPoint(0);
-  ParseNextDataKey<DecodeEntry>();
+  bool is_shared = false;
+  ParseNextDataKey(&is_shared);
 }
 
-void DataBlockIter::SeekToFirstOrReportImpl() {
+void MetaBlockIter::SeekToFirstImpl() {
   if (data_ == nullptr) {  // Not init yet
     return;
   }
   SeekToRestartPoint(0);
-  ParseNextDataKey<CheckAndDecodeEntry>();
+  bool is_shared = false;
+  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
 }
 
 void IndexBlockIter::SeekToFirstImpl() {
@@ -462,7 +538,20 @@ void DataBlockIter::SeekToLastImpl() {
     return;
   }
   SeekToRestartPoint(num_restarts_ - 1);
-  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
+  bool is_shared = false;
+  while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
     // Keep skipping
   }
 }
+
+void MetaBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  bool is_shared = false;
+  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
+         NextEntryOffset() < restarts_) {
+    // Keep skipping
+  }
+}
@@ -487,13 +576,12 @@ void BlockIter<TValue>::CorruptionError() {
   value_.clear();
 }
 
+template <class TValue>
 template <typename DecodeEntryFunc>
-bool DataBlockIter::ParseNextDataKey(const char* limit) {
+bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
   current_ = NextEntryOffset();
   const char* p = data_ + current_;
-  if (!limit) {
-    limit = data_ + restarts_;  // Restarts come right after data
-  }
+  const char* limit = data_ + restarts_;  // Restarts come right after data
 
   if (p >= limit) {
     // No more entries to return.  Mark as invalid.
@@ -501,7 +589,6 @@
     current_ = restarts_;
     restart_index_ = num_restarts_;
     return false;
   }
-
   // Decode next entry
   uint32_t shared, non_shared, value_length;
   p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
@@ -510,14 +597,30 @@
     return false;
   } else {
     if (shared == 0) {
+      *is_shared = false;
      // If this key doesn't share any bytes with prev key then we don't need
      // to decode it and can use its address in the block directly.
      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
    } else {
      // This key share `shared` bytes with prev key, we need to decode it
+      *is_shared = true;
      raw_key_.TrimAppend(shared, p, non_shared);
    }
+    value_ = Slice(p + non_shared, value_length);
+    if (shared == 0) {
+      while (restart_index_ + 1 < num_restarts_ &&
+             GetRestartPoint(restart_index_ + 1) < current_) {
+        ++restart_index_;
+      }
+    }
+    // else we are in the middle of a restart interval and the restart_index_
+    // thus has not changed
+    return true;
+  }
+}
+bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
+  if (ParseNextKey<DecodeEntry>(is_shared)) {
 #ifndef NDEBUG
     if (global_seqno_ != kDisableGlobalSequenceNumber) {
       // If we are reading a file with a global sequence number we should
@@ -536,64 +639,22 @@ bool DataBlockIter::ParseNextDataKey(const char* limit) {
       assert(seqno == 0);
     }
 #endif  // NDEBUG
-
-    value_ = Slice(p + non_shared, value_length);
-    if (shared == 0) {
-      while (restart_index_ + 1 < num_restarts_ &&
-             GetRestartPoint(restart_index_ + 1) < current_) {
-        ++restart_index_;
-      }
-    }
-    // else we are in the middle of a restart interval and the restart_index_
-    // thus has not changed
     return true;
+  } else {
+    return false;
   }
 }
 
 bool IndexBlockIter::ParseNextIndexKey() {
-  current_ = NextEntryOffset();
-  const char* p = data_ + current_;
-  const char* limit = data_ + restarts_;  // Restarts come right after data
-  if (p >= limit) {
-    // No more entries to return.  Mark as invalid.
-    current_ = restarts_;
-    restart_index_ = num_restarts_;
-    return false;
-  }
-
-  // Decode next entry
-  uint32_t shared, non_shared, value_length;
-  if (value_delta_encoded_) {
-    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
-    value_length = 0;
-  } else {
-    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
-  }
-  if (p == nullptr || raw_key_.Size() < shared) {
-    CorruptionError();
-    return false;
-  }
-  if (shared == 0) {
-    // If this key doesn't share any bytes with prev key then we don't need
-    // to decode it and can use its address in the block directly.
-    raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
-  } else {
-    // This key share `shared` bytes with prev key, we need to decode it
-    raw_key_.TrimAppend(shared, p, non_shared);
-  }
-  value_ = Slice(p + non_shared, value_length);
-  if (shared == 0) {
-    while (restart_index_ + 1 < num_restarts_ &&
-           GetRestartPoint(restart_index_ + 1) < current_) {
-      ++restart_index_;
+  bool is_shared = false;
+  bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
+                                   : ParseNextKey<DecodeEntry>(&is_shared);
+  if (ok) {
+    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+      DecodeCurrentValue(is_shared);
     }
   }
-  // else we are in the middle of a restart interval and the restart_index_
-  // thus has not changed
-  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
-    DecodeCurrentValue(shared);
-  }
-  return true;
+  return ok;
 }
 
 // The format:
@@ -604,15 +665,15 @@
 // where, k is key, v is value, and its encoding is in parenthesis.
 // The format of each key is (shared_size, non_shared_size, shared, non_shared)
 // The format of each value, i.e., block handle, is (offset, size) whenever the
-// shared_size is 0, which included the first entry in each restart point.
+// is_shared is false, which includes the first entry in each restart point.
 // Otherwise the format is delta-size = block handle size - size of last block
 // handle.
-void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
+void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
   Slice v(value_.data(), data_ + restarts_ - value_.data());
   // Delta encoding is used if `shared` != 0.
   Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
       &v, have_first_key_,
-      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
+      (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
   assert(decode_s.ok());
   value_ = Slice(value_.data(), v.data() - value_.data());
@@ -970,6 +1031,21 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
   }
 }
 
+MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
+  MetaBlockIter* iter = new MetaBlockIter();
+  if (size_ < 2 * sizeof(uint32_t)) {
+    iter->Invalidate(Status::Corruption("bad block contents"));
+    return iter;
+  } else if (num_restarts_ == 0) {
+    // Empty block.
+    iter->Invalidate(Status::OK());
+  } else {
+    iter->Initialize(data_, restart_offset_, num_restarts_,
+                     block_contents_pinned);
+  }
+  return iter;
+}
+
 DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
                                       SequenceNumber global_seqno,
                                       DataBlockIter* iter, Statistics* stats,
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 000f64842..348386298 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -34,6 +34,7 @@ template <class TValue>
 class BlockIter;
 class DataBlockIter;
 class IndexBlockIter;
+class MetaBlockIter;
 class BlockPrefixIndex;
 
 // BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data
@@ -192,6 +193,21 @@ class Block {
                                  Statistics* stats = nullptr,
                                  bool block_contents_pinned = false);
 
+  // Returns a MetaBlockIter for iterating over blocks containing metadata
+  // (like Properties blocks). Unlike data blocks, the keys for these blocks
+  // do not contain sequence numbers, do not use a user-defined comparator,
+  // and do not track read amplification/statistics. Additionally, MetaBlocks
+  // will not assert if the block is formatted improperly.
+  //
+  // If `block_contents_pinned` is true, the caller will guarantee that when
+  // the cleanup functions are transferred from the iterator to other
+  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+  // valid. Either the iterator holds a cache handle or ownership of some
+  // resource and releases them in a release function, or the caller is sure
+  // that the data will not go away (for example, it's from an mmapped file
+  // which will not be closed).
+  MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);
+
   // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
   // comparator.
   //
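A sketch of how the new iterator is intended to be used, mirroring the call sites this patch converts in meta_blocks.cc and table_test.cc. It relies on internal headers, so it only builds inside the RocksDB tree, and the helper name is made up for illustration:

```cpp
// Internal-API sketch (hypothetical helper); `contents` is assumed to hold a
// decoded metaindex or properties block.
#include <memory>

#include "table/block_based/block.h"
#include "table/format.h"

namespace ROCKSDB_NAMESPACE {

void VisitMetaBlockEntries(BlockContents&& contents) {
  Block meta_block(std::move(contents));
  // Meta blocks hold plain user keys and use the bytewise comparator, so no
  // comparator or global sequence number has to be supplied here.
  std::unique_ptr<MetaBlockIter> iter(meta_block.NewMetaIterator());
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // iter->key() is the entry name (e.g. a meta block name in a metaindex
    // block); iter->value() is its raw, still-encoded value.
  }
}

}  // namespace ROCKSDB_NAMESPACE
```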
@@ -269,10 +285,9 @@ class BlockIter : public InternalIteratorBase<TValue> {
 
   // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
   // nothing. Calls cleanup functions.
-  void InvalidateBase(Status s) {
+  virtual void Invalidate(const Status& s) {
     // Assert that the BlockIter is never deleted while Pinning is Enabled.
-    assert(!pinned_iters_mgr_ ||
-           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
+    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
 
     data_ = nullptr;
     current_ = restarts_;
@@ -384,8 +399,12 @@ class BlockIter : public InternalIteratorBase<TValue> {
   virtual void SeekImpl(const Slice& target) = 0;
   virtual void SeekForPrevImpl(const Slice& target) = 0;
   virtual void NextImpl() = 0;
+  virtual void PrevImpl() = 0;
 
+  template <typename DecodeEntryFunc>
+  inline bool ParseNextKey(bool* is_shared);
+
   InternalKeyComparator icmp() {
     return InternalKeyComparator(raw_ucmp_, false /* named */);
   }
@@ -519,24 +538,8 @@ class DataBlockIter final : public BlockIter<Slice> {
     return res;
   }
 
-  // Try to advance to the next entry in the block. If there is data corruption
-  // or error, report it to the caller instead of aborting the process. May
-  // incur higher CPU overhead because we need to perform check on every entry.
-  void NextOrReport() {
-    NextOrReportImpl();
-    UpdateKey();
-  }
-
-  // Try to seek to the first entry in the block. If there is data corruption
-  // or error, report it to caller instead of aborting the process. May incur
-  // higher CPU overhead because we need to perform check on every entry.
-  void SeekToFirstOrReport() {
-    SeekToFirstOrReportImpl();
-    UpdateKey();
-  }
-
-  void Invalidate(Status s) {
-    InvalidateBase(s);
+  void Invalidate(const Status& s) override {
+    BlockIter::Invalidate(s);
     // Clear prev entries cache.
     prev_entries_keys_buff_.clear();
     prev_entries_.clear();
@@ -544,12 +547,14 @@
   }
 
  protected:
-  virtual void SeekToFirstImpl() override;
-  virtual void SeekToLastImpl() override;
-  virtual void SeekImpl(const Slice& target) override;
-  virtual void SeekForPrevImpl(const Slice& target) override;
-  virtual void NextImpl() override;
-  virtual void PrevImpl() override;
+  friend Block;
+  inline bool ParseNextDataKey(bool* is_shared);
+  void SeekToFirstImpl() override;
+  void SeekToLastImpl() override;
+  void SeekImpl(const Slice& target) override;
+  void SeekForPrevImpl(const Slice& target) override;
+  void NextImpl() override;
+  void PrevImpl() override;
 
  private:
   // read-amp bitmap
@@ -582,12 +587,38 @@ class DataBlockIter final : public BlockIter<Slice> {
 
   DataBlockHashIndex* data_block_hash_index_;
 
-  template <typename DecodeEntryFunc>
-  inline bool ParseNextDataKey(const char* limit = nullptr);
-
   bool SeekForGetImpl(const Slice& target);
-  void NextOrReportImpl();
-  void SeekToFirstOrReportImpl();
+};
+
+// Iterator over MetaBlocks.  MetaBlocks are similar to Data Blocks and
+// are used to store Properties associated with the table.
+// Meta blocks always store user keys (no sequence number) and always
+// use the BytewiseComparator.  Additionally, MetaBlock accesses are
+// not recorded in the Statistics or for Read-Amplification.
+class MetaBlockIter final : public BlockIter<Slice> {
+ public:
+  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
+  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
+                  bool block_contents_pinned) {
+    // Initializes the iterator with a BytewiseComparator and
+    // the raw key being a user key.
+    InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
+                   kDisableGlobalSequenceNumber, block_contents_pinned);
+    raw_key_.SetIsUserKey(true);
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return value_;
+  }
+
+ protected:
+  void SeekToFirstImpl() override;
+  void SeekToLastImpl() override;
+  void SeekImpl(const Slice& target) override;
+  void SeekForPrevImpl(const Slice& target) override;
+  void NextImpl() override;
+  void PrevImpl() override;
+};
 
 class IndexBlockIter final : public BlockIter<IndexValue> {
@@ -635,8 +666,6 @@
     }
   }
 
-  void Invalidate(Status s) { InvalidateBase(s); }
-
   bool IsValuePinned() const override {
     return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
   }
@@ -713,7 +742,7 @@
 
   // When value_delta_encoded_ is enabled it decodes the value which is assumed
   // to be BlockHandle and put it to decoded_value_
-  inline void DecodeCurrentValue(uint32_t shared);
+  inline void DecodeCurrentValue(bool is_shared);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index c0815e623..8928e586a 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -1112,8 +1112,7 @@ Status BlockBasedTable::ReadMetaIndexBlock(
   *metaindex_block = std::move(metaindex);
 
   // meta block uses bytewise comparator.
-  iter->reset(metaindex_block->get()->NewDataIterator(
-      BytewiseComparator(), kDisableGlobalSequenceNumber));
+  iter->reset(metaindex_block->get()->NewMetaIterator());
   return Status::OK();
 }
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 80e757295..729f4d794 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -255,9 +255,7 @@ Status ReadTablePropertiesHelper(
   // and Block hides block_contents
   uint64_t block_size = block_contents.data.size();
   Block properties_block(std::move(block_contents));
-  DataBlockIter iter;
-  properties_block.NewDataIterator(BytewiseComparator(),
-                                   kDisableGlobalSequenceNumber, &iter);
+  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
 
   std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
   // All pre-defined properties of type uint64_t
@@ -308,13 +306,13 @@ Status ReadTablePropertiesHelper(
   };
 
   std::string last_key;
-  for (iter.SeekToFirstOrReport(); iter.Valid(); iter.NextOrReport()) {
-    s = iter.status();
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
     if (!s.ok()) {
       break;
     }
 
-    auto key = iter.key().ToString();
+    auto key = iter->key().ToString();
     // properties block should be strictly sorted with no duplicate key.
     if (!last_key.empty() &&
         BytewiseComparator()->Compare(key, last_key) <= 0) {
@@ -323,12 +321,12 @@ Status ReadTablePropertiesHelper(
     }
     last_key = key;
 
-    auto raw_val = iter.value();
+    auto raw_val = iter->value();
    auto pos = predefined_uint64_properties.find(key);
 
    if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
      new_table_properties->external_sst_file_global_seqno_offset =
-          handle.offset() + iter.ValueOffset();
+          handle.offset() + iter->ValueOffset();
    }
 
    if (pos != predefined_uint64_properties.end()) {
@@ -500,8 +498,7 @@ Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size,
   Block metaindex_block(std::move(metaindex_contents));
 
   std::unique_ptr<InternalIterator> meta_iter;
-  meta_iter.reset(metaindex_block.NewDataIterator(
-      BytewiseComparator(), kDisableGlobalSequenceNumber));
+  meta_iter.reset(metaindex_block.NewMetaIterator());
 
   return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
 }
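Outside the internal reader, the values parsed by ReadTablePropertiesHelper surface through the public TableProperties API. A short sketch assuming an already-open DB (the function name is illustrative):

```cpp
// Sketch against the public API only; assumes `db` is open and valid.
#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

void DumpTableProperties(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection props;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&props);
  if (!s.ok()) {
    return;
  }
  for (const auto& file_and_props : props) {
    const auto& tp = *file_and_props.second;
    std::printf("%s: %llu entries, data size %llu\n",
                file_and_props.first.c_str(),
                static_cast<unsigned long long>(tp.num_entries),
                static_cast<unsigned long long>(tp.data_size));
  }
}
```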
diff --git a/table/table_test.cc b/table/table_test.cc
index 1729d99e2..6a6a1bb95 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -4819,8 +4819,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
 
   // verify properties block comes last
   std::unique_ptr<InternalIterator> metaindex_iter{
-      metaindex_block.NewDataIterator(options.comparator,
-                                      kDisableGlobalSequenceNumber)};
+      metaindex_block.NewMetaIterator()};
   uint64_t max_offset = 0;
   std::string key_at_max_offset;
   for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
@@ -4840,6 +4839,90 @@
   c.ResetTableReader();
 }
 
+TEST_P(BlockBasedTableTest, SeekMetaBlocks) {
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("foo_a1", "val1");
+  c.Add("foo_b2", "val2");
+  c.Add("foo_c3", "val3");
+  c.Add("foo_d4", "val4");
+  c.Add("foo_e5", "val5");
+  c.Add("foo_f6", "val6");
+  c.Add("foo_g7", "val7");
+  c.Add("foo_h8", "val8");
+  c.Add("foo_j9", "val9");
+
+  // write an SST file
+  Options options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(
+      8 /* bits_per_key */, false /* use_block_based_filter */));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+  // get file reader
+  test::StringSink* table_sink = c.TEST_GetSink();
+  std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
+      table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */));
+
+  std::unique_ptr<RandomAccessFileReader> table_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+  size_t table_size = table_sink->contents().size();
+
+  // read footer
+  Footer footer;
+  IOOptions opts;
+  ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(),
+                               nullptr /* prefetch_buffer */, table_size,
+                               &footer, kBlockBasedTableMagicNumber));
+
+  // read metaindex
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  PersistentCacheOptions pcache_opts;
+  BlockFetcher block_fetcher(
+      table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
+      metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
+      false /*maybe_compressed*/, BlockType::kMetaIndex,
+      UncompressionDict::GetEmptyDict(), pcache_opts,
+      nullptr /*memory_allocator*/);
+  ASSERT_OK(block_fetcher.ReadBlockContents());
+  Block metaindex_block(std::move(metaindex_contents));
+
+  // verify that the hash index prefix and metadata blocks can be found
+  std::unique_ptr<InternalIterator> metaindex_iter(
+      metaindex_block.NewMetaIterator());
+  bool has_hash_prefixes = false;
+  bool has_hash_metadata = false;
+  for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
+       metaindex_iter->Next()) {
+    if (metaindex_iter->key().ToString() == kHashIndexPrefixesBlock) {
+      has_hash_prefixes = true;
+    } else if (metaindex_iter->key().ToString() ==
+               kHashIndexPrefixesMetadataBlock) {
+      has_hash_metadata = true;
+    }
+  }
+  if (has_hash_metadata) {
+    metaindex_iter->Seek(kHashIndexPrefixesMetadataBlock);
+    ASSERT_TRUE(metaindex_iter->Valid());
+    ASSERT_EQ(kHashIndexPrefixesMetadataBlock,
+              metaindex_iter->key().ToString());
+  }
+  if (has_hash_prefixes) {
+    metaindex_iter->Seek(kHashIndexPrefixesBlock);
+    ASSERT_TRUE(metaindex_iter->Valid());
+    ASSERT_EQ(kHashIndexPrefixesBlock, metaindex_iter->key().ToString());
+  }
+  c.ResetTableReader();
+}
+
 TEST_P(BlockBasedTableTest, BadOptions) {
   ROCKSDB_NAMESPACE::Options options;
   options.compression = kNoCompression;