diff --git a/HISTORY.md b/HISTORY.md index e65f204af..c2f891890 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Fix a race condition in WAL size tracking which is caused by an unsafe iterator access after container is changed. * Fix unprotected concurrent accesses to `WritableFileWriter::filesize_` by `DB::SyncWAL()` and `DB::Put()` in two write queue mode. * Fix a bug in WAL tracking. Before this PR (#10087), calling `SyncWAL()` on the only WAL file of the db will not log the event in MANIFEST, thus allowing a subsequent `DB::Open` even if the WAL file is missing or corrupted. +* Fix a bug that could return wrong results with `index_type=kHashSearch` and using `SetOptions` to change the `prefix_extractor`. * Fixed a bug in WAL tracking with wal_compression. WAL compression writes a kSetCompressionType record which is not associated with any sequence number. As result, WalManager::GetSortedWalsOfType() will skip these WALs and not return them to caller, e.g. Checkpoint, Backup, causing the operations to fail. ### Public API changes diff --git a/db/db_test.cc b/db/db_test.cc index 14abe8056..bc1808be8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3467,7 +3467,40 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { ASSERT_OK(Flush()); ASSERT_OK(Put("k2", "v2")); - // Reopen it without prefix extractor, make sure everything still works. + // Reopen with different prefix extractor, make sure everything still works. + // RocksDB should just fall back to the binary index. + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); + +#ifndef ROCKSDB_LITE + // Back to original + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}})); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); +#endif // !ROCKSDB_LITE + + // Same if there's a problem initally loading prefix transform + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::ForceNullTablePrefixExtractor", + [&](void* arg) { *static_cast(arg) = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); + +#ifndef ROCKSDB_LITE + // Change again + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); +#endif // !ROCKSDB_LITE + SyncPoint::GetInstance()->DisableProcessing(); + + // Reopen with no prefix extractor, make sure everything still works. // RocksDB should just fall back to the binary index. table_options.index_type = BlockBasedTableOptions::kBinarySearch; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -3477,6 +3510,7 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); } + TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) { // create a DB with block prefix index BlockBasedTableOptions table_options; diff --git a/db/dbformat.h b/db/dbformat.h index 670c188c7..3ff2277aa 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -625,7 +625,7 @@ class IterKey { }; // Convert from a SliceTransform of user keys, to a SliceTransform of -// user keys. +// internal keys. class InternalKeySliceTransform : public SliceTransform { public: explicit InternalKeySliceTransform(const SliceTransform* transform) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 19eab5f16..e119d1bc7 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -654,17 +654,6 @@ Status BlockBasedTable::Open( file_size, level, immortal_table); rep->file = std::move(file); rep->footer = footer; - // We've successfully read the footer. We are ready to serve requests. - // Better not mutate rep_ after the creation. eg. internal_prefix_transform - // raw pointer will be used to create HashIndexReader, whose reset may - // access a dangling pointer. - // We need to wrap data with internal_prefix_transform to make sure it can - // handle prefix correctly. - // FIXME: is changed prefix_extractor handled anywhere for hash index? - if (prefix_extractor != nullptr) { - rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(prefix_extractor.get())); - } // For fully portable/stable cache keys, we need to read the properties // block before setting up cache keys. TODO: consider setting up a bootstrap @@ -694,8 +683,14 @@ Status BlockBasedTable::Open( if (!s.ok()) { return s; } - if (!PrefixExtractorChangedHelper(rep->table_properties.get(), - prefix_extractor.get())) { + bool force_null_table_prefix_extractor = false; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTable::Open::ForceNullTablePrefixExtractor", + &force_null_table_prefix_extractor); + if (force_null_table_prefix_extractor) { + assert(!rep->table_prefix_extractor); + } else if (!PrefixExtractorChangedHelper(rep->table_properties.get(), + prefix_extractor.get())) { // Establish fast path for unchanged prefix_extractor rep->table_prefix_extractor = prefix_extractor; } else { @@ -2003,7 +1998,7 @@ InternalIterator* BlockBasedTable::NewIterator( read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor); std::unique_ptr> index_iter(NewIndexIterator( read_options, - need_upper_bound_check && + /*disable_prefix_seek=*/need_upper_bound_check && rep_->index_type == BlockBasedTableOptions::kHashSearch, /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); if (arena == nullptr) { @@ -2565,18 +2560,10 @@ Status BlockBasedTable::CreateIndexReader( lookup_context, index_reader); } case BlockBasedTableOptions::kHashSearch: { - std::unique_ptr metaindex_guard; - std::unique_ptr metaindex_iter_guard; - bool should_fallback = false; - // FIXME: is changed prefix_extractor handled anywhere for hash index? - if (rep_->internal_prefix_transform.get() == nullptr) { + if (!rep_->table_prefix_extractor) { ROCKS_LOG_WARN(rep_->ioptions.logger, - "No prefix extractor passed in. Fall back to binary" - " search index."); - should_fallback = true; - } - - if (should_fallback) { + "Missing prefix extractor for hash index. Fall back to" + " binary search index."); return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context, index_reader); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 39ef7aa41..2d624cd00 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -596,12 +596,6 @@ struct BlockBasedTable::Rep { BlockBasedTableOptions::IndexType index_type; bool whole_key_filtering; bool prefix_filtering; - // TODO(kailiu) It is very ugly to use internal key in table, since table - // module should not be relying on db module. However to make things easier - // and compatible with existing code, we introduce a wrapper that allows - // block to extract prefix without knowing if a key is internal or not. - // null if no prefix_extractor is passed in when opening the table reader. - std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; std::shared_ptr fragmented_range_dels; diff --git a/table/block_based/block_prefix_index.cc b/table/block_based/block_prefix_index.cc index f9d92c74c..c83701d69 100644 --- a/table/block_based/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -69,9 +69,6 @@ struct PrefixRecord { class BlockPrefixIndex::Builder { public: - explicit Builder(const SliceTransform* internal_prefix_extractor) - : internal_prefix_extractor_(internal_prefix_extractor) {} - void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) { PrefixRecord* record = reinterpret_cast( arena_.AllocateAligned(sizeof(PrefixRecord))); @@ -82,7 +79,7 @@ class BlockPrefixIndex::Builder { prefixes_.push_back(record); } - BlockPrefixIndex* Finish() { + BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) { // For now, use roughly 1:1 prefix to bucket ratio. uint32_t num_buckets = static_cast(prefixes_.size()) + 1; @@ -154,25 +151,22 @@ class BlockPrefixIndex::Builder { assert(offset == total_block_array_entries); - return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, - buckets, total_block_array_entries, - block_array_buffer); + return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets, + total_block_array_entries, block_array_buffer); } private: - const SliceTransform* internal_prefix_extractor_; - std::vector prefixes_; Arena arena_; }; -Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, +Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor, const Slice& prefixes, const Slice& prefix_meta, BlockPrefixIndex** prefix_index) { uint64_t pos = 0; auto meta_pos = prefix_meta; Status s; - Builder builder(internal_prefix_extractor); + Builder builder; while (!meta_pos.empty()) { uint32_t prefix_size = 0; @@ -201,14 +195,14 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, } if (s.ok()) { - *prefix_index = builder.Finish(); + *prefix_index = builder.Finish(prefix_extractor); } return s; } uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { - Slice prefix = internal_prefix_extractor_->Transform(key); + Slice prefix = internal_prefix_extractor_.Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); uint32_t block_id = buckets_[bucket]; diff --git a/table/block_based/block_prefix_index.h b/table/block_based/block_prefix_index.h index 04121320e..4db8e2c65 100644 --- a/table/block_based/block_prefix_index.h +++ b/table/block_based/block_prefix_index.h @@ -5,6 +5,8 @@ #pragma once #include + +#include "db/dbformat.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { @@ -31,6 +33,8 @@ class BlockPrefixIndex { } // Create hash index by reading from the metadata blocks. + // Note: table reader (caller) is responsible for keeping shared_ptr to + // underlying prefix extractor // @params prefixes: a sequence of prefixes. // @params prefix_meta: contains the "metadata" to of the prefixes. static Status Create(const SliceTransform* hash_key_extractor, @@ -46,17 +50,17 @@ class BlockPrefixIndex { class Builder; friend Builder; - BlockPrefixIndex(const SliceTransform* internal_prefix_extractor, - uint32_t num_buckets, uint32_t* buckets, - uint32_t num_block_array_buffer_entries, + BlockPrefixIndex(const SliceTransform* prefix_extractor, uint32_t num_buckets, + uint32_t* buckets, uint32_t num_block_array_buffer_entries, uint32_t* block_array_buffer) - : internal_prefix_extractor_(internal_prefix_extractor), + : internal_prefix_extractor_(prefix_extractor), num_buckets_(num_buckets), num_block_array_buffer_entries_(num_block_array_buffer_entries), buckets_(buckets), block_array_buffer_(block_array_buffer) {} - const SliceTransform* internal_prefix_extractor_; + InternalKeySliceTransform internal_prefix_extractor_; + uint32_t num_buckets_; uint32_t num_block_array_buffer_entries_; uint32_t* buckets_; diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 839c79dd9..bcaba17a2 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -95,8 +95,8 @@ Status HashIndexReader::Create(const BlockBasedTable* table, } BlockPrefixIndex* prefix_index = nullptr; - assert(rep->internal_prefix_transform.get() != nullptr); - s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + assert(rep->table_prefix_extractor); + s = BlockPrefixIndex::Create(rep->table_prefix_extractor.get(), prefixes_contents.data, prefixes_meta_contents.data, &prefix_index); // TODO: log error