diff --git a/HISTORY.md b/HISTORY.md index 4e5a350c4..77ca9a169 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ * Fixed a potential hang in shutdown for a DB whose `Env` has high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`) * Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory. * Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones. +* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in current filter partition can cause wrong iteration result. ### Performance Improvements * On ARM platform, use `yield` instead of `wfe` to relax cpu to gain better performance. diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 7c9277c14..30fc11eba 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -7,6 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" @@ -2117,6 +2120,54 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { } } +TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { + Options options = CurrentOptions(); + constexpr size_t kNumKeys = 10000; + static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeys + 10)); + options.create_if_missing = true; + constexpr size_t kPrefixLength = 4; + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(50)); + bbto.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + bbto.block_size = 128; + bbto.metadata_block_size = 128; + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const std::string value(64, '\0'); + + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t i = 0; i < kNumKeys; ++i) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i; + ASSERT_OK(db_->Put(write_opts, oss.str(), value)); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + // Use legacy, implicit prefix seek + read_opts.total_order_seek = false; + read_opts.auto_prefix_mode = false; + std::unique_ptr it(db_->NewIterator(read_opts)); + for (size_t i = 0; i < kNumKeys; ++i) { + // Seek with a key after each one added but with same prefix. One will + // surely cross a partition boundary. + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a"; + it->SeekForPrev(oss.str()); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + } + it.reset(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 64e3f5432..d7112ddc4 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -22,6 +22,7 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), + last_key_in_domain_(false), num_added_(0) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); @@ -30,6 +31,15 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); + + if (!last_prefix_recorded_ && last_key_in_domain_) { + // We can reach here when a new filter partition starts in partitioned + // filter. The last prefix in the previous partition should be added if + // necessary regardless of key_without_ts, to support prefix SeekForPrev. + AddKey(last_prefix_str_); + last_prefix_recorded_ = true; + } + if (whole_key_filtering_) { if (!add_prefix) { AddKey(key_without_ts); @@ -49,7 +59,10 @@ void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { } } if (add_prefix) { + last_key_in_domain_ = true; AddPrefix(key_without_ts); + } else { + last_key_in_domain_ = false; } } @@ -61,6 +74,7 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) { // Add prefix to filter if needed void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 89f9e40c9..1285e25fc 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -61,6 +61,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { virtual void Reset(); void AddPrefix(const Slice& key); const SliceTransform* prefix_extractor() { return prefix_extractor_; } + const std::string& last_prefix_str() const { return last_prefix_str_; } private: // important: all of these might point to invalid addresses @@ -72,10 +73,14 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { std::string last_whole_key_str_; bool last_prefix_recorded_; std::string last_prefix_str_; + // Whether prefix_extractor_->InDomain(last_whole_key_) is true. + // Used in partitioned filters so that the last prefix from the previous + // filter partition will be added to the current partition if + // last_key_in_domain_ is true, regardless of the current key. + bool last_key_in_domain_; uint32_t num_added_; std::unique_ptr filter_data_; - }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 694ce8199..402226980 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -73,13 +73,16 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( } filter_gc.push_back(std::unique_ptr(nullptr)); - // Add the prefix of the next key before finishing the partition. This hack, - // fixes a bug with format_verison=3 where seeking for the prefix would lead - // us to the previous partition. - const bool add_prefix = + // Add the prefix of the next key before finishing the partition without + // updating last_prefix_str_. This hack, fixes a bug with format_verison=3 + // where seeking for the prefix would lead us to the previous partition. + const bool maybe_add_prefix = next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); - if (add_prefix) { - FullFilterBlockBuilder::AddPrefix(*next_key); + if (maybe_add_prefix) { + const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); + if (next_key_prefix.compare(last_prefix_str()) != 0) { + AddKey(next_key_prefix); + } } Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); diff --git a/table/table_test.cc b/table/table_test.cc index d35712670..449922940 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -3988,8 +3988,7 @@ class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform { } bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override { - assert(IsValid(src)); - return true; + return IsValid(src); } bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index fe93640ea..d7e7d7055 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -299,7 +299,6 @@ ts_params = { "use_blob_db": 0, "enable_compaction_filter": 0, "ingest_external_file_one_in": 0, - "partition_filters": 0, } def finalize_and_sanitize(src_params):