diff --git a/HISTORY.md b/HISTORY.md index fe7461988..772f4e64c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ * Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0.. * Fix bloom filter lookups by the MultiGet batching API when BlockBasedTableOptions::whole_key_filtering is false, by checking that a key is in the perfix_extractor domain and extracting the prefix before looking up. * Fix a bug in file ingestion caused by incorrect file number allocation when the number of column families involved in the ingestion exceeds 2. +* Fix a bug when format_version=3, partitioned fitlers, and prefix search are used in conjunction. The bug could result into Seek::(prefix) returning NotFound for an existing prefix. ### New Features * Introduced DBOptions::max_write_batch_group_size_bytes to configure maximum limit on number of bytes that are written in a single batch of WAL or memtable write. It is followed when the leader write size is larger than 1/8 of this limit. * VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readhead size. For checksum verifying before external SST file ingestion, a new option IngestExternalFileOptions.verify_checksums_readahead_size, is added for this readahead setting. diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index b00949c3c..77da88900 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -57,7 +57,7 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) { } // Add prefix to filter if needed -inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { +void FullFilterBlockBuilder::AddPrefix(const Slice& key) { Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index ae1e974f4..65dc278a8 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -59,6 +59,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { virtual void AddKey(const Slice& key); std::unique_ptr filter_bits_builder_; virtual void Reset(); + void AddPrefix(const Slice& key); + const SliceTransform* prefix_extractor() { return prefix_extractor_; } private: // important: all of these might point to invalid addresses @@ -74,7 +76,6 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { uint32_t num_added_; std::unique_ptr filter_data_; - void AddPrefix(const Slice& key); }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index f06150c29..f103b7673 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -40,7 +40,8 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} -void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( + const Slice* next_key) { // Use == to send the request only once if (filters_in_partition_ == filters_per_partition_) { // Currently only index builder is in charge of cutting a partition. We keep @@ -51,6 +52,16 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { return; } filter_gc.push_back(std::unique_ptr(nullptr)); + + // Add the prefix of the next key before finishing the partition. This hack, + // fixes a bug with format_verison=3 where seeking for the prefix would lead + // us to the previous partition. + const bool add_prefix = + next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); + if (add_prefix) { + FullFilterBlockBuilder::AddPrefix(*next_key); + } + Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); @@ -58,8 +69,12 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { Reset(); } +void PartitionedFilterBlockBuilder::Add(const Slice& key) { + MaybeCutAFilterBlock(&key); + FullFilterBlockBuilder::Add(key); +} + void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { - MaybeCutAFilterBlock(); filter_bits_builder_->AddKey(key); filters_in_partition_++; num_added_++; @@ -87,7 +102,7 @@ Slice PartitionedFilterBlockBuilder::Finish( } filters.pop_front(); } else { - MaybeCutAFilterBlock(); + MaybeCutAFilterBlock(nullptr); } // If there is no filter partition left, then return the index on filter // partitions diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index b73ae3baa..9cac1b88a 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -32,6 +32,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { virtual ~PartitionedFilterBlockBuilder(); void AddKey(const Slice& key) override; + void Add(const Slice& key) override; size_t NumAdded() const override { return num_added_; } @@ -53,7 +54,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { bool finishing_filters = false; // true if Finish is called once but not complete yet. // The policy of when cut a filter block and Finish it - void MaybeCutAFilterBlock(); + void MaybeCutAFilterBlock(const Slice* next_key); // Currently we keep the same number of partitions for filters and indexes. // This would allow for some potentioal optimizations in future. If such // optimizations did not realize we can use different number of partitions and diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index aa667afed..fdfc97209 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -346,6 +346,44 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { } } +// This reproduces the bug in format_version=3 that the seeking the prefix will +// lead us to the partition before the one that has filter for the prefix. +TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr prefix_extractor( + rocksdb::NewFixedPrefixTransform(2)); + std::unique_ptr pib(NewIndexBuilder()); + std::unique_ptr builder( + NewBuilder(pib.get(), prefix_extractor.get())); + // In the bug, searching for prefix "p3" on an index with format version 3, + // will give the key "p3" and the partition of the keys that are <= p3, i.e., + // p2-keys, where the filter for prefix "p3" does not exist. + const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", + "p5-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2], pkeys[3]); + builder->Add(pkeys[3]); + CutABlock(pib.get(), pkeys[3], pkeys[4]); + builder->Add(pkeys[4]); + CutABlock(pib.get(), pkeys[4]); + std::unique_ptr reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto prefix = prefix_extractor->Transform(key); + auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix, prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) {