Fix a bug in format_version 3 + partition filters + prefix search (#5835)

Summary:
Partitioned filters make use of a top-level index to find the partition in which the filter resides. The top-level index has a key per partition. The key is guaranteed to be larger or equal than any key in that partition. When used with format_version 3, which excludes the sequence number form index keys, the separator key in the index could be equal to the prefix of the keys in the next partition. In this way, when searching for the key, the top-level index will lead us to the previous partition, which has no key with that prefix. The prefix bloom test thus returns false, although the prefix exists in the bloom of the next partition.
The patch fixes that by a hack: It always adds the prefix of the first key of the next partition to the bloom of the current partition. In this way, in the corner cases that the index will lead us to the previous partition, we still can find the bloom filter there.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5835

Differential Revision: D17513585

Pulled By: maysamyabandeh

fbshipit-source-id: e2d1ff26c759e6e03875c4d57f4228316ecf50e9
main
Maysam Yabandeh 5 years ago committed by Facebook Github Bot
parent c9932d18cc
commit 6652c94f59
  1. 1
      HISTORY.md
  2. 2
      table/block_based/full_filter_block.cc
  3. 3
      table/block_based/full_filter_block.h
  4. 21
      table/block_based/partitioned_filter_block.cc
  5. 3
      table/block_based/partitioned_filter_block.h
  6. 38
      table/block_based/partitioned_filter_block_test.cc

@ -5,6 +5,7 @@
* Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0..
* Fix bloom filter lookups by the MultiGet batching API when BlockBasedTableOptions::whole_key_filtering is false, by checking that a key is in the perfix_extractor domain and extracting the prefix before looking up.
* Fix a bug in file ingestion caused by incorrect file number allocation when the number of column families involved in the ingestion exceeds 2.
* Fix a bug when format_version=3, partitioned fitlers, and prefix search are used in conjunction. The bug could result into Seek::(prefix) returning NotFound for an existing prefix.
### New Features
* Introduced DBOptions::max_write_batch_group_size_bytes to configure maximum limit on number of bytes that are written in a single batch of WAL or memtable write. It is followed when the leader write size is larger than 1/8 of this limit.
* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readhead size. For checksum verifying before external SST file ingestion, a new option IngestExternalFileOptions.verify_checksums_readahead_size, is added for this readahead setting.

@ -57,7 +57,7 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
}
// Add prefix to filter if needed
inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
Slice prefix = prefix_extractor_->Transform(key);
if (whole_key_filtering_) {
// if both whole_key and prefix are added to bloom then we will have whole

@ -59,6 +59,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
virtual void AddKey(const Slice& key);
std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
virtual void Reset();
void AddPrefix(const Slice& key);
const SliceTransform* prefix_extractor() { return prefix_extractor_; }
private:
// important: all of these might point to invalid addresses
@ -74,7 +76,6 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
uint32_t num_added_;
std::unique_ptr<const char[]> filter_data_;
void AddPrefix(const Slice& key);
};
// A FilterBlockReader is used to parse filter from SST table.

@ -40,7 +40,8 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
const Slice* next_key) {
// Use == to send the request only once
if (filters_in_partition_ == filters_per_partition_) {
// Currently only index builder is in charge of cutting a partition. We keep
@ -51,6 +52,16 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
return;
}
filter_gc.push_back(std::unique_ptr<const char[]>(nullptr));
// Add the prefix of the next key before finishing the partition. This hack,
// fixes a bug with format_verison=3 where seeking for the prefix would lead
// us to the previous partition.
const bool add_prefix =
next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
if (add_prefix) {
FullFilterBlockBuilder::AddPrefix(*next_key);
}
Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
std::string& index_key = p_index_builder_->GetPartitionKey();
filters.push_back({index_key, filter});
@ -58,8 +69,12 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
Reset();
}
void PartitionedFilterBlockBuilder::Add(const Slice& key) {
MaybeCutAFilterBlock(&key);
FullFilterBlockBuilder::Add(key);
}
void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
MaybeCutAFilterBlock();
filter_bits_builder_->AddKey(key);
filters_in_partition_++;
num_added_++;
@ -87,7 +102,7 @@ Slice PartitionedFilterBlockBuilder::Finish(
}
filters.pop_front();
} else {
MaybeCutAFilterBlock();
MaybeCutAFilterBlock(nullptr);
}
// If there is no filter partition left, then return the index on filter
// partitions

@ -32,6 +32,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
virtual ~PartitionedFilterBlockBuilder();
void AddKey(const Slice& key) override;
void Add(const Slice& key) override;
size_t NumAdded() const override { return num_added_; }
@ -53,7 +54,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
bool finishing_filters =
false; // true if Finish is called once but not complete yet.
// The policy of when cut a filter block and Finish it
void MaybeCutAFilterBlock();
void MaybeCutAFilterBlock(const Slice* next_key);
// Currently we keep the same number of partitions for filters and indexes.
// This would allow for some potentioal optimizations in future. If such
// optimizations did not realize we can use different number of partitions and

@ -346,6 +346,44 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) {
}
}
// This reproduces the bug in format_version=3 that the seeking the prefix will
// lead us to the partition before the one that has filter for the prefix.
TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) {
// some small number to cause partition cuts
table_options_.metadata_block_size = 1;
std::unique_ptr<const SliceTransform> prefix_extractor(
rocksdb::NewFixedPrefixTransform(2));
std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
std::unique_ptr<PartitionedFilterBlockBuilder> builder(
NewBuilder(pib.get(), prefix_extractor.get()));
// In the bug, searching for prefix "p3" on an index with format version 3,
// will give the key "p3" and the partition of the keys that are <= p3, i.e.,
// p2-keys, where the filter for prefix "p3" does not exist.
const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3",
"p5-key3"};
builder->Add(pkeys[0]);
CutABlock(pib.get(), pkeys[0], pkeys[1]);
builder->Add(pkeys[1]);
CutABlock(pib.get(), pkeys[1], pkeys[2]);
builder->Add(pkeys[2]);
CutABlock(pib.get(), pkeys[2], pkeys[3]);
builder->Add(pkeys[3]);
CutABlock(pib.get(), pkeys[3], pkeys[4]);
builder->Add(pkeys[4]);
CutABlock(pib.get(), pkeys[4]);
std::unique_ptr<PartitionedFilterBlockReader> reader(
NewReader(builder.get(), pib.get()));
for (auto key : pkeys) {
auto prefix = prefix_extractor->Transform(key);
auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue);
const Slice ikey_slice = Slice(*ikey.rep());
ASSERT_TRUE(reader->PrefixMayMatch(
prefix, prefix_extractor.get(), kNotValid,
/*no_io=*/false, &ikey_slice, /*get_context=*/nullptr,
/*lookup_context=*/nullptr));
}
}
TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) {
uint64_t max_index_size = MaxIndexSize();
for (uint64_t i = 1; i < max_index_size + 1; i++) {

Loading…
Cancel
Save