From 91687d70eaadefec7adc4070376d3cdc702a672d Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 23 Mar 2022 10:00:54 -0700 Subject: [PATCH] Fix a major performance bug in 7.0 re: filter compatibility (#9736) Summary: Bloom filters generated by pre-7.0 releases are not read by 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in https://github.com/facebook/rocksdb/issues/9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness. To fix, we go back using the old, unified name in SST metadata but (for a while anyway) recognize the aliases that could be generated by early 7.0.x releases. This unfortunately requires a public API change to avoid interfering with all the good changes from https://github.com/facebook/rocksdb/issues/9590, but the API change only affects users with custom FilterPolicy, which should be very few. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9736 Test Plan: manual Generate DBs with ``` ./db_bench.7.0 -db=/dev/shm/rocksdb.7.0 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 ``` and similar. Compare with ``` for IMPL in 6.29 7.0 fixed; do for DB in 6.29 7.0 fixed; do echo "Testing $IMPL on $DB:"; ./db_bench.$IMPL -db=/dev/shm/rocksdb.$DB -use_existing_db -readonly -bloom_bits=10 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=10 2>&1 | grep micros/op; done; done ``` Results: ``` Testing 6.29 on 6.29: readrandom : 34.381 micros/op 29085 ops/sec; 3.2 MB/s (291999 of 291999 found) Testing 6.29 on 7.0: readrandom : 190.443 micros/op 5249 ops/sec; 0.6 MB/s (52999 of 52999 found) Testing 6.29 on fixed: readrandom : 40.148 micros/op 24907 ops/sec; 2.8 MB/s (249999 of 249999 found) Testing 7.0 on 6.29: readrandom : 229.430 micros/op 4357 ops/sec; 0.5 MB/s (43999 of 43999 found) Testing 7.0 on 7.0: readrandom : 33.348 micros/op 29986 ops/sec; 3.3 MB/s (299999 of 299999 found) Testing 7.0 on fixed: readrandom : 152.734 micros/op 6546 ops/sec; 0.7 MB/s (65999 of 65999 found) Testing fixed on 6.29: readrandom : 32.024 micros/op 31224 ops/sec; 3.5 MB/s (312999 of 312999 found) Testing fixed on 7.0: readrandom : 33.990 micros/op 29390 ops/sec; 3.3 MB/s (294999 of 294999 found) Testing fixed on fixed: readrandom : 28.714 micros/op 34825 ops/sec; 3.9 MB/s (348999 of 348999 found) ``` Just paying attention to order of magnitude of ops/sec (short test durations, lots of noise), it's clear that with the fix we can read <= 6.29 & >= 7.0 at full speed, where neither 6.29 nor 7.0 can on both. And 6.29 release can properly read fixed DB at full speed. Reviewed By: siying, ajkr Differential Revision: D35057844 Pulled By: pdillinger fbshipit-source-id: a46893a6af4bf084375ebe4728066d00eb08f050 --- HISTORY.md | 7 ++ db/c.cc | 6 ++ db/db_bloom_filter_test.cc | 8 +- include/rocksdb/filter_policy.h | 13 +++ options/customizable_test.cc | 1 + .../block_based/block_based_table_builder.cc | 2 +- table/block_based/block_based_table_reader.cc | 80 +++++++++++++------ table/block_based/filter_policy.cc | 12 ++- table/block_based/filter_policy_internal.h | 3 + table/block_based/full_filter_block_test.cc | 1 + 10 files changed, 104 insertions(+), 29 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 677b984e5..79eb4ca52 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,11 @@ # Rocksdb Change Log +## Unreleased +### Bug Fixes +* Fixed a major performance bug in which Bloom filters generated by pre-7.0 releases are not read by early 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in #9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness. + +### Public API changes +* Added pure virtual FilterPolicy::CompatibilityName(), which is needed for fixing major performance bug involving FilterPolicy naming in SST metadata without affecting Customizable aspect of FilterPolicy. This change only affects those with their own custom or wrapper FilterPolicy classes. + ## 7.1.0 (03/21/2022) ### Public API changes * Add DB::OpenAndTrimHistory API. This API will open DB and trim data to the timestamp specified by trim_ts (The data with timestamp larger than specified trim bound will be removed). This API should only be used at a timestamp-enabled column families recovery. If the column family doesn't have timestamp enabled, this API won't trim any data on that column family. This API is not compatible with avoid_flush_during_recovery option. diff --git a/db/c.cc b/db/c.cc index 0a181eecc..4ddf6ad80 100644 --- a/db/c.cc +++ b/db/c.cc @@ -3752,6 +3752,9 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format( const FilterPolicy* rep_; ~Wrapper() override { delete rep_; } const char* Name() const override { return rep_->Name(); } + const char* CompatibilityName() const override { + return rep_->CompatibilityName(); + } // No need to override GetFilterBitsBuilder if this one is overridden ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( const ROCKSDB_NAMESPACE::FilterBuildingContext& context) @@ -3789,6 +3792,9 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format( const FilterPolicy* rep_; ~Wrapper() override { delete rep_; } const char* Name() const override { return rep_->Name(); } + const char* CompatibilityName() const override { + return rep_->CompatibilityName(); + } ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( const ROCKSDB_NAMESPACE::FilterBuildingContext& context) const override { diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index eea41bc0c..c613505c5 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1638,9 +1638,15 @@ class LevelAndStyleCustomFilterPolicy : public FilterPolicy { policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)), policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {} + const char* Name() const override { + return "LevelAndStyleCustomFilterPolicy"; + } + // OK to use built-in policy name because we are deferring to a // built-in builder. We aren't changing the serialized format. - const char* Name() const override { return policy_fifo_->Name(); } + const char* CompatibilityName() const override { + return policy_fifo_->CompatibilityName(); + } FilterBitsBuilder* GetBuilderWithContext( const FilterBuildingContext& context) const override { diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 873520ee0..954d15b4a 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -90,6 +90,19 @@ class FilterPolicy : public Customizable { virtual ~FilterPolicy(); static const char* Type() { return "FilterPolicy"; } + // The name used for identifying whether a filter on disk is readable + // by this FilterPolicy. If this FilterPolicy is part of a family that + // can read each others filters, such as built-in BloomFilterPolcy and + // RibbonFilterPolicy, the CompatibilityName is a shared family name, + // while kinds of filters in the family can have distinct Customizable + // Names. This function is pure virtual so that wrappers around built-in + // policies are prompted to defer to CompatibilityName() of the wrapped + // policy, which is important for compatibility. + // + // For custom filter policies that are not part of a read-compatible + // family (rare), implementations may return Name(). + virtual const char* CompatibilityName() const = 0; + // Creates a new FilterPolicy based on the input value string and returns the // result The value might be an ID, and ID with properties, or an old-style // policy string. diff --git a/options/customizable_test.cc b/options/customizable_test.cc index 690b394ea..84f7f4656 100644 --- a/options/customizable_test.cc +++ b/options/customizable_test.cc @@ -1487,6 +1487,7 @@ class MockFilterPolicy : public FilterPolicy { public: static const char* kClassName() { return "MockFilterPolicy"; } const char* Name() const override { return kClassName(); } + const char* CompatibilityName() const override { return Name(); } FilterBitsBuilder* GetBuilderWithContext( const FilterBuildingContext&) const override { return nullptr; diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 6c622555d..41a49556b 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -1605,7 +1605,7 @@ void BlockBasedTableBuilder::WriteFilterBlock( ? BlockBasedTable::kPartitionedFilterBlockPrefix : BlockBasedTable::kFullFilterBlockPrefix; } - key.append(rep_->table_options.filter_policy->Name()); + key.append(rep_->table_options.filter_policy->CompatibilityName()); meta_index_builder->Add(key, filter_block_handle); } } diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 76010bc05..6338cba28 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,7 @@ #include "table/block_based/block_prefix_index.h" #include "table/block_based/block_type.h" #include "table/block_based/filter_block.h" +#include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/hash_index_reader.h" #include "table/block_based/partitioned_filter_block.h" @@ -897,33 +899,59 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( const BlockBasedTableOptions& table_options, const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin, BlockCacheLookupContext* lookup_context) { - Status s; - // Find filter handle and filter type if (rep_->filter_policy) { - for (auto filter_type : - {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, - Rep::FilterType::kBlockFilter}) { - std::string prefix; - switch (filter_type) { - case Rep::FilterType::kFullFilter: - prefix = kFullFilterBlockPrefix; - break; - case Rep::FilterType::kPartitionedFilter: - prefix = kPartitionedFilterBlockPrefix; - break; - case Rep::FilterType::kBlockFilter: - prefix = kFilterBlockPrefix; + auto name = rep_->filter_policy->CompatibilityName(); + bool builtin_compatible = + strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0; + + for (const auto& [filter_type, prefix] : + {std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix), + std::make_pair(Rep::FilterType::kPartitionedFilter, + kPartitionedFilterBlockPrefix), + std::make_pair(Rep::FilterType::kBlockFilter, kFilterBlockPrefix)}) { + if (builtin_compatible) { + // This code is only here to deal with a hiccup in early 7.0.x where + // there was an unintentional name change in the SST files metadata. + // It should be OK to remove this in the future (late 2022) and just + // have the 'else' code. + // NOTE: the test:: names below are likely not needed but included + // out of caution + static const std::unordered_set kBuiltinNameAndAliases = { + BuiltinFilterPolicy::kCompatibilityName(), + test::LegacyBloomFilterPolicy::kClassName(), + test::FastLocalBloomFilterPolicy::kClassName(), + test::Standard128RibbonFilterPolicy::kClassName(), + DeprecatedBlockBasedBloomFilterPolicy::kClassName(), + BloomFilterPolicy::kClassName(), + RibbonFilterPolicy::kClassName(), + }; + + // For efficiency, do a prefix seek and see if the first match is + // good. + meta_iter->Seek(prefix); + if (meta_iter->status().ok() && meta_iter->Valid()) { + Slice key = meta_iter->key(); + if (key.starts_with(prefix)) { + key.remove_prefix(prefix.size()); + if (kBuiltinNameAndAliases.find(key.ToString()) != + kBuiltinNameAndAliases.end()) { + Slice v = meta_iter->value(); + Status s = rep_->filter_handle.DecodeFrom(&v); + if (s.ok()) { + rep_->filter_type = filter_type; + break; + } + } + } + } + } else { + std::string filter_block_key = prefix + name; + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; break; - default: - assert(0); - } - std::string filter_block_key = prefix; - filter_block_key.append(rep_->filter_policy->Name()); - if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) - .ok()) { - rep_->filter_type = filter_type; - break; + } } } } @@ -932,8 +960,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // Find compression dictionary handle - s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName, - &rep_->compression_dict_handle); + Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName, + &rep_->compression_dict_handle); if (!s.ok()) { return s; } diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 6594bfb4d..cfbc658aa 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -1325,6 +1325,16 @@ bool BuiltinFilterPolicy::IsInstanceOf(const std::string& name) const { } } +static const char* kBuiltinFilterMetadataName = "rocksdb.BuiltinBloomFilter"; + +const char* BuiltinFilterPolicy::kCompatibilityName() { + return kBuiltinFilterMetadataName; +} + +const char* BuiltinFilterPolicy::CompatibilityName() const { + return kBuiltinFilterMetadataName; +} + BloomLikeFilterPolicy::BloomLikeFilterPolicy(double bits_per_key) : warned_(false), aggregate_rounding_balance_(0) { // Sanitize bits_per_key @@ -1372,7 +1382,7 @@ bool BloomLikeFilterPolicy::IsInstanceOf(const std::string& name) const { } const char* ReadOnlyBuiltinFilterPolicy::kClassName() { - return "rocksdb.BuiltinBloomFilter"; + return kBuiltinFilterMetadataName; } const char* DeprecatedBlockBasedBloomFilterPolicy::kClassName() { diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 4ceeed0d0..06566f871 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -135,6 +135,9 @@ class BuiltinFilterPolicy : public FilterPolicy { FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; static const char* kClassName(); bool IsInstanceOf(const std::string& id) const override; + // All variants of BuiltinFilterPolicy can read each others filters. + const char* CompatibilityName() const override; + static const char* kCompatibilityName(); public: // new // An internal function for the implementation of diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 76f612728..24d870d4c 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -84,6 +84,7 @@ class TestFilterBitsReader : public FilterBitsReader { class TestHashFilter : public FilterPolicy { public: const char* Name() const override { return "TestHashFilter"; } + const char* CompatibilityName() const override { return Name(); } FilterBitsBuilder* GetBuilderWithContext( const FilterBuildingContext&) const override {