// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include #include #include #include #include "cache/cache_entry_roles.h" #include "cache/cache_reservation_manager.h" #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/advanced_options.h" #include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" #include "rocksdb/perf_context.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/filter_policy_internal.h" #include "table/format.h" #include "test_util/testutil.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace { std::shared_ptr Create(double bits_per_key, const std::string& name) { return BloomLikeFilterPolicy::Create(name, bits_per_key); } const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName(); const std::string kFastLocalBloom = test::FastLocalBloomFilterPolicy::kClassName(); const std::string kStandard128Ribbon = test::Standard128RibbonFilterPolicy::kClassName(); const std::string kAutoBloom = BloomFilterPolicy::kClassName(); const std::string kAutoRibbon = RibbonFilterPolicy::kClassName(); } // namespace // DB tests related to bloom filter. 
class DBBloomFilterTest : public DBTestBase { public: DBBloomFilterTest() : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {} }; class DBBloomFilterTestWithParam : public DBTestBase, public testing::WithParamInterface< std::tuple> { // public testing::WithParamInterface { protected: std::string bfp_impl_; bool partition_filters_; uint32_t format_version_; public: DBBloomFilterTestWithParam() : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} ~DBBloomFilterTestWithParam() override {} void SetUp() override { bfp_impl_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); format_version_ = std::get<2>(GetParam()); } }; class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {}; class SliceTransformLimitedDomainGeneric : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomainGeneric"; } Slice Transform(const Slice& src) const override { return Slice(src.data(), 5); } bool InDomain(const Slice& src) const override { // prefix will be x???? return src.size() >= 5; } bool InRange(const Slice& dst) const override { // prefix will be x???? return dst.size() == 5; } }; // KeyMayExist can lead to a few false positives, but not false negatives. 
// To make test deterministic, use a much larger number of bits per key-20 than // bits in the key, so that false positives are eliminated TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { do { ReadOptions ropts; std::string value; anon::OptionsOverride options_override; options_override.filter_policy = Create(20, bfp_impl_); options_override.partition_filters = partition_filters_; options_override.metadata_block_size = 32; options_override.full_block_cache = true; Options options = CurrentOptions(options_override); if (partition_filters_) { auto* table_options = options.table_factory->GetOptions(); if (table_options != nullptr && table_options->index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) { // In the current implementation partitioned filters depend on // partitioned indexes continue; } } options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_OK(Put(1, "a", "b")); bool value_found = false; ASSERT_TRUE( db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); ASSERT_TRUE(value_found); ASSERT_EQ("b", value); ASSERT_OK(Flush(1)); value.clear(); uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE( db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); ASSERT_TRUE(!value_found); // assert that no new files were opened and no new blocks were // read into block cache. 
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Delete(1, "a")); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], true /* disallow trivial move */)); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Delete(1, "c")); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); // KeyMayExist function only checks data in block caches, which is not used // by plain table format. 
} while ( ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); } TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) { for (bool partition_filters : {true, false}) { Options options = last_options_; options.prefix_extractor = std::make_shared(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); get_perf_context()->EnablePerLevelPerfContext(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10)); if (partition_filters) { bbto.partition_filters = true; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); WriteOptions wo; ReadOptions ro; FlushOptions fo; fo.wait = true; std::string value; ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ( 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); ASSERT_EQ("foo2", Get("barbarbar2")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ( 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ( 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ( 1, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); ASSERT_EQ( 2, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); ro.total_order_seek = true; // NOTE: 
total_order_seek no longer affects Get() ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); ASSERT_EQ( 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); // No bloom on extractor changed #ifndef ROCKSDB_LITE ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); ASSERT_EQ( 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); #endif // ROCKSDB_LITE // No bloom on extractor changed, after re-open options.prefix_extractor.reset(NewCappedPrefixTransform(10)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); ASSERT_EQ( 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); get_perf_context()->Reset(); } } TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { for (bool partition_filters : {true, false}) { Options options = last_options_; options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); get_perf_context()->EnablePerLevelPerfContext(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10)); if (partition_filters) { bbto.partition_filters = true; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); WriteOptions wo; ReadOptions ro; FlushOptions fo; fo.wait = true; std::string value; ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("foo2", Get("barbarbar2")); 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); ro.total_order_seek = true; // NOTE: total_order_seek no longer affects Get() ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); ASSERT_EQ( 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); // No bloom on extractor changed #ifndef ROCKSDB_LITE ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); ASSERT_EQ( 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); #endif // ROCKSDB_LITE get_perf_context()->Reset(); } } TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { for (bool partition_filters : {true, false}) { Options options = last_options_; options.prefix_extractor.reset(NewFixedPrefixTransform(3)); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); get_perf_context()->EnablePerLevelPerfContext(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10)); bbto.whole_key_filtering = false; if (partition_filters) { bbto.partition_filters = true; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); WriteOptions wo; ReadOptions ro; FlushOptions fo; fo.wait = true; std::string value; ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); // Needs insert some keys to make sure files are not filtered out by key // ranges. 
ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); ASSERT_OK(dbfull()->Flush(fo)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); // Reopen with whole key filtering enabled and prefix extractor // NULL. Bloom filter should be off for both of whole key and // prefix bloom. bbto.whole_key_filtering = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.prefix_extractor.reset(); Reopen(options); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); // Write DB with only full key filtering. ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); // Needs insert some keys to make sure files are not filtered out by key // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Reopen with both of whole key off and prefix extractor enabled. // Still no bloom filter should be used. 
options.prefix_extractor.reset(NewFixedPrefixTransform(3)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); // Try to create a DB with mixed files: ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); // Needs insert some keys to make sure files are not filtered out by key // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.prefix_extractor.reset(); bbto.whole_key_filtering = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); // Try to create a DB with mixed files. ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); // In this case needs insert some keys to make sure files are // not filtered out by key ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); ASSERT_OK(Flush()); // Now we have two files: // File 1: An older file with prefix bloom. // File 2: A newer file with whole bloom filter. 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); ASSERT_EQ("bar", Get("barfoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); // Reopen with the same setting: only whole key is used Reopen(options); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); ASSERT_EQ("bar", Get("barfoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); // Restart with both filters are allowed options.prefix_extractor.reset(NewFixedPrefixTransform(3)); bbto.whole_key_filtering = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); // File 1 will has it filtered out. // File 2 will not, as prefix `foo` exists in the file. ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); ASSERT_EQ("bar", Get("barfoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); // Restart with only prefix bloom is allowed. 
options.prefix_extractor.reset(NewFixedPrefixTransform(3)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); ASSERT_EQ("NOT_FOUND", Get("bar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); ASSERT_EQ("foo", Get("foobar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); ASSERT_EQ("bar", Get("barfoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); uint64_t bloom_filter_useful_all_levels = 0; for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { if (kv.second.bloom_filter_useful > 0) { bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; } } ASSERT_EQ(12, bloom_filter_useful_all_levels); get_perf_context()->Reset(); } } TEST_P(DBBloomFilterTestWithParam, BloomFilter) { do { Options options = CurrentOptions(); env_->count_random_reads_ = true; options.env = env_; // ChangeCompactOptions() only changes compaction style, which does not // trigger reset of table_factory BlockBasedTableOptions table_options; table_options.no_block_cache = true; table_options.filter_policy = Create(10, bfp_impl_); table_options.partition_filters = partition_filters_; if (partition_filters_) { table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } table_options.format_version = format_version_; if (format_version_ >= 4) { // value delta encoding challenged more with index interval > 1 table_options.index_block_restart_interval = 8; } table_options.metadata_block_size = 32; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); // Populate multiple layers const int N = 10000; for (int i = 0; i < N; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } Compact(1, "a", "z"); for (int i = 0; i < N; i += 100) 
{ ASSERT_OK(Put(1, Key(i), Key(i))); } ASSERT_OK(Flush(1)); // Prevent auto compactions triggered by seeks env_->delay_sstable_sync_.store(true, std::memory_order_release); // Lookup present keys. Should rarely read from small sstable. env_->random_read_counter_.Reset(); for (int i = 0; i < N; i++) { ASSERT_EQ(Key(i), Get(1, Key(i))); } int reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d present => %d reads\n", N, reads); ASSERT_GE(reads, N); if (partition_filters_) { // Without block cache, we read an extra partition filter per each // level*read and a partition index per each read ASSERT_LE(reads, 4 * N + 2 * N / 100); } else { ASSERT_LE(reads, N + 2 * N / 100); } // Lookup present keys. Should rarely read from either sstable. env_->random_read_counter_.Reset(); for (int i = 0; i < N; i++) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); } reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d missing => %d reads\n", N, reads); if (partition_filters_) { // With partitioned filter we read one extra filter per level per each // missed read. ASSERT_LE(reads, 2 * N + 3 * N / 100); } else { ASSERT_LE(reads, 3 * N / 100); } #ifndef ROCKSDB_LITE // Sanity check some table properties std::map props; ASSERT_TRUE(db_->GetMapProperty( handles_[1], DB::Properties::kAggregatedTableProperties, &props)); uint64_t nkeys = N + N / 100; uint64_t filter_size = ParseUint64(props["filter_size"]); EXPECT_LE(filter_size, (partition_filters_ ? 
12 : 11) * nkeys / /*bits / byte*/ 8); if (bfp_impl_ == kAutoRibbon) { // Sometimes using Ribbon filter which is more space-efficient EXPECT_GE(filter_size, 7 * nkeys / /*bits / byte*/ 8); } else { // Always Bloom EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8); } uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); EXPECT_EQ(num_filter_entries, nkeys); #endif // ROCKSDB_LITE env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); } namespace { class AlwaysTrueBitsBuilder : public FilterBitsBuilder { public: void AddKey(const Slice&) override {} size_t EstimateEntriesAdded() override { return 0U; } Slice Finish(std::unique_ptr* /* buf */) override { // Interpreted as "always true" filter (0 probes over 1 byte of // payload, 5 bytes metadata) return Slice("\0\0\0\0\0\0", 6); } using FilterBitsBuilder::Finish; size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; } }; class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy { public: explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {} FilterBitsBuilder* GetBuilderWithContext( const FilterBuildingContext&) const override { if (skip_) { return nullptr; } else { return new AlwaysTrueBitsBuilder(); } } private: bool skip_; }; } // namespace TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { constexpr int maxKey = 10; auto PutFn = [&]() { int i; // Put for (i = 0; i < maxKey; i++) { ASSERT_OK(Put(Key(i), Key(i))); } Flush(); }; auto GetFn = [&]() { int i; // Get OK for (i = 0; i < maxKey; i++) { ASSERT_EQ(Key(i), Get(Key(i))); } // Get NotFound for (; i < maxKey * 2; i++) { ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); } }; auto PutAndGetFn = [&]() { PutFn(); GetFn(); }; #ifndef ROCKSDB_LITE std::map props; const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties; #endif // ROCKSDB_LITE Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); 
BlockBasedTableOptions table_options; table_options.partition_filters = partition_filters_; if (partition_filters_) { table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } table_options.format_version = format_version_; // Test 1: bits per key < 0.5 means skip filters -> no filter // constructed or read. table_options.filter_policy = Create(0.4, bfp_impl_); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); PutAndGetFn(); // Verify no filter access nor contruction EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); #ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_EQ(props["filter_size"], "0"); #endif // ROCKSDB_LITE // Test 2: use custom API to skip filters -> no filter constructed // or read. table_options.filter_policy.reset( new AlwaysTrueFilterPolicy(/* skip */ true)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); PutAndGetFn(); // Verify no filter access nor construction EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); #ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_EQ(props["filter_size"], "0"); #endif // ROCKSDB_LITE // Control test: using an actual filter with 100% FP rate -> the filter // is constructed and checked on read. 
table_options.filter_policy.reset( new AlwaysTrueFilterPolicy(/* skip */ false)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); PutAndGetFn(); // Verify filter is accessed (and constructed) EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), maxKey * 2); EXPECT_EQ( TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), maxKey); #ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_NE(props["filter_size"], "0"); #endif // ROCKSDB_LITE // Test 3 (options test): Able to read existing filters with longstanding // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter` ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(), "rocksdb.BuiltinBloomFilter", &table_options.filter_policy)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); GetFn(); // Verify filter is accessed EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), maxKey * 2); EXPECT_EQ( TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), maxKey); // But new filters are not generated (configuration details unknown) DestroyAndReopen(options); PutAndGetFn(); // Verify no filter access nor construction EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); #ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_EQ(props["filter_size"], "0"); #endif // ROCKSDB_LITE } #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, ::testing::Values( std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion), std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion), std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( 
FormatDef, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion), std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion), std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(kAutoBloom, true, kLatestFormatVersion), std::make_tuple(kAutoBloom, false, kLatestFormatVersion), std::make_tuple(kAutoRibbon, false, kLatestFormatVersion))); #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBBloomFilterTest, BloomFilterRate) { while (ChangeFilterOptions()) { Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); get_perf_context()->EnablePerLevelPerfContext(); CreateAndReopenWithCF({"pikachu"}, options); const int maxKey = 10000; for (int i = 0; i < maxKey; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } // Add a large key to make the file contain wide range ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); Flush(1); // Check if they can be found for (int i = 0; i < maxKey; i++) { ASSERT_EQ(Key(i), Get(1, Key(i))); } ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); // Check if filter is useful for (int i = 0; i < maxKey; i++) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); } ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); ASSERT_GE( (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful, maxKey * 0.98); get_perf_context()->Reset(); } } namespace { struct CompatibilityConfig { std::shared_ptr policy; bool partitioned; uint32_t format_version; void SetInTableOptions(BlockBasedTableOptions* table_options) { table_options->filter_policy = policy; table_options->partition_filters = partitioned; if (partitioned) { table_options->index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } else { table_options->index_type = 
BlockBasedTableOptions::IndexType::kBinarySearch; } table_options->format_version = format_version; } }; // High bits per key -> almost no FPs std::shared_ptr kCompatibilityBloomPolicy{ NewBloomFilterPolicy(20)}; // bloom_before_level=-1 -> always use Ribbon std::shared_ptr kCompatibilityRibbonPolicy{ NewRibbonFilterPolicy(20, -1)}; std::vector kCompatibilityConfigs = { {kCompatibilityBloomPolicy, false, BlockBasedTableOptions().format_version}, {kCompatibilityBloomPolicy, true, BlockBasedTableOptions().format_version}, {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U}, {kCompatibilityRibbonPolicy, false, BlockBasedTableOptions().format_version}, {kCompatibilityRibbonPolicy, true, BlockBasedTableOptions().format_version}, }; } // namespace TEST_F(DBBloomFilterTest, BloomFilterCompatibility) { Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.level0_file_num_compaction_trigger = static_cast(kCompatibilityConfigs.size()) + 1; options.max_open_files = -1; Close(); // Create one file for each kind of filter. Each file covers a distinct key // range. 
for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) { BlockBasedTableOptions table_options; kCompatibilityConfigs[i].SetInTableOptions(&table_options); ASSERT_TRUE(table_options.filter_policy != nullptr); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string prefix = std::to_string(i) + "_"; ASSERT_OK(Put(prefix + "A", "val")); ASSERT_OK(Put(prefix + "Z", "val")); ASSERT_OK(Flush()); } // Test filter is used between each pair of {reader,writer} configurations, // because any built-in FilterPolicy should be able to read filters from any // other built-in FilterPolicy for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) { BlockBasedTableOptions table_options; kCompatibilityConfigs[i].SetInTableOptions(&table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) { std::string prefix = std::to_string(j) + "_"; ASSERT_EQ("val", Get(prefix + "A")); // Filter positive ASSERT_EQ("val", Get(prefix + "Z")); // Filter positive // Filter negative, with high probability ASSERT_EQ("NOT_FOUND", Get(prefix + "Q")); EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 2); EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), 1); } } } // To align with the type of hash entry being reserved in implementation. 
using FilterConstructionReserveMemoryHash = uint64_t; class ChargeFilterConstructionTestWithParam : public DBTestBase, public testing::WithParamInterface> { public: ChargeFilterConstructionTestWithParam() : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true), num_key_(0), charge_filter_construction_(std::get<0>(GetParam())), policy_(std::get<1>(GetParam())), partition_filters_(std::get<2>(GetParam())), detect_filter_construct_corruption_(std::get<3>(GetParam())) { if (charge_filter_construction_ == CacheEntryRoleOptions::Decision::kDisabled || policy_ == kLegacyBloom) { // For these cases, we only interested in whether filter construction // cache charging happens instead of its accuracy. Therefore we don't // need many keys. num_key_ = 5; } else if (partition_filters_) { // For PartitionFilter case, since we set // table_options.metadata_block_size big enough such that each partition // trigger at least 1 dummy entry reservation each for hash entries and // final filter, we need a large number of keys to ensure we have at least // two partitions. num_key_ = 18 * CacheReservationManagerImpl< CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / sizeof(FilterConstructionReserveMemoryHash); } else if (policy_ == kFastLocalBloom) { // For Bloom Filter + FullFilter case, since we design the num_key_ to // make hash entry cache charging be a multiple of dummy entries, the // correct behavior of charging final filter on top of it will trigger at // least another dummy entry insertion. Therefore we can assert that // behavior and we don't need a large number of keys to verify we // indeed charge the final filter for in cache, even though final // filter is a lot smaller than hash entries. 
num_key_ = 1 * CacheReservationManagerImpl< CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / sizeof(FilterConstructionReserveMemoryHash); } else { // For Ribbon Filter + FullFilter case, we need a large enough number of // keys so that charging final filter after releasing the hash entries // reservation will trigger at least another dummy entry (or equivalently // to saying, causing another peak in cache charging) as banding // reservation might not be a multiple of dummy entry. num_key_ = 12 * CacheReservationManagerImpl< CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / sizeof(FilterConstructionReserveMemoryHash); } } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions table_options; // We set cache capacity big enough to prevent cache full for convenience in // calculation. constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024; table_options.cache_usage_options.options_overrides.insert( {CacheEntryRole::kFilterConstruction, {/*.charged = */ charge_filter_construction_}}); table_options.filter_policy = Create(10, policy_); table_options.partition_filters = partition_filters_; if (table_options.partition_filters) { table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; // We set table_options.metadata_block_size big enough so that each // partition trigger at least 1 dummy entry insertion each for hash // entries and final filter. 
table_options.metadata_block_size = 409000; } table_options.detect_filter_construct_corruption = detect_filter_construct_corruption_; LRUCacheOptions lo; lo.capacity = kCacheCapacity; lo.num_shard_bits = 0; // 2^0 shard lo.strict_capacity_limit = true; cache_ = std::make_shared< TargetCacheChargeTrackingCache>( (NewLRUCache(lo))); table_options.block_cache = cache_; return table_options; } std::size_t GetNumKey() { return num_key_; } CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() { return charge_filter_construction_; } std::string GetFilterPolicy() { return policy_; } bool PartitionFilters() { return partition_filters_; } std::shared_ptr< TargetCacheChargeTrackingCache> GetCache() { return cache_; } private: std::size_t num_key_; CacheEntryRoleOptions::Decision charge_filter_construction_; std::string policy_; bool partition_filters_; std::shared_ptr< TargetCacheChargeTrackingCache> cache_; bool detect_filter_construct_corruption_; }; INSTANTIATE_TEST_CASE_P( ChargeFilterConstructionTestWithParam, ChargeFilterConstructionTestWithParam, ::testing::Values( std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled, kFastLocalBloom, false, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kFastLocalBloom, false, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kFastLocalBloom, false, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kFastLocalBloom, true, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kFastLocalBloom, true, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kStandard128Ribbon, false, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kStandard128Ribbon, false, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kStandard128Ribbon, true, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kStandard128Ribbon, true, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kLegacyBloom, false, false))); 
// TODO: Speed up this test, and reduce disk space usage (~700MB)
// The current test inserts many keys (on the scale of dummy entry size)
// in order to make small memory user (e.g., final filter, partitioned hash
// entries/filter/banding), which is proportional to the number of
// keys, big enough so that its cache charging triggers dummy entry insertion
// and becomes observable in the test.
//
// However, inserting that many keys slows down this test and leaves future
// developers an opportunity to speed it up.
//
// Possible approaches & challenges:
// 1. Use sync point during cache charging of filter construction
//
// Benefit: It does not rely on triggering dummy entry insertion
// but the sync point to verify small memory user is charged correctly.
//
// Challenge: this approach is intrusive.
//
// 2. Make dummy entry size configurable and set it small in the test
//
// Benefit: It increases the precision of cache charging and therefore
// small memory usage can still trigger insertion of dummy entry.
//
// Challenge: change CacheReservationManager related APIs and a hack
// might be needed to control the size of dummy entry of
// CacheReservationManager used in filter construction for testing
// since CacheReservationManager is not exposed at the high level.
// Verifies that filter-construction cache charging produces the expected
// number of cache-reservation peaks and the expected total of reservation
// increments, for each (charge decision, policy, partitioning, corruption
// detection) parameter combination.
//
// NOTE(review): several template argument lists in this copy appear to have
// been stripped by whatever extracted the file (e.g.
// "std::shared_ptr< TargetCacheChargeTrackingCache>", "static_cast(...)",
// "std::deque filter_construction_cache_res_peaks", "static_cast(" before
// std::ceil). Tokens are reproduced verbatim; restore the argument lists from
// upstream before compiling.
TEST_P(ChargeFilterConstructionTestWithParam, Basic) {
  Options options = CurrentOptions();
  // We set write_buffer_size big enough so that in the case where there is
  // filter construction cache charging, flush won't be triggered before we
  // manually trigger it for clean testing
  options.write_buffer_size = 640 << 20;
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  std::shared_ptr< TargetCacheChargeTrackingCache> cache = GetCache();
  options.create_if_missing = true;
  // Disable auto compaction to prevent its unexpected side effect
  // to the number of keys per partition designed by us in the test
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);
  int num_key = static_cast(GetNumKey());
  for (int i = 0; i < num_key; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  // A nonzero increment sum before the manual Flush() would mean an
  // unexpected early flush already ran filter construction.
  ASSERT_EQ(cache->GetChargedCacheIncrementSum(), 0)
      << "Flush was triggered too early in the test case with filter "
         "construction cache charging - please make sure no flush triggered "
         "during the key insertions above";
  ASSERT_OK(Flush());

  bool charge_filter_construction = (ChargeFilterConstructMemory() ==
                                     CacheEntryRoleOptions::Decision::kEnabled);
  std::string policy = GetFilterPolicy();
  bool partition_filters = PartitionFilters();
  bool detect_filter_construct_corruption =
      table_options.detect_filter_construct_corruption;

  std::deque filter_construction_cache_res_peaks =
      cache->GetChargedCachePeaks();
  std::size_t filter_construction_cache_res_increments_sum =
      cache->GetChargedCacheIncrementSum();

  // Charging disabled: nothing should ever have been reserved.
  if (!charge_filter_construction) {
    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
    return;
  }
  // kLegacyBloom does not participate in construction cache charging.
  if (policy == kLegacyBloom) {
    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
        << "There shouldn't be filter construction cache charging as this "
           "feature does not support kLegacyBloom";
    return;
  }

  // Predictions below are derived from the test's key count being chosen so
  // that hash-entry reservation is an exact multiple of the dummy entry size.
  const std::size_t kDummyEntrySize = CacheReservationManagerImpl<
      CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();

  const std::size_t predicted_hash_entries_cache_res =
      num_key * sizeof(FilterConstructionReserveMemoryHash);
  ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
      << "It's by this test's design that predicted_hash_entries_cache_res is "
         "a multipe of dummy entry";
  const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
      predicted_hash_entries_cache_res / kDummyEntrySize;
  // Final filter is predicted at 1/6 of the hash-entry reservation (scaled by
  // 0.7 for Ribbon); banding is predicted at 2.5x.
  const std::size_t predicted_final_filter_cache_res =
      static_cast(
          std::ceil(1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 *
                    (policy == kStandard128Ribbon ? 0.7 : 1))) *
      kDummyEntrySize;
  const std::size_t predicted_banding_cache_res =
      static_cast(
          std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) *
      kDummyEntrySize;

  if (policy == kFastLocalBloom) {
    /* kFastLocalBloom + FullFilter
     *        p0
     *       /  \
     *    b /    \
     *     /      \
     *    /        \
     *  0/          \
     *  hash entries = b - 0, final filter = p0 - b
     *  p0 = hash entries + final filter
     *
     *  The test is designed in a way such that the reservation for b is a
     *  multiple of dummy entries so that reservation for (p0 - b)
     *  will trigger at least another dummy entry insertion.
     *
     * kFastLocalBloom + FullFilter +
     * detect_filter_construct_corruption
     * The peak p0 stays the same as
     * (kFastLocalBloom + FullFilter) but just lasts
     * longer since we release hash entries reservation later.
     *
     * kFastLocalBloom + PartitionedFilter
     *                   p1
     *                  /  \
     *        p0    b'/     \
     *       /  \    /       \
     *    b /    \  /         \
     *     /      \/           \
     *    /        a            \
     *  0/                       \
     *  partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
     *  partitioned final filter1 = p0 - b, partitioned final filter2 = p1 - b'
     *
     *  (increment p0 - 0) + (increment p1 - a)
     *  = partitioned hash entries1 + partitioned hash entries2
     *  + partitioned final filter1 + partitioned final filter2
     *  = hash entries + final filter
     *
     * kFastLocalBloom + PartitionedFilter +
     * detect_filter_construct_corruption
     * The peak p0, p1 stay the same as
     * (kFastLocalBloom + PartitionedFilter) but just
     * last longer since we release hash entries reservation later.
     *
     */
    if (!partition_filters) {
      EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
          << "Filter construction cache charging should have only 1 peak in "
             "case: kFastLocalBloom + FullFilter";
      std::size_t filter_construction_cache_res_peak =
          filter_construction_cache_res_peaks[0];
      EXPECT_GT(filter_construction_cache_res_peak,
                predicted_hash_entries_cache_res)
          << "The testing number of hash entries is designed to make hash "
             "entries cache charging be multiples of dummy entries"
             " so the correct behavior of charging final filter on top of it"
             " should've triggered at least another dummy entry insertion";
      std::size_t predicted_filter_construction_cache_res_peak =
          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
      // Accept a +-10% band around the predicted peak.
      EXPECT_GE(filter_construction_cache_res_peak,
                predicted_filter_construction_cache_res_peak * 0.9);
      EXPECT_LE(filter_construction_cache_res_peak,
                predicted_filter_construction_cache_res_peak * 1.1);
      return;
    } else {
      EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
          << "Filter construction cache charging should have multiple peaks "
             "in case: kFastLocalBloom + "
             "PartitionedFilter";
      std::size_t predicted_filter_construction_cache_res_increments_sum =
          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
      EXPECT_GE(filter_construction_cache_res_increments_sum,
                predicted_filter_construction_cache_res_increments_sum * 0.9);
      EXPECT_LE(filter_construction_cache_res_increments_sum,
                predicted_filter_construction_cache_res_increments_sum * 1.1);
      return;
    }
  }

  if (policy == kStandard128Ribbon) {
    /* kStandard128Ribbon + FullFilter
     *        p0
     *       /  \  p1
     *      /    \/\
     *   b /    b'  \
     *    /          \
     *  0/            \
     *  hash entries = b - 0, banding = p0 - b, final filter = p1 - b'
     *  p0 = hash entries + banding
     *
     *  The test is designed in a way such that the reservation for (p1 - b')
     *  will trigger at least another dummy entry insertion
     *  (or equivalently to saying, creating another peak).
     *
     * kStandard128Ribbon + FullFilter +
     * detect_filter_construct_corruption
     *
     *          new p0
     *           /  \
     *          /    \
     *   pre p0       \
     *       /         \
     *      /           \
     *   b /             \
     *    /               \
     *  0/                 \
     *  hash entries = b - 0, banding = pre p0 - b,
     *  final filter = new p0 - pre p0
     *  new p0 = hash entries + banding + final filter
     *
     *  The previous p0 will no longer be a peak since under
     *  detect_filter_construct_corruption == true, we do not release hash
     *  entries reservation (like p0 - b' previously) until after final filter
     *  creation and post-verification
     *
     * kStandard128Ribbon + PartitionedFilter
     *                     p3
     *        p0           /\  p4
     *       /  \ p1      /  \ /\
     *      /    \/\  b''/   a'  \
     *   b /    b'  \   /         \
     *    /          \ /           \
     *  0/            a             \
     *  partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a
     *  partitioned banding1 = p0 - b, partitioned banding2 = p3 - b''
     *  partitioned final filter1 = p1 - b', partitioned final filter2 = p4 - a'
     *
     *  (increment p0 - 0) + (increment p1 - b')
     *  + (increment p3 - a) + (increment p4 - a')
     *  = partitioned hash entries1 + partitioned hash entries2
     *  + partitioned banding1 + partitioned banding2
     *  + partitioned final filter1 + partitioned final filter2
     *  = hash entries + banding + final filter
     *
     * kStandard128Ribbon + PartitionedFilter +
     * detect_filter_construct_corruption
     *
     *                         new p3
     *                         /    \
     *                  pre p3       \
     *        new p0        /         \
     *         /  \        /           \
     *  pre p0     \      /             \
     *       /      \  b'/               \
     *      /        \  /                 \
     *   b /          \/                   \
     *    /           a                     \
     *  0/                                   \
     *  partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
     *
     *  partitioned banding1 = pre p0 - b, partitioned banding2 = pre p3 - b'
     *  partitioned final filter1 = new p0 - pre p0,
     *  partitioned final filter2 = new p3 - pre p3
     *
     *  The previous p0 and p3 will no longer be a peak since under
     *  detect_filter_construct_corruption == true, we do not release hash
     *  entries reservation (like p0 - b', p3 - a' previously) until after
     *  partitioned final filter creation and post-verification
     *
     *  However, increments sum stay the same as shown below:
     *  (increment new p0 - 0) + (increment new p3 - a)
     *  = partitioned hash entries1 + partitioned hash entries2
     *  + partitioned banding1 + partitioned banding2
     *  + partitioned final filter1 + partitioned final filter2
     *  = hash entries + banding + final filter
     *
     */
    if (!partition_filters) {
      // Sanity check: the predicted final-filter reservation must span at
      // least one dummy entry or the "second peak" cannot be observed.
      ASSERT_GE(
          std::floor(
              1.0 * predicted_final_filter_cache_res /
              CacheReservationManagerImpl<
                  CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()),
          1)
          << "Final filter cache charging too small for this test - please "
             "increase the number of keys";
      if (!detect_filter_construct_corruption) {
        EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
            << "Filter construction cache charging should have 2 peaks in "
               "case: kStandard128Ribbon + "
               "FullFilter. "
               "The second peak is resulted from charging the final filter "
               "after "
               "decreasing the hash entry reservation since the testing final "
               "filter reservation is designed to be at least 1 dummy entry "
               "size";
        std::size_t filter_construction_cache_res_peak =
            filter_construction_cache_res_peaks[0];
        std::size_t predicted_filter_construction_cache_res_peak =
            predicted_hash_entries_cache_res + predicted_banding_cache_res;
        EXPECT_GE(filter_construction_cache_res_peak,
                  predicted_filter_construction_cache_res_peak * 0.9);
        EXPECT_LE(filter_construction_cache_res_peak,
                  predicted_filter_construction_cache_res_peak * 1.1);
      } else {
        EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
            << "Filter construction cache charging should have 1 peaks in "
               "case: kStandard128Ribbon + FullFilter "
               "+ detect_filter_construct_corruption. "
               "The previous second peak now disappears since we don't "
               "decrease the hash entry reservation"
               "until after final filter reservation and post-verification";
        std::size_t filter_construction_cache_res_peak =
            filter_construction_cache_res_peaks[0];
        std::size_t predicted_filter_construction_cache_res_peak =
            predicted_hash_entries_cache_res + predicted_banding_cache_res +
            predicted_final_filter_cache_res;
        EXPECT_GE(filter_construction_cache_res_peak,
                  predicted_filter_construction_cache_res_peak * 0.9);
        EXPECT_LE(filter_construction_cache_res_peak,
                  predicted_filter_construction_cache_res_peak * 1.1);
      }
      return;
    } else {
      if (!detect_filter_construct_corruption) {
        EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
            << "Filter construction cache charging should have more than 3 "
               "peaks "
               "in case: kStandard128Ribbon + "
               "PartitionedFilter";
      } else {
        EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
            << "Filter construction cache charging should have more than 2 "
               "peaks "
               "in case: kStandard128Ribbon + "
               "PartitionedFilter + detect_filter_construct_corruption";
      }
      std::size_t predicted_filter_construction_cache_res_increments_sum =
          predicted_hash_entries_cache_res + predicted_banding_cache_res +
          predicted_final_filter_cache_res;
      EXPECT_GE(filter_construction_cache_res_increments_sum,
                predicted_filter_construction_cache_res_increments_sum * 0.9);
      EXPECT_LE(filter_construction_cache_res_increments_sum,
                predicted_filter_construction_cache_res_increments_sum * 1.1);
      return;
    }
  }
}

// Parameterized fixture for filter-construction corruption detection.
// Params (via GetParam()): <0> detect_filter_construct_corruption,
// <1> filter policy name passed to Create(), <2> partition_filters.
// NOTE(review): the std::tuple template arguments were stripped in this copy
// ("std::tuple>"); presumably std::tuple<bool, std::string, bool> — confirm
// against upstream.
class DBFilterConstructionCorruptionTestWithParam
    : public DBTestBase,
      public testing::WithParamInterface<
          std::tuple> {
 public:
  DBFilterConstructionCorruptionTestWithParam()
      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}

  BlockBasedTableOptions GetBlockBasedTableOptions() {
    BlockBasedTableOptions table_options;
    table_options.detect_filter_construct_corruption = std::get<0>(GetParam());
    table_options.filter_policy = Create(10, std::get<1>(GetParam()));
    table_options.partition_filters = std::get<2>(GetParam());
    if (table_options.partition_filters) {
      table_options.index_type =
          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
      // We set table_options.metadata_block_size small enough so we can
      // trigger filter partitioning with GetNumKey() amount of keys
      table_options.metadata_block_size = 10;
    }
    return table_options;
  }

  // Return an appropriate amount of keys for testing
  // to generate a long filter (i.e, size >= 8 + kMetadataLen)
  std::size_t GetNumKey() { return 5000; }
};

INSTANTIATE_TEST_CASE_P(
    DBFilterConstructionCorruptionTestWithParam,
    DBFilterConstructionCorruptionTestWithParam,
    ::testing::Values(std::make_tuple(false, kFastLocalBloom, false),
                      std::make_tuple(true, kFastLocalBloom, false),
                      std::make_tuple(true, kFastLocalBloom, true),
                      std::make_tuple(true, kStandard128Ribbon, false),
                      std::make_tuple(true, kStandard128Ribbon, true)));

// Verifies that tampering with hash entries or filter contents via sync
// points is reported as Status::Corruption iff
// detect_filter_construct_corruption is on.
TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) {
  Options options = CurrentOptions();
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.create_if_missing = true;
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);
  // NOTE(review): "static_cast(GetNumKey())" and the std::deque / std::pair /
  // std::unique_ptr spellings below lost their template arguments in this
  // copy — restore from upstream before compiling.
  int num_key = static_cast(GetNumKey());
  Status s;

  // Case 1: No corruption in filter construction
  for (int i = 0; i < num_key; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  s = Flush();
  EXPECT_TRUE(s.ok());

  // Case 2: Corruption of hash entries in filter construction
  // The sync point callback flips the lowest bit of the first hash entry
  // just before the filter is finalized.
  for (int i = 0; i < num_key; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  SyncPoint::GetInstance()->SetCallBack(
      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
        std::deque* hash_entries_to_corrupt = (std::deque*)arg;
        assert(!hash_entries_to_corrupt->empty());
        *(hash_entries_to_corrupt->begin()) =
            *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
      });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  if (table_options.detect_filter_construct_corruption) {
    EXPECT_TRUE(s.IsCorruption());
    EXPECT_TRUE(
        s.ToString().find("Filter's hash entries checksum mismatched") !=
        std::string::npos);
  } else {
    // Without detection the tampering goes unnoticed and the flush succeeds.
    EXPECT_TRUE(s.ok());
  }
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearCallBack(
      "XXPH3FilterBitsBuilder::Finish::"
      "TamperHashEntries");

  // Case 3: Corruption of filter content in filter construction
  // The callback zeroes the first 8 bytes of the finished filter buffer.
  DestroyAndReopen(options);
  for (int i = 0; i < num_key; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  SyncPoint::GetInstance()->SetCallBack(
      "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) {
        std::pair*, std::size_t>* TEST_arg_pair =
            (std::pair*, std::size_t>*)arg;
        std::size_t filter_size = TEST_arg_pair->second;
        // 5 is the kMetadataLen and
        assert(filter_size >= 8 + 5);
        std::unique_ptr* filter_content_to_corrupt = TEST_arg_pair->first;
        std::memset(filter_content_to_corrupt->get(), '\0', 8);
      });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  if (table_options.detect_filter_construct_corruption) {
    EXPECT_TRUE(s.IsCorruption());
    EXPECT_TRUE(s.ToString().find("Corrupted filter content") !=
                std::string::npos);
  } else {
    EXPECT_TRUE(s.ok());
  }
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearCallBack(
      "XXPH3FilterBitsBuilder::Finish::"
      "TamperFilter");
}

// RocksDB lite does not support dynamic options
#ifndef ROCKSDB_LITE
// Verifies that detect_filter_construct_corruption can be flipped at runtime
// via DB::SetOptions() and takes effect for subsequent flushes.
TEST_P(DBFilterConstructionCorruptionTestWithParam,
       DynamicallyTurnOnAndOffDetectConstructCorruption) {
  Options options = CurrentOptions();
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  // We intend to turn on
  // table_options.detect_filter_construct_corruption dynamically
  // therefore we override this test parameter's value
  table_options.detect_filter_construct_corruption = false;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.create_if_missing = true;

  int num_key = static_cast(GetNumKey());
  Status s;
  DestroyAndReopen(options);

  // Case 1: !table_options.detect_filter_construct_corruption
  for (int i = 0; i < num_key; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  SyncPoint::GetInstance()->SetCallBack(
      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
        std::deque* hash_entries_to_corrupt = (std::deque*)arg;
        assert(!hash_entries_to_corrupt->empty());
        *(hash_entries_to_corrupt->begin()) =
            *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
      });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearCallBack(
      "XXPH3FilterBitsBuilder::Finish::"
      "TamperHashEntries");
  ASSERT_FALSE(table_options.detect_filter_construct_corruption);
  EXPECT_TRUE(s.ok());

  // Case 2: dynamically turn on
  // table_options.detect_filter_construct_corruption
  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                              "{detect_filter_construct_corruption=true;}"}}));
  for (int i = 0; i < num_key; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  SyncPoint::GetInstance()->SetCallBack(
      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
        std::deque* hash_entries_to_corrupt = (std::deque*)arg;
        assert(!hash_entries_to_corrupt->empty());
*(hash_entries_to_corrupt->begin()) =
            *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
      });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearCallBack(
      "XXPH3FilterBitsBuilder::Finish::"
      "TamperHashEntries");

  // With detection now enabled, the tampered flush must fail as corruption.
  // NOTE(review): "GetOptions()" below lost its template argument in this
  // copy (presumably GetOptions<BlockBasedTableOptions>()) — confirm against
  // upstream.
  auto updated_table_options = db_->GetOptions().table_factory->GetOptions();
  EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption);
  EXPECT_TRUE(s.IsCorruption());
  EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") !=
              std::string::npos);

  // Case 3: dynamically turn off
  // table_options.detect_filter_construct_corruption
  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                              "{detect_filter_construct_corruption=false;}"}}));
  updated_table_options = db_->GetOptions().table_factory->GetOptions();
  EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption);
}
#endif  // ROCKSDB_LITE

namespace {
// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
// FilterPolicy selecting among configurations based on context.
// It picks one of three bits-per-key settings depending on compaction style
// and level at creation.
class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
 public:
  explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
                                           int bpk_otherwise)
      : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
        policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
        policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}

  const char* Name() const override {
    return "LevelAndStyleCustomFilterPolicy";
  }

  // OK to use built-in policy name because we are deferring to a
  // built-in builder. We aren't changing the serialized format.
  const char* CompatibilityName() const override {
    return policy_fifo_->CompatibilityName();
  }

  // FIFO compaction -> policy_fifo_; level 0 otherwise -> policy_l0_other_;
  // everything else -> policy_otherwise_.
  FilterBitsBuilder* GetBuilderWithContext(
      const FilterBuildingContext& context) const override {
    if (context.compaction_style == kCompactionStyleFIFO) {
      return policy_fifo_->GetBuilderWithContext(context);
    } else if (context.level_at_creation == 0) {
      return policy_l0_other_->GetBuilderWithContext(context);
    } else {
      return policy_otherwise_->GetBuilderWithContext(context);
    }
  }

  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
    // OK to defer to any of them; they all can parse built-in filters
    // from any settings.
    return policy_fifo_->GetFilterBitsReader(contents);
  }

 private:
  // NOTE(review): the unique_ptr element types were stripped in this copy
  // (presumably std::unique_ptr<const FilterPolicy>) — confirm upstream.
  const std::unique_ptr policy_fifo_;
  const std::unique_ptr policy_l0_other_;
  const std::unique_ptr policy_otherwise_;
};

// Maps TableFileCreationReason values to the names used in test reports.
static std::map table_file_creation_reason_to_string{
    {TableFileCreationReason::kCompaction, "kCompaction"},
    {TableFileCreationReason::kFlush, "kFlush"},
    {TableFileCreationReason::kMisc, "kMisc"},
    {TableFileCreationReason::kRecovery, "kRecovery"},
};

// Wraps LevelAndStyleCustomFilterPolicy and records every
// GetBuilderWithContext() call as one "cf=...,s=...,n=...,l=...,b=...,r=..."
// line, retrievable via DumpTestReport().
class TestingContextCustomFilterPolicy
    : public LevelAndStyleCustomFilterPolicy {
 public:
  explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
                                            int bpk_otherwise)
      : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) {
  }

  FilterBitsBuilder* GetBuilderWithContext(
      const FilterBuildingContext& context) const override {
    test_report_ += "cf=";
    test_report_ += context.column_family_name;
    test_report_ += ",s=";
    test_report_ +=
        OptionsHelper::compaction_style_to_string[context.compaction_style];
    test_report_ += ",n=";
    test_report_ += std::to_string(context.num_levels);
    test_report_ += ",l=";
    test_report_ += std::to_string(context.level_at_creation);
    test_report_ += ",b=";
    test_report_ += std::to_string(int{context.is_bottommost});
    test_report_ += ",r=";
    test_report_ += table_file_creation_reason_to_string[context.reason];
    test_report_ += "\n";
    return
LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
  }

  // Returns the accumulated report and clears it (swap leaves
  // test_report_ empty).
  std::string DumpTestReport() {
    std::string rv;
    std::swap(rv, test_report_);
    return rv;
  }

 private:
  // mutable: appended to from the const GetBuilderWithContext() override.
  mutable std::string test_report_;
};
}  // namespace

// Exercises the context-aware filter policy above for both FIFO and level
// compaction styles, checking the recorded FilterBuildingContext fields and
// the resulting false-positive rates per bits-per-key configuration.
// NOTE(review): "std::make_shared(15, 8, 5)" lost its template argument in
// this copy (presumably TestingContextCustomFilterPolicy) — confirm upstream.
TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
  auto policy = std::make_shared(15, 8, 5);
  Options options;
  for (bool fifo : {true, false}) {
    options = CurrentOptions();
    options.max_open_files = fifo ? -1 : options.max_open_files;
    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
    options.compaction_style =
        fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;

    BlockBasedTableOptions table_options;
    table_options.filter_policy = policy;
    table_options.format_version = 5;
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));

    TryReopen(options);
    CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);

    const int maxKey = 10000;
    for (int i = 0; i < maxKey / 2; i++) {
      ASSERT_OK(Put(1, Key(i), Key(i)));
    }
    // Add a large key to make the file contain wide range
    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
    Flush(1);
    EXPECT_EQ(policy->DumpTestReport(),
              fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n"
                   : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");

    for (int i = maxKey / 2; i < maxKey; i++) {
      ASSERT_OK(Put(1, Key(i), Key(i)));
    }
    Flush(1);
    EXPECT_EQ(policy->DumpTestReport(),
              fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n"
                   : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");

    // Check that they can be found
    for (int i = 0; i < maxKey; i++) {
      ASSERT_EQ(Key(i), Get(1, Key(i)));
    }
    // Since we have two tables / two filters, we might have Bloom checks on
    // our queries, but no more than one "useful" per query on a found key.
    EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL),
              maxKey);

    // Check that we have two filters, each about
    // fifo: 0.12% FP rate (15 bits per key)
    // level: 2.3% FP rate (8 bits per key)
    for (int i = 0; i < maxKey; i++) {
      ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
    }
    {
      auto useful_count =
          TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
      EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
      EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
    }

    if (!fifo) {  // FIFO only has L0
      // Full compaction
      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
                                  nullptr));
      EXPECT_EQ(policy->DumpTestReport(),
                "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n");

      // Check that we now have one filter, about 9.2% FP rate (5 bits per key)
      for (int i = 0; i < maxKey; i++) {
        ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
      }
      {
        auto useful_count =
            TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
        EXPECT_GE(useful_count, maxKey * 0.90);
        EXPECT_LE(useful_count, maxKey * 0.91);
      }
    } else {
#ifndef ROCKSDB_LITE
      // Also try external SST file
      {
        std::string file_path = dbname_ + "/external.sst";
        SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]);
        ASSERT_OK(sst_file_writer.Open(file_path));
        ASSERT_OK(sst_file_writer.Put("key", "value"));
        ASSERT_OK(sst_file_writer.Finish());
      }
      // Note: kCompactionStyleLevel is default, ignored if num_levels == -1
      EXPECT_EQ(policy->DumpTestReport(),
                "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n");
#endif
    }

    // Destroy
    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
    ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
    handles_[1] = nullptr;
  }
}

// Prefix extractor whose domain is keys of length >= 5 starting with 'x';
// the prefix is the first 5 bytes.
class SliceTransformLimitedDomain : public SliceTransform {
  const char* Name() const override { return "SliceTransformLimitedDomain"; }

  Slice Transform(const Slice& src) const override {
    return Slice(src.data(), 5);
  }

  bool InDomain(const Slice& src) const override {
    // prefix will be x????
return src.size() >= 5 && src[0] == 'x';
  }

  bool InRange(const Slice& dst) const override {
    // prefix will be x????
    return dst.size() == 5 && dst[0] == 'x';
  }
};

// Point lookups: keys outside the extractor's domain are not in the prefix
// filter, yet Get() must still find them.
// NOTE(review): "std::make_shared()" lost its template argument in this copy
// (presumably SliceTransformLimitedDomain) — confirm upstream.
TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
  bbto.whole_key_filtering = false;

  Options options = CurrentOptions();
  options.prefix_extractor = std::make_shared();
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));

  DestroyAndReopen(options);

  ASSERT_OK(Put("x1111_AAAA", "val1"));
  ASSERT_OK(Put("x1112_AAAA", "val2"));
  ASSERT_OK(Put("x1113_AAAA", "val3"));
  ASSERT_OK(Put("x1114_AAAA", "val4"));
  // Not in domain, wont be added to filter
  ASSERT_OK(Put("zzzzz_AAAA", "val5"));

  ASSERT_OK(Flush());
  ASSERT_EQ(Get("x1111_AAAA"), "val1");
  ASSERT_EQ(Get("x1112_AAAA"), "val2");
  ASSERT_EQ(Get("x1113_AAAA"), "val3");
  ASSERT_EQ(Get("x1114_AAAA"), "val4");
  // Was not added to filter but rocksdb will try to read it from the filter
  ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
}

// Iteration: seeking to an out-of-domain key must still return all
// out-of-domain entries in order.
// NOTE(review): element types of std::vector below were stripped in this copy
// (presumably std::vector<std::string>) — confirm upstream.
TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter2) {
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));

  Options options = CurrentOptions();
  options.prefix_extractor = std::make_shared();
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));

  DestroyAndReopen(options);

  ASSERT_OK(Put("x1113_AAAA", "val3"));
  ASSERT_OK(Put("x1114_AAAA", "val4"));
  // Not in domain, wont be added to filter
  ASSERT_OK(Put("zzzzz_AAAA", "val1"));
  ASSERT_OK(Put("zzzzz_AAAB", "val2"));
  ASSERT_OK(Put("zzzzz_AAAC", "val3"));
  ASSERT_OK(Put("zzzzz_AAAD", "val4"));

  ASSERT_OK(Flush());

  std::vector iter_res;
  auto iter = db_->NewIterator(ReadOptions());
  // Seek to a key that was not in Domain
  for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
    iter_res.emplace_back(iter->value().ToString());
  }

  std::vector expected_res = {"val1", "val2", "val3", "val4"};
  ASSERT_EQ(iter_res, expected_res);
  delete iter;
}
// Checks memtable whole-key bloom filtering via perf-context counters:
// disabled -> a same-prefix miss registers as a bloom hit (false positive);
// enabled -> the same miss is caught by the whole-key filter, with or
// without a prefix extractor configured.
TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
  // regression test for #2743. the range delete tombstones in memtable should
  // be added even when Get() skips searching due to its prefix bloom filter
  const int kMemtableSize = 1 << 20;              // 1MB
  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
  const int kPrefixLen = 4;
  Options options = CurrentOptions();
  // NOTE(review): "static_cast(kMemtablePrefixFilterSize)" lost its template
  // argument in this copy (presumably static_cast<double>) — confirm upstream.
  options.memtable_prefix_bloom_size_ratio =
      static_cast(kMemtablePrefixFilterSize) / kMemtableSize;
  options.prefix_extractor.reset(
      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
  options.write_buffer_size = kMemtableSize;
  options.memtable_whole_key_filtering = false;
  Reopen(options);
  std::string key1("AAAABBBB");
  std::string key2("AAAACCCC");  // not in DB
  std::string key3("AAAADDDD");
  std::string key4("AAAAEEEE");
  std::string value1("Value1");
  std::string value3("Value3");
  std::string value4("Value4");

  ASSERT_OK(Put(key1, value1, WriteOptions()));

  // check memtable bloom stats
  ASSERT_EQ("NOT_FOUND", Get(key2));
  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
  // same prefix, bloom filter false positive
  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);

  // enable whole key bloom filter
  options.memtable_whole_key_filtering = true;
  Reopen(options);
  // check memtable bloom stats
  ASSERT_OK(Put(key3, value3, WriteOptions()));
  ASSERT_EQ("NOT_FOUND", Get(key2));
  // whole key bloom filter kicks in and determines it's a miss
  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);

  // verify whole key filtering does not depend on prefix_extractor
  options.prefix_extractor.reset();
  Reopen(options);
  // check memtable bloom stats
  ASSERT_OK(Put(key4, value4, WriteOptions()));
  ASSERT_EQ("NOT_FOUND", Get(key2));
  // whole key bloom filter kicks in and determines it's a miss
  ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
}

TEST_F(DBBloomFilterTest,
MemtableWholeKeyBloomFilterMultiGet) {
  // With memtable whole-key filtering on, MultiGet/Get must honor a
  // DeleteRange both with and without a snapshot taken before part of the
  // writes and the deletion.
  Options options = CurrentOptions();
  options.memtable_prefix_bloom_size_ratio = 0.015;
  options.memtable_whole_key_filtering = true;
  Reopen(options);
  std::string key1("AA");
  std::string key2("BB");
  std::string key3("CC");
  std::string key4("DD");
  std::string key_not("EE");
  std::string value1("Value1");
  std::string value2("Value2");
  std::string value3("Value3");
  std::string value4("Value4");

  ASSERT_OK(Put(key1, value1, WriteOptions()));
  ASSERT_OK(Put(key2, value2, WriteOptions()));
  ASSERT_OK(Flush());
  ASSERT_OK(Put(key3, value3, WriteOptions()));
  // Snapshot taken before key4 and before the DeleteRange below.
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(Put(key4, value4, WriteOptions()));

  // Delete key2 and key3
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));

  // Read without snapshot
  auto results = MultiGet({key_not, key1, key2, key3, key4});
  ASSERT_EQ(results[0], "NOT_FOUND");
  ASSERT_EQ(results[1], value1);
  ASSERT_EQ(results[2], "NOT_FOUND");
  ASSERT_EQ(results[3], "NOT_FOUND");
  ASSERT_EQ(results[4], value4);

  // Also check Get
  ASSERT_EQ(Get(key1), value1);
  ASSERT_EQ(Get(key2), "NOT_FOUND");
  ASSERT_EQ(Get(key3), "NOT_FOUND");
  ASSERT_EQ(Get(key4), value4);

  // Read with snapshot
  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
  ASSERT_EQ(results[0], "NOT_FOUND");
  ASSERT_EQ(results[1], value1);
  ASSERT_EQ(results[2], value2);
  ASSERT_EQ(results[3], value3);
  ASSERT_EQ(results[4], "NOT_FOUND");

  // Also check Get
  ASSERT_EQ(Get(key1, snapshot), value1);
  ASSERT_EQ(Get(key2, snapshot), value2);
  ASSERT_EQ(Get(key3, snapshot), value3);
  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");

  db_->ReleaseSnapshot(snapshot);
}

// A key shorter than the fixed prefix length is outside the extractor's
// domain; point reads and (reverse) seeks on it must still work with a
// memtable prefix bloom configured.
TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
  constexpr size_t kPrefixSize = 8;
  const std::string kKey = "key";
  assert(kKey.size() < kPrefixSize);
  Options options = CurrentOptions();
  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
  options.memtable_prefix_bloom_size_ratio = 0.25;
  Reopen(options);
  ASSERT_OK(Put(kKey, "v"));
  ASSERT_EQ("v", Get(kKey));
  // NOTE(review): "std::unique_ptr iter" lost its template argument in this
  // copy (presumably std::unique_ptr<Iterator>) — confirm upstream.
  std::unique_ptr iter(dbfull()->NewIterator(ReadOptions()));
  iter->Seek(kKey);
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(kKey, iter->key());
  iter->SeekForPrev(kKey);
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(kKey, iter->key());
}

// Parameterized fixture: <0> whether to use a prefix extractor, <1> table
// format version.
// NOTE(review): the WithParamInterface tuple arguments were stripped in this
// copy (presumably std::tuple<bool, uint32_t>) — confirm upstream.
class DBBloomFilterTestVaryPrefixAndFormatVer
    : public DBTestBase,
      public testing::WithParamInterface> {
 protected:
  bool use_prefix_;
  uint32_t format_version_;

 public:
  DBBloomFilterTestVaryPrefixAndFormatVer()
      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}

  ~DBBloomFilterTestVaryPrefixAndFormatVer() override {}

  void SetUp() override {
    use_prefix_ = std::get<0>(GetParam());
    format_version_ = std::get<1>(GetParam());
  }

  // User key helper shared by the tests below.
  static std::string UKey(uint32_t i) { return Key(static_cast(i)); }
};

// Exercises MultiGet over partitioned filters, checking Bloom ticker counts
// and that filter partitions are not loaded redundantly.
// (Definition continues past this span.)
TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) {
  Options options = CurrentOptions();
  if (use_prefix_) {
    // Entire key from UKey()
    options.prefix_extractor.reset(NewCappedPrefixTransform(9));
  }
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(20));
  bbto.partition_filters = true;
  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  bbto.whole_key_filtering = !use_prefix_;
  if (use_prefix_) {  // (not related to prefix, just alternating between)
    // Make sure code appropriately deals with metadata block size setting
    // that is "too small" (smaller than minimum size for filter builder)
    bbto.metadata_block_size = 63;
  } else {
    // Make sure the test will work even on platforms with large minimum
    // filter size, due to large cache line size.
    // (Largest cache line size + 10+% overhead.)
bbto.metadata_block_size = 290; } options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); ReadOptions ropts; constexpr uint32_t N = 12000; // Add N/2 evens for (uint32_t i = 0; i < N; i += 2) { ASSERT_OK(Put(UKey(i), UKey(i))); } ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE ASSERT_EQ(TotalTableFiles(), 1); #endif constexpr uint32_t Q = 29; // MultiGet In std::array keys; std::array key_slices; std::array column_families; // MultiGet Out std::array statuses; std::array values; TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT); TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL); TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED); TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE); TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE); // Check that initial clump of keys only loads one partition filter from // block cache. // And that spread out keys load many partition filters. // In both cases, mix present vs. not present keys. for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) { for (uint32_t i = 0; i < Q; ++i) { keys[i] = UKey(i * stride); key_slices[i] = Slice(keys[i]); column_families[i] = db_->DefaultColumnFamily(); statuses[i] = Status(); values[i] = PinnableSlice(); } db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], /*timestamps=*/nullptr, &statuses[0], true); // Confirm correct status results uint32_t number_not_found = 0; for (uint32_t i = 0; i < Q; ++i) { if ((i * stride % 2) == 0) { ASSERT_OK(statuses[i]); } else { ASSERT_TRUE(statuses[i].IsNotFound()); ++number_not_found; } } // Confirm correct Bloom stats (no FPs) uint64_t filter_useful = TestGetAndResetTickerCount( options, use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); uint64_t filter_checked = TestGetAndResetTickerCount(options, use_prefix_ ? 
BLOOM_FILTER_PREFIX_CHECKED : BLOOM_FILTER_FULL_POSITIVE) + (use_prefix_ ? 0 : filter_useful); EXPECT_EQ(filter_useful, number_not_found); EXPECT_EQ(filter_checked, Q); if (!use_prefix_) { EXPECT_EQ( TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), Q - number_not_found); } // Confirm no duplicate loading same filter partition uint64_t filter_accesses = TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); if (stride == 1) { EXPECT_EQ(filter_accesses, 1); } else { // for large stride EXPECT_GE(filter_accesses, Q / 2 + 1); } } // Check that a clump of keys (present and not) works when spanning // two partitions int found_spanning = 0; for (uint32_t start = 0; start < N / 2;) { for (uint32_t i = 0; i < Q; ++i) { keys[i] = UKey(start + i); key_slices[i] = Slice(keys[i]); column_families[i] = db_->DefaultColumnFamily(); statuses[i] = Status(); values[i] = PinnableSlice(); } db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], /*timestamps=*/nullptr, &statuses[0], true); // Confirm correct status results uint32_t number_not_found = 0; for (uint32_t i = 0; i < Q; ++i) { if (((start + i) % 2) == 0) { ASSERT_OK(statuses[i]); } else { ASSERT_TRUE(statuses[i].IsNotFound()); ++number_not_found; } } // Confirm correct Bloom stats (might see some FPs) uint64_t filter_useful = TestGetAndResetTickerCount( options, use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); uint64_t filter_checked = TestGetAndResetTickerCount(options, use_prefix_ ? BLOOM_FILTER_PREFIX_CHECKED : BLOOM_FILTER_FULL_POSITIVE) + (use_prefix_ ? 
0 : filter_useful); EXPECT_GE(filter_useful, number_not_found - 2); // possible FP EXPECT_EQ(filter_checked, Q); if (!use_prefix_) { EXPECT_EQ( TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), Q - number_not_found); } // Confirm no duplicate loading of same filter partition uint64_t filter_accesses = TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); if (filter_accesses == 2) { // Spanned across partitions. ++found_spanning; if (found_spanning >= 2) { break; } else { // Ensure that at least once we have at least one present and // one non-present key on both sides of partition boundary. start += 2; } } else { EXPECT_EQ(filter_accesses, 1); // See explanation at "start += 2" start += Q - 4; } } EXPECT_TRUE(found_spanning >= 2); } INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer, DBBloomFilterTestVaryPrefixAndFormatVer, ::testing::Values( // (use_prefix, format_version) std::make_tuple(false, 2), std::make_tuple(false, 3), std::make_tuple(false, 4), std::make_tuple(false, 5), std::make_tuple(true, 2), std::make_tuple(true, 3), std::make_tuple(true, 4), std::make_tuple(true, 5))); #ifndef ROCKSDB_LITE namespace { static const std::string kPlainTable = "test_PlainTableBloom"; } // namespace class BloomStatsTestWithParam : public DBBloomFilterTest, public testing::WithParamInterface> { public: BloomStatsTestWithParam() { bfp_impl_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); options_.create_if_missing = true; options_.prefix_extractor.reset( ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4)); options_.memtable_prefix_bloom_size_ratio = 8.0 * 1024.0 / static_cast(options_.write_buffer_size); if (bfp_impl_ == kPlainTable) { assert(!partition_filters_); // not supported in plain table PlainTableOptions table_options; options_.table_factory.reset(NewPlainTableFactory(table_options)); } else { BlockBasedTableOptions table_options; if 
(partition_filters_) { table_options.partition_filters = partition_filters_; table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } table_options.filter_policy = Create(10, bfp_impl_); options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); } options_.env = env_; get_perf_context()->Reset(); DestroyAndReopen(options_); } ~BloomStatsTestWithParam() override { get_perf_context()->Reset(); Destroy(options_); } // Required if inheriting from testing::WithParamInterface<> static void SetUpTestCase() {} static void TearDownTestCase() {} std::string bfp_impl_; bool partition_filters_; Options options_; }; // 1 Insert 2 K-V pairs into DB // 2 Call Get() for both keys - expext memtable bloom hit stat to be 2 // 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1 // 4 Call Flush() to create SST // 5 Call Get() for both keys - expext SST bloom hit stat to be 2 // 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 // Test both: block and plain SST TEST_P(BloomStatsTestWithParam, BloomStatsTest) { std::string key1("AAAA"); std::string key2("RXDB"); // not in DB std::string key3("ZBRA"); std::string value1("Value1"); std::string value3("Value3"); ASSERT_OK(Put(key1, value1, WriteOptions())); ASSERT_OK(Put(key3, value3, WriteOptions())); // check memtable bloom stats ASSERT_EQ(value1, Get(key1)); ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); ASSERT_EQ(value3, Get(key3)); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); ASSERT_EQ("NOT_FOUND", Get(key2)); ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); // sanity checks ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count); Flush(); // sanity checks ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); ASSERT_EQ(0, 
get_perf_context()->bloom_sst_miss_count); // check SST bloom stats ASSERT_EQ(value1, Get(key1)); ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); ASSERT_EQ(value3, Get(key3)); ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count); ASSERT_EQ("NOT_FOUND", Get(key2)); ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); } // Same scenario as in BloomStatsTest but using an iterator TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { std::string key1("AAAA"); std::string key2("RXDB"); // not in DB std::string key3("ZBRA"); std::string value1("Value1"); std::string value3("Value3"); ASSERT_OK(Put(key1, value1, WriteOptions())); ASSERT_OK(Put(key3, value3, WriteOptions())); std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); // check memtable bloom stats iter->Seek(key1); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(value1, iter->value().ToString()); ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); iter->Seek(key3); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(value3, iter->value().ToString()); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); iter->Seek(key2); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); Flush(); iter.reset(dbfull()->NewIterator(ReadOptions())); // Check SST bloom stats iter->Seek(key1); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(value1, iter->value().ToString()); ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); iter->Seek(key3); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(value3, iter->value().ToString()); uint64_t expected_hits = 2; ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); iter->Seek(key2); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); 
ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } INSTANTIATE_TEST_CASE_P( BloomStatsTestWithParam, BloomStatsTestWithParam, ::testing::Values(std::make_tuple(kLegacyBloom, false), std::make_tuple(kLegacyBloom, true), std::make_tuple(kFastLocalBloom, false), std::make_tuple(kFastLocalBloom, true), std::make_tuple(kPlainTable, false))); namespace { void PrefixScanInit(DBBloomFilterTest* dbtest) { char buf[100]; std::string keystr; const int small_range_sstfiles = 5; const int big_range_sstfiles = 5; // Generate 11 sst files with the following prefix ranges. // GROUP 0: [0,10] (level 1) // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) // // A seek with the previous API would do 11 random I/Os (to all the // files). With the new API and a prefix filter enabled, we should // only do 2 random I/O, to the 2 files containing the key. // GROUP 0 snprintf(buf, sizeof(buf), "%02d______:start", 0); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); snprintf(buf, sizeof(buf), "%02d______:end", 10); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); ASSERT_OK(dbtest->Flush()); ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // move to level 1 // GROUP 1 for (int i = 1; i <= small_range_sstfiles; i++) { snprintf(buf, sizeof(buf), "%02d______:start", i); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); snprintf(buf, sizeof(buf), "%02d______:end", i + 1); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); dbtest->Flush(); } // GROUP 2 for (int i = 1; i <= big_range_sstfiles; i++) { snprintf(buf, sizeof(buf), "%02d______:start", 0); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); 
dbtest->Flush(); } } } // namespace TEST_F(DBBloomFilterTest, PrefixScan) { while (ChangeFilterOptions()) { int count; Slice prefix; Slice key; char buf[100]; Iterator* iter; snprintf(buf, sizeof(buf), "03______:"); prefix = Slice(buf, 8); key = Slice(buf, 9); ASSERT_EQ(key.difference_offset(prefix), 8); ASSERT_EQ(prefix.difference_offset(key), 8); // db configs env_->count_random_reads_ = true; Options options = CurrentOptions(); options.env = env_; options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.disable_auto_compactions = true; options.max_background_compactions = 2; options.create_if_missing = true; options.memtable_factory.reset(NewHashSkipListRepFactory(16)); assert(!options.unordered_write); // It is incompatible with allow_concurrent_memtable_write=false options.allow_concurrent_memtable_write = false; BlockBasedTableOptions table_options; table_options.no_block_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(10)); table_options.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); // 11 RAND I/Os DestroyAndReopen(options); PrefixScanInit(this); count = 0; env_->random_read_counter_.Reset(); iter = db_->NewIterator(ReadOptions()); for (iter->Seek(prefix); iter->Valid(); iter->Next()) { if (!iter->key().starts_with(prefix)) { break; } count++; } ASSERT_OK(iter->status()); delete iter; ASSERT_EQ(count, 2); ASSERT_EQ(env_->random_read_counter_.Read(), 2); Close(); } // end of while } TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { Options options = CurrentOptions(); options.write_buffer_size = 64 * 1024; options.arena_block_size = 4 * 1024; options.target_file_size_base = 64 * 1024; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; options.level0_stop_writes_trigger = 4; options.max_bytes_for_level_base = 256 * 1024; options.max_write_buffer_number = 2; options.max_background_compactions = 8; options.max_background_flushes = 8; 
options.compression = kNoCompression; options.compaction_style = kCompactionStyleLevel; options.level_compaction_dynamic_level_bytes = true; BlockBasedTableOptions bbto; bbto.cache_index_and_filter_blocks = true; bbto.filter_policy.reset(NewBloomFilterPolicy(10)); bbto.whole_key_filtering = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.optimize_filters_for_hits = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); get_perf_context()->Reset(); get_perf_context()->EnablePerLevelPerfContext(); CreateAndReopenWithCF({"mypikachu"}, options); int numkeys = 200000; // Generate randomly shuffled keys, so the updates are almost // random. std::vector keys; keys.reserve(numkeys); for (int i = 0; i < numkeys; i += 2) { keys.push_back(i); } RandomShuffle(std::begin(keys), std::end(keys), /*seed*/ 42); int num_inserted = 0; for (int key : keys) { ASSERT_OK(Put(1, Key(key), "val")); if (++num_inserted % 1000 == 0) { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0, 1) == 0) { // No Level 0 file. Create one. ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } for (int i = 1; i < numkeys; i += 2) { ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); } ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); // Now we have three sorted run, L0, L5 and L6 with most files in L6 have // no bloom filter. Most keys be checked bloom filters twice. 
ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); uint64_t bloom_filter_useful_all_levels = 0; for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { if (kv.second.bloom_filter_useful > 0) { bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; } } ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2); ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2); for (int i = 0; i < numkeys; i += 2) { ASSERT_EQ(Get(1, Key(i)), "val"); } // Part 2 (read path): rewrite last level with blooms, then verify they get // cached only if !optimize_filters_for_hits options.disable_auto_compactions = true; options.num_levels = 9; options.optimize_filters_for_hits = false; options.statistics = CreateDBStatistics(); bbto.block_cache.reset(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ReopenWithColumnFamilies({"default", "mypikachu"}, options); MoveFilesToLevel(7 /* level */, 1 /* column family index */); std::string value = Get(1, Key(0)); uint64_t prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); value = Get(1, Key(0)); ASSERT_EQ(prev_cache_filter_hits + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); // Now that we know the filter blocks exist in the last level files, see if // filter caching is skipped for this optimization options.optimize_filters_for_hits = true; options.statistics = CreateDBStatistics(); bbto.block_cache.reset(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ReopenWithColumnFamilies({"default", "mypikachu"}, options); value = Get(1, Key(0)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(2 /* index and data block */, TestGetTickerCount(options, BLOCK_CACHE_ADD)); // Check filter block ignored for files preloaded during DB::Open() options.max_open_files = -1; options.statistics = 
CreateDBStatistics(); bbto.block_cache.reset(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ReopenWithColumnFamilies({"default", "mypikachu"}, options); uint64_t prev_cache_filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); Get(1, Key(0)); ASSERT_EQ(prev_cache_filter_misses, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(prev_cache_filter_hits, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); // Check filter block ignored for file trivially-moved to bottom level bbto.block_cache.reset(); options.max_open_files = 100; // setting > -1 makes it not preload all files options.statistics = CreateDBStatistics(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ReopenWithColumnFamilies({"default", "mypikachu"}, options); ASSERT_OK(Put(1, Key(numkeys + 1), "val")); ASSERT_OK(Flush(1)); int32_t trivial_move = 0; int32_t non_trivial_move = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", [&](void* /*arg*/) { trivial_move++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", [&](void* /*arg*/) { non_trivial_move++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); CompactRangeOptions compact_options; compact_options.bottommost_level_compaction = BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) .IsNotSupported()); ASSERT_EQ(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); prev_cache_filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); value = Get(1, Key(numkeys + 1)); ASSERT_EQ(prev_cache_filter_hits, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); 
ASSERT_EQ(prev_cache_filter_misses, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); // Check filter block not cached for iterator bbto.block_cache.reset(); options.statistics = CreateDBStatistics(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ReopenWithColumnFamilies({"default", "mypikachu"}, options); std::unique_ptr iter(db_->NewIterator(ReadOptions(), handles_[1])); iter->SeekToFirst(); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(2 /* index and data block */, TestGetTickerCount(options, BLOCK_CACHE_ADD)); get_perf_context()->Reset(); } int CountIter(std::unique_ptr& iter, const Slice& key) { int count = 0; for (iter->Seek(key); iter->Valid(); iter->Next()) { count++; } EXPECT_OK(iter->status()); return count; } // use iterate_upper_bound to hint compatiability of existing bloom filters. // The BF is considered compatible if 1) upper bound and seek key transform // into the same string, or 2) the transformed seek key is of the same length // as the upper bound and two keys are adjacent according to the comparator. 
TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { Options options; options.create_if_missing = true; options.env = CurrentOptions().env; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy = Create(10, bfp_impl); table_options.index_shortening = BlockBasedTableOptions:: IndexShorteningMode::kShortenSeparatorsAndSuccessor; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); ASSERT_OK(Put("abcdxxx0", "val1")); ASSERT_OK(Put("abcdxxx1", "val2")); ASSERT_OK(Put("abcdxxx2", "val3")); ASSERT_OK(Put("abcdxxx3", "val4")); ASSERT_OK(dbfull()->Flush(FlushOptions())); { // prefix_extractor has not changed, BF will always be read Slice upper_bound("abce"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcd0000"), 4); } { Slice upper_bound("abcdzzzz"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcd0000"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), "rocksdb.FixedPrefix.5"); { // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read Slice upper_bound("abce"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = 
&upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); // should check bloom filter since upper bound meets requirement ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { // [abcdxx01, abcey) is not valid bound since upper bound is too long for // the BF in SST (capped:4) Slice upper_bound("abcey"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); // should skip bloom filter since upper bound is too long ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { // [abcdxx02, abcdy) is a valid bound since the prefix is the same Slice upper_bound("abcdy"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); // should check bloom filter since upper bound matches transformed seek // key ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the // same prefix, 2) the prefixes are not consecutive Slice upper_bound("abce"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); // should skip bloom filter since mismatch is found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } 
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); { // [abc, abd) is not a valid bound since the upper bound is too short // for BF (capped:4) Slice upper_bound("abd"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } // Same with re-open options.prefix_extractor.reset(NewFixedPrefixTransform(3)); Reopen(options); { Slice upper_bound("abd"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } // Set back to capped:4 and verify BF is always read options.prefix_extractor.reset(NewCappedPrefixTransform(4)); Reopen(options); { Slice upper_bound("abd"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } // Same if there's a problem initally loading prefix transform SyncPoint::GetInstance()->SetCallBack( "BlockBasedTable::Open::ForceNullTablePrefixExtractor", [&](void* arg) { *static_cast(arg) = true; }); SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); { Slice upper_bound("abd"); ReadOptions read_options; read_options.prefix_same_as_start = true; read_options.iterate_upper_bound = &upper_bound; std::unique_ptr 
iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); } SyncPoint::GetInstance()->DisableProcessing(); } } // Create multiple SST files each with a different prefix_extractor config, // verify iterators can read all SST files using the latest config. TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { Options options; options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.filter_policy = Create(10, bfp_impl); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); Slice upper_bound("foz90000"); ReadOptions read_options; read_options.prefix_same_as_start = true; // first SST with fixed:1 BF ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foq1", "bar1")); ASSERT_OK(Put("fpa", "0")); dbfull()->Flush(FlushOptions()); std::unique_ptr iter_old(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_old, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), "rocksdb.CappedPrefix.3"); read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); ASSERT_EQ(CountIter(iter, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); // second SST with capped:3 BF ASSERT_OK(Put("foo3", "bar3")); ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foq5", "bar5")); ASSERT_OK(Put("fpb", "1")); ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is cappped:3 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // both counters are incremented because BF is "not changed" for 1 of the // 2 SST files, so filter is checked once and found no match. ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), "rocksdb.FixedPrefix.2"); // third SST with fixed:2 BF ASSERT_OK(Put("foo6", "bar6")); ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foq8", "bar8")); ASSERT_OK(Put("fpc", "2")); ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is fixed:2 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); // the first and last BF are checked ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // only last BF is checked and not found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 8); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); } // iter_old can only see the first SST, so checked plus 1 ASSERT_EQ(CountIter(iter_old, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); // iter was created after the first setoptions call so only full filter // will 
check the filter ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 10); { // keys in all three SSTs are visible to iterator // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) // so +2 for checked counter std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 9); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 13); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), "rocksdb.CappedPrefix.3"); { std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 6); // all three SST are checked because the current options has the same as // the remaining SST (capped:3) ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 16); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 17); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); } // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? 
} } // Create a new column family in a running DB, change prefix_extractor // dynamically, verify the iterator created on the new column family behaves // as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { int iteration = 0; for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { Options options = CurrentOptions(); options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy = Create(10, bfp_impl); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); ReadOptions read_options; read_options.prefix_same_as_start = true; // create a new CF and set prefix_extractor dynamically options.prefix_extractor.reset(NewCappedPrefixTransform(3)); CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), "rocksdb.CappedPrefix.3"); ASSERT_OK(Put(2, "foo3", "bar3")); ASSERT_OK(Put(2, "foo4", "bar4")); ASSERT_OK(Put(2, "foo5", "bar5")); ASSERT_OK(Put(2, "foq6", "bar6")); ASSERT_OK(Put(2, "fpq7", "bar7")); dbfull()->Flush(FlushOptions()); { std::unique_ptr iter( db_->NewIterator(read_options, handles_[2])); ASSERT_EQ(CountIter(iter, "foo"), 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK( dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), "rocksdb.FixedPrefix.2"); { std::unique_ptr iter( db_->NewIterator(read_options, handles_[2])); ASSERT_EQ(CountIter(iter, "foo"), 4); 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; iteration++; } } // Verify it's possible to change prefix_extractor at runtime and iterators // behaves as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { Options options; options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy = Create(10, bfp_impl); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foo1", "bar1")); ASSERT_OK(Put("fpa", "0")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("foo3", "bar3")); ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foo5", "bar5")); ASSERT_OK(Put("fpb", "1")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("foo6", "bar6")); ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foo8", "bar8")); ASSERT_OK(Put("fpc", "2")); dbfull()->Flush(FlushOptions()); ReadOptions read_options; read_options.prefix_same_as_start = true; { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } std::unique_ptr 
iter_old(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_old, "foo"), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), "rocksdb.CappedPrefix.3"); { std::unique_ptr iter(db_->NewIterator(read_options)); // "fp*" should be skipped ASSERT_EQ(CountIter(iter, "foo"), 9); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } // iterator created before should not be affected and see all keys ASSERT_EQ(CountIter(iter_old, "foo"), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ(CountIter(iter_old, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } } TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { Options options = CurrentOptions(); constexpr size_t kNumKeys = 10000; static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); options.memtable_factory.reset( test::NewSpecialSkipListFactory(kNumKeys + 10)); options.create_if_missing = true; constexpr size_t kPrefixLength = 4; options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); options.compression = kNoCompression; BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(50)); bbto.index_shortening = BlockBasedTableOptions::IndexShorteningMode::kNoShortening; bbto.block_size = 128; bbto.metadata_block_size = 128; bbto.partition_filters = true; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); const std::string value(64, 
'\0'); WriteOptions write_opts; write_opts.disableWAL = true; for (size_t i = 0; i < kNumKeys; ++i) { std::ostringstream oss; oss << std::setfill('0') << std::setw(4) << std::fixed << i; ASSERT_OK(db_->Put(write_opts, oss.str(), value)); } ASSERT_OK(Flush()); ReadOptions read_opts; // Use legacy, implicit prefix seek read_opts.total_order_seek = false; read_opts.auto_prefix_mode = false; std::unique_ptr it(db_->NewIterator(read_opts)); for (size_t i = 0; i < kNumKeys; ++i) { // Seek with a key after each one added but with same prefix. One will // surely cross a partition boundary. std::ostringstream oss; oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a"; it->SeekForPrev(oss.str()); ASSERT_OK(it->status()); ASSERT_TRUE(it->Valid()); } it.reset(); } namespace { class BackwardBytewiseComparator : public Comparator { public: const char* Name() const override { return "BackwardBytewiseComparator"; } int Compare(const Slice& a, const Slice& b) const override { int min_size_neg = -static_cast(std::min(a.size(), b.size())); const char* a_end = a.data() + a.size(); const char* b_end = b.data() + b.size(); for (int i = -1; i >= min_size_neg; --i) { if (a_end[i] != b_end[i]) { if (static_cast(a_end[i]) < static_cast(b_end[i])) { return -1; } else { return 1; } } } return static_cast(a.size()) - static_cast(b.size()); } void FindShortestSeparator(std::string* /*start*/, const Slice& /*limit*/) const override {} void FindShortSuccessor(std::string* /*key*/) const override {} }; const BackwardBytewiseComparator kBackwardBytewiseComparator{}; class FixedSuffix4Transform : public SliceTransform { const char* Name() const override { return "FixedSuffixTransform"; } Slice Transform(const Slice& src) const override { return Slice(src.data() + src.size() - 4, 4); } bool InDomain(const Slice& src) const override { return src.size() >= 4; } }; std::pair GetBloomStat(const Options& options, bool sst) { if (sst) { return { 
options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED), options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)}; } else { auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); return {hit + miss, miss}; } } std::pair CheckedAndUseful(uint64_t checked, uint64_t useful) { return {checked, useful}; } } // namespace // This uses a prefix_extractor + comparator combination that violates // one of the old obsolete, unnecessary axioms of prefix extraction: // * key.starts_with(prefix(key)) // This axiom is not really needed, and we validate that here. TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) { BlockBasedTableOptions bbto; bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); bbto.whole_key_filtering = false; Options options = CurrentOptions(); options.comparator = &kBackwardBytewiseComparator; options.prefix_extractor = std::make_shared(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.memtable_prefix_bloom_size_ratio = 0.1; options.statistics = CreateDBStatistics(); DestroyAndReopen(options); ASSERT_OK(Put("321aaaa", "val1")); ASSERT_OK(Put("112aaaa", "val2")); ASSERT_OK(Put("009aaaa", "val3")); ASSERT_OK(Put("baa", "val4")); // out of domain ASSERT_OK(Put("321abaa", "val5")); ASSERT_OK(Put("zzz", "val6")); // out of domain for (auto flushed : {false, true}) { SCOPED_TRACE("flushed=" + std::to_string(flushed)); if (flushed) { ASSERT_OK(Flush()); } ReadOptions read_options; if (flushed) { // TODO: support auto_prefix_mode in memtable? 
read_options.auto_prefix_mode = true; } EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); { Slice ub("999aaaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa"), 3); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { Slice ub("999abaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "abaa"), 1); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { Slice ub("999acaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "acaa"), 0); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); } { Slice ub("zzzz"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "baa"), 3); if (flushed) { // TODO: fix memtable case EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); } } } } // This uses a prefix_extractor + comparator combination that violates // one of the old obsolete, unnecessary axioms of prefix extraction: // * Compare(prefix(key), key) <= 0 // This axiom is not really needed, and we validate that here. 
TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { BlockBasedTableOptions bbto; bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); bbto.whole_key_filtering = false; Options options = CurrentOptions(); options.comparator = ReverseBytewiseComparator(); options.prefix_extractor.reset(NewFixedPrefixTransform(4)); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.memtable_prefix_bloom_size_ratio = 0.1; options.statistics = CreateDBStatistics(); DestroyAndReopen(options); ASSERT_OK(Put("aaaa123", "val1")); ASSERT_OK(Put("aaaa211", "val2")); ASSERT_OK(Put("aaaa900", "val3")); ASSERT_OK(Put("aab", "val4")); // out of domain ASSERT_OK(Put("aaba123", "val5")); ASSERT_OK(Put("qqqq123", "val7")); ASSERT_OK(Put("qqqq", "val8")); ASSERT_OK(Put("zzz", "val8")); // out of domain for (auto flushed : {false, true}) { SCOPED_TRACE("flushed=" + std::to_string(flushed)); if (flushed) { ASSERT_OK(Flush()); } ReadOptions read_options; if (flushed) { // TODO: support auto_prefix_mode in memtable? read_options.auto_prefix_mode = true; } else { // TODO: why needed? 
get_perf_context()->bloom_memtable_hit_count = 0; get_perf_context()->bloom_memtable_miss_count = 0; } EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); { Slice ub("aaaa000"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa999"), 3); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { // Note: prefix does work as upper bound Slice ub("aaaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa999"), 3); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { // Note: prefix does not work here as seek key Slice ub("aaaa500"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa"), 0); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { Slice ub("aaba000"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaba999"), 1); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { Slice ub("aaca000"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaca999"), 0); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); } { Slice ub("aaaz"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "zzz"), 5); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); } { // Note: prefix does work here as seek key, but only finds key equal // to prefix (others with same prefix are less) read_options.auto_prefix_mode = false; read_options.iterate_upper_bound = nullptr; read_options.prefix_same_as_start = true; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "qqqq"), 1); EXPECT_EQ(GetBloomStat(options, flushed), 
CheckedAndUseful(1, 0)); } } } namespace { // A weird comparator that in combination with NonIdempotentFixed4Transform // breaks an old axiom of prefix filtering. class WeirdComparator : public Comparator { public: const char* Name() const override { return "WeirdComparator"; } int Compare(const Slice& a, const Slice& b) const override { bool a_in = a.size() >= 5; bool b_in = b.size() >= 5; if (a_in != b_in) { // Order keys after prefixes return a_in - b_in; } if (a_in) { return BytewiseComparator()->Compare(a, b); } else { // Different ordering on the prefixes return ReverseBytewiseComparator()->Compare(a, b); } } void FindShortestSeparator(std::string* /*start*/, const Slice& /*limit*/) const override {} void FindShortSuccessor(std::string* /*key*/) const override {} }; const WeirdComparator kWeirdComparator{}; // Non-idempotentent because prefix is always 4 bytes, but this is // out-of-domain for keys to be assigned prefixes (>= 5 bytes) class NonIdempotentFixed4Transform : public SliceTransform { const char* Name() const override { return "NonIdempotentFixed4Transform"; } Slice Transform(const Slice& src) const override { return Slice(src.data(), 4); } bool InDomain(const Slice& src) const override { return src.size() >= 5; } }; } // namespace // This uses a prefix_extractor + comparator combination that violates // two of the old obsolete, unnecessary axioms of prefix extraction: // * prefix(prefix(key)) == prefix(key) // * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 // This axiom is not really needed, and we validate that here. 
TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { BlockBasedTableOptions bbto; bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); bbto.whole_key_filtering = false; Options options = CurrentOptions(); options.prefix_extractor = std::make_shared(); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.memtable_prefix_bloom_size_ratio = 0.1; options.statistics = CreateDBStatistics(); for (auto weird_comparator : {false, true}) { if (weird_comparator) { options.comparator = &kWeirdComparator; } DestroyAndReopen(options); ASSERT_OK(Put("aaaa123", "val1")); ASSERT_OK(Put("aaaa211", "val2")); ASSERT_OK(Put("aaaa900", "val3")); ASSERT_OK(Put("aab", "val4")); // out of domain ASSERT_OK(Put("aaba123", "val5")); ASSERT_OK(Put("qqqq123", "val7")); ASSERT_OK(Put("qqqq", "val8")); // out of domain ASSERT_OK(Put("zzzz", "val8")); // out of domain for (auto flushed : {false, true}) { SCOPED_TRACE("flushed=" + std::to_string(flushed)); if (flushed) { ASSERT_OK(Flush()); } ReadOptions read_options; if (flushed) { // TODO: support auto_prefix_mode in memtable? read_options.auto_prefix_mode = true; } else { // TODO: why needed? 
get_perf_context()->bloom_memtable_hit_count = 0; get_perf_context()->bloom_memtable_miss_count = 0; } EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); { Slice ub("aaaa999"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa000"), 3); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { // Note: prefix as seek key is not bloom-optimized // Note: the count works with weird_comparator because "aaaa" is // ordered as the last of the prefixes Slice ub("aaaa999"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa"), 3); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); } { Slice ub("aaba9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaba0"), 1); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { Slice ub("aaca9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaca0"), 0); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); } { Slice ub("qqqq9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "qqqq0"), 1); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); } { // Note: prefix as seek key is not bloom-optimized Slice ub("qqqq9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); } { // Note: prefix as seek key is not bloom-optimized Slice ub("zzzz9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 
8 : 1); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); } { Slice ub("zzzz9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5); EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); } } } } #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); }