diff --git a/HISTORY.md b/HISTORY.md
index e11d4450d..ee3e727ea 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,10 +11,11 @@
 * Add support for trace sampling.
 * Enable properties block checksum verification for block-based tables.
 * For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries.
+* Add whole key bloom filter support in memtable.
 * Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`.
 
 ### Public API Change
-* Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped.
+* Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped.
 * CompactionPri = kMinOverlappingRatio also uses compensated file size, which boosts file with lots of tombstones to be compacted first.
 * Transaction::GetForUpdate is extended with a do_validate parameter with default value of true. If false it skips validating the snapshot before doing the read. Similarly ::Merge, ::Put, ::Delete, and ::SingleDelete are extended with assume_tracked with default value of false. If true it indicates that call is assumed to be after a ::GetForUpdate.
 * `TableProperties::num_entries` and `TableProperties::num_deletions` now also account for number of range tombstones.
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index a196b4599..39dd20bb2 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -786,6 +786,56 @@ TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) {
   delete iter;
 }
 
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+  // verify memtable whole key filtering: a Get() for a missing key sharing a
+  // prefix with stored keys passes the prefix bloom but not the whole key one
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kPrefixLen = 4;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  options.memtable_whole_key_filtering = false;
+  Reopen(options);
+  std::string key1("AAAABBBB");
+  std::string key2("AAAACCCC");  // not in DB
+  std::string key3("AAAADDDD");
+  std::string key4("AAAAEEEE");
+  std::string value1("Value1");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+  // same prefix, bloom filter false positive
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // enable whole key bloom filter
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // verify whole key filtering does not depend on prefix_extractor
+  options.prefix_extractor.reset();
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
+
 #ifndef ROCKSDB_LITE
 
 class BloomStatsTestWithParam : public DBBloomFilterTest,
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index 872b89125..599fe5786 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -2333,9 +2333,7 @@ TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
   std::atomic<int> num_compression_dicts(0);
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
-      [&](void* /* arg */) {
-        ++num_compression_dicts;
-      });
+      [&](void* /* arg */) { ++num_compression_dicts; });
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
   Random rnd(301);
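The db/memtable.cc change below is the core of the feature: the former prefix-only `prefix_bloom_` becomes a shared `bloom_filter_` that receives prefix entries, whole-key entries, or both, and Get() prefers the whole-key probe when it is available. A minimal, self-contained sketch of that dual-insertion scheme (ToyBloom, AddKey, and MayContainKey are hypothetical stand-ins for illustration, not RocksDB APIs):

#include <bitset>
#include <functional>
#include <string>

// Stand-in for DynamicBloom: 6 probes, like the hard-coded memtable value.
class ToyBloom {
 public:
  void Add(const std::string& s) {
    for (size_t i = 0; i < kProbes; ++i) bits_.set(Hash(s, i));
  }
  bool MayContain(const std::string& s) const {
    for (size_t i = 0; i < kProbes; ++i) {
      if (!bits_.test(Hash(s, i))) return false;
    }
    return true;
  }

 private:
  static constexpr size_t kProbes = 6;
  static constexpr size_t kBits = 1 << 16;
  static size_t Hash(const std::string& s, size_t seed) {
    return std::hash<std::string>()(s + static_cast<char>('0' + seed)) % kBits;
  }
  std::bitset<kBits> bits_;
};

// Mirrors MemTable::Add: one filter holds both kinds of entries. (In the
// real code the prefix entry is only added when a prefix extractor is set.)
void AddKey(ToyBloom* bloom, const std::string& key, size_t prefix_len,
            bool whole_key_filtering) {
  bloom->Add(key.substr(0, prefix_len));
  if (whole_key_filtering) {
    bloom->Add(key);
  }
}

// Mirrors MemTable::Get: prefer the whole-key probe, which cannot produce
// the shared-prefix false positives the test above demonstrates.
bool MayContainKey(const ToyBloom& bloom, const std::string& key,
                   size_t prefix_len, bool whole_key_filtering) {
  return whole_key_filtering ? bloom.MayContain(key)
                             : bloom.MayContain(key.substr(0, prefix_len));
}

Sharing one bit array keeps the memory budget fixed by memtable_prefix_bloom_size_ratio, at the cost of prefix and whole-key entries slightly raising each other's false positive rates.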
diff --git a/db/memtable.cc b/db/memtable.cc
index 54360d83b..33a6378ac 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -51,6 +51,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
           mutable_cf_options.memtable_prefix_bloom_size_ratio) *
           8u),
       memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+      memtable_whole_key_filtering(
+          mutable_cf_options.memtable_whole_key_filtering),
       inplace_update_support(ioptions.inplace_update_support),
       inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
       inplace_callback(ioptions.inplace_callback),
@@ -109,8 +111,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
   // something went wrong if we need to flush before inserting anything
   assert(!ShouldScheduleFlush());
 
-  if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) {
-    prefix_bloom_.reset(
+  // use bloom_filter_ for both whole key and prefix bloom filter
+  if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+      moptions_.memtable_prefix_bloom_bits > 0) {
+    bloom_filter_.reset(
         new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
                          ioptions.bloom_locality, 6 /* hard coded 6 probes */,
                          moptions_.memtable_huge_page_size, ioptions.info_log));
@@ -282,7 +286,7 @@ class MemTableIterator : public InternalIterator {
     if (use_range_del_table) {
       iter_ = mem.range_del_table_->GetIterator(arena);
     } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
-      bloom_ = mem.prefix_bloom_.get();
+      bloom_ = mem.bloom_filter_.get();
       iter_ = mem.table_->GetDynamicPrefixIterator(arena);
     } else {
       iter_ = mem.table_->GetIterator(arena);
@@ -313,7 +317,8 @@ class MemTableIterator : public InternalIterator {
   void Seek(const Slice& k) override {
     PERF_TIMER_GUARD(seek_on_memtable_time);
     PERF_COUNTER_ADD(seek_on_memtable_count, 1);
-    if (bloom_ != nullptr) {
+    if (bloom_) {
+      // iterator should only use prefix bloom filter
       if (!bloom_->MayContain(
               prefix_extractor_->Transform(ExtractUserKey(k)))) {
         PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
@@ -329,7 +334,7 @@ class MemTableIterator : public InternalIterator {
   void SeekForPrev(const Slice& k) override {
     PERF_TIMER_GUARD(seek_on_memtable_time);
     PERF_COUNTER_ADD(seek_on_memtable_count, 1);
-    if (bloom_ != nullptr) {
+    if (bloom_) {
       if (!bloom_->MayContain(
               prefix_extractor_->Transform(ExtractUserKey(k)))) {
         PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
@@ -515,9 +520,11 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
                           std::memory_order_relaxed);
   }
 
-  if (prefix_bloom_) {
-    assert(prefix_extractor_);
-    prefix_bloom_->Add(prefix_extractor_->Transform(key));
+  if (bloom_filter_ && prefix_extractor_) {
+    bloom_filter_->Add(prefix_extractor_->Transform(key));
+  }
+  if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+    bloom_filter_->Add(key);
   }
 
   // The first sequence number inserted into the memtable
@@ -546,9 +553,11 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
     post_process_info->num_deletes++;
   }
 
-  if (prefix_bloom_) {
-    assert(prefix_extractor_);
-    prefix_bloom_->AddConcurrently(prefix_extractor_->Transform(key));
+  if (bloom_filter_ && prefix_extractor_) {
+    bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key));
+  }
+  if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+    bloom_filter_->AddConcurrently(key);
   }
 
   // atomically update first_seqno_ and earliest_seqno_.
@@ -755,16 +764,24 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
   Slice user_key = key.user_key();
   bool found_final_value = false;
   bool merge_in_progress = s->IsMergeInProgress();
-  bool const may_contain =
-      nullptr == prefix_bloom_
-          ? false
-          : prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key));
-  if (prefix_bloom_ && !may_contain) {
+  bool may_contain = true;
+  if (bloom_filter_) {
+    // when both memtable_whole_key_filtering and prefix_extractor_ are set,
+    // only do whole key filtering for Get() to save CPU
+    if (moptions_.memtable_whole_key_filtering) {
+      may_contain = bloom_filter_->MayContain(user_key);
+    } else {
+      assert(prefix_extractor_);
+      may_contain =
+          bloom_filter_->MayContain(prefix_extractor_->Transform(user_key));
+    }
+  }
+  if (bloom_filter_ && !may_contain) {
     // iter is null if prefix bloom says the key does not exist
     PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
     *seq = kMaxSequenceNumber;
   } else {
-    if (prefix_bloom_) {
+    if (bloom_filter_) {
       PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
     }
     Saver saver;
diff --git a/db/memtable.h b/db/memtable.h
index 5724f2c31..709e2061e 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -41,6 +41,7 @@ struct ImmutableMemTableOptions {
   size_t arena_block_size;
   uint32_t memtable_prefix_bloom_bits;
   size_t memtable_huge_page_size;
+  bool memtable_whole_key_filtering;
   bool inplace_update_support;
   size_t inplace_update_num_locks;
   UpdateStatus (*inplace_callback)(char* existing_value,
@@ -274,7 +275,7 @@ class MemTable {
   // memtable prefix bloom is disabled, since we can't easily allocate more
   // space.
   void UpdateWriteBufferSize(size_t new_write_buffer_size) {
-    if (prefix_bloom_ == nullptr ||
+    if (bloom_filter_ == nullptr ||
         new_write_buffer_size < write_buffer_size_) {
       write_buffer_size_.store(new_write_buffer_size,
                                std::memory_order_relaxed);
@@ -454,7 +455,7 @@ class MemTable {
   std::vector<port::RWMutex> locks_;
 
   const SliceTransform* const prefix_extractor_;
-  std::unique_ptr<DynamicBloom> prefix_bloom_;
+  std::unique_ptr<DynamicBloom> bloom_filter_;
 
   std::atomic<FlushStateEnum> flush_state_;
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index a83d1212c..5da33ffec 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -272,6 +272,15 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   double memtable_prefix_bloom_size_ratio = 0.0;
 
+  // Enable whole key bloom filter in memtable. Note this will only take effect
+  // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+  // can potentially reduce CPU usage for point lookups.
+  //
+  // Default: false (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  bool memtable_whole_key_filtering = false;
+
   // Page size for huge page for the arena used by the memtable. If <=0, it
   // won't allocate from huge page but from malloc.
   // Users are responsible to reserve huge pages for it to be allocated. For
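A hedged configuration sketch for the new option (the option and function names come from this diff and the test above; the specific values are illustrative assumptions, not recommended settings):

#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

rocksdb::Options WholeKeyMemtableFilterOptions() {
  rocksdb::Options options;
  // The whole key entries live in the memtable bloom filter sized by this
  // ratio, so it must be non-zero or memtable_whole_key_filtering is a no-op.
  options.memtable_prefix_bloom_size_ratio = 0.02;
  options.memtable_whole_key_filtering = true;
  // Optional: a prefix extractor can coexist. Per the memtable.cc change
  // above, Get() then probes only the whole key, while iterators still
  // probe the prefix.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  return options;
}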
diff --git a/options/cf_options.cc b/options/cf_options.cc
index bc8f3e504..6957e150f 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -135,6 +135,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  arena_block_size);
   ROCKS_LOG_INFO(log, " memtable_prefix_bloom_ratio: %f",
                  memtable_prefix_bloom_size_ratio);
+  ROCKS_LOG_INFO(log, " memtable_whole_key_filtering: %d",
+                 memtable_whole_key_filtering);
   ROCKS_LOG_INFO(log, " memtable_huge_page_size: %" ROCKSDB_PRIszt,
                  memtable_huge_page_size);
diff --git a/options/cf_options.h b/options/cf_options.h
index 0cc5ef5a5..6653e59f5 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -131,6 +131,7 @@ struct MutableCFOptions {
         arena_block_size(options.arena_block_size),
         memtable_prefix_bloom_size_ratio(
             options.memtable_prefix_bloom_size_ratio),
+        memtable_whole_key_filtering(options.memtable_whole_key_filtering),
         memtable_huge_page_size(options.memtable_huge_page_size),
         max_successive_merges(options.max_successive_merges),
         inplace_update_num_locks(options.inplace_update_num_locks),
@@ -167,6 +168,7 @@ struct MutableCFOptions {
         max_write_buffer_number(0),
         arena_block_size(0),
         memtable_prefix_bloom_size_ratio(0),
+        memtable_whole_key_filtering(false),
         memtable_huge_page_size(0),
         max_successive_merges(0),
         inplace_update_num_locks(0),
@@ -213,6 +215,7 @@ struct MutableCFOptions {
   int max_write_buffer_number;
   size_t arena_block_size;
   double memtable_prefix_bloom_size_ratio;
+  bool memtable_whole_key_filtering;
   size_t memtable_huge_page_size;
   size_t max_successive_merges;
   size_t inplace_update_num_locks;
diff --git a/options/options.cc b/options/options.cc
index 837345b10..55cc6777e 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -51,6 +51,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       inplace_callback(options.inplace_callback),
       memtable_prefix_bloom_size_ratio(
           options.memtable_prefix_bloom_size_ratio),
+      memtable_whole_key_filtering(options.memtable_whole_key_filtering),
       memtable_huge_page_size(options.memtable_huge_page_size),
       memtable_insert_with_hint_prefix_extractor(
           options.memtable_insert_with_hint_prefix_extractor),
@@ -325,6 +326,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, " Options.memtable_prefix_bloom_size_ratio: %f",
       memtable_prefix_bloom_size_ratio);
+  ROCKS_LOG_HEADER(log,
+                   " Options.memtable_whole_key_filtering: %d",
+                   memtable_whole_key_filtering);
   ROCKS_LOG_HEADER(log, " Options.memtable_huge_page_size: %" ROCKSDB_PRIszt,
                    memtable_huge_page_size);
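Since the option is plumbed through MutableCFOptions, it can also be flipped on a live DB via SetOptions(), as the header comment advertises. A sketch (the helper name is hypothetical); note that ImmutableMemTableOptions snapshots the value per memtable, so the change should only affect memtables created afterwards:

#include <cassert>
#include "rocksdb/db.h"

// Enable whole key filtering on the default column family at runtime.
void EnableWholeKeyMemtableFilter(rocksdb::DB* db) {
  rocksdb::Status s =
      db->SetOptions({{"memtable_whole_key_filtering", "true"}});
  assert(s.ok());
}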
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 1e23e7031..05ea8d67b 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -142,6 +142,8 @@ ColumnFamilyOptions BuildColumnFamilyOptions(
   cf_opts.arena_block_size = mutable_cf_options.arena_block_size;
   cf_opts.memtable_prefix_bloom_size_ratio =
       mutable_cf_options.memtable_prefix_bloom_size_ratio;
+  cf_opts.memtable_whole_key_filtering =
+      mutable_cf_options.memtable_whole_key_filtering;
   cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size;
   cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges;
   cf_opts.inplace_update_num_locks =
@@ -1801,6 +1803,10 @@ std::unordered_map
     {"memtable_prefix_bloom_probes",
      {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true,
       0}},
+    {"memtable_whole_key_filtering",
+     {offset_of(&ColumnFamilyOptions::memtable_whole_key_filtering),
+      OptionType::kBoolean, OptionVerificationType::kNormal, true,
+      offsetof(struct MutableCFOptions, memtable_whole_key_filtering)}},
     {"min_partial_merge_operands",
      {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true,
       0}},
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index cc6d82e7e..9d37fc186 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -435,6 +435,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "max_write_buffer_number_to_maintain=84;"
       "merge_operator=aabcxehazrMergeOperator;"
       "memtable_prefix_bloom_size_ratio=0.4642;"
+      "memtable_whole_key_filtering=true;"
      "memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;"
       "paranoid_file_checks=true;"
       "force_consistency_checks=true;"
diff --git a/options/options_test.cc b/options/options_test.cc
index 68865e4fa..cebad4938 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -90,6 +90,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"compaction_measure_io_stats", "false"},
       {"inplace_update_num_locks", "25"},
       {"memtable_prefix_bloom_size_ratio", "0.26"},
+      {"memtable_whole_key_filtering", "true"},
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
@@ -195,6 +196,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.inplace_update_support, true);
   ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U);
   ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26);
+  ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true);
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index 0a14b6c08..5ac013462 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -354,9 +354,8 @@ struct BlockBasedTableBuilder::Rep {
         compression_dict(),
         compression_ctx(_compression_type),
         verify_dict(),
-        state((_compression_opts.max_dict_bytes > 0)
-                  ? State::kBuffered
-                  : State::kUnbuffered),
+        state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered
+                                                     : State::kUnbuffered),
         use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
                                             !table_opt.block_align),
         compressed_cache_key_prefix_size(0),
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 845c29bbb..502366236 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -514,6 +514,8 @@ DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
 DEFINE_double(memtable_bloom_size_ratio, 0,
               "Ratio of memtable size used for bloom filter. 0 means no bloom "
               "filter.");
+DEFINE_bool(memtable_whole_key_filtering, false,
+            "Try to use whole key bloom filter in memtables.");
 DEFINE_bool(memtable_use_huge_page, false,
             "Try to use huge page in memtables.");
 
@@ -3247,6 +3249,7 @@ void VerifyDBFromDB(std::string& truth_db_name) {
     }
     options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
     options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
+    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
     if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
       options.memtable_insert_with_hint_prefix_extractor.reset(
           NewCappedPrefixTransform(
diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc
index dfc461193..1b19de5f1 100644
--- a/tools/db_bench_tool_test.cc
+++ b/tools/db_bench_tool_test.cc
@@ -248,6 +248,7 @@ const std::string options_file_content = R"OPTIONS_FILE(
   verify_checksums_in_compaction=true
   merge_operator=nullptr
   memtable_prefix_bloom_bits=0
+  memtable_whole_key_filtering=true
   paranoid_file_checks=false
   inplace_update_num_locks=10000
   optimize_filters_for_hits=false
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 71240fdaf..b916bf5ff 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -210,6 +210,10 @@ DEFINE_double(memtable_prefix_bloom_size_ratio,
               "creates prefix blooms for memtables, each with size "
               "`write_buffer_size * memtable_prefix_bloom_size_ratio`.");
 
+DEFINE_bool(memtable_whole_key_filtering,
+            rocksdb::Options().memtable_whole_key_filtering,
+            "Enable whole key filtering in memtables.");
+
 DEFINE_int32(open_files, rocksdb::Options().max_open_files,
              "Maximum number of files to keep open at the same time "
              "(use default if == 0)");
@@ -2583,6 +2587,8 @@ class StressTest {
         FLAGS_max_write_buffer_number_to_maintain;
     options_.memtable_prefix_bloom_size_ratio =
         FLAGS_memtable_prefix_bloom_size_ratio;
+    options_.memtable_whole_key_filtering =
+        FLAGS_memtable_whole_key_filtering;
     options_.max_background_compactions = FLAGS_max_background_compactions;
     options_.max_background_flushes = FLAGS_max_background_flushes;
     options_.compaction_style =
diff --git a/util/testutil.cc b/util/testutil.cc
index af9ded655..ec95d107e 100644
--- a/util/testutil.cc
+++ b/util/testutil.cc
@@ -306,6 +306,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) {
   cf_opt->purge_redundant_kvs_while_flush = rnd->Uniform(2);
   cf_opt->force_consistency_checks = rnd->Uniform(2);
   cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2);
+  cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
 
   // double options
   cf_opt->hard_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13;
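Outside the unit test, the same perf-context counters asserted above can be used to confirm the filter is active. A sketch (the function name and output format are illustrative, not part of this change):

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

// After a point lookup, bloom_memtable_miss_count increments when the
// memtable bloom filter proves the key absent; bloom_memtable_hit_count
// increments when the filter says the key may be present.
void ReportMemtableBloomCounters(rocksdb::DB* db, const rocksdb::Slice& key) {
  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key, &value);
  rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
  std::cout << "found=" << s.ok()
            << " bloom_memtable_hit_count=" << ctx->bloom_memtable_hit_count
            << " bloom_memtable_miss_count=" << ctx->bloom_memtable_miss_count
            << std::endl;
}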