diff --git a/HISTORY.md b/HISTORY.md
index 9e8ebf3d4..5c9da97e1 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,8 @@
 # Rocksdb Change Log
 ## Unreleased
+### Bug Fixes
+* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
+
 ### Public API changes
 * Remove HDFS support from main repo.
 * Remove librados support from main repo.
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 92f4db438..5b444d841 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
   ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
 }
 
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio = 0.015;
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  std::string key1("AA");
+  std::string key2("BB");
+  std::string key3("CC");
+  std::string key4("DD");
+  std::string key_not("EE");
+  std::string value1("Value1");
+  std::string value2("Value2");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key2, value2, WriteOptions()));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+  // Delete key2 and key3
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+  // Read without snapshot
+  auto results = MultiGet({key_not, key1, key2, key3, key4});
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], "NOT_FOUND");
+  ASSERT_EQ(results[3], "NOT_FOUND");
+  ASSERT_EQ(results[4], value4);
+
+  // Also check Get
+  ASSERT_EQ(Get(key1), value1);
+  ASSERT_EQ(Get(key2), "NOT_FOUND");
+  ASSERT_EQ(Get(key3), "NOT_FOUND");
+  ASSERT_EQ(Get(key4), value4);
+
+  // Read with snapshot
+  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], value2);
+  ASSERT_EQ(results[3], value3);
+  ASSERT_EQ(results[4], "NOT_FOUND");
+
+  // Also check Get
+  ASSERT_EQ(Get(key1, snapshot), value1);
+  ASSERT_EQ(Get(key2, snapshot), value2);
+  ASSERT_EQ(Get(key3, snapshot), value3);
+  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
   constexpr size_t kPrefixSize = 8;
   const std::string kKey = "key";
diff --git a/db/memtable.cc b/db/memtable.cc
index f2803dfc1..3ce44ea1d 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -33,6 +33,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
@@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
       is_range_del_table_empty_.load(std::memory_order_relaxed)) {
     return nullptr;
   }
+  return NewRangeTombstoneIteratorInternal(read_options, read_seq);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+    const ReadOptions& read_options, SequenceNumber read_seq) {
   auto* unfragmented_iter = new MemTableIterator(
       *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
-  if (unfragmented_iter == nullptr) {
-    return nullptr;
-  }
   auto fragmented_tombstone_list =
       std::make_shared<FragmentedRangeTombstoneList>(
           std::unique_ptr<InternalIterator>(unfragmented_iter),
@@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
   }
   PERF_TIMER_GUARD(get_from_memtable_time);
 
+  // For now, memtable Bloom filter is effectively disabled if there are any
+  // range tombstones. This is the simplest way to ensure range tombstones are
+  // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+  bool no_range_del = read_options.ignore_range_deletions ||
+                      is_range_del_table_empty_.load(std::memory_order_relaxed);
   MultiGetRange temp_range(*range, range->begin(), range->end());
-  if (bloom_filter_) {
-    std::array<const Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
-    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
-    autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+  if (bloom_filter_ && no_range_del) {
+    bool whole_key =
+        !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+    std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+    std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
     int num_keys = 0;
     for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (!prefix_extractor_) {
-        keys[num_keys++] = &iter->ukey_without_ts;
+      if (whole_key) {
+        bloom_keys[num_keys] = iter->ukey_without_ts;
+        range_indexes[num_keys++] = iter.index();
       } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
-        prefixes.emplace_back(
-            prefix_extractor_->Transform(iter->ukey_without_ts));
-        keys[num_keys++] = &prefixes.back();
-      }
-    }
-    bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
-    int idx = 0;
-    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (prefix_extractor_ &&
-          !prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+        bloom_keys[num_keys] =
+            prefix_extractor_->Transform(iter->ukey_without_ts);
+        range_indexes[num_keys++] = iter.index();
+      } else {
+        // TODO: consider not counting these as Bloom hits to more closely
+        // match bloom_sst_hit_count
         PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
-        continue;
       }
-      if (!may_match[idx]) {
-        temp_range.SkipKey(iter);
+    }
+    bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+    for (int i = 0; i < num_keys; ++i) {
+      if (!may_match[i]) {
+        temp_range.SkipIndex(range_indexes[i]);
         PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
       } else {
         PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
       }
-      idx++;
     }
   }
   for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-    SequenceNumber seq = kMaxSequenceNumber;
     bool found_final_value{false};
     bool merge_in_progress = iter->s->IsMergeInProgress();
-    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-        NewRangeTombstoneIterator(
-            read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
-    if (range_del_iter != nullptr) {
+    if (!no_range_del) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          NewRangeTombstoneIteratorInternal(
+              read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
       iter->max_covering_tombstone_seq = std::max(
           iter->max_covering_tombstone_seq,
           range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
     }
+    SequenceNumber dummy_seq;
     GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
                  callback, &iter->is_blob_index, iter->value->GetSelf(),
-                 iter->timestamp, iter->s, &(iter->merge_context), &seq,
+                 iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
                  &found_final_value, &merge_in_progress);
     if (!found_final_value && merge_in_progress) {
diff --git a/db/memtable.h b/db/memtable.h
index 695cf2d94..52cfcc9d2 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -600,6 +600,10 @@ class MemTable {
                     std::string* value, std::string* timestamp, Status* s,
                     MergeContext* merge_context, SequenceNumber* seq,
                     bool* found_final_value, bool* merge_in_progress);
+
+  // Always returns non-null and assumes certain pre-checks are done
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+      const ReadOptions& read_options, SequenceNumber read_seq);
 };
 
 extern const char* EncodeKey(std::string* scratch, const Slice& target);
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index d7e3205b8..a12bebded 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions {
                            Slice delta_value,
                            std::string* merged_value) = nullptr;
 
-  // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
-  // create prefix bloom for memtable with the size of
+  // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+  // Bloom filter in memtable to optimize many queries that must go beyond
+  // the memtable. The size in bytes of the filter is
   // write_buffer_size * memtable_prefix_bloom_size_ratio.
-  // If it is larger than 0.25, it is sanitized to 0.25.
+  // * If prefix_extractor is set, the filter includes prefixes.
+  // * If memtable_whole_key_filtering, the filter includes whole keys.
+  // * If both, the filter includes both.
+  // * If neither, the feature is disabled.
+  //
+  // If this value is larger than 0.25, it is sanitized to 0.25.
   //
   // Default: 0 (disable)
   //
diff --git a/table/multiget_context.h b/table/multiget_context.h
index 3d1ce72bc..6ab337879 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -235,9 +235,9 @@ class MultiGetContext {
 
     bool empty() const { return RemainingMask() == 0; }
 
-    void SkipKey(const Iterator& iter) {
-      skip_mask_ |= uint64_t{1} << iter.index_;
-    }
+    void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; }
+
+    void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }
 
     bool IsKeySkipped(const Iterator& iter) const {
       return skip_mask_ & (uint64_t{1} << iter.index_);
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 617745979..5a2becca3 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -87,6 +87,7 @@ default_params = {
     "partition_filters": lambda: random.randint(0, 1),
     "partition_pinning": lambda: random.randint(0, 3),
     "pause_background_one_in": 1000000,
+    "prefix_size" : -1,
     "prefixpercent": 5,
     "progress_reports": 0,
     "readpercent": 45,
@@ -155,6 +156,8 @@ default_params = {
     "user_timestamp_size": 0,
     "secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]),
     "prepopulate_block_cache" : lambda: random.choice([0, 1]),
+    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
+    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
 }
 
 _TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
@@ -242,7 +245,6 @@ simple_default_params = {
     "memtablerep": "skip_list",
     "prefixpercent": 0,
     "readpercent": 50,
-    "prefix_size" : -1,
     "target_file_size_base": 16777216,
     "target_file_size_multiplier": 1,
     "test_batches_snapshots": 0,
@@ -401,6 +403,9 @@ def finalize_and_sanitize(src_params):
         dest_params["test_batches_snapshots"] = 0
     if dest_params.get("test_batches_snapshots") == 0:
         dest_params["batch_protection_bytes_per_key"] = 0
(dest_params.get("prefix_size") == -1 and + dest_params.get("memtable_whole_key_filtering") == 0): + dest_params["memtable_prefix_bloom_size_ratio"] = 0 return dest_params def gen_cmd_params(args): diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index ba0836c4c..542183280 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -65,7 +65,7 @@ class DynamicBloom { // Multithreaded access to this function is OK bool MayContain(const Slice& key) const; - void MayContain(int num_keys, Slice** keys, bool* may_match) const; + void MayContain(int num_keys, Slice* keys, bool* may_match) const; // Multithreaded access to this function is OK bool MayContainHash(uint32_t hash) const; @@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const { return (MayContainHash(BloomHash(key))); } -inline void DynamicBloom::MayContain(int num_keys, Slice** keys, +inline void DynamicBloom::MayContain(int num_keys, Slice* keys, bool* may_match) const { std::array hashes; std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { - hashes[i] = BloomHash(*keys[i]); + hashes[i] = BloomHash(keys[i]); size_t a = FastRange32(kLen, hashes[i]); PREFETCH(data_ + a, 0, 3); byte_offsets[i] = a;