From ea89c77f27a0339821f66eaa1b8afcf345b9175d Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@fb.com>
Date: Thu, 27 Jan 2022 14:53:39 -0800
Subject: [PATCH] Fix major bug with MultiGet, DeleteRange, and memtable Bloom
 (#9453)

Summary:
MemTable::MultiGet was not considering range tombstones before
querying Bloom filter. This means range tombstones would be skipped for
keys (or prefixes) with no other entries in the memtable. This could cause
old values for a key (in SST files) to still show up until the range tombstone
covering it has been flushed.

This is fixed by essentially disabling the memtable Bloom filter when there
are any range tombstones. (This could be better optimized in the future, but
good enough for now.)

Did some other cleanup/optimization in the same code to (more than) offset
the cost of checking on range tombstones in more cases. There is now
notable improvement when memtable_whole_key_filtering and prefix_extractor
are used together (unusual), and this makes MultiGet closer to the Get
implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9453

Test Plan:
new unit test added. Added memtable Bloom to crash test.

Performance testing
--------------------

Build WAL-only DB (recovers to memtable):
```
TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks=fillrandom -num=1000000 -write_buffer_size=250000000
```

Query test command, to maximize sensitivity to the changed code:
```
TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -use_existing_db -readonly -benchmarks=multireadrandom -num=10000000 -write_buffer_size=250000000 -memtable_bloom_size_ratio=0.015 -multiread_batched -batch_size=24 -threads=8 -memtable_whole_key_filtering=$MWKF -prefix_size=$PXS
```
(Note -num here is 10x larger for mostly memtable misses)

Before & after run simultaneously, average over 10 iterations per data point, ops/sec.

MWKF=0 PXS=0 (Bloom disabled)
Before: 5724844
After: 6722066

MWKF=0 PXS=7 (prefixes hardly unique; Bloom not useful)
Before: 9981319
After: 10237990

MWKF=0 PXS=8 (prefixes unique; Bloom useful)
Before:  12081715
After: 12117603

MWKF=1 PXS=0 (whole key Bloom useful)
Before: 11944354
After: 12096085

MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes not useful in old version)
Before: 9444299
After: 11826029

MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes useful in old version)
Before: 11784465
After: 11778591

Only in this last case is the 'before' *slightly* faster, perhaps because hashing prefixes is slightly faster than hashing whole keys. Otherwise, 'after' is faster.

Reviewed By: ajkr

Differential Revision: D33805025

Pulled By: pdillinger

fbshipit-source-id: 597523cae4f4eafdf6ae6bb2bc6cb46f83b017bf
---
 HISTORY.md                         |  3 ++
 db/db_bloom_filter_test.cc         | 57 ++++++++++++++++++++++++++
 db/memtable.cc                     | 66 +++++++++++++++++-------------
 db/memtable.h                      |  4 ++
 include/rocksdb/advanced_options.h | 12 ++++--
 table/multiget_context.h           |  6 +--
 tools/db_crashtest.py              |  7 +++-
 util/dynamic_bloom.h               |  6 +--
 8 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 9e8ebf3d4..5c9da97e1 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,8 @@
 # Rocksdb Change Log
 ## Unreleased
+### Bug Fixes
+* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
+
 ### Public API changes
 * Remove HDFS support from main repo.
 * Remove librados support from main repo.
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 92f4db438..5b444d841 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
   ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
 }
 
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio = 0.015;
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  std::string key1("AA");
+  std::string key2("BB");
+  std::string key3("CC");
+  std::string key4("DD");
+  std::string key_not("EE");
+  std::string value1("Value1");
+  std::string value2("Value2");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key2, value2, WriteOptions()));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+  // Delete key2 and key3
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+  // Read without snapshot
+  auto results = MultiGet({key_not, key1, key2, key3, key4});
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], "NOT_FOUND");
+  ASSERT_EQ(results[3], "NOT_FOUND");
+  ASSERT_EQ(results[4], value4);
+
+  // Also check Get
+  ASSERT_EQ(Get(key1), value1);
+  ASSERT_EQ(Get(key2), "NOT_FOUND");
+  ASSERT_EQ(Get(key3), "NOT_FOUND");
+  ASSERT_EQ(Get(key4), value4);
+
+  // Read with snapshot
+  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], value2);
+  ASSERT_EQ(results[3], value3);
+  ASSERT_EQ(results[4], "NOT_FOUND");
+
+  // Also check Get
+  ASSERT_EQ(Get(key1, snapshot), value1);
+  ASSERT_EQ(Get(key2, snapshot), value2);
+  ASSERT_EQ(Get(key3, snapshot), value3);
+  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
   constexpr size_t kPrefixSize = 8;
   const std::string kKey = "key";
diff --git a/db/memtable.cc b/db/memtable.cc
index f2803dfc1..3ce44ea1d 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -33,6 +33,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
@@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
       is_range_del_table_empty_.load(std::memory_order_relaxed)) {
     return nullptr;
   }
+  return NewRangeTombstoneIteratorInternal(read_options, read_seq);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+    const ReadOptions& read_options, SequenceNumber read_seq) {
   auto* unfragmented_iter = new MemTableIterator(
       *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
-  if (unfragmented_iter == nullptr) {
-    return nullptr;
-  }
   auto fragmented_tombstone_list =
       std::make_shared<FragmentedRangeTombstoneList>(
           std::unique_ptr<InternalIterator>(unfragmented_iter),
@@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
   }
   PERF_TIMER_GUARD(get_from_memtable_time);
 
+  // For now, memtable Bloom filter is effectively disabled if there are any
+  // range tombstones. This is the simplest way to ensure range tombstones are
+  // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+  bool no_range_del = read_options.ignore_range_deletions ||
+                      is_range_del_table_empty_.load(std::memory_order_relaxed);
   MultiGetRange temp_range(*range, range->begin(), range->end());
-  if (bloom_filter_) {
-    std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
-    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
-    autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+  if (bloom_filter_ && no_range_del) {
+    bool whole_key =
+        !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+    std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+    std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
     int num_keys = 0;
     for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (!prefix_extractor_) {
-        keys[num_keys++] = &iter->ukey_without_ts;
+      if (whole_key) {
+        bloom_keys[num_keys] = iter->ukey_without_ts;
+        range_indexes[num_keys++] = iter.index();
       } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
-        prefixes.emplace_back(
-            prefix_extractor_->Transform(iter->ukey_without_ts));
-        keys[num_keys++] = &prefixes.back();
-      }
-    }
-    bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
-    int idx = 0;
-    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (prefix_extractor_ &&
-          !prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+        bloom_keys[num_keys] =
+            prefix_extractor_->Transform(iter->ukey_without_ts);
+        range_indexes[num_keys++] = iter.index();
+      } else {
+        // TODO: consider not counting these as Bloom hits to more closely
+        // match bloom_sst_hit_count
         PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
-        continue;
       }
-      if (!may_match[idx]) {
-        temp_range.SkipKey(iter);
+    }
+    bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+    for (int i = 0; i < num_keys; ++i) {
+      if (!may_match[i]) {
+        temp_range.SkipIndex(range_indexes[i]);
         PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
       } else {
         PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
       }
-      idx++;
     }
   }
   for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-    SequenceNumber seq = kMaxSequenceNumber;
     bool found_final_value{false};
     bool merge_in_progress = iter->s->IsMergeInProgress();
-    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-        NewRangeTombstoneIterator(
-            read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
-    if (range_del_iter != nullptr) {
+    if (!no_range_del) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          NewRangeTombstoneIteratorInternal(
+              read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
       iter->max_covering_tombstone_seq = std::max(
           iter->max_covering_tombstone_seq,
           range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
     }
+    SequenceNumber dummy_seq;
     GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
                  callback, &iter->is_blob_index, iter->value->GetSelf(),
-                 iter->timestamp, iter->s, &(iter->merge_context), &seq,
+                 iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
                  &found_final_value, &merge_in_progress);
 
     if (!found_final_value && merge_in_progress) {
diff --git a/db/memtable.h b/db/memtable.h
index 695cf2d94..52cfcc9d2 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -600,6 +600,10 @@ class MemTable {
                     std::string* value, std::string* timestamp, Status* s,
                     MergeContext* merge_context, SequenceNumber* seq,
                     bool* found_final_value, bool* merge_in_progress);
+
+  // Always returns non-null and assumes certain pre-checks are done
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+      const ReadOptions& read_options, SequenceNumber read_seq);
 };
 
 extern const char* EncodeKey(std::string* scratch, const Slice& target);
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index d7e3205b8..a12bebded 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions {
                                    Slice delta_value,
                                    std::string* merged_value) = nullptr;
 
-  // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
-  // create prefix bloom for memtable with the size of
+  // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+  // Bloom filter in memtable to optimize many queries that must go beyond
+  // the memtable. The size in bytes of the filter is
   // write_buffer_size * memtable_prefix_bloom_size_ratio.
-  // If it is larger than 0.25, it is sanitized to 0.25.
+  // * If prefix_extractor is set, the filter includes prefixes.
+  // * If memtable_whole_key_filtering, the filter includes whole keys.
+  // * If both, the filter includes both.
+  // * If neither, the feature is disabled.
+  //
+  // If this value is larger than 0.25, it is sanitized to 0.25.
   //
   // Default: 0 (disable)
   //
diff --git a/table/multiget_context.h b/table/multiget_context.h
index 3d1ce72bc..6ab337879 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -235,9 +235,9 @@ class MultiGetContext {
 
     bool empty() const { return RemainingMask() == 0; }
 
-    void SkipKey(const Iterator& iter) {
-      skip_mask_ |= uint64_t{1} << iter.index_;
-    }
+    void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; }
+
+    void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }
 
     bool IsKeySkipped(const Iterator& iter) const {
       return skip_mask_ & (uint64_t{1} << iter.index_);
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 617745979..5a2becca3 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -87,6 +87,7 @@ default_params = {
     "partition_filters": lambda: random.randint(0, 1),
     "partition_pinning": lambda: random.randint(0, 3),
     "pause_background_one_in": 1000000,
+    "prefix_size" : -1,
     "prefixpercent": 5,
     "progress_reports": 0,
     "readpercent": 45,
@@ -155,6 +156,8 @@ default_params = {
     "user_timestamp_size": 0,
     "secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]),
     "prepopulate_block_cache" : lambda: random.choice([0, 1]),
+    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
+    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
 }
 
 _TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
@@ -242,7 +245,6 @@ simple_default_params = {
     "memtablerep": "skip_list",
     "prefixpercent": 0,
     "readpercent": 50,
-    "prefix_size" : -1,
     "target_file_size_base": 16777216,
     "target_file_size_multiplier": 1,
     "test_batches_snapshots": 0,
@@ -401,6 +403,9 @@ def finalize_and_sanitize(src_params):
         dest_params["test_batches_snapshots"] = 0
     if dest_params.get("test_batches_snapshots") == 0:
         dest_params["batch_protection_bytes_per_key"] = 0
+    if (dest_params.get("prefix_size") == -1 and
+        dest_params.get("memtable_whole_key_filtering") == 0):
+        dest_params["memtable_prefix_bloom_size_ratio"] = 0
     return dest_params
 
 def gen_cmd_params(args):
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index ba0836c4c..542183280 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -65,7 +65,7 @@ class DynamicBloom {
   // Multithreaded access to this function is OK
   bool MayContain(const Slice& key) const;
 
-  void MayContain(int num_keys, Slice** keys, bool* may_match) const;
+  void MayContain(int num_keys, Slice* keys, bool* may_match) const;
 
   // Multithreaded access to this function is OK
   bool MayContainHash(uint32_t hash) const;
@@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
   return (MayContainHash(BloomHash(key)));
 }
 
-inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
+inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
                                      bool* may_match) const {
   std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
   std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
   for (int i = 0; i < num_keys; ++i) {
-    hashes[i] = BloomHash(*keys[i]);
+    hashes[i] = BloomHash(keys[i]);
     size_t a = FastRange32(kLen, hashes[i]);
     PREFETCH(data_ + a, 0, 3);
     byte_offsets[i] = a;