Fix MultiGet range deletion handling and a memory leak (#10513)

Summary: This PR fixes 2 bugs introduced in https://github.com/facebook/rocksdb/issues/10432 - 1. If the bloom filter returned a negative result for all MultiGet keys in a file, the range tombstones in that file were being ignored, resulting in incorrect results if those tombstones covered a key in a higher level. 2. If all the keys in a file were filtered out in `TableCache::MultiGetFilter`, the table cache handle was not being released. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10513 Test Plan: Add a new unit test that fails without this fix Reviewed By: akankshamahajan15 Differential Revision: D38548739 Pulled By: anand1976 fbshipit-source-id: a741a1e25d2e991d63f038100f126c2dc404a87c
2 years ago · 0b02960d8c
parent 06b04127a8
commit 0b02960d8c
4 changed files with 78 additions and 13 deletions
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@ -2146,7 +2146,7 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {

    // Put all keys in the bottommost level, and overwrite some keys
    // in L0 and L1
-    for (int i = 0; i < 128; ++i) {
+    for (int i = 0; i < 256; ++i) {
      EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
      num_keys++;
      if (num_keys == 8) {
@ -2172,6 +2172,21 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {
      EXPECT_OK(Flush());
      num_keys = 0;
    }
+    // Put some range deletes in L1
+    for (int i = 128; i < 256; i += 32) {
+      std::string range_begin = Key(i);
+      std::string range_end = Key(i + 16);
+      EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
+                                      dbfull()->DefaultColumnFamily(),
+                                      range_begin, range_end));
+      // Also do some Puts to force creation of bloom filter
+      for (int j = i + 16; j < i + 32; ++j) {
+        if (j % 3 == 0) {
+          EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
+        }
+      }
+      EXPECT_OK(Flush());
+    }
    MoveFilesToLevel(1);

    for (int i = 0; i < 128; i += 5) {
@ -2366,6 +2381,32 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
 }
+
+TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with a range deletes in L1
+  key_strs.push_back(Key(139));
+  key_strs.push_back(Key(163));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::NotFound());
+  ASSERT_EQ(statuses[1], Status::NotFound());
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
 #endif  // USE_COROUTINES

 TEST_F(DBBasicTest, MultiGetStats) {
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -501,6 +501,22 @@ Status TableCache::Get(
  return s;
 }

+void TableCache::UpdateRangeTombstoneSeqnums(
+    const ReadOptions& options, TableReader* t,
+    MultiGetContext::Range& table_range) {
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      t->NewRangeTombstoneIterator(options));
+  if (range_del_iter != nullptr) {
+    for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+      SequenceNumber* max_covering_tombstone_seq =
+          iter->get_context->max_covering_tombstone_seq();
+      *max_covering_tombstone_seq = std::max(
+          *max_covering_tombstone_seq,
+          range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
+    }
+  }
+}
+
 Status TableCache::MultiGetFilter(
    const ReadOptions& options,
    const InternalKeyComparator& internal_comparator,
@ -524,6 +540,8 @@ Status TableCache::MultiGetFilter(
  Status s;
  TableReader* t = fd.table_reader;
  Cache::Handle* handle = nullptr;
+  MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+                                         mget_range->end());
  if (t == nullptr) {
    s = FindTable(
        options, file_options_, internal_comparator, fd, &handle,
@ -539,6 +557,18 @@ Status TableCache::MultiGetFilter(
  if (s.ok()) {
    s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
  }
+  if (mget_range->empty()) {
+    if (s.ok() && !options.ignore_range_deletions) {
+      // If all the keys have been filtered out by the bloom filter, then
+      // update the range tombstone sequence numbers for the keys as
+      // MultiGet() will not be called for this set of keys.
+      UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+    }
+    if (handle) {
+      ReleaseHandle(handle);
+      *table_handle = nullptr;
+    }
+  }

  return s;
 }
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -235,6 +235,11 @@ class TableCache {
      size_t max_file_size_for_l0_meta_pin = 0,
      Temperature file_temperature = Temperature::kUnknown);

+  // Update the max_covering_tombstone_seq in the GetContext for each key based
+  // on the range deletions in the table
+  void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+                                   MultiGetContext::Range& table_range);
+
  // Create a key prefix for looking up the row cache. The prefix is of the
  // format row_cache_id + fd_number + seq_no. Later, the user key can be
  // appended to form the full key
--- a/db/table_cache_sync_and_async.h
+++ b/db/table_cache_sync_and_async.h
@ -79,18 +79,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
      }
    }
    if (s.ok() && !options.ignore_range_deletions) {
-      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-          t->NewRangeTombstoneIterator(options));
-      if (range_del_iter != nullptr) {
-        for (auto iter = table_range.begin(); iter != table_range.end();
-             ++iter) {
-          SequenceNumber* max_covering_tombstone_seq =
-              iter->get_context->max_covering_tombstone_seq();
-          *max_covering_tombstone_seq = std::max(
-              *max_covering_tombstone_seq,
-              range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
-        }
-      }
+      UpdateRangeTombstoneSeqnums(options, t, table_range);
    }
    if (s.ok()) {
      CO_AWAIT(t->MultiGet)