From 0b02960d8cb76f74a0487cc352f296cd7709f558 Mon Sep 17 00:00:00 2001
From: anand76
Date: Tue, 9 Aug 2022 14:44:47 -0700
Subject: [PATCH] Fix MultiGet range deletion handling and a memory leak (#10513)

Summary:
This PR fixes 2 bugs introduced in https://github.com/facebook/rocksdb/issues/10432 -
1. If the bloom filter returned a negative result for all MultiGet keys in a file, the range tombstones in that file were being ignored, resulting in incorrect results if those tombstones covered a key in a higher level.
2. If all the keys in a file were filtered out in `TableCache::MultiGetFilter`, the table cache handle was not being released.

(A minimal sketch of scenario 1 is appended after the patch.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10513

Test Plan: Add a new unit test that fails without this fix

Reviewed By: akankshamahajan15

Differential Revision: D38548739

Pulled By: anand1976

fbshipit-source-id: a741a1e25d2e991d63f038100f126c2dc404a87c
---
 db/db_basic_test.cc             | 43 ++++++++++++++++++++++++++++++++-
 db/table_cache.cc               | 30 +++++++++++++++++++++++
 db/table_cache.h                |  5 ++++
 db/table_cache_sync_and_async.h | 13 +---------
 4 files changed, 78 insertions(+), 13 deletions(-)

diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 1acf64da6..579f5a0fd 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -2146,7 +2146,7 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {
 
     // Put all keys in the bottommost level, and overwrite some keys
     // in L0 and L1
-    for (int i = 0; i < 128; ++i) {
+    for (int i = 0; i < 256; ++i) {
       EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
       num_keys++;
       if (num_keys == 8) {
@@ -2172,6 +2172,21 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {
       EXPECT_OK(Flush());
       num_keys = 0;
     }
+    // Put some range deletes in L1
+    for (int i = 128; i < 256; i += 32) {
+      std::string range_begin = Key(i);
+      std::string range_end = Key(i + 16);
+      EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
+                                      dbfull()->DefaultColumnFamily(),
+                                      range_begin, range_end));
+      // Also do some Puts to force creation of bloom filter
+      for (int j = i + 16; j < i + 32; ++j) {
+        if (j % 3 == 0) {
+          EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
+        }
+      }
+      EXPECT_OK(Flush());
+    }
     MoveFilesToLevel(1);
 
     for (int i = 0; i < 128; i += 5) {
@@ -2366,6 +2381,32 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
   // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
   ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
 }
+
+TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+  key_strs.push_back(Key(139));
+  key_strs.push_back(Key(163));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::NotFound());
+  ASSERT_EQ(statuses[1], Status::NotFound());
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
 #endif  // USE_COROUTINES
 
 TEST_F(DBBasicTest, MultiGetStats) {
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 8265575d4..1b5aa7e22 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -501,6 +501,22 @@ Status TableCache::Get(
   return s;
 }
 
+void TableCache::UpdateRangeTombstoneSeqnums(
+    const ReadOptions& options, TableReader* t,
+    MultiGetContext::Range& table_range) {
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      t->NewRangeTombstoneIterator(options));
+  if (range_del_iter != nullptr) {
+    for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+      SequenceNumber* max_covering_tombstone_seq =
+          iter->get_context->max_covering_tombstone_seq();
+      *max_covering_tombstone_seq = std::max(
+          *max_covering_tombstone_seq,
+          range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
+    }
+  }
+}
+
 Status TableCache::MultiGetFilter(
     const ReadOptions& options,
     const InternalKeyComparator& internal_comparator,
@@ -524,6 +540,8 @@ Status TableCache::MultiGetFilter(
   Status s;
   TableReader* t = fd.table_reader;
   Cache::Handle* handle = nullptr;
+  MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+                                         mget_range->end());
   if (t == nullptr) {
     s = FindTable(
         options, file_options_, internal_comparator, fd, &handle,
@@ -539,6 +557,18 @@ Status TableCache::MultiGetFilter(
   if (s.ok()) {
     s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
   }
+  if (mget_range->empty()) {
+    if (s.ok() && !options.ignore_range_deletions) {
+      // If all the keys have been filtered out by the bloom filter, then
+      // update the range tombstone sequence numbers for the keys as
+      // MultiGet() will not be called for this set of keys.
+      UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+    }
+    if (handle) {
+      ReleaseHandle(handle);
+      *table_handle = nullptr;
+    }
+  }
   return s;
 }
 
diff --git a/db/table_cache.h b/db/table_cache.h
index 9c9342309..dae47ede0 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -235,6 +235,11 @@ class TableCache {
       size_t max_file_size_for_l0_meta_pin = 0,
       Temperature file_temperature = Temperature::kUnknown);
 
+  // Update the max_covering_tombstone_seq in the GetContext for each key based
+  // on the range deletions in the table
+  void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+                                   MultiGetContext::Range& table_range);
+
   // Create a key prefix for looking up the row cache. The prefix is of the
   // format row_cache_id + fd_number + seq_no. Later, the user key can be
   // appended to form the full key
diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h
index 77bcd8447..a89ec739a 100644
--- a/db/table_cache_sync_and_async.h
+++ b/db/table_cache_sync_and_async.h
@@ -79,18 +79,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
     }
   }
   if (s.ok() && !options.ignore_range_deletions) {
-    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-        t->NewRangeTombstoneIterator(options));
-    if (range_del_iter != nullptr) {
-      for (auto iter = table_range.begin(); iter != table_range.end();
-           ++iter) {
-        SequenceNumber* max_covering_tombstone_seq =
-            iter->get_context->max_covering_tombstone_seq();
-        *max_covering_tombstone_seq = std::max(
-            *max_covering_tombstone_seq,
-            range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
-      }
-    }
+    UpdateRangeTombstoneSeqnums(options, t, table_range);
   }
   if (s.ok()) {
     CO_AWAIT(t->MultiGet)
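
Appendix (not part of the patch): a minimal, self-contained sketch of the scenario in bug 1, using only the public RocksDB API. The database path, key names, file name, and option values are illustrative assumptions, and it asserts only the user-visible outcome (NotFound for a key covered by a range tombstone in a file whose bloom filter does not contain that key). The new unit test above remains the authoritative repro, since it pins files to specific LSM levels with internal test helpers such as Key() and MoveFilesToLevel().

// multiget_range_del_sketch.cc -- hypothetical example, not part of the repo.
#include <cassert>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  // Per-file bloom filters, so a point lookup can skip a file entirely.
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(10));
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));

  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/multiget_range_del_sketch", &db);
  assert(s.ok());

  // Older file: holds the value for "key0139".
  assert(db->Put(WriteOptions(), "key0139", "val_old_139").ok());
  assert(db->Flush(FlushOptions()).ok());

  // Newer file: contains only a range tombstone covering "key0139" plus an
  // unrelated key, so its bloom filter returns a negative for "key0139".
  assert(db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "key0128",
                         "key0144")
             .ok());
  assert(db->Put(WriteOptions(), "key0150", "val_new_150").ok());
  assert(db->Flush(FlushOptions()).ok());

  // Mirror the new unit test and request the async MultiGet path; file
  // systems without async read support fall back to synchronous reads.
  ReadOptions ro;
  ro.async_io = true;

  std::vector<Slice> keys{"key0139"};
  std::vector<PinnableSlice> values(keys.size());
  std::vector<Status> statuses(keys.size());
  db->MultiGet(ro, db->DefaultColumnFamily(), keys.size(), keys.data(),
               values.data(), statuses.data());

  // The range tombstone must be honored even though the bloom filter
  // filtered the key out of the newer file, so the result is NotFound
  // rather than the stale "val_old_139".
  assert(statuses[0].IsNotFound());

  delete db;
  return 0;
}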