Fix MultiGet range deletion handling and a memory leak (#10513)

Summary:
This PR fixes two bugs introduced in https://github.com/facebook/rocksdb/issues/10432:
1. If the bloom filter returned a negative result for all MultiGet keys in a file, the range tombstones in that file were being ignored, resulting in incorrect results if those tombstones covered a key in a higher level.
2. If all the keys in a file were filtered out in `TableCache::MultiGetFilter`, the table cache handle was not being released.
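
To make the first bug concrete, here is a minimal sketch, not taken from the PR, of the kind of workload it affects, written against the public RocksDB API. The database path, key names, and values are illustrative assumptions; the regression test added in db/db_basic_test.cc below builds a specific L1/L2 layout with async_io instead.

// Hypothetical repro sketch for bug 1 (illustrative only, not from the PR).
// A range tombstone lives in a newer SST file whose bloom filter returns
// negative for the looked-up key; with the bug, MultiGet could skip that
// file's tombstones and return the stale value from an older file.
#include <cassert>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Enable bloom filters so per-file filtering can reject keys.
  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));

  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/multiget_rangedel_demo", &db).ok());

  // Older file: holds the point key that should end up deleted.
  assert(db->Put(rocksdb::WriteOptions(), "key139", "stale_value").ok());
  assert(db->Flush(rocksdb::FlushOptions()).ok());

  // Newer file: a range tombstone covering "key139", plus an unrelated point
  // key so the file gets a bloom filter that is negative for "key139".
  assert(db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                         "key100", "key200")
             .ok());
  assert(db->Put(rocksdb::WriteOptions(), "key250", "other_value").ok());
  assert(db->Flush(rocksdb::FlushOptions()).ok());

  // "key139" is covered by the range tombstone, so MultiGet must report
  // NotFound even when the newer file's bloom filter filters the key out.
  // The same all-keys-filtered path previously also leaked the table cache
  // handle (bug 2).
  std::vector<rocksdb::Slice> keys{"key139"};
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> statuses(keys.size());
  db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(), keys.size(),
               keys.data(), values.data(), statuses.data());
  assert(statuses[0].IsNotFound());

  delete db;
  return 0;
}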

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10513

Test Plan: Add a new unit test that fails without this fix

Reviewed By: akankshamahajan15

Differential Revision: D38548739

Pulled By: anand1976

fbshipit-source-id: a741a1e25d2e991d63f038100f126c2dc404a87c
Branch: main
Commit: 0b02960d8c (parent: 06b04127a8)
Author: anand76 (committed by Facebook GitHub Bot)

Files changed:
  db/db_basic_test.cc (43 changed lines)
  db/table_cache.cc (30 changed lines)
  db/table_cache.h (5 changed lines)
  db/table_cache_sync_and_async.h (13 changed lines)

db/db_basic_test.cc

@@ -2146,7 +2146,7 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {
     // Put all keys in the bottommost level, and overwrite some keys
     // in L0 and L1
-    for (int i = 0; i < 128; ++i) {
+    for (int i = 0; i < 256; ++i) {
       EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
       num_keys++;
       if (num_keys == 8) {
@@ -2172,6 +2172,21 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {
         EXPECT_OK(Flush());
         num_keys = 0;
       }
     }
+    // Put some range deletes in L1
+    for (int i = 128; i < 256; i += 32) {
+      std::string range_begin = Key(i);
+      std::string range_end = Key(i + 16);
+      EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
+                                      dbfull()->DefaultColumnFamily(),
+                                      range_begin, range_end));
+      // Also do some Puts to force creation of bloom filter
+      for (int j = i + 16; j < i + 32; ++j) {
+        if (j % 3 == 0) {
+          EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
+        }
+      }
+      EXPECT_OK(Flush());
+    }
     MoveFilesToLevel(1);

     for (int i = 0; i < 128; i += 5) {
@@ -2366,6 +2381,32 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
   // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
   ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
 }
+
+TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+  key_strs.push_back(Key(139));
+  key_strs.push_back(Key(163));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::NotFound());
+  ASSERT_EQ(statuses[1], Status::NotFound());
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
 #endif  // USE_COROUTINES

 TEST_F(DBBasicTest, MultiGetStats) {

db/table_cache.cc

@@ -501,6 +501,22 @@ Status TableCache::Get(
   return s;
 }

+void TableCache::UpdateRangeTombstoneSeqnums(
+    const ReadOptions& options, TableReader* t,
+    MultiGetContext::Range& table_range) {
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      t->NewRangeTombstoneIterator(options));
+  if (range_del_iter != nullptr) {
+    for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+      SequenceNumber* max_covering_tombstone_seq =
+          iter->get_context->max_covering_tombstone_seq();
+      *max_covering_tombstone_seq = std::max(
+          *max_covering_tombstone_seq,
+          range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
+    }
+  }
+}
+
 Status TableCache::MultiGetFilter(
     const ReadOptions& options,
     const InternalKeyComparator& internal_comparator,
@@ -524,6 +540,8 @@ Status TableCache::MultiGetFilter(
   Status s;
   TableReader* t = fd.table_reader;
   Cache::Handle* handle = nullptr;
+  MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+                                         mget_range->end());
   if (t == nullptr) {
     s = FindTable(
         options, file_options_, internal_comparator, fd, &handle,
@@ -539,6 +557,18 @@ Status TableCache::MultiGetFilter(
   if (s.ok()) {
     s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
   }
+  if (mget_range->empty()) {
+    if (s.ok() && !options.ignore_range_deletions) {
+      // If all the keys have been filtered out by the bloom filter, then
+      // update the range tombstone sequence numbers for the keys as
+      // MultiGet() will not be called for this set of keys.
+      UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+    }
+    if (handle) {
+      ReleaseHandle(handle);
+      *table_handle = nullptr;
+    }
+  }

   return s;
 }

db/table_cache.h

@@ -235,6 +235,11 @@ class TableCache {
       size_t max_file_size_for_l0_meta_pin = 0,
       Temperature file_temperature = Temperature::kUnknown);

+  // Update the max_covering_tombstone_seq in the GetContext for each key based
+  // on the range deletions in the table
+  void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+                                   MultiGetContext::Range& table_range);
+
   // Create a key prefix for looking up the row cache. The prefix is of the
   // format row_cache_id + fd_number + seq_no. Later, the user key can be
   // appended to form the full key

db/table_cache_sync_and_async.h

@@ -79,18 +79,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
       }
     }
     if (s.ok() && !options.ignore_range_deletions) {
-      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-          t->NewRangeTombstoneIterator(options));
-      if (range_del_iter != nullptr) {
-        for (auto iter = table_range.begin(); iter != table_range.end();
-             ++iter) {
-          SequenceNumber* max_covering_tombstone_seq =
-              iter->get_context->max_covering_tombstone_seq();
-          *max_covering_tombstone_seq = std::max(
-              *max_covering_tombstone_seq,
-              range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
-        }
-      }
+      UpdateRangeTombstoneSeqnums(options, t, table_range);
     }
     if (s.ok()) {
       CO_AWAIT(t->MultiGet)
