Break TableReader MultiGet into filter and lookup stages (#10432)

Summary:
This PR is the first step in enhancing the coroutines MultiGet to be able to lookup a batch in parallel across levels. By having a separate TableReader function for probing the bloom filters, we can quickly figure out which overlapping keys from a batch are definitely not in the file and can move on to the next level.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10432

Reviewed By: akankshamahajan15

Differential Revision: D38245910

Pulled By: anand1976

fbshipit-source-id: 3d20db2350378c3fe6f086f0c7ba5ff01d7f04de
main
anand76 2 years ago committed by Facebook GitHub Bot
parent 538df26fcc
commit bf4532eb5c
  1. 28
      db/db_basic_test.cc
  2. 42
      db/table_cache.cc
  3. 15
      db/table_cache.h
  4. 9
      db/table_cache_sync_and_async.h
  5. 41
      db/version_set.cc
  6. 6
      db/version_set.h
  7. 11
      db/version_set_sync_and_async.h
  8. 30
      table/block_based/block_based_table_reader.cc
  9. 4
      table/block_based/block_based_table_reader.h
  10. 9
      table/table_reader.h

@ -2338,6 +2338,34 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
ASSERT_EQ(multiget_io_batch_size.count, 1);
ASSERT_EQ(multiget_io_batch_size.max, 2);
}
TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
std::vector<std::string> key_strs;
std::vector<Slice> keys;
std::vector<PinnableSlice> values;
std::vector<Status> statuses;
// 19 and 26 are in L2, but overlap with L0 and L1 file ranges
key_strs.push_back(Key(19));
key_strs.push_back(Key(26));
keys.push_back(key_strs[0]);
keys.push_back(key_strs[1]);
values.resize(keys.size());
statuses.resize(keys.size());
ReadOptions ro;
ro.async_io = true;
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
keys.data(), values.data(), statuses.data());
ASSERT_EQ(values.size(), 2);
ASSERT_EQ(statuses[0], Status::OK());
ASSERT_EQ(statuses[1], Status::OK());
ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
// Bloom filters in L0/L1 will avoid the coroutine calls in those levels
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
}
#endif // USE_COROUTINES
TEST_F(DBBasicTest, MultiGetStats) {

@ -501,6 +501,48 @@ Status TableCache::Get(
return s;
}
Status TableCache::MultiGetFilter(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, int level,
MultiGetContext::Range* mget_range, Cache::Handle** table_handle) {
auto& fd = file_meta.fd;
#ifndef ROCKSDB_LITE
IterKey row_cache_key;
std::string row_cache_entry_buffer;
// Check if we need to use the row cache. If yes, then we cannot do the
// filtering here, since the filtering needs to happen after the row cache
// lookup.
KeyContext& first_key = *mget_range->begin();
if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
return Status::NotSupported();
}
#endif // ROCKSDB_LITE
Status s;
TableReader* t = fd.table_reader;
Cache::Handle* handle = nullptr;
if (t == nullptr) {
s = FindTable(
options, file_options_, internal_comparator, fd, &handle,
prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
level, true /* prefetch_index_and_filter_in_cache */,
/*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
}
*table_handle = handle;
}
if (s.ok()) {
s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
}
return s;
}
Status TableCache::GetTableProperties(
const FileOptions& file_options,
const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,

@ -107,6 +107,19 @@ class TableCache {
const FileMetaData& file_meta,
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
// Call table reader's MultiGetFilter to use the bloom filter to filter out
// keys. Returns Status::NotSupported() if row cache needs to be checked.
// If the table cache is looked up to get the table reader, the cache handle
// is returned in table_handle. This handle should be passed back to
// MultiGet() so it can be released.
Status MultiGetFilter(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, int level,
MultiGetContext::Range* mget_range, Cache::Handle** table_handle);
// If a seek to internal key "k" in specified file finds an entry,
// call get_context->SaveValue() repeatedly until
// it returns false. As a side effect, it will insert the TableReader
@ -122,7 +135,7 @@ class TableCache {
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
int level = -1);
int level = -1, Cache::Handle* table_handle = nullptr);
// Evict any entry for the specified file number
static void Evict(Cache* cache, uint64_t file_number);

@ -17,13 +17,17 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, int level) {
HistogramImpl* file_read_hist, bool skip_filters, int level,
Cache::Handle* table_handle) {
auto& fd = file_meta.fd;
Status s;
TableReader* t = fd.table_reader;
Cache::Handle* handle = nullptr;
Cache::Handle* handle = table_handle;
MultiGetRange table_range(*mget_range, mget_range->begin(),
mget_range->end());
if (handle != nullptr && t == nullptr) {
t = GetTableReaderFromHandle(handle);
}
#ifndef ROCKSDB_LITE
autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
IterKey row_cache_key;
@ -61,6 +65,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
// found in the row cache and thus the range may now be empty
if (s.ok() && !table_range.empty()) {
if (t == nullptr) {
assert(handle == nullptr);
s = FindTable(options, file_options_, internal_comparator, fd, &handle,
prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,

@ -2215,11 +2215,13 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
if (!read_options.async_io || !using_coroutines() ||
fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
if (f) {
bool skip_filters = IsFilterSkipped(
static_cast<int>(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel());
// Call MultiGetFromSST for looking up a single file
s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
fp.GetHitFileLevel(), fp.IsHitFileLastInLevel(), f,
blob_ctxs, num_filter_read, num_index_read,
num_sst_read);
fp.GetHitFileLevel(), skip_filters, f, blob_ctxs,
/*table_handle=*/nullptr, num_filter_read,
num_index_read, num_sst_read);
if (fp.GetHitFileLevel() == 0) {
dump_stats_for_l0_file = true;
}
@ -2231,16 +2233,39 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
} else {
std::vector<folly::coro::Task<Status>> mget_tasks;
while (f != nullptr) {
mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
read_options, fp.CurrentFileRange(), fp.GetHitFileLevel(),
fp.IsHitFileLastInLevel(), f, blob_ctxs, num_filter_read,
num_index_read, num_sst_read));
MultiGetRange file_range = fp.CurrentFileRange();
Cache::Handle* table_handle = nullptr;
bool skip_filters = IsFilterSkipped(
static_cast<int>(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel());
if (!skip_filters) {
Status status = table_cache_->MultiGetFilter(
read_options, *internal_comparator(), *f->file_metadata,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
fp.GetHitFileLevel(), &file_range, &table_handle);
if (status.ok()) {
skip_filters = true;
} else if (!status.IsNotSupported()) {
s = status;
}
}
if (!s.ok()) {
break;
}
if (!file_range.empty()) {
mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
read_options, file_range, fp.GetHitFileLevel(), skip_filters, f,
blob_ctxs, table_handle, num_filter_read, num_index_read,
num_sst_read));
}
if (fp.KeyMaySpanNextFile()) {
break;
}
f = fp.GetNextFileInLevel();
}
if (mget_tasks.size() > 0) {
if (s.ok() && mget_tasks.size() > 0) {
RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
// Collect all results so far
std::vector<Status> statuses = folly::coro::blockingWait(

@ -990,10 +990,10 @@ class Version {
DECLARE_SYNC_AND_ASYNC(
/* ret_type */ Status, /* func_name */ MultiGetFromSST,
const ReadOptions& read_options, MultiGetRange file_range,
int hit_file_level, bool is_hit_file_last_in_level, FdWithKeyRange* f,
int hit_file_level, bool skip_filters, FdWithKeyRange* f,
std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
uint64_t& num_filter_read, uint64_t& num_index_read,
uint64_t& num_sst_read);
Cache::Handle* table_handle, uint64_t& num_filter_read,
uint64_t& num_index_read, uint64_t& num_sst_read);
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
Logger* info_log_;

@ -13,9 +13,10 @@ namespace ROCKSDB_NAMESPACE {
// Lookup a batch of keys in a single SST file
DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
bool is_hit_file_last_in_level, FdWithKeyRange* f,
bool skip_filters, FdWithKeyRange* f,
std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_sst_read) {
Cache::Handle* table_handle, uint64_t& num_filter_read,
uint64_t& num_index_read, uint64_t& num_sst_read) {
bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
get_perf_context()->per_level_perf_context_enabled;
@ -24,10 +25,8 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
s = CO_AWAIT(table_cache_->MultiGet)(
read_options, *internal_comparator(), *f->file_metadata, &file_range,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(hit_file_level),
IsFilterSkipped(static_cast<int>(hit_file_level),
is_hit_file_last_in_level),
hit_file_level);
cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
hit_file_level, table_handle);
// TODO: examine the behavior for corrupted key
if (timer_enabled) {
PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),

@ -2273,6 +2273,36 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
return s;
}
Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options,
const SliceTransform* prefix_extractor,
MultiGetRange* mget_range) {
if (mget_range->empty()) {
// Caller should ensure non-empty (performance bug)
assert(false);
return Status::OK(); // Nothing to do
}
FilterBlockReader* const filter = rep_->filter.get();
if (!filter) {
return Status::OK();
}
// First check the full filter
// If full filter not useful, Then go into each block
const bool no_io = read_options.read_tier == kBlockCacheTier;
uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
if (mget_range->begin()->get_context) {
tracing_mget_id = mget_range->begin()->get_context->get_tracing_get_id();
}
BlockCacheLookupContext lookup_context{
TableReaderCaller::kUserMultiGet, tracing_mget_id,
/*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor,
&lookup_context, read_options.rate_limiter_priority);
return Status::OK();
}
Status BlockBasedTable::Prefetch(const Slice* const begin,
const Slice* const end) {
auto& comparator = rep_->internal_comparator;

@ -141,6 +141,10 @@ class BlockBasedTable : public TableReader {
GetContext* get_context, const SliceTransform* prefix_extractor,
bool skip_filters = false) override;
Status MultiGetFilter(const ReadOptions& read_options,
const SliceTransform* prefix_extractor,
MultiGetRange* mget_range) override;
DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet,
const ReadOptions& readOptions,
const MultiGetContext::Range* mget_range,

@ -128,6 +128,15 @@ class TableReader {
const SliceTransform* prefix_extractor,
bool skip_filters = false) = 0;
// Use bloom filters in the table file, if present, to filter out keys. The
// mget_range will be updated to skip keys that get a negative result from
// the filter lookup.
virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/,
const SliceTransform* /*prefix_extractor*/,
MultiGetContext::Range* /*mget_range*/) {
return Status::NotSupported();
}
virtual void MultiGet(const ReadOptions& readOptions,
const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor,

Loading…
Cancel
Save