Update stats to help users estimate MultiGet async IO impact (#10182)

Summary:
Add a couple of stats to help users estimate the impact of potential MultiGet performance improvements -
1. NUM_LEVEL_READ_PER_MULTIGET - A histogram stat for the number of levels from which a MultiGet batch had to read at least one file
2. MULTIGET_COROUTINE_COUNT - A ticker stat counting the number of times the coroutine version of MultiGetFromSST was used

The NUM_DATA_BLOCKS_READ_PER_LEVEL stat is obsoleted as it doesn't provide useful information for MultiGet optimization.
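As a rough usage sketch (not part of this change), the new stats can be read through the existing Statistics API after running a MultiGet workload. The `DumpMultiGetStats` helper below is hypothetical and assumes the DB was opened with `options.statistics = CreateDBStatistics()`:

```cpp
#include <iostream>
#include <memory>

#include "rocksdb/statistics.h"

// Hypothetical helper: print the new MultiGet stats from the Statistics
// object attached to DBOptions.
void DumpMultiGetStats(const std::shared_ptr<rocksdb::Statistics>& stats) {
  // Ticker: how many times the coroutine version of MultiGetFromSST ran.
  const uint64_t coro_count =
      stats->getTickerCount(rocksdb::MULTIGET_COROUTINE_COUNT);

  // Histogram: number of levels that required a file read, per MultiGet.
  rocksdb::HistogramData levels_read;
  stats->histogramData(rocksdb::NUM_LEVEL_READ_PER_MULTIGET, &levels_read);

  std::cout << "multiget coroutine count: " << coro_count << "\n"
            << "levels read per multiget: avg=" << levels_read.average
            << " max=" << levels_read.max << std::endl;
}
```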

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10182

Reviewed By: akankshamahajan15

Differential Revision: D37213296

Pulled By: anand1976

fbshipit-source-id: 5d2b7708017c0e278578ae4bffac3926f6530efb
Committed to main by Facebook GitHub Bot; authored by anand76
parent 4d31d3c2ed
commit a6691d0f65
10 changed files (lines changed in parentheses):
  HISTORY.md (1)
  db/db_basic_test.cc (22)
  db/version_set.cc (33)
  db/version_set.h (2)
  db/version_set_sync_and_async.h (5)
  include/rocksdb/statistics.h (5)
  monitoring/statistics.cc (4)
  table/block_based/block_based_table_reader.cc (6)
  table/block_based/block_based_table_reader_sync_and_async.h (3)
  table/get_context.h (1)

@ -25,6 +25,7 @@
* Also add C APIs for Transaction, SstFileWriter, and Compaction as mentioned [here](https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-(Experimental))
* The contract for implementations of Comparator::IsSameLengthImmediateSuccessor has been updated to work around a design bug in `auto_prefix_mode`.
* The API documentation for `auto_prefix_mode` now notes some corner cases in which it returns different results than `total_order_seek`, due to design bugs that are not easily fixed. Users using built-in comparators and keys at least the size of a fixed prefix length are not affected.
* Obsoleted the NUM_DATA_BLOCKS_READ_PER_LEVEL stat and introduced the NUM_LEVEL_READ_PER_MULTIGET and MULTIGET_COROUTINE_COUNT stats.
### New Features
* Add FileSystem::ReadAsync API in io_tracing

@ -2221,6 +2221,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL0) {
// No async IO in this case since we don't do parallel lookup in L0
ASSERT_EQ(multiget_io_batch_size.count, 0);
ASSERT_EQ(multiget_io_batch_size.max, 0);
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
}
TEST_F(DBMultiGetAsyncIOTest, GetFromL1) {
@ -2257,6 +2258,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL1) {
// A batch of 3 async IOs is expected, one for each overlapping file in L1
ASSERT_EQ(multiget_io_batch_size.count, 1);
ASSERT_EQ(multiget_io_batch_size.max, 3);
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
}
TEST_F(DBMultiGetAsyncIOTest, LastKeyInFile) {
@ -2407,27 +2409,37 @@ TEST_F(DBBasicTest, MultiGetStats) {
values.data(), s.data(), false);
ASSERT_EQ(values.size(), kMultiGetBatchSize);
HistogramData hist_data_blocks;
HistogramData hist_level;
HistogramData hist_index_and_filter_blocks;
HistogramData hist_sst;
options.statistics->histogramData(NUM_DATA_BLOCKS_READ_PER_LEVEL,
&hist_data_blocks);
options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
&hist_index_and_filter_blocks);
options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);
// Maximum number of blocks read from a file system in a level.
ASSERT_EQ(hist_data_blocks.max, 32);
ASSERT_EQ(hist_level.max, 1);
ASSERT_GT(hist_index_and_filter_blocks.max, 0);
// Maximum number of sst files read from file system in a level.
ASSERT_EQ(hist_sst.max, 2);
// Minimum number of blocks read in a level.
ASSERT_EQ(hist_data_blocks.min, 4);
ASSERT_EQ(hist_level.min, 1);
ASSERT_GT(hist_index_and_filter_blocks.min, 0);
// Minimum number of sst files read in a level.
ASSERT_EQ(hist_sst.min, 1);
for (PinnableSlice& value : values) {
value.Reset();
}
for (Status& status : s) {
status = Status::OK();
}
db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950],
values.data(), s.data(), false);
options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
ASSERT_EQ(hist_level.max, 2);
}
// Test class for batched MultiGet with prefix extractor

@ -2252,8 +2252,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
Status s;
uint64_t num_index_read = 0;
uint64_t num_filter_read = 0;
uint64_t num_data_read = 0;
uint64_t num_sst_read = 0;
uint64_t num_level_read = 0;
MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
// blob_file => [[blob_idx, it], ...]
@ -2275,7 +2275,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
fp.GetHitFileLevel(), fp.IsHitFileLastInLevel(), f,
blob_rqs, num_filter_read, num_index_read,
num_data_read, num_sst_read);
num_sst_read);
if (fp.GetHitFileLevel() == 0) {
dump_stats_for_l0_file = true;
}
@ -2290,13 +2290,14 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
read_options, fp.CurrentFileRange(), fp.GetHitFileLevel(),
fp.IsHitFileLastInLevel(), f, blob_rqs, num_filter_read,
num_index_read, num_data_read, num_sst_read));
num_index_read, num_sst_read));
if (fp.KeyMaySpanNextFile()) {
break;
}
f = fp.GetNextFileInLevel();
}
if (mget_tasks.size() > 0) {
RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
// Collect all results so far
std::vector<Status> statuses = folly::coro::blockingWait(
folly::coro::collectAllRange(std::move(mget_tasks))
@ -2328,18 +2329,18 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
(prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
// Dump the stats if the search has moved to the next level and
// reset for next level.
if (num_sst_read || (num_filter_read + num_index_read)) {
if (num_filter_read + num_index_read) {
RecordInHistogram(db_statistics_,
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
}
if (num_sst_read) {
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
num_sst_read);
num_level_read++;
}
num_filter_read = 0;
num_index_read = 0;
num_data_read = 0;
num_sst_read = 0;
}
prev_level = fp.GetHitFileLevel();
@ -2347,11 +2348,19 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
}
// Dump stats for most recent level
RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
if (num_filter_read + num_index_read) {
RecordInHistogram(db_statistics_,
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
}
if (num_sst_read) {
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
num_level_read++;
}
if (num_level_read) {
RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
num_level_read);
}
if (s.ok() && !blob_rqs.empty()) {
MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs);

@ -958,7 +958,7 @@ class Version {
int hit_file_level, bool is_hit_file_last_in_level, FdWithKeyRange* f,
std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
uint64_t& num_filter_read, uint64_t& num_index_read,
uint64_t& num_data_read, uint64_t& num_sst_read);
uint64_t& num_sst_read);
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
Logger* info_log_;

@ -15,8 +15,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
bool is_hit_file_last_in_level, FdWithKeyRange* f,
std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_data_read,
uint64_t& num_sst_read) {
uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_sst_read) {
bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
get_perf_context()->per_level_perf_context_enabled;
@ -63,12 +62,10 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
batch_size++;
num_index_read += get_context.get_context_stats_.num_index_read;
num_filter_read += get_context.get_context_stats_.num_filter_read;
num_data_read += get_context.get_context_stats_.num_data_read;
num_sst_read += get_context.get_context_stats_.num_sst_read;
// Reset these stats since they're specific to a level
get_context.get_context_stats_.num_index_read = 0;
get_context.get_context_stats_.num_filter_read = 0;
get_context.get_context_stats_.num_data_read = 0;
get_context.get_context_stats_.num_sst_read = 0;
// report the counters before returning

@ -432,6 +432,7 @@ enum Tickers : uint32_t {
NON_LAST_LEVEL_READ_COUNT,
BLOCK_CHECKSUM_COMPUTE_COUNT,
MULTIGET_COROUTINE_COUNT,
TICKER_ENUM_MAX
};
@ -529,6 +530,7 @@ enum Histograms : uint32_t {
// Num of index and filter blocks read from file system per level.
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
// Num of data blocks read from file system per level.
// Obsolete
NUM_DATA_BLOCKS_READ_PER_LEVEL,
// Num of sst files read from file system per level.
NUM_SST_READ_PER_LEVEL,
@ -546,6 +548,9 @@ enum Histograms : uint32_t {
// Number of IOs issued in parallel in a MultiGet batch
MULTIGET_IO_BATCH_SIZE,
// Number of levels requiring IO for MultiGet
NUM_LEVEL_READ_PER_MULTIGET,
HISTOGRAM_ENUM_MAX,
};

@ -226,7 +226,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"},
{NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"},
{NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"},
{BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}};
{BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"},
{MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"}};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{DB_GET, "rocksdb.db.get.micros"},
@ -287,6 +288,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
{PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
{MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
{NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"},
};
std::shared_ptr<Statistics> CreateDBStatistics() {

@ -1611,9 +1611,6 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
case BlockType::kDeprecatedFilter:
++get_context->get_context_stats_.num_filter_read;
break;
case BlockType::kData:
++get_context->get_context_stats_.num_data_read;
break;
default:
break;
}
@ -1771,9 +1768,6 @@ Status BlockBasedTable::RetrieveBlock(
case BlockType::kDeprecatedFilter:
++(get_context->get_context_stats_.num_filter_read);
break;
case BlockType::kData:
++(get_context->get_context_stats_.num_data_read);
break;
default:
break;
}

@ -177,9 +177,6 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
size_t& req_idx = req_idx_for_block[valid_batch_idx];
size_t& req_offset = req_offset_for_block[valid_batch_idx];
valid_batch_idx++;
if (mget_iter->get_context) {
++(mget_iter->get_context->get_context_stats_.num_data_read);
}
FSReadRequest& req = read_reqs[req_idx];
Status s = req.status;
if (s.ok()) {

@ -53,7 +53,6 @@ struct GetContextStats {
// MultiGet stats.
uint64_t num_filter_read = 0;
uint64_t num_index_read = 0;
uint64_t num_data_read = 0;
uint64_t num_sst_read = 0;
};
