Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)

Summary:
**Context/Summary:**
We are adding new stats to measure the prefetched tail size and the lookups into this buffer

The stat collection is done in FilePrefetchBuffer but only for the prefetched tail buffer during table open for now, using a FilePrefetchBuffer enum. It's cleaner than the alternative of implementing it in the upper-level call sites of FilePrefetchBuffer for table open. It also has the benefit of being extensible to other types of FilePrefetchBuffer if needed. See db bench below for the perf regression concern.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265

Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3  -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**

SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360  -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true

  | seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-change | 4776 (± 28) ops/sec;   24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec;   74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec;   75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec;   24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec;   74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec;   74.5 (± 1.6) MB/sec

async io = 1, direct io read = true
  | seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-change | 3350 (± 36) ops/sec;   17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec;   68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec;   71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec;   17.4 (± 0.1) MB/sec  | 263 (± 2) ops/sec;   68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec;   72.6 (± 0.6) MB/sec

Reviewed By: ajkr

Differential Revision: D43781467

Pulled By: hx235

fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
oxigraph-8.1.1
Hui Xiao 2 years ago committed by Facebook GitHub Bot
parent 601efe3cf2
commit bab5f9a6f2
  1. 1
      HISTORY.md
  2. 35
      file/file_prefetch_buffer.cc
  3. 35
      file/file_prefetch_buffer.h
  4. 23
      file/prefetch_test.cc
  5. 13
      include/rocksdb/statistics.h
  6. 13
      java/rocksjni/portal.h
  7. 7
      java/src/main/java/org/rocksdb/HistogramType.java
  8. 14
      java/src/main/java/org/rocksdb/TickerType.java
  9. 7
      monitoring/statistics.cc
  10. 13
      table/block_based/block_based_table_reader.cc
  11. 2
      table/block_based/block_based_table_reader.h

@ -12,6 +12,7 @@
### New Features
* Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.data.hits
* Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries.
* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit}
## 8.0.0 (02/19/2023)
### Behavior changes

@ -162,6 +162,9 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len,
rounddown_offset, curr_);
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) {
RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len);
}
return s;
}
@ -609,6 +612,22 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
Slice* result, Status* status,
Env::IOPriority rate_limiter_priority,
bool for_compaction /* = false */) {
bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status,
rate_limiter_priority, for_compaction);
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) {
if (ret) {
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT);
} else {
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS);
}
}
return ret;
}
bool FilePrefetchBuffer::TryReadFromCacheUntracked(
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t n, Slice* result, Status* status,
Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) {
if (track_min_offset_ && offset < min_offset_read_) {
min_offset_read_ = static_cast<size_t>(offset);
}
@ -666,6 +685,22 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t n, Slice* result, Status* status,
Env::IOPriority rate_limiter_priority) {
bool ret = TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result,
status, rate_limiter_priority);
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) {
if (ret) {
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT);
} else {
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS);
}
}
return ret;
}
bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked(
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t n, Slice* result, Status* status,
Env::IOPriority rate_limiter_priority) {
if (track_min_offset_ && offset < min_offset_read_) {
min_offset_read_ = static_cast<size_t>(offset);
}

@ -54,6 +54,11 @@ struct BufferInfo {
uint32_t pos_ = 0;
};
// Identifies what a FilePrefetchBuffer instance is used for, so that
// usage-specific statistics can be recorded inside the buffer itself
// (e.g. TABLE_OPEN_PREFETCH_TAIL_READ_BYTES / _HIT / _MISS are only
// recorded when usage_ == kTableOpenPrefetchTail).
enum class FilePrefetchBufferUsage {
kTableOpenPrefetchTail,
kUnknown,
};
// FilePrefetchBuffer is a smart buffer to store and read data from a file.
class FilePrefetchBuffer {
public:
@ -78,13 +83,13 @@ class FilePrefetchBuffer {
// and max_readahead_size are passed in.
// A user can construct a FilePrefetchBuffer without any arguments, but use
// `Prefetch` to load data into the buffer.
FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0,
FilePrefetchBuffer(
size_t readahead_size = 0, size_t max_readahead_size = 0,
bool enable = true, bool track_min_offset = false,
bool implicit_auto_readahead = false,
uint64_t num_file_reads = 0,
uint64_t num_file_reads_for_auto_readahead = 0,
FileSystem* fs = nullptr, SystemClock* clock = nullptr,
Statistics* stats = nullptr)
bool implicit_auto_readahead = false, uint64_t num_file_reads = 0,
uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr,
SystemClock* clock = nullptr, Statistics* stats = nullptr,
FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown)
: curr_(0),
readahead_size_(readahead_size),
initial_auto_readahead_size_(readahead_size),
@ -100,7 +105,8 @@ class FilePrefetchBuffer {
explicit_prefetch_submitted_(false),
fs_(fs),
clock_(clock),
stats_(stats) {
stats_(stats),
usage_(usage) {
assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
(num_file_reads_ == 0));
// If ReadOptions.async_io is enabled, data is asynchronously filled in
@ -403,6 +409,19 @@ class FilePrefetchBuffer {
bool& copy_to_third_buffer, uint64_t& tmp_offset,
size_t& tmp_length);
bool TryReadFromCacheUntracked(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t n, Slice* result,
Status* s,
Env::IOPriority rate_limiter_priority,
bool for_compaction = false);
bool TryReadFromCacheAsyncUntracked(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t n, Slice* result,
Status* status,
Env::IOPriority rate_limiter_priority);
std::vector<BufferInfo> bufs_;
// curr_ represents the index for bufs_ indicating which buffer is being
// consumed currently.
@ -442,5 +461,7 @@ class FilePrefetchBuffer {
FileSystem* fs_;
SystemClock* clock_;
Statistics* stats_;
FilePrefetchBufferUsage usage_;
};
} // namespace ROCKSDB_NAMESPACE

@ -125,6 +125,7 @@ TEST_P(PrefetchTest, Basic) {
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
Options options;
SetGenericOptions(env.get(), use_direct_io, options);
options.statistics = CreateDBStatistics();
const int kNumKeys = 1100;
int buff_prefetch_count = 0;
@ -167,9 +168,25 @@ TEST_P(PrefetchTest, Basic) {
Slice least(start_key.data(), start_key.size());
Slice greatest(end_key.data(), end_key.size());
HistogramData prev_table_open_prefetch_tail_read;
options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
&prev_table_open_prefetch_tail_read);
const uint64_t prev_table_open_prefetch_tail_miss =
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS);
const uint64_t prev_table_open_prefetch_tail_hit =
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
// Compact the whole key range; reopening the output tables exercises the
// table-open prefetch-tail stats checked below.
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
HistogramData cur_table_open_prefetch_tail_read;
options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
&cur_table_open_prefetch_tail_read);
const uint64_t cur_table_open_prefetch_tail_miss =
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS);
const uint64_t cur_table_open_prefetch_tail_hit =
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
if (support_prefetch && !use_direct_io) {
// If underline file system supports prefetch, and directIO is not enabled
// make sure prefetch() is called and FilePrefetchBuffer is not used.
@ -182,6 +199,12 @@ TEST_P(PrefetchTest, Basic) {
// used.
ASSERT_FALSE(fs->IsPrefetchCalled());
ASSERT_GT(buff_prefetch_count, 0);
ASSERT_GT(cur_table_open_prefetch_tail_read.count,
prev_table_open_prefetch_tail_read.count);
ASSERT_GT(cur_table_open_prefetch_tail_hit,
prev_table_open_prefetch_tail_hit);
ASSERT_GE(cur_table_open_prefetch_tail_miss,
prev_table_open_prefetch_tail_miss);
buff_prefetch_count = 0;
}

@ -420,6 +420,15 @@ enum Tickers : uint32_t {
SECONDARY_CACHE_INDEX_HITS,
SECONDARY_CACHE_DATA_HITS,
// Number of lookup into the prefetched tail (see
// `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
// that can't find its data for table open
TABLE_OPEN_PREFETCH_TAIL_MISS,
// Number of lookup into the prefetched tail (see
// `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
// that finds its data for table open
TABLE_OPEN_PREFETCH_TAIL_HIT,
TICKER_ENUM_MAX
};
@ -533,6 +542,10 @@ enum Histograms : uint32_t {
// Wait time for aborting async read in FilePrefetchBuffer destructor
ASYNC_PREFETCH_ABORT_MICROS,
// Number of bytes read for RocksDB's prefetching contents (as opposed to file
// system's prefetch) from the end of SST table during block based table open
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
HISTOGRAM_ENUM_MAX
};

@ -5125,6 +5125,10 @@ class TickerTypeJni {
return -0x38;
case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS:
return -0x39;
case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS:
return -0x3A;
case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT:
return -0x3B;
case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
// 0x5F was the max value in the initial copy of tickers to Java.
// Since these values are exposed directly to Java clients, we keep
@ -5482,6 +5486,10 @@ class TickerTypeJni {
return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS;
case -0x39:
return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS;
case -0x3A:
return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS;
case -0x3B:
return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT;
case 0x5F:
// 0x5F was the max value in the initial copy of tickers to Java.
// Since these values are exposed directly to Java clients, we keep
@ -5609,6 +5617,8 @@ class HistogramTypeJni {
return 0x37;
case ASYNC_PREFETCH_ABORT_MICROS:
return 0x38;
case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES:
return 0x39;
case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
// 0x1F for backwards compatibility on current minor version.
return 0x1F;
@ -5725,6 +5735,9 @@ class HistogramTypeJni {
return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET;
case 0x38:
return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS;
case 0x39:
return ROCKSDB_NAMESPACE::Histograms::
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES;
case 0x1F:
// 0x1F for backwards compatibility on current minor version.
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;

@ -162,6 +162,13 @@ public enum HistogramType {
ASYNC_READ_BYTES((byte) 0x33),
/**
* Number of bytes read for RocksDB's prefetching contents
* (as opposed to file system's prefetch)
* from the end of SST table during block based table open
*/
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39),
// 0x1F for backwards compatibility on current minor version.
HISTOGRAM_ENUM_MAX((byte) 0x1F);

@ -740,6 +740,20 @@ public enum TickerType {
*/
BLOB_DB_CACHE_BYTES_WRITE((byte) -0x34),
/**
* Number of lookup into the prefetched tail (see
* `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
* that can't find its data for table open
*/
TABLE_OPEN_PREFETCH_TAIL_MISS((byte) -0x3A),
/**
* Number of lookup into the prefetched tail (see
* `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
* that finds its data for table open
*/
TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x3B),
TICKER_ENUM_MAX((byte) 0x5F);
private final byte value;

@ -216,7 +216,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"},
{SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"},
{SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"},
{SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}};
{SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"},
{TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"},
{TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"},
};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{DB_GET, "rocksdb.db.get.micros"},
@ -272,6 +275,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
{NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"},
{ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
{TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
"rocksdb.table.open.prefetch.tail.read.bytes"},
};
std::shared_ptr<Statistics> CreateDBStatistics() {

@ -591,7 +591,7 @@ Status BlockBasedTable::Open(
if (!ioptions.allow_mmap_reads) {
s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
tail_prefetch_stats, prefetch_all, preload_all,
&prefetch_buffer);
&prefetch_buffer, ioptions.stats);
// Return error in prefetch path to users.
if (!s.ok()) {
return s;
@ -802,7 +802,7 @@ Status BlockBasedTable::PrefetchTail(
const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
const bool prefetch_all, const bool preload_all,
std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats) {
size_t tail_prefetch_size = 0;
if (tail_prefetch_stats != nullptr) {
// Multiple threads may get a 0 (no history) when running in parallel,
@ -842,9 +842,12 @@ Status BlockBasedTable::PrefetchTail(
}
// Use `FilePrefetchBuffer`
prefetch_buffer->reset(
new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */,
true /* enable */, true /* track_min_offset */));
prefetch_buffer->reset(new FilePrefetchBuffer(
0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
true /* track_min_offset */, false /* implicit_auto_readahead */,
0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */,
nullptr /* fs */, nullptr /* clock */, stats,
FilePrefetchBufferUsage::kTableOpenPrefetchTail));
IOOptions opts;
Status s = file->PrepareIOOptions(ro, opts);

@ -444,7 +444,7 @@ class BlockBasedTable : public TableReader {
const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
const bool prefetch_all, const bool preload_all,
std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats);
Status ReadMetaIndexBlock(const ReadOptions& ro,
FilePrefetchBuffer* prefetch_buffer,
std::unique_ptr<Block>* metaindex_block,

Loading…
Cancel
Save