From bab5f9a6f27aa5852817202f4523415c564e307d Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Wed, 15 Mar 2023 14:02:43 -0700 Subject: [PATCH] Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: **Context/Summary:** We are adding new stats to measure behavior of prefetched tail size and look up into this buffer The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265 Test Plan: **- Piggyback on existing test** **- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.** ``` ./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true ``` ``` rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600 rocksdb.table.open.prefetch.tail.miss COUNT : 91 rocksdb.table.open.prefetch.tail.hit COUNT : 1034 ``` **- No perf regression observed in db_bench** SETUP command: create same db with ~900 files for pre-change/post-change. ``` ./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none ``` TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing. ``` ./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true ``` async io = 0, direct io read = true | seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs -- | -- | -- | -- pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec async io = 1, direct io read = true | seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs -- | -- | -- | -- pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec Reviewed By: ajkr Differential Revision: D43781467 Pulled By: hx235 fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a --- HISTORY.md | 1 + file/file_prefetch_buffer.cc | 35 ++++++++++++++++++ file/file_prefetch_buffer.h | 37 +++++++++++++++---- file/prefetch_test.cc | 23 ++++++++++++ include/rocksdb/statistics.h | 13 +++++++ java/rocksjni/portal.h | 13 +++++++ .../main/java/org/rocksdb/HistogramType.java | 7 ++++ .../src/main/java/org/rocksdb/TickerType.java | 14 +++++++ monitoring/statistics.cc | 7 +++- table/block_based/block_based_table_reader.cc | 13 ++++--- table/block_based/block_based_table_reader.h | 2 +- 11 files changed, 150 insertions(+), 15 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d2f4762f8..0c2fd8aca 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,7 @@ ### New Features * Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.filter.hits * Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries. +* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} ## 8.0.0 (02/19/2023) ### Behavior changes diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index d16b937df..2fab9c917 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -162,6 +162,9 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len, rounddown_offset, curr_); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) { + RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len); + } return s; } @@ -609,6 +612,22 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, Slice* result, Status* status, Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) { + bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status, + rate_limiter_priority, for_compaction); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -666,6 +685,22 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync( const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t n, Slice* result, Status* status, Env::IOPriority rate_limiter_priority) { + bool ret = TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result, + status, rate_limiter_priority); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index a4a75fe2b..304d1c3bb 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -54,6 +54,11 @@ struct BufferInfo { uint32_t pos_ = 0; }; +enum class FilePrefetchBufferUsage { + kTableOpenPrefetchTail, + kUnknown, +}; + // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: @@ -78,13 +83,13 @@ class FilePrefetchBuffer { // and max_readahead_size are passed in. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. - FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false, - bool implicit_auto_readahead = false, - uint64_t num_file_reads = 0, - uint64_t num_file_reads_for_auto_readahead = 0, - FileSystem* fs = nullptr, SystemClock* clock = nullptr, - Statistics* stats = nullptr) + FilePrefetchBuffer( + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false, uint64_t num_file_reads = 0, + uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr, + SystemClock* clock = nullptr, Statistics* stats = nullptr, + FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) : curr_(0), readahead_size_(readahead_size), initial_auto_readahead_size_(readahead_size), @@ -100,7 +105,8 @@ class FilePrefetchBuffer { explicit_prefetch_submitted_(false), fs_(fs), clock_(clock), - stats_(stats) { + stats_(stats), + usage_(usage) { assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) || (num_file_reads_ == 0)); // If ReadOptions.async_io is enabled, data is asynchronously filled in @@ -403,6 +409,19 @@ class FilePrefetchBuffer { bool& copy_to_third_buffer, uint64_t& tmp_offset, size_t& tmp_length); + bool TryReadFromCacheUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* s, + Env::IOPriority rate_limiter_priority, + bool for_compaction = false); + + bool TryReadFromCacheAsyncUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* status, + Env::IOPriority rate_limiter_priority); + std::vector bufs_; // curr_ represents the index for bufs_ indicating which buffer is being // consumed currently. @@ -442,5 +461,7 @@ class FilePrefetchBuffer { FileSystem* fs_; SystemClock* clock_; Statistics* stats_; + + FilePrefetchBufferUsage usage_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index f1ae762c5..96abe598b 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -125,6 +125,7 @@ TEST_P(PrefetchTest, Basic) { std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options; SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); const int kNumKeys = 1100; int buff_prefetch_count = 0; @@ -167,9 +168,25 @@ TEST_P(PrefetchTest, Basic) { Slice least(start_key.data(), start_key.size()); Slice greatest(end_key.data(), end_key.size()); + HistogramData prev_table_open_prefetch_tail_read; + options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + &prev_table_open_prefetch_tail_read); + const uint64_t prev_table_open_prefetch_tail_miss = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS); + const uint64_t prev_table_open_prefetch_tail_hit = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + // commenting out the line below causes the example to work correctly ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + HistogramData cur_table_open_prefetch_tail_read; + options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + &cur_table_open_prefetch_tail_read); + const uint64_t cur_table_open_prefetch_tail_miss = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS); + const uint64_t cur_table_open_prefetch_tail_hit = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + if (support_prefetch && !use_direct_io) { // If underline file system supports prefetch, and directIO is not enabled // make sure prefetch() is called and FilePrefetchBuffer is not used. @@ -182,6 +199,12 @@ TEST_P(PrefetchTest, Basic) { // used. ASSERT_FALSE(fs->IsPrefetchCalled()); ASSERT_GT(buff_prefetch_count, 0); + ASSERT_GT(cur_table_open_prefetch_tail_read.count, + prev_table_open_prefetch_tail_read.count); + ASSERT_GT(cur_table_open_prefetch_tail_hit, + prev_table_open_prefetch_tail_hit); + ASSERT_GE(cur_table_open_prefetch_tail_miss, + prev_table_open_prefetch_tail_miss); buff_prefetch_count = 0; } diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index ce550f5bc..c10c67919 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -420,6 +420,15 @@ enum Tickers : uint32_t { SECONDARY_CACHE_INDEX_HITS, SECONDARY_CACHE_DATA_HITS, + // Number of lookup into the prefetched tail (see + // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + // that can't find its data for table open + TABLE_OPEN_PREFETCH_TAIL_MISS, + // Number of lookup into the prefetched tail (see + // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + // that finds its data for table open + TABLE_OPEN_PREFETCH_TAIL_HIT, + TICKER_ENUM_MAX }; @@ -533,6 +542,10 @@ enum Histograms : uint32_t { // Wait time for aborting async read in FilePrefetchBuffer destructor ASYNC_PREFETCH_ABORT_MICROS, + // Number of bytes read for RocksDB's prefetching contents (as opposed to file + // system's prefetch) from the end of SST table during block based table open + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + HISTOGRAM_ENUM_MAX }; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 9d36ec863..ee87f8947 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5125,6 +5125,10 @@ class TickerTypeJni { return -0x38; case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS: return -0x39; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS: + return -0x3A; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: + return -0x3B; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5482,6 +5486,10 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS; case -0x39: return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS; + case -0x3A: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; + case -0x3B: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; case 0x5F: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5609,6 +5617,8 @@ class HistogramTypeJni { return 0x37; case ASYNC_PREFETCH_ABORT_MICROS: return 0x38; + case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: + return 0x39; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5725,6 +5735,9 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET; case 0x38: return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS; + case 0x39: + return ROCKSDB_NAMESPACE::Histograms:: + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 1f8b7a78a..20c54422c 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -162,6 +162,13 @@ public enum HistogramType { ASYNC_READ_BYTES((byte) 0x33), + /** + * Number of bytes read for RocksDB's prefetching contents + * (as opposed to file system's prefetch) + * from the end of SST table during block based table open + */ + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 797fb9895..98e3043c6 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -740,6 +740,20 @@ public enum TickerType { */ BLOB_DB_CACHE_BYTES_WRITE((byte) -0x34), + /** + * Number of lookup into the prefetched tail (see + * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + * that can't find its data for table open + */ + TABLE_OPEN_PREFETCH_TAIL_MISS((byte) -0x3A), + + /** + * Number of lookup into the prefetched tail (see + * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + * that finds its data for table open + */ + TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x3B), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 476694746..206372c7c 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -216,7 +216,10 @@ const std::vector> TickersNameMap = { {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}, {SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"}, {SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"}, - {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}}; + {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, + {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, + {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, +}; const std::vector> HistogramsNameMap = { {DB_GET, "rocksdb.db.get.micros"}, @@ -272,6 +275,8 @@ const std::vector> HistogramsNameMap = { {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"}, + {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + "rocksdb.table.open.prefetch.tail.read.bytes"}, }; std::shared_ptr CreateDBStatistics() { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index eff4df56d..47bf2bee6 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -591,7 +591,7 @@ Status BlockBasedTable::Open( if (!ioptions.allow_mmap_reads) { s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, - &prefetch_buffer); + &prefetch_buffer, ioptions.stats); // Return error in prefetch path to users. if (!s.ok()) { return s; @@ -802,7 +802,7 @@ Status BlockBasedTable::PrefetchTail( const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer) { + std::unique_ptr* prefetch_buffer, Statistics* stats) { size_t tail_prefetch_size = 0; if (tail_prefetch_stats != nullptr) { // Multiple threads may get a 0 (no history) when running in parallel, @@ -842,9 +842,12 @@ Status BlockBasedTable::PrefetchTail( } // Use `FilePrefetchBuffer` - prefetch_buffer->reset( - new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */, - true /* enable */, true /* track_min_offset */)); + prefetch_buffer->reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + true /* track_min_offset */, false /* implicit_auto_readahead */, + 0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */, + nullptr /* fs */, nullptr /* clock */, stats, + FilePrefetchBufferUsage::kTableOpenPrefetchTail)); IOOptions opts; Status s = file->PrepareIOOptions(ro, opts); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 34eeffc88..9d384edc6 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -444,7 +444,7 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer); + std::unique_ptr* prefetch_buffer, Statistics* stats); Status ReadMetaIndexBlock(const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block,