diff --git a/HISTORY.md b/HISTORY.md index d2f4762f8..0c2fd8aca 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,7 @@ ### New Features * Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.filter.hits * Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries. +* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} ## 8.0.0 (02/19/2023) ### Behavior changes diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index d16b937df..2fab9c917 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -162,6 +162,9 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len, rounddown_offset, curr_); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) { + RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len); + } return s; } @@ -609,6 +612,22 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, Slice* result, Status* status, Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) { + bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status, + rate_limiter_priority, for_compaction); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -666,6 +685,22 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync( const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t n, Slice* result, Status* status, Env::IOPriority rate_limiter_priority) { + bool ret = TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result, + status, rate_limiter_priority); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index a4a75fe2b..304d1c3bb 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -54,6 +54,11 @@ struct BufferInfo { uint32_t pos_ = 0; }; +enum class FilePrefetchBufferUsage { + kTableOpenPrefetchTail, + kUnknown, +}; + // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: @@ -78,13 +83,13 @@ class FilePrefetchBuffer { // and max_readahead_size are passed in. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. - FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false, - bool implicit_auto_readahead = false, - uint64_t num_file_reads = 0, - uint64_t num_file_reads_for_auto_readahead = 0, - FileSystem* fs = nullptr, SystemClock* clock = nullptr, - Statistics* stats = nullptr) + FilePrefetchBuffer( + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false, uint64_t num_file_reads = 0, + uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr, + SystemClock* clock = nullptr, Statistics* stats = nullptr, + FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) : curr_(0), readahead_size_(readahead_size), initial_auto_readahead_size_(readahead_size), @@ -100,7 +105,8 @@ class FilePrefetchBuffer { explicit_prefetch_submitted_(false), fs_(fs), clock_(clock), - stats_(stats) { + stats_(stats), + usage_(usage) { assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) || (num_file_reads_ == 0)); // If ReadOptions.async_io is enabled, data is asynchronously filled in @@ -403,6 +409,19 @@ class FilePrefetchBuffer { bool& copy_to_third_buffer, uint64_t& tmp_offset, size_t& tmp_length); + bool TryReadFromCacheUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* s, + Env::IOPriority rate_limiter_priority, + bool for_compaction = false); + + bool TryReadFromCacheAsyncUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* status, + Env::IOPriority rate_limiter_priority); + std::vector bufs_; // curr_ represents the index for bufs_ indicating which buffer is being // consumed currently. @@ -442,5 +461,7 @@ class FilePrefetchBuffer { FileSystem* fs_; SystemClock* clock_; Statistics* stats_; + + FilePrefetchBufferUsage usage_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index f1ae762c5..96abe598b 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -125,6 +125,7 @@ TEST_P(PrefetchTest, Basic) { std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options; SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); const int kNumKeys = 1100; int buff_prefetch_count = 0; @@ -167,9 +168,25 @@ TEST_P(PrefetchTest, Basic) { Slice least(start_key.data(), start_key.size()); Slice greatest(end_key.data(), end_key.size()); + HistogramData prev_table_open_prefetch_tail_read; + options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + &prev_table_open_prefetch_tail_read); + const uint64_t prev_table_open_prefetch_tail_miss = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS); + const uint64_t prev_table_open_prefetch_tail_hit = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + // commenting out the line below causes the example to work correctly ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + HistogramData cur_table_open_prefetch_tail_read; + options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + &cur_table_open_prefetch_tail_read); + const uint64_t cur_table_open_prefetch_tail_miss = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS); + const uint64_t cur_table_open_prefetch_tail_hit = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + if (support_prefetch && !use_direct_io) { // If underline file system supports prefetch, and directIO is not enabled // make sure prefetch() is called and FilePrefetchBuffer is not used. @@ -182,6 +199,12 @@ TEST_P(PrefetchTest, Basic) { // used. ASSERT_FALSE(fs->IsPrefetchCalled()); ASSERT_GT(buff_prefetch_count, 0); + ASSERT_GT(cur_table_open_prefetch_tail_read.count, + prev_table_open_prefetch_tail_read.count); + ASSERT_GT(cur_table_open_prefetch_tail_hit, + prev_table_open_prefetch_tail_hit); + ASSERT_GE(cur_table_open_prefetch_tail_miss, + prev_table_open_prefetch_tail_miss); buff_prefetch_count = 0; } diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index ce550f5bc..c10c67919 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -420,6 +420,15 @@ enum Tickers : uint32_t { SECONDARY_CACHE_INDEX_HITS, SECONDARY_CACHE_DATA_HITS, + // Number of lookup into the prefetched tail (see + // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + // that can't find its data for table open + TABLE_OPEN_PREFETCH_TAIL_MISS, + // Number of lookup into the prefetched tail (see + // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + // that finds its data for table open + TABLE_OPEN_PREFETCH_TAIL_HIT, + TICKER_ENUM_MAX }; @@ -533,6 +542,10 @@ enum Histograms : uint32_t { // Wait time for aborting async read in FilePrefetchBuffer destructor ASYNC_PREFETCH_ABORT_MICROS, + // Number of bytes read for RocksDB's prefetching contents (as opposed to file + // system's prefetch) from the end of SST table during block based table open + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + HISTOGRAM_ENUM_MAX }; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 9d36ec863..ee87f8947 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5125,6 +5125,10 @@ class TickerTypeJni { return -0x38; case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS: return -0x39; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS: + return -0x3A; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: + return -0x3B; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5482,6 +5486,10 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS; case -0x39: return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS; + case -0x3A: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; + case -0x3B: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; case 0x5F: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5609,6 +5617,8 @@ class HistogramTypeJni { return 0x37; case ASYNC_PREFETCH_ABORT_MICROS: return 0x38; + case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: + return 0x39; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5725,6 +5735,9 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET; case 0x38: return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS; + case 0x39: + return ROCKSDB_NAMESPACE::Histograms:: + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 1f8b7a78a..20c54422c 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -162,6 +162,13 @@ public enum HistogramType { ASYNC_READ_BYTES((byte) 0x33), + /** + * Number of bytes read for RocksDB's prefetching contents + * (as opposed to file system's prefetch) + * from the end of SST table during block based table open + */ + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 797fb9895..98e3043c6 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -740,6 +740,20 @@ public enum TickerType { */ BLOB_DB_CACHE_BYTES_WRITE((byte) -0x34), + /** + * Number of lookup into the prefetched tail (see + * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + * that can't find its data for table open + */ + TABLE_OPEN_PREFETCH_TAIL_MISS((byte) -0x3A), + + /** + * Number of lookup into the prefetched tail (see + * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + * that finds its data for table open + */ + TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x3B), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 476694746..206372c7c 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -216,7 +216,10 @@ const std::vector> TickersNameMap = { {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}, {SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"}, {SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"}, - {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}}; + {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, + {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, + {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, +}; const std::vector> HistogramsNameMap = { {DB_GET, "rocksdb.db.get.micros"}, @@ -272,6 +275,8 @@ const std::vector> HistogramsNameMap = { {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"}, + {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + "rocksdb.table.open.prefetch.tail.read.bytes"}, }; std::shared_ptr CreateDBStatistics() { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index eff4df56d..47bf2bee6 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -591,7 +591,7 @@ Status BlockBasedTable::Open( if (!ioptions.allow_mmap_reads) { s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, - &prefetch_buffer); + &prefetch_buffer, ioptions.stats); // Return error in prefetch path to users. if (!s.ok()) { return s; @@ -802,7 +802,7 @@ Status BlockBasedTable::PrefetchTail( const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer) { + std::unique_ptr* prefetch_buffer, Statistics* stats) { size_t tail_prefetch_size = 0; if (tail_prefetch_stats != nullptr) { // Multiple threads may get a 0 (no history) when running in parallel, @@ -842,9 +842,12 @@ Status BlockBasedTable::PrefetchTail( } // Use `FilePrefetchBuffer` - prefetch_buffer->reset( - new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */, - true /* enable */, true /* track_min_offset */)); + prefetch_buffer->reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + true /* track_min_offset */, false /* implicit_auto_readahead */, + 0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */, + nullptr /* fs */, nullptr /* clock */, stats, + FilePrefetchBufferUsage::kTableOpenPrefetchTail)); IOOptions opts; Status s = file->PrepareIOOptions(ro, opts); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 34eeffc88..9d384edc6 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -444,7 +444,7 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer); + std::unique_ptr* prefetch_buffer, Statistics* stats); Status ReadMetaIndexBlock(const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block,