diff --git a/HISTORY.md b/HISTORY.md
index 3b07e26f8..2b10e8e04 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,9 +4,11 @@
 * Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB.
 * Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced.
 * Fix periodic_task unable to re-register the same task type, which may cause `SetOptions()` fail to update periodical_task time like: `stats_dump_period_sec`, `stats_persist_period_sec`.
+* Fixed a bug in the `rocksdb.prefetched.bytes.discarded` stat. It was counting the prefetch buffer size, rather than the actual number of bytes discarded from the buffer.
 
 ### Public API changes
 * Add `rocksdb_column_family_handle_get_id`, `rocksdb_column_family_handle_get_name` to get name, id of column family in C API
+* Add a new stat `rocksdb.async.prefetch.abort.micros` to measure time spent waiting for async prefetch reads to abort.
 
 ### Java API Changes
 * Add CompactionPriority.RoundRobin.
diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h
index 9a19b5249..d142a74b5 100644
--- a/file/file_prefetch_buffer.h
+++ b/file/file_prefetch_buffer.h
@@ -20,6 +20,7 @@
 #include "rocksdb/file_system.h"
 #include "rocksdb/options.h"
 #include "util/aligned_buffer.h"
+#include "util/stop_watch.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -98,17 +99,47 @@ class FilePrefetchBuffer {
     if (async_read_in_progress_ && fs_ != nullptr) {
       std::vector<void*> handles;
       handles.emplace_back(io_handle_);
+      StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
       Status s = fs_->AbortIO(handles);
       assert(s.ok());
     }
 
     // Prefetch buffer bytes discarded.
     uint64_t bytes_discarded = 0;
-    if (bufs_[curr_].buffer_.CurrentSize() != 0) {
-      bytes_discarded = bufs_[curr_].buffer_.CurrentSize();
-    }
-    if (bufs_[curr_ ^ 1].buffer_.CurrentSize() != 0) {
-      bytes_discarded += bufs_[curr_ ^ 1].buffer_.CurrentSize();
+    // Iterate over 2 buffers.
+    for (int i = 0; i < 2; i++) {
+      int first = i;
+      int second = i ^ 1;
+
+      if (bufs_[first].buffer_.CurrentSize() > 0) {
+        // If last block was read completely from first and some bytes in
+        // first buffer are still unconsumed.
+        if (prev_offset_ >= bufs_[first].offset_ &&
+            prev_offset_ + prev_len_ <
+                bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) {
+          bytes_discarded += bufs_[first].buffer_.CurrentSize() -
+                             (prev_offset_ + prev_len_ - bufs_[first].offset_);
+        }
+        // If data was in second buffer and some/whole block bytes were read
+        // from second buffer.
+        else if (prev_offset_ < bufs_[first].offset_ &&
+                 bufs_[second].buffer_.CurrentSize() > 0) {
+          // If last block read was completely from different buffer, this
+          // buffer is unconsumed.
+          if (prev_offset_ + prev_len_ <= bufs_[first].offset_) {
+            bytes_discarded += bufs_[first].buffer_.CurrentSize();
+          }
+          // If last block read overlaps with this buffer and some data is
+          // still unconsumed and previous buffer (second) is not cleared.
+          else if (prev_offset_ + prev_len_ > bufs_[first].offset_ &&
+                   bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() ==
+                       bufs_[second].offset_) {
+            bytes_discarded += bufs_[first].buffer_.CurrentSize() -
+                               (/*bytes read from this buffer=*/prev_len_ -
+                                (bufs_[first].offset_ - prev_offset_));
+          }
+        }
+      }
     }
 
     RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 88332dc32..2317108cd 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -565,6 +565,9 @@ enum Histograms : uint32_t {
   // Number of levels requiring IO for MultiGet
   NUM_LEVEL_READ_PER_MULTIGET,
 
+  // Wait time for aborting async read in FilePrefetchBuffer destructor
+  ASYNC_PREFETCH_ABORT_MICROS,
+
   HISTOGRAM_ENUM_MAX,
 };
 
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index 40615e284..28c341949 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5623,6 +5623,10 @@ class HistogramTypeJni {
         return 0x35;
       case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE:
         return 0x36;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET:
+        return 0x37;
+      case ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS:
+        return 0x38;
       case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
         // 0x1F for backwards compatibility on current minor version.
         return 0x1F;
@@ -5748,6 +5752,10 @@ class HistogramTypeJni {
         return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED;
       case 0x36:
         return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE;
+      case 0x37:
+        return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET;
+      case 0x38:
+        return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS;
       case 0x1F:
         // 0x1F for backwards compatibility on current minor version.
         return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 55ec19410..cbb96b295 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -295,6 +295,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
     {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
     {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
    {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"},
+    {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
 };
 
 std::shared_ptr<Statistics> CreateDBStatistics() {