Update statistics for async scan readaheads (#10585)

Summary:
Imported a fix to "rocksdb.prefetched.bytes.discarded" stat from https://github.com/facebook/rocksdb/issues/10561, and added a new stat "rocksdb.async.prefetch.abort.micros" to measure time spent waiting for async reads to abort.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10585

Reviewed By: akankshamahajan15

Differential Revision: D39067000

Pulled By: anand1976

fbshipit-source-id: d7cda71abb48017239bd5fd832345a16c7024faf
main
anand76 2 years ago committed by Facebook GitHub Bot
parent 3613d862ba
commit 72a3fb3424
  1. 2
      HISTORY.md
  2. 39
      file/file_prefetch_buffer.h
  3. 3
      include/rocksdb/statistics.h
  4. 8
      java/rocksjni/portal.h
  5. 1
      monitoring/statistics.cc

@@ -4,9 +4,11 @@
* Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB.
* Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced.
* Fix periodic_task unable to re-register the same task type, which may cause `SetOptions()` fail to update periodical_task time like: `stats_dump_period_sec`, `stats_persist_period_sec`.
* Fixed a bug in the rocksdb.prefetched.bytes.discarded stat. It was counting the prefetch buffer size, rather than the actual number of bytes discarded from the buffer.
### Public API changes
* Add `rocksdb_column_family_handle_get_id`, `rocksdb_column_family_handle_get_name` to get name, id of column family in C API
* Add a new stat rocksdb.async.prefetch.abort.micros to measure time spent waiting for async prefetch reads to abort
### Java API Changes
* Add CompactionPriority.RoundRobin.

@@ -20,6 +20,7 @@
#include "rocksdb/file_system.h" #include "rocksdb/file_system.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "util/aligned_buffer.h" #include "util/aligned_buffer.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@@ -98,17 +99,47 @@ class FilePrefetchBuffer {
if (async_read_in_progress_ && fs_ != nullptr) { if (async_read_in_progress_ && fs_ != nullptr) {
std::vector<void*> handles; std::vector<void*> handles;
handles.emplace_back(io_handle_); handles.emplace_back(io_handle_);
StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
Status s = fs_->AbortIO(handles); Status s = fs_->AbortIO(handles);
assert(s.ok()); assert(s.ok());
} }
// Prefetch buffer bytes discarded. // Prefetch buffer bytes discarded.
uint64_t bytes_discarded = 0; uint64_t bytes_discarded = 0;
if (bufs_[curr_].buffer_.CurrentSize() != 0) { // Iterated over 2 buffers.
bytes_discarded = bufs_[curr_].buffer_.CurrentSize(); for (int i = 0; i < 2; i++) {
int first = i;
int second = i ^ 1;
if (bufs_[first].buffer_.CurrentSize() > 0) {
// If last block was read completely from first and some bytes in
// first buffer are still unconsumed.
if (prev_offset_ >= bufs_[first].offset_ &&
prev_offset_ + prev_len_ <
bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) {
bytes_discarded += bufs_[first].buffer_.CurrentSize() -
(prev_offset_ + prev_len_ - bufs_[first].offset_);
}
// If data was in second buffer and some/whole block bytes were read
// from second buffer.
else if (prev_offset_ < bufs_[first].offset_ &&
bufs_[second].buffer_.CurrentSize() > 0) {
// If last block read was completely from different buffer, this
// buffer is unconsumed.
if (prev_offset_ + prev_len_ <= bufs_[first].offset_) {
bytes_discarded += bufs_[first].buffer_.CurrentSize();
}
// If last block read overlaps with this buffer and some data is
// still unconsumed and previous buffer (second) is not cleared.
else if (prev_offset_ + prev_len_ > bufs_[first].offset_ &&
bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() ==
bufs_[second].offset_) {
bytes_discarded += bufs_[first].buffer_.CurrentSize() -
(/*bytes read from this buffer=*/prev_len_ -
(bufs_[first].offset_ - prev_offset_));
}
}
} }
if (bufs_[curr_ ^ 1].buffer_.CurrentSize() != 0) {
bytes_discarded += bufs_[curr_ ^ 1].buffer_.CurrentSize();
} }
RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);

@@ -565,6 +565,9 @@ enum Histograms : uint32_t {
// Number of levels requiring IO for MultiGet // Number of levels requiring IO for MultiGet
NUM_LEVEL_READ_PER_MULTIGET, NUM_LEVEL_READ_PER_MULTIGET,
// Wait time for aborting async read in FilePrefetchBuffer destructor
ASYNC_PREFETCH_ABORT_MICROS,
HISTOGRAM_ENUM_MAX, HISTOGRAM_ENUM_MAX,
}; };

@@ -5623,6 +5623,10 @@ class HistogramTypeJni {
return 0x35; return 0x35;
case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE: case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE:
return 0x36; return 0x36;
case NUM_LEVEL_READ_PER_MULTIGET:
return 0x37;
case ASYNC_PREFETCH_ABORT_MICROS:
return 0x38;
case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
// 0x1F for backwards compatibility on current minor version. // 0x1F for backwards compatibility on current minor version.
return 0x1F; return 0x1F;
@@ -5748,6 +5752,10 @@ class HistogramTypeJni {
return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED; return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED;
case 0x36: case 0x36:
return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE; return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE;
case 0x37:
return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET;
case 0x38:
return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS;
case 0x1F: case 0x1F:
// 0x1F for backwards compatibility on current minor version. // 0x1F for backwards compatibility on current minor version.
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;

@@ -295,6 +295,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"}, {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
{MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
{NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"},
{ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
}; };
std::shared_ptr<Statistics> CreateDBStatistics() { std::shared_ptr<Statistics> CreateDBStatistics() {

Loading…
Cancel
Save