diff --git a/db/db_impl.cc b/db/db_impl.cc index 617113ce3..411736cb1 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3322,7 +3322,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (context.schedule_bg_work_) { MaybeScheduleFlushOrCompaction(); } + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_delay_time); status = DelayWrite(expiration_time); + PERF_TIMER_START(write_pre_and_post_process_time); } if (UNLIKELY(status.ok() && has_timeout && diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index e06ee1773..009ad9117 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -9,7 +9,11 @@ #include #include +#include "rocksdb/perf_level.h" + // A thread local context for gathering io-stats efficiently and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + namespace rocksdb { struct IOStatsContext { @@ -25,6 +29,18 @@ struct IOStatsContext { uint64_t bytes_written; // number of bytes that has been read. uint64_t bytes_read; + + // time spent in open() and fopen(). + uint64_t open_nanos; + // time spent in fallocate(). + uint64_t allocate_nanos; + // time spent in write() and pwrite(). + uint64_t write_nanos; + // time spent in sync_file_range(). + uint64_t range_sync_nanos; + + // time spent in Logger::Logv(). + uint64_t logger_nanos; }; #ifndef IOS_CROSS_COMPILE diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 3b8145a94..b55482fc8 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -9,22 +9,13 @@ #include #include -namespace rocksdb { - -enum PerfLevel { - kDisable = 0, // disable perf stats - kEnableCount = 1, // enable only count stats - kEnableTime = 2 // enable time stats too -}; +#include "rocksdb/perf_level.h" -// set the perf stats level -void SetPerfLevel(PerfLevel level); - -// get current perf stats level -PerfLevel GetPerfLevel(); +namespace rocksdb { // A thread local context for gathering performance counter efficiently // and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. struct PerfContext { @@ -64,11 +55,16 @@ struct PerfContext { uint64_t seek_internal_seek_time; // total time spent on iterating internal entries to find the next user entry uint64_t find_next_user_entry_time; - // total time spent on pre or post processing when writing a record - uint64_t write_pre_and_post_process_time; - uint64_t write_wal_time; // total time spent on writing to WAL + + // total time spent on writing to WAL + uint64_t write_wal_time; // total time spent on writing to mem tables uint64_t write_memtable_time; + // total time spent on delaying write + uint64_t write_delay_time; + // total time spent on writing a record, excluding the above three times + uint64_t write_pre_and_post_process_time; + uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex. // Time spent on waiting with a condition variable created with DB mutex. uint64_t db_condition_wait_nanos; diff --git a/include/rocksdb/perf_level.h b/include/rocksdb/perf_level.h new file mode 100644 index 000000000..fee8ce1c4 --- /dev/null +++ b/include/rocksdb/perf_level.h @@ -0,0 +1,30 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_ +#define INCLUDE_ROCKSDB_PERF_LEVEL_H_ + +#include +#include + +namespace rocksdb { + +// How much perf stats to collect. Affects perf_context and iostats_context. + +enum PerfLevel { + kDisable = 0, // disable perf stats + kEnableCount = 1, // enable only count stats + kEnableTime = 2 // enable time stats too +}; + +// set the perf stats level for current thread +void SetPerfLevel(PerfLevel level); + +// get current perf stats level for current thread +PerfLevel GetPerfLevel(); + +} // namespace rocksdb + +#endif // INCLUDE_ROCKSDB_PERF_LEVEL_H_ diff --git a/src.mk b/src.mk index 6efcb67db..f277a85dc 100644 --- a/src.mk +++ b/src.mk @@ -126,6 +126,7 @@ LIB_SOURCES = \ util/options.cc \ util/options_helper.cc \ util/perf_context.cc \ + util/perf_level.cc \ util/rate_limiter.cc \ util/skiplistrep.cc \ util/slice.cc \ diff --git a/util/env_posix.cc b/util/env_posix.cc index c8f50720b..1be1d68b8 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -439,14 +439,17 @@ class PosixMmapFile : public WritableFile { TEST_KILL_RANDOM(rocksdb_kill_odds); // we can't fallocate with FALLOC_FL_KEEP_SIZE here - int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); - if (alloc_status != 0) { - // fallback to posix_fallocate - alloc_status = posix_fallocate(fd_, file_offset_, map_size_); - } - if (alloc_status != 0) { - return Status::IOError("Error allocating space to file : " + filename_ + - "Error : " + strerror(alloc_status)); + { + IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } + if (alloc_status != 0) { + return Status::IOError("Error allocating space to file : " + filename_ + + "Error : " + strerror(alloc_status)); + } } TEST_KILL_RANDOM(rocksdb_kill_odds); @@ -635,6 +638,7 @@ class PosixMmapFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); + IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { @@ -721,7 +725,12 @@ class PosixWritableFile : public WritableFile { cursize_ += left; } else { while (left != 0) { - ssize_t done = write(fd_, src, RequestToken(left)); + ssize_t done; + size_t size = RequestToken(left); + { + IOSTATS_TIMER_GUARD(write_nanos); + done = write(fd_, src, size); + } if (done < 0) { if (errno == EINTR) { continue; @@ -769,6 +778,7 @@ class PosixWritableFile : public WritableFile { // tmpfs (since Linux 3.5) // We ignore error since failure of this operation does not affect // correctness. + IOSTATS_TIMER_GUARD(allocate_nanos); fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, block_size * last_allocated_block - filesize_); #endif @@ -787,7 +797,12 @@ class PosixWritableFile : public WritableFile { size_t left = cursize_; char* src = buf_.get(); while (left != 0) { - ssize_t done = write(fd_, src, RequestToken(left)); + ssize_t done; + size_t size = RequestToken(left); + { + IOSTATS_TIMER_GUARD(write_nanos); + done = write(fd_, src, size); + } if (done < 0) { if (errno == EINTR) { continue; @@ -861,7 +876,9 @@ class PosixWritableFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); - int alloc_status = fallocate( + int alloc_status; + IOSTATS_TIMER_GUARD(allocate_nanos); + alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { return Status::OK(); @@ -871,6 +888,7 @@ class PosixWritableFile : public WritableFile { } virtual Status RangeSync(off_t offset, off_t nbytes) override { + IOSTATS_TIMER_GUARD(range_sync_nanos); if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { return Status::OK(); } else { @@ -929,7 +947,11 @@ class PosixRandomRWFile : public RandomRWFile { pending_fsync_ = true; while (left != 0) { - ssize_t done = pwrite(fd_, src, left, offset); + ssize_t done; + { + IOSTATS_TIMER_GUARD(write_nanos); + done = pwrite(fd_, src, left, offset); + } if (done < 0) { if (errno == EINTR) { continue; @@ -1001,6 +1023,7 @@ class PosixRandomRWFile : public RandomRWFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); + IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { @@ -1109,6 +1132,7 @@ class PosixEnv : public Env { result->reset(); FILE* f = nullptr; do { + IOSTATS_TIMER_GUARD(open_nanos); f = fopen(fname.c_str(), "r"); } while (f == nullptr && errno == EINTR); if (f == nullptr) { @@ -1127,7 +1151,11 @@ class PosixEnv : public Env { const EnvOptions& options) override { result->reset(); Status s; - int fd = open(fname.c_str(), O_RDONLY); + int fd; + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), O_RDONLY); + } SetFD_CLOEXEC(fd, &options); if (fd < 0) { s = IOError(fname, errno); @@ -1160,6 +1188,7 @@ class PosixEnv : public Env { Status s; int fd = -1; do { + IOSTATS_TIMER_GUARD(open_nanos); fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); } while (fd < 0 && errno == EINTR); if (fd < 0) { @@ -1200,7 +1229,11 @@ class PosixEnv : public Env { return Status::NotSupported("No support for mmap read/write yet"); } Status s; - const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); + int fd; + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); + } if (fd < 0) { s = IOError(fname, errno); } else { @@ -1213,7 +1246,11 @@ class PosixEnv : public Env { virtual Status NewDirectory(const std::string& name, unique_ptr* result) override { result->reset(); - const int fd = open(name.c_str(), 0); + int fd; + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(name.c_str(), 0); + } if (fd < 0) { return IOError(name, errno); } else { @@ -1325,7 +1362,11 @@ class PosixEnv : public Env { virtual Status LockFile(const std::string& fname, FileLock** lock) override { *lock = nullptr; Status result; - int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + int fd; + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + } if (fd < 0) { result = IOError(fname, errno); } else if (LockOrUnlock(fname, fd, true) == -1) { @@ -1396,7 +1437,11 @@ class PosixEnv : public Env { virtual Status NewLogger(const std::string& fname, shared_ptr* result) override { - FILE* f = fopen(fname.c_str(), "w"); + FILE* f; + { + IOSTATS_TIMER_GUARD(open_nanos); + f = fopen(fname.c_str(), "w"); + } if (f == nullptr) { result->reset(); return IOError(fname, errno); diff --git a/util/iostats_context.cc b/util/iostats_context.cc index 090813abc..9daaf59d2 100644 --- a/util/iostats_context.cc +++ b/util/iostats_context.cc @@ -17,6 +17,11 @@ void IOStatsContext::Reset() { thread_pool_id = Env::Priority::TOTAL; bytes_read = 0; bytes_written = 0; + open_nanos = 0; + allocate_nanos = 0; + write_nanos = 0; + range_sync_nanos = 0; + logger_nanos = 0; } #define OUTPUT(counter) #counter << " = " << counter << ", " @@ -25,7 +30,13 @@ std::string IOStatsContext::ToString() const { std::ostringstream ss; ss << OUTPUT(thread_pool_id) << OUTPUT(bytes_read) - << OUTPUT(bytes_written); + << OUTPUT(bytes_written) + << OUTPUT(open_nanos) + << OUTPUT(allocate_nanos) + << OUTPUT(write_nanos) + << OUTPUT(range_sync_nanos) + << OUTPUT(logger_nanos); + return ss.str(); } diff --git a/util/iostats_context_imp.h b/util/iostats_context_imp.h index b271ddf14..bc7e768ca 100644 --- a/util/iostats_context_imp.h +++ b/util/iostats_context_imp.h @@ -5,6 +5,7 @@ // #pragma once #include "rocksdb/iostats_context.h" +#include "util/perf_step_timer.h" #ifndef IOS_CROSS_COMPILE @@ -33,6 +34,18 @@ #define IOSTATS(metric) \ (iostats_context.metric) +// Stop the timer and update the metric +#define IOSTATS_TIMER_STOP(metric) \ + iostats_step_timer_ ## metric.Stop(); + +#define IOSTATS_TIMER_START(metric) \ + iostats_step_timer_ ## metric.Start(); + +// Declare and set start time of the timer +#define IOSTATS_TIMER_GUARD(metric) \ + PerfStepTimer iostats_step_timer_ ## metric(&(iostats_context.metric)); \ + iostats_step_timer_ ## metric.Start(); + #else // IOS_CROSS_COMPILE #define IOSTATS_ADD(metric, value) @@ -43,4 +56,8 @@ #define IOSTATS_THREAD_POOL_ID() #define IOSTATS(metric) 0 +#define IOSTATS_TIMER_GUARD(metric) +#define IOSTATS_TIMER_STOP(metric) +#define IOSTATS_TIMER_START(metric) + #endif // IOS_CROSS_COMPILE diff --git a/util/perf_context.cc b/util/perf_context.cc index 7be9980e5..b3e9bb1b6 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -10,22 +10,12 @@ namespace rocksdb { #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) -PerfLevel perf_level = kEnableCount; // This is a dummy variable since some place references it PerfContext perf_context; #else -__thread PerfLevel perf_level = kEnableCount; __thread PerfContext perf_context; #endif -void SetPerfLevel(PerfLevel level) { - perf_level = level; -} - -PerfLevel GetPerfLevel() { - return perf_level; -} - void PerfContext::Reset() { #if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE) user_key_comparison_count = 0; @@ -53,6 +43,7 @@ void PerfContext::Reset() { find_next_user_entry_time = 0; write_pre_and_post_process_time = 0; write_memtable_time = 0; + write_delay_time = 0; db_mutex_lock_nanos = 0; db_condition_wait_nanos = 0; merge_operator_time_nanos = 0; @@ -79,7 +70,7 @@ std::string PerfContext::ToString() const { << OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time) << OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_memtable_time) << OUTPUT(db_mutex_lock_nanos) << OUTPUT(db_condition_wait_nanos) - << OUTPUT(merge_operator_time_nanos); + << OUTPUT(merge_operator_time_nanos) << OUTPUT(write_delay_time); return ss.str(); #endif } diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h index e39790105..cde7ee33d 100644 --- a/util/perf_context_imp.h +++ b/util/perf_context_imp.h @@ -5,6 +5,7 @@ // #pragma once #include "rocksdb/perf_context.h" +#include "util/perf_step_timer.h" #include "util/stop_watch.h" namespace rocksdb { @@ -19,49 +20,6 @@ namespace rocksdb { #else -extern __thread PerfLevel perf_level; - -class PerfStepTimer { - public: - PerfStepTimer(uint64_t* metric) - : enabled_(perf_level >= PerfLevel::kEnableTime), - env_(enabled_ ? Env::Default() : nullptr), - start_(0), - metric_(metric) { - } - - ~PerfStepTimer() { - Stop(); - } - - void Start() { - if (enabled_) { - start_ = env_->NowNanos(); - } - } - - void Measure() { - if (start_) { - uint64_t now = env_->NowNanos(); - *metric_ += now - start_; - start_ = now; - } - } - - void Stop() { - if (start_) { - *metric_ += env_->NowNanos() - start_; - start_ = 0; - } - } - - private: - const bool enabled_; - Env* const env_; - uint64_t start_; - uint64_t* metric_; -}; - // Stop the timer and update the metric #define PERF_TIMER_STOP(metric) \ perf_step_timer_ ## metric.Stop(); @@ -70,8 +28,8 @@ class PerfStepTimer { perf_step_timer_ ## metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD(metric) \ - PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \ +#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \ perf_step_timer_ ## metric.Start(); // Update metric with time elapsed since last START. start time is reset diff --git a/util/perf_level.cc b/util/perf_level.cc new file mode 100644 index 000000000..ae03efe67 --- /dev/null +++ b/util/perf_level.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include +#include "util/perf_level_imp.h" + +namespace rocksdb { + +#if defined(IOS_CROSS_COMPILE) +PerfLevel perf_level = kEnableCount; +#else +__thread PerfLevel perf_level = kEnableCount; +#endif + +void SetPerfLevel(PerfLevel level) { + perf_level = level; +} + +PerfLevel GetPerfLevel() { + return perf_level; +} + +} // namespace rocksdb diff --git a/util/perf_level_imp.h b/util/perf_level_imp.h new file mode 100644 index 000000000..8bc4fab71 --- /dev/null +++ b/util/perf_level_imp.h @@ -0,0 +1,17 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/perf_level.h" + +namespace rocksdb { + +#if defined(IOS_CROSS_COMPILE) +extern PerfLevel perf_level; +#else +extern __thread PerfLevel perf_level; +#endif + +} // namespace rocksdb diff --git a/util/perf_step_timer.h b/util/perf_step_timer.h new file mode 100644 index 000000000..950258345 --- /dev/null +++ b/util/perf_step_timer.h @@ -0,0 +1,54 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/env.h" +#include "util/perf_level_imp.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +class PerfStepTimer { + public: + PerfStepTimer(uint64_t* metric) + : enabled_(perf_level >= PerfLevel::kEnableTime), + env_(enabled_ ? Env::Default() : nullptr), + start_(0), + metric_(metric) { + } + + ~PerfStepTimer() { + Stop(); + } + + void Start() { + if (enabled_) { + start_ = env_->NowNanos(); + } + } + + void Measure() { + if (start_) { + uint64_t now = env_->NowNanos(); + *metric_ += now - start_; + start_ = now; + } + } + + void Stop() { + if (start_) { + *metric_ += env_->NowNanos() - start_; + start_ = 0; + } + } + + private: + const bool enabled_; + Env* const env_; + uint64_t start_; + uint64_t* metric_; +}; + +} // namespace rocksdb diff --git a/util/posix_logger.h b/util/posix_logger.h index 213a652c7..159a1b67a 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -21,6 +21,7 @@ #include #endif #include "rocksdb/env.h" +#include "util/iostats_context_imp.h" #include namespace rocksdb { @@ -61,6 +62,8 @@ class PosixLogger : public Logger { using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + const uint64_t thread_id = (*gettid_)(); // We try twice: the first time with a fixed-size stack allocated buffer,