more times in perf_context and iostats_context

Summary:
We occasionally get write stalls (>1s Write() calls) on HDD under read load. The following timers explain almost all of the stalls:
 - perf_context.db_mutex_lock_nanos
 - perf_context.db_condition_wait_nanos
 - iostats_context.open_time
 - iostats_context.allocate_time
 - iostats_context.write_time
 - iostats_context.range_sync_time
 - iostats_context.logger_time

In my experiments each of these occasionally takes >1s on write path under some workload. There are rare cases when Write() takes long but none of these takes long.

Test Plan: Added code to our application to write the listed timings to log for slow writes. They usually add up to almost exactly the time Write() call took.

Reviewers: rven, yhchiang, sdong

Reviewed By: sdong

Subscribers: march, dhruba, tnovak

Differential Revision: https://reviews.facebook.net/D39177
main
Mike Kolupaev 10 years ago
parent 4266d4fd90
commit ec7a944360
  1. 3
      db/db_impl.cc
  2. 16
      include/rocksdb/iostats_context.h
  3. 26
      include/rocksdb/perf_context.h
  4. 30
      include/rocksdb/perf_level.h
  5. 1
      src.mk
  6. 79
      util/env_posix.cc
  7. 13
      util/iostats_context.cc
  8. 17
      util/iostats_context_imp.h
  9. 13
      util/perf_context.cc
  10. 48
      util/perf_context_imp.h
  11. 26
      util/perf_level.cc
  12. 17
      util/perf_level_imp.h
  13. 54
      util/perf_step_timer.h
  14. 3
      util/posix_logger.h

@ -3322,7 +3322,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (context.schedule_bg_work_) {
MaybeScheduleFlushOrCompaction();
}
PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time);
status = DelayWrite(expiration_time);
PERF_TIMER_START(write_pre_and_post_process_time);
}
if (UNLIKELY(status.ok() && has_timeout &&

@ -9,7 +9,11 @@
#include <stdint.h>
#include <string>
#include "rocksdb/perf_level.h"
// A thread local context for gathering io-stats efficiently and transparently.
// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
namespace rocksdb {
struct IOStatsContext {
@ -25,6 +29,18 @@ struct IOStatsContext {
uint64_t bytes_written;
// number of bytes that has been read.
uint64_t bytes_read;
// time spent in open() and fopen().
uint64_t open_nanos;
// time spent in fallocate().
uint64_t allocate_nanos;
// time spent in write() and pwrite().
uint64_t write_nanos;
// time spent in sync_file_range().
uint64_t range_sync_nanos;
// time spent in Logger::Logv().
uint64_t logger_nanos;
};
#ifndef IOS_CROSS_COMPILE

@ -9,22 +9,13 @@
#include <stdint.h>
#include <string>
namespace rocksdb {
enum PerfLevel {
kDisable = 0, // disable perf stats
kEnableCount = 1, // enable only count stats
kEnableTime = 2 // enable time stats too
};
#include "rocksdb/perf_level.h"
// set the perf stats level
void SetPerfLevel(PerfLevel level);
// get current perf stats level
PerfLevel GetPerfLevel();
namespace rocksdb {
// A thread local context for gathering performance counter efficiently
// and transparently.
// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
struct PerfContext {
@ -64,11 +55,16 @@ struct PerfContext {
uint64_t seek_internal_seek_time;
// total time spent on iterating internal entries to find the next user entry
uint64_t find_next_user_entry_time;
// total time spent on pre or post processing when writing a record
uint64_t write_pre_and_post_process_time;
uint64_t write_wal_time; // total time spent on writing to WAL
// total time spent on writing to WAL
uint64_t write_wal_time;
// total time spent on writing to mem tables
uint64_t write_memtable_time;
// total time spent on delaying write
uint64_t write_delay_time;
// total time spent on writing a record, excluding the above three times
uint64_t write_pre_and_post_process_time;
uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex.
// Time spent on waiting with a condition variable created with DB mutex.
uint64_t db_condition_wait_nanos;

@ -0,0 +1,30 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_
#define INCLUDE_ROCKSDB_PERF_LEVEL_H_
#include <stdint.h>
#include <string>
namespace rocksdb {
// How much perf stats to collect. Affects perf_context and iostats_context.
enum PerfLevel {
kDisable = 0, // disable perf stats
kEnableCount = 1, // enable only count stats
kEnableTime = 2 // enable time stats too
};
// set the perf stats level for current thread
void SetPerfLevel(PerfLevel level);
// get current perf stats level for current thread
PerfLevel GetPerfLevel();
} // namespace rocksdb
#endif // INCLUDE_ROCKSDB_PERF_LEVEL_H_

@ -126,6 +126,7 @@ LIB_SOURCES = \
util/options.cc \
util/options_helper.cc \
util/perf_context.cc \
util/perf_level.cc \
util/rate_limiter.cc \
util/skiplistrep.cc \
util/slice.cc \

@ -439,14 +439,17 @@ class PosixMmapFile : public WritableFile {
TEST_KILL_RANDOM(rocksdb_kill_odds);
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
{
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
}
}
TEST_KILL_RANDOM(rocksdb_kill_odds);
@ -635,6 +638,7 @@ class PosixMmapFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
@ -721,7 +725,12 @@ class PosixWritableFile : public WritableFile {
cursize_ += left;
} else {
while (left != 0) {
ssize_t done = write(fd_, src, RequestToken(left));
ssize_t done;
size_t size = RequestToken(left);
{
IOSTATS_TIMER_GUARD(write_nanos);
done = write(fd_, src, size);
}
if (done < 0) {
if (errno == EINTR) {
continue;
@ -769,6 +778,7 @@ class PosixWritableFile : public WritableFile {
// tmpfs (since Linux 3.5)
// We ignore error since failure of this operation does not affect
// correctness.
IOSTATS_TIMER_GUARD(allocate_nanos);
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
filesize_, block_size * last_allocated_block - filesize_);
#endif
@ -787,7 +797,12 @@ class PosixWritableFile : public WritableFile {
size_t left = cursize_;
char* src = buf_.get();
while (left != 0) {
ssize_t done = write(fd_, src, RequestToken(left));
ssize_t done;
size_t size = RequestToken(left);
{
IOSTATS_TIMER_GUARD(write_nanos);
done = write(fd_, src, size);
}
if (done < 0) {
if (errno == EINTR) {
continue;
@ -861,7 +876,9 @@ class PosixWritableFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
int alloc_status = fallocate(
int alloc_status;
IOSTATS_TIMER_GUARD(allocate_nanos);
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
return Status::OK();
@ -871,6 +888,7 @@ class PosixWritableFile : public WritableFile {
}
virtual Status RangeSync(off_t offset, off_t nbytes) override {
IOSTATS_TIMER_GUARD(range_sync_nanos);
if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
return Status::OK();
} else {
@ -929,7 +947,11 @@ class PosixRandomRWFile : public RandomRWFile {
pending_fsync_ = true;
while (left != 0) {
ssize_t done = pwrite(fd_, src, left, offset);
ssize_t done;
{
IOSTATS_TIMER_GUARD(write_nanos);
done = pwrite(fd_, src, left, offset);
}
if (done < 0) {
if (errno == EINTR) {
continue;
@ -1001,6 +1023,7 @@ class PosixRandomRWFile : public RandomRWFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
@ -1109,6 +1132,7 @@ class PosixEnv : public Env {
result->reset();
FILE* f = nullptr;
do {
IOSTATS_TIMER_GUARD(open_nanos);
f = fopen(fname.c_str(), "r");
} while (f == nullptr && errno == EINTR);
if (f == nullptr) {
@ -1127,7 +1151,11 @@ class PosixEnv : public Env {
const EnvOptions& options) override {
result->reset();
Status s;
int fd = open(fname.c_str(), O_RDONLY);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDONLY);
}
SetFD_CLOEXEC(fd, &options);
if (fd < 0) {
s = IOError(fname, errno);
@ -1160,6 +1188,7 @@ class PosixEnv : public Env {
Status s;
int fd = -1;
do {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
} while (fd < 0 && errno == EINTR);
if (fd < 0) {
@ -1200,7 +1229,11 @@ class PosixEnv : public Env {
return Status::NotSupported("No support for mmap read/write yet");
}
Status s;
const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
}
if (fd < 0) {
s = IOError(fname, errno);
} else {
@ -1213,7 +1246,11 @@ class PosixEnv : public Env {
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) override {
result->reset();
const int fd = open(name.c_str(), 0);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(name.c_str(), 0);
}
if (fd < 0) {
return IOError(name, errno);
} else {
@ -1325,7 +1362,11 @@ class PosixEnv : public Env {
virtual Status LockFile(const std::string& fname, FileLock** lock) override {
*lock = nullptr;
Status result;
int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
}
if (fd < 0) {
result = IOError(fname, errno);
} else if (LockOrUnlock(fname, fd, true) == -1) {
@ -1396,7 +1437,11 @@ class PosixEnv : public Env {
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) override {
FILE* f = fopen(fname.c_str(), "w");
FILE* f;
{
IOSTATS_TIMER_GUARD(open_nanos);
f = fopen(fname.c_str(), "w");
}
if (f == nullptr) {
result->reset();
return IOError(fname, errno);

@ -17,6 +17,11 @@ void IOStatsContext::Reset() {
thread_pool_id = Env::Priority::TOTAL;
bytes_read = 0;
bytes_written = 0;
open_nanos = 0;
allocate_nanos = 0;
write_nanos = 0;
range_sync_nanos = 0;
logger_nanos = 0;
}
#define OUTPUT(counter) #counter << " = " << counter << ", "
@ -25,7 +30,13 @@ std::string IOStatsContext::ToString() const {
std::ostringstream ss;
ss << OUTPUT(thread_pool_id)
<< OUTPUT(bytes_read)
<< OUTPUT(bytes_written);
<< OUTPUT(bytes_written)
<< OUTPUT(open_nanos)
<< OUTPUT(allocate_nanos)
<< OUTPUT(write_nanos)
<< OUTPUT(range_sync_nanos)
<< OUTPUT(logger_nanos);
return ss.str();
}

@ -5,6 +5,7 @@
//
#pragma once
#include "rocksdb/iostats_context.h"
#include "util/perf_step_timer.h"
#ifndef IOS_CROSS_COMPILE
@ -33,6 +34,18 @@
#define IOSTATS(metric) \
(iostats_context.metric)
// Stop the timer and update the metric
#define IOSTATS_TIMER_STOP(metric) \
iostats_step_timer_ ## metric.Stop();
#define IOSTATS_TIMER_START(metric) \
iostats_step_timer_ ## metric.Start();
// Declare and set start time of the timer
#define IOSTATS_TIMER_GUARD(metric) \
PerfStepTimer iostats_step_timer_ ## metric(&(iostats_context.metric)); \
iostats_step_timer_ ## metric.Start();
#else // IOS_CROSS_COMPILE
#define IOSTATS_ADD(metric, value)
@ -43,4 +56,8 @@
#define IOSTATS_THREAD_POOL_ID()
#define IOSTATS(metric) 0
#define IOSTATS_TIMER_GUARD(metric)
#define IOSTATS_TIMER_STOP(metric)
#define IOSTATS_TIMER_START(metric)
#endif // IOS_CROSS_COMPILE

@ -10,22 +10,12 @@
namespace rocksdb {
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
PerfLevel perf_level = kEnableCount;
// This is a dummy variable since some place references it
PerfContext perf_context;
#else
__thread PerfLevel perf_level = kEnableCount;
__thread PerfContext perf_context;
#endif
void SetPerfLevel(PerfLevel level) {
perf_level = level;
}
PerfLevel GetPerfLevel() {
return perf_level;
}
void PerfContext::Reset() {
#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
user_key_comparison_count = 0;
@ -53,6 +43,7 @@ void PerfContext::Reset() {
find_next_user_entry_time = 0;
write_pre_and_post_process_time = 0;
write_memtable_time = 0;
write_delay_time = 0;
db_mutex_lock_nanos = 0;
db_condition_wait_nanos = 0;
merge_operator_time_nanos = 0;
@ -79,7 +70,7 @@ std::string PerfContext::ToString() const {
<< OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time)
<< OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_memtable_time)
<< OUTPUT(db_mutex_lock_nanos) << OUTPUT(db_condition_wait_nanos)
<< OUTPUT(merge_operator_time_nanos);
<< OUTPUT(merge_operator_time_nanos) << OUTPUT(write_delay_time);
return ss.str();
#endif
}

@ -5,6 +5,7 @@
//
#pragma once
#include "rocksdb/perf_context.h"
#include "util/perf_step_timer.h"
#include "util/stop_watch.h"
namespace rocksdb {
@ -19,49 +20,6 @@ namespace rocksdb {
#else
extern __thread PerfLevel perf_level;
class PerfStepTimer {
public:
PerfStepTimer(uint64_t* metric)
: enabled_(perf_level >= PerfLevel::kEnableTime),
env_(enabled_ ? Env::Default() : nullptr),
start_(0),
metric_(metric) {
}
~PerfStepTimer() {
Stop();
}
void Start() {
if (enabled_) {
start_ = env_->NowNanos();
}
}
void Measure() {
if (start_) {
uint64_t now = env_->NowNanos();
*metric_ += now - start_;
start_ = now;
}
}
void Stop() {
if (start_) {
*metric_ += env_->NowNanos() - start_;
start_ = 0;
}
}
private:
const bool enabled_;
Env* const env_;
uint64_t start_;
uint64_t* metric_;
};
// Stop the timer and update the metric
#define PERF_TIMER_STOP(metric) \
perf_step_timer_ ## metric.Stop();
@ -70,8 +28,8 @@ class PerfStepTimer {
perf_step_timer_ ## metric.Start();
// Declare and set start time of the timer
#define PERF_TIMER_GUARD(metric) \
PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \
#define PERF_TIMER_GUARD(metric) \
PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \
perf_step_timer_ ## metric.Start();
// Update metric with time elapsed since last START. start time is reset

@ -0,0 +1,26 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include <sstream>
#include "util/perf_level_imp.h"
namespace rocksdb {
#if defined(IOS_CROSS_COMPILE)
PerfLevel perf_level = kEnableCount;
#else
__thread PerfLevel perf_level = kEnableCount;
#endif
void SetPerfLevel(PerfLevel level) {
perf_level = level;
}
PerfLevel GetPerfLevel() {
return perf_level;
}
} // namespace rocksdb

@ -0,0 +1,17 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include "rocksdb/perf_level.h"
namespace rocksdb {
#if defined(IOS_CROSS_COMPILE)
extern PerfLevel perf_level;
#else
extern __thread PerfLevel perf_level;
#endif
} // namespace rocksdb

@ -0,0 +1,54 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include "rocksdb/env.h"
#include "util/perf_level_imp.h"
#include "util/stop_watch.h"
namespace rocksdb {
class PerfStepTimer {
public:
PerfStepTimer(uint64_t* metric)
: enabled_(perf_level >= PerfLevel::kEnableTime),
env_(enabled_ ? Env::Default() : nullptr),
start_(0),
metric_(metric) {
}
~PerfStepTimer() {
Stop();
}
void Start() {
if (enabled_) {
start_ = env_->NowNanos();
}
}
void Measure() {
if (start_) {
uint64_t now = env_->NowNanos();
*metric_ += now - start_;
start_ = now;
}
}
void Stop() {
if (start_) {
*metric_ += env_->NowNanos() - start_;
start_ = 0;
}
}
private:
const bool enabled_;
Env* const env_;
uint64_t start_;
uint64_t* metric_;
};
} // namespace rocksdb

@ -21,6 +21,7 @@
#include <linux/falloc.h>
#endif
#include "rocksdb/env.h"
#include "util/iostats_context_imp.h"
#include <atomic>
namespace rocksdb {
@ -61,6 +62,8 @@ class PosixLogger : public Logger {
using Logger::Logv;
virtual void Logv(const char* format, va_list ap) override {
IOSTATS_TIMER_GUARD(logger_nanos);
const uint64_t thread_id = (*gettid_)();
// We try twice: the first time with a fixed-size stack allocated buffer,

Loading…
Cancel
Save