// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include "monitoring/statistics.h"
#include "port/port.h"
#include "rocksdb/statistics.h"
#include "rocksdb/system_clock.h"
#include "rocksdb/thread_status.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {
class InstrumentedCondVar;

// A wrapper class for port::Mutex that provides an additional layer
// for collecting stats and instrumentation.
class InstrumentedMutex {
 public:
  explicit InstrumentedMutex(bool adaptive = false)
      : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {}

  explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false)
      : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {}

  InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code,
                    bool adaptive = false)
      : mutex_(adaptive),
        stats_(stats),
        clock_(clock),
        stats_code_(stats_code) {}

#ifdef COERCE_CONTEXT_SWITCH
  InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code,
                    InstrumentedCondVar* bg_cv, bool adaptive = false)
      : mutex_(adaptive),
        stats_(stats),
        clock_(clock),
        stats_code_(stats_code),
        bg_cv_(bg_cv) {}
#endif

  void Lock();

  void Unlock() { mutex_.Unlock(); }

  void AssertHeld() {
    mutex_.AssertHeld();
  }

 private:
  void LockInternal();
  friend class InstrumentedCondVar;
  port::Mutex mutex_;
  Statistics* stats_;
  SystemClock* clock_;
  int stats_code_;
#ifdef COERCE_CONTEXT_SWITCH
  InstrumentedCondVar* bg_cv_ = nullptr;
#endif
};
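
// Example usage (illustrative sketch, not from the original header; `stats`
// and `clock` are assumed to be supplied by the caller, and the stats code is
// only an example ticker):
//
//   InstrumentedMutex mu(stats, clock, /*stats_code=*/DB_MUTEX_WAIT_MICROS);
//   mu.Lock();        // lock wait time can be charged to the given stats code
//   mu.AssertHeld();  // sanity check that the calling thread holds the mutex
//   mu.Unlock();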

class ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedInstrumentedMutex
    : public InstrumentedMutex {
  using InstrumentedMutex::InstrumentedMutex;
};
static_assert(alignof(CacheAlignedInstrumentedMutex) != CACHE_LINE_SIZE ||
              sizeof(CacheAlignedInstrumentedMutex) % CACHE_LINE_SIZE == 0);

// RAII wrapper for InstrumentedMutex
class InstrumentedMutexLock {
 public:
  explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) {
    mutex_->Lock();
  }

  ~InstrumentedMutexLock() {
    mutex_->Unlock();
  }

 private:
  InstrumentedMutex* const mutex_;
  InstrumentedMutexLock(const InstrumentedMutexLock&) = delete;
  void operator=(const InstrumentedMutexLock&) = delete;
};
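
// Example usage (illustrative sketch, not from the original header; `mu` is
// an InstrumentedMutex owned by the caller and DoWork() is hypothetical):
//
//   {
//     InstrumentedMutexLock l(&mu);  // acquires mu
//     DoWork();                      // protected region
//   }                                // mu is released when l goes out of scope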

// RAII wrapper for temporarily releasing InstrumentedMutex inside
// InstrumentedMutexLock
class InstrumentedMutexUnlock {
 public:
  explicit InstrumentedMutexUnlock(InstrumentedMutex* mutex) : mutex_(mutex) {
    mutex_->Unlock();
  }

  ~InstrumentedMutexUnlock() { mutex_->Lock(); }

 private:
  InstrumentedMutex* const mutex_;
  InstrumentedMutexUnlock(const InstrumentedMutexUnlock&) = delete;
  void operator=(const InstrumentedMutexUnlock&) = delete;
};
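
// Example usage (illustrative sketch, not from the original header; `mu` and
// WaitForIoOutsideMutex() are hypothetical):
//
//   InstrumentedMutexLock l(&mu);
//   {
//     InstrumentedMutexUnlock u(&mu);  // temporarily drops mu
//     WaitForIoOutsideMutex();         // slow work done without holding mu
//   }                                  // mu is re-acquired here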

class InstrumentedCondVar {
 public:
  explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex)
      : cond_(&(instrumented_mutex->mutex_)),
        stats_(instrumented_mutex->stats_),
        clock_(instrumented_mutex->clock_),
        stats_code_(instrumented_mutex->stats_code_) {}

  void Wait();

  bool TimedWait(uint64_t abs_time_us);

  void Signal() {
    cond_.Signal();
  }

  void SignalAll() {
    cond_.SignalAll();
  }

 private:
  void WaitInternal();
  bool TimedWaitInternal(uint64_t abs_time_us);
  port::CondVar cond_;
  Statistics* stats_;
  SystemClock* clock_;
  int stats_code_;
};
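
// Example usage (illustrative sketch, not from the original header; `mu`,
// `cv`, and the `done` flag are hypothetical shared state guarded by `mu`):
//
//   InstrumentedMutexLock l(&mu);
//   while (!done) {
//     cv.Wait();  // releases mu while blocked, re-acquires before returning
//   }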

}  // namespace ROCKSDB_NAMESPACE