Allow users to profile a query and see bottleneck of the query

Summary:
Provide a framework to profile a query in detail to figure out latency bottleneck. Currently, in Get(), Put() and iterators, 2-3 simple timing is used. We can easily add more profile counters to the framework later.

Test Plan: Enable this profiling in seveal existing tests.

Reviewers: haobo, dhruba, kailiu, emayanke, vamsi, igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14001

Conflicts:
	table/merger.cc
main
Siying Dong 11 years ago committed by Siying Dong
parent 58e1956d50
commit b135d01e7b
  1. 8
      db/db_bench.cc
  2. 34
      db/db_impl.cc
  3. 22
      db/db_iter.cc
  4. 41
      db/db_test.cc
  5. 20
      db/memtable.cc
  6. 34
      db/perf_context_test.cc
  7. 2
      db/version_set.cc
  8. 22
      include/rocksdb/perf_context.h
  9. 28
      table/merger.cc
  10. 3
      table/merger.h
  11. 15
      util/perf_context.cc

@ -22,6 +22,7 @@
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/perf_context.h"
#include "port/port.h" #include "port/port.h"
#include "util/bit_set.h" #include "util/bit_set.h"
#include "util/crc32c.h" #include "util/crc32c.h"
@ -350,6 +351,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
" this is greater than 0."); " this is greater than 0.");
DEFINE_int32(perf_level, 0, "Level of perf collection");
static bool ValidateRateLimit(const char* flagname, double value) { static bool ValidateRateLimit(const char* flagname, double value) {
static constexpr double EPSILON = 1e-10; static constexpr double EPSILON = 1e-10;
if ( value < -EPSILON ) { if ( value < -EPSILON ) {
@ -689,6 +692,7 @@ struct SharedState {
port::Mutex mu; port::Mutex mu;
port::CondVar cv; port::CondVar cv;
int total; int total;
int perf_level;
// Each thread goes through the following states: // Each thread goes through the following states:
// (1) initializing // (1) initializing
@ -700,7 +704,7 @@ struct SharedState {
long num_done; long num_done;
bool start; bool start;
SharedState() : cv(&mu) { } SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
}; };
// Per-thread state for concurrent executions of the same benchmark. // Per-thread state for concurrent executions of the same benchmark.
@ -810,6 +814,7 @@ class Benchmark {
fprintf(stdout, "Memtablerep: vector\n"); fprintf(stdout, "Memtablerep: vector\n");
break; break;
} }
fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
PrintWarnings(); PrintWarnings();
fprintf(stdout, "------------------------------------------------\n"); fprintf(stdout, "------------------------------------------------\n");
@ -1150,6 +1155,7 @@ class Benchmark {
} }
} }
SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
thread->stats.Start(thread->tid); thread->stats.Start(thread->tid);
(arg->bm->*(arg->method))(thread); (arg->bm->*(arg->method))(thread);
thread->stats.Stop(); thread->stats.Stop();

@ -1035,7 +1035,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
(unsigned long)m->GetLogNumber()); (unsigned long)m->GetLogNumber());
list.push_back(m->NewIterator()); list.push_back(m->NewIterator());
} }
Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0],
list.size()); list.size());
const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber newest_snapshot = snapshots_.GetNewest();
const SequenceNumber earliest_seqno_in_memtable = const SequenceNumber earliest_seqno_in_memtable =
@ -2519,7 +2519,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
// Collect iterators for files in L0 - Ln // Collect iterators for files in L0 - Ln
versions_->current()->AddIterators(options, storage_options_, &list); versions_->current()->AddIterators(options, storage_options_, &list);
Iterator* internal_iter = Iterator* internal_iter =
NewMergingIterator(&internal_comparator_, &list[0], list.size()); NewMergingIterator(env_, &internal_comparator_, &list[0], list.size());
versions_->current()->Ref(); versions_->current()->Ref();
cleanup->mu = &mutex_; cleanup->mu = &mutex_;
@ -2555,6 +2555,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
Status s; Status s;
StopWatch sw(env_, options_.statistics, DB_GET); StopWatch sw(env_, options_.statistics, DB_GET);
StopWatchNano snapshot_timer(env_, false);
StartPerfTimer(&snapshot_timer);
SequenceNumber snapshot; SequenceNumber snapshot;
mutex_.Lock(); mutex_.Lock();
if (options.snapshot != nullptr) { if (options.snapshot != nullptr) {
@ -2583,15 +2585,23 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// s is both in/out. When in, s could either be OK or MergeInProgress. // s is both in/out. When in, s could either be OK or MergeInProgress.
// merge_operands will contain the sequence of merges in the latter case. // merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot); LookupKey lkey(key, snapshot);
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
if (mem->Get(lkey, value, &s, &merge_operands, options_)) { if (mem->Get(lkey, value, &s, &merge_operands, options_)) {
// Done // Done
} else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) {
// Done // Done
} else { } else {
StopWatchNano from_files_timer(env_, false);
StartPerfTimer(&from_files_timer);
current->Get(options, lkey, value, &s, &merge_operands, &stats, current->Get(options, lkey, value, &s, &merge_operands, &stats,
options_, value_found); options_, value_found);
have_stat_update = true; have_stat_update = true;
BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
} }
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
mutex_.Lock(); mutex_.Lock();
if (!options_.disable_seek_compaction && if (!options_.disable_seek_compaction &&
@ -2607,6 +2617,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// Note, tickers are atomic now - no lock protection needed any more. // Note, tickers are atomic now - no lock protection needed any more.
RecordTick(options_.statistics, NUMBER_KEYS_READ); RecordTick(options_.statistics, NUMBER_KEYS_READ);
RecordTick(options_.statistics, BYTES_READ, value->size()); RecordTick(options_.statistics, BYTES_READ, value->size());
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
return s; return s;
} }
@ -2615,6 +2627,8 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
std::vector<std::string>* values) { std::vector<std::string>* values) {
StopWatch sw(env_, options_.statistics, DB_MULTIGET); StopWatch sw(env_, options_.statistics, DB_MULTIGET);
StopWatchNano snapshot_timer(env_, false);
StartPerfTimer(&snapshot_timer);
SequenceNumber snapshot; SequenceNumber snapshot;
mutex_.Lock(); mutex_.Lock();
if (options.snapshot != nullptr) { if (options.snapshot != nullptr) {
@ -2646,6 +2660,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
// Keep track of bytes that we read for statistics-recording later // Keep track of bytes that we read for statistics-recording later
uint64_t bytesRead = 0; uint64_t bytesRead = 0;
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
// For each of the given keys, apply the entire "get" process as follows: // For each of the given keys, apply the entire "get" process as follows:
// First look in the memtable, then in the immutable memtable (if any). // First look in the memtable, then in the immutable memtable (if any).
@ -2672,6 +2687,8 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
} }
// Post processing (decrement reference counts and record statistics) // Post processing (decrement reference counts and record statistics)
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
mutex_.Lock(); mutex_.Lock();
if (!options_.disable_seek_compaction && if (!options_.disable_seek_compaction &&
have_stat_update && current->UpdateStats(stats)) { have_stat_update && current->UpdateStats(stats)) {
@ -2686,6 +2703,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS);
RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys); RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys);
RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead); RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead);
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
return statList; return statList;
} }
@ -2754,6 +2772,8 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
} }
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
StopWatchNano pre_post_process_timer(env_, false);
StartPerfTimer(&pre_post_process_timer);
Writer w(&mutex_); Writer w(&mutex_);
w.batch = my_batch; w.batch = my_batch;
w.sync = options.sync; w.sync = options.sync;
@ -2800,12 +2820,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (options.disableWAL) { if (options.disableWAL) {
flush_on_destroy_ = true; flush_on_destroy_ = true;
} }
BumpPerfTime(&perf_context.write_pre_and_post_process_time,
&pre_post_process_timer);
if (!options.disableWAL) { if (!options.disableWAL) {
StopWatchNano timer(env_); StopWatchNano timer(env_);
StartPerfTimer(&timer); StartPerfTimer(&timer);
status = log_->AddRecord(WriteBatchInternal::Contents(updates)); status = log_->AddRecord(WriteBatchInternal::Contents(updates));
BumpPerfTime(&perf_context.wal_write_time, &timer);
if (status.ok() && options.sync) { if (status.ok() && options.sync) {
if (options_.use_fsync) { if (options_.use_fsync) {
StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS); StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS);
@ -2815,10 +2836,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
status = log_->file()->Sync(); status = log_->file()->Sync();
} }
} }
BumpPerfTime(&perf_context.write_wal_time, &timer);
} }
if (status.ok()) { if (status.ok()) {
StopWatchNano write_memtable_timer(env_, false);
StartPerfTimer(&write_memtable_timer);
status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this, status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this,
options_.filter_deletes); options_.filter_deletes);
BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer);
if (!status.ok()) { if (!status.ok()) {
// Panic for in-memory corruptions // Panic for in-memory corruptions
// Note that existing logic was not sound. Any partial failure writing // Note that existing logic was not sound. Any partial failure writing
@ -2828,6 +2853,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
} }
SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence); SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence);
} }
StartPerfTimer(&pre_post_process_timer);
LogFlush(options_.info_log); LogFlush(options_.info_log);
mutex_.Lock(); mutex_.Lock();
if (status.ok()) { if (status.ok()) {
@ -2855,6 +2881,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (!writers_.empty()) { if (!writers_.empty()) {
writers_.front()->cv.Signal(); writers_.front()->cv.Signal();
} }
BumpPerfTime(&perf_context.write_pre_and_post_process_time,
&pre_post_process_timer);
return status; return status;
} }

@ -102,7 +102,8 @@ class DBIter: public Iterator {
virtual void SeekToLast(); virtual void SeekToLast();
private: private:
void FindNextUserEntry(bool skipping); inline void FindNextUserEntry(bool skipping);
void FindNextUserEntryInternal(bool skipping);
void FindPrevUserEntry(); void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key); bool ParseKey(ParsedInternalKey* key);
void MergeValuesNewToOld(); void MergeValuesNewToOld();
@ -191,7 +192,15 @@ void DBIter::Next() {
// //
// NOTE: In between, saved_key_ can point to a user key that has // NOTE: In between, saved_key_ can point to a user key that has
// a delete marker // a delete marker
void DBIter::FindNextUserEntry(bool skipping) { inline void DBIter::FindNextUserEntry(bool skipping) {
StopWatchNano timer(env_, false);
StartPerfTimer(&timer);
FindNextUserEntryInternal(skipping);
BumpPerfTime(&perf_context.find_next_user_entry_time, &timer);
}
// Actual implementation of DBIter::FindNextUserEntry()
void DBIter::FindNextUserEntryInternal(bool skipping) {
// Loop until we hit an acceptable entry to yield // Loop until we hit an acceptable entry to yield
assert(iter_->Valid()); assert(iter_->Valid());
assert(direction_ == kForward); assert(direction_ == kForward);
@ -429,7 +438,10 @@ void DBIter::Seek(const Slice& target) {
saved_key_.clear(); saved_key_.clear();
AppendInternalKey( AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->Seek(saved_key_); iter_->Seek(saved_key_);
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
if (iter_->Valid()) { if (iter_->Valid()) {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
@ -442,7 +454,10 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() { void DBIter::SeekToFirst() {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToFirst(); iter_->SeekToFirst();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
if (iter_->Valid()) { if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */); FindNextUserEntry(false /* not skipping */);
} else { } else {
@ -461,7 +476,10 @@ void DBIter::SeekToLast() {
direction_ = kReverse; direction_ = kReverse;
ClearSavedValue(); ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToLast(); iter_->SeekToLast();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
FindPrevUserEntry(); FindPrevUserEntry();
} }

@ -22,6 +22,7 @@
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/perf_context.h"
#include "util/hash.h" #include "util/hash.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
@ -1215,7 +1216,13 @@ TEST(DBTest, IterMulti) {
ASSERT_EQ(IterStatus(iter), "a->va"); ASSERT_EQ(IterStatus(iter), "a->va");
iter->Seek("ax"); iter->Seek("ax");
ASSERT_EQ(IterStatus(iter), "b->vb"); ASSERT_EQ(IterStatus(iter), "b->vb");
SetPerfLevel(kEnableTime);
perf_context.Reset();
iter->Seek("b"); iter->Seek("b");
ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0);
ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
SetPerfLevel(kDisable);
ASSERT_EQ(IterStatus(iter), "b->vb"); ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Seek("z"); iter->Seek("z");
ASSERT_EQ(IterStatus(iter), "(invalid)"); ASSERT_EQ(IterStatus(iter), "(invalid)");
@ -1230,7 +1237,12 @@ TEST(DBTest, IterMulti) {
// Switch from forward to reverse // Switch from forward to reverse
iter->SeekToFirst(); iter->SeekToFirst();
iter->Next(); iter->Next();
SetPerfLevel(kEnableTime);
perf_context.Reset();
iter->Next(); iter->Next();
ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time);
ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
SetPerfLevel(kDisable);
iter->Prev(); iter->Prev();
ASSERT_EQ(IterStatus(iter), "b->vb"); ASSERT_EQ(IterStatus(iter), "b->vb");
@ -1590,22 +1602,42 @@ TEST(DBTest, NumImmutableMemTable) {
std::string big_value(1000000, 'x'); std::string big_value(1000000, 'x');
std::string num; std::string num;
SetPerfLevel(kEnableTime);;
ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "0"); ASSERT_EQ(num, "0");
perf_context.Reset();
Get("k1");
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "1"); ASSERT_EQ(num, "1");
perf_context.Reset();
Get("k1");
ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
perf_context.Reset();
Get("k2");
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "2"); ASSERT_EQ(num, "2");
perf_context.Reset();
Get("k2");
ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
perf_context.Reset();
Get("k3");
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
perf_context.Reset();
Get("k1");
ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "0"); ASSERT_EQ(num, "0");
SetPerfLevel(kDisable);
} while (ChangeCompactOptions()); } while (ChangeCompactOptions());
} }
@ -1614,11 +1646,16 @@ TEST(DBTest, FLUSH) {
Options options = CurrentOptions(); Options options = CurrentOptions();
WriteOptions writeOpt = WriteOptions(); WriteOptions writeOpt = WriteOptions();
writeOpt.disableWAL = true; writeOpt.disableWAL = true;
SetPerfLevel(kEnableTime);;
ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
// this will now also flush the last 2 writes // this will now also flush the last 2 writes
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));
perf_context.Reset();
Get("foo");
ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
Reopen(); Reopen();
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v1", Get("bar")); ASSERT_EQ("v1", Get("bar"));
@ -1630,7 +1667,9 @@ TEST(DBTest, FLUSH) {
Reopen(); Reopen();
ASSERT_EQ("v2", Get("bar")); ASSERT_EQ("v2", Get("bar"));
perf_context.Reset();
ASSERT_EQ("v2", Get("foo")); ASSERT_EQ("v2", Get("foo"));
ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
writeOpt.disableWAL = false; writeOpt.disableWAL = false;
ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3"));
@ -1642,6 +1681,8 @@ TEST(DBTest, FLUSH) {
// has WAL enabled. // has WAL enabled.
ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v3", Get("bar")); ASSERT_EQ("v3", Get("bar"));
SetPerfLevel(kDisable);
} while (ChangeCompactOptions()); } while (ChangeCompactOptions());
} }

@ -19,6 +19,8 @@
#include "util/coding.h" #include "util/coding.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/murmurhash.h" #include "util/murmurhash.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"
namespace std { namespace std {
template <> template <>
@ -161,6 +163,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
std::deque<std::string>* operands, const Options& options) { std::deque<std::string>* operands, const Options& options) {
StopWatchNano memtable_get_timer(options.env, false);
StartPerfTimer(&memtable_get_timer);
Slice memkey = key.memtable_key(); Slice memkey = key.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter( std::shared_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key())); table_->GetIterator(key.user_key()));
@ -174,7 +179,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
auto logger = options.info_log; auto logger = options.info_log;
std::string merge_result; std::string merge_result;
for (; iter->Valid(); iter->Next()) { bool found_final_value = false;
for (; !found_final_value && iter->Valid(); iter->Next()) {
// entry format is: // entry format is:
// klength varint32 // klength varint32
// userkey char[klength-8] // userkey char[klength-8]
@ -211,7 +217,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (options.inplace_update_support) { if (options.inplace_update_support) {
GetLock(key.user_key())->Unlock(); GetLock(key.user_key())->Unlock();
} }
return true; found_final_value = true;
break;
} }
case kTypeDeletion: { case kTypeDeletion: {
if (merge_in_progress) { if (merge_in_progress) {
@ -225,7 +232,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
} else { } else {
*s = Status::NotFound(Slice()); *s = Status::NotFound(Slice());
} }
return true; found_final_value = true;
break;
} }
case kTypeMerge: { case kTypeMerge: {
Slice v = GetLengthPrefixedSlice(key_ptr + key_length); Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
@ -259,10 +267,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// No change to value, since we have not yet found a Put/Delete // No change to value, since we have not yet found a Put/Delete
if (merge_in_progress) { if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress(""); *s = Status::MergeInProgress("");
} }
return false; BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer);
BumpPerfCount(&perf_context.get_from_memtable_count);
return found_final_value;
} }
bool MemTable::Update(SequenceNumber seq, ValueType type, bool MemTable::Update(SequenceNumber seq, ValueType type,

@ -174,6 +174,13 @@ void ProfileKeyComparison() {
HistogramImpl hist_put; HistogramImpl hist_put;
HistogramImpl hist_get; HistogramImpl hist_get;
HistogramImpl hist_get_snapshot;
HistogramImpl hist_get_memtable;
HistogramImpl hist_get_post_process;
HistogramImpl hist_num_memtable_checked;
HistogramImpl hist_write_pre_post;
HistogramImpl hist_write_wal_time;
HistogramImpl hist_write_memtable_time;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
@ -192,16 +199,37 @@ void ProfileKeyComparison() {
perf_context.Reset(); perf_context.Reset();
db->Put(write_options, key, value); db->Put(write_options, key, value);
hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
hist_write_wal_time.Add(perf_context.write_wal_time);
hist_write_memtable_time.Add(perf_context.write_memtable_time);
hist_put.Add(perf_context.user_key_comparison_count); hist_put.Add(perf_context.user_key_comparison_count);
perf_context.Reset(); perf_context.Reset();
db->Get(read_options, key, &value); db->Get(read_options, key, &value);
hist_get_snapshot.Add(perf_context.get_snapshot_time);
hist_get_memtable.Add(perf_context.get_from_memtable_time);
hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_get_post_process.Add(perf_context.get_post_process_time);
hist_get.Add(perf_context.user_key_comparison_count); hist_get.Add(perf_context.user_key_comparison_count);
} }
std::cout << "Put uesr key comparison: \n" << hist_put.ToString() std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
<< "Get uesr key comparison: \n" << hist_get.ToString(); << "Get uesr key comparison: \n" << hist_get.ToString();
std::cout << "Put(): Pre and Post Process Time: \n"
<< hist_write_pre_post.ToString()
<< " Writing WAL time: \n"
<< hist_write_wal_time.ToString() << "\n"
<< " Writing Mem Table time: \n"
<< hist_write_memtable_time.ToString() << "\n";
std::cout << "Get(): Time to get snapshot: \n"
<< hist_get_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_get_memtable.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n"
<< hist_get_post_process.ToString() << "\n";
} }
TEST(PerfContextTest, KeyComparisonCount) { TEST(PerfContextTest, KeyComparisonCount) {
@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
db->Put(write_options, key, value); db->Put(write_options, key, value);
auto put_time = timer.ElapsedNanos(); auto put_time = timer.ElapsedNanos();
hist_put_time.Add(put_time); hist_put_time.Add(put_time);
hist_wal_time.Add(perf_context.wal_write_time); hist_wal_time.Add(perf_context.write_wal_time);
hist_time_diff.Add(put_time - perf_context.wal_write_time); hist_time_diff.Add(put_time - perf_context.write_wal_time);
} }
std::cout << "Put time:\n" << hist_put_time.ToString() std::cout << "Put time:\n" << hist_put_time.ToString()

@ -2091,7 +2091,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
} }
} }
assert(num <= space); assert(num <= space);
Iterator* result = NewMergingIterator(&icmp_, list, num); Iterator* result = NewMergingIterator(env_, &icmp_, list, num);
delete[] list; delete[] list;
return result; return result;
} }

@ -38,7 +38,27 @@ struct PerfContext {
uint64_t internal_key_skipped_count; uint64_t internal_key_skipped_count;
// total number of deletes skipped over during iteration // total number of deletes skipped over during iteration
uint64_t internal_delete_skipped_count; uint64_t internal_delete_skipped_count;
uint64_t wal_write_time; // total time spent on writing to WAL
uint64_t get_snapshot_time; // total time spent on getting snapshot
uint64_t get_from_memtable_time; // total time spent on querying memtables
uint64_t get_from_memtable_count; // number of mem tables queried
// total time spent after Get() finds a key
uint64_t get_post_process_time;
uint64_t get_from_output_files_time; // total time reading from output files
// total time spent on seeking child iters
uint64_t seek_child_seek_time;
// number of seek issued in child iterators
uint64_t seek_child_seek_count;
uint64_t seek_min_heap_time; // total time spent on the merge heap
// total time spent on seeking the internal entries
uint64_t seek_internal_seek_time;
// total time spent on iterating internal entries to find the next user entry
uint64_t find_next_user_entry_time;
// total time spent on pre or post processing when writing a record
uint64_t write_pre_and_post_process_time;
uint64_t write_wal_time; // total time spent on writing to WAL
// total time spent on writing to mem tables
uint64_t write_memtable_time;
}; };
extern __thread PerfContext perf_context; extern __thread PerfContext perf_context;

@ -11,8 +11,11 @@
#include "rocksdb/comparator.h" #include "rocksdb/comparator.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/iter_heap.h" #include "table/iter_heap.h"
#include "table/iterator_wrapper.h" #include "table/iterator_wrapper.h"
#include "util/stop_watch.h"
#include "util/perf_context_imp.h"
#include <vector> #include <vector>
@ -22,11 +25,13 @@ namespace {
class MergingIterator : public Iterator { class MergingIterator : public Iterator {
public: public:
MergingIterator(const Comparator* comparator, Iterator** children, int n) MergingIterator(Env* const env, const Comparator* comparator,
Iterator** children, int n)
: comparator_(comparator), : comparator_(comparator),
children_(n), children_(n),
current_(nullptr), current_(nullptr),
use_heap_(true), use_heap_(true),
env_(env),
direction_(kForward), direction_(kForward),
maxHeap_(NewMaxIterHeap(comparator_)), maxHeap_(NewMaxIterHeap(comparator_)),
minHeap_ (NewMinIterHeap(comparator_)) { minHeap_ (NewMinIterHeap(comparator_)) {
@ -74,8 +79,14 @@ class MergingIterator : public Iterator {
// Invalidate the heap. // Invalidate the heap.
use_heap_ = false; use_heap_ = false;
IteratorWrapper* first_child = nullptr; IteratorWrapper* first_child = nullptr;
StopWatchNano child_seek_timer(env_, false);
StopWatchNano min_heap_timer(env_, false);
for (auto& child : children_) { for (auto& child : children_) {
StartPerfTimer(&child_seek_timer);
child.Seek(target); child.Seek(target);
BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer);
BumpPerfCount(&perf_context.seek_child_seek_count);
if (child.Valid()) { if (child.Valid()) {
// This child has valid key // This child has valid key
if (!use_heap_) { if (!use_heap_) {
@ -86,24 +97,31 @@ class MergingIterator : public Iterator {
} else { } else {
// We have more than one children with valid keys. Initialize // We have more than one children with valid keys. Initialize
// the heap and put the first child into the heap. // the heap and put the first child into the heap.
StartPerfTimer(&min_heap_timer);
ClearHeaps(); ClearHeaps();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
StartPerfTimer(&min_heap_timer);
minHeap_.push(first_child); minHeap_.push(first_child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
} }
} }
if (use_heap_) { if (use_heap_) {
StartPerfTimer(&min_heap_timer);
minHeap_.push(&child); minHeap_.push(&child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
} }
} }
} }
if (use_heap_) { if (use_heap_) {
// If heap is valid, need to put the smallest key to curent_. // If heap is valid, need to put the smallest key to curent_.
StartPerfTimer(&min_heap_timer);
FindSmallest(); FindSmallest();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
} else { } else {
// The heap is not valid, then the current_ iterator is the first // The heap is not valid, then the current_ iterator is the first
// one, or null if there is no first child. // one, or null if there is no first child.
current_ = first_child; current_ = first_child;
} }
direction_ = kForward;
} }
virtual void Next() { virtual void Next() {
@ -211,6 +229,7 @@ class MergingIterator : public Iterator {
// contain valid rows. If it is false, only current_ can possibly contain // contain valid rows. If it is false, only current_ can possibly contain
// valid rows. // valid rows.
bool use_heap_; bool use_heap_;
Env* const env_;
// Which direction is the iterator moving? // Which direction is the iterator moving?
enum Direction { enum Direction {
kForward, kForward,
@ -250,14 +269,15 @@ void MergingIterator::ClearHeaps() {
} }
} // namespace } // namespace
Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { Iterator* NewMergingIterator(Env* const env, const Comparator* cmp,
Iterator** list, int n) {
assert(n >= 0); assert(n >= 0);
if (n == 0) { if (n == 0) {
return NewEmptyIterator(); return NewEmptyIterator();
} else if (n == 1) { } else if (n == 1) {
return list[0]; return list[0];
} else { } else {
return new MergingIterator(cmp, list, n); return new MergingIterator(env, cmp, list, n);
} }
} }

@ -13,6 +13,7 @@ namespace rocksdb {
class Comparator; class Comparator;
class Iterator; class Iterator;
class Env;
// Return an iterator that provided the union of the data in // Return an iterator that provided the union of the data in
// children[0,n-1]. Takes ownership of the child iterators and // children[0,n-1]. Takes ownership of the child iterators and
@ -23,6 +24,6 @@ class Iterator;
// //
// REQUIRES: n >= 0 // REQUIRES: n >= 0
extern Iterator* NewMergingIterator( extern Iterator* NewMergingIterator(
const Comparator* comparator, Iterator** children, int n); Env* const env, const Comparator* comparator, Iterator** children, int n);
} // namespace rocksdb } // namespace rocksdb

@ -22,7 +22,20 @@ void PerfContext::Reset() {
block_decompress_time = 0; block_decompress_time = 0;
internal_key_skipped_count = 0; internal_key_skipped_count = 0;
internal_delete_skipped_count = 0; internal_delete_skipped_count = 0;
wal_write_time = 0; write_wal_time = 0;
get_snapshot_time = 0;
get_from_memtable_time = 0;
get_from_memtable_count = 0;
get_post_process_time = 0;
get_from_output_files_time = 0;
seek_child_seek_time = 0;
seek_child_seek_count = 0;
seek_min_heap_time = 0;
seek_internal_seek_time = 0;
find_next_user_entry_time = 0;
write_pre_and_post_process_time = 0;
write_memtable_time = 0;
} }
__thread PerfContext perf_context; __thread PerfContext perf_context;

Loading…
Cancel
Save