From 1c8b819be2e6f70bc3232bcba8ca522bcecad667 Mon Sep 17 00:00:00 2001 From: kailiu Date: Wed, 20 Nov 2013 13:45:32 -0800 Subject: [PATCH 01/21] Fix a memory leak happened in table_test --- table/table_test.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/table/table_test.cc b/table/table_test.cc index e93e9bcec..394aa4b9d 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -252,11 +252,14 @@ class BlockBasedTableConstructor: public Constructor { virtual Status FinishImpl(const Options& options, const KVMap& data) { Reset(); sink_.reset(new StringSink()); + std::unique_ptr flush_policy_factory( + new FlushBlockBySizePolicyFactory(options.block_size, + options.block_size_deviation)); + BlockBasedTableBuilder builder( options, sink_.get(), - new FlushBlockBySizePolicyFactory( - options.block_size, options.block_size_deviation), + flush_policy_factory.get(), options.compression); for (KVMap::const_iterator it = data.begin(); From 618250b5c50f5c4923efa7d5a79653628e604010 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Wed, 20 Nov 2013 14:38:17 -0800 Subject: [PATCH 02/21] Update the installation guide for mac users. --- INSTALL.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 7db22da57..44e11ca7d 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -26,8 +26,9 @@ libraries. You are on your own. * Install zlib. Try: `sudo apt-get install zlib1g-dev`. * Install bzip2: `sudo apt-get install libbz2-dev`. * **OS X**: - * Update your xcode to the latest version to get the compiler with - C++ 11 support. + * We encourage you to install latest C++ compiler via [homebrew](http://brew.sh/). + * If you're first time developer in MacOS, please run: `xcode-select --install` in your command line. + * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). * Install zlib, bzip2 and snappy libraries for compression. * Install gflags. We have included a script `build_tools/mac-install-gflags.sh`, which should automatically install it. From f5acb2eebb62d9cbc50d67af009cc7fb52c37e05 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Wed, 20 Nov 2013 14:54:53 -0800 Subject: [PATCH 03/21] Add more guide for mac users. --- INSTALL.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 44e11ca7d..549446205 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -26,9 +26,11 @@ libraries. You are on your own. * Install zlib. Try: `sudo apt-get install zlib1g-dev`. * Install bzip2: `sudo apt-get install libbz2-dev`. * **OS X**: - * We encourage you to install latest C++ compiler via [homebrew](http://brew.sh/). - * If you're first time developer in MacOS, please run: `xcode-select --install` in your command line. - * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). + * Install latest C++ compiler that supports C++ 11: + # Update XCode: run `xcode-select --install` (or install it from XCode App's settting). + # Install via [homebrew](http://brew.sh/). + * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. + * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). * Install zlib, bzip2 and snappy libraries for compression. * Install gflags. We have included a script `build_tools/mac-install-gflags.sh`, which should automatically install it. From db2e2615f83f0d63f4e82bcec15b9c05ea5e06b8 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Wed, 20 Nov 2013 14:55:33 -0800 Subject: [PATCH 04/21] Fix the format in INSTALL.md --- INSTALL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 549446205..f56323eee 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -27,8 +27,8 @@ libraries. You are on your own. * Install bzip2: `sudo apt-get install libbz2-dev`. * **OS X**: * Install latest C++ compiler that supports C++ 11: - # Update XCode: run `xcode-select --install` (or install it from XCode App's settting). - # Install via [homebrew](http://brew.sh/). + * Update XCode: run `xcode-select --install` (or install it from XCode App's settting). + * Install via [homebrew](http://brew.sh/). * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). * Install zlib, bzip2 and snappy libraries for compression. From 56589ab81f6827ff7402e31b24a6d548f29a524f Mon Sep 17 00:00:00 2001 From: kailiu Date: Wed, 20 Nov 2013 18:42:12 -0800 Subject: [PATCH 05/21] Add TableOptions for BlockBasedTableFactory We are having more and more options to specify for this table so it makes sense to have a TableOptions for future extension. --- table/block_based_table_factory.cc | 6 ++++-- table/block_based_table_factory.h | 23 ++++++++++++++--------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 43734ea71..836f6edf6 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -29,7 +29,8 @@ Status BlockBasedTableFactory::GetTableReader( TableBuilder* BlockBasedTableFactory::GetTableBuilder( const Options& options, WritableFile* file, CompressionType compression_type) const { - auto flush_block_policy_factory = flush_block_policy_factory_.get(); + auto flush_block_policy_factory = + table_options_.flush_block_policy_factory.get(); // if flush block policy factory is not set, we'll create the default one // from the options. @@ -54,7 +55,8 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder( // options. // We can safely delete flush_block_policy_factory since it will only be used // during the construction of `BlockBasedTableBuilder`. - if (flush_block_policy_factory != flush_block_policy_factory_.get()) { + if (flush_block_policy_factory != + table_options_.flush_block_policy_factory.get()) { delete flush_block_policy_factory; } diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index d6ead29a0..ee525816f 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -31,14 +31,18 @@ class BlockBasedTableBuilder; class BlockBasedTableFactory: public TableFactory { public: - // @flush_block_policy_factory creates the instances of flush block policy. - // which provides a configurable way to determine when to flush a block in - // the block based tables. If not set, table builder will use the default - // block flush policy, which cut blocks by block size (please refer to - // `FlushBlockBySizePolicy`). - BlockBasedTableFactory( - FlushBlockPolicyFactory* flush_block_policy_factory = nullptr) : - flush_block_policy_factory_(flush_block_policy_factory) { + struct TableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; + }; + + BlockBasedTableFactory() : BlockBasedTableFactory(TableOptions()) { } + BlockBasedTableFactory(const TableOptions& table_options): + table_options_(table_options) { } ~BlockBasedTableFactory() { @@ -58,7 +62,8 @@ public: override; private: - std::unique_ptr flush_block_policy_factory_; + TableOptions table_options_; }; + } // namespace rocksdb From 3d8ac31d7168c916d6f2f0729eb627b07d8f082b Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 18 Nov 2013 11:32:54 -0800 Subject: [PATCH 06/21] Allow users to profile a query and see bottleneck of the query Summary: Provide a framework to profile a query in detail to figure out latency bottleneck. Currently, in Get(), Put() and iterators, 2-3 simple timing is used. We can easily add more profile counters to the framework later. Test Plan: Enable this profiling in seveal existing tests. Reviewers: haobo, dhruba, kailiu, emayanke, vamsi, igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14001 --- db/db_bench.cc | 8 ++++++- db/db_impl.cc | 34 +++++++++++++++++++++++++--- db/db_iter.cc | 22 ++++++++++++++++-- db/db_test.cc | 41 ++++++++++++++++++++++++++++++++++ db/memtable.cc | 20 ++++++++++++----- db/perf_context_test.cc | 34 +++++++++++++++++++++++++--- db/version_set.cc | 2 +- include/rocksdb/perf_context.h | 22 +++++++++++++++++- table/merger.cc | 23 ++++++++++++++++--- table/merger.h | 3 ++- util/perf_context.cc | 15 ++++++++++++- 11 files changed, 203 insertions(+), 21 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 63cc906e7..3ab130093 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -22,6 +22,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include "rocksdb/statistics.h" +#include "rocksdb/perf_context.h" #include "port/port.h" #include "util/bit_set.h" #include "util/crc32c.h" @@ -350,6 +351,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" " this is greater than 0."); +DEFINE_int32(perf_level, 0, "Level of perf collection"); + static bool ValidateRateLimit(const char* flagname, double value) { static constexpr double EPSILON = 1e-10; if ( value < -EPSILON ) { @@ -689,6 +692,7 @@ struct SharedState { port::Mutex mu; port::CondVar cv; int total; + int perf_level; // Each thread goes through the following states: // (1) initializing @@ -700,7 +704,7 @@ struct SharedState { long num_done; bool start; - SharedState() : cv(&mu) { } + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } }; // Per-thread state for concurrent executions of the same benchmark. @@ -810,6 +814,7 @@ class Benchmark { fprintf(stdout, "Memtablerep: vector\n"); break; } + fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); PrintWarnings(); fprintf(stdout, "------------------------------------------------\n"); @@ -1150,6 +1155,7 @@ class Benchmark { } } + SetPerfLevel(static_cast (shared->perf_level)); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 5a2f0de4a..a4e28b032 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1035,7 +1035,7 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, (unsigned long)m->GetLogNumber()); list.push_back(m->NewIterator()); } - Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], + Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = @@ -2519,7 +2519,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect iterators for files in L0 - Ln versions_->current()->AddIterators(options, storage_options_, &list); Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); + NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); versions_->current()->Ref(); cleanup->mu = &mutex_; @@ -2555,6 +2555,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, Status s; StopWatch sw(env_, options_.statistics, DB_GET); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2583,15 +2585,23 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. LookupKey lkey(key, snapshot); + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); if (mem->Get(lkey, value, &s, &merge_operands, options_)) { // Done } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { // Done } else { + StopWatchNano from_files_timer(env_, false); + StartPerfTimer(&from_files_timer); + current->Get(options, lkey, value, &s, &merge_operands, &stats, options_, value_found); have_stat_update = true; + BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); } + + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); mutex_.Lock(); if (!options_.disable_seek_compaction && @@ -2607,6 +2617,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Note, tickers are atomic now - no lock protection needed any more. RecordTick(options_.statistics, NUMBER_KEYS_READ); RecordTick(options_.statistics, BYTES_READ, value->size()); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); + return s; } @@ -2615,6 +2627,8 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, std::vector* values) { StopWatch sw(env_, options_.statistics, DB_MULTIGET); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2646,6 +2660,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, // Keep track of bytes that we read for statistics-recording later uint64_t bytesRead = 0; + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). @@ -2672,6 +2687,8 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, } // Post processing (decrement reference counts and record statistics) + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); mutex_.Lock(); if (!options_.disable_seek_compaction && have_stat_update && current->UpdateStats(stats)) { @@ -2686,6 +2703,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys); RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return statList; } @@ -2754,6 +2772,8 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + StopWatchNano pre_post_process_timer(env_, false); + StartPerfTimer(&pre_post_process_timer); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -2800,12 +2820,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (options.disableWAL) { flush_on_destroy_ = true; } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); if (!options.disableWAL) { StopWatchNano timer(env_); StartPerfTimer(&timer); status = log_->AddRecord(WriteBatchInternal::Contents(updates)); - BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS); @@ -2815,10 +2836,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } + BumpPerfTime(&perf_context.write_wal_time, &timer); } if (status.ok()) { + StopWatchNano write_memtable_timer(env_, false); + StartPerfTimer(&write_memtable_timer); status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this, options_.filter_deletes); + BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer); if (!status.ok()) { // Panic for in-memory corruptions // Note that existing logic was not sound. Any partial failure writing @@ -2828,6 +2853,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence); } + StartPerfTimer(&pre_post_process_timer); LogFlush(options_.info_log); mutex_.Lock(); if (status.ok()) { @@ -2855,6 +2881,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!writers_.empty()) { writers_.front()->cv.Signal(); } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); return status; } diff --git a/db/db_iter.cc b/db/db_iter.cc index 4e3c52c6e..1dc44b93c 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -102,7 +102,8 @@ class DBIter: public Iterator { virtual void SeekToLast(); private: - void FindNextUserEntry(bool skipping); + inline void FindNextUserEntry(bool skipping); + void FindNextUserEntryInternal(bool skipping); void FindPrevUserEntry(); bool ParseKey(ParsedInternalKey* key); void MergeValuesNewToOld(); @@ -191,7 +192,15 @@ void DBIter::Next() { // // NOTE: In between, saved_key_ can point to a user key that has // a delete marker -void DBIter::FindNextUserEntry(bool skipping) { +inline void DBIter::FindNextUserEntry(bool skipping) { + StopWatchNano timer(env_, false); + StartPerfTimer(&timer); + FindNextUserEntryInternal(skipping); + BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); +} + +// Actual implementation of DBIter::FindNextUserEntry() +void DBIter::FindNextUserEntryInternal(bool skipping) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); @@ -431,7 +440,10 @@ void DBIter::Seek(const Slice& target) { saved_key_.clear(); AppendInternalKey( &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->Seek(saved_key_); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /*not skipping */); } else { @@ -442,7 +454,10 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToFirst(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -461,7 +476,10 @@ void DBIter::SeekToLast() { direction_ = kReverse; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToLast(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); FindPrevUserEntry(); } diff --git a/db/db_test.cc b/db/db_test.cc index aca07bcff..ed7425521 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -22,6 +22,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/perf_context.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -1215,7 +1216,13 @@ TEST(DBTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Seek("ax"); ASSERT_EQ(IterStatus(iter), "b->vb"); + + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Seek("b"); + ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("z"); ASSERT_EQ(IterStatus(iter), "(invalid)"); @@ -1230,7 +1237,12 @@ TEST(DBTest, IterMulti) { // Switch from forward to reverse iter->SeekToFirst(); iter->Next(); + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Next(); + ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); iter->Prev(); ASSERT_EQ(IterStatus(iter), "b->vb"); @@ -1590,22 +1602,42 @@ TEST(DBTest, NumImmutableMemTable) { std::string big_value(1000000, 'x'); std::string num; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k3"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); dbfull()->Flush(FlushOptions()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -1614,11 +1646,16 @@ TEST(DBTest, FLUSH) { Options options = CurrentOptions(); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); // this will now also flush the last 2 writes dbfull()->Flush(FlushOptions()); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + perf_context.Reset(); + Get("foo"); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + Reopen(); ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("bar")); @@ -1630,7 +1667,9 @@ TEST(DBTest, FLUSH) { Reopen(); ASSERT_EQ("v2", Get("bar")); + perf_context.Reset(); ASSERT_EQ("v2", Get("foo")); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); @@ -1642,6 +1681,8 @@ TEST(DBTest, FLUSH) { // has WAL enabled. ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("bar")); + + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } diff --git a/db/memtable.cc b/db/memtable.cc index 291899c21..1df7d6af2 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -19,6 +19,8 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" namespace std { template <> @@ -162,6 +164,9 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, std::deque* operands, const Options& options) { + StopWatchNano memtable_get_timer(options.env, false); + StartPerfTimer(&memtable_get_timer); + Slice memkey = key.memtable_key(); std::shared_ptr iter( table_->GetIterator(key.user_key())); @@ -175,7 +180,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, auto logger = options.info_log; std::string merge_result; - for (; iter->Valid(); iter->Next()) { + bool found_final_value = false; + for (; !found_final_value && iter->Valid(); iter->Next()) { // entry format is: // klength varint32 // userkey char[klength-8] @@ -212,7 +218,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (options.inplace_update_support) { GetLock(key.user_key())->Unlock(); } - return true; + found_final_value = true; + break; } case kTypeDeletion: { if (merge_in_progress) { @@ -226,7 +233,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } else { *s = Status::NotFound(Slice()); } - return true; + found_final_value = true; + break; } case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); @@ -260,10 +268,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // No change to value, since we have not yet found a Put/Delete - if (merge_in_progress) { + if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - return false; + BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); + BumpPerfCount(&perf_context.get_from_memtable_count); + return found_final_value; } bool MemTable::Update(SequenceNumber seq, ValueType type, diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 05416748d..2a6e6b7e4 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -174,6 +174,13 @@ void ProfileKeyComparison() { HistogramImpl hist_put; HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; @@ -192,16 +199,37 @@ void ProfileKeyComparison() { perf_context.Reset(); db->Put(write_options, key, value); + hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); + hist_write_wal_time.Add(perf_context.write_wal_time); + hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); perf_context.Reset(); db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); } std::cout << "Put uesr key comparison: \n" << hist_put.ToString() << "Get uesr key comparison: \n" << hist_get.ToString(); - + std::cout << "Put(): Pre and Post Process Time: \n" + << hist_write_pre_post.ToString() + << " Writing WAL time: \n" + << hist_write_wal_time.ToString() << "\n" + << " Writing Mem Table time: \n" + << hist_write_memtable_time.ToString() << "\n"; + + std::cout << "Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; } TEST(PerfContextTest, KeyComparisonCount) { @@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) { db->Put(write_options, key, value); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); - hist_wal_time.Add(perf_context.wal_write_time); - hist_time_diff.Add(put_time - perf_context.wal_write_time); + hist_wal_time.Add(perf_context.write_wal_time); + hist_time_diff.Add(put_time - perf_context.write_wal_time); } std::cout << "Put time:\n" << hist_put_time.ToString() diff --git a/db/version_set.cc b/db/version_set.cc index d554657b4..349abfbaa 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2091,7 +2091,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } } assert(num <= space); - Iterator* result = NewMergingIterator(&icmp_, list, num); + Iterator* result = NewMergingIterator(env_, &icmp_, list, num); delete[] list; return result; } diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 9e900e050..551ca8fe6 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -38,7 +38,27 @@ struct PerfContext { uint64_t internal_key_skipped_count; // total number of deletes skipped over during iteration uint64_t internal_delete_skipped_count; - uint64_t wal_write_time; // total time spent on writing to WAL + + uint64_t get_snapshot_time; // total time spent on getting snapshot + uint64_t get_from_memtable_time; // total time spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total time spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total time reading from output files + // total time spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total time spent on the merge heap + // total time spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total time spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + // total time spent on pre or post processing when writing a record + uint64_t write_pre_and_post_process_time; + uint64_t write_wal_time; // total time spent on writing to WAL + // total time spent on writing to mem tables + uint64_t write_memtable_time; }; extern __thread PerfContext perf_context; diff --git a/table/merger.cc b/table/merger.cc index f5ce7440c..f66aa74c3 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -11,8 +11,11 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" +#include "rocksdb/options.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" +#include "util/stop_watch.h" +#include "util/perf_context_imp.h" #include @@ -22,10 +25,12 @@ namespace { class MergingIterator : public Iterator { public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) + MergingIterator(Env* const env, const Comparator* comparator, + Iterator** children, int n) : comparator_(comparator), children_(n), current_(nullptr), + env_(env), direction_(kForward), maxHeap_(NewMaxIterHeap(comparator_)), minHeap_ (NewMinIterHeap(comparator_)) { @@ -71,14 +76,24 @@ class MergingIterator : public Iterator { virtual void Seek(const Slice& target) { ClearHeaps(); + StopWatchNano child_seek_timer(env_, false); + StopWatchNano min_heap_timer(env_, false); for (auto& child : children_) { + StartPerfTimer(&child_seek_timer); child.Seek(target); + BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); + BumpPerfCount(&perf_context.seek_child_seek_count); + if (child.Valid()) { + StartPerfTimer(&min_heap_timer); minHeap_.push(&child); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } } + StartPerfTimer(&min_heap_timer); FindSmallest(); direction_ = kForward; + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } virtual void Next() { @@ -178,6 +193,7 @@ class MergingIterator : public Iterator { const Comparator* comparator_; std::vector children_; IteratorWrapper* current_; + Env* const env_; // Which direction is the iterator moving? enum Direction { kForward, @@ -214,14 +230,15 @@ void MergingIterator::ClearHeaps() { } } // namespace -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { +Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, + Iterator** list, int n) { assert(n >= 0); if (n == 0) { return NewEmptyIterator(); } else if (n == 1) { return list[0]; } else { - return new MergingIterator(cmp, list, n); + return new MergingIterator(env, cmp, list, n); } } diff --git a/table/merger.h b/table/merger.h index dbc1f69eb..74f46ac9b 100644 --- a/table/merger.h +++ b/table/merger.h @@ -13,6 +13,7 @@ namespace rocksdb { class Comparator; class Iterator; +class Env; // Return an iterator that provided the union of the data in // children[0,n-1]. Takes ownership of the child iterators and @@ -23,6 +24,6 @@ class Iterator; // // REQUIRES: n >= 0 extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); + Env* const env, const Comparator* comparator, Iterator** children, int n); } // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc index 1e8ddfb5e..6833f6836 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -22,7 +22,20 @@ void PerfContext::Reset() { block_decompress_time = 0; internal_key_skipped_count = 0; internal_delete_skipped_count = 0; - wal_write_time = 0; + write_wal_time = 0; + + get_snapshot_time = 0; + get_from_memtable_time = 0; + get_from_memtable_count = 0; + get_post_process_time = 0; + get_from_output_files_time = 0; + seek_child_seek_time = 0; + seek_child_seek_count = 0; + seek_min_heap_time = 0; + seek_internal_seek_time = 0; + find_next_user_entry_time = 0; + write_pre_and_post_process_time = 0; + write_memtable_time = 0; } __thread PerfContext perf_context; From 3e35aa6412f2fcaa75dc607131caa773d315761b Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 21 Nov 2013 17:40:39 -0800 Subject: [PATCH 07/21] Revert "Allow users to profile a query and see bottleneck of the query" This reverts commit 3d8ac31d7168c916d6f2f0729eb627b07d8f082b. --- db/db_bench.cc | 8 +------ db/db_impl.cc | 34 +++------------------------- db/db_iter.cc | 22 ++---------------- db/db_test.cc | 41 ---------------------------------- db/memtable.cc | 20 +++++------------ db/perf_context_test.cc | 34 +++------------------------- db/version_set.cc | 2 +- include/rocksdb/perf_context.h | 22 +----------------- table/merger.cc | 23 +++---------------- table/merger.h | 3 +-- util/perf_context.cc | 15 +------------ 11 files changed, 21 insertions(+), 203 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 3ab130093..63cc906e7 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -22,7 +22,6 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include "rocksdb/statistics.h" -#include "rocksdb/perf_context.h" #include "port/port.h" #include "util/bit_set.h" #include "util/crc32c.h" @@ -351,8 +350,6 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" " this is greater than 0."); -DEFINE_int32(perf_level, 0, "Level of perf collection"); - static bool ValidateRateLimit(const char* flagname, double value) { static constexpr double EPSILON = 1e-10; if ( value < -EPSILON ) { @@ -692,7 +689,6 @@ struct SharedState { port::Mutex mu; port::CondVar cv; int total; - int perf_level; // Each thread goes through the following states: // (1) initializing @@ -704,7 +700,7 @@ struct SharedState { long num_done; bool start; - SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } + SharedState() : cv(&mu) { } }; // Per-thread state for concurrent executions of the same benchmark. @@ -814,7 +810,6 @@ class Benchmark { fprintf(stdout, "Memtablerep: vector\n"); break; } - fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); PrintWarnings(); fprintf(stdout, "------------------------------------------------\n"); @@ -1155,7 +1150,6 @@ class Benchmark { } } - SetPerfLevel(static_cast (shared->perf_level)); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); diff --git a/db/db_impl.cc b/db/db_impl.cc index a4e28b032..5a2f0de4a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1035,7 +1035,7 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, (unsigned long)m->GetLogNumber()); list.push_back(m->NewIterator()); } - Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0], + Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], list.size()); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = @@ -2519,7 +2519,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect iterators for files in L0 - Ln versions_->current()->AddIterators(options, storage_options_, &list); Iterator* internal_iter = - NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); + NewMergingIterator(&internal_comparator_, &list[0], list.size()); versions_->current()->Ref(); cleanup->mu = &mutex_; @@ -2555,8 +2555,6 @@ Status DBImpl::GetImpl(const ReadOptions& options, Status s; StopWatch sw(env_, options_.statistics, DB_GET); - StopWatchNano snapshot_timer(env_, false); - StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2585,23 +2583,15 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. LookupKey lkey(key, snapshot); - BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); if (mem->Get(lkey, value, &s, &merge_operands, options_)) { // Done } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { // Done } else { - StopWatchNano from_files_timer(env_, false); - StartPerfTimer(&from_files_timer); - current->Get(options, lkey, value, &s, &merge_operands, &stats, options_, value_found); have_stat_update = true; - BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); } - - StopWatchNano post_process_timer(env_, false); - StartPerfTimer(&post_process_timer); mutex_.Lock(); if (!options_.disable_seek_compaction && @@ -2617,8 +2607,6 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Note, tickers are atomic now - no lock protection needed any more. RecordTick(options_.statistics, NUMBER_KEYS_READ); RecordTick(options_.statistics, BYTES_READ, value->size()); - BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); - return s; } @@ -2627,8 +2615,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, std::vector* values) { StopWatch sw(env_, options_.statistics, DB_MULTIGET); - StopWatchNano snapshot_timer(env_, false); - StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2660,7 +2646,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, // Keep track of bytes that we read for statistics-recording later uint64_t bytesRead = 0; - BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). @@ -2687,8 +2672,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, } // Post processing (decrement reference counts and record statistics) - StopWatchNano post_process_timer(env_, false); - StartPerfTimer(&post_process_timer); mutex_.Lock(); if (!options_.disable_seek_compaction && have_stat_update && current->UpdateStats(stats)) { @@ -2703,7 +2686,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys); RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead); - BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return statList; } @@ -2772,8 +2754,6 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { - StopWatchNano pre_post_process_timer(env_, false); - StartPerfTimer(&pre_post_process_timer); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -2820,13 +2800,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (options.disableWAL) { flush_on_destroy_ = true; } - BumpPerfTime(&perf_context.write_pre_and_post_process_time, - &pre_post_process_timer); if (!options.disableWAL) { StopWatchNano timer(env_); StartPerfTimer(&timer); status = log_->AddRecord(WriteBatchInternal::Contents(updates)); + BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS); @@ -2836,14 +2815,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } - BumpPerfTime(&perf_context.write_wal_time, &timer); } if (status.ok()) { - StopWatchNano write_memtable_timer(env_, false); - StartPerfTimer(&write_memtable_timer); status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this, options_.filter_deletes); - BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer); if (!status.ok()) { // Panic for in-memory corruptions // Note that existing logic was not sound. Any partial failure writing @@ -2853,7 +2828,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence); } - StartPerfTimer(&pre_post_process_timer); LogFlush(options_.info_log); mutex_.Lock(); if (status.ok()) { @@ -2881,8 +2855,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!writers_.empty()) { writers_.front()->cv.Signal(); } - BumpPerfTime(&perf_context.write_pre_and_post_process_time, - &pre_post_process_timer); return status; } diff --git a/db/db_iter.cc b/db/db_iter.cc index 1dc44b93c..4e3c52c6e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -102,8 +102,7 @@ class DBIter: public Iterator { virtual void SeekToLast(); private: - inline void FindNextUserEntry(bool skipping); - void FindNextUserEntryInternal(bool skipping); + void FindNextUserEntry(bool skipping); void FindPrevUserEntry(); bool ParseKey(ParsedInternalKey* key); void MergeValuesNewToOld(); @@ -192,15 +191,7 @@ void DBIter::Next() { // // NOTE: In between, saved_key_ can point to a user key that has // a delete marker -inline void DBIter::FindNextUserEntry(bool skipping) { - StopWatchNano timer(env_, false); - StartPerfTimer(&timer); - FindNextUserEntryInternal(skipping); - BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); -} - -// Actual implementation of DBIter::FindNextUserEntry() -void DBIter::FindNextUserEntryInternal(bool skipping) { +void DBIter::FindNextUserEntry(bool skipping) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); @@ -440,10 +431,7 @@ void DBIter::Seek(const Slice& target) { saved_key_.clear(); AppendInternalKey( &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); - StopWatchNano internal_seek_timer(env_, false); - StartPerfTimer(&internal_seek_timer); iter_->Seek(saved_key_); - BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /*not skipping */); } else { @@ -454,10 +442,7 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; ClearSavedValue(); - StopWatchNano internal_seek_timer(env_, false); - StartPerfTimer(&internal_seek_timer); iter_->SeekToFirst(); - BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -476,10 +461,7 @@ void DBIter::SeekToLast() { direction_ = kReverse; ClearSavedValue(); - StopWatchNano internal_seek_timer(env_, false); - StartPerfTimer(&internal_seek_timer); iter_->SeekToLast(); - BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); FindPrevUserEntry(); } diff --git a/db/db_test.cc b/db/db_test.cc index ed7425521..aca07bcff 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -22,7 +22,6 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/table.h" -#include "rocksdb/perf_context.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -1216,13 +1215,7 @@ TEST(DBTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Seek("ax"); ASSERT_EQ(IterStatus(iter), "b->vb"); - - SetPerfLevel(kEnableTime); - perf_context.Reset(); iter->Seek("b"); - ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0); - ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); - SetPerfLevel(kDisable); ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("z"); ASSERT_EQ(IterStatus(iter), "(invalid)"); @@ -1237,12 +1230,7 @@ TEST(DBTest, IterMulti) { // Switch from forward to reverse iter->SeekToFirst(); iter->Next(); - SetPerfLevel(kEnableTime); - perf_context.Reset(); iter->Next(); - ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time); - ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); - SetPerfLevel(kDisable); iter->Prev(); ASSERT_EQ(IterStatus(iter), "b->vb"); @@ -1602,42 +1590,22 @@ TEST(DBTest, NumImmutableMemTable) { std::string big_value(1000000, 'x'); std::string num; - SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); - perf_context.Reset(); - Get("k1"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); - perf_context.Reset(); - Get("k1"); - ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get("k2"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); - perf_context.Reset(); - Get("k2"); - ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get("k3"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get("k1"); - ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); dbfull()->Flush(FlushOptions()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); - SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -1646,16 +1614,11 @@ TEST(DBTest, FLUSH) { Options options = CurrentOptions(); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; - SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); // this will now also flush the last 2 writes dbfull()->Flush(FlushOptions()); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); - perf_context.Reset(); - Get("foo"); - ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); - Reopen(); ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("bar")); @@ -1667,9 +1630,7 @@ TEST(DBTest, FLUSH) { Reopen(); ASSERT_EQ("v2", Get("bar")); - perf_context.Reset(); ASSERT_EQ("v2", Get("foo")); - ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); @@ -1681,8 +1642,6 @@ TEST(DBTest, FLUSH) { // has WAL enabled. ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("bar")); - - SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } diff --git a/db/memtable.cc b/db/memtable.cc index 1df7d6af2..291899c21 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -19,8 +19,6 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" -#include "util/perf_context_imp.h" -#include "util/stop_watch.h" namespace std { template <> @@ -164,9 +162,6 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, std::deque* operands, const Options& options) { - StopWatchNano memtable_get_timer(options.env, false); - StartPerfTimer(&memtable_get_timer); - Slice memkey = key.memtable_key(); std::shared_ptr iter( table_->GetIterator(key.user_key())); @@ -180,8 +175,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, auto logger = options.info_log; std::string merge_result; - bool found_final_value = false; - for (; !found_final_value && iter->Valid(); iter->Next()) { + for (; iter->Valid(); iter->Next()) { // entry format is: // klength varint32 // userkey char[klength-8] @@ -218,8 +212,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (options.inplace_update_support) { GetLock(key.user_key())->Unlock(); } - found_final_value = true; - break; + return true; } case kTypeDeletion: { if (merge_in_progress) { @@ -233,8 +226,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } else { *s = Status::NotFound(Slice()); } - found_final_value = true; - break; + return true; } case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); @@ -268,12 +260,10 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // No change to value, since we have not yet found a Put/Delete - if (!found_final_value && merge_in_progress) { + if (merge_in_progress) { *s = Status::MergeInProgress(""); } - BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); - BumpPerfCount(&perf_context.get_from_memtable_count); - return found_final_value; + return false; } bool MemTable::Update(SequenceNumber seq, ValueType type, diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 2a6e6b7e4..05416748d 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -174,13 +174,6 @@ void ProfileKeyComparison() { HistogramImpl hist_put; HistogramImpl hist_get; - HistogramImpl hist_get_snapshot; - HistogramImpl hist_get_memtable; - HistogramImpl hist_get_post_process; - HistogramImpl hist_num_memtable_checked; - HistogramImpl hist_write_pre_post; - HistogramImpl hist_write_wal_time; - HistogramImpl hist_write_memtable_time; std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; @@ -199,37 +192,16 @@ void ProfileKeyComparison() { perf_context.Reset(); db->Put(write_options, key, value); - hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); - hist_write_wal_time.Add(perf_context.write_wal_time); - hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); perf_context.Reset(); db->Get(read_options, key, &value); - hist_get_snapshot.Add(perf_context.get_snapshot_time); - hist_get_memtable.Add(perf_context.get_from_memtable_time); - hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); - hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); } std::cout << "Put uesr key comparison: \n" << hist_put.ToString() << "Get uesr key comparison: \n" << hist_get.ToString(); - std::cout << "Put(): Pre and Post Process Time: \n" - << hist_write_pre_post.ToString() - << " Writing WAL time: \n" - << hist_write_wal_time.ToString() << "\n" - << " Writing Mem Table time: \n" - << hist_write_memtable_time.ToString() << "\n"; - - std::cout << "Get(): Time to get snapshot: \n" - << hist_get_snapshot.ToString() - << " Time to get value from memtables: \n" - << hist_get_memtable.ToString() << "\n" - << " Number of memtables checked: \n" - << hist_num_memtable_checked.ToString() << "\n" - << " Time to post process: \n" - << hist_get_post_process.ToString() << "\n"; + } TEST(PerfContextTest, KeyComparisonCount) { @@ -287,8 +259,8 @@ TEST(PerfContextTest, SeekKeyComparison) { db->Put(write_options, key, value); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); - hist_wal_time.Add(perf_context.write_wal_time); - hist_time_diff.Add(put_time - perf_context.write_wal_time); + hist_wal_time.Add(perf_context.wal_write_time); + hist_time_diff.Add(put_time - perf_context.wal_write_time); } std::cout << "Put time:\n" << hist_put_time.ToString() diff --git a/db/version_set.cc b/db/version_set.cc index 349abfbaa..d554657b4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2091,7 +2091,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } } assert(num <= space); - Iterator* result = NewMergingIterator(env_, &icmp_, list, num); + Iterator* result = NewMergingIterator(&icmp_, list, num); delete[] list; return result; } diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 551ca8fe6..9e900e050 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -38,27 +38,7 @@ struct PerfContext { uint64_t internal_key_skipped_count; // total number of deletes skipped over during iteration uint64_t internal_delete_skipped_count; - - uint64_t get_snapshot_time; // total time spent on getting snapshot - uint64_t get_from_memtable_time; // total time spent on querying memtables - uint64_t get_from_memtable_count; // number of mem tables queried - // total time spent after Get() finds a key - uint64_t get_post_process_time; - uint64_t get_from_output_files_time; // total time reading from output files - // total time spent on seeking child iters - uint64_t seek_child_seek_time; - // number of seek issued in child iterators - uint64_t seek_child_seek_count; - uint64_t seek_min_heap_time; // total time spent on the merge heap - // total time spent on seeking the internal entries - uint64_t seek_internal_seek_time; - // total time spent on iterating internal entries to find the next user entry - uint64_t find_next_user_entry_time; - // total time spent on pre or post processing when writing a record - uint64_t write_pre_and_post_process_time; - uint64_t write_wal_time; // total time spent on writing to WAL - // total time spent on writing to mem tables - uint64_t write_memtable_time; + uint64_t wal_write_time; // total time spent on writing to WAL }; extern __thread PerfContext perf_context; diff --git a/table/merger.cc b/table/merger.cc index f66aa74c3..f5ce7440c 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -11,11 +11,8 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" -#include "rocksdb/options.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" -#include "util/stop_watch.h" -#include "util/perf_context_imp.h" #include @@ -25,12 +22,10 @@ namespace { class MergingIterator : public Iterator { public: - MergingIterator(Env* const env, const Comparator* comparator, - Iterator** children, int n) + MergingIterator(const Comparator* comparator, Iterator** children, int n) : comparator_(comparator), children_(n), current_(nullptr), - env_(env), direction_(kForward), maxHeap_(NewMaxIterHeap(comparator_)), minHeap_ (NewMinIterHeap(comparator_)) { @@ -76,24 +71,14 @@ class MergingIterator : public Iterator { virtual void Seek(const Slice& target) { ClearHeaps(); - StopWatchNano child_seek_timer(env_, false); - StopWatchNano min_heap_timer(env_, false); for (auto& child : children_) { - StartPerfTimer(&child_seek_timer); child.Seek(target); - BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); - BumpPerfCount(&perf_context.seek_child_seek_count); - if (child.Valid()) { - StartPerfTimer(&min_heap_timer); minHeap_.push(&child); - BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } } - StartPerfTimer(&min_heap_timer); FindSmallest(); direction_ = kForward; - BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } virtual void Next() { @@ -193,7 +178,6 @@ class MergingIterator : public Iterator { const Comparator* comparator_; std::vector children_; IteratorWrapper* current_; - Env* const env_; // Which direction is the iterator moving? enum Direction { kForward, @@ -230,15 +214,14 @@ void MergingIterator::ClearHeaps() { } } // namespace -Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, - Iterator** list, int n) { +Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { assert(n >= 0); if (n == 0) { return NewEmptyIterator(); } else if (n == 1) { return list[0]; } else { - return new MergingIterator(env, cmp, list, n); + return new MergingIterator(cmp, list, n); } } diff --git a/table/merger.h b/table/merger.h index 74f46ac9b..dbc1f69eb 100644 --- a/table/merger.h +++ b/table/merger.h @@ -13,7 +13,6 @@ namespace rocksdb { class Comparator; class Iterator; -class Env; // Return an iterator that provided the union of the data in // children[0,n-1]. Takes ownership of the child iterators and @@ -24,6 +23,6 @@ class Env; // // REQUIRES: n >= 0 extern Iterator* NewMergingIterator( - Env* const env, const Comparator* comparator, Iterator** children, int n); + const Comparator* comparator, Iterator** children, int n); } // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc index 6833f6836..1e8ddfb5e 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -22,20 +22,7 @@ void PerfContext::Reset() { block_decompress_time = 0; internal_key_skipped_count = 0; internal_delete_skipped_count = 0; - write_wal_time = 0; - - get_snapshot_time = 0; - get_from_memtable_time = 0; - get_from_memtable_count = 0; - get_post_process_time = 0; - get_from_output_files_time = 0; - seek_child_seek_time = 0; - seek_child_seek_count = 0; - seek_min_heap_time = 0; - seek_internal_seek_time = 0; - find_next_user_entry_time = 0; - write_pre_and_post_process_time = 0; - write_memtable_time = 0; + wal_write_time = 0; } __thread PerfContext perf_context; From 0c93df912e71781719c1a3373cc3c18c44c48910 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 21 Nov 2013 17:54:23 -0800 Subject: [PATCH 08/21] Improve the readability of the TableProperties::ToString() --- db/table_properties_collector.cc | 2 +- table/block_based_table_builder.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index d0cd1520d..3654663c1 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -74,7 +74,7 @@ std::string TableProperties::ToString( ); AppendProperty( result, - "(estimated) table size=", + "(estimated) table size", data_size + index_size + filter_size, prop_delim, kv_delim diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index a5bf216dc..88b7a5fc7 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -530,8 +530,8 @@ Status BlockBasedTableBuilder::Finish() { Log( r->options.info_log, "Table was constructed:\n" - " basic properties: %s\n" - " user collected properties: %s", + " [basic properties]: %s\n" + " [user collected properties]: %s", r->props.ToString().c_str(), user_collected.c_str() ); From 5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Fri, 22 Nov 2013 14:14:05 -0800 Subject: [PATCH 09/21] [RocksDB] Use raw pointer instead of shared pointer when passing Statistics object internally Summary: liveness of the statistics object is already ensured by the shared pointer in DB options. There's no reason to pass again shared pointer among internal functions. Raw pointer is sufficient and efficient. Test Plan: make check Reviewers: dhruba, MarkCallaghan, igor Reviewed By: dhruba CC: leveldb, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14289 --- db/builder.cc | 5 +- db/db_impl.cc | 75 +++++++++++++++++------------- db/db_iter.cc | 4 +- db/memtable.cc | 5 +- db/merge_helper.cc | 3 +- db/merge_helper.h | 4 +- db/table_cache.cc | 6 +-- db/version_set.cc | 12 +++-- db/write_batch.cc | 7 ++- include/rocksdb/statistics.h | 21 --------- table/block_based_table_builder.cc | 3 +- table/block_based_table_reader.cc | 20 ++++---- util/statistics_imp.h | 32 +++++++++++++ util/stop_watch.h | 10 ++-- 14 files changed, 117 insertions(+), 90 deletions(-) create mode 100644 util/statistics_imp.h diff --git a/db/builder.cc b/db/builder.cc index b3bf894ef..ad1334a15 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -112,6 +112,7 @@ Status BuildTable(const std::string& dbname, if (this_ikey.type == kTypeMerge) { // Handle merge-type keys using the MergeHelper + // TODO: pass statistics to MergeUntil merge.MergeUntil(iter, 0 /* don't worry about snapshot */); iterator_at_next = true; if (merge.IsSuccess()) { @@ -188,10 +189,10 @@ Status BuildTable(const std::string& dbname, // Finish and check for file errors if (s.ok() && !options.disableDataSync) { if (options.use_fsync) { - StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS); + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); s = file->Fsync(); } else { - StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS); + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); s = file->Sync(); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 5a2f0de4a..22dba5f2e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -404,7 +404,7 @@ const Status DBImpl::CreateArchivalDirectory() { } void DBImpl::PrintStatistics() { - auto dbstats = options_.statistics; + auto dbstats = options_.statistics.get(); if (dbstats) { Log(options_.info_log, "STATISTCS:\n %s", @@ -860,7 +860,7 @@ Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, if (versions_->LastSequence() < max_sequence) { versions_->SetLastSequence(max_sequence); } - SetTickerCount(options_.statistics, SEQUENCE_NUMBER, + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, versions_->LastSequence()); } } @@ -1297,7 +1297,7 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const { Status DBImpl::GetUpdatesSince(SequenceNumber seq, unique_ptr* iter) { - RecordTick(options_.statistics, GET_UPDATES_SINCE_CALLS); + RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS); if (seq > versions_->LastSequence()) { return Status::IOError("Requested sequence not yet written in the db"); } @@ -1971,10 +1971,12 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, // Finish and check for file errors if (s.ok() && !options_.disableDataSync) { if (options_.use_fsync) { - StopWatch sw(env_, options_.statistics, COMPACTION_OUTFILE_SYNC_MICROS); + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS); s = compact->outfile->Fsync(); } else { - StopWatch sw(env_, options_.statistics, COMPACTION_OUTFILE_SYNC_MICROS); + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS); s = compact->outfile->Sync(); } } @@ -2212,7 +2214,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, ParseInternalKey(key, &ikey); // no value associated with delete value.clear(); - RecordTick(options_.statistics, COMPACTION_KEY_DROP_USER); + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER); } else if (value_changed) { value = compaction_filter_value; } @@ -2238,7 +2240,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // TODO: why not > ? assert(last_sequence_for_key >= ikey.sequence); drop = true; // (A) - RecordTick(options_.statistics, COMPACTION_KEY_DROP_NEWER_ENTRY); + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY); } else if (ikey.type == kTypeDeletion && ikey.sequence <= earliest_snapshot && compact->compaction->IsBaseLevelForKey(ikey.user_key)) { @@ -2250,7 +2252,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // few iterations of this loop (by rule (A) above). // Therefore this deletion marker is obsolete and can be dropped. drop = true; - RecordTick(options_.statistics, COMPACTION_KEY_DROP_OBSOLETE); + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE); } else if (ikey.type == kTypeMerge) { // We know the merge type entry is not hidden, otherwise we would // have hit (A) @@ -2259,7 +2261,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // logic could also be nicely re-used for memtable flush purge // optimization in BuildTable. merge.MergeUntil(input.get(), prev_snapshot, bottommost_level, - options_.statistics); + options_.statistics.get()); current_entry_is_merging = true; if (merge.IsSuccess()) { // Successfully found Put/Delete/(end-of-key-range) while merging @@ -2412,8 +2414,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, CompactionStats stats; stats.micros = env_->NowMicros() - start_micros - imm_micros; - if (options_.statistics) { - options_.statistics->measureTime(COMPACTION_TIME, stats.micros); + if (options_.statistics.get()) { + options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros); } stats.files_in_leveln = compact->compaction->num_input_files(0); stats.files_in_levelnp1 = compact->compaction->num_input_files(1); @@ -2554,7 +2556,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, bool* value_found) { Status s; - StopWatch sw(env_, options_.statistics, DB_GET); + StopWatch sw(env_, options_.statistics.get(), DB_GET); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2605,8 +2607,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, LogFlush(options_.info_log); // Note, tickers are atomic now - no lock protection needed any more. - RecordTick(options_.statistics, NUMBER_KEYS_READ); - RecordTick(options_.statistics, BYTES_READ, value->size()); + RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); + RecordTick(options_.statistics.get(), BYTES_READ, value->size()); return s; } @@ -2614,7 +2616,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values) { - StopWatch sw(env_, options_.statistics, DB_MULTIGET); + StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2683,9 +2685,9 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, mutex_.Unlock(); LogFlush(options_.info_log); - RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS); - RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys); - RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead); return statList; } @@ -2760,7 +2762,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.disableWAL = options.disableWAL; w.done = false; - StopWatch sw(env_, options_.statistics, DB_WRITE); + StopWatch sw(env_, options_.statistics.get(), DB_WRITE); MutexLock l(&mutex_); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { @@ -2793,8 +2795,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { int my_batch_count = WriteBatchInternal::Count(updates); last_sequence += my_batch_count; // Record statistics - RecordTick(options_.statistics, NUMBER_KEYS_WRITTEN, my_batch_count); - RecordTick(options_.statistics, + RecordTick(options_.statistics.get(), + NUMBER_KEYS_WRITTEN, my_batch_count); + RecordTick(options_.statistics.get(), BYTES_WRITTEN, WriteBatchInternal::ByteSize(updates)); if (options.disableWAL) { @@ -2808,10 +2811,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { - StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS); + StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); } else { - StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS); + StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); status = log_->file()->Sync(); } } @@ -2826,7 +2829,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // have succeeded in memtable but Status reports error for all writes. throw std::runtime_error("In memory WriteBatch corruption!"); } - SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence); + SetTickerCount(options_.statistics.get(), + SEQUENCE_NUMBER, last_sequence); } LogFlush(options_.info_log); mutex_.Lock(); @@ -2975,7 +2979,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { mutex_.Unlock(); uint64_t delayed; { - StopWatch sw(env_, options_.statistics, STALL_L0_SLOWDOWN_COUNT); + StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); env_->SleepForMicroseconds( SlowdownAmount(versions_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, @@ -2983,7 +2987,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { ); delayed = sw.ElapsedMicros(); } - RecordTick(options_.statistics, STALL_L0_SLOWDOWN_MICROS, delayed); + RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); stall_level0_slowdown_ += delayed; stall_level0_slowdown_count_++; allow_delay = false; // Do not delay a single write more than once @@ -3003,12 +3007,13 @@ Status DBImpl::MakeRoomForWrite(bool force) { Log(options_.info_log, "wait for memtable compaction...\n"); uint64_t stall; { - StopWatch sw(env_, options_.statistics, + StopWatch sw(env_, options_.statistics.get(), STALL_MEMTABLE_COMPACTION_COUNT); bg_cv_.Wait(); stall = sw.ElapsedMicros(); } - RecordTick(options_.statistics, STALL_MEMTABLE_COMPACTION_MICROS, stall); + RecordTick(options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; } else if (versions_->NumLevelFiles(0) >= @@ -3018,11 +3023,12 @@ Status DBImpl::MakeRoomForWrite(bool force) { Log(options_.info_log, "wait for fewer level0 files...\n"); uint64_t stall; { - StopWatch sw(env_, options_.statistics, STALL_L0_NUM_FILES_COUNT); + StopWatch sw(env_, options_.statistics.get(), + STALL_L0_NUM_FILES_COUNT); bg_cv_.Wait(); stall = sw.ElapsedMicros(); } - RecordTick(options_.statistics, STALL_L0_NUM_FILES_MICROS, stall); + RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); stall_level0_num_files_ += stall; stall_level0_num_files_count_++; } else if ( @@ -3034,7 +3040,8 @@ Status DBImpl::MakeRoomForWrite(bool force) { mutex_.Unlock(); uint64_t delayed; { - StopWatch sw(env_, options_.statistics, HARD_RATE_LIMIT_DELAY_COUNT); + StopWatch sw(env_, options_.statistics.get(), + HARD_RATE_LIMIT_DELAY_COUNT); env_->SleepForMicroseconds(1000); delayed = sw.ElapsedMicros(); } @@ -3043,7 +3050,8 @@ Status DBImpl::MakeRoomForWrite(bool force) { // Make sure the following value doesn't round to zero. uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); rate_limit_delay_millis += rate_limit; - RecordTick(options_.statistics, RATE_LIMIT_DELAY_MILLIS, rate_limit); + RecordTick(options_.statistics.get(), + RATE_LIMIT_DELAY_MILLIS, rate_limit); if (options_.rate_limit_delay_max_milliseconds > 0 && rate_limit_delay_millis >= (unsigned)options_.rate_limit_delay_max_milliseconds) { @@ -3058,7 +3066,8 @@ Status DBImpl::MakeRoomForWrite(bool force) { // TODO: add statistics mutex_.Unlock(); { - StopWatch sw(env_, options_.statistics, SOFT_RATE_LIMIT_DELAY_COUNT); + StopWatch sw(env_, options_.statistics.get(), + SOFT_RATE_LIMIT_DELAY_COUNT); env_->SleepForMicroseconds(SlowdownAmount( score, options_.soft_rate_limit, diff --git a/db/db_iter.cc b/db/db_iter.cc index 4e3c52c6e..89eaf0949 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -69,7 +69,7 @@ class DBIter: public Iterator { direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(options.statistics) { + statistics_(options.statistics.get()) { RecordTick(statistics_, NO_ITERATORS, 1); max_skip_ = options.max_sequential_skip_in_iterations; } @@ -135,7 +135,7 @@ class DBIter: public Iterator { Direction direction_; bool valid_; bool current_entry_is_merged_; - std::shared_ptr statistics_; + Statistics* statistics_; uint64_t max_skip_; // No copying allowed diff --git a/db/memtable.cc b/db/memtable.cc index 291899c21..ac589a563 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -19,6 +19,7 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" +#include "util/statistics_imp.h" namespace std { template <> @@ -203,7 +204,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, assert(merge_operator); if (!merge_operator->FullMerge(key.user_key(), &v, *operands, value, logger.get())) { - RecordTick(options.statistics, NUMBER_MERGE_FAILURES); + RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); *s = Status::Corruption("Error: Could not perform merge."); } } else { @@ -220,7 +221,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, *s = Status::OK(); if (!merge_operator->FullMerge(key.user_key(), nullptr, *operands, value, logger.get())) { - RecordTick(options.statistics, NUMBER_MERGE_FAILURES); + RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); *s = Status::Corruption("Error: Could not perform merge."); } } else { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 9d757a5e6..a7e2df0a3 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -8,6 +8,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "util/statistics_imp.h" #include #include @@ -20,7 +21,7 @@ namespace rocksdb { // operands_ stores the list of merge operands encountered while merging. // keys_[i] corresponds to operands_[i] for each i. void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, - bool at_bottom, shared_ptr stats) { + bool at_bottom, Statistics* stats) { // Get a copy of the internal key, before it's invalidated by iter->Next() // Also maintain the list of merge operands seen. keys_.clear(); diff --git a/db/merge_helper.h b/db/merge_helper.h index 34e2edd94..6fe9bfb23 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -8,7 +8,6 @@ #include "db/dbformat.h" #include "rocksdb/slice.h" -#include "rocksdb/statistics.h" #include #include @@ -18,6 +17,7 @@ class Comparator; class Iterator; class Logger; class MergeOperator; +class Statistics; class MergeHelper { public: @@ -46,7 +46,7 @@ class MergeHelper { // at_bottom: (IN) true if the iterator covers the bottem level, which means // we could reach the start of the history of this user key. void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0, - bool at_bottom = false, shared_ptr stats=nullptr); + bool at_bottom = false, Statistics* stats = nullptr); // Query the merge result // These are valid until the next MergeUntil call diff --git a/db/table_cache.cc b/db/table_cache.cc index a1f466b5a..e18c20c99 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -65,12 +65,12 @@ Status TableCache::FindTable(const EnvOptions& toptions, unique_ptr file; unique_ptr table_reader; s = env_->NewRandomAccessFile(fname, &file, toptions); - RecordTick(options_->statistics, NO_FILE_OPENS); + RecordTick(options_->statistics.get(), NO_FILE_OPENS); if (s.ok()) { if (options_->advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } - StopWatch sw(env_, options_->statistics, TABLE_OPEN_IO_MICROS); + StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); s = options_->table_factory->GetTableReader(*options_, toptions, std::move(file), file_size, &table_reader); @@ -78,7 +78,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(options_->statistics, NO_FILE_ERRORS); + RecordTick(options_->statistics.get(), NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { diff --git a/db/version_set.cc b/db/version_set.cc index d554657b4..95db3e477 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -290,7 +290,7 @@ struct Saver { std::deque* merge_operands; // the merge operations encountered Logger* logger; bool didIO; // did we do any disk io? - shared_ptr statistics; + Statistics* statistics; }; } @@ -439,7 +439,7 @@ void Version::Get(const ReadOptions& options, saver.merge_operands = operands; saver.logger = logger.get(); saver.didIO = false; - saver.statistics = db_options.statistics; + saver.statistics = db_options.statistics.get(); stats->seek_file = nullptr; stats->seek_file_level = -1; @@ -566,7 +566,7 @@ void Version::Get(const ReadOptions& options, value, logger.get())) { *status = Status::OK(); } else { - RecordTick(db_options.statistics, NUMBER_MERGE_FAILURES); + RecordTick(db_options.statistics.get(), NUMBER_MERGE_FAILURES); *status = Status::Corruption("could not perform end-of-key merge for ", user_key); } @@ -1296,10 +1296,12 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, } if (s.ok()) { if (options_->use_fsync) { - StopWatch sw(env_, options_->statistics, MANIFEST_FILE_SYNC_MICROS); + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Fsync(); } else { - StopWatch sw(env_, options_->statistics, MANIFEST_FILE_SYNC_MICROS); + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Sync(); } } diff --git a/db/write_batch.cc b/db/write_batch.cc index 134cfb63c..c04930bbf 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -20,15 +20,14 @@ // data: uint8[len] #include "rocksdb/write_batch.h" - #include "rocksdb/options.h" -#include "rocksdb/statistics.h" #include "db/dbformat.h" #include "db/db_impl.h" #include "db/memtable.h" #include "db/snapshot.h" #include "db/write_batch_internal.h" #include "util/coding.h" +#include "util/statistics_imp.h" #include namespace rocksdb { @@ -197,7 +196,7 @@ class MemTableInserter : public WriteBatch::Handler { virtual void Put(const Slice& key, const Slice& value) { if (options_->inplace_update_support && mem_->Update(sequence_, kTypeValue, key, value)) { - RecordTick(options_->statistics, NUMBER_KEYS_UPDATED); + RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); } else { mem_->Add(sequence_, kTypeValue, key, value); } @@ -215,7 +214,7 @@ class MemTableInserter : public WriteBatch::Handler { ropts.snapshot = &read_from_snapshot; std::string value; if (!db_->KeyMayExist(ropts, key, &value)) { - RecordTick(options_->statistics, NUMBER_FILTERED_DELETES); + RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES); return; } } diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 7d6a53ff8..102a4be58 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -276,27 +276,6 @@ class Statistics { // Create a concrete DBStatistics object std::shared_ptr CreateDBStatistics(); -// Ease of Use functions -inline void RecordTick(std::shared_ptr statistics, - Tickers ticker, - uint64_t count = 1) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->recordTick(ticker, count); - } -} - -inline void SetTickerCount(std::shared_ptr statistics, - Tickers ticker, - uint64_t count) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->setTickerCount(ticker, count); - } -} - } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 88b7a5fc7..f846b1ffd 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -272,7 +272,8 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle) { Rep* r = rep_; - StopWatch sw(r->options.env, r->options.statistics, WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->options.env, r->options.statistics.get(), + WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 5a2690103..095c2999c 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -200,7 +200,7 @@ Cache::Handle* GetFromBlockCache( const Slice& key, Tickers block_cache_miss_ticker, Tickers block_cache_hit_ticker, - std::shared_ptr statistics) { + Statistics* statistics) { auto cache_handle = block_cache->Lookup(key); if (cache_handle != nullptr) { BumpPerfCount(&perf_context.block_cache_hit_count); @@ -515,7 +515,7 @@ Status BlockBasedTable::GetBlock( CachableEntry* entry) { bool no_io = options.read_tier == kBlockCacheTier; Cache* block_cache = table->rep_->options.block_cache.get(); - auto statistics = table->rep_->options.statistics; + Statistics* statistics = table->rep_->options.statistics.get(); Status s; if (block_cache != nullptr) { @@ -532,7 +532,7 @@ Status BlockBasedTable::GetBlock( key, block_cache_miss_ticker, block_cache_hit_ticker, - table->rep_->options.statistics + statistics ); if (entry->cache_handle != nullptr) { @@ -593,7 +593,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg, Cache* block_cache = table->rep_->options.block_cache.get(); Cache* block_cache_compressed = table->rep_->options. block_cache_compressed.get(); - std::shared_ptr statistics = table->rep_->options.statistics; + Statistics* statistics = table->rep_->options.statistics.get(); Block* block = nullptr; Block* cblock = nullptr; Cache::Handle* cache_handle = nullptr; @@ -791,12 +791,13 @@ BlockBasedTable::GetFilter(bool no_io) const { cache_key ); + Statistics* statistics = rep_->options.statistics.get(); auto cache_handle = GetFromBlockCache( block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, - rep_->options.statistics + statistics ); FilterBlockReader* filter = nullptr; @@ -824,7 +825,7 @@ BlockBasedTable::GetFilter(bool no_io) const { cache_handle = block_cache->Insert( key, filter, filter_size, &DeleteCachedFilter); - RecordTick(rep_->options.statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_ADD); } } } @@ -945,9 +946,10 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) { filter_entry.Release(rep_->options.block_cache.get()); } - RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_CHECKED); + Statistics* statistics = rep_->options.statistics.get(); + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { - RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_USEFUL); + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } return may_match; @@ -997,7 +999,7 @@ Status BlockBasedTable::Get( // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. - RecordTick(rep_->options.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); break; } else { bool didIO = false; diff --git a/util/statistics_imp.h b/util/statistics_imp.h new file mode 100644 index 000000000..0dc8884c1 --- /dev/null +++ b/util/statistics_imp.h @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/statistics.h" + +namespace rocksdb { + +// Utility functions +inline void RecordTick(Statistics* statistics, + Tickers ticker, + uint64_t count = 1) { + assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); + assert(TickersNameMap.size() == TICKER_ENUM_MAX); + if (statistics) { + statistics->recordTick(ticker, count); + } +} + +inline void SetTickerCount(Statistics* statistics, + Tickers ticker, + uint64_t count) { + assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); + assert(TickersNameMap.size() == TICKER_ENUM_MAX); + if (statistics) { + statistics->setTickerCount(ticker, count); + } +} + +} diff --git a/util/stop_watch.h b/util/stop_watch.h index f251b6bc1..e36bcb7ec 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -5,16 +5,16 @@ // #pragma once #include "rocksdb/env.h" -#include "rocksdb/statistics.h" +#include "util/statistics_imp.h" namespace rocksdb { // Auto-scoped. // Records the statistic into the corresponding histogram. class StopWatch { public: - StopWatch( + explicit StopWatch( Env * const env, - std::shared_ptr statistics = nullptr, + Statistics* statistics = nullptr, const Histograms histogram_name = DB_GET) : env_(env), start_time_(env->NowMicros()), @@ -36,7 +36,7 @@ class StopWatch { private: Env* const env_; const uint64_t start_time_; - std::shared_ptr statistics_; + Statistics* statistics_; const Histograms histogram_name_; }; @@ -44,7 +44,7 @@ class StopWatch { // a nano second precision stopwatch class StopWatchNano { public: - StopWatchNano(Env* const env, bool auto_start = false) + explicit StopWatchNano(Env* const env, bool auto_start = false) : env_(env), start_(0) { if (auto_start) { Start(); From 299f5c76bb3e9c85e9c9b2a81822823795adbe0e Mon Sep 17 00:00:00 2001 From: Dhruba Borthakur Date: Fri, 22 Nov 2013 12:52:33 -0800 Subject: [PATCH 10/21] Create new log file outside the dbmutex. Summary: All filesystem Io should be done outside the dbmutex. There was one place when we have to roll the transaction log that we were creating the new log file while holding the dbmutex. I rearranged this code so that the act of creating the new transaction log file is done without holding the dbmutex. I also allocate the new memtable outside the dbmutex, this is important because creating the memtable could be heavyweight. Test Plan: make check and dbstress Reviewers: haobo, igor Reviewed By: haobo CC: leveldb, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14283 --- db/db_impl.cc | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 22dba5f2e..f8d5f446b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3077,27 +3077,40 @@ Status DBImpl::MakeRoomForWrite(bool force) { } allow_soft_rate_limit_delay = false; mutex_.Lock(); + } else { - // Attempt to switch to a new memtable and trigger compaction of old - DelayLoggingAndReset(); + unique_ptr lfile; + MemTable* memtmp = nullptr; + + // Attempt to switch to a new memtable and trigger compaction of old. + // Do this without holding the dbmutex lock. assert(versions_->PrevLogNumber() == 0); uint64_t new_log_number = versions_->NewFileNumber(); - unique_ptr lfile; - EnvOptions soptions(storage_options_); - soptions.use_mmap_writes = false; - s = env_->NewWritableFile( + mutex_.Unlock(); + { + EnvOptions soptions(storage_options_); + soptions.use_mmap_writes = false; + DelayLoggingAndReset(); + s = env_->NewWritableFile( LogFileName(options_.wal_dir, new_log_number), &lfile, soptions ); + if (s.ok()) { + // Our final size should be less than write_buffer_size + // (compression, etc) but err on the side of caution. + lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); + memtmp = new MemTable( + internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + } + } + mutex_.Lock(); if (!s.ok()) { // Avoid chewing through file number space in a tight loop. versions_->ReuseFileNumber(new_log_number); + assert (!memtmp); break; } - // Our final size should be less than write_buffer_size - // (compression, etc) but err on the side of caution. - lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); logfile_number_ = new_log_number; log_.reset(new log::Writer(std::move(lfile))); mem_->SetNextLogNumber(logfile_number_); @@ -3105,8 +3118,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { if (force) { imm_.FlushRequested(); } - mem_ = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + mem_ = memtmp; mem_->Ref(); Log(options_.info_log, "New memtable created with log file: #%lu\n", From 11c26bd4a40d13a63250344c37df116a981ee9f9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 25 Nov 2013 12:39:23 -0800 Subject: [PATCH 11/21] [RocksDB] Interface changes required for BackupableDB Summary: This is part of https://reviews.facebook.net/D14295 -- smaller diff that is easier to review Test Plan: make asan_check Reviewers: dhruba, haobo, emayanke Reviewed By: emayanke CC: leveldb, kailiu, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14301 --- db/db_impl.cc | 4 ++ db/db_impl.h | 1 + db/db_test.cc | 4 ++ include/rocksdb/db.h | 3 ++ include/utilities/stackable_db.h | 82 ++++++++++++++------------------ utilities/ttl/db_ttl.cc | 4 ++ utilities/ttl/db_ttl.h | 4 +- 7 files changed, 54 insertions(+), 48 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index f8d5f446b..1f28af324 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3131,6 +3131,10 @@ Status DBImpl::MakeRoomForWrite(bool force) { return s; } +Env* DBImpl::GetEnv() const { + return env_; +} + bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); diff --git a/db/db_impl.h b/db/db_impl.h index dc4c20a51..fdd0a2520 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -67,6 +67,7 @@ class DBImpl : public DB { virtual int NumberLevels(); virtual int MaxMemCompactionLevel(); virtual int Level0StopWriteTrigger(); + virtual Env* GetEnv() const; virtual Status Flush(const FlushOptions& options); virtual Status DisableFileDeletions(); virtual Status EnableFileDeletions(); diff --git a/db/db_test.cc b/db/db_test.cc index aca07bcff..5e257483e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4462,6 +4462,10 @@ class ModelDB: public DB { return -1; } + virtual Env* GetEnv() const { + return nullptr; + } + virtual Status Flush(const rocksdb::FlushOptions& options) { Status ret; return ret; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 260202091..0ec7ffba7 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -228,6 +228,9 @@ class DB { // Number of files in level-0 that would stop writes. virtual int Level0StopWriteTrigger() = 0; + // Get Env object from the DB + virtual Env* GetEnv() const = 0; + // Flush all mem-table data. virtual Status Flush(const FlushOptions& options) = 0; diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index f15a22e12..b1e308b20 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -10,152 +10,140 @@ namespace rocksdb { // This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d class StackableDB : public DB { public: - explicit StackableDB(StackableDB* sdb) : sdb_(sdb) {} + // StackableDB is the owner of db now! + explicit StackableDB(DB* db) : db_(db) {} - // Returns the DB object that is the lowermost component in the stack of DBs - virtual DB* GetRawDB() { - return sdb_->GetRawDB(); + ~StackableDB() { + delete db_; } - // convert a DB to StackableDB - // TODO: This function does not work yet. Passing nullptr to StackableDB in - // NewStackableDB's constructor will cause segfault on object's usage - static StackableDB* DBToStackableDB(DB* db) { - class NewStackableDB : public StackableDB { - public: - NewStackableDB(DB* db) - : StackableDB(nullptr), - db_(db) {} - - DB* GetRawDB() { - return db_; - } - - private: - DB* db_; - }; - return new NewStackableDB(db); + virtual DB* GetBaseDB() { + return db_; } virtual Status Put(const WriteOptions& options, const Slice& key, const Slice& val) override { - return sdb_->Put(options, key, val); + return db_->Put(options, key, val); } virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) override { - return sdb_->Get(options, key, value); + return db_->Get(options, key, value); } virtual std::vector MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values) override { - return sdb_->MultiGet(options, keys, values); + return db_->MultiGet(options, keys, values); } virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found = nullptr) override { - return sdb_->KeyMayExist(options, key, value, value_found); + return db_->KeyMayExist(options, key, value, value_found); } virtual Status Delete(const WriteOptions& wopts, const Slice& key) override { - return sdb_->Delete(wopts, key); + return db_->Delete(wopts, key); } virtual Status Merge(const WriteOptions& options, const Slice& key, const Slice& value) override { - return sdb_->Merge(options, key, value); + return db_->Merge(options, key, value); } virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { - return sdb_->Write(opts, updates); + return db_->Write(opts, updates); } virtual Iterator* NewIterator(const ReadOptions& opts) override { - return sdb_->NewIterator(opts); + return db_->NewIterator(opts); } virtual const Snapshot* GetSnapshot() override { - return sdb_->GetSnapshot(); + return db_->GetSnapshot(); } virtual void ReleaseSnapshot(const Snapshot* snapshot) override { - return sdb_->ReleaseSnapshot(snapshot); + return db_->ReleaseSnapshot(snapshot); } virtual bool GetProperty(const Slice& property, std::string* value) override { - return sdb_->GetProperty(property, value); + return db_->GetProperty(property, value); } virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) override { - return sdb_->GetApproximateSizes(r, n, sizes); + return db_->GetApproximateSizes(r, n, sizes); } virtual void CompactRange(const Slice* begin, const Slice* end, bool reduce_level = false, int target_level = -1) override { - return sdb_->CompactRange(begin, end, reduce_level, target_level); + return db_->CompactRange(begin, end, reduce_level, target_level); } virtual int NumberLevels() override { - return sdb_->NumberLevels(); + return db_->NumberLevels(); } virtual int MaxMemCompactionLevel() override { - return sdb_->MaxMemCompactionLevel(); + return db_->MaxMemCompactionLevel(); } virtual int Level0StopWriteTrigger() override { - return sdb_->Level0StopWriteTrigger(); + return db_->Level0StopWriteTrigger(); + } + + virtual Env* GetEnv() const override { + return db_->GetEnv(); } virtual Status Flush(const FlushOptions& fopts) override { - return sdb_->Flush(fopts); + return db_->Flush(fopts); } virtual Status DisableFileDeletions() override { - return sdb_->DisableFileDeletions(); + return db_->DisableFileDeletions(); } virtual Status EnableFileDeletions() override { - return sdb_->EnableFileDeletions(); + return db_->EnableFileDeletions(); } virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, bool flush_memtable = true) override { - return sdb_->GetLiveFiles(vec, mfs, flush_memtable); + return db_->GetLiveFiles(vec, mfs, flush_memtable); } virtual SequenceNumber GetLatestSequenceNumber() const override { - return sdb_->GetLatestSequenceNumber(); + return db_->GetLatestSequenceNumber(); } virtual Status GetSortedWalFiles(VectorLogPtr& files) override { - return sdb_->GetSortedWalFiles(files); + return db_->GetSortedWalFiles(files); } virtual Status DeleteFile(std::string name) override { - return sdb_->DeleteFile(name); + return db_->DeleteFile(name); } virtual Status GetUpdatesSince(SequenceNumber seq_number, unique_ptr* iter) override { - return sdb_->GetUpdatesSince(seq_number, iter); + return db_->GetUpdatesSince(seq_number, iter); } protected: - StackableDB* sdb_; + DB* db_; }; } // namespace rocksdb diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index a019102d9..127a2e566 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -254,6 +254,10 @@ int DBWithTTL::Level0StopWriteTrigger() { return db_->Level0StopWriteTrigger(); } +Env* DBWithTTL::GetEnv() const { + return db_->GetEnv(); +} + Status DBWithTTL::Flush(const FlushOptions& fopts) { return db_->Flush(fopts); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index ffee0ccf2..d5c51dd3f 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -67,6 +67,8 @@ class DBWithTTL : public StackableDB { virtual int Level0StopWriteTrigger(); + virtual Env* GetEnv() const; + virtual Status Flush(const FlushOptions& fopts); virtual Status DisableFileDeletions(); @@ -88,7 +90,7 @@ class DBWithTTL : public StackableDB { // Simulate a db crash, no elegant closing of database. void TEST_Destroy_DBWithTtl(); - virtual DB* GetRawDB() { + virtual DB* GetBaseDB() { return db_; } From e37221f56b9e077173df024fd8e8a5d5c118a35b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 25 Nov 2013 15:34:23 -0800 Subject: [PATCH 12/21] Memtable regression test Summary: The old regression tests didn't cover memtable part at all. This is an atempt to also measure memtable performance in regression tests. Test Plan: Ran regression_build_test.sh Reviewers: dhruba, haobo, kailiu, emayanke Reviewed By: dhruba CC: leveldb, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14325 --- build_tools/regression_build_test.sh | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 25a96d655..2e8343c5c 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -22,6 +22,7 @@ function cleanup { rm -f $STAT_FILE.fillseq rm -f $STAT_FILE.readrandom rm -f $STAT_FILE.overwrite + rm -f $STAT_FILE.memtablefillreadrandom } trap cleanup EXIT @@ -39,7 +40,7 @@ function send_to_ods { } make clean -make db_bench -j$(nproc) +OPT=-DNDEBUG make db_bench -j$(nproc) ./db_bench \ --benchmarks=fillseq \ @@ -80,7 +81,7 @@ make db_bench -j$(nproc) --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --reads=$((NUM / 100)) \ + --reads=$NUM \ --cache_size=6442450944 \ --cache_numshardbits=6 \ --open_files=55000 \ @@ -91,10 +92,33 @@ make db_bench -j$(nproc) --sync=0 \ --threads=128 > ${STAT_FILE}.readrandom +./db_bench \ + --benchmarks=fillrandom,readrandom, \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$((NUM / 10)) \ + --reads=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --value_size=10 \ + --threads=32 > ${STAT_FILE}.memtablefillreadrandom + OVERWRITE_OPS=$(awk '/overwrite/ {print $5}' $STAT_FILE.overwrite) FILLSEQ_OPS=$(awk '/fillseq/ {print $5}' $STAT_FILE.fillseq) READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.readrandom) +MEMTABLE_FILLRANDOM_OPS=$(awk '/fillrandom/ {print $5}' $STAT_FILE.memtablefillreadrandom) +MEMTABLE_READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.memtablefillreadrandom) send_to_ods rocksdb.build.overwrite.qps $OVERWRITE_OPS send_to_ods rocksdb.build.fillseq.qps $FILLSEQ_OPS send_to_ods rocksdb.build.readrandom.qps $READRANDOM_OPS +send_to_ods rocksdb.build.memtablefillrandom.qps $MEMTABLE_FILLRANDOM_OPS +send_to_ods rocksdb.build.memtablereadrandom.qps $MEMTABLE_READRANDOM_OPS From 793fdd6731a8df7423f2b82e4f7ad35bad5cd7c0 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 25 Nov 2013 15:49:02 -0800 Subject: [PATCH 13/21] We should compile with -fPIC on non-fbcode environments also --- build_tools/build_detect_platform | 2 +- build_tools/fbcode.gcc471.sh | 2 +- build_tools/fbcode.gcc481.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 5b3d58c8c..c3fb8e3f1 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -75,7 +75,7 @@ if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` fi -COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" +COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS} -fPIC" CROSS_COMPILE= PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" diff --git a/build_tools/fbcode.gcc471.sh b/build_tools/fbcode.gcc471.sh index 015c512ab..e8a0cdeaa 100644 --- a/build_tools/fbcode.gcc471.sh +++ b/build_tools/fbcode.gcc471.sh @@ -51,7 +51,7 @@ CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic -fPIC" +CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT" diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index 6d8b9c766..7ca337cf2 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -59,7 +59,7 @@ CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCL AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic -fPIC" +CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" From 3ce36584111e236e6d7170f0c0dc9adc4b1f949e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 25 Nov 2013 15:51:50 -0800 Subject: [PATCH 14/21] DB::GetOptions() Summary: We need access to options for BackupableDB Test Plan: make check Reviewers: dhruba Reviewed By: dhruba CC: leveldb, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14331 --- db/db_impl.cc | 4 ++++ db/db_impl.h | 1 + db/db_test.cc | 4 ++++ include/rocksdb/db.h | 3 +++ include/utilities/stackable_db.h | 4 ++++ utilities/ttl/db_ttl.cc | 4 ++++ utilities/ttl/db_ttl.h | 2 ++ 7 files changed, 22 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index 1f28af324..2d85d7014 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3135,6 +3135,10 @@ Env* DBImpl::GetEnv() const { return env_; } +const Options& DBImpl::GetOptions() const { + return options_; +} + bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); diff --git a/db/db_impl.h b/db/db_impl.h index fdd0a2520..8a57b92f5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -68,6 +68,7 @@ class DBImpl : public DB { virtual int MaxMemCompactionLevel(); virtual int Level0StopWriteTrigger(); virtual Env* GetEnv() const; + virtual const Options& GetOptions() const; virtual Status Flush(const FlushOptions& options); virtual Status DisableFileDeletions(); virtual Status EnableFileDeletions(); diff --git a/db/db_test.cc b/db/db_test.cc index 5e257483e..1e85ab1fa 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4466,6 +4466,10 @@ class ModelDB: public DB { return nullptr; } + virtual const Options& GetOptions() const { + return options_; + } + virtual Status Flush(const rocksdb::FlushOptions& options) { Status ret; return ret; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0ec7ffba7..73f9ac4da 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -231,6 +231,9 @@ class DB { // Get Env object from the DB virtual Env* GetEnv() const = 0; + // Get DB Options that we use + virtual const Options& GetOptions() const = 0; + // Flush all mem-table data. virtual Status Flush(const FlushOptions& options) = 0; diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index b1e308b20..dc26ed852 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -107,6 +107,10 @@ class StackableDB : public DB { return db_->GetEnv(); } + virtual const Options& GetOptions() const override { + return db_->GetOptions(); + } + virtual Status Flush(const FlushOptions& fopts) override { return db_->Flush(fopts); } diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index 127a2e566..abe7408a6 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -258,6 +258,10 @@ Env* DBWithTTL::GetEnv() const { return db_->GetEnv(); } +const Options& DBWithTTL::GetOptions() const { + return db_->GetOptions(); +} + Status DBWithTTL::Flush(const FlushOptions& fopts) { return db_->Flush(fopts); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index d5c51dd3f..d09bae966 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -69,6 +69,8 @@ class DBWithTTL : public StackableDB { virtual Env* GetEnv() const; + virtual const Options& GetOptions() const; + virtual Status Flush(const FlushOptions& fopts); virtual Status DisableFileDeletions(); From 27bbef11802d27c80df7e0b27091876df23b9986 Mon Sep 17 00:00:00 2001 From: Dhruba Borthakur Date: Mon, 25 Nov 2013 11:55:36 -0800 Subject: [PATCH 15/21] Free obsolete memtables outside the dbmutex. Summary: Large memory allocations and frees are costly and best done outside the db-mutex. The memtables are already allocated outside the db-mutex but they were being freed while holding the db-mutex. This patch frees obsolete memtables outside the db-mutex. Test Plan: make check db_stress Unit tests pass, I am in the process of running stress tests. Reviewers: haobo, igor, emayanke Reviewed By: haobo CC: reconnect.grayhat, leveldb Differential Revision: https://reviews.facebook.net/D14319 --- db/db_impl.cc | 50 +++++++++++++++++++++++++++++++++++++--------- db/memtable.h | 11 ++++++---- db/memtablelist.cc | 11 +++++++--- db/memtablelist.h | 6 ++++-- 4 files changed, 60 insertions(+), 18 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 2d85d7014..2abdd9107 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -300,6 +300,9 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) } DBImpl::~DBImpl() { + std::vector to_delete; + to_delete.reserve(options_.max_write_buffer_number); + // Wait for background work to finish if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) { FlushMemTable(FlushOptions()); @@ -317,8 +320,14 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } - if (mem_ != nullptr) mem_->Unref(); - imm_.UnrefAll(); + if (mem_ != nullptr) { + delete mem_->Unref(); + } + + imm_.UnrefAll(&to_delete); + for (MemTable* m: to_delete) { + delete m; + } LogFlush(options_.info_log); } @@ -954,7 +963,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // file-systems cause the DB::Open() to fail. break; } - mem->Unref(); + delete mem->Unref(); mem = nullptr; } } @@ -965,7 +974,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // file-systems cause the DB::Open() to fail. } - if (mem != nullptr && !external_table) mem->Unref(); + if (mem != nullptr && !external_table) { + delete mem->Unref(); + } return status; } @@ -2480,9 +2491,14 @@ struct IterState { static void CleanupIteratorState(void* arg1, void* arg2) { IterState* state = reinterpret_cast(arg1); + std::vector to_delete; + to_delete.reserve(state->mem.size()); state->mu->Lock(); for (unsigned int i = 0; i < state->mem.size(); i++) { - state->mem[i]->Unref(); + MemTable* m = state->mem[i]->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } } state->version->Unref(); // delete only the sst obsolete files @@ -2491,6 +2507,9 @@ static void CleanupIteratorState(void* arg1, void* arg2) { state->db->FindObsoleteFiles(deletion_state, false, true); state->mu->Unlock(); state->db->PurgeObsoleteFiles(deletion_state); + + // delete obsolete memtables outside the db-mutex + for (MemTable* m : to_delete) delete m; delete state; } } // namespace @@ -2558,6 +2577,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, StopWatch sw(env_, options_.statistics.get(), DB_GET); SequenceNumber snapshot; + std::vector to_delete; + to_delete.reserve(options_.max_write_buffer_number); mutex_.Lock(); if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; @@ -2600,11 +2621,15 @@ Status DBImpl::GetImpl(const ReadOptions& options, have_stat_update && current->UpdateStats(stats)) { MaybeScheduleFlushOrCompaction(); } - mem->Unref(); - imm.UnrefAll(); + MemTable* m = mem->Unref(); + imm.UnrefAll(&to_delete); current->Unref(); mutex_.Unlock(); + // free up all obsolete memtables outside the mutex + delete m; + for (MemTable* v: to_delete) delete v; + LogFlush(options_.info_log); // Note, tickers are atomic now - no lock protection needed any more. RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); @@ -2618,6 +2643,9 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET); SequenceNumber snapshot; + std::vector to_delete; + to_delete.reserve(options_.max_write_buffer_number); + mutex_.Lock(); if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; @@ -2679,11 +2707,15 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, have_stat_update && current->UpdateStats(stats)) { MaybeScheduleFlushOrCompaction(); } - mem->Unref(); - imm.UnrefAll(); + MemTable* m = mem->Unref(); + imm.UnrefAll(&to_delete); current->Unref(); mutex_.Unlock(); + // free up all obsolete memtables outside the mutex + delete m; + for (MemTable* v: to_delete) delete v; + LogFlush(options_.info_log); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys); diff --git a/db/memtable.h b/db/memtable.h index 93b9b7e2c..5648b7716 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -39,16 +39,20 @@ class MemTable { int numlevel = 7, const Options& options = Options()); + ~MemTable(); + // Increase reference count. void Ref() { ++refs_; } - // Drop reference count. Delete if no more references exist. - void Unref() { + // Drop reference count. + // If the refcount goes to zero return this memtable, otherwise return null + MemTable* Unref() { --refs_; assert(refs_ >= 0); if (refs_ <= 0) { - delete this; + return this; } + return nullptr; } // Returns an estimate of the number of bytes of data in use by this @@ -129,7 +133,6 @@ class MemTable { void MarkImmutable() { table_->MarkReadOnly(); } private: - ~MemTable(); // Private since only Unref() should be used to delete it friend class MemTableIterator; friend class MemTableBackwardIterator; friend class MemTableList; diff --git a/db/memtablelist.cc b/db/memtablelist.cc index 3f2a88592..4453d1721 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -28,10 +28,15 @@ void MemTableList::RefAll() { } } -// Drop reference count on all underling memtables -void MemTableList::UnrefAll() { +// Drop reference count on all underling memtables. If the +// refcount of an underlying memtable drops to zero, then +// return it in to_delete vector. +void MemTableList::UnrefAll(std::vector* to_delete) { for (auto &memtable : memlist_) { - memtable->Unref(); + MemTable* m = memtable->Unref(); + if (m != nullptr) { + to_delete->push_back(m); + } } } diff --git a/db/memtablelist.h b/db/memtablelist.h index 20ea9ecda..ef10526c9 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -44,8 +44,10 @@ class MemTableList { // Increase reference count on all underling memtables void RefAll(); - // Drop reference count on all underling memtables - void UnrefAll(); + // Drop reference count on all underling memtables. If the refcount + // on an underlying memtable drops to zero, then return it in + // to_delete vector. + void UnrefAll(std::vector* to_delete); // Returns the total number of memtables in the list int size(); From fd4eca73e75282cf89b6de7f6f3bb4cf2b0acbc6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 25 Nov 2013 21:21:01 -0800 Subject: [PATCH 16/21] fPIC in x64 environment Summary: Check https://github.com/facebook/rocksdb/pull/15 for context. Apparently [1], we need -fPIC in x64 environments (this is added only in non-fbcode). In fbcode, I removed -fPIC per @dhruba's suggestion, since it introduces perf regression. I'm not sure what would are the implications of doing that, but looks like it works, and when releasing to the third-party, we're disabling -fPIC either way [2]. Would love a suggestion from someone who knows more about this [1] http://eli.thegreenplace.net/2011/11/11/position-independent-code-pic-in-shared-libraries-on-x64/ [2] https://our.intern.facebook.com/intern/wiki/index.php/Database/RocksDB/Third_Party Test Plan: make check works Reviewers: dhruba, emayanke, kailiu Reviewed By: dhruba CC: leveldb, dhruba, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14337 --- build_tools/build_detect_platform | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index c3fb8e3f1..59e2e4619 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -75,7 +75,7 @@ if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` fi -COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS} -fPIC" +COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" CROSS_COMPILE= PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" @@ -174,6 +174,12 @@ if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Also don't need any compilation tests if compiling on fbcode true else + # do fPIC on 64 bit in non-fbcode environment + case "$TARGET_OS" in + x86_64) + PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -fPIC" + esac + # If -std=c++0x works, use . Otherwise use port_posix.h. $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < From 06844ab381e2a13862c07676e1c4a29cf1a0bb6a Mon Sep 17 00:00:00 2001 From: Kangmo Kim Date: Tue, 26 Nov 2013 16:30:52 +0900 Subject: [PATCH 17/21] Added missing component : gflags in Linux platform. Added missing component : gflags in Linux platform. --- INSTALL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/INSTALL.md b/INSTALL.md index f56323eee..ed1cbfba1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -25,6 +25,7 @@ libraries. You are on your own. `sudo apt-get install libsnappy-dev`. * Install zlib. Try: `sudo apt-get install zlib1g-dev`. * Install bzip2: `sudo apt-get install libbz2-dev`. + * Install gflags: `sudo apt-get install libgflags-dev`. * **OS X**: * Install latest C++ compiler that supports C++ 11: * Update XCode: run `xcode-select --install` (or install it from XCode App's settting). From 8478f380a0fd70081d8943867ea4f627dcd190f0 Mon Sep 17 00:00:00 2001 From: Dhruba Borthakur Date: Mon, 25 Nov 2013 22:04:29 -0800 Subject: [PATCH 18/21] During benchmarking, I see excessive use of vector.reserve(). Summary: This code path can potentially accumulate multiple important_files for level 0. But for other levels, it should have only one file in the important_files, so it is ok not to reserve excessive space, is it not? Test Plan: make check Reviewers: haobo Reviewed By: haobo CC: reconnect.grayhat, leveldb Differential Revision: https://reviews.facebook.net/D14349 --- db/version_set.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/db/version_set.cc b/db/version_set.cc index 95db3e477..adee80d04 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -458,7 +458,9 @@ void Version::Get(const ReadOptions& options, // Get the list of files to search in this level FileMetaData* const* files = &files_[level][0]; important_files.clear(); - important_files.reserve(num_files); + if (level == 0) { + important_files.reserve(num_files); + } // Some files may overlap each other. We find // all files that overlap user_key and process them in order from From 5ebc6b0f0b088a17e83b2b0372d42407fed24d97 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 26 Nov 2013 16:27:31 -0800 Subject: [PATCH 19/21] [rocksdb] Regression tests Summary: * Fixed regression test params by @dhruba's suggestion * Added p50, p75 and p99 to regression metrics Test Plan: build_tools/build_regression_test.sh Reviewers: dhruba, emayanke Reviewed By: dhruba CC: leveldb, dhruba, reconnect.grayhat Differential Revision: https://reviews.facebook.net/D14355 --- build_tools/regression_build_test.sh | 86 +++++++++++++++++++--------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 2e8343c5c..6ede47466 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -27,21 +27,10 @@ function cleanup { trap cleanup EXIT -function send_to_ods { - key="$1" - value="$2" - - if [ -z "$value" ];then - echo >&2 "ERROR: Key $key doesn't have a value." - return - fi - curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ - --connect-timeout 60 -} - make clean OPT=-DNDEBUG make db_bench -j$(nproc) +# measure fillseq + fill up the DB for overwrite benchmark ./db_bench \ --benchmarks=fillseq \ --db=$DATA_DIR \ @@ -58,6 +47,7 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --disable_wal=1 \ --sync=0 > ${STAT_FILE}.fillseq +# measure overwrite performance ./db_bench \ --benchmarks=overwrite \ --db=$DATA_DIR \ @@ -75,6 +65,25 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --sync=0 \ --threads=8 > ${STAT_FILE}.overwrite +# fill up the db for readrandom benchmark +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# measure readrandom ./db_bench \ --benchmarks=readrandom \ --db=$DATA_DIR \ @@ -83,15 +92,17 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$NUM \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=8 \ --open_files=55000 \ + --disable_seek_compaction=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ --disable_wal=1 \ --sync=0 \ - --threads=128 > ${STAT_FILE}.readrandom + --threads=32 > ${STAT_FILE}.readrandom +# measure memtable performance -- none of the data gets flushed to disk ./db_bench \ --benchmarks=fillrandom,readrandom, \ --db=$DATA_DIR \ @@ -99,7 +110,7 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$((NUM / 10)) \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=8 \ --write_buffer_size=1000000000 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -111,14 +122,37 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --value_size=10 \ --threads=32 > ${STAT_FILE}.memtablefillreadrandom -OVERWRITE_OPS=$(awk '/overwrite/ {print $5}' $STAT_FILE.overwrite) -FILLSEQ_OPS=$(awk '/fillseq/ {print $5}' $STAT_FILE.fillseq) -READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.readrandom) -MEMTABLE_FILLRANDOM_OPS=$(awk '/fillrandom/ {print $5}' $STAT_FILE.memtablefillreadrandom) -MEMTABLE_READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.memtablefillreadrandom) - -send_to_ods rocksdb.build.overwrite.qps $OVERWRITE_OPS -send_to_ods rocksdb.build.fillseq.qps $FILLSEQ_OPS -send_to_ods rocksdb.build.readrandom.qps $READRANDOM_OPS -send_to_ods rocksdb.build.memtablefillrandom.qps $MEMTABLE_FILLRANDOM_OPS -send_to_ods rocksdb.build.memtablereadrandom.qps $MEMTABLE_READRANDOM_OPS +# send data to ods +function send_to_ods { + key="$1" + value="$2" + + if [ -z "$value" ];then + echo >&2 "ERROR: Key $key doesn't have a value." + return + fi + curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ + --connect-timeout 60 +} + +function send_benchmark_to_ods { + bench="$1" + bench_key="$2" + file="$3" + + QPS=$(grep $bench $file | awk '{print $5}') + P50_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $3}' ) + P75_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $5}' ) + P99_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $7}' ) + + send_to_ods rocksdb.build.$bench_key.qps $QPS + send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS + send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS + send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS +} + +send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite +send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq +send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom +send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom From 4a3583e18e3474a4f08d3517e20bca600354c30f Mon Sep 17 00:00:00 2001 From: Isamu Arimoto Date: Thu, 28 Nov 2013 03:57:16 +0900 Subject: [PATCH 20/21] Fix typo. --- doc/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.html b/doc/index.html index 8c0c9de5a..84c4d132a 100644 --- a/doc/index.html +++ b/doc/index.html @@ -80,7 +80,7 @@ Such problems can be avoided by using the WriteBatch class to atomically apply a set of updates:

-  #include "leveldb/write_batch.h"
+  #include "rocksdb/write_batch.h"
   ...
   std::string value;
   rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);

From 4c81383628db46d35b674000a3668b5a9a2498a6 Mon Sep 17 00:00:00 2001
From: lovro 
Date: Tue, 26 Nov 2013 18:00:43 -0800
Subject: [PATCH 21/21] Set background thread name with pthread_setname_np()

Summary: Makes it easier to monitor performance with top

Test Plan: ./manual_compaction_test with `top -H` running.  Previously was two `manual_compacti`, now one shows `rocksdb:bg0`.

Reviewers: igor, dhruba

Reviewed By: igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14367
---
 util/env_posix.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/util/env_posix.cc b/util/env_posix.cc
index 356008225..16c3d1c61 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -1396,6 +1396,15 @@ class PosixEnv : public Env {
         fprintf(stdout,
                 "Created bg thread 0x%lx\n",
                 (unsigned long)t);
+
+        // Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) && (__GLIBC_PREREQ(2, 12))
+        char name_buf[16];
+        snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size());
+        name_buf[sizeof name_buf - 1] = '\0';
+        pthread_setname_np(t, name_buf);
+#endif
+
         bgthreads_.push_back(t);
       }