From bde1c1a72afe365240004d5b75d61004618110bb Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Sat, 7 Apr 2018 21:55:42 -0700 Subject: [PATCH] WritePrepared Txn: add stats Summary: Adding some stats that would be helpful to monitor if the DB has gone to unlikely stats that would hurt the performance. These are mostly when we end up needing to acquire a mutex. Closes https://github.com/facebook/rocksdb/pull/3683 Differential Revision: D7529393 Pulled By: maysamyabandeh fbshipit-source-id: f7d36279a8f39bd84d8ddbf64b5c97f670c5d6d9 --- db/db_impl.h | 22 ++++++++++++------- db/memtable.cc | 6 ++--- include/rocksdb/statistics.h | 16 ++++++++++++++ include/rocksdb/utilities/transaction_db.h | 7 +++--- .../transactions/write_prepared_txn_db.cc | 18 +++++++-------- .../transactions/write_prepared_txn_db.h | 8 +++++-- 6 files changed, 51 insertions(+), 26 deletions(-) diff --git a/db/db_impl.h b/db/db_impl.h index 2d386faa7..f7af54349 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -691,14 +691,20 @@ class DBImpl : public DB { void EraseThreadStatusDbInfo() const; // If disable_memtable is set the application logic must guarantee that the - // batch will still be skipped from memtable during the recovery. In - // WriteCommitted it is guarnateed since disable_memtable is used for prepare - // batch which will be written to memtable later during the commit, and in - // WritePrepared it is guaranteed since it will be used only for WAL markers - // which will never be written to memtable. - // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates - // the number of sub-patches. A sub-patch is a subset of the write batch that - // does not have duplicate keys. + // batch will still be skipped from memtable during the recovery. 
An exception + to this is seq_per_batch_ mode, in which since each batch already takes one + seq, it is ok for the batch to write to memtable during recovery as long as + it only takes one sequence number: i.e., no duplicate keys. + In WriteCommitted it is guaranteed since disable_memtable is used for + prepare batch which will be written to memtable later during the commit, + and in WritePrepared it is guaranteed since it will be used only for WAL + markers which will never be written to memtable. If the commit marker is + accompanied with CommitTimeWriteBatch that is not written to memtable as + long as it has no duplicate keys, it does not violate the one-seq-per-batch + policy. + batch_cnt is expected to be non-zero in seq_per_batch mode and + indicates the number of sub-patches. A sub-patch is a subset of the write + batch that does not have duplicate keys. Status WriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, diff --git a/db/memtable.cc b/db/memtable.cc index 467790135..f2d2881d9 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -479,12 +479,12 @@ bool MemTable::Add(SequenceNumber s, ValueType type, insert_with_hint_prefix_extractor_->InDomain(key_slice)) { Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); - if (!res) { + if (UNLIKELY(!res)) { return res; } } else { bool res = table->InsertKey(handle); - if (!res) { + if (UNLIKELY(!res)) { return res; } } @@ -520,7 +520,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, UpdateFlushState(); } else { bool res = table->InsertKeyConcurrently(handle); - if (!res) { + if (UNLIKELY(!res)) { return res; } diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index c91a86af5..828a4d0ea 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ 
-309,6 +309,17 @@ enum Tickers : uint32_t { // # of bytes in the blob files evicted because of BlobDB is full. BLOB_DB_FIFO_BYTES_EVICTED, + // These counters indicate a performance issue in WritePrepared transactions. + // We should not see them ticking much. + // # of times prepare_mutex_ is acquired in the fast path. + TXN_PREPARE_MUTEX_OVERHEAD, + // # of times old_commit_map_mutex_ is acquired in the fast path. + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + // # of times we checked a batch for duplicate keys. + TXN_DUPLICATE_KEY_OVERHEAD, + // # of times snapshot_mutex_ is acquired in the fast path. + TXN_SNAPSHOT_MUTEX_OVERHEAD, + TICKER_ENUM_MAX }; @@ -455,6 +466,11 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"}, + {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"}, + {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + "rocksdb.txn.overhead.mutex.old.commit.map"}, + {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, + {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, }; /** diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 1482b2466..856f48996 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -98,9 +98,10 @@ struct TransactionOptions { bool deadlock_detect = false; // If set, it states that the CommitTimeWriteBatch represents the latest state - // of the application and meant to be used later during recovery. It enables - // an optimization to postpone updating the memtable with CommitTimeWriteBatch - // to only SwitchMemtable or recovery. + // of the application, has only one sub-batch, i.e., no duplicate keys, and + // meant to be used later during recovery. 
It enables an optimization to + postpone updating the memtable with CommitTimeWriteBatch to only + SwitchMemtable or recovery. bool use_only_the_last_commit_time_batch_for_recovery = false; // TODO(agiardullo): TransactionDB does not yet support comparators that allow diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 89a60d537..13acd0a27 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -131,9 +131,9 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, auto s = batch->Iterate(&counter); assert(s.ok()); batch_cnt = counter.BatchCount(); - // TODO(myabandeh): replace me with a stat - ROCKS_LOG_WARN(info_log_, "Duplicate key overhead: %" PRIu64 " batches", - static_cast<uint64_t>(batch_cnt)); + WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD); + ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches", + static_cast<uint64_t>(batch_cnt)); } assert(batch_cnt); @@ -506,7 +506,6 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { auto to_be_popped = prepared_txns_.top(); delayed_prepared_.insert(to_be_popped); - // TODO(myabandeh): also add a stat ROCKS_LOG_WARN(info_log_, "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 " new_max=%" PRIu64 " oldmax=%" PRIu64, @@ -574,14 +573,14 @@ void WritePreparedTxnDB::ReleaseSnapshotInternal( // old_commit_map_. Check and do garbage collection if that is the case. 
bool need_gc = false; { - // TODO(myabandeh): also add a stat + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); ReadLock rl(&old_commit_map_mutex_); auto prep_set_entry = old_commit_map_.find(snap_seq); need_gc = prep_set_entry != old_commit_map_.end(); } if (need_gc) { - // TODO(myabandeh): also add a stat + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); WriteLock wl(&old_commit_map_mutex_); old_commit_map_.erase(snap_seq); @@ -601,8 +600,7 @@ void WritePreparedTxnDB::UpdateSnapshots( #ifndef NDEBUG size_t sync_i = 0; #endif - // TODO(myabandeh): replace me with a stat - ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead"); + ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead"); WriteLock wl(&snapshots_mutex_); snapshots_version_ = version; // We update the list concurrently with the readers. @@ -682,7 +680,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE && snapshot_seq < evicted.prep_seq)) { // Then access the less efficient list of snapshots_ - // TODO(myabandeh): also add a stat + WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD); ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead"); ReadLock rl(&snapshots_mutex_); // Items could have moved from the snapshots_ to snapshot_cache_ before @@ -716,8 +714,8 @@ bool WritePreparedTxnDB::MaybeUpdateOldCommitMap( } // then snapshot_seq < commit_seq if (prep_seq <= snapshot_seq) { // overlapping range + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); - // TODO(myabandeh): also add a stat WriteLock wl(&old_commit_map_mutex_); old_commit_map_empty_.store(false, std::memory_order_release); auto& vec = old_commit_map_[snapshot_seq]; diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h 
index b620db077..70a62b03f 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -144,8 +144,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { } if (!delayed_prepared_empty_.load(std::memory_order_acquire)) { // We should not normally reach here + WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD); ReadLock rl(&prepared_mutex_); - // TODO(myabandeh): also add a stat ROCKS_LOG_WARN(info_log_, "prepared_mutex_ overhead %" PRIu64, static_cast<uint64_t>(delayed_prepared_.size())); if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) { @@ -216,7 +216,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // We should not normally reach here unless sapshot_seq is old. This is a // rare case and it is ok to pay the cost of mutex ReadLock for such old, // reading transactions. - // TODO(myabandeh): also add a stat + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); ReadLock rl(&old_commit_map_mutex_); auto prep_set_entry = old_commit_map_.find(snapshot_seq); @@ -381,6 +381,10 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { void Init(const TransactionDBOptions& /* unused */); + void WPRecordTick(uint32_t ticker_type) const { + RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type); + } + // A heap with the amortized O(1) complexity for erase. It uses one extra heap // to keep track of erased entries that are not yet on top of the main heap. class PreparedHeap {