reduce db mutex contention for write batch groups

Summary:
This diff allows a Writer to join the next write batch group
without acquiring any locks. Waiting is performed via a per-Writer mutex,
so all of the non-leader writers never need to acquire the db mutex.
It is now possible to join a write batch group after the leader has been
chosen but before the batch has been constructed. This diff doesn't
increase parallelism, but reduces synchronization overheads.

For some CPU-bound workloads (no WAL, RAM-sized working set) this can
substantially reduce contention on the db mutex in a multi-threaded
environment.  With T=8 N=500000 in a CPU-bound scenario (see the test
plan) this is good for a 33% perf win.  Not all scenarios see such a
win, but none show a loss.  This code is slightly faster even for the
single-threaded case (about 2% for the CPU-bound scenario below).

Test Plan:
1. unit tests
2. COMPILE_WITH_TSAN=1 make check
3. stress high-contention scenarios with db_bench -benchmarks=fillrandom -threads=$T -batch_size=1 -memtablerep=skip_list -value_size=0 --num=$N -level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999 -disable_auto_compactions --max_write_buffer_number=8 -max_background_flushes=8 --disable_wal --write_buffer_size=160000000

Reviewers: sdong, igor, rven, ljin, yhchiang

Subscribers: dhruba

Differential Revision: https://reviews.facebook.net/D43887
main
Nathan Bronson 10 years ago
parent 603b6da8b8
commit b7198c3afe
  1. 56
      db/db_impl.cc
  2. 7
      db/db_impl_debug.cc
  3. 180
      db/write_thread.cc
  4. 141
      db/write_thread.h

@ -2131,14 +2131,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
return Status::OK(); return Status::OK();
} }
WriteThread::Writer w(&mutex_); WriteThread::Writer w;
write_thread_.EnterWriteThread(&w); write_thread_.EnterUnbatched(&w, &mutex_);
assert(!w.done); // Nobody should do our job
// SwitchMemtable() will release and reacquire mutex // SwitchMemtable() will release and reacquire mutex
// during execution // during execution
s = SwitchMemtable(cfd, &context); s = SwitchMemtable(cfd, &context);
write_thread_.ExitWriteThread(&w, &w, s); write_thread_.ExitUnbatched(&w);
cfd->imm()->FlushRequested(); cfd->imm()->FlushRequested();
@ -3073,15 +3072,14 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
// ColumnFamilyData object // ColumnFamilyData object
Options opt(db_options_, cf_options); Options opt(db_options_, cf_options);
{ // write thread { // write thread
WriteThread::Writer w(&mutex_); WriteThread::Writer w;
write_thread_.EnterWriteThread(&w); write_thread_.EnterUnbatched(&w, &mutex_);
assert(!w.done); // Nobody should do our job
// LogAndApply will both write the creation in MANIFEST and create // LogAndApply will both write the creation in MANIFEST and create
// ColumnFamilyData object // ColumnFamilyData object
s = versions_->LogAndApply( s = versions_->LogAndApply(
nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit,
&mutex_, directories_.GetDbDir(), false, &cf_options); &mutex_, directories_.GetDbDir(), false, &cf_options);
write_thread_.ExitWriteThread(&w, &w, s); write_thread_.ExitUnbatched(&w);
} }
if (s.ok()) { if (s.ok()) {
single_column_family_mode_ = false; single_column_family_mode_ = false;
@ -3135,12 +3133,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
} }
if (s.ok()) { if (s.ok()) {
// we drop column family from a single write thread // we drop column family from a single write thread
WriteThread::Writer w(&mutex_); WriteThread::Writer w;
write_thread_.EnterWriteThread(&w); write_thread_.EnterUnbatched(&w, &mutex_);
assert(!w.done); // Nobody should do our job
s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
&edit, &mutex_); &edit, &mutex_);
write_thread_.ExitWriteThread(&w, &w, s); write_thread_.ExitUnbatched(&w);
} }
if (!cf_support_snapshot) { if (!cf_support_snapshot) {
@ -3442,7 +3439,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
} }
PERF_TIMER_GUARD(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_pre_and_post_process_time);
WriteThread::Writer w(&mutex_); WriteThread::Writer w;
w.batch = my_batch; w.batch = my_batch;
w.sync = write_options.sync; w.sync = write_options.sync;
w.disableWAL = write_options.disableWAL; w.disableWAL = write_options.disableWAL;
@ -3456,6 +3453,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE); StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE);
write_thread_.JoinBatchGroup(&w);
if (w.done) {
// write was done by someone else, no need to grab mutex
RecordTick(stats_, WRITE_DONE_BY_OTHER);
return w.status;
}
// else we are the leader of the write batch group
WriteContext context; WriteContext context;
mutex_.Lock(); mutex_.Lock();
@ -3463,15 +3468,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1); default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
} }
write_thread_.EnterWriteThread(&w);
if (w.done) { // write was done by someone else
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
1);
mutex_.Unlock();
RecordTick(stats_, WRITE_DONE_BY_OTHER);
return w.status;
}
RecordTick(stats_, WRITE_DONE_BY_SELF); RecordTick(stats_, WRITE_DONE_BY_SELF);
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1); default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
@ -3560,8 +3556,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
bool need_log_dir_sync = need_log_sync && !log_dir_synced_; bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
if (status.ok()) { if (status.ok()) {
last_batch_group_size_ = last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader(
write_thread_.BuildBatchGroup(&last_writer, &write_batch_group); &w, &last_writer, &write_batch_group);
if (need_log_sync) { if (need_log_sync) {
while (logs_.front().getting_synced) { while (logs_.front().getting_synced) {
@ -3702,10 +3698,20 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
MarkLogsSynced(logfile_number_, need_log_dir_sync, status); MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
} }
write_thread_.ExitWriteThread(&w, last_writer, status); uint64_t writes_for_other = write_batch_group.size() - 1;
if (writes_for_other > 0) {
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
writes_for_other);
if (!write_options.disableWAL) {
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL,
writes_for_other);
}
}
mutex_.Unlock(); mutex_.Unlock();
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status);
return status; return status;
} }

@ -131,15 +131,14 @@ void DBImpl::TEST_UnlockMutex() {
} }
void* DBImpl::TEST_BeginWrite() { void* DBImpl::TEST_BeginWrite() {
auto w = new WriteThread::Writer(&mutex_); auto w = new WriteThread::Writer();
write_thread_.EnterWriteThread(w); write_thread_.EnterUnbatched(w, &mutex_);
assert(!w->done); // Nobody should do our job
return reinterpret_cast<void*>(w); return reinterpret_cast<void*>(w);
} }
void DBImpl::TEST_EndWrite(void* w) { void DBImpl::TEST_EndWrite(void* w) {
auto writer = reinterpret_cast<WriteThread::Writer*>(w); auto writer = reinterpret_cast<WriteThread::Writer*>(w);
write_thread_.ExitWriteThread(writer, writer, Status::OK()); write_thread_.ExitUnbatched(writer);
delete writer; delete writer;
} }

@ -7,82 +7,102 @@
namespace rocksdb { namespace rocksdb {
void WriteThread::EnterWriteThread(WriteThread::Writer* w) { void WriteThread::Await(Writer* w) {
// the following code block pushes the current writer "w" into the writer std::unique_lock<std::mutex> guard(w->JoinMutex());
// queue "writers_" and wait until one of the following conditions met: w->JoinCV().wait(guard, [w] { return w->joined; });
// 1. the job of "w" has been done by some other writers. }
// 2. "w" becomes the first writer in "writers_"
// 3. "w" timed-out. void WriteThread::MarkJoined(Writer* w) {
writers_.push_back(w); std::lock_guard<std::mutex> guard(w->JoinMutex());
assert(!w->joined);
while (!w->done && w != writers_.front()) { w->joined = true;
w->cv.Wait(); w->JoinCV().notify_one();
}
void WriteThread::LinkOne(Writer* w, bool* wait_needed) {
assert(!w->joined && !w->done);
Writer* writers = newest_writer_.load(std::memory_order_relaxed);
while (true) {
w->link_older = writers;
if (writers != nullptr) {
w->CreateMutex();
}
if (newest_writer_.compare_exchange_strong(writers, w)) {
// Success.
*wait_needed = (writers != nullptr);
return;
}
} }
} }
void WriteThread::ExitWriteThread(WriteThread::Writer* w, void WriteThread::CreateMissingNewerLinks(Writer* head) {
WriteThread::Writer* last_writer, while (true) {
Status status) { Writer* next = head->link_older;
// Pop out the current writer and all writers being pushed before the if (next == nullptr || next->link_newer != nullptr) {
// current writer from the writer queue. assert(next == nullptr || next->link_newer == head);
while (!writers_.empty()) { break;
Writer* ready = writers_.front();
writers_.pop_front();
if (ready != w) {
ready->status = status;
ready->done = true;
ready->cv.Signal();
} }
if (ready == last_writer) break; next->link_newer = head;
head = next;
} }
}
// Notify new head of write queue void WriteThread::JoinBatchGroup(Writer* w) {
if (!writers_.empty()) { assert(w->batch != nullptr);
writers_.front()->cv.Signal(); bool wait_needed;
LinkOne(w, &wait_needed);
if (wait_needed) {
Await(w);
} }
} }
// This function will be called only when the first writer succeeds. size_t WriteThread::EnterAsBatchGroupLeader(
// All writers in the to-be-built batch group will be processed. Writer* leader, WriteThread::Writer** last_writer,
//
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-nullptr batch
size_t WriteThread::BuildBatchGroup(
WriteThread::Writer** last_writer,
autovector<WriteBatch*>* write_batch_group) { autovector<WriteBatch*>* write_batch_group) {
assert(!writers_.empty()); assert(leader->link_older == nullptr);
Writer* first = writers_.front(); assert(leader->batch != nullptr);
assert(first->batch != nullptr);
size_t size = WriteBatchInternal::ByteSize(first->batch); size_t size = WriteBatchInternal::ByteSize(leader->batch);
write_batch_group->push_back(first->batch); write_batch_group->push_back(leader->batch);
// Allow the group to grow up to a maximum size, but if the // Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow // original write is small, limit the growth so we do not slow
// down the small write too much. // down the small write too much.
size_t max_size = 1 << 20; size_t max_size = 1 << 20;
if (size <= (128<<10)) { if (size <= (128 << 10)) {
max_size = size + (128<<10); max_size = size + (128 << 10);
} }
*last_writer = first; *last_writer = leader;
if (first->has_callback) { if (leader->has_callback) {
// TODO(agiardullo:) Batching not currently supported as this write may // TODO(agiardullo:) Batching not currently supported as this write may
// fail if the callback function decides to abort this write. // fail if the callback function decides to abort this write.
return size; return size;
} }
std::deque<Writer*>::iterator iter = writers_.begin(); Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
++iter; // Advance past "first"
for (; iter != writers_.end(); ++iter) { // This is safe regardless of any db mutex status of the caller. Previous
Writer* w = *iter; // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
if (w->sync && !first->sync) { // (they emptied the list and then we added ourself as leader) or had to
// explicitly wake up us (the list was non-empty when we added ourself,
// so we have already received our MarkJoined).
CreateMissingNewerLinks(newest_writer);
// Tricky. Iteration start (leader) is exclusive and finish
// (newest_writer) is inclusive. Iteration goes from old to new.
Writer* w = leader;
while (w != newest_writer) {
w = w->link_newer;
if (w->sync && !leader->sync) {
// Do not include a sync write into a batch handled by a non-sync write. // Do not include a sync write into a batch handled by a non-sync write.
break; break;
} }
if (!w->disableWAL && first->disableWAL) { if (!w->disableWAL && leader->disableWAL) {
// Do not include a write that needs WAL into a batch that has // Do not include a write that needs WAL into a batch that has
// WAL disabled. // WAL disabled.
break; break;
@ -113,4 +133,68 @@ size_t WriteThread::BuildBatchGroup(
return size; return size;
} }
void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
Status status) {
assert(leader->link_older == nullptr);
Writer* head = newest_writer_.load(std::memory_order_acquire);
if (head != last_writer ||
!newest_writer_.compare_exchange_strong(head, nullptr)) {
// Either w wasn't the head during the load(), or it was the head
// during the load() but somebody else pushed onto the list before
// we did the compare_exchange_strong (causing it to fail). In the
// latter case compare_exchange_strong has the effect of re-reading
// its first param (head). No need to retry a failing CAS, because
// only a departing leader (which we are at the moment) can remove
// nodes from the list.
assert(head != last_writer);
// After walking link_older starting from head (if not already done)
// we will be able to traverse w->link_newer below. This function
// can only be called from an active leader, only a leader can
// clear newest_writer_, we didn't, and only a clear newest_writer_
// could cause the next leader to start their work without a call
// to MarkJoined, so we can definitely conclude that no other leader
// work is going on here (with or without db mutex).
CreateMissingNewerLinks(head);
assert(last_writer->link_newer->link_older == last_writer);
last_writer->link_newer->link_older = nullptr;
// Next leader didn't self-identify, because newest_writer_ wasn't
// nullptr when they enqueued (we were definitely enqueued before them
// and are still in the list). That means leader handoff occurs when
// we call MarkJoined
MarkJoined(last_writer->link_newer);
}
// else nobody else was waiting, although there might already be a new
// leader now
while (last_writer != leader) {
last_writer->status = status;
last_writer->done = true;
// We must read link_older before calling MarkJoined, because as
// soon as it is marked the other thread's AwaitJoined may return
// and deallocate the Writer.
auto next = last_writer->link_older;
MarkJoined(last_writer);
last_writer = next;
}
}
void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
assert(w->batch == nullptr);
bool wait_needed;
LinkOne(w, &wait_needed);
if (wait_needed) {
mu->Unlock();
Await(w);
mu->Lock();
}
}
void WriteThread::ExitUnbatched(Writer* w) {
Status dummy_status;
ExitAsBatchGroupLeader(w, w, dummy_status);
}
} // namespace rocksdb } // namespace rocksdb

@ -5,76 +5,145 @@
#pragma once #pragma once
#include <assert.h>
#include <stdint.h> #include <stdint.h>
#include <deque> #include <atomic>
#include <limits> #include <condition_variable>
#include <mutex>
#include <type_traits>
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "port/port.h"
#include "util/instrumented_mutex.h" #include "util/instrumented_mutex.h"
namespace rocksdb { namespace rocksdb {
class WriteThread { class WriteThread {
public: public:
// Information kept for every waiting writer // Information kept for every waiting writer.
struct Writer { struct Writer {
Status status;
WriteBatch* batch; WriteBatch* batch;
bool sync; bool sync;
bool disableWAL; bool disableWAL;
bool in_batch_group; bool in_batch_group;
bool done; bool done;
bool has_callback; bool has_callback;
InstrumentedCondVar cv; Status status;
bool made_waitable; // records lazy construction of mutex and cv
bool joined; // read/write only under JoinMutex() (or pre-link)
std::aligned_storage<sizeof(std::mutex)>::type join_mutex_bytes;
std::aligned_storage<sizeof(std::condition_variable)>::type join_cv_bytes;
Writer* link_older; // read/write only before linking, or as leader
Writer* link_newer; // lazy, read/write only before linking, or as leader
explicit Writer(InstrumentedMutex* mu) Writer()
: batch(nullptr), : batch(nullptr),
sync(false), sync(false),
disableWAL(false), disableWAL(false),
in_batch_group(false), in_batch_group(false),
done(false), done(false),
has_callback(false), has_callback(false),
cv(mu) {} made_waitable(false),
joined(false),
link_older(nullptr),
link_newer(nullptr) {}
~Writer() {
if (made_waitable) {
JoinMutex().~mutex();
JoinCV().~condition_variable();
}
}
void CreateMutex() {
assert(!joined);
if (!made_waitable) {
made_waitable = true;
new (&join_mutex_bytes) std::mutex;
new (&join_cv_bytes) std::condition_variable;
}
}
// No other mutexes may be acquired while holding JoinMutex(), it is
// always last in the order
std::mutex& JoinMutex() {
assert(made_waitable);
return *static_cast<std::mutex*>(static_cast<void*>(&join_mutex_bytes));
}
std::condition_variable& JoinCV() {
assert(made_waitable);
return *static_cast<std::condition_variable*>(
static_cast<void*>(&join_cv_bytes));
}
}; };
WriteThread() = default; WriteThread() : newest_writer_(nullptr) {}
~WriteThread() = default;
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush) // IMPORTANT: None of the methods in this class rely on the db mutex
// thread should grab the mutex_ and be the first on writers queue. // for correctness. All of the methods except JoinBatchGroup and
// EnterWriteThread is used for it. // EnterUnbatched may be called either with or without the db mutex held.
// Be aware! Writer's job can be done by other thread (see DBImpl::Write // Correctness is maintained by ensuring that only a single thread is
// for examples), so check it via w.done before applying changes. // a leader at a time.
// Registers w as ready to become part of a batch group, and blocks
// until some other thread has completed the write (in which case
// w->done will be set to true) or this write has become the leader
// of a batch group (w->done will remain unset). The db mutex SHOULD
// NOT be held when calling this function, because it will block.
// If !w->done then JoinBatchGroup should be followed by a call to
// EnterAsBatchGroupLeader and ExitAsBatchGroupLeader.
// //
// Writer* w: writer to be placed in the queue // Writer* w: Writer to be executed as part of a batch group
// See also: ExitWriteThread void JoinBatchGroup(Writer* w);
// REQUIRES: db mutex held
void EnterWriteThread(Writer* w);
// After doing write job, we need to remove already used writers from // Constructs a write batch group led by leader, which should be a
// writers_ queue and notify head of the queue about it. // Writer passed to JoinBatchGroup on the current thread.
// ExitWriteThread is used for this.
// //
// Writer* w: Writer, that was added by EnterWriteThread function // Writer* leader: Writer passed to JoinBatchGroup, but !done
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write // Writer** last_writer: Out-param for use by ExitAsBatchGroupLeader
// does) // autovector<WriteBatch*>* write_batch_group: Out-param of group members
// we should pass last_writer as a parameter to // returns: Total batch group size
// ExitWriteThread size_t EnterAsBatchGroupLeader(Writer* leader, Writer** last_writer,
// (if you don't touch other writers, just pass w) autovector<WriteBatch*>* write_batch_group);
// Status status: Status of write operation
// See also: EnterWriteThread // Unlinks the Writer-s in a batch group, wakes up the non-leaders, and
// wakes up the next leader (if any).
//
// Writer* leader: From EnterAsBatchGroupLeader
// Writer* last_writer: Value of out-param of EnterAsBatchGroupLeader
// Status status: Status of write operation
void ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
Status status);
// Waits for all preceding writers (unlocking mu while waiting), then
// registers w as the currently proceeding writer.
//
// Writer* w: A Writer not eligible for batching
// InstrumentedMutex* mu: The db mutex, to unlock while waiting
// REQUIRES: db mutex held // REQUIRES: db mutex held
void ExitWriteThread(Writer* w, Writer* last_writer, Status status); void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
// return total batch group size // Completes a Writer begun with EnterUnbatched, unblocking subsequent
size_t BuildBatchGroup(Writer** last_writer, // writers.
autovector<WriteBatch*>* write_batch_group); void ExitUnbatched(Writer* w);
private: private:
// Queue of writers. // Points to the newest pending Writer. Only leader can remove
std::deque<Writer*> writers_; // elements, adding can be done lock-free by anybody
std::atomic<Writer*> newest_writer_;
void Await(Writer* w);
void MarkJoined(Writer* w);
// Links w into the newest_writer_ list. Sets *wait_needed to false
// if w was linked directly into the leader position, true otherwise.
// Safe to call from multiple threads without external locking.
void LinkOne(Writer* w, bool* wait_needed);
// Computes any missing link_newer links. Should not be called
// concurrently with itself.
void CreateMissingNewerLinks(Writer* head);
}; };
} // namespace rocksdb } // namespace rocksdb

Loading…
Cancel
Save