New WriteImpl to pipeline WAL/memtable write

Summary:
PipelinedWriteImpl is an alternative to WriteImpl. In WriteImpl, only one thread is allowed to write at a time; that thread performs both the WAL and memtable writes for all writers in the write group, and pending writers wait in a queue until the current writer finishes. In the pipelined write approach (see the usage sketch below), two queues are maintained: one WAL writer queue and one memtable writer queue. All writers (regardless of whether they need to write the WAL) still join the WAL writer queue first, and after the housekeeping work and the WAL write they join the memtable writer queue if needed. The benefits of this approach are:
1. Writers without memtable writes (e.g. the prepare phase of two-phase commit) can exit the write thread as soon as their WAL write finishes; they no longer have to wait for the group's memtable writes under group commit.
2. Pending writers only need to wait for the previous WAL writer to finish before they can join the write thread, instead of also waiting for the previous memtable writes.
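
As a minimal sketch (not part of this diff), here is how an application could opt in to the new write path. The database path and the extra options shown are illustrative assumptions; only enable_pipelined_write is required:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Opt in to the pipelined WAL/memtable write path introduced by this PR.
  options.enable_pipelined_write = true;
  // Concurrent memtable writes still apply, now to the memtable writer group.
  options.allow_concurrent_memtable_write = true;

  rocksdb::DB* db = nullptr;
  // "/tmp/pipelined_write_example" is an arbitrary example path.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/pipelined_write_example", &db);
  assert(s.ok());

  // Writes from multiple threads first queue for the WAL and then, if they
  // carry a memtable payload, for the memtable.
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  delete db;
  return 0;
}

The option only changes how write threads are grouped internally; the Write()/Put() API is unchanged.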

Merging #2056 and #2058 into this PR.
Closes https://github.com/facebook/rocksdb/pull/2286

Differential Revision: D5054606

Pulled By: yiwu-arbug

fbshipit-source-id: ee5b11efd19d3e39d6b7210937b11cefdd4d1c8d
Commit 07bdcb91fe (parent d746aead1a) on main, authored by Yi Wu and committed by Facebook GitHub Bot.
Files changed:
  1. HISTORY.md (1)
  2. db/db_impl.cc (5)
  3. db/db_impl.h (19)
  4. db/db_impl_write.cc (238)
  5. db/flush_scheduler.cc (16)
  6. db/write_batch.cc (13)
  7. db/write_batch_internal.h (2)
  8. db/write_callback_test.cc (16)
  9. db/write_thread.cc (332)
  10. db/write_thread.h (190)
  11. include/rocksdb/options.h (15)
  12. options/db_options.cc (3)
  13. options/db_options.h (1)
  14. options/options.cc (1)
  15. options/options_helper.h (3)
  16. options/options_settable_test.cc (1)

@ -13,6 +13,7 @@
* Add debugging function `GetAllKeyVersions` to see internal versions of a range of keys. * Add debugging function `GetAllKeyVersions` to see internal versions of a range of keys.
* Support file ingestion with universal compaction style * Support file ingestion with universal compaction style
* Support file ingestion behind with option `allow_ingest_behind` * Support file ingestion behind with option `allow_ingest_behind`
* New option enable_pipelined_write which may improve write throughput in case writing from multiple threads and WAL enabled.
## 5.4.0 (04/11/2017) ## 5.4.0 (04/11/2017)
### Public API Change ### Public API Change

@ -159,10 +159,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
max_total_in_memory_state_(0), max_total_in_memory_state_(0),
is_snapshot_supported_(true), is_snapshot_supported_(true),
write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
write_thread_(immutable_db_options_.enable_write_thread_adaptive_yield write_thread_(immutable_db_options_),
? immutable_db_options_.write_thread_max_yield_usec
: 0,
immutable_db_options_.write_thread_slow_yield_usec),
write_controller_(mutable_db_options_.delayed_write_rate), write_controller_(mutable_db_options_.delayed_write_rate),
last_batch_group_size_(0), last_batch_group_size_(0),
unscheduled_flushes_(0), unscheduled_flushes_(0),

@ -607,6 +607,11 @@ class DBImpl : public DB {
uint64_t* log_used = nullptr, uint64_t log_ref = 0, uint64_t* log_used = nullptr, uint64_t log_ref = 0,
bool disable_memtable = false); bool disable_memtable = false);
Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
WriteCallback* callback = nullptr,
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
bool disable_memtable = false);
uint64_t FindMinLogContainingOutstandingPrep(); uint64_t FindMinLogContainingOutstandingPrep();
uint64_t FindMinPrepLogReferencedByMemTable(); uint64_t FindMinPrepLogReferencedByMemTable();
@ -726,16 +731,18 @@ class DBImpl : public DB {
Status HandleWriteBufferFull(WriteContext* write_context); Status HandleWriteBufferFull(WriteContext* write_context);
// REQUIRES: mutex locked // REQUIRES: mutex locked
Status PreprocessWrite(const WriteOptions& write_options, bool need_log_sync, Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
bool* logs_getting_syned, WriteContext* write_context); WriteContext* write_context);
Status WriteToWAL(const autovector<WriteThread::Writer*>& write_group, Status WriteToWAL(const WriteThread::WriteGroup& write_group,
log::Writer* log_writer, bool need_log_sync, log::Writer* log_writer, bool need_log_sync,
bool need_log_dir_sync, SequenceNumber sequence); bool need_log_dir_sync, SequenceNumber sequence);
// Used by WriteImpl to update bg_error_ when encountering memtable insert // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
// error. void ParanoidCheck(const Status& status);
void UpdateBackgroundError(const Status& memtable_insert_status);
// Used by WriteImpl to update bg_error_ in case of memtable insert error.
void MemTableInsertStatusCheck(const Status& memtable_insert_status);
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE

@ -66,6 +66,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
return Status::Corruption("Batch is nullptr!"); return Status::Corruption("Batch is nullptr!");
} }
if (immutable_db_options_.enable_pipelined_write) {
return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
log_ref, disable_memtable);
}
Status status; Status status;
PERF_TIMER_GUARD(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_pre_and_post_process_time);
@ -79,7 +84,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
write_thread_.JoinBatchGroup(&w); write_thread_.JoinBatchGroup(&w);
if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) { if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
// we are a non-leader in a parallel group // we are a non-leader in a parallel group
PERF_TIMER_GUARD(write_memtable_time); PERF_TIMER_GUARD(write_memtable_time);
@ -93,11 +98,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
true /*concurrent_memtable_writes*/); true /*concurrent_memtable_writes*/);
} }
if (write_thread_.CompleteParallelWorker(&w)) { if (write_thread_.CompleteParallelMemTableWriter(&w)) {
// we're responsible for exit batch group // we're responsible for exit batch group
auto last_sequence = w.parallel_group->last_sequence; auto last_sequence = w.write_group->last_sequence;
versions_->SetLastSequence(last_sequence); versions_->SetLastSequence(last_sequence);
UpdateBackgroundError(w.status); MemTableInsertStatusCheck(w.status);
write_thread_.ExitAsBatchGroupFollower(&w); write_thread_.ExitAsBatchGroupFollower(&w);
} }
assert(w.state == WriteThread::STATE_COMPLETED); assert(w.state == WriteThread::STATE_COMPLETED);
@ -120,10 +125,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
// when it finds suitable, and finish them in the same write batch. // when it finds suitable, and finish them in the same write batch.
// This is how a write job could be done by the other writer. // This is how a write job could be done by the other writer.
WriteContext write_context; WriteContext write_context;
WriteThread::Writer* last_writer = &w; // Dummy intial value WriteThread::WriteGroup write_group;
autovector<WriteThread::Writer*> write_group;
WriteThread::ParallelGroup pg;
bool logs_getting_synced = false;
bool in_parallel_group = false; bool in_parallel_group = false;
uint64_t last_sequence = versions_->LastSequence(); uint64_t last_sequence = versions_->LastSequence();
@ -131,8 +133,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
bool need_log_sync = !write_options.disableWAL && write_options.sync; bool need_log_sync = !write_options.disableWAL && write_options.sync;
bool need_log_dir_sync = need_log_sync && !log_dir_synced_; bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
status = PreprocessWrite(write_options, need_log_sync, &logs_getting_synced, status = PreprocessWrite(write_options, &need_log_sync, &write_context);
&write_context);
log::Writer* cur_log_writer = logs_.back().writer; log::Writer* cur_log_writer = logs_.back().writer;
mutex_.Unlock(); mutex_.Unlock();
@ -143,7 +144,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
// into memtables // into memtables
last_batch_group_size_ = last_batch_group_size_ =
write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group); write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
if (status.ok()) { if (status.ok()) {
// Rules for when we can update the memtable concurrently // Rules for when we can update the memtable concurrently
@ -158,10 +159,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
// relax rules 2 if we could prevent write batches from referring // relax rules 2 if we could prevent write batches from referring
// more than once to a particular key. // more than once to a particular key.
bool parallel = immutable_db_options_.allow_concurrent_memtable_write && bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
write_group.size() > 1; write_group.size > 1;
int total_count = 0; int total_count = 0;
uint64_t total_byte_size = 0; uint64_t total_byte_size = 0;
for (auto writer : write_group) { for (auto* writer : write_group) {
if (writer->CheckCallback(this)) { if (writer->CheckCallback(this)) {
if (writer->ShouldWriteToMemtable()) { if (writer->ShouldWriteToMemtable()) {
total_count += WriteBatchInternal::Count(writer->batch); total_count += WriteBatchInternal::Count(writer->batch);
@ -187,7 +188,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
RecordTick(stats_, BYTES_WRITTEN, total_byte_size); RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1); stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
RecordTick(stats_, WRITE_DONE_BY_SELF); RecordTick(stats_, WRITE_DONE_BY_SELF);
auto write_done_by_other = write_group.size() - 1; auto write_done_by_other = write_group.size - 1;
if (write_done_by_other > 0) { if (write_done_by_other > 0) {
stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
write_done_by_other); write_done_by_other);
@ -219,12 +220,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
&flush_scheduler_, write_options.ignore_missing_column_families, &flush_scheduler_, write_options.ignore_missing_column_families,
0 /*recovery_log_number*/, this); 0 /*recovery_log_number*/, this);
} else { } else {
pg.leader = &w; SequenceNumber next_sequence = current_sequence;
pg.last_writer = last_writer; for (auto* writer : write_group) {
pg.last_sequence = last_sequence; if (writer->ShouldWriteToMemtable()) {
pg.running.store(static_cast<uint32_t>(write_group.size()), writer->sequence = next_sequence;
next_sequence += WriteBatchInternal::Count(writer->batch);
}
}
write_group.last_sequence = last_sequence;
write_group.running.store(static_cast<uint32_t>(write_group.size),
std::memory_order_relaxed); std::memory_order_relaxed);
write_thread_.LaunchParallelFollowers(&pg, current_sequence); write_thread_.LaunchParallelMemTableWriters(&write_group);
in_parallel_group = true; in_parallel_group = true;
// Each parallel follower is doing each own writes. The leader should // Each parallel follower is doing each own writes. The leader should
@ -244,19 +250,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
} }
PERF_TIMER_START(write_pre_and_post_process_time); PERF_TIMER_START(write_pre_and_post_process_time);
// if (!w.CallbackFailed()) {
// Is setting bg_error_ enough here? This will at least stop ParanoidCheck(status);
// compaction and fail any further writes.
if (immutable_db_options_.paranoid_checks && !status.ok() &&
!w.CallbackFailed() && !status.IsBusy() && !status.IsIncomplete()) {
mutex_.Lock();
if (bg_error_.ok()) {
bg_error_ = status; // stop compaction & fail any further writes
}
mutex_.Unlock();
} }
if (logs_getting_synced) { if (need_log_sync) {
mutex_.Lock(); mutex_.Lock();
MarkLogsSynced(logfile_number_, need_log_dir_sync, status); MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
mutex_.Unlock(); mutex_.Unlock();
@ -266,40 +264,180 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (in_parallel_group) { if (in_parallel_group) {
// CompleteParallelWorker returns true if this thread should // CompleteParallelWorker returns true if this thread should
// handle exit, false means somebody else did // handle exit, false means somebody else did
should_exit_batch_group = write_thread_.CompleteParallelWorker(&w); should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
} }
if (should_exit_batch_group) { if (should_exit_batch_group) {
versions_->SetLastSequence(last_sequence); versions_->SetLastSequence(last_sequence);
UpdateBackgroundError(w.status); MemTableInsertStatusCheck(w.status);
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
} }
if (status.ok()) { if (status.ok()) {
status = w.FinalStatus(); status = w.FinalStatus();
} }
return status; return status;
} }
void DBImpl::UpdateBackgroundError(const Status& memtable_insert_status) { Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
WriteBatch* my_batch, WriteCallback* callback,
uint64_t* log_used, uint64_t log_ref,
bool disable_memtable) {
PERF_TIMER_GUARD(write_pre_and_post_process_time);
StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
WriteContext write_context;
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
disable_memtable);
write_thread_.JoinBatchGroup(&w);
if (w.state == WriteThread::STATE_GROUP_LEADER) {
WriteThread::WriteGroup wal_write_group;
if (w.callback && !w.callback->AllowWriteBatching()) {
write_thread_.WaitForMemTableWriters();
}
mutex_.Lock();
bool need_log_sync = !write_options.disableWAL && write_options.sync;
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
log::Writer* cur_log_writer = logs_.back().writer;
mutex_.Unlock();
// This can set non-OK status if callback fail.
last_batch_group_size_ =
write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
const SequenceNumber current_sequence =
write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
size_t total_count = 0;
size_t total_byte_size = 0;
if (w.status.ok()) {
SequenceNumber next_sequence = current_sequence;
for (auto writer : wal_write_group) {
if (writer->CheckCallback(this)) {
if (writer->ShouldWriteToMemtable()) {
writer->sequence = next_sequence;
size_t count = WriteBatchInternal::Count(writer->batch);
next_sequence += count;
total_count += count;
}
total_byte_size = WriteBatchInternal::AppendedByteSize(
total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
}
}
if (w.disable_wal) {
has_unpersisted_data_.store(true, std::memory_order_relaxed);
}
write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
}
auto stats = default_cf_internal_stats_;
stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count);
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size);
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
PERF_TIMER_STOP(write_pre_and_post_process_time);
if (w.ShouldWriteToWAL()) {
PERF_TIMER_GUARD(write_wal_time);
stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
if (wal_write_group.size > 1) {
stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
wal_write_group.size - 1);
RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
}
w.status = WriteToWAL(wal_write_group, cur_log_writer, need_log_sync,
need_log_dir_sync, current_sequence);
}
if (!w.CallbackFailed()) {
ParanoidCheck(w.status);
}
if (need_log_sync) {
mutex_.Lock();
MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status);
mutex_.Unlock();
}
write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
}
WriteThread::WriteGroup memtable_write_group;
if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
PERF_TIMER_GUARD(write_memtable_time);
assert(w.status.ok());
write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
if (memtable_write_group.size > 1 &&
immutable_db_options_.allow_concurrent_memtable_write) {
write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
} else {
memtable_write_group.status = WriteBatchInternal::InsertInto(
memtable_write_group, w.sequence, column_family_memtables_.get(),
&flush_scheduler_, write_options.ignore_missing_column_families,
0 /*log_number*/, this);
versions_->SetLastSequence(memtable_write_group.last_sequence);
write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
}
}
if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
assert(w.ShouldWriteToMemtable());
WriteBatchInternal::SetSequence(w.batch, w.sequence);
ColumnFamilyMemTablesImpl column_family_memtables(
versions_->GetColumnFamilySet());
w.status = WriteBatchInternal::InsertInto(
&w, &column_family_memtables, &flush_scheduler_,
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
true /*concurrent_memtable_writes*/);
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
MemTableInsertStatusCheck(w.status);
versions_->SetLastSequence(w.write_group->last_sequence);
write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
}
}
assert(w.state == WriteThread::STATE_COMPLETED);
if (log_used != nullptr) {
*log_used = w.log_used;
}
return w.FinalStatus();
}
void DBImpl::ParanoidCheck(const Status& status) {
// Is setting bg_error_ enough here? This will at least stop
// compaction and fail any further writes.
if (immutable_db_options_.paranoid_checks && !status.ok() &&
!status.IsBusy() && !status.IsIncomplete()) {
mutex_.Lock();
if (bg_error_.ok()) {
bg_error_ = status; // stop compaction & fail any further writes
}
mutex_.Unlock();
}
}
void DBImpl::MemTableInsertStatusCheck(const Status& status) {
// A non-OK status here indicates that the state implied by the // A non-OK status here indicates that the state implied by the
// WAL has diverged from the in-memory state. This could be // WAL has diverged from the in-memory state. This could be
// because of a corrupt write_batch (very bad), or because the // because of a corrupt write_batch (very bad), or because the
// client specified an invalid column family and didn't specify // client specified an invalid column family and didn't specify
// ignore_missing_column_families. // ignore_missing_column_families.
if (!memtable_insert_status.ok()) { if (!status.ok()) {
mutex_.Lock(); mutex_.Lock();
assert(bg_error_.ok()); assert(bg_error_.ok());
bg_error_ = memtable_insert_status; bg_error_ = status;
mutex_.Unlock(); mutex_.Unlock();
} }
} }
Status DBImpl::PreprocessWrite(const WriteOptions& write_options, Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
bool need_log_sync, bool* logs_getting_synced, bool* need_log_sync,
WriteContext* write_context) { WriteContext* write_context) {
mutex_.AssertHeld(); mutex_.AssertHeld();
assert(write_context != nullptr && logs_getting_synced != nullptr); assert(write_context != nullptr && need_log_sync != nullptr);
Status status; Status status;
assert(!single_column_family_mode_ || assert(!single_column_family_mode_ ||
@ -336,7 +474,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
status = DelayWrite(last_batch_group_size_, write_options); status = DelayWrite(last_batch_group_size_, write_options);
} }
if (status.ok() && need_log_sync) { if (status.ok() && *need_log_sync) {
// Wait until the parallel syncs are finished. Any sync process has to sync // Wait until the parallel syncs are finished. Any sync process has to sync
// the front log too so it is enough to check the status of front() // the front log too so it is enough to check the status of front()
// We do a while loop since log_sync_cv_ is signalled when any sync is // We do a while loop since log_sync_cv_ is signalled when any sync is
@ -356,26 +494,28 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
// actually write to the WAL // actually write to the WAL
log.getting_synced = true; log.getting_synced = true;
} }
*logs_getting_synced = true; } else {
*need_log_sync = false;
} }
return status; return status;
} }
Status DBImpl::WriteToWAL(const autovector<WriteThread::Writer*>& write_group, Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
log::Writer* log_writer, bool need_log_sync, log::Writer* log_writer, bool need_log_sync,
bool need_log_dir_sync, SequenceNumber sequence) { bool need_log_dir_sync, SequenceNumber sequence) {
Status status; Status status;
WriteBatch* merged_batch = nullptr; WriteBatch* merged_batch = nullptr;
size_t write_with_wal = 0; size_t write_with_wal = 0;
if (write_group.size() == 1 && write_group[0]->ShouldWriteToWAL() && auto* leader = write_group.leader;
write_group[0]->batch->GetWalTerminationPoint().is_cleared()) { if (write_group.size == 1 && leader->ShouldWriteToWAL() &&
leader->batch->GetWalTerminationPoint().is_cleared()) {
// we simply write the first WriteBatch to WAL if the group only // we simply write the first WriteBatch to WAL if the group only
// contains one batch, that batch should be written to the WAL, // contains one batch, that batch should be written to the WAL,
// and the batch is not wanting to be truncated // and the batch is not wanting to be truncated
merged_batch = write_group[0]->batch; merged_batch = leader->batch;
write_group[0]->log_used = logfile_number_; leader->log_used = logfile_number_;
write_with_wal = 1; write_with_wal = 1;
} else { } else {
// WAL needs all of the batches flattened into a single batch. // WAL needs all of the batches flattened into a single batch.
@ -643,6 +783,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
log::Writer* new_log = nullptr; log::Writer* new_log = nullptr;
MemTable* new_mem = nullptr; MemTable* new_mem = nullptr;
// In case of pipelined write is enabled, wait for all pending memtable
// writers.
if (immutable_db_options_.enable_pipelined_write) {
write_thread_.WaitForMemTableWriters();
}
// Attempt to switch to a new memtable and trigger flush of old. // Attempt to switch to a new memtable and trigger flush of old.
// Do this without holding the dbmutex lock. // Do this without holding the dbmutex lock.
assert(versions_->prev_log_number() == 0); assert(versions_->prev_log_number() == 0);

@ -15,11 +15,9 @@ namespace rocksdb {
void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
#ifndef NDEBUG #ifndef NDEBUG
{
std::lock_guard<std::mutex> lock(checking_mutex_); std::lock_guard<std::mutex> lock(checking_mutex_);
assert(checking_set_.count(cfd) == 0); assert(checking_set_.count(cfd) == 0);
checking_set_.insert(cfd); checking_set_.insert(cfd);
}
#endif // NDEBUG #endif // NDEBUG
cfd->Ref(); cfd->Ref();
// Suppress false positive clang analyzer warnings. // Suppress false positive clang analyzer warnings.
@ -36,8 +34,11 @@ void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
} }
ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
#ifndef NDEBUG
std::lock_guard<std::mutex> lock(checking_mutex_);
#endif // NDEBUG
while (true) { while (true) {
if (Empty()) { if (head_.load(std::memory_order_relaxed) == nullptr) {
return nullptr; return nullptr;
} }
@ -48,11 +49,9 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
delete node; delete node;
#ifndef NDEBUG #ifndef NDEBUG
{
auto iter = checking_set_.find(cfd); auto iter = checking_set_.find(cfd);
assert(iter != checking_set_.end()); assert(iter != checking_set_.end());
checking_set_.erase(iter); checking_set_.erase(iter);
}
#endif // NDEBUG #endif // NDEBUG
if (!cfd->IsDropped()) { if (!cfd->IsDropped()) {
@ -68,8 +67,13 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
} }
bool FlushScheduler::Empty() { bool FlushScheduler::Empty() {
#ifndef NDEBUG
std::lock_guard<std::mutex> lock(checking_mutex_);
#endif // NDEBUG
auto rv = head_.load(std::memory_order_relaxed) == nullptr; auto rv = head_.load(std::memory_order_relaxed) == nullptr;
#ifndef NDEBUG
assert(rv == checking_set_.empty()); assert(rv == checking_set_.empty());
#endif // NDEBUG
return rv; return rv;
} }
@ -80,7 +84,7 @@ void FlushScheduler::Clear() {
delete cfd; delete cfd;
} }
} }
assert(Empty()); assert(head_.load(std::memory_order_relaxed) == nullptr);
} }
} // namespace rocksdb } // namespace rocksdb

@ -1290,16 +1290,17 @@ public:
// 2) During Write(), in a single-threaded write thread // 2) During Write(), in a single-threaded write thread
// 3) During Write(), in a concurrent context where memtables has been cloned // 3) During Write(), in a concurrent context where memtables has been cloned
// The reason is that it calls memtables->Seek(), which has a stateful cache // The reason is that it calls memtables->Seek(), which has a stateful cache
Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::InsertInto(WriteThread::WriteGroup& write_group,
const autovector<WriteThread::Writer*>& writers, SequenceNumber sequence, SequenceNumber sequence,
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, ColumnFamilyMemTables* memtables,
bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, FlushScheduler* flush_scheduler,
bool ignore_missing_column_families,
uint64_t recovery_log_number, DB* db,
bool concurrent_memtable_writes) { bool concurrent_memtable_writes) {
MemTableInserter inserter(sequence, memtables, flush_scheduler, MemTableInserter inserter(sequence, memtables, flush_scheduler,
ignore_missing_column_families, recovery_log_number, ignore_missing_column_families, recovery_log_number,
db, concurrent_memtable_writes); db, concurrent_memtable_writes);
for (size_t i = 0; i < writers.size(); i++) { for (auto w : write_group) {
auto w = writers[i];
if (!w->ShouldWriteToMemtable()) { if (!w->ShouldWriteToMemtable()) {
continue; continue;
} }

@ -153,7 +153,7 @@ class WriteBatchInternal {
// //
// Under concurrent use, the caller is responsible for making sure that // Under concurrent use, the caller is responsible for making sure that
// the memtables object itself is thread-local. // the memtables object itself is thread-local.
static Status InsertInto(const autovector<WriteThread::Writer*>& batches, static Status InsertInto(WriteThread::WriteGroup& write_group,
SequenceNumber sequence, SequenceNumber sequence,
ColumnFamilyMemTables* memtables, ColumnFamilyMemTables* memtables,
FlushScheduler* flush_scheduler, FlushScheduler* flush_scheduler,

@ -119,10 +119,12 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
for (auto& allow_parallel : {true, false}) { for (auto& allow_parallel : {true, false}) {
for (auto& allow_batching : {true, false}) { for (auto& allow_batching : {true, false}) {
for (auto& enable_WAL : {true, false}) { for (auto& enable_WAL : {true, false}) {
for (auto& enable_pipelined_write : {true, false}) {
for (auto& write_group : write_scenarios) { for (auto& write_group : write_scenarios) {
Options options; Options options;
options.create_if_missing = true; options.create_if_missing = true;
options.allow_concurrent_memtable_write = allow_parallel; options.allow_concurrent_memtable_write = allow_parallel;
options.enable_pipelined_write = enable_pipelined_write;
ReadOptions read_options; ReadOptions read_options;
DB* db; DB* db;
@ -159,7 +161,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
ASSERT_TRUE(writer->state == ASSERT_TRUE(writer->state ==
WriteThread::State::STATE_GROUP_LEADER); WriteThread::State::STATE_GROUP_LEADER);
} else { } else {
ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT); ASSERT_TRUE(writer->state ==
WriteThread::State::STATE_INIT);
} }
// (meta test) the first WriteOP should indeed be the first // (meta test) the first WriteOP should indeed be the first
@ -188,8 +191,11 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
ASSERT_TRUE(writer->state == ASSERT_TRUE(writer->state ==
WriteThread::State::STATE_GROUP_LEADER); WriteThread::State::STATE_GROUP_LEADER);
} else if (!allow_parallel) { } else if (!allow_parallel) {
ASSERT_TRUE(writer->state == ASSERT_TRUE(
WriteThread::State::STATE_COMPLETED); writer->state == WriteThread::State::STATE_COMPLETED ||
(enable_pipelined_write &&
writer->state ==
WriteThread::State::STATE_MEMTABLE_WRITER_LEADER));
} }
}); });
@ -218,7 +224,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
char my_key = 0; char my_key = 0;
do { do {
my_key = dummy_key.load(); my_key = dummy_key.load();
} while (!dummy_key.compare_exchange_strong(my_key, my_key + 1)); } while (
!dummy_key.compare_exchange_strong(my_key, my_key + 1));
string skey(5, my_key); string skey(5, my_key);
string sval(10, my_key); string sval(10, my_key);
@ -278,6 +285,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
} }
} }
} }
}
} }
TEST_F(WriteCallbackTest, WriteCallBackTest) { TEST_F(WriteCallbackTest, WriteCallBackTest) {

@ -15,10 +15,17 @@
namespace rocksdb { namespace rocksdb {
WriteThread::WriteThread(uint64_t max_yield_usec, uint64_t slow_yield_usec) WriteThread::WriteThread(const ImmutableDBOptions& db_options)
: max_yield_usec_(max_yield_usec), : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
slow_yield_usec_(slow_yield_usec), ? db_options.write_thread_max_yield_usec
newest_writer_(nullptr) {} : 0),
slow_yield_usec_(db_options.write_thread_slow_yield_usec),
allow_concurrent_memtable_write_(
db_options.allow_concurrent_memtable_write),
enable_pipelined_write_(db_options.enable_pipelined_write),
newest_writer_(nullptr),
newest_memtable_writer_(nullptr),
last_sequence_(0) {}
uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) { uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
// We're going to block. Lazily create the mutex. We guarantee // We're going to block. Lazily create the mutex. We guarantee
@ -184,22 +191,39 @@ void WriteThread::SetState(Writer* w, uint8_t new_state) {
} }
} }
void WriteThread::LinkOne(Writer* w, bool* linked_as_leader) { bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
assert(newest_writer != nullptr);
assert(w->state == STATE_INIT); assert(w->state == STATE_INIT);
Writer* writers = newest_writer->load(std::memory_order_relaxed);
while (true) { while (true) {
Writer* writers = newest_writer_.load(std::memory_order_relaxed);
w->link_older = writers; w->link_older = writers;
if (newest_writer_.compare_exchange_strong(writers, w)) { if (newest_writer->compare_exchange_weak(writers, w)) {
if (writers == nullptr) { return (writers == nullptr);
// this isn't part of the WriteThread machinery, but helps with }
// debugging and is checked by an assert in WriteImpl }
w->state.store(STATE_GROUP_LEADER, std::memory_order_relaxed); }
}
// Then we are the head of the queue and hence definiltly the leader bool WriteThread::LinkGroup(WriteGroup& write_group,
*linked_as_leader = (writers == nullptr); std::atomic<Writer*>* newest_writer) {
// Otherwise we will wait for previous leader to define our status assert(newest_writer != nullptr);
return; Writer* leader = write_group.leader;
Writer* last_writer = write_group.last_writer;
Writer* w = last_writer;
while (true) {
// Unset link_newer pointers to make sure when we call
// CreateMissingNewerLinks later it create all missing links.
w->link_newer = nullptr;
w->write_group = nullptr;
if (w == leader) {
break;
}
w = w->link_older;
}
Writer* newest = newest_writer->load(std::memory_order_relaxed);
while (true) {
leader->link_older = newest;
if (newest_writer->compare_exchange_weak(newest, last_writer)) {
return (newest == nullptr);
} }
} }
} }
@ -216,12 +240,43 @@ void WriteThread::CreateMissingNewerLinks(Writer* head) {
} }
} }
void WriteThread::CompleteLeader(WriteGroup& write_group) {
assert(write_group.size > 0);
Writer* leader = write_group.leader;
if (write_group.size == 1) {
write_group.leader = nullptr;
write_group.last_writer = nullptr;
} else {
assert(leader->link_newer != nullptr);
leader->link_newer->link_older = nullptr;
write_group.leader = leader->link_newer;
}
write_group.size -= 1;
SetState(leader, STATE_COMPLETED);
}
void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
assert(write_group.size > 1);
assert(w != write_group.leader);
if (w == write_group.last_writer) {
w->link_older->link_newer = nullptr;
write_group.last_writer = w->link_older;
} else {
w->link_older->link_newer = w->link_newer;
w->link_newer->link_older = w->link_older;
}
write_group.size -= 1;
SetState(w, STATE_COMPLETED);
}
void WriteThread::JoinBatchGroup(Writer* w) { void WriteThread::JoinBatchGroup(Writer* w) {
static AdaptationContext ctx("JoinBatchGroup"); static AdaptationContext ctx("JoinBatchGroup");
assert(w->batch != nullptr); assert(w->batch != nullptr);
bool linked_as_leader; bool linked_as_leader = LinkOne(w, &newest_writer_);
LinkOne(w, &linked_as_leader); if (linked_as_leader) {
SetState(w, STATE_GROUP_LEADER);
}
TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w); TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
@ -231,23 +286,28 @@ void WriteThread::JoinBatchGroup(Writer* w) {
* 1) An existing leader pick us as the new leader when it finishes * 1) An existing leader pick us as the new leader when it finishes
* 2) An existing leader pick us as its follewer and * 2) An existing leader pick us as its follewer and
* 2.1) finishes the memtable writes on our behalf * 2.1) finishes the memtable writes on our behalf
* 2.2) Or tell us to finish the memtable writes it in pralallel * 2.2) Or tell us to finish the memtable writes in pralallel
* 3) (pipelined write) An existing leader pick us as its follower and
* finish book-keeping and WAL write for us, enqueue us as pending
* memtable writer, and
* 3.1) we become memtable writer group leader, or
* 3.2) an existing memtable writer group leader tell us to finish memtable
* writes in parallel.
*/ */
AwaitState(w, AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
STATE_GROUP_LEADER | STATE_PARALLEL_FOLLOWER | STATE_COMPLETED, STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
&ctx); &ctx);
TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w); TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
} }
} }
size_t WriteThread::EnterAsBatchGroupLeader( size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
Writer* leader, WriteThread::Writer** last_writer, WriteGroup* write_group) {
autovector<WriteThread::Writer*>* write_batch_group) {
assert(leader->link_older == nullptr); assert(leader->link_older == nullptr);
assert(leader->batch != nullptr); assert(leader->batch != nullptr);
assert(write_group != nullptr);
size_t size = WriteBatchInternal::ByteSize(leader->batch); size_t size = WriteBatchInternal::ByteSize(leader->batch);
write_batch_group->push_back(leader);
// Allow the group to grow up to a maximum size, but if the // Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow // original write is small, limit the growth so we do not slow
@ -257,8 +317,10 @@ size_t WriteThread::EnterAsBatchGroupLeader(
max_size = size + (128 << 10); max_size = size + (128 << 10);
} }
*last_writer = leader; leader->write_group = write_group;
write_group->leader = leader;
write_group->last_writer = leader;
write_group->size = 1;
Writer* newest_writer = newest_writer_.load(std::memory_order_acquire); Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
// This is safe regardless of any db mutex status of the caller. Previous // This is safe regardless of any db mutex status of the caller. Previous
@ -308,74 +370,184 @@ size_t WriteThread::EnterAsBatchGroupLeader(
break; break;
} }
w->write_group = write_group;
size += batch_size; size += batch_size;
write_batch_group->push_back(w); write_group->last_writer = w;
w->in_batch_group = true; write_group->size++;
*last_writer = w;
} }
return size; return size;
} }
void WriteThread::LaunchParallelFollowers(ParallelGroup* pg, void WriteThread::EnterAsMemTableWriter(Writer* leader,
SequenceNumber sequence) { WriteGroup* write_group) {
// EnterAsBatchGroupLeader already created the links from leader to assert(leader != nullptr);
// newer writers in the group assert(leader->link_older == nullptr);
assert(leader->batch != nullptr);
pg->leader->parallel_group = pg; assert(write_group != nullptr);
Writer* w = pg->leader; size_t size = WriteBatchInternal::ByteSize(leader->batch);
w->sequence = sequence;
// Initialize and wake up the others // Allow the group to grow up to a maximum size, but if the
while (w != pg->last_writer) { // original write is small, limit the growth so we do not slow
// Writers that won't write don't get sequence allotment // down the small write too much.
if (!w->CallbackFailed() && w->ShouldWriteToMemtable()) { size_t max_size = 1 << 20;
// There is a sequence number of each written key if (size <= (128 << 10)) {
sequence += WriteBatchInternal::Count(w->batch); max_size = size + (128 << 10);
} }
leader->write_group = write_group;
write_group->leader = leader;
write_group->size = 1;
Writer* last_writer = leader;
if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
Writer* newest_writer = newest_memtable_writer_.load();
CreateMissingNewerLinks(newest_writer);
Writer* w = leader;
while (w != newest_writer) {
w = w->link_newer; w = w->link_newer;
w->sequence = sequence; // sequence number for the first key in the batch if (w->batch == nullptr) {
w->parallel_group = pg; break;
SetState(w, STATE_PARALLEL_FOLLOWER); }
if (w->batch->HasMerge()) {
break;
}
if (!allow_concurrent_memtable_write_) {
auto batch_size = WriteBatchInternal::ByteSize(w->batch);
if (size + batch_size > max_size) {
// Do not make batch too big
break;
}
size += batch_size;
}
w->write_group = write_group;
last_writer = w;
write_group->size++;
}
}
write_group->last_writer = last_writer;
write_group->last_sequence =
last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1;
}
void WriteThread::ExitAsMemTableWriter(Writer* self, WriteGroup& write_group) {
Writer* leader = write_group.leader;
Writer* last_writer = write_group.last_writer;
Writer* newest_writer = last_writer;
if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
nullptr)) {
CreateMissingNewerLinks(newest_writer);
Writer* next_leader = last_writer->link_newer;
assert(next_leader != nullptr);
next_leader->link_older = nullptr;
SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
}
Writer* w = leader;
while (true) {
if (!write_group.status.ok()) {
w->status = write_group.status;
}
Writer* next = w->link_newer;
if (w != leader) {
SetState(w, STATE_COMPLETED);
}
if (w == last_writer) {
break;
}
w = next;
}
// Note that leader has to exit last, since it owns the write group.
SetState(leader, STATE_COMPLETED);
}
void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
assert(write_group != nullptr);
write_group->running.store(write_group->size);
for (auto w : *write_group) {
SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
} }
} }
// This method is called by both the leader and parallel followers // This method is called by both the leader and parallel followers
bool WriteThread::CompleteParallelWorker(Writer* w) { bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
static AdaptationContext ctx("CompleteParallelWorker"); static AdaptationContext ctx("CompleteParallelMemTableWriter");
auto* pg = w->parallel_group; auto* write_group = w->write_group;
if (!w->status.ok()) { if (!w->status.ok()) {
std::lock_guard<std::mutex> guard(pg->leader->StateMutex()); std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
pg->status = w->status; write_group->status = w->status;
} }
if (pg->running.load(std::memory_order_acquire) > 1 && pg->running-- > 1) { if (write_group->running-- > 1) {
// we're not the last one // we're not the last one
AwaitState(w, STATE_COMPLETED, &ctx); AwaitState(w, STATE_COMPLETED, &ctx);
return false; return false;
} }
// else we're the last parallel worker and should perform exit duties. // else we're the last parallel worker and should perform exit duties.
w->status = pg->status; w->status = write_group->status;
return true; return true;
} }
void WriteThread::ExitAsBatchGroupFollower(Writer* w) { void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
auto* pg = w->parallel_group; auto* write_group = w->write_group;
assert(w->state == STATE_PARALLEL_FOLLOWER); assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
assert(pg->status.ok()); assert(write_group->status.ok());
ExitAsBatchGroupLeader(pg->leader, pg->last_writer, pg->status); ExitAsBatchGroupLeader(*write_group, write_group->status);
assert(w->status.ok()); assert(w->status.ok());
assert(w->state == STATE_COMPLETED); assert(w->state == STATE_COMPLETED);
SetState(pg->leader, STATE_COMPLETED); SetState(write_group->leader, STATE_COMPLETED);
} }
void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer, void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
Status status) { Status status) {
static AdaptationContext ctx("ExitAsBatchGroupLeader");
Writer* leader = write_group.leader;
Writer* last_writer = write_group.last_writer;
assert(leader->link_older == nullptr); assert(leader->link_older == nullptr);
if (enable_pipelined_write_) {
// Notify writers don't write to memtable to exit.
for (Writer* w = last_writer; w != leader;) {
Writer* next = w->link_older;
w->status = status;
if (!w->ShouldWriteToMemtable()) {
CompleteFollower(w, write_group);
}
w = next;
}
if (!leader->ShouldWriteToMemtable()) {
CompleteLeader(write_group);
}
// Link the ramaining of the group to memtable writer list.
if (write_group.size > 0) {
if (LinkGroup(write_group, &newest_memtable_writer_)) {
// The leader can now be different from current writer.
SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
}
}
// Reset newest_writer_ and wake up the next leader.
Writer* newest_writer = last_writer;
if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
Writer* next_leader = newest_writer;
while (next_leader->link_older != last_writer) {
next_leader = next_leader->link_older;
assert(next_leader != nullptr);
}
next_leader->link_older = nullptr;
SetState(next_leader, STATE_GROUP_LEADER);
}
AwaitState(leader, STATE_MEMTABLE_WRITER_LEADER |
STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
&ctx);
} else {
Writer* head = newest_writer_.load(std::memory_order_acquire); Writer* head = newest_writer_.load(std::memory_order_acquire);
if (head != last_writer || if (head != last_writer ||
!newest_writer_.compare_exchange_strong(head, nullptr)) { !newest_writer_.compare_exchange_strong(head, nullptr)) {
@ -418,26 +590,48 @@ void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
last_writer = next; last_writer = next;
} }
}
} }
void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) { void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
static AdaptationContext ctx("EnterUnbatched"); static AdaptationContext ctx("EnterUnbatched");
assert(w != nullptr && w->batch == nullptr);
assert(w->batch == nullptr);
bool linked_as_leader;
LinkOne(w, &linked_as_leader);
if (!linked_as_leader) {
mu->Unlock(); mu->Unlock();
bool linked_as_leader = LinkOne(w, &newest_writer_);
if (!linked_as_leader) {
TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait"); TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
// Last leader will not pick us as a follower since our batch is nullptr // Last leader will not pick us as a follower since our batch is nullptr
AwaitState(w, STATE_GROUP_LEADER, &ctx); AwaitState(w, STATE_GROUP_LEADER, &ctx);
mu->Lock();
} }
if (enable_pipelined_write_) {
WaitForMemTableWriters();
}
mu->Lock();
} }
void WriteThread::ExitUnbatched(Writer* w) { void WriteThread::ExitUnbatched(Writer* w) {
Status dummy_status; assert(w != nullptr);
ExitAsBatchGroupLeader(w, w, dummy_status); Writer* newest_writer = w;
if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
CreateMissingNewerLinks(newest_writer);
Writer* next_leader = w->link_newer;
assert(next_leader != nullptr);
next_leader->link_older = nullptr;
SetState(next_leader, STATE_GROUP_LEADER);
}
}
void WriteThread::WaitForMemTableWriters() {
static AdaptationContext ctx("WaitForMemTableWriters");
assert(enable_pipelined_write_);
if (newest_memtable_writer_.load() == nullptr) {
return;
}
Writer w;
if (!LinkOne(&w, &newest_memtable_writer_)) {
AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &ctx);
}
newest_memtable_writer_.store(nullptr);
} }
} // namespace rocksdb } // namespace rocksdb

@ -49,32 +49,66 @@ class WriteThread {
// the leader to STATE_COMPLETED. // the leader to STATE_COMPLETED.
STATE_GROUP_LEADER = 2, STATE_GROUP_LEADER = 2,
// A Writer that has returned as a follower in a parallel group. // The state used to inform a waiting writer that it has become the
// It should apply its batch to the memtable and then call // leader of memtable writer group. The leader will either write
// CompleteParallelWorker. When someone calls ExitAsBatchGroupLeader // memtable for the whole group, or launch a parallel group write
// or EarlyExitParallelGroup this state will get transitioned to // to memtable by calling LaunchParallelMemTableWrite.
// STATE_COMPLETED. STATE_MEMTABLE_WRITER_LEADER = 4,
STATE_PARALLEL_FOLLOWER = 4,
// The state used to inform a waiting writer that it has become a
// parallel memtable writer. It can be the group leader who launch the
// praallel writer group, or one of the followers. The writer should then
// apply its batch to the memtable concurrently and call
// CompleteParallelMemTableWriter.
STATE_PARALLEL_MEMTABLE_WRITER = 8,
// A follower whose writes have been applied, or a parallel leader // A follower whose writes have been applied, or a parallel leader
// whose followers have all finished their work. This is a terminal // whose followers have all finished their work. This is a terminal
// state. // state.
STATE_COMPLETED = 8, STATE_COMPLETED = 16,
// A state indicating that the thread may be waiting using StateMutex() // A state indicating that the thread may be waiting using StateMutex()
// and StateCondVar() // and StateCondVar()
STATE_LOCKED_WAITING = 16, STATE_LOCKED_WAITING = 32,
}; };
struct Writer; struct Writer;
struct ParallelGroup { struct WriteGroup {
Writer* leader; Writer* leader = nullptr;
Writer* last_writer; Writer* last_writer = nullptr;
SequenceNumber last_sequence; SequenceNumber last_sequence;
// before running goes to zero, status needs leader->StateMutex() // before running goes to zero, status needs leader->StateMutex()
Status status; Status status;
std::atomic<uint32_t> running; std::atomic<size_t> running;
size_t size = 0;
struct Iterator {
Writer* writer;
Writer* last_writer;
explicit Iterator(Writer* w, Writer* last)
: writer(w), last_writer(last) {}
Writer* operator*() const { return writer; }
Iterator& operator++() {
assert(writer != nullptr);
if (writer == last_writer) {
writer = nullptr;
} else {
writer = writer->link_newer;
}
return *this;
}
bool operator!=(const Iterator& other) const {
return writer != other.writer;
}
};
Iterator begin() const { return Iterator(leader, last_writer); }
Iterator end() const { return Iterator(nullptr, nullptr); }
}; };
// Information kept for every waiting writer. // Information kept for every waiting writer.
@ -86,11 +120,10 @@ class WriteThread {
bool disable_memtable; bool disable_memtable;
uint64_t log_used; // log number that this batch was inserted into uint64_t log_used; // log number that this batch was inserted into
uint64_t log_ref; // log number that memtable insert should reference uint64_t log_ref; // log number that memtable insert should reference
bool in_batch_group;
WriteCallback* callback; WriteCallback* callback;
bool made_waitable; // records lazy construction of mutex and cv bool made_waitable; // records lazy construction of mutex and cv
std::atomic<uint8_t> state; // write under StateMutex() or pre-link std::atomic<uint8_t> state; // write under StateMutex() or pre-link
ParallelGroup* parallel_group; WriteGroup* write_group;
SequenceNumber sequence; // the sequence number to use for the first key SequenceNumber sequence; // the sequence number to use for the first key
Status status; // status of memtable inserter Status status; // status of memtable inserter
Status callback_status; // status returned by callback->Callback() Status callback_status; // status returned by callback->Callback()
@ -107,11 +140,10 @@ class WriteThread {
disable_memtable(false), disable_memtable(false),
log_used(0), log_used(0),
log_ref(0), log_ref(0),
in_batch_group(false),
callback(nullptr), callback(nullptr),
made_waitable(false), made_waitable(false),
state(STATE_INIT), state(STATE_INIT),
parallel_group(nullptr), write_group(nullptr),
link_older(nullptr), link_older(nullptr),
link_newer(nullptr) {} link_newer(nullptr) {}
@ -124,11 +156,10 @@ class WriteThread {
disable_memtable(_disable_memtable), disable_memtable(_disable_memtable),
log_used(0), log_used(0),
log_ref(_log_ref), log_ref(_log_ref),
in_batch_group(false),
callback(_callback), callback(_callback),
made_waitable(false), made_waitable(false),
state(STATE_INIT), state(STATE_INIT),
parallel_group(nullptr), write_group(nullptr),
link_older(nullptr), link_older(nullptr),
link_newer(nullptr) {} link_newer(nullptr) {}
@ -182,10 +213,12 @@ class WriteThread {
} }
bool ShouldWriteToMemtable() { bool ShouldWriteToMemtable() {
return !CallbackFailed() && !disable_memtable; return status.ok() && !CallbackFailed() && !disable_memtable;
} }
bool ShouldWriteToWAL() { return !CallbackFailed() && !disable_wal; } bool ShouldWriteToWAL() {
return status.ok() && !CallbackFailed() && !disable_wal;
}
// No other mutexes may be acquired while holding StateMutex(), it is // No other mutexes may be acquired while holding StateMutex(), it is
// always last in the order // always last in the order
@ -201,7 +234,16 @@ class WriteThread {
} }
}; };
WriteThread(uint64_t max_yield_usec, uint64_t slow_yield_usec); struct AdaptationContext {
const char* name;
std::atomic<int32_t> value;
explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
};
explicit WriteThread(const ImmutableDBOptions& db_options);
virtual ~WriteThread() = default;
// IMPORTANT: None of the methods in this class rely on the db mutex // IMPORTANT: None of the methods in this class rely on the db mutex
// for correctness. All of the methods except JoinBatchGroup and // for correctness. All of the methods except JoinBatchGroup and
@ -227,39 +269,44 @@ class WriteThread {
// Writer passed to JoinBatchGroup on the current thread. // Writer passed to JoinBatchGroup on the current thread.
// //
// Writer* leader: Writer that is STATE_GROUP_LEADER // Writer* leader: Writer that is STATE_GROUP_LEADER
// Writer** last_writer: Out-param that identifies the last follower // WriteGroup* write_group: Out-param of group members
// autovector<WriteBatch*>* write_batch_group: Out-param of group members
// returns: Total batch group byte size // returns: Total batch group byte size
size_t EnterAsBatchGroupLeader( size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
Writer* leader, Writer** last_writer,
autovector<WriteThread::Writer*>* write_batch_group); // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
// and wakes up the next leader (if any).
//
// WriteGroup* write_group: the write group
// Status status: Status of write operation
void ExitAsBatchGroupLeader(WriteGroup& write_group, Status status);
// Exit batch group on behalf of batch group leader.
void ExitAsBatchGroupFollower(Writer* w);
// Constructs a write batch group led by leader from newest_memtable_writers_
// list. The leader should either write memtable for the whole group and
// call ExitAsMemTableWriter, or launch parallel memtable write through
// LaunchParallelMemTableWriters.
void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_grup);
// Memtable writer group leader, or the last finished writer in a parallel
// write group, exit from the newest_memtable_writers_ list, and wake up
// the next leader if needed.
void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
// Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the
// non-leader members of this write batch group. Sets Writer::sequence // non-leader members of this write batch group. Sets Writer::sequence
// before waking them up. // before waking them up.
// //
// ParallalGroup* pg: Extra state used to coordinate the parallel add // WriteGroup* write_group: Extra state used to coordinate the parallel add
// SequenceNumber sequence: Starting sequence number to assign to Writer-s void LaunchParallelMemTableWriters(WriteGroup* write_group);
void LaunchParallelFollowers(ParallelGroup* pg, SequenceNumber sequence);
// Reports the completion of w's batch to the parallel group leader, and // Reports the completion of w's batch to the parallel group leader, and
// waits for the rest of the parallel batch to complete. Returns true // waits for the rest of the parallel batch to complete. Returns true
// if this thread is the last to complete, and hence should advance // if this thread is the last to complete, and hence should advance
// the sequence number and then call EarlyExitParallelGroup, false if // the sequence number and then call EarlyExitParallelGroup, false if
// someone else has already taken responsibility for that. // someone else has already taken responsibility for that.
bool CompleteParallelWorker(Writer* w); bool CompleteParallelMemTableWriter(Writer* w);
// Exit batch group on behalf of batch group leader.
void ExitAsBatchGroupFollower(Writer* w);
// Unlinks the Writer-s in a batch group, wakes up the non-leaders,
// and wakes up the next leader (if any).
//
// Writer* leader: From EnterAsBatchGroupLeader
// Writer* last_writer: Value of out-param of EnterAsBatchGroupLeader
// Status status: Status of write operation
void ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
Status status);
// Waits for all preceding writers (unlocking mu while waiting), then // Waits for all preceding writers (unlocking mu while waiting), then
// registers w as the currently proceeding writer. // registers w as the currently proceeding writer.
@ -273,21 +320,40 @@ class WriteThread {
// writers. // writers.
void ExitUnbatched(Writer* w); void ExitUnbatched(Writer* w);
struct AdaptationContext { // Wait for all parallel memtable writers to finish, in case pipelined
const char* name; // write is enabled.
std::atomic<int32_t> value; void WaitForMemTableWriters();
explicit AdaptationContext(const char* name0) : name(name0), value(0) {} SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
}; if (sequence > last_sequence_) {
last_sequence_ = sequence;
}
return last_sequence_;
}
private: private:
uint64_t max_yield_usec_; // See AwaitState.
uint64_t slow_yield_usec_; const uint64_t max_yield_usec_;
const uint64_t slow_yield_usec_;
// Allow multiple writers write to memtable concurrently.
const bool allow_concurrent_memtable_write_;
// Points to the newest pending Writer. Only leader can remove // Enable pipelined write to WAL and memtable.
// elements, adding can be done lock-free by anybody const bool enable_pipelined_write_;
// Points to the newest pending writer. Only leader can remove
// elements, adding can be done lock-free by anybody.
std::atomic<Writer*> newest_writer_; std::atomic<Writer*> newest_writer_;
// Points to the newest pending memtable writer. Used only when pipelined
// write is enabled.
std::atomic<Writer*> newest_memtable_writer_;
// The last sequence that have been consumed by a writer. The sequence
// is not necessary visible to reads because the writer can be ongoing.
SequenceNumber last_sequence_;
// Waits for w->state & goal_mask using w->StateMutex(). Returns
// the state that satisfies goal_mask.
uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
@@ -298,16 +364,30 @@ class WriteThread {
// a context-dependent static.
uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
// Set writer state and wake the writer up if it is waiting.
void SetState(Writer* w, uint8_t new_state);
// Links w into the newest_writer_ list. Sets *linked_as_leader to
// true if w was linked directly into the leader position. Safe to
// call from multiple threads without external locking.
void LinkOne(Writer* w, bool* linked_as_leader);
// Links w into the newest_writer list. Return true if w was linked directly
// into the leader position. Safe to call from multiple threads without
// external locking.
bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
// Link write group into the newest_writer list as a whole, while keeping the
// order of the writers unchanged. Return true if the group was linked
// directly into the leader position.
bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
// Computes any missing link_newer links. Should not be called
// concurrently with itself.
void CreateMissingNewerLinks(Writer* head);
// Set the leader in write_group to completed state and remove it from the
// write group.
void CompleteLeader(WriteGroup& write_group);
// Set a follower in write_group to completed state and remove it from the
// write group.
void CompleteFollower(Writer* w, WriteGroup& write_group);
};
} // namespace rocksdb
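The comments above describe the lock-free queue used for both the WAL writer list (newest_writer_) and, when pipelined write is enabled, the memtable writer list (newest_memtable_writer_). A minimal, self-contained sketch of the linking step follows; the simplified Writer struct and the main() driver are illustrative stand-ins for this explanation, not the actual RocksDB types:

#include <atomic>
#include <cassert>

struct Writer {
  Writer* link_older = nullptr;  // next-older writer in the queue
};

// Returns true if w became the leader, i.e. the list was empty when it
// linked itself in (mirrors the contract documented for LinkOne above).
bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
  Writer* writers = newest_writer->load(std::memory_order_relaxed);
  while (true) {
    w->link_older = writers;
    if (newest_writer->compare_exchange_weak(writers, w)) {
      return (writers == nullptr);
    }
    // CAS failed: `writers` now holds the current head, so retry.
  }
}

int main() {
  std::atomic<Writer*> wal_queue{nullptr};       // stands in for newest_writer_
  std::atomic<Writer*> memtable_queue{nullptr};  // stands in for newest_memtable_writer_
  Writer w1, w2;
  assert(LinkOne(&w1, &wal_queue));       // first writer becomes the WAL leader
  assert(!LinkOne(&w2, &wal_queue));      // second writer queues behind w1
  assert(LinkOne(&w1, &memtable_queue));  // later, a writer joins the memtable queue
  return 0;
}

The CAS retry loop is what lets any number of threads append concurrently, while only the writer that observed an empty list becomes leader and is allowed to unlink and wake the others.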

@@ -751,6 +751,21 @@ struct DBOptions {
// Default: 16MB/s
uint64_t delayed_write_rate = 16 * 1024U * 1024U;
// By default, a single write thread queue is maintained. The thread that
// gets to the head of the queue becomes the write batch group leader and is
// responsible for writing to the WAL and the memtable for the batch group.
//
// If enable_pipelined_write is true, separate write thread queues are
// maintained for WAL writes and memtable writes. A write thread first enters
// the WAL writer queue and then the memtable writer queue. A pending thread
// on the WAL writer queue thus only has to wait for previous writers to
// finish their WAL writing but not the memtable writing. Enabling the
// feature may improve write throughput and reduce latency of the prepare
// phase of two-phase commit.
//
// Default: false
bool enable_pipelined_write = false;
// If true, allow multi-writers to update mem tables in parallel.
// Only some memtable_factory-s support concurrent writes; currently it
// is implemented only for SkipListFactory. Concurrent memtable writes
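A short sketch of how a user opts in to the new behavior; only enable_pipelined_write comes from this change, and the database path and the rest are ordinary RocksDB usage chosen for illustration:

#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.enable_pipelined_write = true;  // use separate WAL and memtable writer queues
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/pipelined_write_example", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  delete db;
  return 0;
}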

@@ -70,6 +70,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
wal_bytes_per_sync(options.wal_bytes_per_sync),
listeners(options.listeners),
enable_thread_tracking(options.enable_thread_tracking),
enable_pipelined_write(options.enable_pipelined_write),
allow_concurrent_memtable_write(options.allow_concurrent_memtable_write),
enable_write_thread_adaptive_yield(
options.enable_write_thread_adaptive_yield),
@@ -189,6 +190,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
wal_recovery_mode);
ROCKS_LOG_HEADER(log, " Options.enable_thread_tracking: %d",
enable_thread_tracking);
ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d",
enable_pipelined_write);
ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d", ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d",
allow_concurrent_memtable_write); allow_concurrent_memtable_write);
ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d", ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d",

@@ -63,6 +63,7 @@ struct ImmutableDBOptions {
uint64_t wal_bytes_per_sync;
std::vector<std::shared_ptr<EventListener>> listeners;
bool enable_thread_tracking;
bool enable_pipelined_write;
bool allow_concurrent_memtable_write;
bool enable_write_thread_adaptive_yield;
uint64_t write_thread_max_yield_usec;

@@ -176,6 +176,7 @@ DBOptions::DBOptions(const Options& options)
listeners(options.listeners),
enable_thread_tracking(options.enable_thread_tracking),
delayed_write_rate(options.delayed_write_rate),
enable_pipelined_write(options.enable_pipelined_write),
allow_concurrent_memtable_write(options.allow_concurrent_memtable_write),
enable_write_thread_adaptive_yield(
options.enable_write_thread_adaptive_yield),

@@ -305,6 +305,9 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"fail_if_options_file_error",
{offsetof(struct DBOptions, fail_if_options_file_error),
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
{"enable_pipelined_write",
{offsetof(struct DBOptions, enable_pipelined_write), OptionType::kBoolean,
OptionVerificationType::kNormal, false, 0}},
{"allow_concurrent_memtable_write", {"allow_concurrent_memtable_write",
{offsetof(struct DBOptions, allow_concurrent_memtable_write), {offsetof(struct DBOptions, allow_concurrent_memtable_write),
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},

@@ -281,6 +281,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"random_access_max_buffer_size=1048576;"
"advise_random_on_open=true;"
"fail_if_options_file_error=false;"
"enable_pipelined_write=false;"
"allow_concurrent_memtable_write=true;" "allow_concurrent_memtable_write=true;"
"wal_recovery_mode=kPointInTimeRecovery;" "wal_recovery_mode=kPointInTimeRecovery;"
"enable_write_thread_adaptive_yield=true;" "enable_write_thread_adaptive_yield=true;"
