Summary: This diff just moves the write thread control out of the DBImpl. I will need this as I will control column family data concurrency by only accessing some data in the write thread. That way, we won't have to lock our accesses to column family hash table (mappings from IDs to CFDs). Test Plan: make check Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23301main
parent
540a257f2c
commit
dee91c259d
@ -0,0 +1,147 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "db/write_thread.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
Status WriteThread::EnterWriteThread(WriteThread::Writer* w, |
||||
uint64_t expiration_time) { |
||||
// the following code block pushes the current writer "w" into the writer
|
||||
// queue "writers_" and wait until one of the following conditions met:
|
||||
// 1. the job of "w" has been done by some other writers.
|
||||
// 2. "w" becomes the first writer in "writers_"
|
||||
// 3. "w" timed-out.
|
||||
writers_.push_back(w); |
||||
|
||||
bool timed_out = false; |
||||
while (!w->done && w != writers_.front()) { |
||||
if (expiration_time == 0) { |
||||
w->cv.Wait(); |
||||
} else if (w->cv.TimedWait(expiration_time)) { |
||||
if (w->in_batch_group) { |
||||
// then it means the front writer is currently doing the
|
||||
// write on behalf of this "timed-out" writer. Then it
|
||||
// should wait until the write completes.
|
||||
expiration_time = 0; |
||||
} else { |
||||
timed_out = true; |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (timed_out) { |
||||
#ifndef NDEBUG |
||||
bool found = false; |
||||
#endif |
||||
for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { |
||||
if (*iter == w) { |
||||
writers_.erase(iter); |
||||
#ifndef NDEBUG |
||||
found = true; |
||||
#endif |
||||
break; |
||||
} |
||||
} |
||||
#ifndef NDEBUG |
||||
assert(found); |
||||
#endif |
||||
// writers_.front() might still be in cond_wait without a time-out.
|
||||
// As a result, we need to signal it to wake it up. Otherwise no
|
||||
// one else will wake him up, and RocksDB will hang.
|
||||
if (!writers_.empty()) { |
||||
writers_.front()->cv.Signal(); |
||||
} |
||||
return Status::TimedOut(); |
||||
} |
||||
return Status::OK(); |
||||
} |
||||
|
||||
void WriteThread::ExitWriteThread(WriteThread::Writer* w, |
||||
WriteThread::Writer* last_writer, |
||||
Status status) { |
||||
// Pop out the current writer and all writers being pushed before the
|
||||
// current writer from the writer queue.
|
||||
while (!writers_.empty()) { |
||||
Writer* ready = writers_.front(); |
||||
writers_.pop_front(); |
||||
if (ready != w) { |
||||
ready->status = status; |
||||
ready->done = true; |
||||
ready->cv.Signal(); |
||||
} |
||||
if (ready == last_writer) break; |
||||
} |
||||
|
||||
// Notify new head of write queue
|
||||
if (!writers_.empty()) { |
||||
writers_.front()->cv.Signal(); |
||||
} |
||||
} |
||||
|
||||
// This function will be called only when the first writer succeeds.
|
||||
// All writers in the to-be-built batch group will be processed.
|
||||
//
|
||||
// REQUIRES: Writer list must be non-empty
|
||||
// REQUIRES: First writer must have a non-nullptr batch
|
||||
void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer, |
||||
autovector<WriteBatch*>* write_batch_group) { |
||||
assert(!writers_.empty()); |
||||
Writer* first = writers_.front(); |
||||
assert(first->batch != nullptr); |
||||
|
||||
size_t size = WriteBatchInternal::ByteSize(first->batch); |
||||
write_batch_group->push_back(first->batch); |
||||
|
||||
// Allow the group to grow up to a maximum size, but if the
|
||||
// original write is small, limit the growth so we do not slow
|
||||
// down the small write too much.
|
||||
size_t max_size = 1 << 20; |
||||
if (size <= (128<<10)) { |
||||
max_size = size + (128<<10); |
||||
} |
||||
|
||||
*last_writer = first; |
||||
std::deque<Writer*>::iterator iter = writers_.begin(); |
||||
++iter; // Advance past "first"
|
||||
for (; iter != writers_.end(); ++iter) { |
||||
Writer* w = *iter; |
||||
if (w->sync && !first->sync) { |
||||
// Do not include a sync write into a batch handled by a non-sync write.
|
||||
break; |
||||
} |
||||
|
||||
if (!w->disableWAL && first->disableWAL) { |
||||
// Do not include a write that needs WAL into a batch that has
|
||||
// WAL disabled.
|
||||
break; |
||||
} |
||||
|
||||
if (w->timeout_hint_us < first->timeout_hint_us) { |
||||
// Do not include those writes with shorter timeout. Otherwise, we might
|
||||
// execute a write that should instead be aborted because of timeout.
|
||||
break; |
||||
} |
||||
|
||||
if (w->batch == nullptr) { |
||||
// Do not include those writes with nullptr batch. Those are not writes,
|
||||
// those are something else. They want to be alone
|
||||
break; |
||||
} |
||||
|
||||
size += WriteBatchInternal::ByteSize(w->batch); |
||||
if (size > max_size) { |
||||
// Do not make batch too big
|
||||
break; |
||||
} |
||||
|
||||
write_batch_group->push_back(w->batch); |
||||
w->in_batch_group = true; |
||||
*last_writer = w; |
||||
} |
||||
} |
||||
|
||||
} // namespace rocksdb
|
@ -0,0 +1,80 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#include <stdint.h> |
||||
#include <deque> |
||||
#include <limits> |
||||
#include "rocksdb/status.h" |
||||
#include "db/write_batch_internal.h" |
||||
#include "util/autovector.h" |
||||
#include "port/port.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class WriteThread { |
||||
public: |
||||
static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max(); |
||||
// Information kept for every waiting writer
|
||||
struct Writer { |
||||
Status status; |
||||
WriteBatch* batch; |
||||
bool sync; |
||||
bool disableWAL; |
||||
bool in_batch_group; |
||||
bool done; |
||||
uint64_t timeout_hint_us; |
||||
port::CondVar cv; |
||||
|
||||
explicit Writer(port::Mutex* mu) |
||||
: batch(nullptr), |
||||
sync(false), |
||||
disableWAL(false), |
||||
in_batch_group(false), |
||||
done(false), |
||||
timeout_hint_us(kNoTimeOut), |
||||
cv(mu) {} |
||||
}; |
||||
|
||||
WriteThread() = default; |
||||
~WriteThread() = default; |
||||
|
||||
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
|
||||
// thread should grab the mutex_ and be the first on writers queue.
|
||||
// EnterWriteThread is used for it.
|
||||
// Be aware! Writer's job can be done by other thread (see DBImpl::Write
|
||||
// for examples), so check it via w.done before applying changes.
|
||||
//
|
||||
// Writer* w: writer to be placed in the queue
|
||||
// uint64_t expiration_time: maximum time to be in the queue
|
||||
// See also: ExitWriteThread
|
||||
// REQUIRES: db mutex held
|
||||
Status EnterWriteThread(Writer* w, uint64_t expiration_time); |
||||
|
||||
// After doing write job, we need to remove already used writers from
|
||||
// writers_ queue and notify head of the queue about it.
|
||||
// ExitWriteThread is used for this.
|
||||
//
|
||||
// Writer* w: Writer, that was added by EnterWriteThread function
|
||||
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
|
||||
// does)
|
||||
// we should pass last_writer as a parameter to
|
||||
// ExitWriteThread
|
||||
// (if you don't touch other writers, just pass w)
|
||||
// Status status: Status of write operation
|
||||
// See also: EnterWriteThread
|
||||
// REQUIRES: db mutex held
|
||||
void ExitWriteThread(Writer* w, Writer* last_writer, Status status); |
||||
|
||||
void BuildBatchGroup(Writer** last_writer, |
||||
autovector<WriteBatch*>* write_batch_group); |
||||
|
||||
private: |
||||
// Queue of writers.
|
||||
std::deque<Writer*> writers_; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
Loading…
Reference in new issue