Do not hold mutex when writing keys if not necessary (#7516)

Summary:
## Problem Summary
RocksDB acquires the global mutex of the db instance every time the user calls `Write`. When RocksDB schedules a lot of compaction jobs, they compete for this mutex with the write thread, which hurts write performance.

## Problem Solution
I want to use `log_write_mutex_` in place of the global mutex in most cases, so that the write thread does not acquire it unless a write-stall event or a write-buffer-full event occurs.
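In other words, the hot write path should only need the narrower `log_write_mutex_` to append to the WAL, and should fall back to the global `mutex_` only when the write has to switch the memtable or wait out a stall. A minimal sketch of that locking pattern (the toy class and helper names such as `ToyDB` and `NeedsMemtableSwitch()` are hypothetical, not the actual RocksDB code):

```cpp
#include <mutex>
#include <string>

class ToyDB {
 public:
  void Write(const std::string& key, const std::string& value) {
    if (NeedsMemtableSwitch() || NeedsStall()) {
      // Slow path: state changes still require the wide DB mutex.
      std::lock_guard<std::mutex> db_lock(db_mutex_);
      SwitchMemtableOrStall();
    }
    // Fast path: appending to the WAL only needs the narrow log mutex,
    // so writers no longer contend with compaction threads on db_mutex_.
    std::lock_guard<std::mutex> log_lock(log_write_mutex_);
    AppendToWal(key, value);
  }

 private:
  bool NeedsMemtableSwitch() const { return false; }  // placeholder check
  bool NeedsStall() const { return false; }           // placeholder check
  void SwitchMemtableOrStall() {}                     // placeholder slow path
  void AppendToWal(const std::string& /*key*/, const std::string& /*value*/) {}

  std::mutex db_mutex_;         // wide lock: version set, flush/compaction state
  std::mutex log_write_mutex_;  // narrow lock: WAL writers and the logs_ list
};
```

With this split, flush and compaction threads that contend on the DB mutex no longer block the common write path.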

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7516

Test Plan:
1. make check
2. CI
3. COMPILE_WITH_TSAN=1 make db_stress
   make crash_test
   make crash_test_with_multiops_wp_txn
   make crash_test_with_multiops_wc_txn
   make crash_test_with_atomic_flush

Reviewed By: siying

Differential Revision: D36908702

Pulled By: riversand963

fbshipit-source-id: 59b13881f4f5c0a58fd3ca79128a396d9cd98efe
Authored by Wallace, committed by Facebook GitHub Bot
parent a0c63083d3
commit 1e9bf25f61
17 changed files:
    1  HISTORY.md
   14  db/db_compaction_test.cc
   61  db/db_impl/db_impl.cc
  121  db/db_impl/db_impl.h
   32  db/db_impl/db_impl_compaction_flush.cc
    2  db/db_impl/db_impl_debug.cc
   51  db/db_impl/db_impl_files.cc
   14  db/db_impl/db_impl_open.cc
  117  db/db_impl/db_impl_write.cc
    7  db/error_handler.cc
    9  db/error_handler.h
    2  db/perf_context_test.cc
   14  db/version_set_test.cc
   33  db/wal_edit.cc
   10  db/wal_edit.h
    9  db/wal_edit_test.cc
    3  utilities/backup/backup_engine_test.cc

@ -99,6 +99,7 @@
### Behavior changes
* DB::Open(), DB::OpenAsSecondary() will fail if a Logger cannot be created (#9984)
* DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516).
* Removed support for reading Bloom filters using obsolete block-based filter format. (Support for writing such filters was dropped in 7.0.) For good read performance on old DBs using these filters, a full compaction is required.
* Per KV checksum in write batch is verified before a write batch is written to WAL to detect any corruption to the write batch (#10114).

@ -5659,18 +5659,10 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
for (int j = 0; j != kNumKeysPerFile; ++j) {
ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
}
if (0 == i) {
// When we reach here, the memtables have kNumKeysPerFile keys. Note that
// flush is not yet triggered. We need to write an extra key so that the
// write path will call PreprocessWrite and flush the previous key-value
// pairs to e flushed. After that, there will be the newest key in the
// memtable, and a bunch of L0 files. Since there is already one key in
// the memtable, then for i = 1, 2, ..., we do not have to write this
// extra key to trigger flush.
ASSERT_OK(Put("", ""));
}
if (i > 0) {
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
}
}
// When we reach this point, there will be level0_stop_writes_trigger L0
// files and one extra key (99) in memory, which overlaps with the external

@ -185,7 +185,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
log_dir_synced_(false),
log_empty_(true),
persist_stats_cf_handle_(nullptr),
log_sync_cv_(&mutex_),
log_sync_cv_(&log_write_mutex_),
total_log_size_(0),
is_snapshot_supported_(true),
write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
@ -273,6 +273,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
mutable_db_options_.Dump(immutable_db_options_.info_log.get());
DumpSupportInfo(immutable_db_options_.info_log.get());
max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
std::memory_order_relaxed);
if (write_buffer_manager_) {
wbm_stall_.reset(new WBMStallInterface());
}
@ -625,7 +627,8 @@ Status DBImpl::CloseHelper() {
job_context.Clean();
mutex_.Lock();
}
{
InstrumentedMutexLock lock(&log_write_mutex_);
for (auto l : logs_to_free_) {
delete l;
}
@ -645,6 +648,7 @@ Status DBImpl::CloseHelper() {
}
}
logs_.clear();
}
// Table cache may have table handles holding blocks from the block cache.
// We need to release them before the block cache is destroyed. The block
@ -1108,6 +1112,7 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
}
void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
mutex_.AssertHeld();
if (!job_context->logs_to_free.empty()) {
for (auto l : job_context->logs_to_free) {
AddToLogsToFreeQueue(l);
@ -1285,6 +1290,11 @@ Status DBImpl::SetDBOptions(
new_options.stats_persist_period_sec);
mutex_.Lock();
}
if (new_options.max_total_wal_size !=
mutable_db_options_.max_total_wal_size) {
max_total_wal_size_.store(new_options.max_total_wal_size,
std::memory_order_release);
}
write_controller_.set_max_delayed_write_rate(
new_options.delayed_write_rate);
table_cache_.get()->SetCapacity(new_options.max_open_files == -1
@ -1405,7 +1415,7 @@ Status DBImpl::SyncWAL() {
uint64_t current_log_number;
{
InstrumentedMutexLock l(&mutex_);
InstrumentedMutexLock l(&log_write_mutex_);
assert(!logs_.empty());
// This SyncWAL() call only cares about logs up to this number.
@ -1462,19 +1472,37 @@ Status DBImpl::SyncWAL() {
TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
VersionEdit synced_wals;
{
InstrumentedMutexLock l(&mutex_);
InstrumentedMutexLock l(&log_write_mutex_);
if (status.ok()) {
status = MarkLogsSynced(current_log_number, need_log_dir_sync);
MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
} else {
MarkLogsNotSynced(current_log_number);
}
}
if (status.ok() && synced_wals.IsWalAddition()) {
InstrumentedMutexLock l(&mutex_);
status = ApplyWALToManifest(&synced_wals);
}
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
return status;
}
Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
// not empty, write to MANIFEST.
mutex_.AssertHeld();
Status status =
versions_->LogAndApplyToDefaultColumnFamily(synced_wals, &mutex_);
if (!status.ok() && versions_->io_status().IsIOError()) {
status = error_handler_.SetBGError(versions_->io_status(),
BackgroundErrorReason::kManifestWrite);
}
return status;
}
Status DBImpl::LockWAL() {
log_write_mutex_.Lock();
auto cur_log_writer = logs_.back().writer;
@ -1494,12 +1522,12 @@ Status DBImpl::UnlockWAL() {
return Status::OK();
}
Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
mutex_.AssertHeld();
void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
VersionEdit* synced_wals) {
log_write_mutex_.AssertHeld();
if (synced_dir && logfile_number_ == up_to) {
log_dir_synced_ = true;
}
VersionEdit synced_wals;
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
auto& wal = *it;
assert(wal.IsSyncing());
@ -1507,11 +1535,9 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
if (logs_.size() > 1) {
if (immutable_db_options_.track_and_verify_wals_in_manifest &&
wal.GetPreSyncSize() > 0) {
synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
}
logs_to_free_.push_back(wal.ReleaseWriter());
// To modify logs_ both mutex_ and log_write_mutex_ must be held
InstrumentedMutexLock l(&log_write_mutex_);
it = logs_.erase(it);
} else {
wal.FinishSync();
@ -1520,22 +1546,11 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
}
assert(logs_.empty() || logs_[0].number > up_to ||
(logs_.size() == 1 && !logs_[0].IsSyncing()));
Status s;
if (synced_wals.IsWalAddition()) {
// not empty, write to MANIFEST.
s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_);
if (!s.ok() && versions_->io_status().IsIOError()) {
s = error_handler_.SetBGError(versions_->io_status(),
BackgroundErrorReason::kManifestWrite);
}
}
log_sync_cv_.SignalAll();
return s;
}
void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
mutex_.AssertHeld();
log_write_mutex_.AssertHeld();
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
++it) {
auto& wal = *it;

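The `MarkLogsSynced()` / `ApplyWALToManifest()` split above follows a collect-then-apply pattern: WAL bookkeeping is updated under the narrow `log_write_mutex_`, and only the resulting MANIFEST update takes the wide `mutex_`. A simplified sketch of that shape, assuming toy stand-ins (`ToyDB`, `SyncedWalEdit`) rather than the real `DBImpl`/`VersionEdit` types:

```cpp
#include <cstdint>
#include <mutex>
#include <vector>

struct SyncedWalEdit {            // stand-in for a VersionEdit with WAL additions
  std::vector<uint64_t> synced;   // WAL numbers whose synced size was recorded
  bool IsWalAddition() const { return !synced.empty(); }
};

class ToyDB {
 public:
  // Caller of the sync path: narrow lock first, wide lock only if needed.
  void AfterWalSync(uint64_t up_to) {
    SyncedWalEdit edit;
    {
      std::lock_guard<std::mutex> log_lock(log_write_mutex_);
      MarkLogsSynced(up_to, &edit);     // mutates WAL bookkeeping only
    }
    if (edit.IsWalAddition()) {
      std::lock_guard<std::mutex> db_lock(db_mutex_);
      ApplyWalEditToManifest(edit);     // MANIFEST write still needs db_mutex_
    }
  }

 private:
  void MarkLogsSynced(uint64_t up_to, SyncedWalEdit* edit) {
    for (uint64_t n = oldest_wal_; n <= up_to; ++n) edit->synced.push_back(n);
    oldest_wal_ = up_to + 1;
  }
  void ApplyWalEditToManifest(const SyncedWalEdit& /*edit*/) {}  // placeholder

  std::mutex db_mutex_;
  std::mutex log_write_mutex_;
  uint64_t oldest_wal_ = 1;
};
```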
@ -998,6 +998,7 @@ class DBImpl : public DB {
}
void AddToLogsToFreeQueue(log::Writer* log_writer) {
mutex_.AssertHeld();
logs_to_free_queue_.push_back(log_writer);
}
@ -1298,7 +1299,7 @@ class DBImpl : public DB {
// only used for dynamically adjusting max_total_wal_size. it is a sum of
// [write_buffer_size * max_write_buffer_number] over all column families
uint64_t max_total_in_memory_state_;
std::atomic<uint64_t> max_total_in_memory_state_;
// The options to access storage files
const FileOptions file_options_;
@ -1648,6 +1649,15 @@ class DBImpl : public DB {
uint64_t pre_sync_size = 0;
};
struct LogContext {
explicit LogContext(bool need_sync = false)
: need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
bool need_log_sync = false;
bool need_log_dir_sync = false;
log::Writer* writer = nullptr;
LogFileNumberSize* log_file_number_size = nullptr;
};
// PurgeFileInfo is a structure to hold information of files to be deleted in
// purge_files_
struct PurgeFileInfo {
@ -1801,7 +1811,7 @@ class DBImpl : public DB {
void ReleaseFileNumberFromPendingOutputs(
std::unique_ptr<std::list<uint64_t>::iterator>& v);
IOStatus SyncClosedLogs(JobContext* job_context);
IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
// Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful. Then
@ -1961,8 +1971,8 @@ class DBImpl : public DB {
Status HandleWriteBufferManagerFlush(WriteContext* write_context);
// REQUIRES: mutex locked
Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
WriteContext* write_context);
Status PreprocessWrite(const WriteOptions& write_options,
LogContext* log_context, WriteContext* write_context);
// Merge write batches in the write group into merged_batch.
// Returns OK if merge is successful.
@ -2101,7 +2111,8 @@ class DBImpl : public DB {
std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
// helper function to call after some of the logs_ were synced
Status MarkLogsSynced(uint64_t up_to, bool synced_dir);
void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
Status ApplyWALToManifest(VersionEdit* edit);
// WALs with log number up to up_to are not synced successfully.
void MarkLogsNotSynced(uint64_t up_to);
@ -2307,8 +2318,9 @@ class DBImpl : public DB {
// logfile_number_ is currently updated only in write_thread_, it can be read
// from the same write_thread_ without any locks.
uint64_t logfile_number_;
std::deque<uint64_t>
log_recycle_files_; // a list of log files that we can recycle
// Log files that we can recycle. Must be protected by db mutex_.
std::deque<uint64_t> log_recycle_files_;
// Protected by log_write_mutex_.
bool log_dir_synced_;
// Without two_write_queues, read and writes to log_empty_ are protected by
// mutex_. Since it is currently updated/read only in write_thread_, it can be
@ -2322,26 +2334,93 @@ class DBImpl : public DB {
bool persistent_stats_cfd_exists_ = true;
// Without two_write_queues, read and writes to alive_log_files_ are
// protected by mutex_. With two_write_queues_, writes
// are protected by locking both mutex_ and log_write_mutex_, and reads must
// be under either mutex_ or log_write_mutex_.
// alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
// as follows:
// 1. read by FindObsoleteFiles() which can be called in either application
// thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
// held.
// 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
// are held.
// 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
// (actually called by Open()), only mutex_ is held because at this point,
// the DB::Open() call has not returned success to application, and the
// only other thread(s) that can conflict are bg threads calling
// FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
// are held when accessing alive_log_files_.
// 4. read by DBImpl::Open() is protected by mutex_.
// 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
// held. This is done by the write group leader. Note that in the case of
// two-write-queues, another WAL-only write thread can be writing to the
// WAL concurrently. See 9.
// 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
// done by write group leader.
// 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
// two-write-queues. Only log_write_mutex_ is held to protect concurrent
// pop_front() by FindObsoleteFiles().
// 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
// is held to protect the data structure from concurrent pop_front() by
// FindObsoleteFiles().
// 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
// of two-write-queues. Only log_write_mutex_ is held. This suffices to
// protect the data structure from concurrent push_back() by current
// write group leader as well as pop_front() by FindObsoleteFiles().
std::deque<LogFileNumberSize> alive_log_files_;
// Log files that aren't fully synced, and the current log file.
// Synchronization:
// - push_back() is done from write_thread_ with locked mutex_ and
// log_write_mutex_
// - pop_front() is done from any thread with locked mutex_ and
// log_write_mutex_
// - reads are done with either locked mutex_ or log_write_mutex_
// 1. read by FindObsoleteFiles() which can be called either in application
// thread or RocksDB bg threads. log_write_mutex_ is always held, while
// some reads are performed without mutex_.
// 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
// 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
// 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex.
// Note that at this point, DB::Open() has not returned success to
// application, thus the only other thread(s) that can conflict are bg
// threads calling FindObsoleteFiles(). See 1.
// 5. iteration and clear() from CloseHelper() always hold log_write_mutex
// and mutex_.
// 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
// log_write_mutex_. These two can be called by application threads after
// DB::Open() returns success to applications.
// 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
// 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
// log_write_mutex_.
// 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
// 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
// happen in bg flush threads after DB::Open() returns success to
// applications.
// 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
// holds only the log_write_mutex_. This is done by the write group
// leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
// can happen concurrently. This is fine because log_write_mutex_ is used
// by all parties. See 2, 5, 9.
// 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
// log_write_mutex_. This happens in the write group leader.
// 13. emplace_back() by SwitchMemtable() hold both mutex_ and
// log_write_mutex_. This happens in the write group leader. Can conflict
// with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
// SyncClosedLogs(), etc. as well as application threads calling
// FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
// require at least log_write_mutex_.
// 14. iteration called in WriteToWAL(write_group) protected by
// log_write_mutex_. This is done by write group leader when
// two-write-queues is disabled and write needs to sync logs.
// 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
// This can be done by the write group leader if two-write-queues is
// enabled. It can also be done by another WAL-only write thread.
//
// Other observations:
// - back() and items with getting_synced=true are not popped,
// - The same thread that sets getting_synced=true will reset it.
// - it follows that the object referred by back() can be safely read from
// the write_thread_ without using mutex
// the write_thread_ without using mutex. Note that calling back() without
// mutex may be unsafe because different implementations of deque::back() may
// access other member variables of deque, causing undefined behaviors.
// Generally, do not access stl containers without proper synchronization.
// - it follows that the items with getting_synced=true can be safely read
// from the same thread that has set getting_synced=true
std::deque<LogWriterNumber> logs_;
// Signaled when getting_synced becomes false for some of the logs_.
InstrumentedCondVar log_sync_cv_;
// This is the app-level state that is written to the WAL but will be used
@ -2356,7 +2435,7 @@ class DBImpl : public DB {
std::atomic<uint64_t> total_log_size_;
// If this is non-empty, we need to delete these log files in background
// threads. Protected by db mutex.
// threads. Protected by log_write_mutex_.
autovector<log::Writer*> logs_to_free_;
bool is_snapshot_supported_;
@ -2436,10 +2515,13 @@ class DBImpl : public DB {
// JobContext. Current implementation tracks table and blob files only.
std::unordered_set<uint64_t> files_grabbed_for_purge_;
// A queue to store log writers to close
// A queue to store log writers to close. Protected by db mutex_.
std::deque<log::Writer*> logs_to_free_queue_;
std::deque<SuperVersion*> superversions_to_free_queue_;
int unscheduled_flushes_;
int unscheduled_compactions_;
// count how many background compactions are running or have been scheduled in
@ -2592,6 +2674,7 @@ class DBImpl : public DB {
InstrumentedCondVar atomic_flush_install_cv_;
bool wal_in_db_path_;
std::atomic<uint64_t> max_total_wal_size_;
BlobFileCompletionCallback blob_callback_;

@ -82,9 +82,10 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
return false;
}
IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
VersionEdit* synced_wals) {
TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
mutex_.AssertHeld();
InstrumentedMutexLock l(&log_write_mutex_);
autovector<log::Writer*, 1> logs_to_sync;
uint64_t current_log_number = logfile_number_;
while (logs_.front().number < current_log_number &&
@ -100,7 +101,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
IOStatus io_s;
if (!logs_to_sync.empty()) {
mutex_.Unlock();
log_write_mutex_.Unlock();
assert(job_context);
@ -128,12 +129,12 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
/*arg=*/nullptr);
mutex_.Lock();
log_write_mutex_.Lock();
// "number <= current_log_number - 1" is equivalent to
// "number < current_log_number".
if (io_s.ok()) {
io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true));
MarkLogsSynced(current_log_number - 1, true, synced_wals);
} else {
MarkLogsNotSynced(current_log_number - 1);
}
@ -220,8 +221,16 @@ Status DBImpl::FlushMemTableToOutputFile(
bool need_cancel = false;
IOStatus log_io_s = IOStatus::OK();
if (needs_to_sync_closed_wals) {
// SyncClosedLogs() may unlock and re-lock the db_mutex.
log_io_s = SyncClosedLogs(job_context);
// SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
// times.
VersionEdit synced_wals;
mutex_.Unlock();
log_io_s = SyncClosedLogs(job_context, &synced_wals);
mutex_.Lock();
if (log_io_s.ok() && synced_wals.IsWalAddition()) {
log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
}
if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
!log_io_s.IsColumnFamilyDropped()) {
error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
@ -474,7 +483,14 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
if (logfile_number_ > 0) {
// TODO (yanqin) investigate whether we should sync the closed logs for
// single column family case.
log_io_s = SyncClosedLogs(job_context);
VersionEdit synced_wals;
mutex_.Unlock();
log_io_s = SyncClosedLogs(job_context, &synced_wals);
mutex_.Lock();
if (log_io_s.ok() && synced_wals.IsWalAddition()) {
log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
}
if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
!log_io_s.IsColumnFamilyDropped()) {
if (total_log_size_ > 0) {

@ -223,7 +223,7 @@ void DBImpl::TEST_EndWrite(void* w) {
}
size_t DBImpl::TEST_LogsToFreeSize() {
InstrumentedMutexLock l(&mutex_);
InstrumentedMutexLock l(&log_write_mutex_);
return logs_to_free_.size();
}

@ -271,6 +271,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
// logs_ is empty when called during recovery, in which case there can't yet
// be any tracked obsolete logs
log_write_mutex_.Lock();
if (alive_log_files_.empty() || logs_.empty()) {
mutex_.AssertHeld();
// We may reach here if the db is DBImplSecondary
log_write_mutex_.Unlock();
return;
}
if (!alive_log_files_.empty() && !logs_.empty()) {
uint64_t min_log_number = job_context->log_number;
size_t num_alive_log_files = alive_log_files_.size();
@ -292,17 +301,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
}
job_context->size_log_to_delete += earliest.size;
total_log_size_ -= earliest.size;
if (two_write_queues_) {
log_write_mutex_.Lock();
}
alive_log_files_.pop_front();
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
// Current log should always stay alive since it can't have
// number < MinLogNumber().
assert(alive_log_files_.size());
}
log_write_mutex_.Unlock();
mutex_.Unlock();
log_write_mutex_.Lock();
while (!logs_.empty() && logs_.front().number < min_log_number) {
auto& log = logs_.front();
if (log.IsSyncing()) {
@ -311,11 +318,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
continue;
}
logs_to_free_.push_back(log.ReleaseWriter());
{
InstrumentedMutexLock wl(&log_write_mutex_);
logs_.pop_front();
}
}
// Current log cannot be obsolete.
assert(!logs_.empty());
}
@ -323,23 +327,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
// We're just cleaning up for DB::Write().
assert(job_context->logs_to_free.empty());
job_context->logs_to_free = logs_to_free_;
logs_to_free_.clear();
log_write_mutex_.Unlock();
mutex_.Lock();
job_context->log_recycle_files.assign(log_recycle_files_.begin(),
log_recycle_files_.end());
logs_to_free_.clear();
}
namespace {
bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
const JobContext::CandidateFileInfo& second) {
if (first.file_name > second.file_name) {
return true;
} else if (first.file_name < second.file_name) {
return false;
} else {
return (first.file_path > second.file_path);
}
}
} // namespace
// Delete obsolete files and log status and information of file deletion
void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
@ -445,7 +439,16 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
// dedup state.candidate_files so we don't try to delete the same
// file twice
std::sort(candidate_files.begin(), candidate_files.end(),
CompareCandidateFile);
[](const JobContext::CandidateFileInfo& lhs,
const JobContext::CandidateFileInfo& rhs) {
if (lhs.file_name > rhs.file_name) {
return true;
} else if (lhs.file_name < rhs.file_name) {
return false;
} else {
return (lhs.file_path > rhs.file_path);
}
});
candidate_files.erase(
std::unique(candidate_files.begin(), candidate_files.end()),
candidate_files.end());

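Note the locking choreography in `FindObsoleteFiles()` above: when both locks are needed, the db `mutex_` is the outer one, and the function drops `log_write_mutex_` before releasing `mutex_`, then re-takes only `log_write_mutex_` to trim `logs_`. A rough illustration of that two-phase scan, using raw lock/unlock calls to mirror the diff (toy names, not RocksDB code):

```cpp
#include <cstdint>
#include <deque>
#include <mutex>

class ToyDB {
 public:
  void FindObsoleteWals(uint64_t min_log_number) {
    db_mutex_.lock();
    log_write_mutex_.lock();
    // Phase 1: trimming the alive-WAL list requires both locks.
    while (!alive_wals_.empty() && alive_wals_.front() < min_log_number) {
      alive_wals_.pop_front();
    }
    log_write_mutex_.unlock();
    db_mutex_.unlock();  // release the wide lock before the second phase

    // Phase 2: trimming the writer list only needs the narrow lock.
    std::lock_guard<std::mutex> log_lock(log_write_mutex_);
    while (!wal_writers_.empty() && wal_writers_.front() < min_log_number) {
      wal_writers_.pop_front();
    }
  }

 private:
  std::mutex db_mutex_;          // when nested, taken before log_write_mutex_
  std::mutex log_write_mutex_;
  std::deque<uint64_t> alive_wals_{1, 2, 3};
  std::deque<uint64_t> wal_writers_{1, 2, 3};
};
```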
@ -1459,9 +1459,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
Status s;
mutex_.AssertHeld();
assert(immutable_db_options_.avoid_flush_during_recovery);
if (two_write_queues_) {
log_write_mutex_.Lock();
}
// Mark these as alive so they'll be considered for deletion later by
// FindObsoleteFiles()
total_log_size_ = 0;
@ -1486,9 +1483,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
total_log_size_ += log.size;
alive_log_files_.push_back(log);
}
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
return s;
}
@ -1871,16 +1865,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
if (s.ok()) {
if (impl->two_write_queues_) {
impl->log_write_mutex_.Lock();
}
impl->alive_log_files_.push_back(
DBImpl::LogFileNumberSize(impl->logfile_number_));
if (impl->two_write_queues_) {
impl->log_write_mutex_.Unlock();
}
}
if (s.ok()) {
// In WritePrepared there could be gap in sequence numbers. This breaks
// the trick we use in kPointInTimeRecovery which assumes the first seq in
// the log right after the corrupted log is one larger than the last seq

@ -349,14 +349,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
// when it finds suitable, and finish them in the same write batch.
// This is how a write job could be done by the other writer.
WriteContext write_context;
LogContext log_context(write_options.sync);
WriteThread::WriteGroup write_group;
bool in_parallel_group = false;
uint64_t last_sequence = kMaxSequenceNumber;
mutex_.Lock();
bool need_log_sync = write_options.sync;
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
assert(!two_write_queues_ || !disable_memtable);
{
// With concurrent writes we do preprocess only in the write thread that
@ -366,7 +363,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
// PreprocessWrite does its own perf timing.
PERF_TIMER_STOP(write_pre_and_post_process_time);
status = PreprocessWrite(write_options, &need_log_sync, &write_context);
status = PreprocessWrite(write_options, &log_context, &write_context);
if (!two_write_queues_) {
// Assign it after ::PreprocessWrite since the sequence might advance
// inside it by WriteRecoverableState
@ -376,13 +373,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
PERF_TIMER_START(write_pre_and_post_process_time);
}
log::Writer* log_writer = logs_.back().writer;
LogFileNumberSize& log_file_number_size = alive_log_files_.back();
assert(log_writer->get_log_number() == log_file_number_size.number);
mutex_.Unlock();
// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
@ -477,10 +467,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (!two_write_queues_) {
if (status.ok() && !write_options.disableWAL) {
assert(log_context.log_file_number_size);
LogFileNumberSize& log_file_number_size =
*(log_context.log_file_number_size);
PERF_TIMER_GUARD(write_wal_time);
io_s = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
need_log_dir_sync, last_sequence + 1,
log_file_number_size);
io_s =
WriteToWAL(write_group, log_context.writer, log_used,
log_context.need_log_sync, log_context.need_log_dir_sync,
last_sequence + 1, log_file_number_size);
}
} else {
if (status.ok() && !write_options.disableWAL) {
@ -582,14 +576,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
assert(pre_release_cb_status.ok());
}
if (need_log_sync) {
mutex_.Lock();
if (log_context.need_log_sync) {
VersionEdit synced_wals;
log_write_mutex_.Lock();
if (status.ok()) {
status = MarkLogsSynced(logfile_number_, need_log_dir_sync);
MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
&synced_wals);
} else {
MarkLogsNotSynced(logfile_number_);
}
mutex_.Unlock();
log_write_mutex_.Unlock();
if (status.ok() && synced_wals.IsWalAddition()) {
InstrumentedMutexLock l(&mutex_);
status = ApplyWALToManifest(&synced_wals);
}
// Requesting sync with two_write_queues_ is expected to be very rare. We
// hence provide a simple implementation that is not necessarily efficient.
if (two_write_queues_) {
@ -652,19 +653,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
if (w.callback && !w.callback->AllowWriteBatching()) {
write_thread_.WaitForMemTableWriters();
}
mutex_.Lock();
bool need_log_sync = !write_options.disableWAL && write_options.sync;
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
LogContext log_context(!write_options.disableWAL && write_options.sync);
// PreprocessWrite does its own perf timing.
PERF_TIMER_STOP(write_pre_and_post_process_time);
w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
w.status = PreprocessWrite(write_options, &log_context, &write_context);
PERF_TIMER_START(write_pre_and_post_process_time);
log::Writer* log_writer = logs_.back().writer;
LogFileNumberSize& log_file_number_size = alive_log_files_.back();
assert(log_writer->get_log_number() == log_file_number_size.number);
mutex_.Unlock();
// This can set non-OK status if callback fail.
last_batch_group_size_ =
@ -727,9 +720,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
wal_write_group.size - 1);
RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
}
assert(log_context.log_file_number_size);
LogFileNumberSize& log_file_number_size =
*(log_context.log_file_number_size);
io_s =
WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync,
need_log_dir_sync, current_sequence, log_file_number_size);
WriteToWAL(wal_write_group, log_context.writer, log_used,
log_context.need_log_sync, log_context.need_log_dir_sync,
current_sequence, log_file_number_size);
w.status = io_s;
}
@ -740,16 +737,20 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
WriteStatusCheck(w.status);
}
if (need_log_sync) {
mutex_.Lock();
VersionEdit synced_wals;
if (log_context.need_log_sync) {
InstrumentedMutexLock l(&log_write_mutex_);
if (w.status.ok()) {
w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync);
MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
&synced_wals);
} else {
MarkLogsNotSynced(logfile_number_);
}
mutex_.Unlock();
}
if (w.status.ok() && synced_wals.IsWalAddition()) {
InstrumentedMutexLock l(&mutex_);
w.status = ApplyWALToManifest(&synced_wals);
}
write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
}
@ -893,9 +894,8 @@ Status DBImpl::WriteImplWALOnly(
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
// without paying the cost of obtaining the mutex.
if (status.ok()) {
InstrumentedMutexLock l(&mutex_);
bool need_log_sync = false;
status = PreprocessWrite(write_options, &need_log_sync, &write_context);
LogContext log_context;
status = PreprocessWrite(write_options, &log_context, &write_context);
WriteStatusCheckOnLocked(status);
}
if (!status.ok()) {
@ -1057,9 +1057,8 @@ Status DBImpl::WriteImplWALOnly(
void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
// Is setting bg_error_ enough here? This will at least stop
// compaction and fail any further writes.
// Caller must hold mutex_.
InstrumentedMutexLock l(&mutex_);
assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
mutex_.AssertHeld();
if (immutable_db_options_.paranoid_checks && !status.ok() &&
!status.IsBusy() && !status.IsIncomplete()) {
// Maybe change the return status to void?
@ -1110,13 +1109,13 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) {
}
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
bool* need_log_sync,
LogContext* log_context,
WriteContext* write_context) {
mutex_.AssertHeld();
assert(write_context != nullptr && need_log_sync != nullptr);
assert(write_context != nullptr && log_context != nullptr);
Status status;
if (error_handler_.IsDBStopped()) {
InstrumentedMutexLock l(&mutex_);
status = error_handler_.GetBGError();
}
@ -1124,11 +1123,11 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
assert(versions_);
InstrumentedMutexLock l(&mutex_);
const ColumnFamilySet* const column_families =
versions_->GetColumnFamilySet();
assert(column_families);
size_t num_cfs = column_families->NumberOfColumnFamilies();
assert(num_cfs >= 1);
if (num_cfs > 1) {
WaitForPendingWrites();
@ -1142,15 +1141,18 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
// thread is writing to another DB with the same write buffer, they may also
// be flushed. We may end up with flushing much more DBs than needed. It's
// suboptimal but still correct.
InstrumentedMutexLock l(&mutex_);
WaitForPendingWrites();
status = HandleWriteBufferManagerFlush(write_context);
}
if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
InstrumentedMutexLock l(&mutex_);
status = TrimMemtableHistory(write_context);
}
if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
InstrumentedMutexLock l(&mutex_);
WaitForPendingWrites();
status = ScheduleFlushes(write_context);
}
@ -1166,6 +1168,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
// for previous one. It might create a fairness issue that expiration
// might happen for smaller writes but larger writes can go through.
// Can optimize it if it is an issue.
InstrumentedMutexLock l(&mutex_);
status = DelayWrite(last_batch_group_size_, write_options);
PERF_TIMER_START(write_pre_and_post_process_time);
}
@ -1180,11 +1183,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
if (write_options.no_slowdown) {
status = Status::Incomplete("Write stall");
} else {
InstrumentedMutexLock l(&mutex_);
WriteBufferManagerStallWrites();
}
}
if (status.ok() && *need_log_sync) {
InstrumentedMutexLock l(&log_write_mutex_);
if (status.ok() && log_context->need_log_sync) {
// Wait until the parallel syncs are finished. Any sync process has to sync
// the front log too so it is enough to check the status of front()
// We do a while loop since log_sync_cv_ is signalled when any sync is
@ -1204,8 +1208,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
log.PrepareForSync();
}
} else {
*need_log_sync = false;
log_context->need_log_sync = false;
}
log_context->writer = logs_.back().writer;
log_context->need_log_dir_sync =
log_context->need_log_dir_sync && !log_dir_synced_;
log_context->log_file_number_size = std::addressof(alive_log_files_.back());
return status;
}
@ -1714,10 +1722,12 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
}
uint64_t DBImpl::GetMaxTotalWalSize() const {
mutex_.AssertHeld();
return mutable_db_options_.max_total_wal_size == 0
? 4 * max_total_in_memory_state_
: mutable_db_options_.max_total_wal_size;
uint64_t max_total_wal_size =
max_total_wal_size_.load(std::memory_order_acquire);
if (max_total_wal_size > 0) {
return max_total_wal_size;
}
return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
}
// REQUIRES: mutex_ is held
@ -2065,7 +2075,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
log_recycle_files_.pop_front();
}
if (s.ok() && creating_new_log) {
log_write_mutex_.Lock();
InstrumentedMutexLock l(&log_write_mutex_);
assert(new_log != nullptr);
if (!logs_.empty()) {
// Alway flush the buffer of the last log before switching to a new one
@ -2089,7 +2099,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
logs_.emplace_back(logfile_number_, new_log);
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
}
log_write_mutex_.Unlock();
}
if (!s.ok()) {

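`GetMaxTotalWalSize()` above no longer asserts that `mutex_` is held: the mutable option is mirrored into the new `max_total_wal_size_` atomic, which `SetDBOptions()` stores with release semantics and the write path reads with acquire semantics. A hedged sketch of that mirror-into-an-atomic pattern (simplified toy class, not the real option plumbing):

```cpp
#include <atomic>
#include <cstdint>
#include <mutex>

class ToyDB {
 public:
  // Option updates still happen under the db mutex; the new value is
  // mirrored into an atomic so readers do not need the mutex.
  void SetMaxTotalWalSize(uint64_t v) {
    std::lock_guard<std::mutex> lock(db_mutex_);
    mutable_max_total_wal_size_ = v;
    max_total_wal_size_.store(v, std::memory_order_release);
  }

  // Called from the write path without the db mutex.
  uint64_t GetMaxTotalWalSize() const {
    uint64_t v = max_total_wal_size_.load(std::memory_order_acquire);
    if (v > 0) return v;
    // 0 means "derive from write buffer sizes", which is also kept atomic.
    return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
  }

 private:
  std::mutex db_mutex_;
  uint64_t mutable_max_total_wal_size_ = 0;      // authoritative copy
  std::atomic<uint64_t> max_total_wal_size_{0};  // lock-free mirror
  std::atomic<uint64_t> max_total_in_memory_state_{64u << 20};
};
```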
@ -358,6 +358,9 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
RecoverFromNoSpace();
}
}
if (bg_error_.severity() >= Status::Severity::kHardError) {
is_db_stopped_.store(true, std::memory_order_release);
}
return bg_error_;
}
@ -736,6 +739,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
// the bg_error and notify user.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
Status old_bg_error = bg_error_;
is_db_stopped_.store(false, std::memory_order_release);
bg_error_ = Status::OK();
bg_error_.PermitUncheckedError();
EventHelpers::NotifyOnErrorRecoveryEnd(
@ -792,6 +796,9 @@ void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
if (bg_error_.severity() >= Status::Severity::kHardError) {
is_db_stopped_.store(true, std::memory_order_release);
}
return;
}

@ -38,6 +38,7 @@ class ErrorHandler {
auto_recovery_(false),
recovery_in_prog_(false),
soft_error_no_bg_work_(false),
is_db_stopped_(false),
bg_error_stats_(db_options.statistics) {
// Clear the checked flag for uninitialized errors
bg_error_.PermitUncheckedError();
@ -59,12 +60,11 @@ class ErrorHandler {
Status ClearBGError();
bool IsDBStopped() {
return !bg_error_.ok() &&
bg_error_.severity() >= Status::Severity::kHardError;
}
bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
bool IsBGWorkStopped() {
assert(db_mutex_);
db_mutex_->AssertHeld();
return !bg_error_.ok() &&
(bg_error_.severity() >= Status::Severity::kHardError ||
!auto_recovery_ || soft_error_no_bg_work_);
@ -105,6 +105,7 @@ class ErrorHandler {
// Used to store the context for recover, such as flush reason.
DBRecoverContext recover_context_;
std::atomic<bool> is_db_stopped_;
// The pointer of DB statistics.
std::shared_ptr<Statistics> bg_error_stats_;

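The `ErrorHandler` change caches the stopped state in an atomic so that `IsDBStopped()` can be called from the write path without `mutex_`; the flag is set wherever `bg_error_` reaches hard-error severity and cleared on recovery. A condensed sketch of that idea (stand-in severity enum and toy class, not the real API):

```cpp
#include <atomic>
#include <mutex>

enum class Severity { kOk, kSoftError, kHardError, kFatalError };

class ToyErrorHandler {
 public:
  // Called with the db mutex held when a background error is recorded.
  void SetBGError(Severity sev) {
    std::lock_guard<std::mutex> lock(db_mutex_);
    if (sev > bg_error_severity_) bg_error_severity_ = sev;
    if (bg_error_severity_ >= Severity::kHardError) {
      is_db_stopped_.store(true, std::memory_order_release);
    }
  }

  // Called with the db mutex held after a successful recovery.
  void ClearBGError() {
    std::lock_guard<std::mutex> lock(db_mutex_);
    bg_error_severity_ = Severity::kOk;
    is_db_stopped_.store(false, std::memory_order_release);
  }

  // Safe to call from the write path without the db mutex.
  bool IsDBStopped() const {
    return is_db_stopped_.load(std::memory_order_acquire);
  }

 private:
  std::mutex db_mutex_;
  Severity bg_error_severity_ = Severity::kOk;
  std::atomic<bool> is_db_stopped_{false};
};
```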
@ -390,7 +390,7 @@ void ProfileQueries(bool enabled_time = false) {
EXPECT_GT(hist_write_scheduling_time.Average(), 0);
#ifndef NDEBUG
ASSERT_GT(total_db_mutex_nanos, 2000U);
ASSERT_LT(total_db_mutex_nanos, 100U);
#endif
}

@ -1978,6 +1978,7 @@ TEST_F(VersionSetTest, WalCreateAfterClose) {
TEST_F(VersionSetTest, AddWalWithSmallerSize) {
NewDB();
assert(versions_);
constexpr WalNumber kLogNumber = 10;
constexpr uint64_t kSizeInBytes = 111;
@ -1990,6 +1991,9 @@ TEST_F(VersionSetTest, AddWalWithSmallerSize) {
ASSERT_OK(LogAndApplyToDefaultCF(edit));
}
// Copy for future comparison.
const std::map<WalNumber, WalMetadata> wals1 =
versions_->GetWalSet().GetWals();
{
// Add the same WAL with smaller synced size.
@ -1998,13 +2002,11 @@ TEST_F(VersionSetTest, AddWalWithSmallerSize) {
edit.AddWal(kLogNumber, wal);
Status s = LogAndApplyToDefaultCF(edit);
ASSERT_TRUE(s.IsCorruption());
ASSERT_TRUE(
s.ToString().find(
"WAL 10 must not have smaller synced size than previous one") !=
std::string::npos)
<< s.ToString();
ASSERT_OK(s);
}
const std::map<WalNumber, WalMetadata> wals2 =
versions_->GetWalSet().GetWals();
ASSERT_EQ(wals1, wals2);
}
TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {

@ -112,26 +112,33 @@ Status WalSet::AddWal(const WalAddition& wal) {
auto it = wals_.lower_bound(wal.GetLogNumber());
bool existing = it != wals_.end() && it->first == wal.GetLogNumber();
if (existing && !wal.GetMetadata().HasSyncedSize()) {
if (!existing) {
wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()});
return Status::OK();
}
assert(existing);
if (!wal.GetMetadata().HasSyncedSize()) {
std::stringstream ss;
ss << "WAL " << wal.GetLogNumber() << " is created more than once";
return Status::Corruption("WalSet::AddWal", ss.str());
}
// If the WAL has synced size, it must >= the previous size.
if (wal.GetMetadata().HasSyncedSize() && existing &&
it->second.HasSyncedSize() &&
wal.GetMetadata().GetSyncedSizeInBytes() <
assert(wal.GetMetadata().HasSyncedSize());
if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <=
it->second.GetSyncedSizeInBytes()) {
std::stringstream ss;
ss << "WAL " << wal.GetLogNumber()
<< " must not have smaller synced size than previous one";
return Status::Corruption("WalSet::AddWal", ss.str());
// This is possible because version edits with different synced WAL sizes
// for the same WAL can be committed out-of-order. For example, thread
// 1 synces the first 10 bytes of 1.log, while thread 2 synces the first 20
// bytes of 1.log. It's possible that thread 1 calls LogAndApply() after
// thread 2.
// In this case, just return ok.
return Status::OK();
}
if (existing) {
// Update synced size for the given WAL.
it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes());
} else {
wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()});
}
return Status::OK();
}

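Because synced WAL sizes are now recorded outside the global mutex, two threads can commit their version edits out of order, so `WalSet::AddWal()` above tolerates a smaller synced size instead of reporting corruption. A condensed sketch of that rule (toy `ToyWalSet`, not the full WalSet interface):

```cpp
#include <cstdint>
#include <map>

class ToyWalSet {
 public:
  // Returns false only for a true error: creating the same WAL twice
  // without a synced size. A smaller or equal synced size is accepted and
  // ignored, since edits may be applied out of order (e.g. a 10-byte sync
  // committed after a 20-byte sync of the same WAL).
  bool AddWal(uint64_t number, bool has_synced_size, uint64_t synced_size) {
    auto it = wals_.find(number);
    if (it == wals_.end()) {
      wals_[number] = has_synced_size ? synced_size : kUnknown;
      return true;
    }
    if (!has_synced_size) return false;  // duplicate creation of the same WAL
    if (it->second != kUnknown && synced_size <= it->second) {
      return true;                       // out-of-order edit: keep the larger size
    }
    it->second = synced_size;
    return true;
  }

 private:
  static constexpr uint64_t kUnknown = UINT64_MAX;
  std::map<uint64_t, uint64_t> wals_;
};
```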
@ -42,6 +42,8 @@ class WalMetadata {
uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; }
private:
friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs);
friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs);
// The size of WAL is unknown, used when the WAL is not synced yet or is
// empty.
constexpr static uint64_t kUnknownWalSize =
@ -51,6 +53,14 @@ class WalMetadata {
uint64_t synced_size_bytes_ = kUnknownWalSize;
};
inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) {
return lhs.synced_size_bytes_ == rhs.synced_size_bytes_;
}
inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) {
return !(lhs == rhs);
}
// These tags are persisted to MANIFEST, so it's part of the user API.
enum class WalAdditionTag : uint32_t {
// Indicates that there are no more tags.

@ -54,12 +54,11 @@ TEST(WalSet, SmallerSyncedSize) {
constexpr uint64_t kBytes = 100;
WalSet wals;
ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))));
const auto wals1 = wals.GetWals();
Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0)));
ASSERT_TRUE(s.IsCorruption());
ASSERT_TRUE(
s.ToString().find(
"WAL 100 must not have smaller synced size than previous one") !=
std::string::npos);
const auto wals2 = wals.GetWals();
ASSERT_OK(s);
ASSERT_EQ(wals1, wals2);
}
TEST(WalSet, CreateTwice) {

@ -1115,7 +1115,8 @@ TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) {
destroy_data = false;
// kAutoFlushOnly to preserve legacy test behavior (consider updating)
FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0));
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0))
<< "iter: " << iter << ", idx: " << i;
CloseDBAndBackupEngine();
DestroyDB(dbname_, options_);
