Fix race condition via concurrent FlushWAL

Summary:
Currently log_writer->AddRecord in WriteImpl is protected from concurrent calls via FlushWAL only if the two_write_queues_ option is set. The patch fixes the problem by i) skipping log_writer->AddRecord in FlushWAL if manual_wal_flush is not set, and ii) protecting log_writer->AddRecord in WriteImpl via log_write_mutex_ if manual_wal_flush_ is set but two_write_queues_ is not.

Fixes #3599
Closes https://github.com/facebook/rocksdb/pull/3656

Differential Revision: D7405608

Pulled By: maysamyabandeh

fbshipit-source-id: d6cc265051c77ae49c7c6df4f427350baaf46934
main
Maysam Yabandeh 7 years ago committed by Facebook Github Bot
parent 23f9d93f47
commit 35a4469bbf
  1. 1
      HISTORY.md
  2. 5
      db/db_impl.cc
  3. 1
      db/db_impl.h
  4. 14
      db/db_impl_write.cc
  5. 50
      db/db_test.cc
  6. 4
      utilities/checkpoint/checkpoint_impl.cc

@ -6,6 +6,7 @@
### Bug Fixes ### Bug Fixes
* Fsync after writing global seq number to the ingestion file in ExternalSstFileIngestionJob. * Fsync after writing global seq number to the ingestion file in ExternalSstFileIngestionJob.
* Fix WAL corruption caused by race condition between user write thread and FlushWAL when two_write_queues is not set.
### Java API Changes ### Java API Changes
* Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. * Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances.

@ -693,7 +693,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(
} }
Status DBImpl::FlushWAL(bool sync) { Status DBImpl::FlushWAL(bool sync) {
{ if (manual_wal_flush_) {
// We need to lock log_write_mutex_ since logs_ might change concurrently // We need to lock log_write_mutex_ since logs_ might change concurrently
InstrumentedMutexLock wl(&log_write_mutex_); InstrumentedMutexLock wl(&log_write_mutex_);
log::Writer* cur_log_writer = logs_.back().writer; log::Writer* cur_log_writer = logs_.back().writer;
@ -707,6 +707,9 @@ Status DBImpl::FlushWAL(bool sync) {
return s; return s;
} }
} }
if (!sync) {
return Status::OK();
}
// sync = true // sync = true
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true"); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
return SyncWAL(); return SyncWAL();

@ -737,6 +737,7 @@ class DBImpl : public DB {
#endif #endif
friend struct SuperVersion; friend struct SuperVersion;
friend class CompactedDBImpl; friend class CompactedDBImpl;
friend class DBTest_ConcurrentFlushWAL_Test;
#ifndef NDEBUG #ifndef NDEBUG
friend class DBTest2_ReadCallbackTest_Test; friend class DBTest2_ReadCallbackTest_Test;
friend class WriteCallbackTest_WriteWithCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test;

@ -808,7 +808,21 @@ Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
assert(log_size != nullptr); assert(log_size != nullptr);
Slice log_entry = WriteBatchInternal::Contents(&merged_batch); Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
*log_size = log_entry.size(); *log_size = log_entry.size();
// When two_write_queues_ is set, WriteToWAL has to be protected from concurrent calls
// from the two queues anyway and log_write_mutex_ is already held. Otherwise
// if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
// from possible concurrent calls via the FlushWAL by the application.
const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
// Due to performance concerns of missed branch prediction, penalize the new
// manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
// when we do not need any locking.
if (UNLIKELY(needs_locking)) {
log_write_mutex_.Lock();
}
Status status = log_writer->AddRecord(log_entry); Status status = log_writer->AddRecord(log_entry);
if (UNLIKELY(needs_locking)) {
log_write_mutex_.Unlock();
}
if (log_used != nullptr) { if (log_used != nullptr) {
*log_used = logfile_number_; *log_used = logfile_number_;
} }

@ -3348,6 +3348,56 @@ TEST_F(DBTest, WriteSingleThreadEntry) {
} }
} }
TEST_F(DBTest, ConcurrentFlushWAL) {
  // Regression test: FlushWAL racing with user writes (and, when
  // two_write_queues is on, with writes submitted through the second write
  // queue) must not corrupt the WAL. Each configuration is verified by
  // reopening the DB from the WAL and checking every written key.
  const size_t kNumWrites = 100;
  Options options;
  WriteOptions write_opts;
  ReadOptions read_opts;
  for (bool two_write_queues : {false, true}) {
    for (bool manual_wal_flush : {false, true}) {
      options.two_write_queues = two_write_queues;
      options.manual_wal_flush = manual_wal_flush;
      options.create_if_missing = true;
      DestroyAndReopen(options);
      std::vector<port::Thread> workers;
      // Writer thread: issues Puts through the normal write path.
      workers.emplace_back([&] {
        for (size_t i = 0; i < kNumWrites; i++) {
          auto istr = ToString(i);
          db_->Put(write_opts, db_->DefaultColumnFamily(), "a" + istr,
                   "b" + istr);
        }
      });
      // When the second write queue is enabled, exercise it directly too.
      if (two_write_queues) {
        workers.emplace_back([&] {
          for (size_t i = kNumWrites; i < 2 * kNumWrites; i++) {
            auto istr = ToString(i);
            WriteBatch batch;
            batch.Put("a" + istr, "b" + istr);
            dbfull()->WriteImpl(write_opts, &batch, nullptr, nullptr, 0, true);
          }
        });
      }
      // Flusher thread: hammer FlushWAL concurrently with the writers.
      workers.emplace_back([&] {
        // FlushWAL is faster than Put, so run it many more times to keep
        // the race window open for the whole writer lifetime.
        for (size_t i = 0; i < kNumWrites * 100; i++) {
          db_->FlushWAL(false);
        }
      });
      for (auto& worker : workers) {
        worker.join();
      }
      options.create_if_missing = false;
      // Recover from the wal and make sure that it is not corrupted
      Reopen(options);
      for (size_t i = 0; i < kNumWrites; i++) {
        PinnableSlice pval;
        auto istr = ToString(i);
        ASSERT_OK(
            db_->Get(read_opts, db_->DefaultColumnFamily(), "a" + istr, &pval));
        ASSERT_TRUE(pval == ("b" + istr));
      }
    }
  }
}
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
TEST_F(DBTest, DynamicMemtableOptions) { TEST_F(DBTest, DynamicMemtableOptions) {
const uint64_t k64KB = 1 << 16; const uint64_t k64KB = 1 << 16;

@ -222,9 +222,7 @@ Status CheckpointImpl::CreateCustomCheckpoint(
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
if (db_options.manual_wal_flush) { db_->FlushWAL(false /* sync */);
db_->FlushWAL(false /* sync */);
}
} }
// if we have more than one column family, we need to also get WAL files // if we have more than one column family, we need to also get WAL files
if (s.ok()) { if (s.ok()) {

Loading…
Cancel
Save