Fix race condition with WAL tracking and `FlushWAL(true /* sync */)` (#10185)

Summary:
`FlushWAL(true /* sync */)` is used internally and for manual WAL sync. It had a bug when used together with `track_and_verify_wals_in_manifest`: the synced size recorded in MANIFEST could be larger than the number of bytes actually synced, because the size was read from the WAL file after concurrent writes had appended unsynced data to it.

The bug could be repro'd almost immediately with the following crash test command: `python3 tools/db_crashtest.py blackbox --simple --write_buffer_size=524288 --max_bytes_for_level_base=2097152 --target_file_size_base=524288 --duration=3600 --interval=10 --sync_fault_injection=1 --disable_wal=0 --checkpoint_one_in=1000 --max_key=10000 --value_size_mult=33`.

An example error message produced by the above command is shown below. The error sometimes arose from the checkpoint and other times arose from the main stress test DB.

```
Corruption: Size mismatch: WAL (log number: 119) in MANIFEST is 27938 bytes , but actually is 27859 bytes on disk.
```
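
For intuition, the interleaving behind such a mismatch can be sketched as the following timeline. This is illustrative pseudocode using the numbers from the message above, not actual RocksDB source:

```
// Syncer thread                           Writer thread
// -------------                           -------------
// SyncWAL():
//   sync the WAL file; 27859 bytes
//   are now durable on disk
//                                         Put() appends 79 bytes; the WAL
//                                         file grows to 27938 bytes, but
//                                         the tail is not yet synced
//   MarkLogsSynced():
//     // reads the *current* size, 27938, not the synced prefix, 27859
//     size = wal.writer->file()->GetFileSize();
//     synced_wals.AddWal(wal.number, WalMetadata(size));
//
// A crash now loses the 79 unsynced bytes. On the next open, recovery sees
// a 27859-byte WAL while the MANIFEST claims 27938 bytes were synced.
```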

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10185

Test Plan:
- repro unit test
- the above crash test command no longer finds the error. It does find a different error after a while longer, such as "Corruption: WAL file 481 required by manifest but not in directory list"

Reviewed By: riversand963

Differential Revision: D37200993

Pulled By: ajkr

fbshipit-source-id: 98e0071c1a89f4d009888512ed89f9219779ae5f
Branch: main
Author: Andrew Kryczka (committed by Facebook GitHub Bot)
Commit: d5d8920f2c (parent: a5d773e077)
Changed files (lines changed):

1. HISTORY.md (1)
2. db/db_impl/db_impl.cc (20)
3. db/db_impl/db_impl.h (26)
4. db/db_impl/db_impl_compaction_flush.cc (5)
5. db/db_impl/db_impl_files.cc (2)
6. db/db_impl/db_impl_write.cc (5)
7. db/db_write_test.cc (35)
8. file/writable_file_writer.cc (8)
9. file/writable_file_writer.h (10)
10. utilities/checkpoint/checkpoint_test.cc (45)

HISTORY.md

@@ -11,6 +11,7 @@
 * Fix a bug that could return wrong results with `index_type=kHashSearch` and using `SetOptions` to change the `prefix_extractor`.
 * Fixed a bug in WAL tracking with wal_compression. WAL compression writes a kSetCompressionType record which is not associated with any sequence number. As result, WalManager::GetSortedWalsOfType() will skip these WALs and not return them to caller, e.g. Checkpoint, Backup, causing the operations to fail.
 * Avoid a crash if the IDENTITY file is accidentally truncated to empty. A new DB ID will be written and generated on Open.
+* Fixed a possible corruption for users of `manual_wal_flush` and/or `FlushWAL(true /* sync */)`, together with `track_and_verify_wals_in_manifest == true`. For those users, losing unsynced data (e.g., due to power loss) could make future DB opens fail with a `Status::Corruption` complaining about missing WAL data.
 ### Public API changes
 * Add new API GetUnixTime in Snapshot class which returns the unix time at which Snapshot is taken.

db/db_impl/db_impl.cc

@@ -1367,6 +1367,7 @@ Status DBImpl::FlushWAL(bool sync) {
 }

 Status DBImpl::SyncWAL() {
+  TEST_SYNC_POINT("DBImpl::SyncWAL:Begin");
   autovector<log::Writer*, 1> logs_to_sync;
   bool need_log_dir_sync;
   uint64_t current_log_number;
@@ -1379,7 +1380,7 @@ Status DBImpl::SyncWAL() {
     current_log_number = logfile_number_;

     while (logs_.front().number <= current_log_number &&
-           logs_.front().getting_synced) {
+           logs_.front().IsSyncing()) {
       log_sync_cv_.Wait();
     }
     // First check that logs are safe to sync in background.
@@ -1396,8 +1397,7 @@ Status DBImpl::SyncWAL() {
     for (auto it = logs_.begin();
          it != logs_.end() && it->number <= current_log_number; ++it) {
       auto& log = *it;
-      assert(!log.getting_synced);
-      log.getting_synced = true;
+      log.PrepareForSync();
       logs_to_sync.push_back(log.writer);
     }
@@ -1470,11 +1470,10 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
   VersionEdit synced_wals;
   for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
     auto& wal = *it;
-    assert(wal.getting_synced);
+    assert(wal.IsSyncing());
     if (immutable_db_options_.track_and_verify_wals_in_manifest &&
-        wal.writer->file()->GetFileSize() > 0) {
-      synced_wals.AddWal(wal.number,
-                         WalMetadata(wal.writer->file()->GetFileSize()));
+        wal.GetPreSyncSize() > 0) {
+      synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
     }

     if (logs_.size() > 1) {
@@ -1483,12 +1482,12 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
       InstrumentedMutexLock l(&log_write_mutex_);
       it = logs_.erase(it);
     } else {
-      wal.getting_synced = false;
+      wal.FinishSync();
       ++it;
     }
   }
   assert(logs_.empty() || logs_[0].number > up_to ||
-         (logs_.size() == 1 && !logs_[0].getting_synced));
+         (logs_.size() == 1 && !logs_[0].IsSyncing()));

   Status s;
   if (synced_wals.IsWalAddition()) {
@@ -1508,8 +1507,7 @@ void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
   for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
        ++it) {
     auto& wal = *it;
-    assert(wal.getting_synced);
-    wal.getting_synced = false;
+    wal.FinishSync();
   }
   log_sync_cv_.SignalAll();
 }

db/db_impl/db_impl.h

@@ -1593,12 +1593,38 @@ class DBImpl : public DB {
       return s;
     }

+    bool IsSyncing() { return getting_synced; }
+
+    uint64_t GetPreSyncSize() {
+      assert(getting_synced);
+      return pre_sync_size;
+    }
+
+    void PrepareForSync() {
+      assert(!getting_synced);
+      // Size is expected to be monotonically increasing.
+      assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+      getting_synced = true;
+      pre_sync_size = writer->file()->GetFlushedSize();
+    }
+
+    void FinishSync() {
+      assert(getting_synced);
+      getting_synced = false;
+    }
+
     uint64_t number;
     // Visual Studio doesn't support deque's member to be noncopyable because
     // of a std::unique_ptr as a member.
     log::Writer* writer;  // own

+   private:
     // true for some prefix of logs_
     bool getting_synced = false;
+    // The size of the file before the sync happens. This amount is guaranteed
+    // to be persisted even if appends happen during sync so it can be used for
+    // tracking the synced size in MANIFEST.
+    uint64_t pre_sync_size = 0;
   };

   // PurgeFileInfo is a structure to hold information of files to be deleted in
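
As a self-contained illustration of the sync state machine these helpers implement, here is a toy re-implementation (the real struct, shown above, also holds the WAL number and an owned `log::Writer*`, and reads the flushed size from the writer instead of taking it as a parameter):

```
#include <cassert>
#include <cstdint>

// Toy version of the per-WAL sync state machine from db_impl.h.
struct LogSyncState {
  bool IsSyncing() const { return getting_synced; }

  void PrepareForSync(uint64_t flushed_size_now) {
    assert(!getting_synced);
    // Size is expected to be monotonically increasing.
    assert(flushed_size_now >= pre_sync_size);
    getting_synced = true;
    pre_sync_size = flushed_size_now;  // latch the size durable after sync
  }

  uint64_t GetPreSyncSize() const {
    assert(getting_synced);
    return pre_sync_size;
  }

  void FinishSync() {
    assert(getting_synced);
    getting_synced = false;
  }

 private:
  bool getting_synced = false;
  uint64_t pre_sync_size = 0;
};

int main() {
  LogSyncState log;
  log.PrepareForSync(/*flushed_size_now=*/27859);
  // ... the file is synced; a concurrent append may grow it to 27938 ...
  uint64_t recorded = log.GetPreSyncSize();  // 27859: only the durable prefix
  log.FinishSync();
  return recorded == 27859 ? 0 : 1;
}
```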

db/db_impl/db_impl_compaction_flush.cc

@@ -88,14 +88,13 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
   autovector<log::Writer*, 1> logs_to_sync;
   uint64_t current_log_number = logfile_number_;
   while (logs_.front().number < current_log_number &&
-         logs_.front().getting_synced) {
+         logs_.front().IsSyncing()) {
     log_sync_cv_.Wait();
   }
   for (auto it = logs_.begin();
        it != logs_.end() && it->number < current_log_number; ++it) {
     auto& log = *it;
-    assert(!log.getting_synced);
-    log.getting_synced = true;
+    log.PrepareForSync();
     logs_to_sync.push_back(log.writer);
   }

db/db_impl/db_impl_files.cc

@@ -288,7 +288,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
   }
   while (!logs_.empty() && logs_.front().number < min_log_number) {
     auto& log = logs_.front();
-    if (log.getting_synced) {
+    if (log.IsSyncing()) {
       log_sync_cv_.Wait();
       // logs_ could have changed while we were waiting.
       continue;

db/db_impl/db_impl_write.cc

@@ -1181,17 +1181,16 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
     // Note: there does not seem to be a reason to wait for parallel sync at
     // this early step but it is not important since parallel sync (SyncWAL) and
     // need_log_sync are usually not used together.
-    while (logs_.front().getting_synced) {
+    while (logs_.front().IsSyncing()) {
      log_sync_cv_.Wait();
    }
    for (auto& log : logs_) {
-      assert(!log.getting_synced);
      // This is just to prevent the logs to be synced by a parallel SyncWAL
      // call. We will do the actual syncing later after we will write to the
      // WAL.
      // Note: there does not seem to be a reason to set this early before we
      // actually write to the WAL
-      log.getting_synced = true;
+      log.PrepareForSync();
    }
  } else {
    *need_log_sync = false;

db/db_write_test.cc

@@ -334,6 +334,41 @@ TEST_P(DBWriteTest, ManualWalFlushInEffect) {
   ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
 }

+TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) {
+  // Repro race condition bug where unflushed WAL data extended the synced size
+  // recorded to MANIFEST despite being unrecoverable.
+  Options options = GetOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  options.manual_wal_flush = true;
+  options.track_and_verify_wals_in_manifest = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:Begin",
+      [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  // Ensure callback ran.
+  ASSERT_EQ("val2", Get("key2"));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+  // DB WAL.
+  fault_env->DropUnsyncedFileData();
+
+  Reopen(options);
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
 TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
   std::unique_ptr<FaultInjectionTestEnv> mock_env(
       new FaultInjectionTestEnv(env_));

file/writable_file_writer.cc

@@ -585,6 +585,8 @@ IOStatus WritableFileWriter::WriteBuffered(
     left -= allowed;
     src += allowed;
+    uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+    flushed_size_.store(cur_size + allowed, std::memory_order_release);
   }
   buf_.Size(0);
   buffered_data_crc32c_checksum_ = 0;
@@ -675,6 +677,8 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(
   // the corresponding checksum value
   buf_.Size(0);
   buffered_data_crc32c_checksum_ = 0;
+  uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+  flushed_size_.store(cur_size + left, std::memory_order_release);
   return s;
 }
@@ -782,6 +786,8 @@ IOStatus WritableFileWriter::WriteDirect(
       left -= size;
       src += size;
       write_offset += size;
+      uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+      flushed_size_.store(cur_size + size, std::memory_order_release);
       assert((next_write_offset_ % alignment) == 0);
     }
@@ -884,6 +890,8 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum(
     IOSTATS_ADD(bytes_written, left);
     assert((next_write_offset_ % alignment) == 0);
+    uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+    flushed_size_.store(cur_size + left, std::memory_order_release);

   if (s.ok()) {
     // Move the tail to the beginning of the buffer
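
One note on the update pattern above: the separate load/store pair is not an atomic read-modify-write, so it is correct only because `WritableFileWriter` assumes a single writer thread (the counter is atomic so that readers such as `GetFlushedSize()` can run concurrently). Under that same single-writer assumption, an equivalent form would be:

```
// Equivalent under the single-writer assumption; concurrent readers still
// observe a consistent value.
flushed_size_.fetch_add(allowed, std::memory_order_release);
```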

file/writable_file_writer.h

@@ -143,6 +143,7 @@ class WritableFileWriter {
   // Actually written data size can be used for truncate
   // not counting padding data
   std::atomic<uint64_t> filesize_;
+  std::atomic<uint64_t> flushed_size_;
 #ifndef ROCKSDB_LITE
   // This is necessary when we use unbuffered access
   // and writes must happen on aligned offsets
@@ -180,6 +181,7 @@ class WritableFileWriter {
         buf_(),
         max_buffer_size_(options.writable_file_max_buffer_size),
         filesize_(0),
+        flushed_size_(0),
 #ifndef ROCKSDB_LITE
         next_write_offset_(0),
 #endif  // ROCKSDB_LITE
@@ -259,6 +261,14 @@ class WritableFileWriter {
     return filesize_.load(std::memory_order_acquire);
   }

+  // Returns the size of data flushed to the underlying `FSWritableFile`.
+  // Expected to match `writable_file()->GetFileSize()`.
+  // The return value can serve as a lower-bound for the amount of data synced
+  // by a future call to `SyncWithoutFlush()`.
+  uint64_t GetFlushedSize() const {
+    return flushed_size_.load(std::memory_order_acquire);
+  }
+
   IOStatus InvalidateCache(size_t offset, size_t length) {
     return writable_file_->InvalidateCache(offset, length);
   }
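
To make the distinction in the new comment concrete, here is a hypothetical call sequence against a buffered writer (values illustrative; the `Append` call is simplified relative to the real signature):

```
// `writer` is a WritableFileWriter with buffering enabled.
writer->Append(Slice(std::string(100, 'x')));  // lands in buf_ only
// writer->GetFileSize()    == 100  (counts buffered-but-unwritten bytes)
// writer->GetFlushedSize() == 0    (nothing handed to the FSWritableFile yet)
writer->Flush();  // drains buf_ via WriteBuffered() or one of its variants
// writer->GetFileSize()    == 100
// writer->GetFlushedSize() == 100  (a valid lower bound for what a subsequent
//                                   SyncWithoutFlush() makes durable)
```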

utilities/checkpoint/checkpoint_test.cc

@@ -915,6 +915,51 @@ TEST_F(CheckpointTest, CheckpointWithDbPath) {
   delete checkpoint;
 }

+TEST_F(CheckpointTest, PutRaceWithCheckpointTrackedWalSync) {
+  // Repro for a race condition where a user write comes in after the checkpoint
+  // syncs WAL for `track_and_verify_wals_in_manifest` but before the
+  // corresponding MANIFEST update. With the bug, that scenario resulted in an
+  // unopenable DB with error "Corruption: Size mismatch: WAL ...".
+  Options options = CurrentOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  options.track_and_verify_wals_in_manifest = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+      [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::unique_ptr<Checkpoint> checkpoint;
+  {
+    Checkpoint* checkpoint_ptr;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint_ptr));
+    checkpoint.reset(checkpoint_ptr);
+  }
+
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+
+  // Ensure callback ran.
+  ASSERT_EQ("val2", Get("key2"));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+  // DB WAL.
+  fault_env->DropUnsyncedFileData();
+
+  // Before the bug fix, reopening the DB would fail because the MANIFEST's
+  // AddWal entry indicated the WAL should be synced through "key2" -> "val2".
+  Reopen(options);
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
