From 538d2365e97555367cd22e286e09357a3b80bb24 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 22 Dec 2021 21:58:11 -0800 Subject: [PATCH] Fix race condition in BackupEngineTest.ChangeManifestDuringBackupCreation (#9327) Summary: The failure looked like this: ``` utilities/backupable/backupable_db_test.cc:3161: Failure Value of: db_chroot_env_->FileExists(prev_manifest_path).IsNotFound() Actual: false Expected: true ``` The failure could be coerced consistently with the following patch: ``` diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 80410f671..637636791 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -2772,6 +2772,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { if (job_context.HaveSomethingToClean() || job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); + bg_cv_.SignalAll(); + sleep(1); TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); // Have to flush the info logs before bg_flush_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is ``` The cause was a familiar problem, which is manual flush/compaction may return before files they obsoleted are removed. The solution is just to wait for "scheduled" work to complete, which includes all phases including cleanup. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9327 Test Plan: after this PR, even the above patch to coerce the bug cannot cause the test to fail. Reviewed By: riversand963 Differential Revision: D33252208 Pulled By: ajkr fbshipit-source-id: 720a7eaca58c7247d221911fffe3d5e1dbf581e9 --- db/db_impl/db_impl.h | 3 +++ db/db_impl/db_impl_debug.cc | 6 ++++++ utilities/backupable/backupable_db_test.cc | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 0d8ce7624..f46d41ce2 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -974,6 +974,9 @@ class DBImpl : public DB { Status TEST_AtomicFlushMemTables(const autovector& cfds, const FlushOptions& flush_opts); + // Wait for background threads to complete scheduled work. + Status TEST_WaitForBackgroundWork(); + // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 2bdc66688..d7e0628f6 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -156,6 +156,12 @@ Status DBImpl::TEST_AtomicFlushMemTables( return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); } +Status DBImpl::TEST_WaitForBackgroundWork() { + InstrumentedMutexLock l(&mutex_); + WaitForBackgroundWork(); + return Status::OK(); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 80111fc1c..1c6a4ba43 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -3158,6 +3158,10 @@ TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) { FillDB(db_.get(), 0, 100, kAutoFlushOnly); ASSERT_OK(db_chroot_env_->FileExists(prev_manifest_path)); ASSERT_OK(db_->Flush(FlushOptions())); + // Even though manual flush completed above, the background thread may not + // have finished its cleanup work. `TEST_WaitForBackgroundWork()` will wait + // until all the background thread's work has completed, including cleanup. + ASSERT_OK(db_impl->TEST_WaitForBackgroundWork()); ASSERT_TRUE(db_chroot_env_->FileExists(prev_manifest_path).IsNotFound()); CloseDBAndBackupEngine();