From f3818948e8166a0ff123f7a44eb537abcd998c81 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Mon, 17 Apr 2023 11:00:08 -0700 Subject: [PATCH] Deflake DBWriteTest.LockWALInEffect (#11382) Summary: This test exhibited the following flaky failure: ``` db/db_write_test.cc:653: Failure db_->Resume() Corruption: Not active ``` I was able to repro it by applying the following patch to coerce a specific race condition: ``` diff --git a/db/db_write_test.cc b/db/db_write_test.cc index d82c57376..775ba3cde 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -636,6 +636,10 @@ TEST_P(DBWriteTest, LockWALInEffect) { ASSERT_TRUE(dbfull()->WALBufferIsEmpty()); ASSERT_OK(db_->UnlockWAL()); + // Test thread: sleep interval: [0, 3) + // In this interval, the file system is active + sleep(3); + // Fail the WAL flush if applicable fault_fs->SetFilesystemActive(false); Status s = Put("key2", "value"); @@ -649,6 +653,11 @@ TEST_P(DBWriteTest, LockWALInEffect) { ASSERT_OK(db_->LockWAL()); ASSERT_OK(db_->UnlockWAL()); } + + // Test thread: sleep interval: [3, 6) + // In this interval, the file system is inactive + sleep(3); + fault_fs->SetFilesystemActive(true); ASSERT_OK(db_->Resume()); // Writes should work again diff --git a/db/flush_job.cc b/db/flush_job.cc index 8193f594f..602ee2c9f 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -979,6 +979,10 @@ Status FlushJob::WriteLevel0Table() { DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); + // Flush thread: sleep interval: [0, 4) + // Upon awakening, the file system will be inactive. Then the MANIFEST + // update will fail. + sleep(4); db_mutex_->Lock(); } base_->Unref(); ``` The fix for this scenario is explained in the code change. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11382 Reviewed By: cbi42 Differential Revision: D45027632 Pulled By: ajkr fbshipit-source-id: 6bfa35a5781c0c080fb74e13f2b2c9f871f7effb --- db/db_write_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/db/db_write_test.cc b/db/db_write_test.cc index d82c57376..d1e3d53f6 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -636,6 +636,11 @@ TEST_P(DBWriteTest, LockWALInEffect) { ASSERT_TRUE(dbfull()->WALBufferIsEmpty()); ASSERT_OK(db_->UnlockWAL()); + // The above `TEST_SwitchWAL()` triggered a flush. That flush needs to finish + // before we make the filesystem inactive, otherwise the flush might hit an + // unrecoverable error (e.g., failed MANIFEST update). + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); + // Fail the WAL flush if applicable fault_fs->SetFilesystemActive(false); Status s = Put("key2", "value");