From 15bb4ea08428aa70d078c3162de5f1b283919e8c Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Tue, 6 Dec 2022 18:31:43 -0800 Subject: [PATCH] Deflake DBWALTest.FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL (#11016) Summary: **Context/Summary:** Credit to ajkr's https://github.com/facebook/rocksdb/pull/11016#pullrequestreview-1205020134, flaky test https://app.circleci.com/pipelines/github/facebook/rocksdb/21985/workflows/5f6cc355-78c1-46d8-89ee-0fd679725a8a/jobs/540878 is due to `Flush()` called in the test returned earlier than obsoleted WAL being found in background flush and SyncWAL() was called (i.e, "sync_point_called" sets to true). Fix this by making checking `sync_point_called == true` after obsoleted WAL is found and `SyncWAL()` is called. Also rename the "sync_point_called" to be something more specific. Also, fix a potential flakiness due to manually setting a log threshold to force new manifest creation. This is unreliable so I decided to use sync point to force new manifest creation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11016 Test Plan: make check Reviewed By: pdillinger Differential Revision: D41717786 Pulled By: hx235 fbshipit-source-id: ad1e4701a987285bbe6c8e7d9b05c4db06b4edf4 --- db/db_wal_test.cc | 39 +++++++++++++++++++-------------------- db/version_set.cc | 3 +++ 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 6246cc70f..2bdfaada5 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1602,9 +1602,6 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { Options options = CurrentOptions(); options.track_and_verify_wals_in_manifest = true; - // Set a small max_manifest_file_size to force manifest creation - // in SyncWAL() for tet purpose - options.max_manifest_file_size = 170; DestroyAndReopen(options); // Accumulate memtable m1 and create the 1st wal (i.e, 4.log) @@ -1619,25 +1616,26 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { // active) log and release the lock // (2) SyncWAL() proceeds with the lock. It // creates a new manifest and syncs all the inactive wals before the latest - // (i.e, active log), which is 4.log. SyncWAL() is not aware of the fact - // that 4.log has marked as to be obseleted. - // Prior to the fix, such wal sync will then add a WAL addition record of - // 4.log to the new manifest without any special treatment. + // (i.e, active log), which is 4.log. Note that SyncWAL() is not aware of the + // fact that 4.log has marked as to be obseleted. Prior to the fix, such wal + // sync will then add a WAL addition record of 4.log to the new manifest + // without any special treatment. // (3) BackgroundFlush() will eventually purge 4.log. - bool sync_point_called = false; - bool new_manifest_created = false; + bool wal_synced = false; SyncPoint::GetInstance()->SetCallBack( "FindObsoleteFiles::PostMutexUnlock", [&](void*) { ASSERT_OK(env_->FileExists(wal_file_path)); SyncPoint::GetInstance()->SetCallBack( - "VersionSet::ProcessManifestWrites:BeforeNewManifest", - [&](void*) { new_manifest_created = true; }); + "VersionSet::ProcessManifestWrites:" + "PostDecidingCreateNewManifestOrNot", + [&](void* arg) { + bool* new_descriptor_log = (bool*)arg; + *new_descriptor_log = true; + }); ASSERT_OK(db_->SyncWAL()); - ASSERT_TRUE(new_manifest_created); - - sync_point_called = true; + wal_synced = true; }); SyncPoint::GetInstance()->SetCallBack( @@ -1650,21 +1648,22 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { "PostDeleteWAL"); } }); + SyncPoint::GetInstance()->LoadDependency( - {{"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" + {{"DBImpl::BackgroundCallFlush:FilesFound", + "PreConfrimObsoletedWALSynced"}, + {"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" "PostDeleteWAL", - "DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" "PreConfrimWALDeleted"}}); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Flush()); - ASSERT_TRUE(sync_point_called); - TEST_SYNC_POINT( - "DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" - "PreConfrimWALDeleted"); + TEST_SYNC_POINT("PreConfrimObsoletedWALSynced"); + ASSERT_TRUE(wal_synced); + TEST_SYNC_POINT("PreConfrimWALDeleted"); // BackgroundFlush() purged 4.log // because the memtable associated with the WAL was flushed and new WAL was // created (i.e, 8.log) diff --git a/db/version_set.cc b/db/version_set.cc index 33cd70022..becdd6790 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4983,6 +4983,9 @@ Status VersionSet::ProcessManifestWrites( } else { pending_manifest_file_number_ = manifest_file_number_; } + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot", + &new_descriptor_log); // Local cached copy of state variable(s). WriteCurrentStateToManifest() // reads its content after releasing db mutex to avoid race with