From 538df26fcc6a8f22e14271e067e1ce2c71343964 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 4 Aug 2022 12:14:28 -0700 Subject: [PATCH] Deflake DBWALTest.RaceInstallFlushResultsWithWalObsoletion (#10456) Summary: Existing DBWALTest.RaceInstallFlushResultsWithWalObsoletion test relies on a specific interleaving of two background flush threads. We call them bg1 and bg2, and assume bg1 starts to install flush results ahead of bg2. After bg1 enters `ProcessManifestWrites`, bg1 waits for bg2 to also enter `MemTableList::TryInstallMemtableFlushResults()` before bg1 can proceed with MANIFEST write. However, if bg2 called `SyncClosedLogs()` and needed to commit to the MANIFEST but falls behind bg1, then bg2 needs to wait for bg1 to finish writing to MANIFEST. This is a circular dependency. Fix this by allowing bg2 to start only after bg1 grabs the chance to sync the WAL and commit to MANIFEST. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10456 Test Plan: 1. make check 2. export TEST_TMPDIR=/dev/shm && gtest-parallel -r 1000 -w 32 ./db_wal_test --gtest_filter=DBWALTest.RaceInstallFlushResultsWithWalObsoletion Reviewed By: ltamasi Differential Revision: D38391856 Pulled By: riversand963 fbshipit-source-id: 55f647d5b94e534c008a4dd2fb082675ddf58c96 --- db/db_impl/db_impl_compaction_flush.cc | 2 ++ db/db_wal_test.cc | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 4378f3212..49ce9d3a0 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -229,6 +229,8 @@ Status DBImpl::FlushMemTableToOutputFile( mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1", + nullptr); } if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 96b5d4f91..5b5ec76af 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1523,8 +1523,25 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { /*wait=*/false, /*allow_write_stall=*/true, handles_[0])); bool called = false; + std::atomic bg_flush_threads{0}; + std::atomic wal_synced{false}; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) { + int cur = bg_flush_threads.load(); + int desired = cur + 1; + if (cur > 0 || + !bg_flush_threads.compare_exchange_strong(cur, desired)) { + while (!wal_synced.load()) { + // Wait until the other bg flush thread finishes committing WAL sync + // operation to the MANIFEST. + } + } + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:CommitWal:1", + [&](void* /*arg*/) { wal_synced.store(true); }); // This callback will be called when the first bg flush thread reaches the // point before entering the MANIFEST write queue after flushing the SST // file.