Deflake DBWALTest.FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL (#11016)

Summary:
**Context/Summary:**
Credit to ajkr's https://github.com/facebook/rocksdb/pull/11016#pullrequestreview-1205020134,
flaky test https://app.circleci.com/pipelines/github/facebook/rocksdb/21985/workflows/5f6cc355-78c1-46d8-89ee-0fd679725a8a/jobs/540878 is due to `Flush()` called in the test returned earlier than obsoleted WAL being found in background flush and SyncWAL() was called (i.e, "sync_point_called" sets to true).  Fix this by making checking `sync_point_called == true` after obsoleted WAL is found and `SyncWAL()` is called. Also rename the "sync_point_called" to be something more specific.

Also, fix a potential flakiness due to manually setting a log threshold to force new manifest creation. This is unreliable so I decided to use sync point to force new manifest creation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11016

Test Plan: make check

Reviewed By: pdillinger

Differential Revision: D41717786

Pulled By: hx235

fbshipit-source-id: ad1e4701a987285bbe6c8e7d9b05c4db06b4edf4
main
Hui Xiao 2 years ago committed by Facebook GitHub Bot
parent 23af6786a9
commit 15bb4ea084
  1. 39
      db/db_wal_test.cc
  2. 3
      db/version_set.cc

@ -1602,9 +1602,6 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.track_and_verify_wals_in_manifest = true; options.track_and_verify_wals_in_manifest = true;
// Set a small max_manifest_file_size to force manifest creation
// in SyncWAL() for tet purpose
options.max_manifest_file_size = 170;
DestroyAndReopen(options); DestroyAndReopen(options);
// Accumulate memtable m1 and create the 1st wal (i.e, 4.log) // Accumulate memtable m1 and create the 1st wal (i.e, 4.log)
@ -1619,25 +1616,26 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
// active) log and release the lock // active) log and release the lock
// (2) SyncWAL() proceeds with the lock. It // (2) SyncWAL() proceeds with the lock. It
// creates a new manifest and syncs all the inactive wals before the latest // creates a new manifest and syncs all the inactive wals before the latest
// (i.e, active log), which is 4.log. SyncWAL() is not aware of the fact // (i.e, active log), which is 4.log. Note that SyncWAL() is not aware of the
// that 4.log has marked as to be obseleted. // fact that 4.log has marked as to be obseleted. Prior to the fix, such wal
// Prior to the fix, such wal sync will then add a WAL addition record of // sync will then add a WAL addition record of 4.log to the new manifest
// 4.log to the new manifest without any special treatment. // without any special treatment.
// (3) BackgroundFlush() will eventually purge 4.log. // (3) BackgroundFlush() will eventually purge 4.log.
bool sync_point_called = false; bool wal_synced = false;
bool new_manifest_created = false;
SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
"FindObsoleteFiles::PostMutexUnlock", [&](void*) { "FindObsoleteFiles::PostMutexUnlock", [&](void*) {
ASSERT_OK(env_->FileExists(wal_file_path)); ASSERT_OK(env_->FileExists(wal_file_path));
SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
"VersionSet::ProcessManifestWrites:BeforeNewManifest", "VersionSet::ProcessManifestWrites:"
[&](void*) { new_manifest_created = true; }); "PostDecidingCreateNewManifestOrNot",
[&](void* arg) {
bool* new_descriptor_log = (bool*)arg;
*new_descriptor_log = true;
});
ASSERT_OK(db_->SyncWAL()); ASSERT_OK(db_->SyncWAL());
ASSERT_TRUE(new_manifest_created); wal_synced = true;
sync_point_called = true;
}); });
SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
@ -1650,21 +1648,22 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
"PostDeleteWAL"); "PostDeleteWAL");
} }
}); });
SyncPoint::GetInstance()->LoadDependency( SyncPoint::GetInstance()->LoadDependency(
{{"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" {{"DBImpl::BackgroundCallFlush:FilesFound",
"PreConfrimObsoletedWALSynced"},
{"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::"
"PostDeleteWAL", "PostDeleteWAL",
"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::"
"PreConfrimWALDeleted"}}); "PreConfrimWALDeleted"}});
SyncPoint::GetInstance()->EnableProcessing(); SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Flush()); ASSERT_OK(Flush());
ASSERT_TRUE(sync_point_called);
TEST_SYNC_POINT( TEST_SYNC_POINT("PreConfrimObsoletedWALSynced");
"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" ASSERT_TRUE(wal_synced);
"PreConfrimWALDeleted");
TEST_SYNC_POINT("PreConfrimWALDeleted");
// BackgroundFlush() purged 4.log // BackgroundFlush() purged 4.log
// because the memtable associated with the WAL was flushed and new WAL was // because the memtable associated with the WAL was flushed and new WAL was
// created (i.e, 8.log) // created (i.e, 8.log)

@ -4983,6 +4983,9 @@ Status VersionSet::ProcessManifestWrites(
} else { } else {
pending_manifest_file_number_ = manifest_file_number_; pending_manifest_file_number_ = manifest_file_number_;
} }
TEST_SYNC_POINT_CALLBACK(
"VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot",
&new_descriptor_log);
// Local cached copy of state variable(s). WriteCurrentStateToManifest() // Local cached copy of state variable(s). WriteCurrentStateToManifest()
// reads its content after releasing db mutex to avoid race with // reads its content after releasing db mutex to avoid race with

Loading…
Cancel
Save