From fc61e98ae698ae1443aa799abf09a5146a6e6838 Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Fri, 4 Mar 2022 11:35:28 -0800 Subject: [PATCH] Attempt to deflake DBLogicalBlockSizeCacheTest.CreateColumnFamilies (#9516) Summary: **Context:** `DBLogicalBlockSizeCacheTest.CreateColumnFamilies` is flaky on a rare occurrence of assertion failure below ``` db/db_logical_block_size_cache_test.cc:210 Expected equality of these values: 1 cache_->GetRefCount(cf_path_0_) Which is: 2 ``` Root-cause: `ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));` in the test may not successfully decrease the ref count of `cf_path_0_` since the decreasing only happens in the clean-up of `ColumnFamilyData` when `ColumnFamilyData` has no referencing to it, which may not be true when `db->DestroyColumnFamilyHandle(cfs[0])` is called since background work such as `DumpStats()` can hold reference to that `ColumnFamilyData` (suggested and repro-d by ajkr ). Similar case `ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));`. See following for a deterministic repro: ``` diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 196b428a3..4e7a834c4 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -956,10 +956,16 @@ void DBImpl::DumpStats() { // near-atomically. // Get a ref before unlocking cfd->Ref(); + if (cfd->GetName() == "cf1" || cfd->GetName() == "cf2") { + TEST_SYNC_POINT("DBImpl::DumpStats:PostCFDRef"); + } { InstrumentedMutexUnlock u(&mutex_); cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); } + if (cfd->GetName() == "cf1" || cfd->GetName() == "cf2") { + TEST_SYNC_POINT("DBImpl::DumpStats::PreCFDUnrefAndTryDelete"); + } cfd->UnrefAndTryDelete(); } } diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index 1057871c9..c3872c036 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -9,6 +9,7 @@ #include "env/io_posix.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" namespace ROCKSDB_NAMESPACE { class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper { @@ -183,6 +184,15 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ASSERT_EQ(1, cache_->GetRefCount(dbname_)); std::vector cfs; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PostSetupTwoCFH", + "DBImpl::DumpStats:StartRunning"}, + {"DBImpl::DumpStats:PostCFDRef", + "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PreDeleteTwoCFH"}, + {"DBLogicalBlockSizeCacheTest::CreateColumnFamilies::" + "PostFinishCheckingRef", + "DBImpl::DumpStats::PreCFDUnrefAndTryDelete"}}); ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); @@ -190,7 +200,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); } // Delete one handle will not drop cache because another handle is still // referencing cf_path_0_. + TEST_SYNC_POINT( + "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PostSetupTwoCFH"); + TEST_SYNC_POINT( + "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PreDeleteTwoCFH"); ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); @@ -209,16 +221,20 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); // Will fail ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); // Delete the last handle will drop cache. ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); // Will fail ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + TEST_SYNC_POINT( + "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::" + "PostFinishCheckingRef"); delete db; ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}})); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } ``` **Summary** - Removed the flaky assertion - Clarified the comments for the test Pull Request resolved: https://github.com/facebook/rocksdb/pull/9516 Test Plan: - CI - Monitor for future flakiness Reviewed By: ajkr Differential Revision: D34055232 Pulled By: hx235 fbshipit-source-id: 9bf83ae5fa88bf6fc829876494d4692082e4c357 --- db/db_logical_block_size_cache_test.cc | 27 ++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index 1057871c9..9c6e2e6da 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -167,9 +167,14 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) { } TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { - // Tests that CreateColumnFamilies will cache the cf_paths, - // drop the column family handle won't drop the cache, - // drop and then delete the column family handle will drop the cache. + // To test: + // (1) CreateColumnFamilies will cache the cf_paths in + // DBLogicalBlockSizeCache + // (2) Dropping column family handles associated with + // that cf_paths won't drop the cached cf_paths + // (3) Deleting all the column family handles associated + // with that cf_paths will drop the cached cf_paths + Options options; options.create_if_missing = true; options.env = env_.get(); @@ -190,7 +195,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); - // Drop column family does not drop cache. + // Drop column family does not drop cf_path_0_'s entry from cache for (ColumnFamilyHandle* cf : cfs) { ASSERT_OK(db->DropColumnFamily(cf)); ASSERT_EQ(2, cache_->Size()); @@ -200,22 +205,24 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); } - // Delete one handle will not drop cache because another handle is still - // referencing cf_path_0_. + // Delete one cf handle will not drop cf_path_0_'s entry from cache because + // another handle is still referencing cf_path_0_. ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); ASSERT_EQ(1, cache_->GetRefCount(dbname_)); ASSERT_TRUE(cache_->Contains(cf_path_0_)); - ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); - // Delete the last handle will drop cache. + // Delete all cf handles and ensure the ref count of cf_path_0_ in cache_ + // can be properly decreased by releasing any background reference to the + // ColumnFamilyData during db deletion ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); - ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); ASSERT_EQ(1, cache_->GetRefCount(dbname_)); - delete db; + + // Now cf_path_0_ in cache_ has been properly decreased and cf_path_0_'s entry + // is dropped from cache ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));