Fix flaky test `DBTestUniversalManualCompactionOutputPathId.ManualCompactionOutputPathId` (#11412)

Summary:
The test is flaky when compiled with `make -j56 COERCE_CONTEXT_SWITCH=1 ./db_universal_compaction_test`. The cause is that a manual compaction `CompactRange()` can finish and return before its obsolete files are deleted. One reason is that a manual compaction waits until `manual.done` is set here 62fc15f009/db/db_impl/db_impl_compaction_flush.cc (L1978)
and the compaction thread can set `manual.done`:
62fc15f009/db/db_impl/db_impl_compaction_flush.cc (L3672)
and then temporarily release mutex_:
62fc15f009/db/db_impl/db_impl_files.cc (L317)
before purging obsolete files:
62fc15f009/db/db_impl/db_impl_compaction_flush.cc (L3144)

With `COERCE_CONTEXT_SWITCH=1`, `bg_cv_.SignalAll()` is called during `mutex_.Lock()`, so the manual compaction thread can wake up and return before obsolete files are deleted. Updated the test to only count live SST files.

Also updated `FindObsoleteFiles()` to avoid locking a locked mutex.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11412

Test Plan: `make -j56 COERCE_CONTEXT_SWITCH=1 ./db_universal_compaction_test`

Reviewed By: ajkr

Differential Revision: D45342242

Pulled By: cbi42

fbshipit-source-id: 955c9796aa3f484e3557d300f97cffacb3ed9b0c
oxigraph-8.3.2
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent c81d58016b
commit a11f1e12ca
  1. 2
      db/db_impl/db_impl.h
  2. 2
      db/db_impl/db_impl_compaction_flush.cc
  3. 4
      db/db_impl/db_impl_files.cc
  4. 18
      db/db_test_util.cc
  5. 2
      db/db_test_util.h
  6. 20
      db/db_universal_compaction_test.cc

@ -2310,7 +2310,7 @@ class DBImpl : public DB {
// logfile_number_. With two_write_queues it also protects alive_log_files_, // logfile_number_. With two_write_queues it also protects alive_log_files_,
// and log_empty_. Refer to the definition of each variable below for more // and log_empty_. Refer to the definition of each variable below for more
// details. // details.
// Note: to avoid dealock, if needed to acquire both log_write_mutex_ and // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and
// mutex_, the order should be first mutex_ and then log_write_mutex_. // mutex_, the order should be first mutex_ and then log_write_mutex_.
InstrumentedMutex log_write_mutex_; InstrumentedMutex log_write_mutex_;

@ -3569,7 +3569,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
mutex_.Unlock(); mutex_.Unlock();
TEST_SYNC_POINT_CALLBACK( TEST_SYNC_POINT_CALLBACK(
"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
// Should handle erorr? // Should handle error?
compaction_job.Run().PermitUncheckedError(); compaction_job.Run().PermitUncheckedError();
TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
mutex_.Lock(); mutex_.Lock();

@ -286,6 +286,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
return; return;
} }
bool mutex_unlocked = false;
if (!alive_log_files_.empty() && !logs_.empty()) { if (!alive_log_files_.empty() && !logs_.empty()) {
uint64_t min_log_number = job_context->log_number; uint64_t min_log_number = job_context->log_number;
size_t num_alive_log_files = alive_log_files_.size(); size_t num_alive_log_files = alive_log_files_.size();
@ -315,6 +316,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
} }
log_write_mutex_.Unlock(); log_write_mutex_.Unlock();
mutex_.Unlock(); mutex_.Unlock();
mutex_unlocked = true;
TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr); TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr);
log_write_mutex_.Lock(); log_write_mutex_.Lock();
while (!logs_.empty() && logs_.front().number < min_log_number) { while (!logs_.empty() && logs_.front().number < min_log_number) {
@ -337,7 +339,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
logs_to_free_.clear(); logs_to_free_.clear();
log_write_mutex_.Unlock(); log_write_mutex_.Unlock();
if (mutex_unlocked) {
mutex_.Lock(); mutex_.Lock();
}
job_context->log_recycle_files.assign(log_recycle_files_.begin(), job_context->log_recycle_files.assign(log_recycle_files_.begin(),
log_recycle_files_.end()); log_recycle_files_.end());
} }

@ -1074,6 +1074,24 @@ size_t DBTestBase::TotalLiveFiles(int cf) {
return num_files; return num_files;
} }
size_t DBTestBase::TotalLiveFilesAtPath(int cf, const std::string& path) {
  // Count live SST files of column family `cf` whose directory equals
  // `path`. Only files referenced by the current version are counted,
  // so obsolete files still awaiting deletion on disk are excluded.
  // cf == 0 addresses the default column family.
  ColumnFamilyMetaData cf_meta;
  if (cf != 0) {
    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
  } else {
    db_->GetColumnFamilyMetaData(&cf_meta);
  }

  size_t matching_files = 0;
  for (const auto& level_meta : cf_meta.levels) {
    for (const auto& file_meta : level_meta.files) {
      if (file_meta.directory == path) {
        ++matching_files;
      }
    }
  }
  return matching_files;
}
size_t DBTestBase::CountLiveFiles() { size_t DBTestBase::CountLiveFiles() {
std::vector<LiveFileMetaData> metadata; std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata); db_->GetLiveFilesMetaData(&metadata);

@ -1192,6 +1192,8 @@ class DBTestBase : public testing::Test {
size_t TotalLiveFiles(int cf = 0); size_t TotalLiveFiles(int cf = 0);
size_t TotalLiveFilesAtPath(int cf, const std::string& path);
size_t CountLiveFiles(); size_t CountLiveFiles();
int NumTableFilesAtLevel(int level, int cf = 0); int NumTableFilesAtLevel(int level, int cf = 0);

@ -1850,31 +1850,31 @@ TEST_P(DBTestUniversalManualCompactionOutputPathId,
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(1, TotalLiveFiles(1));
ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(0, TotalLiveFilesAtPath(1, options.db_paths[0].path));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path));
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(1, TotalLiveFiles(1));
ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(0, TotalLiveFilesAtPath(1, options.db_paths[0].path));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path));
MakeTables(1, "p", "q", 1); MakeTables(1, "p", "q", 1);
ASSERT_EQ(2, TotalLiveFiles(1)); ASSERT_EQ(2, TotalLiveFiles(1));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[0].path));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path));
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
ASSERT_EQ(2, TotalLiveFiles(1)); ASSERT_EQ(2, TotalLiveFiles(1));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[0].path));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path));
// Full compaction to DB path 0 // Full compaction to DB path 0
compact_options.target_path_id = 0; compact_options.target_path_id = 0;
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(1, TotalLiveFiles(1));
ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[0].path));
ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, TotalLiveFilesAtPath(1, options.db_paths[1].path));
// Fail when compacting to an invalid path ID // Fail when compacting to an invalid path ID
compact_options.target_path_id = 2; compact_options.target_path_id = 2;

Loading…
Cancel
Save