From 6cac4c79d457f199271e05caca07539fc588e7d0 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Thu, 20 Apr 2023 12:48:53 -0700 Subject: [PATCH] Fix race condition in db_stress checkpoint cleanup (#11389) Summary: The old cleanup code had a race condition: 1. Test thread: DestroyDB() marked a file as trash 2. DeleteScheduler thread: Got the file's size and decided to delete it in chunks 3. Test thread: DestroyDir() deleted that trash file 4. DeleteScheduler thread: Began deleting in chunks starting by calling ReopenWritableFile(). Unfortunately this recreates the deleted trash file 5. Test thread: DestroyDir() fails to remove the parent directory because it contains the file created in 4. 6. Test thread: Checkpoint::Create() fails due to the directory already existing It could be repro'd with the following patch/command. Patch: ``` diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 8a2d1615d..337d24a60 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -317,6 +317,12 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, &num_hard_links, nullptr); if (my_status.ok()) { if (num_hard_links == 1) { + // Give some time for DestroyDir() to delete file entries. Then, the + // below `ReopenWritableFile()` will recreate files, preventing the + // parent directory from being deleted. + if (rand() % 2 == 0) { + usleep(1000); + } std::unique_ptr wf; my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf, nullptr); diff --git a/file/file_util.cc b/file/file_util.cc index 43608fcdc..2cee1ad8e 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -263,6 +263,13 @@ Status DestroyDir(Env* env, const std::string& dir) { } } + // Give some time for the DeleteScheduler thread's ReopenWritableFile() to + // recreate deleted files + if (dir.find("checkpoint") != std::string::npos) { + fprintf(stderr, "waiting to destroy %s\n", dir.c_str()); + usleep(10000); + } + if (s.ok()) { s = env->DeleteDir(dir); // DeleteDir might or might not report NotFound ``` Command: ``` TEST_TMPDIR=/dev/shm python3 tools/db_crashtest.py blackbox --simple --write_buffer_size=131072 --target_file_size_base=131072 --max_bytes_for_level_base=524288 --checkpoint_one_in=100 --clear_column_family_one_in=0 --max_key=1000 --value_size_mult=33 --sst_file_manager_bytes_per_truncate=4096 --sst_file_manager_bytes_per_sec=1048576 --interval=3 --compression_type=none --sync_fault_injection=1 ``` Obviously we don't want to use scheduled deletion here as we need the checkpoint directory deleted immediately. I suspect the DestroyDir() was an attempt to fixup incomplete DestroyDB()s. Now that we expect DestroyDB() to be complete I removed that code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11389 Reviewed By: hx235 Differential Revision: D45137142 Pulled By: ajkr fbshipit-source-id: 2af743d342c77cc414fd25fc4c9d7c9c6079ad24 --- db_stress_tool/db_stress_test_base.cc | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 69a2b9c66..e508dadb5 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1835,19 +1835,11 @@ Status StressTest::TestCheckpoint(ThreadState* thread, Options tmp_opts(options_); tmp_opts.listeners.clear(); tmp_opts.env = db_stress_env; + // Avoid delayed deletion so whole directory can be deleted + tmp_opts.sst_file_manager.reset(); DestroyDB(checkpoint_dir, tmp_opts); - if (db_stress_env->FileExists(checkpoint_dir).ok()) { - // If the directory might still exist, try to delete the files one by one. - // Likely a trash file is still there. - Status my_s = DestroyDir(db_stress_env, checkpoint_dir); - if (!my_s.ok()) { - fprintf(stderr, "Fail to destory directory before checkpoint: %s", - my_s.ToString().c_str()); - } - } - Checkpoint* checkpoint = nullptr; Status s = Checkpoint::Create(db_, &checkpoint); if (s.ok()) {