From 926f3a78a64b327475ee6c60b6c8ab4f34253204 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Mon, 9 Jul 2018 15:17:38 -0700
Subject: [PATCH] In delete scheduler, before ftruncate file for slow delete,
 check whether there are other hard links (#4093)

Summary:
Right now slow deletion with ftruncate doesn't work well with checkpoints,
because it ruins files hard-linked into checkpoints. To fix it, check that
the file has no other hard links before ftruncating it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4093

Differential Revision: D8730360

Pulled By: siying

fbshipit-source-id: 756eea5bce8a87b9a2ea3a5bfa190b2cab6f75df
---
 env/env_posix.cc                   |  9 +++++
 include/rocksdb/env.h              | 10 ++++++
 include/rocksdb/sst_file_manager.h | 14 +++++---
 java/rocksjni/sst_file_manager.cc  |  3 +-
 util/delete_scheduler.cc           | 54 +++++++++++++++++++++---------
 util/delete_scheduler.h            |  2 ++
 util/delete_scheduler_test.cc      | 34 +++++++++++++++++++
 7 files changed, 103 insertions(+), 23 deletions(-)

diff --git a/env/env_posix.cc b/env/env_posix.cc
index b29a64752..cc377f161 100644
--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@@ -613,6 +613,15 @@ class PosixEnv : public Env {
     return result;
   }
 
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError("while stat a file for num file links", fname, errno);
+    }
+    *count = static_cast<uint64_t>(s.st_nlink);
+    return Status::OK();
+  }
+
   virtual Status AreFilesSame(const std::string& first,
                               const std::string& second, bool* res) override {
     struct stat statbuf[2];
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 79ff3f38b..c6ca725c5 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -289,6 +289,12 @@ class Env {
     return Status::NotSupported("LinkFile is not supported for this Env");
   }
 
+  virtual Status NumFileLinks(const std::string& /*fname*/,
+                              uint64_t* /*count*/) {
+    return Status::NotSupported(
+        "Getting number of file links is not supported for this Env");
+  }
+
   virtual Status AreFilesSame(const std::string& /*first*/,
                               const std::string& /*second*/, bool* /*res*/) {
     return Status::NotSupported("AreFilesSame is not supported for this Env");
@@ -1064,6 +1070,10 @@ class EnvWrapper : public Env {
     return target_->LinkFile(s, t);
   }
 
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    return target_->NumFileLinks(fname, count);
+  }
+
   Status AreFilesSame(const std::string& first, const std::string& second,
                       bool* res) override {
     return target_->AreFilesSame(first, second, res);
diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h
index 439537e2a..2d3c3be66 100644
--- a/include/rocksdb/sst_file_manager.h
+++ b/include/rocksdb/sst_file_manager.h
@@ -96,14 +96,18 @@ class SstFileManager {
 // @param max_trash_db_ratio: If the trash size constitutes for more than this
 // fraction of the total DB size we will start deleting new files passed to
 // DeleteScheduler immediately
-// @param bytes_max_delete_chunk: if a single file is larger than delete chunk,
-// ftruncate the file by this size each time, rather than dropping the whole
-// file. 0 means to always delete the whole file. NOTE this options may not
-// work well with checkpoints, which relies on file system hard links.
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+// chunk, ftruncate the file by this size each time, rather than dropping the
+// whole file. 0 means to always delete the whole file.
+// If the file has more than one hard link, it will be deleted as a whole
+// instead. Either way, `rate_bytes_per_sec` will be respected. NOTE that
+// with this option, files already renamed into the trash may be partially
+// truncated, so users should not directly recover them without checking.
 extern SstFileManager* NewSstFileManager(
     Env* env, std::shared_ptr<Logger> info_log = nullptr,
     std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
     bool delete_existing_trash = true, Status* status = nullptr,
-    double max_trash_db_ratio = 0.25, uint64_t bytes_max_delete_chunk = 0);
+    double max_trash_db_ratio = 0.25,
+    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
 
 }  // namespace rocksdb
diff --git a/java/rocksjni/sst_file_manager.cc b/java/rocksjni/sst_file_manager.cc
index 440411c39..c83ea00ef 100644
--- a/java/rocksjni/sst_file_manager.cc
+++ b/java/rocksjni/sst_file_manager.cc
@@ -138,8 +138,7 @@ jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env,
   const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t>
       fn_map_kv =
-          [env](
-              const std::pair<const std::string, const uint64_t>& pair) {
+          [env](const std::pair<const std::string, const uint64_t>& pair) {
             const jstring jtracked_file_path =
                 env->NewStringUTF(pair.first.c_str());
             if (jtracked_file_path == nullptr) {
diff --git a/util/delete_scheduler.cc b/util/delete_scheduler.cc
index 8ef66fdfa..1d51055a3 100644
--- a/util/delete_scheduler.cc
+++ b/util/delete_scheduler.cc
@@ -267,24 +267,46 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
   if (s.ok()) {
     bool need_full_delete = true;
     if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
-      unique_ptr<WritableFile> wf;
-      Status my_status =
-          env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
+      uint64_t num_hard_links = 2;
+      // We don't have to worry about a data race between linking a new
+      // file after the hard link count check and the ftruncate, because
+      // the file is now in trash and RocksDB is not supposed to create
+      // new hard links to trash files.
+      Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links);
       if (my_status.ok()) {
-        my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
-        if (my_status.ok()) {
-          TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
-          my_status = wf->Fsync();
+        if (num_hard_links == 1) {
+          unique_ptr<WritableFile> wf;
+          my_status =
+              env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
+          if (my_status.ok()) {
+            my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
+            if (my_status.ok()) {
+              TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+              my_status = wf->Fsync();
+            }
+          }
+          if (my_status.ok()) {
+            *deleted_bytes = bytes_max_delete_chunk_;
+            need_full_delete = false;
+            *is_complete = false;
+          } else {
+            ROCKS_LOG_WARN(info_log_,
+                           "Failed to partially delete %s from trash -- %s",
+                           path_in_trash.c_str(), my_status.ToString().c_str());
+          }
+        } else {
+          ROCKS_LOG_INFO(info_log_,
+                         "Cannot delete %s slowly through ftruncate from trash "
+                         "as it has other links",
+                         path_in_trash.c_str());
         }
-      }
-      if (my_status.ok()) {
-        *deleted_bytes = bytes_max_delete_chunk_;
-        need_full_delete = false;
-        *is_complete = false;
-      } else {
-        ROCKS_LOG_WARN(info_log_,
-                       "Failed to partially delete %s from trash -- %s",
-                       path_in_trash.c_str(), my_status.ToString().c_str());
+      } else if (!num_link_error_printed_) {
+        ROCKS_LOG_INFO(
+            info_log_,
+            "Cannot delete files slowly through ftruncate from trash "
+            "as Env::NumFileLinks() returns error: %s",
+            my_status.ToString().c_str());
+        num_link_error_printed_ = true;
       }
     }
diff --git a/util/delete_scheduler.h b/util/delete_scheduler.h
index 0d94ca72d..cbd13ecef 100644
--- a/util/delete_scheduler.h
+++ b/util/delete_scheduler.h
@@ -108,6 +108,8 @@ class DeleteScheduler {
   uint64_t bytes_max_delete_chunk_;
   // Errors that happened in BackgroundEmptyTrash (file_path => error)
   std::map<std::string, Status> bg_errors_;
+
+  bool num_link_error_printed_ = false;
   // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
   bool closing_;
   // Condition variable signaled in these conditions
diff --git a/util/delete_scheduler_test.cc b/util/delete_scheduler_test.cc
index e7fd8d49f..78d5313f8 100644
--- a/util/delete_scheduler_test.cc
+++ b/util/delete_scheduler_test.cc
@@ -478,6 +478,40 @@ TEST_F(DeleteSchedulerTest, DeletePartialFile) {
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 }
 
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+  int bg_delete_file = 0;
+  int bg_fsync = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void*) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  std::string file1 = NewDummyFile("data_1", 500 * 1024);
+  std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+  // Would be deleted in 4 batches if there were no hard links
+  ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+  ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(2, bg_delete_file);
+  ASSERT_EQ(0, bg_fsync);
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+}
+#endif
+
 // 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
 // 2- Delete 100 files using DeleteScheduler
 // 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
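
The core of the patch, stripped of RocksDB's Env plumbing, is: only shrink a
trash file in place when st_nlink shows a single directory entry pointing at
its inode; otherwise unlink it whole. Below is a minimal standalone POSIX
sketch of that rule. The helper names NumFileLinks and SlowDelete, the 64 MB
chunk, and the absence of rate limiting are illustrative assumptions, not the
RocksDB implementation.

// slow_delete_sketch.cc -- illustrative only, not RocksDB code.
#include <sys/stat.h>
#include <unistd.h>

#include <cstdint>
#include <cstdio>
#include <string>

// Hard link count of `fname` via stat(); mirrors what the patch's
// PosixEnv::NumFileLinks() reads from st_nlink. Returns 0 on error.
static uint64_t NumFileLinks(const std::string& fname) {
  struct stat s;
  if (stat(fname.c_str(), &s) != 0) {
    return 0;  // caller treats an error as "do not truncate"
  }
  return static_cast<uint64_t>(s.st_nlink);
}

// Delete `fname`, shrinking it in `chunk`-byte ftruncate steps only when no
// other directory entry shares its inode; otherwise unlink it in one shot.
static bool SlowDelete(const std::string& fname, uint64_t chunk) {
  struct stat s;
  if (stat(fname.c_str(), &s) != 0) {
    return false;
  }
  uint64_t size = static_cast<uint64_t>(s.st_size);
  if (chunk > 0 && NumFileLinks(fname) == 1) {
    // Safe: no checkpoint or other hard link can see the truncated data.
    while (size > chunk) {
      size -= chunk;
      if (truncate(fname.c_str(), static_cast<off_t>(size)) != 0) {
        return false;
      }
      // A real scheduler would sleep here to honor rate_bytes_per_sec.
    }
  }
  return unlink(fname.c_str()) == 0;
}

int main(int argc, char** argv) {
  if (argc != 2) {
    std::fprintf(stderr, "usage: %s <file>\n", argv[0]);
    return 1;
  }
  return SlowDelete(argv[1], 64 * 1024 * 1024) ? 0 : 1;
}

The check-then-truncate sequence is racy in general, but as the patch's
comment in delete_scheduler.cc notes, it is safe there because nothing is
supposed to create new hard links to files already renamed into the trash.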
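
For callers, the visible change is that NewSstFileManager's
bytes_max_delete_chunk parameter now defaults to 64 MB instead of 0. A hedged
usage sketch, assuming the signature at this commit; the trash directory path
and the 8 MB/s rate below are illustrative choices, not values from the patch.

// sst_file_manager_usage.cc -- a sketch of wiring the changed option into a DB.
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

rocksdb::Status OpenWithSlowDeletes(const std::string& db_path,
                                    rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Deleted files are renamed into the trash dir and erased at ~8 MB/s.
  // With bytes_max_delete_chunk left at its new 64 MB default, an oversized
  // trash file is shrunk by ftruncate in 64 MB steps, unless it has other
  // hard links (e.g. from a checkpoint), in which case, per this patch, it
  // is unlinked as a whole.
  options.sst_file_manager.reset(rocksdb::NewSstFileManager(
      options.env, nullptr /* info_log */, db_path + "/trash",
      8 * 1024 * 1024 /* rate_bytes_per_sec */));
  return rocksdb::DB::Open(options, db_path, db);
}

Note the trade-off the new header comment spells out: with chunked deletion
enabled, files sitting in the trash may already be partially truncated, so
they should not be recovered from the trash directory without checking.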