In delete scheduler, before ftruncating a file for slow delete, check whether there are other hard links (#4093)

Summary:
Right now slow deletion with ftruncate doesn't work well with checkpoints because it ruins hard-linked files in checkpoints. To fix it, check that the file has no other hard links before ftruncating it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4093

Differential Revision: D8730360

Pulled By: siying

fbshipit-source-id: 756eea5bce8a87b9a2ea3a5bfa190b2cab6f75df
main
Siying Dong 7 years ago committed by Facebook Github Bot
parent b9846370e9
commit 926f3a78a6
  1. 9
      env/env_posix.cc
  2. 10
      include/rocksdb/env.h
  3. 14
      include/rocksdb/sst_file_manager.h
  4. 3
      java/rocksjni/sst_file_manager.cc
  5. 24
      util/delete_scheduler.cc
  6. 2
      util/delete_scheduler.h
  7. 34
      util/delete_scheduler_test.cc

9
env/env_posix.cc vendored

@ -613,6 +613,15 @@ class PosixEnv : public Env {
return result; return result;
} }
// Reports the hard link count of `fname` as returned by stat().
// On success, stores st_nlink into `*count` and returns Status::OK().
// If stat() fails, returns an IOError built from errno and leaves `*count`
// unmodified.
Status NumFileLinks(const std::string& fname, uint64_t* count) override {
  struct stat file_info;
  const int rc = stat(fname.c_str(), &file_info);
  if (rc == 0) {
    *count = static_cast<uint64_t>(file_info.st_nlink);
    return Status::OK();
  }
  return IOError("while stat a file for num file links", fname, errno);
}
virtual Status AreFilesSame(const std::string& first, virtual Status AreFilesSame(const std::string& first,
const std::string& second, bool* res) override { const std::string& second, bool* res) override {
struct stat statbuf[2]; struct stat statbuf[2];

@ -289,6 +289,12 @@ class Env {
return Status::NotSupported("LinkFile is not supported for this Env"); return Status::NotSupported("LinkFile is not supported for this Env");
} }
// Default implementation of the link-count query: ignores both arguments
// and reports that this Env does not support the operation. Envs that can
// count file links override this.
virtual Status NumFileLinks(const std::string& /*fname*/,
                            uint64_t* /*count*/) {
  const char* const unsupported_msg =
      "Getting number of file links is not supported for this Env";
  return Status::NotSupported(unsupported_msg);
}
virtual Status AreFilesSame(const std::string& /*first*/, virtual Status AreFilesSame(const std::string& /*first*/,
const std::string& /*second*/, bool* /*res*/) { const std::string& /*second*/, bool* /*res*/) {
return Status::NotSupported("AreFilesSame is not supported for this Env"); return Status::NotSupported("AreFilesSame is not supported for this Env");
@ -1064,6 +1070,10 @@ class EnvWrapper : public Env {
return target_->LinkFile(s, t); return target_->LinkFile(s, t);
} }
// Forwards the link-count query unchanged to the wrapped Env (target_) and
// returns whatever status it reports.
Status NumFileLinks(const std::string& fname, uint64_t* count) override {
  Status forwarded = target_->NumFileLinks(fname, count);
  return forwarded;
}
Status AreFilesSame(const std::string& first, const std::string& second, Status AreFilesSame(const std::string& first, const std::string& second,
bool* res) override { bool* res) override {
return target_->AreFilesSame(first, second, res); return target_->AreFilesSame(first, second, res);

@ -96,14 +96,18 @@ class SstFileManager {
// @param max_trash_db_ratio: If the trash size constitutes for more than this // @param max_trash_db_ratio: If the trash size constitutes for more than this
// fraction of the total DB size we will start deleting new files passed to // fraction of the total DB size we will start deleting new files passed to
// DeleteScheduler immediately // DeleteScheduler immediately
// @param bytes_max_delete_chunk: if a single file is larger than delete chunk, // @param bytes_max_delete_chunk: if a file to delete is larger than delete
// ftruncate the file by this size each time, rather than dropping the whole // chunk, ftruncate the file by this size each time, rather than dropping the
// file. 0 means to always delete the whole file. NOTE this options may not // whole file. 0 means to always delete the whole file. If the file has more
// work well with checkpoints, which relies on file system hard links. // than one linked names, the file will be deleted as a whole. Either way,
// `rate_bytes_per_sec` will be honored. NOTE that with this option,
// files already renamed as a trash may be partial, so users should not
// directly recover them without checking.
extern SstFileManager* NewSstFileManager( extern SstFileManager* NewSstFileManager(
Env* env, std::shared_ptr<Logger> info_log = nullptr, Env* env, std::shared_ptr<Logger> info_log = nullptr,
std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
bool delete_existing_trash = true, Status* status = nullptr, bool delete_existing_trash = true, Status* status = nullptr,
double max_trash_db_ratio = 0.25, uint64_t bytes_max_delete_chunk = 0); double max_trash_db_ratio = 0.25,
uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
} // namespace rocksdb } // namespace rocksdb

@ -138,8 +138,7 @@ jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env,
const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t> const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t>
fn_map_kv = fn_map_kv =
[env]( [env](const std::pair<const std::string, const uint64_t>& pair) {
const std::pair<const std::string, const uint64_t>& pair) {
const jstring jtracked_file_path = const jstring jtracked_file_path =
env->NewStringUTF(pair.first.c_str()); env->NewStringUTF(pair.first.c_str());
if (jtracked_file_path == nullptr) { if (jtracked_file_path == nullptr) {

@ -267,8 +267,16 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
if (s.ok()) { if (s.ok()) {
bool need_full_delete = true; bool need_full_delete = true;
if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) { if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
uint64_t num_hard_links = 2;
// We don't have to worry about a data race between linking a new
// file after the file link count check and the ftruncate, because
// the file is now in trash and RocksDB is not supposed to create
// hard links to trash files.
Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links);
if (my_status.ok()) {
if (num_hard_links == 1) {
unique_ptr<WritableFile> wf; unique_ptr<WritableFile> wf;
Status my_status = my_status =
env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions()); env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
if (my_status.ok()) { if (my_status.ok()) {
my_status = wf->Truncate(file_size - bytes_max_delete_chunk_); my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
@ -286,6 +294,20 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
"Failed to partially delete %s from trash -- %s", "Failed to partially delete %s from trash -- %s",
path_in_trash.c_str(), my_status.ToString().c_str()); path_in_trash.c_str(), my_status.ToString().c_str());
} }
} else {
ROCKS_LOG_INFO(info_log_,
"Cannot delete %s slowly through ftruncate from trash "
"as it has other links",
path_in_trash.c_str());
}
} else if (!num_link_error_printed_) {
ROCKS_LOG_INFO(
info_log_,
"Cannot delete files slowly through ftruncate from trash "
"as Env::NumFileLinks() returns error: %s",
my_status.ToString().c_str());
num_link_error_printed_ = true;
}
} }
if (need_full_delete) { if (need_full_delete) {

@ -108,6 +108,8 @@ class DeleteScheduler {
uint64_t bytes_max_delete_chunk_; uint64_t bytes_max_delete_chunk_;
// Errors that happened in BackgroundEmptyTrash (file_path => error) // Errors that happened in BackgroundEmptyTrash (file_path => error)
std::map<std::string, Status> bg_errors_; std::map<std::string, Status> bg_errors_;
bool num_link_error_printed_ = false;
// Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
bool closing_; bool closing_;
// Condition variable signaled in these conditions // Condition variable signaled in these conditions

@ -478,6 +478,40 @@ TEST_F(DeleteSchedulerTest, DeletePartialFile) {
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
} }
#ifdef OS_LINUX
// Checks that trash files which still have another hard link are not
// deleted piecewise through ftruncate: each of the two linked files must be
// removed with exactly one DeleteFile call and no Fsync may be observed.
TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
  // Counters bumped by the sync point callbacks registered below.
  int bg_delete_file = 0;
  int bg_fsync = 0;
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteTrashFile:DeleteFile",
      [&](void*) { bg_delete_file++; });
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();

  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
  NewDeleteScheduler();

  std::string file1 = NewDummyFile("data_1", 500 * 1024);
  std::string file2 = NewDummyFile("data_2", 100 * 1024);

  // Give each file a second name so its link count is greater than one.
  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));

  // Would be deleted in 4 batches if there were no hard links.
  ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
  ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));

  delete_scheduler_->WaitForEmptyTrash();

  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
  ASSERT_EQ(bg_errors.size(), 0);
  // One whole-file delete per file, and no partial-delete fsync.
  ASSERT_EQ(2, bg_delete_file);
  ASSERT_EQ(0, bg_fsync);

  // The callbacks capture the counters above by reference, so turn sync
  // point processing off (rather than enabling it a second time) before the
  // locals go out of scope.
  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
#endif
// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec) // 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
// 2- Delete 100 files using DeleteScheduler // 2- Delete 100 files using DeleteScheduler
// 3- Delete the DeleteScheduler (call the destructor while queue is not empty) // 3- Delete the DeleteScheduler (call the destructor while queue is not empty)

Loading…
Cancel
Save