Add SstFileManager (component tracking all SST file in DBs and control the deletion rate)

Summary:
Add a new class SstFileTracker that will be notified whenever a DB add/delete/move and sst file, it will also replace DeleteScheduler
SstFileTracker can be used later to abort writes when we exceed a specific size

Test Plan: unit tests

Reviewers: rven, anthony, yhchiang, sdong

Reviewed By: sdong

Subscribers: igor, lovro, march, dhruba

Differential Revision: https://reviews.facebook.net/D50469
main
Islam AbdelRahman 9 years ago
parent 45768ade4f
commit d6c838f1e1
  1. 3
      CMakeLists.txt
  2. 2
      HISTORY.md
  3. 6
      db/compaction_job.cc
  4. 43
      db/db_impl.cc
  5. 102
      db/db_test.cc
  6. 18
      db/db_test_util.cc
  7. 3
      db/db_test_util.h
  8. 67
      include/rocksdb/delete_scheduler.h
  9. 14
      include/rocksdb/options.h
  10. 64
      include/rocksdb/sst_file_manager.h
  11. 3
      src.mk
  12. 98
      util/delete_scheduler.cc
  13. 26
      util/delete_scheduler.h
  14. 87
      util/delete_scheduler_test.cc
  15. 15
      util/file_util.cc
  16. 4
      util/file_util.h
  17. 11
      util/options.cc
  18. 4
      util/options_test.cc
  19. 143
      util/sst_file_manager_impl.cc
  20. 77
      util/sst_file_manager_impl.h

@ -196,13 +196,14 @@ set(SOURCES
util/comparator.cc util/comparator.cc
util/concurrent_arena.cc util/concurrent_arena.cc
util/crc32c.cc util/crc32c.cc
util/delete_scheduler_impl.cc util/delete_scheduler.cc
util/dynamic_bloom.cc util/dynamic_bloom.cc
util/env.cc util/env.cc
util/env_hdfs.cc util/env_hdfs.cc
util/event_logger.cc util/event_logger.cc
util/file_util.cc util/file_util.cc
util/file_reader_writer.cc util/file_reader_writer.cc
util/sst_file_manager_impl.cc
util/filter_policy.cc util/filter_policy.cc
util/hash.cc util/hash.cc
util/histogram.cc util/histogram.cc

@ -3,9 +3,11 @@
### Public API Changes ### Public API Changes
* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now doesn't include timers for mutexes. * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now doesn't include timers for mutexes.
* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. * Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll.
* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead
### New Features ### New Features
* ldb tool now supports operations to non-default column families. * ldb tool now supports operations to non-default column families.
* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate.
## 4.4.0 (1/14/2016) ## 4.4.0 (1/14/2016)
### Public API Changes ### Public API Changes

@ -51,6 +51,7 @@
#include "util/iostats_context_imp.h" #include "util/iostats_context_imp.h"
#include "util/log_buffer.h" #include "util/log_buffer.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/sst_file_manager_impl.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/perf_context_imp.h" #include "util/perf_context_imp.h"
#include "util/stop_watch.h" #include "util/stop_watch.h"
@ -498,11 +499,16 @@ Status CompactionJob::Run() {
} }
TablePropertiesCollection tp; TablePropertiesCollection tp;
auto sfm =
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
for (const auto& state : compact_->sub_compact_states) { for (const auto& state : compact_->sub_compact_states) {
for (const auto& output : state.outputs) { for (const auto& output : state.outputs) {
auto fn = TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(), auto fn = TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(),
output.meta.fd.GetPathId()); output.meta.fd.GetPathId());
tp[fn] = output.table_properties; tp[fn] = output.table_properties;
if (sfm && output.meta.fd.GetPathId() == 0) {
sfm->OnAddFile(fn);
}
} }
} }
compact_->compaction->SetOutputTableProperties(std::move(tp)); compact_->compaction->SetOutputTableProperties(std::move(tp));

@ -64,7 +64,6 @@
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/delete_scheduler.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
#include "rocksdb/sst_file_writer.h" #include "rocksdb/sst_file_writer.h"
@ -89,6 +88,7 @@
#include "util/log_buffer.h" #include "util/log_buffer.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/sst_file_manager_impl.h"
#include "util/options_helper.h" #include "util/options_helper.h"
#include "util/options_parser.h" #include "util/options_parser.h"
#include "util/perf_context_imp.h" #include "util/perf_context_imp.h"
@ -786,8 +786,8 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) {
} }
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE
Status file_deletion_status; Status file_deletion_status;
if (type == kTableFile && path_id == 0) { if (type == kTableFile) {
file_deletion_status = DeleteOrMoveToTrash(&db_options_, fname); file_deletion_status = DeleteSSTFile(&db_options_, fname, path_id);
} else { } else {
file_deletion_status = env_->DeleteFile(fname); file_deletion_status = env_->DeleteFile(fname);
} }
@ -1509,6 +1509,14 @@ Status DBImpl::FlushMemTableToOutputFile(
// may temporarily unlock and lock the mutex. // may temporarily unlock and lock the mutex.
NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
job_context->job_id, flush_job.GetTableProperties()); job_context->job_id, flush_job.GetTableProperties());
auto sfm =
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
if (sfm) {
// Notify sst_file_manager that a new file was added
std::string file_path = MakeTableFileName(db_options_.db_paths[0].path,
file_meta.fd.GetNumber());
sfm->OnAddFile(file_path);
}
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
return s; return s;
@ -5406,6 +5414,25 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
} }
impl->mutex_.Unlock(); impl->mutex_.Unlock();
auto sfm = static_cast<SstFileManagerImpl*>(
impl->db_options_.sst_file_manager.get());
if (s.ok() && sfm) {
// Notify SstFileManager about all sst files that already exist in
// db_paths[0] when the DB is opened.
auto& db_path = impl->db_options_.db_paths[0];
std::vector<std::string> existing_files;
impl->db_options_.env->GetChildren(db_path.path, &existing_files);
for (auto& file_name : existing_files) {
uint64_t file_number;
FileType file_type;
std::string file_path = db_path.path + "/" + file_name;
if (ParseFileName(file_name, &file_number, &file_type) &&
file_type == kTableFile) {
sfm->OnAddFile(file_path);
}
}
}
if (s.ok()) { if (s.ok()) {
Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p", Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p",
impl); impl);
@ -5465,7 +5492,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
if (type == kMetaDatabase) { if (type == kMetaDatabase) {
del = DestroyDB(path_to_delete, options); del = DestroyDB(path_to_delete, options);
} else if (type == kTableFile) { } else if (type == kTableFile) {
del = DeleteOrMoveToTrash(&options, path_to_delete); del = DeleteSSTFile(&options, path_to_delete, 0);
} else { } else {
del = env->DeleteFile(path_to_delete); del = env->DeleteFile(path_to_delete);
} }
@ -5481,13 +5508,9 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
for (size_t i = 0; i < filenames.size(); i++) { for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && if (ParseFileName(filenames[i], &number, &type) &&
type == kTableFile) { // Lock file will be deleted at end type == kTableFile) { // Lock file will be deleted at end
Status del;
std::string table_path = db_path.path + "/" + filenames[i]; std::string table_path = db_path.path + "/" + filenames[i];
if (path_id == 0) { Status del = DeleteSSTFile(&options, table_path,
del = DeleteOrMoveToTrash(&options, table_path); static_cast<uint32_t>(path_id));
} else {
del = env->DeleteFile(table_path);
}
if (result.ok() && !del.ok()) { if (result.ok() && !del.ok()) {
result = del; result = del;
} }

@ -37,9 +37,9 @@
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/convenience.h" #include "rocksdb/convenience.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/delete_scheduler.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/experimental.h" #include "rocksdb/experimental.h"
#include "rocksdb/sst_file_manager.h"
#include "rocksdb/filter_policy.h" #include "rocksdb/filter_policy.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/perf_context.h" #include "rocksdb/perf_context.h"
@ -65,9 +65,10 @@
#include "util/compression.h" #include "util/compression.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/rate_limiter.h" #include "util/rate_limiter.h"
#include "util/sst_file_manager_impl.h"
#include "util/statistics.h" #include "util/statistics.h"
#include "util/testharness.h"
#include "util/sync_point.h" #include "util/sync_point.h"
#include "util/testharness.h"
#include "util/testutil.h" #include "util/testutil.h"
#include "util/mock_env.h" #include "util/mock_env.h"
#include "util/string_util.h" #include "util/string_util.h"
@ -8431,15 +8432,78 @@ TEST_F(DBTest, DeletingOldWalAfterDrop) {
} }
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
TEST_F(DBTest, DBWithSstFileManager) {
std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
int files_added = 0;
int files_deleted = 0;
int files_moved = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnAddFile", [&](void* arg) { files_added++; });
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnDeleteFile", [&](void* arg) { files_deleted++; });
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnMoveFile", [&](void* arg) { files_moved++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.sst_file_manager = sst_file_manager;
DestroyAndReopen(options);
Random rnd(301);
for (int i = 0; i < 25; i++) {
GenerateNewRandomFile(&rnd);
ASSERT_OK(Flush());
dbfull()->TEST_WaitForFlushMemTable();
dbfull()->TEST_WaitForCompact();
// Verify that we are tracking all sst files in dbname_
ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles());
}
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
auto files_in_db = GetAllSSTFiles();
// Verify that we are tracking all sst files in dbname_
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
// Verify the total files size
uint64_t total_files_size = 0;
for (auto& file_to_size : files_in_db) {
total_files_size += file_to_size.second;
}
ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
// We flushed at least 25 files
ASSERT_GE(files_added, 25);
// Compaction must have deleted some files
ASSERT_GT(files_deleted, 0);
// No files were moved
ASSERT_EQ(files_moved, 0);
Close();
Reopen(options);
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
// Verify that we track all the files again after the DB is closed and opened
Close();
sst_file_manager.reset(NewSstFileManager(env_));
options.sst_file_manager = sst_file_manager;
sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
Reopen(options);
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
TEST_F(DBTest, RateLimitedDelete) { TEST_F(DBTest, RateLimitedDelete) {
rocksdb::SyncPoint::GetInstance()->LoadDependency({ rocksdb::SyncPoint::GetInstance()->LoadDependency({
{"DBTest::RateLimitedDelete:1", {"DBTest::RateLimitedDelete:1", "DeleteScheduler::BackgroundEmptyTrash"},
"DeleteSchedulerImpl::BackgroundEmptyTrash"},
}); });
std::vector<uint64_t> penalties; std::vector<uint64_t> penalties;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", "DeleteScheduler::BackgroundEmptyTrash:Wait",
[&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); }); [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->DisableProcessing();
@ -8450,9 +8514,10 @@ TEST_F(DBTest, RateLimitedDelete) {
std::string trash_dir = test::TmpDir(env_) + "/trash"; std::string trash_dir = test::TmpDir(env_) + "/trash";
int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec
Status s; Status s;
options.delete_scheduler.reset(NewDeleteScheduler( options.sst_file_manager.reset(NewSstFileManager(
env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s));
ASSERT_OK(s); ASSERT_OK(s);
auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
Destroy(last_options_); Destroy(last_options_);
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
@ -8479,7 +8544,7 @@ TEST_F(DBTest, RateLimitedDelete) {
uint64_t delete_start_time = env_->NowMicros(); uint64_t delete_start_time = env_->NowMicros();
// Hold BackgroundEmptyTrash // Hold BackgroundEmptyTrash
TEST_SYNC_POINT("DBTest::RateLimitedDelete:1"); TEST_SYNC_POINT("DBTest::RateLimitedDelete:1");
options.delete_scheduler->WaitForEmptyTrash(); sfm->WaitForEmptyTrash();
uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
uint64_t total_files_size = 0; uint64_t total_files_size = 0;
@ -8502,7 +8567,7 @@ TEST_F(DBTest, RateLimitedDelete) {
TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) {
int bg_delete_file = 0; int bg_delete_file = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", "DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* arg) { bg_delete_file++; }); [&](void* arg) { bg_delete_file++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
@ -8515,9 +8580,10 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) {
std::string trash_dir = test::TmpDir(env_) + "/trash"; std::string trash_dir = test::TmpDir(env_) + "/trash";
int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec
Status s; Status s;
options.delete_scheduler.reset(NewDeleteScheduler( options.sst_file_manager.reset(NewSstFileManager(
env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s));
ASSERT_OK(s); ASSERT_OK(s);
auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
DestroyAndReopen(options); DestroyAndReopen(options);
@ -8551,7 +8617,7 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) {
ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
ASSERT_EQ("0,2", FilesPerLevel(0)); ASSERT_EQ("0,2", FilesPerLevel(0));
options.delete_scheduler->WaitForEmptyTrash(); sfm->WaitForEmptyTrash();
ASSERT_EQ(bg_delete_file, 8); ASSERT_EQ(bg_delete_file, 8);
compact_options.bottommost_level_compaction = compact_options.bottommost_level_compaction =
@ -8559,7 +8625,7 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) {
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
ASSERT_EQ("0,1", FilesPerLevel(0)); ASSERT_EQ("0,1", FilesPerLevel(0));
options.delete_scheduler->WaitForEmptyTrash(); sfm->WaitForEmptyTrash();
ASSERT_EQ(bg_delete_file, 8); ASSERT_EQ(bg_delete_file, 8);
rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->DisableProcessing();
@ -8568,7 +8634,7 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) {
TEST_F(DBTest, DestroyDBWithRateLimitedDelete) { TEST_F(DBTest, DestroyDBWithRateLimitedDelete) {
int bg_delete_file = 0; int bg_delete_file = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", "DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* arg) { bg_delete_file++; }); [&](void* arg) { bg_delete_file++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
@ -8590,12 +8656,13 @@ TEST_F(DBTest, DestroyDBWithRateLimitedDelete) {
std::string trash_dir = test::TmpDir(env_) + "/trash"; std::string trash_dir = test::TmpDir(env_) + "/trash";
int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec
Status s; Status s;
options.delete_scheduler.reset(NewDeleteScheduler( options.sst_file_manager.reset(NewSstFileManager(
env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s));
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
options.delete_scheduler->WaitForEmptyTrash(); auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
sfm->WaitForEmptyTrash();
// We have deleted the 4 sst files in the delete_scheduler // We have deleted the 4 sst files in the delete_scheduler
ASSERT_EQ(bg_delete_file, 4); ASSERT_EQ(bg_delete_file, 4);
} }
@ -10073,7 +10140,6 @@ TEST_F(DBTest, WalFilterTestWithChangeBatchExtraKeys) {
ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
class SliceTransformLimitedDomain : public SliceTransform { class SliceTransformLimitedDomain : public SliceTransform {

@ -1005,4 +1005,22 @@ void DBTestBase::CopyFile(const std::string& source,
ASSERT_OK(destfile->Close()); ASSERT_OK(destfile->Close());
} }
std::unordered_map<std::string, uint64_t> DBTestBase::GetAllSSTFiles() {
std::unordered_map<std::string, uint64_t> res;
std::vector<std::string> files;
env_->GetChildren(dbname_, &files);
for (auto& file_name : files) {
uint64_t number;
FileType type;
std::string file_path = dbname_ + "/" + file_name;
if (ParseFileName(file_name, &number, &type) && type == kTableFile) {
uint64_t file_size = 0;
env_->GetFileSize(file_path, &file_size);
res[file_path] = file_size;
}
}
return res;
}
} // namespace rocksdb } // namespace rocksdb

@ -19,6 +19,7 @@
#endif #endif
#include <algorithm> #include <algorithm>
#include <map>
#include <set> #include <set>
#include <string> #include <string>
#include <thread> #include <thread>
@ -750,6 +751,8 @@ class DBTestBase : public testing::Test {
void CopyFile(const std::string& source, const std::string& destination, void CopyFile(const std::string& source, const std::string& destination,
uint64_t size = 0); uint64_t size = 0);
std::unordered_map<std::string, uint64_t> GetAllSSTFiles();
}; };
} // namespace rocksdb } // namespace rocksdb

@ -1,67 +0,0 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <map>
#include <string>
#include <memory>
#include "rocksdb/status.h"
namespace rocksdb {
class Env;
class Logger;
// DeleteScheduler allow the DB to enforce a rate limit on file deletion,
// Instead of deleteing files immediately, files are moved to trash_dir
// and deleted in a background thread that apply sleep penlty between deletes
// if they are happening in a rate faster than rate_bytes_per_sec,
//
// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this
// case DeleteScheduler will delete files immediately.
class DeleteScheduler {
public:
virtual ~DeleteScheduler() {}
// Return delete rate limit in bytes per second
virtual int64_t GetRateBytesPerSecond() = 0;
// Move file to trash directory and schedule it's deletion
virtual Status DeleteFile(const std::string& fname) = 0;
// Return a map containing errors that happened in the background thread
// file_path => error status
virtual std::map<std::string, Status> GetBackgroundErrors() = 0;
// Wait for all files being deleteing in the background to finish or for
// destructor to be called.
virtual void WaitForEmptyTrash() = 0;
};
// Create a new DeleteScheduler that can be shared among multiple RocksDB
// instances to control the file deletion rate.
//
// @env: Pointer to Env object, please see "rocksdb/env.h".
// @trash_dir: Path to the directory where deleted files will be moved into
// to be deleted in a background thread while applying rate limiting. If this
// directory dont exist, it will be created. This directory should not be
// used by any other process or any other DeleteScheduler.
// @rate_bytes_per_sec: How many bytes should be deleted per second, If this
// value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb
// in 1 second, we will wait for another 3 seconds before we delete other
// files, Set to 0 to disable rate limiting.
// @info_log: If not nullptr, info_log will be used to log errors.
// @delete_exisitng_trash: If set to true, the newly created DeleteScheduler
// will delete files that already exist in trash_dir.
// @status: If not nullptr, status will contain any errors that happened during
// creating the missing trash_dir or deleting existing files in trash.
extern DeleteScheduler* NewDeleteScheduler(
Env* env, const std::string& trash_dir, int64_t rate_bytes_per_sec,
std::shared_ptr<Logger> info_log = nullptr,
bool delete_exisitng_trash = true, Status* status = nullptr);
} // namespace rocksdb

@ -33,6 +33,7 @@ class CompactionFilterFactory;
class Comparator; class Comparator;
class Env; class Env;
enum InfoLogLevel : unsigned char; enum InfoLogLevel : unsigned char;
class SstFileManager;
class FilterPolicy; class FilterPolicy;
class Logger; class Logger;
class MergeOperator; class MergeOperator;
@ -41,7 +42,6 @@ class TableFactory;
class MemTableRepFactory; class MemTableRepFactory;
class TablePropertiesCollectorFactory; class TablePropertiesCollectorFactory;
class RateLimiter; class RateLimiter;
class DeleteScheduler;
class Slice; class Slice;
class SliceTransform; class SliceTransform;
class Statistics; class Statistics;
@ -830,12 +830,12 @@ struct DBOptions {
// Default: nullptr // Default: nullptr
std::shared_ptr<RateLimiter> rate_limiter; std::shared_ptr<RateLimiter> rate_limiter;
// Use to control files deletion rate, can be used among multiple // Use to track SST files and control their file deletion rate, can be used
// RocksDB instances. delete_scheduler is only used to delete table files that // among multiple RocksDB instances, sst_file_manager only track and throttle
// need to be deleted from the first db_path (db_name if db_paths is empty), // deletes of SST files in first db_path (db_name if db_paths is empty), other
// other files types and other db_paths wont be affected by delete_scheduler. // files and other db_paths wont be tracked or affected by sst_file_manager.
// Default: nullptr (disabled) // Default: nullptr
std::shared_ptr<DeleteScheduler> delete_scheduler; std::shared_ptr<SstFileManager> sst_file_manager;
// Any internal progress/error information generated by the db will // Any internal progress/error information generated by the db will
// be written to info_log if it is non-nullptr, or to a file stored // be written to info_log if it is non-nullptr, or to a file stored

@ -0,0 +1,64 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "rocksdb/status.h"
namespace rocksdb {
class Env;
class Logger;
// SstFileManager is used to track SST files in the DB and control there
// deletion rate.
// All SstFileManager public functions are thread-safe.
class SstFileManager {
public:
virtual ~SstFileManager() {}
// Return the total size of all tracked files.
// thread-safe
virtual uint64_t GetTotalSize() = 0;
// Return a map containing all tracked files and there corresponding sizes.
// thread-safe
virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
// Return delete rate limit in bytes per second.
// thread-safe
virtual int64_t GetDeleteRateBytesPerSecond() = 0;
};
// Create a new SstFileManager that can be shared among multiple RocksDB
// instances to track SST file and control there deletion rate.
//
// @param env: Pointer to Env object, please see "rocksdb/env.h".
// @param info_log: If not nullptr, info_log will be used to log errors.
//
// == Deletion rate limiting specific arguments ==
// @param trash_dir: Path to the directory where deleted files will be moved
// to be deleted in a background thread while applying rate limiting. If this
// directory dont exist, it will be created. This directory should not be
// used by any other process or any other SstFileManager, Set to "" to
// disable deletion rate limiting.
// @param rate_bytes_per_sec: How many bytes should be deleted per second, If
// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb
// in 1 second, we will wait for another 3 seconds before we delete other
// files, Set to 0 to disable deletion rate limiting.
// @param delete_exisitng_trash: If set to true, the newly created
// SstFileManager will delete files that already exist in trash_dir.
// @param status: If not nullptr, status will contain any errors that happened
// during creating the missing trash_dir or deleting existing files in trash.
extern SstFileManager* NewSstFileManager(
Env* env, std::shared_ptr<Logger> info_log = nullptr,
std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
bool delete_exisitng_trash = true, Status* status = nullptr);
} // namespace rocksdb

@ -94,13 +94,14 @@ LIB_SOURCES = \
util/compaction_job_stats_impl.cc \ util/compaction_job_stats_impl.cc \
util/concurrent_arena.cc \ util/concurrent_arena.cc \
util/crc32c.cc \ util/crc32c.cc \
util/delete_scheduler_impl.cc \ util/delete_scheduler.cc \
util/dynamic_bloom.cc \ util/dynamic_bloom.cc \
util/env.cc \ util/env.cc \
util/env_hdfs.cc \ util/env_hdfs.cc \
util/env_posix.cc \ util/env_posix.cc \
util/io_posix.cc \ util/io_posix.cc \
util/thread_posix.cc \ util/thread_posix.cc \
util/sst_file_manager_impl.cc \
util/file_util.cc \ util/file_util.cc \
util/file_reader_writer.cc \ util/file_reader_writer.cc \
util/filter_policy.cc \ util/filter_policy.cc \

@ -3,38 +3,40 @@
// LICENSE file in the root directory of this source tree. An additional grant // LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory. // of patent rights can be found in the PATENTS file in the same directory.
#include "util/delete_scheduler_impl.h" #include "util/delete_scheduler.h"
#include <thread> #include <thread>
#include <vector> #include <vector>
#include "port/port.h" #include "port/port.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "util/sst_file_manager_impl.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/sync_point.h" #include "util/sync_point.h"
namespace rocksdb { namespace rocksdb {
DeleteSchedulerImpl::DeleteSchedulerImpl(Env* env, const std::string& trash_dir, DeleteScheduler::DeleteScheduler(Env* env, const std::string& trash_dir,
int64_t rate_bytes_per_sec, int64_t rate_bytes_per_sec, Logger* info_log,
std::shared_ptr<Logger> info_log) SstFileManagerImpl* sst_file_manager)
: env_(env), : env_(env),
trash_dir_(trash_dir), trash_dir_(trash_dir),
rate_bytes_per_sec_(rate_bytes_per_sec), rate_bytes_per_sec_(rate_bytes_per_sec),
pending_files_(0), pending_files_(0),
closing_(false), closing_(false),
cv_(&mu_), cv_(&mu_),
info_log_(info_log) { info_log_(info_log),
if (rate_bytes_per_sec_ == 0) { sst_file_manager_(sst_file_manager) {
if (rate_bytes_per_sec_ <= 0) {
// Rate limiting is disabled // Rate limiting is disabled
bg_thread_.reset(); bg_thread_.reset();
} else { } else {
bg_thread_.reset( bg_thread_.reset(
new std::thread(&DeleteSchedulerImpl::BackgroundEmptyTrash, this)); new std::thread(&DeleteScheduler::BackgroundEmptyTrash, this));
} }
} }
DeleteSchedulerImpl::~DeleteSchedulerImpl() { DeleteScheduler::~DeleteScheduler() {
{ {
MutexLock l(&mu_); MutexLock l(&mu_);
closing_ = true; closing_ = true;
@ -45,20 +47,29 @@ DeleteSchedulerImpl::~DeleteSchedulerImpl() {
} }
} }
Status DeleteSchedulerImpl::DeleteFile(const std::string& file_path) { Status DeleteScheduler::DeleteFile(const std::string& file_path) {
if (rate_bytes_per_sec_ == 0) { Status s;
if (rate_bytes_per_sec_ <= 0) {
// Rate limiting is disabled // Rate limiting is disabled
return env_->DeleteFile(file_path); s = env_->DeleteFile(file_path);
if (s.ok() && sst_file_manager_) {
sst_file_manager_->OnDeleteFile(file_path);
}
return s;
} }
// Move file to trash // Move file to trash
std::string path_in_trash; std::string path_in_trash;
Status s = MoveToTrash(file_path, &path_in_trash); s = MoveToTrash(file_path, &path_in_trash);
if (!s.ok()) { if (!s.ok()) {
Log(InfoLogLevel::ERROR_LEVEL, info_log_, Log(InfoLogLevel::ERROR_LEVEL, info_log_,
"Failed to move %s to trash directory (%s)", file_path.c_str(), "Failed to move %s to trash directory (%s)", file_path.c_str(),
trash_dir_.c_str()); trash_dir_.c_str());
return env_->DeleteFile(file_path); s = env_->DeleteFile(file_path);
if (s.ok() && sst_file_manager_) {
sst_file_manager_->OnDeleteFile(file_path);
}
return s;
} }
// Add file to delete queue // Add file to delete queue
@ -73,13 +84,13 @@ Status DeleteSchedulerImpl::DeleteFile(const std::string& file_path) {
return s; return s;
} }
std::map<std::string, Status> DeleteSchedulerImpl::GetBackgroundErrors() { std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() {
MutexLock l(&mu_); MutexLock l(&mu_);
return bg_errors_; return bg_errors_;
} }
Status DeleteSchedulerImpl::MoveToTrash(const std::string& file_path, Status DeleteScheduler::MoveToTrash(const std::string& file_path,
std::string* path_in_trash) { std::string* path_in_trash) {
Status s; Status s;
// Figure out the name of the file in trash folder // Figure out the name of the file in trash folder
size_t idx = file_path.rfind("/"); size_t idx = file_path.rfind("/");
@ -112,11 +123,14 @@ Status DeleteSchedulerImpl::MoveToTrash(const std::string& file_path,
break; break;
} }
} }
if (s.ok() && sst_file_manager_) {
sst_file_manager_->OnMoveFile(file_path, *path_in_trash);
}
return s; return s;
} }
void DeleteSchedulerImpl::BackgroundEmptyTrash() { void DeleteScheduler::BackgroundEmptyTrash() {
TEST_SYNC_POINT("DeleteSchedulerImpl::BackgroundEmptyTrash"); TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
while (true) { while (true) {
MutexLock l(&mu_); MutexLock l(&mu_);
@ -151,7 +165,7 @@ void DeleteSchedulerImpl::BackgroundEmptyTrash() {
uint64_t total_penlty = uint64_t total_penlty =
((total_deleted_bytes * kMicrosInSecond) / rate_bytes_per_sec_); ((total_deleted_bytes * kMicrosInSecond) / rate_bytes_per_sec_);
while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {} while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {}
TEST_SYNC_POINT_CALLBACK("DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
&total_penlty); &total_penlty);
pending_files_--; pending_files_--;
@ -164,12 +178,12 @@ void DeleteSchedulerImpl::BackgroundEmptyTrash() {
} }
} }
Status DeleteSchedulerImpl::DeleteTrashFile(const std::string& path_in_trash, Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
uint64_t* deleted_bytes) { uint64_t* deleted_bytes) {
uint64_t file_size; uint64_t file_size;
Status s = env_->GetFileSize(path_in_trash, &file_size); Status s = env_->GetFileSize(path_in_trash, &file_size);
if (s.ok()) { if (s.ok()) {
TEST_SYNC_POINT("DeleteSchedulerImpl::DeleteTrashFile:DeleteFile"); TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
s = env_->DeleteFile(path_in_trash); s = env_->DeleteFile(path_in_trash);
} }
@ -181,51 +195,19 @@ Status DeleteSchedulerImpl::DeleteTrashFile(const std::string& path_in_trash,
*deleted_bytes = 0; *deleted_bytes = 0;
} else { } else {
*deleted_bytes = file_size; *deleted_bytes = file_size;
if (sst_file_manager_) {
sst_file_manager_->OnDeleteFile(path_in_trash);
}
} }
return s; return s;
} }
void DeleteSchedulerImpl::WaitForEmptyTrash() { void DeleteScheduler::WaitForEmptyTrash() {
MutexLock l(&mu_); MutexLock l(&mu_);
while (pending_files_ > 0 && !closing_) { while (pending_files_ > 0 && !closing_) {
cv_.Wait(); cv_.Wait();
} }
} }
DeleteScheduler* NewDeleteScheduler(Env* env, const std::string& trash_dir,
int64_t rate_bytes_per_sec,
std::shared_ptr<Logger> info_log,
bool delete_exisitng_trash,
Status* status) {
DeleteScheduler* res =
new DeleteSchedulerImpl(env, trash_dir, rate_bytes_per_sec, info_log);
Status s;
if (trash_dir != "") {
s = env->CreateDirIfMissing(trash_dir);
if (s.ok() && delete_exisitng_trash) {
std::vector<std::string> files_in_trash;
s = env->GetChildren(trash_dir, &files_in_trash);
if (s.ok()) {
for (const std::string& trash_file : files_in_trash) {
if (trash_file == "." || trash_file == "..") {
continue;
}
Status file_delete = res->DeleteFile(trash_dir + "/" + trash_file);
if (s.ok() && !file_delete.ok()) {
s = file_delete;
}
}
}
}
}
if (status) {
*status = s;
}
return res;
}
} // namespace rocksdb } // namespace rocksdb

@ -12,21 +12,28 @@
#include "port/port.h" #include "port/port.h"
#include "rocksdb/delete_scheduler.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
namespace rocksdb { namespace rocksdb {
class Env; class Env;
class Logger; class Logger;
class SstFileManagerImpl;
class DeleteSchedulerImpl : public DeleteScheduler {
// DeleteScheduler allows the DB to enforce a rate limit on file deletion,
// Instead of deleteing files immediately, files are moved to trash_dir
// and deleted in a background thread that apply sleep penlty between deletes
// if they are happening in a rate faster than rate_bytes_per_sec,
//
// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this
// case DeleteScheduler will delete files immediately.
class DeleteScheduler {
public: public:
DeleteSchedulerImpl(Env* env, const std::string& trash_dir, DeleteScheduler(Env* env, const std::string& trash_dir,
int64_t rate_bytes_per_sec, int64_t rate_bytes_per_sec, Logger* info_log,
std::shared_ptr<Logger> info_log); SstFileManagerImpl* sst_file_manager);
~DeleteSchedulerImpl(); ~DeleteScheduler();
// Return delete rate limit in bytes per second // Return delete rate limit in bytes per second
int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_; } int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_; }
@ -63,7 +70,7 @@ class DeleteSchedulerImpl : public DeleteScheduler {
int32_t pending_files_; int32_t pending_files_;
// Errors that happened in BackgroundEmptyTrash (file_path => error) // Errors that happened in BackgroundEmptyTrash (file_path => error)
std::map<std::string, Status> bg_errors_; std::map<std::string, Status> bg_errors_;
// Set to true in ~DeleteSchedulerImpl() to force BackgroundEmptyTrash to stop // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
bool closing_; bool closing_;
// Condition variable signaled in these conditions // Condition variable signaled in these conditions
// - pending_files_ value change from 0 => 1 // - pending_files_ value change from 0 => 1
@ -74,7 +81,8 @@ class DeleteSchedulerImpl : public DeleteScheduler {
std::unique_ptr<std::thread> bg_thread_; std::unique_ptr<std::thread> bg_thread_;
// Mutex to protect threads from file name conflicts // Mutex to protect threads from file name conflicts
port::Mutex file_move_mu_; port::Mutex file_move_mu_;
std::shared_ptr<Logger> info_log_; Logger* info_log_;
SstFileManagerImpl* sst_file_manager_;
static const uint64_t kMicrosInSecond = 1000 * 1000LL; static const uint64_t kMicrosInSecond = 1000 * 1000LL;
}; };

@ -12,9 +12,9 @@
#include <thread> #include <thread>
#include <vector> #include <vector>
#include "rocksdb/delete_scheduler.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "util/delete_scheduler.h"
#include "util/string_util.h" #include "util/string_util.h"
#include "util/sync_point.h" #include "util/sync_point.h"
#include "util/testharness.h" #include "util/testharness.h"
@ -74,6 +74,12 @@ class DeleteSchedulerTest : public testing::Test {
return file_path; return file_path;
} }
void NewDeleteScheduler() {
ASSERT_OK(env_->CreateDirIfMissing(trash_dir_));
delete_scheduler_.reset(new DeleteScheduler(
env_, trash_dir_, rate_bytes_per_sec_, nullptr, nullptr));
}
Env* env_; Env* env_;
std::string dummy_files_dir_; std::string dummy_files_dir_;
std::string trash_dir_; std::string trash_dir_;
@ -84,19 +90,19 @@ class DeleteSchedulerTest : public testing::Test {
// Test the basic functionality of DeleteScheduler (Rate Limiting). // Test the basic functionality of DeleteScheduler (Rate Limiting).
// 1- Create 100 dummy files // 1- Create 100 dummy files
// 2- Delete the 100 dummy files using DeleteScheduler // 2- Delete the 100 dummy files using DeleteScheduler
// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- // --- Hold DeleteScheduler::BackgroundEmptyTrash ---
// 3- Wait for DeleteScheduler to delete all files in trash // 3- Wait for DeleteScheduler to delete all files in trash
// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files // 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
// 5- Make sure that all created files were completely deleted // 5- Make sure that all created files were completely deleted
TEST_F(DeleteSchedulerTest, BasicRateLimiting) { TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
rocksdb::SyncPoint::GetInstance()->LoadDependency({ rocksdb::SyncPoint::GetInstance()->LoadDependency({
{"DeleteSchedulerTest::BasicRateLimiting:1", {"DeleteSchedulerTest::BasicRateLimiting:1",
"DeleteSchedulerImpl::BackgroundEmptyTrash"}, "DeleteScheduler::BackgroundEmptyTrash"},
}); });
std::vector<uint64_t> penalties; std::vector<uint64_t> penalties;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", "DeleteScheduler::BackgroundEmptyTrash:Wait",
[&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); }); [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
int num_files = 100; // 100 files int num_files = 100; // 100 files
@ -110,8 +116,7 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
DestroyAndCreateDir(dummy_files_dir_); DestroyAndCreateDir(dummy_files_dir_);
rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
// Create 100 dummy files, every file is 1 Kb // Create 100 dummy files, every file is 1 Kb
std::vector<std::string> generated_files; std::vector<std::string> generated_files;
@ -152,19 +157,19 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
// Same as the BasicRateLimiting test but delete files in multiple threads. // Same as the BasicRateLimiting test but delete files in multiple threads.
// 1- Create 100 dummy files // 1- Create 100 dummy files
// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads // 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- // --- Hold DeleteScheduler::BackgroundEmptyTrash ---
// 3- Wait for DeleteScheduler to delete all files in queue // 3- Wait for DeleteScheduler to delete all files in queue
// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files // 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
// 5- Make sure that all created files were completely deleted // 5- Make sure that all created files were completely deleted
TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
rocksdb::SyncPoint::GetInstance()->LoadDependency({ rocksdb::SyncPoint::GetInstance()->LoadDependency({
{"DeleteSchedulerTest::RateLimitingMultiThreaded:1", {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
"DeleteSchedulerImpl::BackgroundEmptyTrash"}, "DeleteScheduler::BackgroundEmptyTrash"},
}); });
std::vector<uint64_t> penalties; std::vector<uint64_t> penalties;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", "DeleteScheduler::BackgroundEmptyTrash:Wait",
[&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); }); [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
int thread_cnt = 10; int thread_cnt = 10;
@ -179,8 +184,7 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
DestroyAndCreateDir(dummy_files_dir_); DestroyAndCreateDir(dummy_files_dir_);
rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
// Create 100 dummy files, every file is 1 Kb // Create 100 dummy files, every file is 1 Kb
std::vector<std::string> generated_files; std::vector<std::string> generated_files;
@ -239,12 +243,13 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
TEST_F(DeleteSchedulerTest, DisableRateLimiting) { TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
int bg_delete_file = 0; int bg_delete_file = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", "DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* arg) { bg_delete_file++; }); [&](void* arg) { bg_delete_file++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
delete_scheduler_.reset(NewDeleteScheduler(env_, "", 0)); rate_bytes_per_sec_ = 0;
NewDeleteScheduler();
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
// Every file we delete will be deleted immediately // Every file we delete will be deleted immediately
@ -264,18 +269,17 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
// 1- Create 10 files with the same name "conflict.data" // 1- Create 10 files with the same name "conflict.data"
// 2- Delete the 10 files using DeleteScheduler // 2- Delete the 10 files using DeleteScheduler
// 3- Make sure that trash directory contain 10 files ("conflict.data" x 10) // 3- Make sure that trash directory contain 10 files ("conflict.data" x 10)
// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- // --- Hold DeleteScheduler::BackgroundEmptyTrash ---
// 4- Make sure that files are deleted from trash // 4- Make sure that files are deleted from trash
TEST_F(DeleteSchedulerTest, ConflictNames) { TEST_F(DeleteSchedulerTest, ConflictNames) {
rocksdb::SyncPoint::GetInstance()->LoadDependency({ rocksdb::SyncPoint::GetInstance()->LoadDependency({
{"DeleteSchedulerTest::ConflictNames:1", {"DeleteSchedulerTest::ConflictNames:1",
"DeleteSchedulerImpl::BackgroundEmptyTrash"}, "DeleteScheduler::BackgroundEmptyTrash"},
}); });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
// Create "conflict.data" and move it to trash 10 times // Create "conflict.data" and move it to trash 10 times
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
@ -300,19 +304,18 @@ TEST_F(DeleteSchedulerTest, ConflictNames) {
// 1- Create 10 dummy files // 1- Create 10 dummy files
// 2- Delete the 10 files using DeleteScheduler (move them to trsah) // 2- Delete the 10 files using DeleteScheduler (move them to trsah)
// 3- Delete the 10 files directly (using env_->DeleteFile) // 3- Delete the 10 files directly (using env_->DeleteFile)
// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- // --- Hold DeleteScheduler::BackgroundEmptyTrash ---
// 4- Make sure that DeleteScheduler failed to delete the 10 files and // 4- Make sure that DeleteScheduler failed to delete the 10 files and
// reported 10 background errors // reported 10 background errors
TEST_F(DeleteSchedulerTest, BackgroundError) { TEST_F(DeleteSchedulerTest, BackgroundError) {
rocksdb::SyncPoint::GetInstance()->LoadDependency({ rocksdb::SyncPoint::GetInstance()->LoadDependency({
{"DeleteSchedulerTest::BackgroundError:1", {"DeleteSchedulerTest::BackgroundError:1",
"DeleteSchedulerImpl::BackgroundEmptyTrash"}, "DeleteScheduler::BackgroundEmptyTrash"},
}); });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
// Generate 10 dummy files and move them to trash // Generate 10 dummy files and move them to trash
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
@ -339,32 +342,6 @@ TEST_F(DeleteSchedulerTest, BackgroundError) {
rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->DisableProcessing();
} }
// 1- Create 10 files in trash
// 2- Create a DeleteScheduler with delete_exisitng_trash = true
// 3- Wait for DeleteScheduler to delete all files in queue
// 4- Make sure that all files in trash directory were deleted
TEST_F(DeleteSchedulerTest, TrashWithExistingFiles) {
std::vector<std::string> dummy_files;
for (int i = 0; i < 10; i++) {
std::string file_name = "data_" + ToString(i) + ".data";
std::string trash_path = trash_dir_ + "/" + file_name;
env_->RenameFile(NewDummyFile(file_name), trash_path);
}
ASSERT_EQ(CountFilesInDir(trash_dir_), 10);
Status s;
rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
delete_scheduler_.reset(NewDeleteScheduler(
env_, trash_dir_, rate_bytes_per_sec_, nullptr, true, &s));
ASSERT_OK(s);
delete_scheduler_->WaitForEmptyTrash();
ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
auto bg_errors = delete_scheduler_->GetBackgroundErrors();
ASSERT_EQ(bg_errors.size(), 0);
}
// 1- Create 10 dummy files // 1- Create 10 dummy files
// 2- Delete 10 dummy files using DeleteScheduler // 2- Delete 10 dummy files using DeleteScheduler
// 3- Wait for DeleteScheduler to delete all files in queue // 3- Wait for DeleteScheduler to delete all files in queue
@ -373,13 +350,12 @@ TEST_F(DeleteSchedulerTest, TrashWithExistingFiles) {
TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) { TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
int bg_delete_file = 0; int bg_delete_file = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", "DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* arg) { bg_delete_file++; }); [&](void* arg) { bg_delete_file++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
// Move files to trash, wait for empty trash, start again // Move files to trash, wait for empty trash, start again
for (int run = 1; run <= 5; run++) { for (int run = 1; run <= 5; run++) {
@ -409,13 +385,12 @@ TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) { TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
int bg_delete_file = 0; int bg_delete_file = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", "DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* arg) { bg_delete_file++; }); [&](void* arg) { bg_delete_file++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rate_bytes_per_sec_ = 1; // 1 Byte / sec rate_bytes_per_sec_ = 1; // 1 Byte / sec
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
for (int i = 0; i < 100; i++) { for (int i = 0; i < 100; i++) {
std::string file_name = "data_" + ToString(i) + ".data"; std::string file_name = "data_" + ToString(i) + ".data";
@ -439,13 +414,12 @@ TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
TEST_F(DeleteSchedulerTest, MoveToTrashError) { TEST_F(DeleteSchedulerTest, MoveToTrashError) {
int bg_delete_file = 0; int bg_delete_file = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack( rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", "DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* arg) { bg_delete_file++; }); [&](void* arg) { bg_delete_file++; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rate_bytes_per_sec_ = 1024; // 1 Kb / sec rate_bytes_per_sec_ = 1024; // 1 Kb / sec
delete_scheduler_.reset( NewDeleteScheduler();
NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
// We will delete the trash directory, that mean that DeleteScheduler wont // We will delete the trash directory, that mean that DeleteScheduler wont
// be able to move files to trash and will delete files them immediately. // be able to move files to trash and will delete files them immediately.
@ -460,7 +434,6 @@ TEST_F(DeleteSchedulerTest, MoveToTrashError) {
rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->DisableProcessing();
} }
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -8,9 +8,9 @@
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include "rocksdb/delete_scheduler.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "util/sst_file_manager_impl.h"
#include "util/file_reader_writer.h" #include "util/file_reader_writer.h"
namespace rocksdb { namespace rocksdb {
@ -66,12 +66,15 @@ Status CopyFile(Env* env, const std::string& source,
return Status::OK(); return Status::OK();
} }
Status DeleteOrMoveToTrash(const DBOptions* db_options, Status DeleteSSTFile(const DBOptions* db_options, const std::string& fname,
const std::string& fname) { uint32_t path_id) {
if (db_options->delete_scheduler == nullptr) { // TODO(tec): support sst_file_manager for multiple path_ids
return db_options->env->DeleteFile(fname); auto sfm =
static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
if (sfm && path_id == 0) {
return sfm->ScheduleFileDeletion(fname);
} else { } else {
return db_options->delete_scheduler->DeleteFile(fname); return db_options->env->DeleteFile(fname);
} }
} }

@ -16,7 +16,7 @@ namespace rocksdb {
extern Status CopyFile(Env* env, const std::string& source, extern Status CopyFile(Env* env, const std::string& source,
const std::string& destination, uint64_t size = 0); const std::string& destination, uint64_t size = 0);
extern Status DeleteOrMoveToTrash(const DBOptions* db_options, extern Status DeleteSSTFile(const DBOptions* db_options,
const std::string& fname); const std::string& fname, uint32_t path_id);
} // namespace rocksdb } // namespace rocksdb

@ -20,8 +20,8 @@
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/comparator.h" #include "rocksdb/comparator.h"
#include "rocksdb/delete_scheduler.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/sst_file_manager.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
@ -213,7 +213,7 @@ DBOptions::DBOptions()
paranoid_checks(true), paranoid_checks(true),
env(Env::Default()), env(Env::Default()),
rate_limiter(nullptr), rate_limiter(nullptr),
delete_scheduler(nullptr), sst_file_manager(nullptr),
info_log(nullptr), info_log(nullptr),
#ifdef NDEBUG #ifdef NDEBUG
info_log_level(INFO_LEVEL), info_log_level(INFO_LEVEL),
@ -281,7 +281,7 @@ DBOptions::DBOptions(const Options& options)
paranoid_checks(options.paranoid_checks), paranoid_checks(options.paranoid_checks),
env(options.env), env(options.env),
rate_limiter(options.rate_limiter), rate_limiter(options.rate_limiter),
delete_scheduler(options.delete_scheduler), sst_file_manager(options.sst_file_manager),
info_log(options.info_log), info_log(options.info_log),
info_log_level(options.info_log_level), info_log_level(options.info_log_level),
max_open_files(options.max_open_files), max_open_files(options.max_open_files),
@ -433,8 +433,9 @@ void DBOptions::Dump(Logger* log) const {
use_adaptive_mutex); use_adaptive_mutex);
Header(log, " Options.rate_limiter: %p", Header(log, " Options.rate_limiter: %p",
rate_limiter.get()); rate_limiter.get());
Header(log, " Options.delete_scheduler.rate_bytes_per_sec: %" PRIi64, Header(
delete_scheduler ? delete_scheduler->GetRateBytesPerSecond() : 0); log, " Options.sst_file_manager.rate_bytes_per_sec: %" PRIi64,
sst_file_manager ? sst_file_manager->GetDeleteRateBytesPerSecond() : 0);
Header(log, " Options.bytes_per_sync: %" PRIu64, Header(log, " Options.bytes_per_sync: %" PRIu64,
bytes_per_sync); bytes_per_sync);
Header(log, " Options.wal_bytes_per_sync: %" PRIu64, Header(log, " Options.wal_bytes_per_sync: %" PRIu64,

@ -1609,8 +1609,8 @@ TEST_F(OptionsParserTest, DBOptionsAllFieldsSettable) {
{offsetof(struct DBOptions, env), sizeof(Env*)}, {offsetof(struct DBOptions, env), sizeof(Env*)},
{offsetof(struct DBOptions, rate_limiter), {offsetof(struct DBOptions, rate_limiter),
sizeof(std::shared_ptr<RateLimiter>)}, sizeof(std::shared_ptr<RateLimiter>)},
{offsetof(struct DBOptions, delete_scheduler), {offsetof(struct DBOptions, sst_file_manager),
sizeof(std::shared_ptr<DeleteScheduler>)}, sizeof(std::shared_ptr<SstFileManager>)},
{offsetof(struct DBOptions, info_log), sizeof(std::shared_ptr<Logger>)}, {offsetof(struct DBOptions, info_log), sizeof(std::shared_ptr<Logger>)},
{offsetof(struct DBOptions, statistics), {offsetof(struct DBOptions, statistics),
sizeof(std::shared_ptr<Statistics>)}, sizeof(std::shared_ptr<Statistics>)},

@ -0,0 +1,143 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "util/sst_file_manager_impl.h"
#include <vector>
#include "port/port.h"
#include "rocksdb/env.h"
#include "util/mutexlock.h"
#include "util/sync_point.h"
namespace rocksdb {
SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger,
const std::string& trash_dir,
int64_t rate_bytes_per_sec)
: env_(env),
logger_(logger),
total_files_size_(0),
delete_scheduler_(env, trash_dir, rate_bytes_per_sec, logger.get(),
this) {}
SstFileManagerImpl::~SstFileManagerImpl() {}
Status SstFileManagerImpl::OnAddFile(const std::string& file_path) {
uint64_t file_size;
Status s = env_->GetFileSize(file_path, &file_size);
if (s.ok()) {
MutexLock l(&mu_);
OnAddFileImpl(file_path, file_size);
}
TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
return s;
}
Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
{
MutexLock l(&mu_);
OnDeleteFileImpl(file_path);
}
TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile");
return Status::OK();
}
Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
const std::string& new_path) {
{
MutexLock l(&mu_);
OnAddFileImpl(new_path, tracked_files_[old_path]);
OnDeleteFileImpl(old_path);
}
TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
return Status::OK();
}
uint64_t SstFileManagerImpl::GetTotalSize() {
MutexLock l(&mu_);
return total_files_size_;
}
std::unordered_map<std::string, uint64_t>
SstFileManagerImpl::GetTrackedFiles() {
MutexLock l(&mu_);
return tracked_files_;
}
int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
return delete_scheduler_.GetRateBytesPerSecond();
}
Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path) {
return delete_scheduler_.DeleteFile(file_path);
}
void SstFileManagerImpl::WaitForEmptyTrash() {
delete_scheduler_.WaitForEmptyTrash();
}
void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
uint64_t file_size) {
auto tracked_file = tracked_files_.find(file_path);
if (tracked_file != tracked_files_.end()) {
// File was added before, we will just update the size
total_files_size_ -= tracked_file->second;
total_files_size_ += file_size;
} else {
total_files_size_ += file_size;
}
tracked_files_[file_path] = file_size;
}
void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
auto tracked_file = tracked_files_.find(file_path);
if (tracked_file == tracked_files_.end()) {
// File is not tracked
return;
}
total_files_size_ -= tracked_file->second;
tracked_files_.erase(tracked_file);
}
SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
std::string trash_dir,
int64_t rate_bytes_per_sec,
bool delete_exisitng_trash, Status* status) {
SstFileManagerImpl* res =
new SstFileManagerImpl(env, info_log, trash_dir, rate_bytes_per_sec);
Status s;
if (trash_dir != "" && rate_bytes_per_sec > 0) {
s = env->CreateDirIfMissing(trash_dir);
if (s.ok() && delete_exisitng_trash) {
std::vector<std::string> files_in_trash;
s = env->GetChildren(trash_dir, &files_in_trash);
if (s.ok()) {
for (const std::string& trash_file : files_in_trash) {
if (trash_file == "." || trash_file == "..") {
continue;
}
std::string path_in_trash = trash_dir + "/" + trash_file;
res->OnAddFile(path_in_trash);
Status file_delete = res->ScheduleFileDeletion(path_in_trash);
if (s.ok() && !file_delete.ok()) {
s = file_delete;
}
}
}
}
}
if (status) {
*status = s;
}
return res;
}
} // namespace rocksdb

@ -0,0 +1,77 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
#include "port/port.h"
#include "rocksdb/sst_file_manager.h"
#include "util/delete_scheduler.h"
namespace rocksdb {
class Env;
class Logger;
// SstFileManager is used to track SST files in the DB and control there
// deletion rate.
// All SstFileManager public functions are thread-safe.
class SstFileManagerImpl : public SstFileManager {
public:
explicit SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger,
const std::string& trash_dir,
int64_t rate_bytes_per_sec);
~SstFileManagerImpl();
// DB will call OnAddFile whenever a new sst file is added.
Status OnAddFile(const std::string& file_path);
// DB will call OnDeleteFile whenever an sst file is deleted.
Status OnDeleteFile(const std::string& file_path);
// DB will call OnMoveFile whenever an sst file is move to a new path.
Status OnMoveFile(const std::string& old_path, const std::string& new_path);
// Return the total size of all tracked files.
uint64_t GetTotalSize() override;
// Return a map containing all tracked files and there corresponding sizes.
std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
// Return delete rate limit in bytes per second.
virtual int64_t GetDeleteRateBytesPerSecond() override;
// Move file to trash directory and schedule it's deletion.
virtual Status ScheduleFileDeletion(const std::string& file_path);
// Wait for all files being deleteing in the background to finish or for
// destructor to be called.
virtual void WaitForEmptyTrash();
private:
// REQUIRES: mutex locked
void OnAddFileImpl(const std::string& file_path, uint64_t file_size);
// REQUIRES: mutex locked
void OnDeleteFileImpl(const std::string& file_path);
Env* env_;
std::shared_ptr<Logger> logger_;
// Mutex to protect tracked_files_, total_files_size_
port::Mutex mu_;
// The summation of the sizes of all files in tracked_files_ map
uint64_t total_files_size_;
// A map containing all tracked files and there sizes
// file_path => file_size
std::unordered_map<std::string, uint64_t> tracked_files_;
// DeleteScheduler used to throttle file deletition, if SstFileManagerImpl was
// created with rate_bytes_per_sec == 0 or trash_dir == "", delete_scheduler_
// rate limiting will be disabled and will simply delete the files.
DeleteScheduler delete_scheduler_;
};
} // namespace rocksdb
Loading…
Cancel
Save