Make it possible to force the garbage collection of the oldest blob files (#8994)

Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation where few or no compactions get scheduled for the SST files that
contain references to the oldest blob files, which can lead to increased space
amplification due to the lack of GC.

In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves are cleaned up (since
*all* SST files that rely on them get compacted away).
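
For reference, enabling the feature from application code might look something
like the sketch below. The database path and the specific cutoff/threshold
values are purely illustrative and not part of this patch.

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Integrated BlobDB settings (values chosen arbitrarily for illustration).
  options.enable_blob_files = true;
  options.enable_blob_garbage_collection = true;
  options.blob_garbage_collection_age_cutoff = 0.25;

  // New option introduced by this patch: force GC of the oldest batch of
  // blob files once its overall garbage ratio reaches 90%.
  options.blob_garbage_collection_force_threshold = 0.9;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/blobdb_example", &db);
  assert(s.ok());

  // ... write/read as usual; GC happens as part of background compactions ...

  delete db;
  return 0;
}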

These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction, and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need for a "clean cut" boundary, but they never
include any files from any other level.)

This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
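
Since `blob_garbage_collection_force_threshold` is registered as a mutable
column family option, it can also be adjusted on a live database. A minimal
sketch, assuming `db` is an already open `rocksdb::DB*` (the value 0.9 is
arbitrary):

// Tighten the force threshold at runtime via SetOptions().
rocksdb::Status s =
    db->SetOptions({{"blob_garbage_collection_force_threshold", "0.9"}});
assert(s.ok());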

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994

Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.

Reviewed By: riversand963

Differential Revision: D31489850

Pulled By: ltamasi

fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
Branch: main
Commit: 3e1bf771a3 (parent: a282eff3d1)
Author: Levi Tamasi, committed by Facebook GitHub Bot
25 files changed (number of changed lines in parentheses):

  1. db/c.cc (9)
  2. db/c_test.c (7)
  3. db/column_family.cc (19)
  4. db/column_family_test.cc (24)
  5. db/compaction/compaction_job.cc (2)
  6. db/compaction/compaction_picker_level.cc (10)
  7. db/version_set.cc (109)
  8. db/version_set.h (18)
  9. db/version_set_test.cc (215)
  10. db_stress_tool/db_stress_common.h (1)
  11. db_stress_tool/db_stress_gflags.cc (6)
  12. db_stress_tool/db_stress_test_base.cc (11)
  13. include/rocksdb/advanced_options.h (13)
  14. include/rocksdb/c.h (5)
  15. include/rocksdb/listener.h (2)
  16. options/cf_options.cc (7)
  17. options/cf_options.h (4)
  18. options/options.cc (26)
  19. options/options_helper.cc (2)
  20. options/options_settable_test.cc (1)
  21. options/options_test.cc (4)
  22. test_util/testutil.cc (2)
  23. tools/db_bench_tool.cc (8)
  24. tools/db_bench_tool_test.cc (3)
  25. tools/db_crashtest.py (1)

@@ -2751,6 +2751,15 @@ double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
   return opt->rep.blob_garbage_collection_age_cutoff;
 }
 
+void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt,
+                                                 double val) {
+  opt->rep.blob_garbage_collection_force_threshold = val;
+}
+
+double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) {
+  return opt->rep.blob_garbage_collection_force_threshold;
+}
+
 void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
   opt->rep.num_levels = n;
 }
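
A rough usage sketch for the new C API accessors above, exercised from C++
(the 0.75 value is arbitrary):

#include <cassert>

#include "rocksdb/c.h"

int main() {
  rocksdb_options_t* opts = rocksdb_options_create();

  // Round-trip the new BlobDB GC force threshold through the C API.
  rocksdb_options_set_blob_gc_force_threshold(opts, 0.75);
  assert(rocksdb_options_get_blob_gc_force_threshold(opts) == 0.75);

  rocksdb_options_destroy(opts);
  return 0;
}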

@@ -1793,8 +1793,11 @@ int main(int argc, char** argv) {
   rocksdb_options_set_enable_blob_gc(o, 1);
   CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o));
 
-  rocksdb_options_set_blob_gc_age_cutoff(o, 0.75);
-  CheckCondition(0.75 == rocksdb_options_get_blob_gc_age_cutoff(o));
+  rocksdb_options_set_blob_gc_age_cutoff(o, 0.5);
+  CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o));
+
+  rocksdb_options_set_blob_gc_force_threshold(o, 0.75);
+  CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o));
 
   // Create a copy that should be equal to the original.
   rocksdb_options_t* copy;

@@ -1358,12 +1358,19 @@ Status ColumnFamilyData::ValidateOptions(
     }
   }
 
-  if (cf_options.enable_blob_garbage_collection &&
-      (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
-       cf_options.blob_garbage_collection_age_cutoff > 1.0)) {
-    return Status::InvalidArgument(
-        "The age cutoff for blob garbage collection should be in the range "
-        "[0.0, 1.0].");
+  if (cf_options.enable_blob_garbage_collection) {
+    if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
+        cf_options.blob_garbage_collection_age_cutoff > 1.0) {
+      return Status::InvalidArgument(
+          "The age cutoff for blob garbage collection should be in the range "
+          "[0.0, 1.0].");
+    }
+
+    if (cf_options.blob_garbage_collection_force_threshold < 0.0 ||
+        cf_options.blob_garbage_collection_force_threshold > 1.0) {
+      return Status::InvalidArgument(
+          "The garbage ratio threshold for forcing blob garbage collection "
+          "should be in the range [0.0, 1.0].");
+    }
   }
 
   if (cf_options.compaction_style == kCompactionStyleFIFO &&

@@ -3407,6 +3407,30 @@ TEST(ColumnFamilyTest, ValidateBlobGCCutoff) {
                   .IsInvalidArgument());
 }
 
+TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) {
+  DBOptions db_options;
+
+  ColumnFamilyOptions cf_options;
+  cf_options.enable_blob_garbage_collection = true;
+
+  cf_options.blob_garbage_collection_force_threshold = -0.5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsInvalidArgument());
+
+  cf_options.blob_garbage_collection_force_threshold = 0.0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_force_threshold = 0.5;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_force_threshold = 1.0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_force_threshold = 1.5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsInvalidArgument());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

@@ -109,6 +109,8 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) {
       return "PeriodicCompaction";
     case CompactionReason::kChangeTemperature:
       return "ChangeTemperature";
+    case CompactionReason::kForcedBlobGC:
+      return "ForcedBlobGC";
     case CompactionReason::kNumOfReasons:
       // fall through
     default:

@@ -31,6 +31,9 @@ bool LevelCompactionPicker::NeedsCompaction(
   if (!vstorage->FilesMarkedForCompaction().empty()) {
     return true;
   }
+  if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+    return true;
+  }
   for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
     if (vstorage->CompactionScore(i) >= 1) {
       return true;
@@ -248,6 +251,13 @@ void LevelCompactionBuilder::SetupInitialFiles() {
     compaction_reason_ = CompactionReason::kPeriodicCompaction;
     return;
   }
+
+  // Forced blob garbage collection
+  PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kForcedBlobGC;
+    return;
+  }
 }
 
 bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {

@@ -2817,6 +2817,15 @@ void VersionStorageInfo::ComputeCompactionScore(
     ComputeFilesMarkedForPeriodicCompaction(
         immutable_options, mutable_cf_options.periodic_compaction_seconds);
   }
+
+  if (mutable_cf_options.enable_blob_garbage_collection &&
+      mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
+      mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
+    ComputeFilesMarkedForForcedBlobGC(
+        mutable_cf_options.blob_garbage_collection_age_cutoff,
+        mutable_cf_options.blob_garbage_collection_force_threshold);
+  }
+
   EstimateCompactionBytesNeeded(mutable_cf_options);
 }
@@ -2926,6 +2935,106 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
   }
 }
 
+void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
+    double blob_garbage_collection_age_cutoff,
+    double blob_garbage_collection_force_threshold) {
+  files_marked_for_forced_blob_gc_.clear();
+
+  if (blob_files_.empty()) {
+    return;
+  }
+
+  // Number of blob files eligible for GC based on age
+  const size_t cutoff_count = static_cast<size_t>(
+      blob_garbage_collection_age_cutoff * blob_files_.size());
+  if (!cutoff_count) {
+    return;
+  }
+
+  // Compute the sum of total and garbage bytes over the oldest batch of blob
+  // files. The oldest batch is defined as the set of blob files which are
+  // kept alive by the same SSTs as the very oldest one. Here is a toy example.
+  // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
+  // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
+  // potentially some higher-numbered ones, while SST 3 relies on blob file 12
+  // and potentially some higher-numbered ones. Then, the SST to oldest blob
+  // file mapping is as follows:
+  //
+  // SST file number               Oldest blob file number
+  // 1                             10
+  // 2                             10
+  // 3                             12
+  //
+  // This is what the same thing looks like from the blob files' POV. (Note that
+  // the linked SSTs simply denote the inverse mapping of the above.)
+  //
+  // Blob file number              Linked SST set
+  // 10                            {1, 2}
+  // 11                            {}
+  // 12                            {3}
+  // 13                            {}
+  //
+  // Then, the oldest batch of blob files consists of blob files 10 and 11,
+  // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
+  //
+  // Note that the overall ratio of garbage computed for the batch has to exceed
+  // blob_garbage_collection_force_threshold and the entire batch has to be
+  // eligible for GC according to blob_garbage_collection_age_cutoff in order
+  // for us to schedule any compactions.
+  const auto oldest_it = blob_files_.begin();
+
+  const auto& oldest_meta = oldest_it->second;
+  assert(oldest_meta);
+
+  const auto& linked_ssts = oldest_meta->GetLinkedSsts();
+  assert(!linked_ssts.empty());
+
+  size_t count = 1;
+  uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
+  uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
+
+  auto it = oldest_it;
+  for (++it; it != blob_files_.end(); ++it) {
+    const auto& meta = it->second;
+    assert(meta);
+
+    if (!meta->GetLinkedSsts().empty()) {
+      break;
+    }
+
+    if (++count > cutoff_count) {
+      return;
+    }
+
+    sum_total_blob_bytes += meta->GetTotalBlobBytes();
+    sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
+  }
+
+  if (sum_garbage_blob_bytes <
+      blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
+    return;
+  }
+
+  for (uint64_t sst_file_number : linked_ssts) {
+    const FileLocation location = GetFileLocation(sst_file_number);
+    assert(location.IsValid());
+
+    const int level = location.GetLevel();
+    assert(level >= 0);
+
+    const size_t pos = location.GetPosition();
+    FileMetaData* const sst_meta = files_[level][pos];
+    assert(sst_meta);
+
+    if (sst_meta->being_compacted) {
+      continue;
+    }
+
+    files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
+  }
+}
+
 namespace {
 // used to sort files by size

@@ -184,6 +184,14 @@ class VersionStorageInfo {
   // REQUIRES: DB mutex held
   void ComputeBottommostFilesMarkedForCompaction();
 
+  // This computes files_marked_for_forced_blob_gc_ and is called by
+  // ComputeCompactionScore()
+  //
+  // REQUIRES: DB mutex held
+  void ComputeFilesMarkedForForcedBlobGC(
+      double blob_garbage_collection_age_cutoff,
+      double blob_garbage_collection_force_threshold);
+
   // Generate level_files_brief_ from files_
   void GenerateLevelFilesBrief();
   // Sort all files for this version based on their file size and
@@ -404,6 +412,14 @@ class VersionStorageInfo {
     return bottommost_files_marked_for_compaction_;
   }
 
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
+      const {
+    assert(finalized_);
+    return files_marked_for_forced_blob_gc_;
+  }
+
   int base_level() const { return base_level_; }
 
   double level_multiplier() const { return level_multiplier_; }
@@ -586,6 +602,8 @@ class VersionStorageInfo {
   autovector<std::pair<int, FileMetaData*>>
       bottommost_files_marked_for_compaction_;
 
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
+
   // Threshold for needing to mark another bottommost file. Maintain it so we
   // can quickly check when releasing a snapshot whether more bottommost files
   // became eligible for compaction. It's defined as the min of the max nonzero

@@ -9,6 +9,8 @@
 
 #include "db/version_set.h"
 
+#include <algorithm>
+
 #include "db/db_impl/db_impl.h"
 #include "db/log_writer.h"
 #include "rocksdb/convenience.h"
@@ -135,31 +137,55 @@ class VersionStorageInfoTestBase : public testing::Test {
   }
 
   void Add(int level, uint32_t file_number, const char* smallest,
-           const char* largest, uint64_t file_size = 0) {
-    assert(level < vstorage_.num_levels());
-    FileMetaData* f = new FileMetaData(
-        file_number, 0, file_size, GetInternalKey(smallest, 0),
-        GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0,
-        /* marked_for_compact */ false, kInvalidBlobFileNumber,
-        kUnknownOldestAncesterTime, kUnknownFileCreationTime,
-        kUnknownFileChecksum, kUnknownFileChecksumFuncName);
-    f->compensated_file_size = file_size;
-    vstorage_.AddFile(level, f);
+           const char* largest, uint64_t file_size = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    constexpr SequenceNumber dummy_seq = 0;
+
+    Add(level, file_number, GetInternalKey(smallest, dummy_seq),
+        GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number);
   }
 
   void Add(int level, uint32_t file_number, const InternalKey& smallest,
-           const InternalKey& largest, uint64_t file_size = 0) {
+           const InternalKey& largest, uint64_t file_size = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
     assert(level < vstorage_.num_levels());
     FileMetaData* f = new FileMetaData(
         file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
        /* largest_seq */ 0, /* marked_for_compact */ false,
-        kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+        oldest_blob_file_number, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName);
     f->compensated_file_size = file_size;
     vstorage_.AddFile(level, f);
   }
 
+  void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+               uint64_t total_blob_bytes,
+               BlobFileMetaData::LinkedSsts linked_ssts,
+               uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        /* checksum_method */ std::string(),
+        /* checksum_value */ std::string());
+    auto meta =
+        BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+                                 garbage_blob_count, garbage_blob_bytes);
+
+    vstorage_.AddBlobFile(std::move(meta));
+  }
+
+  void Finalize() {
+    vstorage_.UpdateNumNonEmptyLevels();
+    vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+    vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+    vstorage_.GenerateFileIndexer();
+    vstorage_.GenerateLevelFilesBrief();
+    vstorage_.GenerateLevel0NonOverlapping();
+    vstorage_.GenerateBottommostFiles();
+
+    vstorage_.SetFinalized();
+  }
+
   std::string GetOverlappingFiles(int level, const InternalKey& begin,
                                   const InternalKey& end) {
     std::vector<FileMetaData*> inputs;
@@ -445,6 +471,171 @@ TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) {
   ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr);
 }
 
+TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) {
+  // No SST or blob files in VersionStorageInfo
+  Finalize();
+
+  constexpr double age_cutoff = 0.5;
+  constexpr double force_threshold = 0.75;
+  vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+  ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGC) {
+  // Add three L0 SSTs and four blob files. The first two SSTs keep alive the
+  // first two blob files, while the third SST keeps alive the third and fourth
+  // blob files.
+
+  constexpr int level = 0;
+
+  constexpr uint64_t first_sst = 1;
+  constexpr uint64_t second_sst = 2;
+  constexpr uint64_t third_sst = 3;
+
+  constexpr uint64_t first_blob = 10;
+  constexpr uint64_t second_blob = 11;
+  constexpr uint64_t third_blob = 12;
+  constexpr uint64_t fourth_blob = 13;
+
+  {
+    constexpr char smallest[] = "bar1";
+    constexpr char largest[] = "foo1";
+    constexpr uint64_t file_size = 1000;
+
+    Add(level, first_sst, smallest, largest, file_size, first_blob);
+  }
+
+  {
+    constexpr char smallest[] = "bar2";
+    constexpr char largest[] = "foo2";
+    constexpr uint64_t file_size = 2000;
+
+    Add(level, second_sst, smallest, largest, file_size, first_blob);
+  }
+
+  {
+    constexpr char smallest[] = "bar3";
+    constexpr char largest[] = "foo3";
+    constexpr uint64_t file_size = 3000;
+
+    Add(level, third_sst, smallest, largest, file_size, third_blob);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 10;
+    constexpr uint64_t total_blob_bytes = 100000;
+    constexpr uint64_t garbage_blob_count = 2;
+    constexpr uint64_t garbage_blob_bytes = 15000;
+
+    AddBlob(first_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{first_sst, second_sst},
+            garbage_blob_count, garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 4;
+    constexpr uint64_t total_blob_bytes = 400000;
+    constexpr uint64_t garbage_blob_count = 3;
+    constexpr uint64_t garbage_blob_bytes = 235000;
+
+    AddBlob(second_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 20;
+    constexpr uint64_t total_blob_bytes = 1000000;
+    constexpr uint64_t garbage_blob_count = 8;
+    constexpr uint64_t garbage_blob_bytes = 123456;
+
+    AddBlob(third_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 128;
+    constexpr uint64_t total_blob_bytes = 789012345;
+    constexpr uint64_t garbage_blob_count = 67;
+    constexpr uint64_t garbage_blob_bytes = 88888888;
+
+    AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  Finalize();
+
+  assert(vstorage_.num_levels() > 0);
+  const auto& level_files = vstorage_.LevelFiles(level);
+
+  assert(level_files.size() == 3);
+  assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst);
+  assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst);
+  assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst);
+
+  // No blob files eligible for GC due to the age cutoff
+  {
+    constexpr double age_cutoff = 0.1;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Part of the oldest batch of blob files (specifically, the second file) is
+  // ineligible for GC due to the age cutoff
+  {
+    constexpr double age_cutoff = 0.25;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+  // is below threshold
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.6;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+  // meets threshold
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.5;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+    ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+    std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+              [](const std::pair<int, FileMetaData*>& lhs,
+                 const std::pair<int, FileMetaData*>& rhs) {
+                assert(lhs.second);
+                assert(rhs.second);
+                return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+              });
+
+    const autovector<std::pair<int, FileMetaData*>>
+        expected_ssts_to_be_compacted{{level, level_files[0]},
+                                      {level, level_files[1]}};
+
+    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+    ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+  }
+}
+
 class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase {
  public:
  VersionStorageInfoTimestampTest()

@@ -252,6 +252,7 @@ DECLARE_uint64(blob_file_size);
 DECLARE_string(blob_compression_type);
 DECLARE_bool(enable_blob_garbage_collection);
 DECLARE_double(blob_garbage_collection_age_cutoff);
+DECLARE_double(blob_garbage_collection_force_threshold);
 DECLARE_int32(approximate_size_one_in);
 DECLARE_bool(sync_fault_injection);

@@ -399,6 +399,12 @@ DEFINE_double(blob_garbage_collection_age_cutoff,
               "[Integrated BlobDB] The cutoff in terms of blob file age for "
               "garbage collection.");
 
+DEFINE_double(blob_garbage_collection_force_threshold,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_garbage_collection_force_threshold,
+              "[Integrated BlobDB] The threshold for the ratio of garbage in "
+              "the oldest blob files for forcing garbage collection.");
+
 static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
     RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);

@@ -262,6 +262,8 @@ bool StressTest::BuildOptionsTable() {
     options_tbl.emplace(
         "blob_garbage_collection_age_cutoff",
        std::vector<std::string>{"0.0", "0.25", "0.5", "0.75", "1.0"});
+    options_tbl.emplace("blob_garbage_collection_force_threshold",
+                        std::vector<std::string>{"0.5", "0.75", "1.0"});
   }
 
   options_table_ = std::move(options_tbl);
@@ -2310,6 +2312,8 @@ void StressTest::Open() {
         FLAGS_enable_blob_garbage_collection;
     options_.blob_garbage_collection_age_cutoff =
         FLAGS_blob_garbage_collection_age_cutoff;
+    options_.blob_garbage_collection_force_threshold =
+        FLAGS_blob_garbage_collection_force_threshold;
   } else {
 #ifdef ROCKSDB_LITE
     fprintf(stderr, "--options_file not supported in lite mode\n");
@@ -2418,8 +2422,11 @@ void StressTest::Open() {
     }
 
     if (options_.enable_blob_garbage_collection) {
-      fprintf(stdout, "Integrated BlobDB: blob GC enabled, cutoff %f\n",
-              options_.blob_garbage_collection_age_cutoff);
+      fprintf(
+          stdout,
+          "Integrated BlobDB: blob GC enabled, cutoff %f, force threshold %f\n",
+          options_.blob_garbage_collection_age_cutoff,
+          options_.blob_garbage_collection_force_threshold);
     }
 
     fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());

@@ -846,6 +846,19 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through the SetOptions() API
   double blob_garbage_collection_age_cutoff = 0.25;
 
+  // If the ratio of garbage in the oldest blob files exceeds this threshold,
+  // targeted compactions are scheduled in order to force garbage collecting
+  // the blob files in question, assuming they are all eligible based on the
+  // value of blob_garbage_collection_age_cutoff above. This option is
+  // currently only supported with leveled compactions.
+  // Note that enable_blob_garbage_collection has to be set in order for this
+  // option to have any effect.
+  //
+  // Default: 1.0
+  //
+  // Dynamically changeable through the SetOptions() API
+  double blob_garbage_collection_force_threshold = 1.0;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options

@@ -1117,6 +1117,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff(
 extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff(
     rocksdb_options_t* opt);
 
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold(
+    rocksdb_options_t* opt, double val);
+
+extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold(
+    rocksdb_options_t* opt);
+
 /* returns a pointer to a malloc()-ed, null terminated string */
 extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
     rocksdb_options_t* opt);

@@ -144,6 +144,8 @@ enum class CompactionReason : int {
   kPeriodicCompaction,
   // Compaction in order to move files to temperature
   kChangeTemperature,
+  // Compaction scheduled to force garbage collection of blob files
+  kForcedBlobGC,
   // total number of compaction reasons, new reasons must be added above this.
   kNumOfReasons,
 };

@@ -425,6 +425,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff),
          OptionType::kDouble, OptionVerificationType::kNormal,
          OptionTypeFlags::kMutable}},
+        {"blob_garbage_collection_force_threshold",
+         {offsetof(struct MutableCFOptions,
+                   blob_garbage_collection_force_threshold),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"sample_for_compression",
         {offsetof(struct MutableCFOptions, sample_for_compression),
          OptionType::kUInt64T, OptionVerificationType::kNormal,
@@ -1042,6 +1047,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  enable_blob_garbage_collection ? "true" : "false");
   ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f",
                  blob_garbage_collection_age_cutoff);
+  ROCKS_LOG_INFO(log, " blob_garbage_collection_force_threshold: %f",
+                 blob_garbage_collection_force_threshold);
 }
 
 MutableCFOptions::MutableCFOptions(const Options& options)

@@ -141,6 +141,8 @@ struct MutableCFOptions {
         enable_blob_garbage_collection(options.enable_blob_garbage_collection),
         blob_garbage_collection_age_cutoff(
             options.blob_garbage_collection_age_cutoff),
+        blob_garbage_collection_force_threshold(
+            options.blob_garbage_collection_force_threshold),
         max_sequential_skip_in_iterations(
             options.max_sequential_skip_in_iterations),
         check_flush_compaction_key_order(
@@ -187,6 +189,7 @@ struct MutableCFOptions {
         blob_compression_type(kNoCompression),
         enable_blob_garbage_collection(false),
         blob_garbage_collection_age_cutoff(0.0),
+        blob_garbage_collection_force_threshold(0.0),
         max_sequential_skip_in_iterations(0),
         check_flush_compaction_key_order(true),
         paranoid_file_checks(false),
@@ -251,6 +254,7 @@ struct MutableCFOptions {
   CompressionType blob_compression_type;
   bool enable_blob_garbage_collection;
   double blob_garbage_collection_age_cutoff;
+  double blob_garbage_collection_force_threshold;
 
   // Misc options
   uint64_t max_sequential_skip_in_iterations;

@@ -97,7 +97,9 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       blob_compression_type(options.blob_compression_type),
       enable_blob_garbage_collection(options.enable_blob_garbage_collection),
       blob_garbage_collection_age_cutoff(
-          options.blob_garbage_collection_age_cutoff) {
+          options.blob_garbage_collection_age_cutoff),
+      blob_garbage_collection_force_threshold(
+          options.blob_garbage_collection_force_threshold) {
   assert(memtable_factory.get() != nullptr);
   if (max_bytes_for_level_multiplier_additional.size() <
       static_cast<unsigned int>(num_levels)) {
@@ -387,20 +389,22 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log,
                    " Options.periodic_compaction_seconds: %" PRIu64,
                    periodic_compaction_seconds);
   ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s",
                    enable_blob_files ? "true" : "false");
-  ROCKS_LOG_HEADER(log,
-                   " Options.min_blob_size: %" PRIu64,
-                   min_blob_size);
-  ROCKS_LOG_HEADER(log,
-                   " Options.blob_file_size: %" PRIu64,
-                   blob_file_size);
+  ROCKS_LOG_HEADER(
+      log, " Options.min_blob_size: %" PRIu64,
+      min_blob_size);
+  ROCKS_LOG_HEADER(
+      log, " Options.blob_file_size: %" PRIu64,
+      blob_file_size);
   ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s",
                    CompressionTypeToString(blob_compression_type).c_str());
   ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s",
                    enable_blob_garbage_collection ? "true" : "false");
   ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f",
                    blob_garbage_collection_age_cutoff);
+  ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f",
+                   blob_garbage_collection_force_threshold);
 }  // ColumnFamilyOptions::Dump
 
 void Options::Dump(Logger* log) const {

@@ -250,6 +250,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
       moptions.enable_blob_garbage_collection;
   cf_opts->blob_garbage_collection_age_cutoff =
       moptions.blob_garbage_collection_age_cutoff;
+  cf_opts->blob_garbage_collection_force_threshold =
+      moptions.blob_garbage_collection_force_threshold;
 
   // Misc options
   cf_opts->max_sequential_skip_in_iterations =

@@ -515,6 +515,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "blob_compression_type=kBZip2Compression;"
       "enable_blob_garbage_collection=true;"
       "blob_garbage_collection_age_cutoff=0.5;"
+      "blob_garbage_collection_force_threshold=0.75;"
       "compaction_options_fifo={max_table_files_size=3;allow_"
       "compaction=false;age_for_warm=1;};",
       new_options));

@@ -108,6 +108,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"blob_compression_type", "kZSTD"},
       {"enable_blob_garbage_collection", "true"},
       {"blob_garbage_collection_age_cutoff", "0.5"},
+      {"blob_garbage_collection_force_threshold", "0.75"},
   };
 
   std::unordered_map<std::string, std::string> db_options_map = {
@@ -239,6 +240,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD);
   ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true);
   ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5);
+  ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75);
 
   cf_options_map["write_buffer_size"] = "hello";
   ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map,
@@ -2264,6 +2266,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"blob_compression_type", "kZSTD"},
       {"enable_blob_garbage_collection", "true"},
       {"blob_garbage_collection_age_cutoff", "0.5"},
+      {"blob_garbage_collection_force_threshold", "0.75"},
   };
 
   std::unordered_map<std::string, std::string> db_options_map = {
@@ -2387,6 +2390,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD);
   ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true);
   ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5);
+  ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75);
 
   cf_options_map["write_buffer_size"] = "hello";
   ASSERT_NOK(GetColumnFamilyOptionsFromMap(
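
For completeness, the string-based plumbing added above means the new option
can also be parsed from an options string. A small sketch using the existing
`GetColumnFamilyOptionsFromString` helper (the 0.75 value is arbitrary):

#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::ColumnFamilyOptions base, patched;
  rocksdb::Status s = rocksdb::GetColumnFamilyOptionsFromString(
      base, "blob_garbage_collection_force_threshold=0.75", &patched);
  assert(s.ok());
  assert(patched.blob_garbage_collection_force_threshold == 0.75);
  return 0;
}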

@@ -395,6 +395,8 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
   cf_opt->memtable_prefix_bloom_size_ratio =
       static_cast<double>(rnd->Uniform(10000)) / 20000.0;
   cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0;
+  cf_opt->blob_garbage_collection_force_threshold =
+      rnd->Uniform(10000) / 10000.0;
 
   // int options
   cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100);

@@ -979,6 +979,12 @@ DEFINE_double(blob_garbage_collection_age_cutoff,
               "[Integrated BlobDB] The cutoff in terms of blob file age for "
               "garbage collection.");
 
+DEFINE_double(blob_garbage_collection_force_threshold,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_garbage_collection_force_threshold,
+              "[Integrated BlobDB] The threshold for the ratio of garbage in "
+              "the oldest blob files for forcing garbage collection.");
+
 #ifndef ROCKSDB_LITE
 // Secondary DB instance Options
@@ -4331,6 +4337,8 @@ class Benchmark {
         FLAGS_enable_blob_garbage_collection;
     options.blob_garbage_collection_age_cutoff =
         FLAGS_blob_garbage_collection_age_cutoff;
+    options.blob_garbage_collection_force_threshold =
+        FLAGS_blob_garbage_collection_force_threshold;
 
 #ifndef ROCKSDB_LITE
     if (FLAGS_readonly && FLAGS_transaction_db) {

@@ -276,7 +276,8 @@ const std::string options_file_content = R"OPTIONS_FILE(
   blob_file_size=10485760
   blob_compression_type=kNoCompression
   enable_blob_garbage_collection=true
-  blob_garbage_collection_age_cutoff=0.75
+  blob_garbage_collection_age_cutoff=0.5
+  blob_garbage_collection_force_threshold=0.75
 [TableOptions/BlockBasedTable "default"]
   format_version=0

@@ -292,6 +292,7 @@ blob_params = {
     "blob_compression_type": lambda: random.choice(["none", "snappy", "lz4", "zstd"]),
     "enable_blob_garbage_collection": lambda: random.choice([0] + [1] * 3),
     "blob_garbage_collection_age_cutoff": lambda: random.choice([0.0, 0.25, 0.5, 0.75, 1.0]),
+    "blob_garbage_collection_force_threshold": lambda: random.choice([0.5, 0.75, 1.0]),
 }
 
 ts_params = {
ts_params = { ts_params = {
