diff --git a/db/c.cc b/db/c.cc index 72274e68f..89dfa5303 100644 --- a/db/c.cc +++ b/db/c.cc @@ -2751,6 +2751,15 @@ double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { return opt->rep.blob_garbage_collection_age_cutoff; } +void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_force_threshold = val; +} + +double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_force_threshold; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } diff --git a/db/c_test.c b/db/c_test.c index bbc4e9db9..fb8f65635 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -1793,8 +1793,11 @@ int main(int argc, char** argv) { rocksdb_options_set_enable_blob_gc(o, 1); CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); - rocksdb_options_set_blob_gc_age_cutoff(o, 0.75); - CheckCondition(0.75 == rocksdb_options_get_blob_gc_age_cutoff(o)); + rocksdb_options_set_blob_gc_age_cutoff(o, 0.5); + CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o)); + + rocksdb_options_set_blob_gc_force_threshold(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o)); // Create a copy that should be equal to the original. rocksdb_options_t* copy; diff --git a/db/column_family.cc b/db/column_family.cc index b1cfa24f0..2aa1fbd6d 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1358,12 +1358,19 @@ Status ColumnFamilyData::ValidateOptions( } } - if (cf_options.enable_blob_garbage_collection && - (cf_options.blob_garbage_collection_age_cutoff < 0.0 || - cf_options.blob_garbage_collection_age_cutoff > 1.0)) { - return Status::InvalidArgument( - "The age cutoff for blob garbage collection should be in the range " - "[0.0, 1.0]."); + if (cf_options.enable_blob_garbage_collection) { + if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + if (cf_options.blob_garbage_collection_force_threshold < 0.0 || + cf_options.blob_garbage_collection_force_threshold > 1.0) { + return Status::InvalidArgument( + "The garbage ratio threshold for forcing blob garbage collection " + "should be in the range [0.0, 1.0]."); + } } if (cf_options.compaction_style == kCompactionStyleFIFO && diff --git a/db/column_family_test.cc b/db/column_family_test.cc index b772fb1ce..cdec7bac9 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -3407,6 +3407,30 @@ TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { .IsInvalidArgument()); } +TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_force_threshold = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_force_threshold = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 8b67d3323..d36cf8ab5 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -109,6 +109,8 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) { return "PeriodicCompaction"; case CompactionReason::kChangeTemperature: return "ChangeTemperature"; + case CompactionReason::kForcedBlobGC: + return "ForcedBlobGC"; case CompactionReason::kNumOfReasons: // fall through default: diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 0a70c89c0..52a3d5c35 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -31,6 +31,9 @@ bool LevelCompactionPicker::NeedsCompaction( if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } + if (!vstorage->FilesMarkedForForcedBlobGC().empty()) { + return true; + } for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { if (vstorage->CompactionScore(i) >= 1) { return true; @@ -248,6 +251,13 @@ void LevelCompactionBuilder::SetupInitialFiles() { compaction_reason_ = CompactionReason::kPeriodicCompaction; return; } + + // Forced blob garbage collection + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kForcedBlobGC; + return; + } } bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { diff --git a/db/version_set.cc b/db/version_set.cc index c8cbe3bb4..e99f44501 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2817,6 +2817,15 @@ void VersionStorageInfo::ComputeCompactionScore( ComputeFilesMarkedForPeriodicCompaction( immutable_options, mutable_cf_options.periodic_compaction_seconds); } + + if (mutable_cf_options.enable_blob_garbage_collection && + mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 && + mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) { + ComputeFilesMarkedForForcedBlobGC( + mutable_cf_options.blob_garbage_collection_age_cutoff, + mutable_cf_options.blob_garbage_collection_force_threshold); + } + EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -2926,6 +2935,106 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( } } +void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold) { + files_marked_for_forced_blob_gc_.clear(); + + if (blob_files_.empty()) { + return; + } + + // Number of blob files eligible for GC based on age + const size_t cutoff_count = static_cast( + blob_garbage_collection_age_cutoff * blob_files_.size()); + if (!cutoff_count) { + return; + } + + // Compute the sum of total and garbage bytes over the oldest batch of blob + // files. The oldest batch is defined as the set of blob files which are + // kept alive by the same SSTs as the very oldest one. Here is a toy example. + // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11, + // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and + // potentially some higher-numbered ones, while SST 3 relies on blob file 12 + // and potentially some higher-numbered ones. Then, the SST to oldest blob + // file mapping is as follows: + // + // SST file number Oldest blob file number + // 1 10 + // 2 10 + // 3 12 + // + // This is what the same thing looks like from the blob files' POV. (Note that + // the linked SSTs simply denote the inverse mapping of the above.) + // + // Blob file number Linked SST set + // 10 {1, 2} + // 11 {} + // 12 {3} + // 13 {} + // + // Then, the oldest batch of blob files consists of blob files 10 and 11, + // and we can get rid of them by forcing the compaction of SSTs 1 and 2. + // + // Note that the overall ratio of garbage computed for the batch has to exceed + // blob_garbage_collection_force_threshold and the entire batch has to be + // eligible for GC according to blob_garbage_collection_age_cutoff in order + // for us to schedule any compactions. + const auto oldest_it = blob_files_.begin(); + + const auto& oldest_meta = oldest_it->second; + assert(oldest_meta); + + const auto& linked_ssts = oldest_meta->GetLinkedSsts(); + assert(!linked_ssts.empty()); + + size_t count = 1; + uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes(); + uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes(); + + auto it = oldest_it; + for (++it; it != blob_files_.end(); ++it) { + const auto& meta = it->second; + assert(meta); + + if (!meta->GetLinkedSsts().empty()) { + break; + } + + if (++count > cutoff_count) { + return; + } + + sum_total_blob_bytes += meta->GetTotalBlobBytes(); + sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); + } + + if (sum_garbage_blob_bytes < + blob_garbage_collection_force_threshold * sum_total_blob_bytes) { + return; + } + + for (uint64_t sst_file_number : linked_ssts) { + const FileLocation location = GetFileLocation(sst_file_number); + assert(location.IsValid()); + + const int level = location.GetLevel(); + assert(level >= 0); + + const size_t pos = location.GetPosition(); + + FileMetaData* const sst_meta = files_[level][pos]; + assert(sst_meta); + + if (sst_meta->being_compacted) { + continue; + } + + files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta); + } +} + namespace { // used to sort files by size diff --git a/db/version_set.h b/db/version_set.h index 4abb96f49..d4ce284a5 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -184,6 +184,14 @@ class VersionStorageInfo { // REQUIRES: DB mutex held void ComputeBottommostFilesMarkedForCompaction(); + // This computes files_marked_for_forced_blob_gc_ and is called by + // ComputeCompactionScore() + // + // REQUIRES: DB mutex held + void ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold); + // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); // Sort all files for this version based on their file size and @@ -404,6 +412,14 @@ class VersionStorageInfo { return bottommost_files_marked_for_compaction_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& FilesMarkedForForcedBlobGC() + const { + assert(finalized_); + return files_marked_for_forced_blob_gc_; + } + int base_level() const { return base_level_; } double level_multiplier() const { return level_multiplier_; } @@ -586,6 +602,8 @@ class VersionStorageInfo { autovector> bottommost_files_marked_for_compaction_; + autovector> files_marked_for_forced_blob_gc_; + // Threshold for needing to mark another bottommost file. Maintain it so we // can quickly check when releasing a snapshot whether more bottommost files // became eligible for compaction. It's defined as the min of the max nonzero diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 90088100c..7bfea79b6 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -9,6 +9,8 @@ #include "db/version_set.h" +#include + #include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "rocksdb/convenience.h" @@ -135,31 +137,55 @@ class VersionStorageInfoTestBase : public testing::Test { } void Add(int level, uint32_t file_number, const char* smallest, - const char* largest, uint64_t file_size = 0) { - assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData( - file_number, 0, file_size, GetInternalKey(smallest, 0), - GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, - /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); - f->compensated_file_size = file_size; - vstorage_.AddFile(level, f); + const char* largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + constexpr SequenceNumber dummy_seq = 0; + + Add(level, file_number, GetInternalKey(smallest, dummy_seq), + GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number); } void Add(int level, uint32_t file_number, const InternalKey& smallest, - const InternalKey& largest, uint64_t file_size = 0) { + const InternalKey& largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } + void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, + BlobFileMetaData::LinkedSsts linked_ssts, + uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes); + + vstorage_.AddBlobFile(std::move(meta)); + } + + void Finalize() { + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.GenerateLevel0NonOverlapping(); + vstorage_.GenerateBottommostFiles(); + + vstorage_.SetFinalized(); + } + std::string GetOverlappingFiles(int level, const InternalKey& begin, const InternalKey& end) { std::vector inputs; @@ -445,6 +471,171 @@ TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr); } +TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) { + // No SST or blob files in VersionStorageInfo + Finalize(); + + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.75; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGC) { + // Add three L0 SSTs and four blob files. The first two SSTs keep alive the + // first two blob files, while the third SST keeps alive the third and fourth + // blob files. + + constexpr int level = 0; + + constexpr uint64_t first_sst = 1; + constexpr uint64_t second_sst = 2; + constexpr uint64_t third_sst = 3; + + constexpr uint64_t first_blob = 10; + constexpr uint64_t second_blob = 11; + constexpr uint64_t third_blob = 12; + constexpr uint64_t fourth_blob = 13; + + { + constexpr char smallest[] = "bar1"; + constexpr char largest[] = "foo1"; + constexpr uint64_t file_size = 1000; + + Add(level, first_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar2"; + constexpr char largest[] = "foo2"; + constexpr uint64_t file_size = 2000; + + Add(level, second_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar3"; + constexpr char largest[] = "foo3"; + constexpr uint64_t file_size = 3000; + + Add(level, third_sst, smallest, largest, file_size, third_blob); + } + + { + constexpr uint64_t total_blob_count = 10; + constexpr uint64_t total_blob_bytes = 100000; + constexpr uint64_t garbage_blob_count = 2; + constexpr uint64_t garbage_blob_bytes = 15000; + + AddBlob(first_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{first_sst, second_sst}, + garbage_blob_count, garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 4; + constexpr uint64_t total_blob_bytes = 400000; + constexpr uint64_t garbage_blob_count = 3; + constexpr uint64_t garbage_blob_bytes = 235000; + + AddBlob(second_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 20; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 8; + constexpr uint64_t garbage_blob_bytes = 123456; + + AddBlob(third_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 128; + constexpr uint64_t total_blob_bytes = 789012345; + constexpr uint64_t garbage_blob_count = 67; + constexpr uint64_t garbage_blob_bytes = 88888888; + + AddBlob(fourth_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + Finalize(); + + assert(vstorage_.num_levels() > 0); + const auto& level_files = vstorage_.LevelFiles(level); + + assert(level_files.size() == 3); + assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst); + assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst); + assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst); + + // No blob files eligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.1; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Part of the oldest batch of blob files (specifically, the second file) is + // ineligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.25; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff but its overall garbage ratio + // is below threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.6; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff and its overall garbage ratio + // meets threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.5; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); + ASSERT_EQ(ssts_to_be_compacted.size(), 2); + + std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + assert(lhs.second); + assert(rhs.second); + return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); + }); + + const autovector> + expected_ssts_to_be_compacted{{level, level_files[0]}, + {level, level_files[1]}}; + + ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); + ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); + } +} + class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { public: VersionStorageInfoTimestampTest() diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 29a968e6e..87840b0e4 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -252,6 +252,7 @@ DECLARE_uint64(blob_file_size); DECLARE_string(blob_compression_type); DECLARE_bool(enable_blob_garbage_collection); DECLARE_double(blob_garbage_collection_age_cutoff); +DECLARE_double(blob_garbage_collection_force_threshold); DECLARE_int32(approximate_size_one_in); DECLARE_bool(sync_fault_injection); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 15adef915..89ccf5bc4 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -399,6 +399,12 @@ DEFINE_double(blob_garbage_collection_age_cutoff, "[Integrated BlobDB] The cutoff in terms of blob file age for " "garbage collection."); +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 527cd775c..5d4c414fd 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -262,6 +262,8 @@ bool StressTest::BuildOptionsTable() { options_tbl.emplace( "blob_garbage_collection_age_cutoff", std::vector{"0.0", "0.25", "0.5", "0.75", "1.0"}); + options_tbl.emplace("blob_garbage_collection_force_threshold", + std::vector{"0.5", "0.75", "1.0"}); } options_table_ = std::move(options_tbl); @@ -2310,6 +2312,8 @@ void StressTest::Open() { FLAGS_enable_blob_garbage_collection; options_.blob_garbage_collection_age_cutoff = FLAGS_blob_garbage_collection_age_cutoff; + options_.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file not supported in lite mode\n"); @@ -2418,8 +2422,11 @@ void StressTest::Open() { } if (options_.enable_blob_garbage_collection) { - fprintf(stdout, "Integrated BlobDB: blob GC enabled, cutoff %f\n", - options_.blob_garbage_collection_age_cutoff); + fprintf( + stdout, + "Integrated BlobDB: blob GC enabled, cutoff %f, force threshold %f\n", + options_.blob_garbage_collection_age_cutoff, + options_.blob_garbage_collection_force_threshold); } fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 2e6bb7fbc..d810bedf3 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -846,6 +846,19 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API double blob_garbage_collection_age_cutoff = 0.25; + // If the ratio of garbage in the oldest blob files exceeds this threshold, + // targeted compactions are scheduled in order to force garbage collecting + // the blob files in question, assuming they are all eligible based on the + // value of blob_garbage_collection_age_cutoff above. This option is + // currently only supported with leveled compactions. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 1.0 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_force_threshold = 1.0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index c8ee0c939..730bef9f2 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1117,6 +1117,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( rocksdb_options_t* opt); diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index c79de186d..6ffb3fe4c 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -144,6 +144,8 @@ enum class CompactionReason : int { kPeriodicCompaction, // Compaction in order to move files to temperature kChangeTemperature, + // Compaction scheduled to force garbage collection of blob files + kForcedBlobGC, // total number of compaction reasons, new reasons must be added above this. kNumOfReasons, }; diff --git a/options/cf_options.cc b/options/cf_options.cc index c7f6538a9..5767e759c 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -425,6 +425,11 @@ static std::unordered_map {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), OptionType::kDouble, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_force_threshold", + {offsetof(struct MutableCFOptions, + blob_garbage_collection_force_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"sample_for_compression", {offsetof(struct MutableCFOptions, sample_for_compression), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -1042,6 +1047,8 @@ void MutableCFOptions::Dump(Logger* log) const { enable_blob_garbage_collection ? "true" : "false"); ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", blob_garbage_collection_age_cutoff); + ROCKS_LOG_INFO(log, " blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); } MutableCFOptions::MutableCFOptions(const Options& options) diff --git a/options/cf_options.h b/options/cf_options.h index d4e77f04f..d08096da1 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -141,6 +141,8 @@ struct MutableCFOptions { enable_blob_garbage_collection(options.enable_blob_garbage_collection), blob_garbage_collection_age_cutoff( options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), check_flush_compaction_key_order( @@ -187,6 +189,7 @@ struct MutableCFOptions { blob_compression_type(kNoCompression), enable_blob_garbage_collection(false), blob_garbage_collection_age_cutoff(0.0), + blob_garbage_collection_force_threshold(0.0), max_sequential_skip_in_iterations(0), check_flush_compaction_key_order(true), paranoid_file_checks(false), @@ -251,6 +254,7 @@ struct MutableCFOptions { CompressionType blob_compression_type; bool enable_blob_garbage_collection; double blob_garbage_collection_age_cutoff; + double blob_garbage_collection_force_threshold; // Misc options uint64_t max_sequential_skip_in_iterations; diff --git a/options/options.cc b/options/options.cc index 2b3053fe3..9a8eee209 100644 --- a/options/options.cc +++ b/options/options.cc @@ -97,7 +97,9 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) blob_compression_type(options.blob_compression_type), enable_blob_garbage_collection(options.enable_blob_garbage_collection), blob_garbage_collection_age_cutoff( - options.blob_garbage_collection_age_cutoff) { + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -387,20 +389,22 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); - ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", + ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", enable_blob_files ? "true" : "false"); - ROCKS_LOG_HEADER(log, - " Options.min_blob_size: %" PRIu64, - min_blob_size); - ROCKS_LOG_HEADER(log, - " Options.blob_file_size: %" PRIu64, - blob_file_size); - ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", + ROCKS_LOG_HEADER( + log, " Options.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " Options.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", CompressionTypeToString(blob_compression_type).c_str()); - ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", enable_blob_garbage_collection ? "true" : "false"); - ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", blob_garbage_collection_age_cutoff); + ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index 668a6caa1..02d0171a5 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -250,6 +250,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.enable_blob_garbage_collection; cf_opts->blob_garbage_collection_age_cutoff = moptions.blob_garbage_collection_age_cutoff; + cf_opts->blob_garbage_collection_force_threshold = + moptions.blob_garbage_collection_force_threshold; // Misc options cf_opts->max_sequential_skip_in_iterations = diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index f79021374..613fd8400 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -515,6 +515,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "blob_compression_type=kBZip2Compression;" "enable_blob_garbage_collection=true;" "blob_garbage_collection_age_cutoff=0.5;" + "blob_garbage_collection_force_threshold=0.75;" "compaction_options_fifo={max_table_files_size=3;allow_" "compaction=false;age_for_warm=1;};", new_options)); diff --git a/options/options_test.cc b/options/options_test.cc index 2e3130284..52b5ac7e1 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -108,6 +108,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"blob_compression_type", "kZSTD"}, {"enable_blob_garbage_collection", "true"}, {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, }; std::unordered_map db_options_map = { @@ -239,6 +240,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, @@ -2264,6 +2266,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"blob_compression_type", "kZSTD"}, {"enable_blob_garbage_collection", "true"}, {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, }; std::unordered_map db_options_map = { @@ -2387,6 +2390,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap( diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 0950326d6..9c72972ee 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -395,6 +395,8 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->memtable_prefix_bloom_size_ratio = static_cast(rnd->Uniform(10000)) / 20000.0; cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0; + cf_opt->blob_garbage_collection_force_threshold = + rnd->Uniform(10000) / 10000.0; // int options cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index e8ed9bff1..67060e240 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -979,6 +979,12 @@ DEFINE_double(blob_garbage_collection_age_cutoff, "[Integrated BlobDB] The cutoff in terms of blob file age for " "garbage collection."); +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + #ifndef ROCKSDB_LITE // Secondary DB instance Options @@ -4331,6 +4337,8 @@ class Benchmark { FLAGS_enable_blob_garbage_collection; options.blob_garbage_collection_age_cutoff = FLAGS_blob_garbage_collection_age_cutoff; + options.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; #ifndef ROCKSDB_LITE if (FLAGS_readonly && FLAGS_transaction_db) { diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 9c3515876..bad4ae5c0 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -276,7 +276,8 @@ const std::string options_file_content = R"OPTIONS_FILE( blob_file_size=10485760 blob_compression_type=kNoCompression enable_blob_garbage_collection=true - blob_garbage_collection_age_cutoff=0.75 + blob_garbage_collection_age_cutoff=0.5 + blob_garbage_collection_force_threshold=0.75 [TableOptions/BlockBasedTable "default"] format_version=0 diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index a616740cd..5838304f0 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -292,6 +292,7 @@ blob_params = { "blob_compression_type": lambda: random.choice(["none", "snappy", "lz4", "zstd"]), "enable_blob_garbage_collection": lambda: random.choice([0] + [1] * 3), "blob_garbage_collection_age_cutoff": lambda: random.choice([0.0, 0.25, 0.5, 0.75, 1.0]), + "blob_garbage_collection_force_threshold": lambda: random.choice([0.5, 0.75, 1.0]), } ts_params = {