diff --git a/HISTORY.md b/HISTORY.md index c3dac417a..a959c26be 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ * FIFO compaction now supports migrating from a multi-level DB via DB::Open(). During the migration phase, FIFO compaction picker will: * picks the sst file with the smallest starting key in the bottom-most non-empty level. * Note that during the migration phase, the file purge order will only be an approximation of "FIFO" as files in lower-level might sometime contain newer keys than files in upper-level. +* Added an option `ignore_max_compaction_bytes_for_input` to ignore the `max_compaction_bytes` limit when adding files to be compacted from the input level. This should help reduce write amplification. The option is enabled by default. ### Bug Fixes * Fix a bug in io_uring_prep_cancel in AbortIO API for posix which expects sqe->addr to match with read request submitted and wrong paramter was being passed. diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index fde3dfcee..e0327a9e8 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -526,7 +526,8 @@ bool CompactionPicker::SetupOtherInputs( try_overlapping_inputs = false; } if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() && - output_level_inputs_size + expanded_inputs_size < limit && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && !AreFilesInCompaction(expanded_inputs.files)) { InternalKey new_start, new_limit; GetRange(expanded_inputs, &new_start, &new_limit); @@ -549,7 +550,8 @@ bool CompactionPicker::SetupOtherInputs( base_index, nullptr); expanded_inputs_size = TotalFileSize(expanded_inputs.files); if (expanded_inputs.size() > inputs->size() && - output_level_inputs_size + expanded_inputs_size < limit && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && 
!AreFilesInCompaction(expanded_inputs.files)) { expand_inputs = true; } diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 539d25a33..3a0e1b9ad 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -1340,6 +1340,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { ioptions_.compaction_pri = kMinOverlappingRatio; mutable_cf_options_.max_bytes_for_level_base = 10000000; mutable_cf_options_.max_bytes_for_level_multiplier = 10; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; // file 7 and 8 over lap with the same file, but file 8 is smaller so // it will be picked. @@ -2358,6 +2359,7 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) { TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { mutable_cf_options_.max_bytes_for_level_base = 1000000u; mutable_cf_options_.max_compaction_bytes = 800000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; ioptions_.level_compaction_dynamic_level_bytes = false; NewVersionStorage(6, kCompactionStyleLevel); // A compaction should be triggered and pick file 2 and 5. @@ -2384,6 +2386,7 @@ TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) { mutable_cf_options_.max_bytes_for_level_base = 800000u; mutable_cf_options_.max_compaction_bytes = 1000000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; ioptions_.level_compaction_dynamic_level_bytes = false; NewVersionStorage(6, kCompactionStyleLevel); // A compaction should be triggered and pick file 2 and 5. 
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index a768f5dd4..5cd52223f 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -684,6 +684,17 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_compaction_bytes = 0; + // If true, when setting up compaction input files, we ignore the + // `max_compaction_bytes` limit when pulling in input files that are entirely + // within the output key range. + // + // Default: true + // + // Dynamically changeable through SetOptions() API + // We could remove this knob and always ignore the limit once it is proven + // safe. + bool ignore_max_compaction_bytes_for_input = true; + // All writes will be slowed down to at least delayed_write_rate if estimated // bytes needed to be compaction exceed this threshold. // diff --git a/options/cf_options.cc b/options/cf_options.cc index f34e8849e..dbf0bf9b0 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -270,6 +270,11 @@ static std::unordered_map {offsetof(struct MutableCFOptions, max_compaction_bytes), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"ignore_max_compaction_bytes_for_input", + {offsetof(struct MutableCFOptions, + ignore_max_compaction_bytes_for_input), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"expanded_compaction_factor", {0, OptionType::kInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, @@ -1034,6 +1039,8 @@ void MutableCFOptions::Dump(Logger* log) const { level0_stop_writes_trigger); ROCKS_LOG_INFO(log, " max_compaction_bytes: %" PRIu64, max_compaction_bytes); + ROCKS_LOG_INFO(log, " ignore_max_compaction_bytes_for_input: %s", + ignore_max_compaction_bytes_for_input ? 
"true" : "false"); ROCKS_LOG_INFO(log, " target_file_size_base: %" PRIu64, target_file_size_base); ROCKS_LOG_INFO(log, " target_file_size_multiplier: %d", diff --git a/options/cf_options.h b/options/cf_options.h index 5ac4c0c8e..050618eda 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -130,6 +130,8 @@ struct MutableCFOptions { level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), level0_stop_writes_trigger(options.level0_stop_writes_trigger), max_compaction_bytes(options.max_compaction_bytes), + ignore_max_compaction_bytes_for_input( + options.ignore_max_compaction_bytes_for_input), target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), @@ -192,6 +194,7 @@ struct MutableCFOptions { level0_slowdown_writes_trigger(0), level0_stop_writes_trigger(0), max_compaction_bytes(0), + ignore_max_compaction_bytes_for_input(true), target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), @@ -273,6 +276,7 @@ struct MutableCFOptions { int level0_slowdown_writes_trigger; int level0_stop_writes_trigger; uint64_t max_compaction_bytes; + bool ignore_max_compaction_bytes_for_input; uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; diff --git a/options/options.cc b/options/options.cc index 153c5f8ef..316d3550e 100644 --- a/options/options.cc +++ b/options/options.cc @@ -71,6 +71,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) max_bytes_for_level_multiplier_additional( options.max_bytes_for_level_multiplier_additional), max_compaction_bytes(options.max_compaction_bytes), + ignore_max_compaction_bytes_for_input( + options.ignore_max_compaction_bytes_for_input), soft_pending_compaction_bytes_limit( options.soft_pending_compaction_bytes_limit), hard_pending_compaction_bytes_limit( @@ -281,6 +283,8 @@ void 
ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_compaction_bytes: %" PRIu64, max_compaction_bytes); + ROCKS_LOG_HEADER(log, " Options.ignore_max_compaction_bytes_for_input: %s", + ignore_max_compaction_bytes_for_input ? "true" : "false"); ROCKS_LOG_HEADER( log, " Options.arena_block_size: %" ROCKSDB_PRIszt, diff --git a/options/options_helper.cc b/options/options_helper.cc index 0d63edb89..59b01e6fb 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -229,6 +229,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.level0_slowdown_writes_trigger; cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->ignore_max_compaction_bytes_for_input = + moptions.ignore_max_compaction_bytes_for_input; cf_opts->target_file_size_base = moptions.target_file_size_base; cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index f0ee5ecc6..63e9721ca 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -486,6 +486,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_write_buffer_number=84;" "write_buffer_size=1653;" "max_compaction_bytes=64;" + "ignore_max_compaction_bytes_for_input=true;" "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;"