From 333abe9c559609078efa485da8d85973d6f81be4 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Fri, 21 Oct 2022 10:22:41 -0700 Subject: [PATCH] Ignore max_compaction_bytes for compaction input that are within output key-range (#10835) Summary: When picking compaction input files, we sometimes stop picking a file that is fully included in the output key-range due to hitting max_compaction_bytes. Including these input files can potentially reduce WA at the expense of larger compactions. Larger compaction should be fine as files from input level are usually 10X smaller than files from output level. This PR adds a mutable CF option `ignore_max_compaction_bytes_for_input` that is enabled by default. We can remove this option once we are sure it is safe. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10835 Test Plan: - CI, a unit test on max_compaction_bytes fails before turning this flag off. - Benchmark does not show much difference in WA: `./db_bench --benchmarks=fillrandom,waitforcompaction,stats,levelstats -max_background_jobs=12 -num=2000000000 -target_file_size_base=33554432 --write_buffer_size=33554432` ``` main: ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 3/0 91.59 MB 0.8 70.9 0.0 70.9 200.8 129.9 0.0 1.5 25.2 71.2 2886.55 2463.45 9725 0.297 1093M 254K 0.0 0.0 L1 9/0 248.03 MB 1.0 392.0 129.8 262.2 391.7 129.5 0.0 3.0 69.0 68.9 5821.71 5536.90 804 7.241 6029M 5814K 0.0 0.0 L2 87/0 2.50 GB 1.0 537.0 128.5 408.5 533.8 125.2 0.7 4.2 69.5 69.1 7912.24 7323.70 4417 1.791 8299M 36M 0.0 0.0 L3 836/0 24.99 GB 1.0 616.9 118.3 498.7 594.5 95.8 5.2 5.0 66.9 64.5 9442.38 8490.28 4204 2.246 9749M 306M 0.0 0.0 L4 2355/0 62.95 GB 0.3 67.3 37.1 30.2 54.2 24.0 38.9 1.5 72.2 58.2 954.37 821.18 917 1.041 1076M 173M 0.0 0.0 Sum 3290/0 90.77 GB 0.0 1684.2 413.7 1270.5 1775.0 504.5 44.9 13.7 63.8 67.3 27017.25 24635.52 20067 1.346 26G 522M 0.0 0.0 Cumulative compaction: 1774.96 GB write, 154.29 MB/s write, 1684.19 GB read, 146.40 MB/s read, 27017.3 seconds This PR: ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 3/0 45.71 MB 0.8 72.9 0.0 72.9 202.8 129.9 0.0 1.6 25.4 70.7 2938.16 2510.36 9741 0.302 1124M 265K 0.0 0.0 L1 8/0 234.54 MB 0.9 384.5 129.8 254.7 384.2 129.6 0.0 3.0 69.0 68.9 5708.08 5424.43 791 7.216 5913M 5753K 0.0 0.0 L2 84/0 2.47 GB 1.0 543.1 128.6 414.5 539.9 125.4 0.7 4.2 69.6 69.2 7989.31 7403.13 4418 1.808 8393M 36M 0.0 0.0 L3 839/0 24.96 GB 1.0 615.6 118.4 497.2 593.2 96.0 5.1 5.0 66.6 64.1 9471.23 8489.31 4193 2.259 9726M 306M 0.0 0.0 L4 2360/0 63.04 GB 0.3 67.6 37.3 30.3 54.4 24.1 38.9 1.5 71.5 57.6 967.30 827.99 907 1.066 1080M 173M 0.0 0.0 Sum 3294/0 90.75 GB 0.0 1683.8 414.2 1269.6 1774.5 504.9 44.8 13.7 63.7 67.1 27074.08 24655.22 20050 1.350 26G 522M 0.0 0.0 Cumulative compaction: 1774.52 GB write, 157.09 MB/s write, 1683.77 GB read, 149.06 MB/s read, 27074.1 seconds ``` Reviewed By: ajkr Differential Revision: D40518319 Pulled By: cbi42 fbshipit-source-id: f4ea614bc0ebefe007ffaf05bb9aec9a8ca25b60 --- HISTORY.md | 1 + db/compaction/compaction_picker.cc | 6 ++++-- db/compaction/compaction_picker_test.cc | 3 +++ include/rocksdb/advanced_options.h | 11 +++++++++++ options/cf_options.cc | 7 +++++++ options/cf_options.h | 4 ++++ options/options.cc | 4 ++++ options/options_helper.cc | 2 ++ options/options_settable_test.cc | 1 + 9 files changed, 37 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c3dac417a..a959c26be 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ * FIFO compaction now supports migrating from a multi-level DB via DB::Open(). During the migration phase, FIFO compaction picker will: * picks the sst file with the smallest starting key in the bottom-most non-empty level. * Note that during the migration phase, the file purge order will only be an approximation of "FIFO" as files in lower-level might sometime contain newer keys than files in upper-level. +* Added an option `ignore_max_compaction_bytes_for_input` to ignore max_compaction_bytes limit when adding files to be compacted from input level. This should help reduce write amplification. The option is enabled by default. ### Bug Fixes * Fix a bug in io_uring_prep_cancel in AbortIO API for posix which expects sqe->addr to match with read request submitted and wrong paramter was being passed. diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index fde3dfcee..e0327a9e8 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -526,7 +526,8 @@ bool CompactionPicker::SetupOtherInputs( try_overlapping_inputs = false; } if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() && - output_level_inputs_size + expanded_inputs_size < limit && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && !AreFilesInCompaction(expanded_inputs.files)) { InternalKey new_start, new_limit; GetRange(expanded_inputs, &new_start, &new_limit); @@ -549,7 +550,8 @@ bool CompactionPicker::SetupOtherInputs( base_index, nullptr); expanded_inputs_size = TotalFileSize(expanded_inputs.files); if (expanded_inputs.size() > inputs->size() && - output_level_inputs_size + expanded_inputs_size < limit && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && !AreFilesInCompaction(expanded_inputs.files)) { expand_inputs = true; } diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 539d25a33..3a0e1b9ad 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -1340,6 +1340,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { ioptions_.compaction_pri = kMinOverlappingRatio; mutable_cf_options_.max_bytes_for_level_base = 10000000; mutable_cf_options_.max_bytes_for_level_multiplier = 10; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; // file 7 and 8 over lap with the same file, but file 8 is smaller so // it will be picked. @@ -2358,6 +2359,7 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) { TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { mutable_cf_options_.max_bytes_for_level_base = 1000000u; mutable_cf_options_.max_compaction_bytes = 800000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; ioptions_.level_compaction_dynamic_level_bytes = false; NewVersionStorage(6, kCompactionStyleLevel); // A compaction should be triggered and pick file 2 and 5. @@ -2384,6 +2386,7 @@ TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) { mutable_cf_options_.max_bytes_for_level_base = 800000u; mutable_cf_options_.max_compaction_bytes = 1000000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; ioptions_.level_compaction_dynamic_level_bytes = false; NewVersionStorage(6, kCompactionStyleLevel); // A compaction should be triggered and pick file 2 and 5. diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index a768f5dd4..5cd52223f 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -684,6 +684,17 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_compaction_bytes = 0; + // When setting up compaction input files, we ignore the + // `max_compaction_bytes` limit when pulling in input files that are entirely + // within output key range. + // + // Default: true + // + // Dynamically changeable through SetOptions() API + // We could remove this knob and always ignore the limit once it is proven + // safe. + bool ignore_max_compaction_bytes_for_input = true; + // All writes will be slowed down to at least delayed_write_rate if estimated // bytes needed to be compaction exceed this threshold. // diff --git a/options/cf_options.cc b/options/cf_options.cc index f34e8849e..dbf0bf9b0 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -270,6 +270,11 @@ static std::unordered_map {offsetof(struct MutableCFOptions, max_compaction_bytes), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"ignore_max_compaction_bytes_for_input", + {offsetof(struct MutableCFOptions, + ignore_max_compaction_bytes_for_input), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"expanded_compaction_factor", {0, OptionType::kInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, @@ -1034,6 +1039,8 @@ void MutableCFOptions::Dump(Logger* log) const { level0_stop_writes_trigger); ROCKS_LOG_INFO(log, " max_compaction_bytes: %" PRIu64, max_compaction_bytes); + ROCKS_LOG_INFO(log, " ignore_max_compaction_bytes_for_input: %s", + ignore_max_compaction_bytes_for_input ? "true" : "false"); ROCKS_LOG_INFO(log, " target_file_size_base: %" PRIu64, target_file_size_base); ROCKS_LOG_INFO(log, " target_file_size_multiplier: %d", diff --git a/options/cf_options.h b/options/cf_options.h index 5ac4c0c8e..050618eda 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -130,6 +130,8 @@ struct MutableCFOptions { level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), level0_stop_writes_trigger(options.level0_stop_writes_trigger), max_compaction_bytes(options.max_compaction_bytes), + ignore_max_compaction_bytes_for_input( + options.ignore_max_compaction_bytes_for_input), target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), @@ -192,6 +194,7 @@ struct MutableCFOptions { level0_slowdown_writes_trigger(0), level0_stop_writes_trigger(0), max_compaction_bytes(0), + ignore_max_compaction_bytes_for_input(true), target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), @@ -273,6 +276,7 @@ struct MutableCFOptions { int level0_slowdown_writes_trigger; int level0_stop_writes_trigger; uint64_t max_compaction_bytes; + bool ignore_max_compaction_bytes_for_input; uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; diff --git a/options/options.cc b/options/options.cc index 153c5f8ef..316d3550e 100644 --- a/options/options.cc +++ b/options/options.cc @@ -71,6 +71,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) max_bytes_for_level_multiplier_additional( options.max_bytes_for_level_multiplier_additional), max_compaction_bytes(options.max_compaction_bytes), + ignore_max_compaction_bytes_for_input( + options.ignore_max_compaction_bytes_for_input), soft_pending_compaction_bytes_limit( options.soft_pending_compaction_bytes_limit), hard_pending_compaction_bytes_limit( @@ -281,6 +283,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_compaction_bytes: %" PRIu64, max_compaction_bytes); + ROCKS_LOG_HEADER(log, " Options.ignore_max_compaction_bytes_for_input: %s", + ignore_max_compaction_bytes_for_input ? "true" : "false"); ROCKS_LOG_HEADER( log, " Options.arena_block_size: %" ROCKSDB_PRIszt, diff --git a/options/options_helper.cc b/options/options_helper.cc index 0d63edb89..59b01e6fb 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -229,6 +229,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.level0_slowdown_writes_trigger; cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->ignore_max_compaction_bytes_for_input = + moptions.ignore_max_compaction_bytes_for_input; cf_opts->target_file_size_base = moptions.target_file_size_base; cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index f0ee5ecc6..63e9721ca 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -486,6 +486,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_write_buffer_number=84;" "write_buffer_size=1653;" "max_compaction_bytes=64;" + "ignore_max_compaction_bytes_for_input=true;" "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;"