From 375534752afdc7258361cecdd13a0e8bfa036ee5 Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Mon, 8 Aug 2022 14:34:36 -0700
Subject: [PATCH] Improve universal compaction picker for tiered compaction
 (#10467)

Summary:
The current universal compaction picker may cause an extra size amplification
compaction if there is more hot data on the penultimate level. Improve the
picker to skip the last level in the size amp calculation when tiered
compaction is enabled, which:
1. avoids extra, unnecessary size amp compactions;
2. intentionally skips size amp for the cold tier (the last level), which is
   typically not size constrained.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10467

Test Plan: CI and added unit test

Reviewed By: siying

Differential Revision: D38391350

Pulled By: jay-zhuang

fbshipit-source-id: 103c0731c05e0a7e8f267e9e829d022328be25d2
---
 HISTORY.md                                   |   1 +
 db/compaction/compaction_picker_test.cc      | 142 +++++++++++++++++++
 db/compaction/compaction_picker_universal.cc |  61 +++++---
 include/rocksdb/advanced_options.h           |   6 +
 4 files changed, 192 insertions(+), 18 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index ea937ea0c..845d7d324 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -27,6 +27,7 @@
 * To minimize the internal fragmentation caused by the variable size of the compressed blocks in `CompressedSecondaryCache`, the original block is split according to the jemalloc bin size in `Insert()` and then merged back in `Lookup()`.
 * PosixLogger is removed and by default EnvLogger will be used for info logging. The behavior of the two loggers should be very similar when using the default Posix Env.
 * Remove [min|max]_timestamp from VersionEdit for now since they are not tracked in MANIFEST anyway but consume two empty std::string (up to 64 bytes) for each file. Should they be added back in the future, we should store them more compactly.
+* Improve the universal tiered storage compaction picker to avoid extra major compactions triggered by size amplification. If `preclude_last_level_data_seconds` is enabled, the size amplification is calculated within the non-last-level data only, which skips the last level and uses the penultimate level as the size base.
 
 ### Performance Improvements
 * Instead of constructing `FragmentedRangeTombstoneList` during every read operation, it is now constructed once and stored in immutable memtables. This improves speed of querying range tombstones from immutable memtables.
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index fadfee0fa..ca1289da8 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -3175,6 +3175,148 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
   ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
 }
 
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
+  // This test makes sure a size amplification compaction can still be
+  // triggered if the last sorted run is not on the last level.
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 1 * kFileSize);
+  Add(0, 101U, "200", "400", 1 * kFileSize);
+  Add(4, 90U, "100", "600", 4 * kFileSize);
+  Add(5, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // Make sure it's a size amp compaction and includes all files
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeAmplification);
+  ASSERT_EQ(compaction->output_level(), kLastLevel);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+  ASSERT_EQ(compaction->input_levels(4)->num_files, 1);
+  ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
+  // This test makes sure the size amp calculation skips the last level (L6),
+  // so a size amp compaction is not triggered; instead, a size ratio
+  // compaction is triggered.
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+  const int kPenultimateLevel = kLastLevel - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 1 * kFileSize);
+  Add(0, 101U, "200", "400", 1 * kFileSize);
+  Add(5, 90U, "100", "600", 4 * kFileSize);
+  Add(6, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // Internally, size amp compaction is evaluated before size ratio compaction.
+  // Here we make sure it's a size ratio compaction instead of size amp.
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeRatio);
+  ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+  ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
+  ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSupport) {
+  // Tiered compaction only supports level_num > 2 (otherwise the penultimate
+  // level would be level 0, which may make things more complicated), so when
+  // there are only 2 levels, level 1 is still treated as the last level for
+  // the size amp compaction.
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 2;
+  const int kLastLevel = kNumLevels - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 1 * kFileSize);
+  Add(0, 101U, "200", "400", 1 * kFileSize);
+  Add(0, 90U, "100", "600", 4 * kFileSize);
+  Add(1, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // Size amp compaction is still triggered even if preclude_last_level is set
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeAmplification);
+  ASSERT_EQ(compaction->output_level(), kLastLevel);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 3);
+  ASSERT_EQ(compaction->input_levels(1)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
+  // This test makes sure the size amp compaction for tiered storage can still
+  // be triggered, but only includes the non-last-level files.
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+  const int kPenultimateLevel = kLastLevel - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 3 * kFileSize);
+  Add(0, 101U, "200", "400", 2 * kFileSize);
+  Add(5, 90U, "100", "600", 2 * kFileSize);
+  Add(6, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // It's a size amp compaction, but it doesn't include the last-level file
+  // and outputs to the penultimate level.
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeAmplification);
+  ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+  ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+  ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
 class PerKeyPlacementCompactionPickerTest
     : public CompactionPickerTest,
       public testing::WithParamInterface<bool> {
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 686b40bd0..8a231f93f 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -106,6 +106,9 @@ class UniversalCompactionBuilder {
   Compaction* PickCompactionToOldest(size_t start_index,
                                      CompactionReason compaction_reason);
 
+  Compaction* PickCompactionWithSortedRunRange(
+      size_t start_index, size_t end_index, CompactionReason compaction_reason);
+
   // Try to pick periodic compaction. The caller should only call it
   // if there is at least one file marked for periodic compaction.
   // null will be returned if no such a compaction can be formed
@@ -811,8 +814,20 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
         cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
   }
 
+  // size of the base sorted run for size amp calculation
+  uint64_t base_sr_size = sorted_runs_.back().size;
+  size_t sr_end_idx = sorted_runs_.size() - 1;
+  // If tiered compaction is enabled and the last sorted run is the last level
+  if (ioptions_.preclude_last_level_data_seconds > 0 &&
+      ioptions_.num_levels > 2 &&
+      sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+      sorted_runs_.size() > 1) {
+    sr_end_idx = sorted_runs_.size() - 2;
+    base_sr_size = sorted_runs_[sr_end_idx].size;
+  }
+
   // keep adding up all the remaining files
-  for (size_t loop = start_index; loop + 1 < sorted_runs_.size(); loop++) {
+  for (size_t loop = start_index; loop < sr_end_idx; loop++) {
     sr = &sorted_runs_[loop];
     if (sr->being_compacted) {
       // TODO with incremental compaction is supported, we might want to
@@ -832,23 +847,20 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
     return nullptr;
   }
 
-  // size of earliest file
-  uint64_t earliest_file_size = sorted_runs_.back().size;
-
   // size amplification = percentage of additional size
-  if (candidate_size * 100 < ratio * earliest_file_size) {
+  if (candidate_size * 100 < ratio * base_sr_size) {
     ROCKS_LOG_BUFFER(
         log_buffer_,
         "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
         " earliest-file-size %" PRIu64,
-        cf_name_.c_str(), candidate_size, earliest_file_size);
+        cf_name_.c_str(), candidate_size, base_sr_size);
     return nullptr;
   } else {
     ROCKS_LOG_BUFFER(
         log_buffer_,
         "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
         " earliest-file-size %" PRIu64,
-        cf_name_.c_str(), candidate_size, earliest_file_size);
+        cf_name_.c_str(), candidate_size, base_sr_size);
   }
 
   // Since incremental compaction can't include more than second last
   // level, it can introduce penalty, compared to full compaction. We
@@ -860,7 +872,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
   // This also prevent the case when compaction falls behind and we
   // need to compact more levels for compactions to catch up.
   if (mutable_cf_options_.compaction_options_universal.incremental) {
-    double fanout_threshold = static_cast<double>(earliest_file_size) /
+    double fanout_threshold = static_cast<double>(base_sr_size) /
                               static_cast<double>(candidate_size) * 1.8;
     Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
     if (picked != nullptr) {
@@ -869,8 +881,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
       return picked;
     }
   }
-  return PickCompactionToOldest(start_index,
-                                CompactionReason::kUniversalSizeAmplification);
+  return PickCompactionWithSortedRunRange(
+      start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
 }
 
 Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
@@ -1233,11 +1245,17 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
 
 Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
     size_t start_index, CompactionReason compaction_reason) {
+  return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1,
+                                          compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+    size_t start_index, size_t end_index, CompactionReason compaction_reason) {
   assert(start_index < sorted_runs_.size());
 
   // Estimate total file size
   uint64_t estimated_total_size = 0;
-  for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+  for (size_t loop = start_index; loop <= end_index; loop++) {
     estimated_total_size += sorted_runs_[loop].size;
   }
   uint32_t path_id =
@@ -1248,7 +1266,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
   for (size_t i = 0; i < inputs.size(); ++i) {
     inputs[i].level = start_level + static_cast<int>(i);
   }
-  for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+  for (size_t loop = start_index; loop <= end_index; loop++) {
     auto& picking_sr = sorted_runs_[loop];
     if (picking_sr.level == 0) {
       FileMetaData* f = picking_sr.file;
@@ -1279,12 +1297,19 @@ Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
                      file_num_buf);
   }
 
-  // output files at the bottom most level, unless it's reserved
-  int output_level = vstorage_->num_levels() - 1;
-  // last level is reserved for the files ingested behind
-  if (ioptions_.allow_ingest_behind) {
-    assert(output_level > 1);
-    output_level--;
+  int output_level;
+  if (end_index == sorted_runs_.size() - 1) {
+    // output files at the last level, unless it's reserved
+    output_level = vstorage_->num_levels() - 1;
+    // last level is reserved for the files ingested behind
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level > 1);
+      output_level--;
+    }
+  } else {
+    // if it doesn't include all sorted runs, it can only output to the level
+    // above the `end_index + 1` sorted run
+    output_level = sorted_runs_[end_index + 1].level - 1;
   }
 
   // We never check size for
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index cd2582e8a..3e6e16bb0 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -885,6 +885,12 @@ struct AdvancedColumnFamilyOptions {
   // will be precluded from the last level.
   // 0 means no key will be precluded from the last level.
   //
+  // Note: when enabled, universal size amplification (controlled by option
+  // `compaction_options_universal.max_size_amplification_percent`) is
+  // calculated excluding the last level. As the feature is designed for
+  // tiered storage, where the last level is typically a cold tier that is not
+  // size constrained, size amplification applies only to non-last levels.
+  //
   // Default: 0 (disable the feature)
   uint64_t preclude_last_level_data_seconds = 0;
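
A quick illustration of the new behavior. The standalone sketch below mirrors
the size-amp base selection in PickCompactionToReduceSizeAmp() above; it is a
minimal model, not the actual RocksDB internals: `SortedRunSketch` and
`SizeAmpCompactionNeeded` are hypothetical names, and it ignores runs that are
already being compacted as well as the incremental-compaction path.

#include <cstdint>
#include <vector>

// Minimal model of a sorted run: just the fields the base selection needs.
struct SortedRunSketch {
  int level;      // level the sorted run lives on (0 for L0 runs)
  uint64_t size;  // total bytes in the run
};

// Returns true if a size-amp compaction would be triggered. `runs` is ordered
// from newest (front) to oldest (back), like sorted_runs_ in the picker.
bool SizeAmpCompactionNeeded(const std::vector<SortedRunSketch>& runs,
                             int num_levels,
                             uint64_t preclude_last_level_data_seconds,
                             uint64_t max_size_amplification_percent) {
  if (runs.empty()) {
    return false;
  }
  // By default, the oldest (last) sorted run is the size base.
  size_t base_idx = runs.size() - 1;
  // Tiered compaction: if it is enabled, there are more than 2 levels, and
  // the oldest run sits on the last level, skip the (cold) last level and
  // use the penultimate sorted run as the base instead.
  if (preclude_last_level_data_seconds > 0 && num_levels > 2 &&
      runs.back().level == num_levels - 1 && runs.size() > 1) {
    base_idx = runs.size() - 2;
  }
  const uint64_t base_sr_size = runs[base_idx].size;
  // Add up all sorted runs newer than the base.
  uint64_t candidate_size = 0;
  for (size_t i = 0; i < base_idx; ++i) {
    candidate_size += runs[i].size;
  }
  // size amplification = additional size as a percentage of the base.
  return candidate_size * 100 >= max_size_amplification_percent * base_sr_size;
}

Plugging in the layout from UniversalSizeRatioTierCompactionLastLevel (runs of
1, 1, 4, and 2 file-size units, newest to oldest, with the oldest run on L6
and max_size_amplification_percent = 200): without the tiered adjustment, the
base is the 2-unit L6 run and the candidate size is 6 units (300% >= 200%, so
size amp fires); with the adjustment, the base is the 4-unit L5 run and the
candidate size is 2 units (50% < 200%), so the picker falls through to a size
ratio compaction, exactly as the test asserts.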