diff --git a/HISTORY.md b/HISTORY.md index ea937ea0c..845d7d324 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -27,6 +27,7 @@ * To minimize the internal fragmentation caused by the variable size of the compressed blocks in `CompressedSecondaryCache`, the original block is split according to the jemalloc bin size in `Insert()` and then merged back in `Lookup()`. * PosixLogger is removed and by default EnvLogger will be used for info logging. The behavior of the two loggers should be very similar when using the default Posix Env. * Remove [min|max]_timestamp from VersionEdit for now since they are not tracked in MANIFEST anyway but consume two empty std::string (up to 64 bytes) for each file. Should they be added back in the future, we should store them more compactly. +* Improve universal tiered storage compaction picker to avoid extra major compaction triggered by size amplification. If `preclude_last_level_data_seconds` is enabled, the size amplification is calculated within non-last-level data only, which skips the last level and uses the penultimate level as the size base. ### Performance Improvements * Instead of constructing `FragmentedRangeTombstoneList` during every read operation, it is now constructed once and stored in immutable memtables. This improves speed of querying range tombstones from immutable memtables. diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index fadfee0fa..ca1289da8 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -3175,6 +3175,148 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) { ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size()); } +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) { + // This test makes sure size amplification compaction could still be triggered + // if the last sorted run is not the last level. 
+ const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(4, 90U, "100", "600", 4 * kFileSize); + Add(5, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // Make sure it's a size amp compaction and includes all files + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kLastLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(4)->num_files, 1); + ASSERT_EQ(compaction->input_levels(5)->num_files, 1); +} + +TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) { + // This test makes sure the size amp calculation skips the last level (L6), so + // size amp compaction is not triggered, instead a size ratio compaction is + // triggered. 
+ const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + const int kPenultimateLevel = kLastLevel - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(5, 90U, "100", "600", 4 * kFileSize); + Add(6, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // Internally, size amp compaction is evaluated before size ratio compaction. + // Here to make sure it's size ratio compaction instead of size amp + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeRatio); + ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(5)->num_files, 0); + ASSERT_EQ(compaction->input_levels(6)->num_files, 0); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSupport) { + // Tiered compaction only supports level_num > 2 (otherwise the penultimate + // level is going to be level 0, which may make things more complicated), so + // when there are only 2 levels, still treat level 1 as the last level for + // size amp compaction + const uint64_t kFileSize = 100000; + const int kNumLevels = 2; + const int kLastLevel = kNumLevels - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + 
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(0, 90U, "100", "600", 4 * kFileSize); + Add(1, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // size amp compaction is still triggered even when preclude_last_level is set + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kLastLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 3); + ASSERT_EQ(compaction->input_levels(1)->num_files, 1); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) { + // This test makes sure the size amp compaction for tiered storage could still + // be triggered, but only for non-last-level files + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + const int kPenultimateLevel = kLastLevel - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 3 * kFileSize); + Add(0, 101U, "200", "400", 2 * kFileSize); + Add(5, 90U, "100", "600", 2 * kFileSize); + Add(6, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // It's a Size Amp compaction, but doesn't include the last 
level file and + // output to the penultimate level. + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kPenultimateLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(5)->num_files, 1); + ASSERT_EQ(compaction->input_levels(6)->num_files, 0); +} + class PerKeyPlacementCompactionPickerTest : public CompactionPickerTest, public testing::WithParamInterface { diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 686b40bd0..8a231f93f 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -106,6 +106,9 @@ class UniversalCompactionBuilder { Compaction* PickCompactionToOldest(size_t start_index, CompactionReason compaction_reason); + Compaction* PickCompactionWithSortedRunRange( + size_t start_index, size_t end_index, CompactionReason compaction_reason); + // Try to pick periodic compaction. The caller should only call it // if there is at least one file marked for periodic compaction. 
// null will be returned if no such a compaction can be formed @@ -811,8 +814,20 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); } + // size of the base sorted run for size amp calculation + uint64_t base_sr_size = sorted_runs_.back().size; + size_t sr_end_idx = sorted_runs_.size() - 1; + // If tiered compaction is enabled and the last sorted run is the last level + if (ioptions_.preclude_last_level_data_seconds > 0 && + ioptions_.num_levels > 2 && + sorted_runs_.back().level == ioptions_.num_levels - 1 && + sorted_runs_.size() > 1) { + sr_end_idx = sorted_runs_.size() - 2; + base_sr_size = sorted_runs_[sr_end_idx].size; + } + // keep adding up all the remaining files - for (size_t loop = start_index; loop + 1 < sorted_runs_.size(); loop++) { + for (size_t loop = start_index; loop < sr_end_idx; loop++) { sr = &sorted_runs_[loop]; if (sr->being_compacted) { // TODO with incremental compaction is supported, we might want to @@ -832,23 +847,20 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { return nullptr; } - // size of earliest file - uint64_t earliest_file_size = sorted_runs_.back().size; - // size amplification = percentage of additional size - if (candidate_size * 100 < ratio * earliest_file_size) { + if (candidate_size * 100 < ratio * base_sr_size) { ROCKS_LOG_BUFFER( log_buffer_, "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 " earliest-file-size %" PRIu64, - cf_name_.c_str(), candidate_size, earliest_file_size); + cf_name_.c_str(), candidate_size, base_sr_size); return nullptr; } else { ROCKS_LOG_BUFFER( log_buffer_, "[%s] Universal: size amp needed. 
newer-files-total-size %" PRIu64 " earliest-file-size %" PRIu64, - cf_name_.c_str(), candidate_size, earliest_file_size); + cf_name_.c_str(), candidate_size, base_sr_size); } // Since incremental compaction can't include more than second last // level, it can introduce penalty, compared to full compaction. We @@ -860,7 +872,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { // This also prevent the case when compaction falls behind and we // need to compact more levels for compactions to catch up. if (mutable_cf_options_.compaction_options_universal.incremental) { - double fanout_threshold = static_cast(earliest_file_size) / + double fanout_threshold = static_cast(base_sr_size) / static_cast(candidate_size) * 1.8; Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold); if (picked != nullptr) { @@ -869,8 +881,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { return picked; } } - return PickCompactionToOldest(start_index, - CompactionReason::kUniversalSizeAmplification); + return PickCompactionWithSortedRunRange( + start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification); } Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( @@ -1233,11 +1245,17 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { Compaction* UniversalCompactionBuilder::PickCompactionToOldest( size_t start_index, CompactionReason compaction_reason) { + return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1, + compaction_reason); +} + +Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( + size_t start_index, size_t end_index, CompactionReason compaction_reason) { assert(start_index < sorted_runs_.size()); // Estimate total file size uint64_t estimated_total_size = 0; - for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + for (size_t loop = start_index; loop <= end_index; loop++) { estimated_total_size += 
sorted_runs_[loop].size; } uint32_t path_id = @@ -1248,7 +1266,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToOldest( for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); } - for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + for (size_t loop = start_index; loop <= end_index; loop++) { auto& picking_sr = sorted_runs_[loop]; if (picking_sr.level == 0) { FileMetaData* f = picking_sr.file; @@ -1279,12 +1297,19 @@ Compaction* UniversalCompactionBuilder::PickCompactionToOldest( file_num_buf); } - // output files at the bottom most level, unless it's reserved - int output_level = vstorage_->num_levels() - 1; - // last level is reserved for the files ingested behind - if (ioptions_.allow_ingest_behind) { - assert(output_level > 1); - output_level--; + int output_level; + if (end_index == sorted_runs_.size() - 1) { + // output files at the last level, unless it's reserved + output_level = vstorage_->num_levels() - 1; + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind) { + assert(output_level > 1); + output_level--; + } + } else { + // if it's not including all sorted_runs, it can only output to the level + // above the `end_index + 1` sorted_run. + output_level = sorted_runs_[end_index + 1].level - 1; } // We never check size for diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index cd2582e8a..3e6e16bb0 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -885,6 +885,12 @@ struct AdvancedColumnFamilyOptions { // will be precluded from the last level. // 0 means no key will be precluded from the last level. // + // Note: when enabled, universal size amplification (controlled by option + // `compaction_options_universal.max_size_amplification_percent`) calculation + // will exclude the last level. 
As the feature is designed for tiered storage + // and a typical setting is that the last level is the cold tier (likely not + // size constrained), the size amp is going to be only for non-last levels. + // + // Default: 0 (disable the feature) + uint64_t preclude_last_level_data_seconds = 0;