diff --git a/HISTORY.md b/HISTORY.md index ab004fced..7ee49bbc5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ### New Features * Introduced `CacheAllocator`, which lets the user specify custom allocator for memory in block cache. * Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. +* With level_compaction_dynamic_level_bytes = true, the level multiplier may be adjusted automatically when Level 0 to 1 compaction lags behind. ### Bug Fixes * Fix corner case where a write group leader blocked due to write stall blocks other writers in queue with WriteOptions::no_slowdown set. diff --git a/db/version_set.cc b/db/version_set.cc index 25e551f48..aec3469f1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1112,6 +1112,7 @@ VersionStorageInfo::VersionStorageInfo( compaction_style_(compaction_style), files_(new std::vector[num_levels_]), base_level_(num_levels_ == 1 ? 
-1 : 1), + level_multiplier_(0.0), files_by_compaction_pri_(num_levels_), level0_non_overlapping_(false), next_file_to_compact_by_size_(num_levels_), @@ -2379,9 +2380,12 @@ const char* VersionStorageInfo::LevelSummary( int len = 0; if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { assert(base_level_ < static_cast(level_max_bytes_.size())); - len = snprintf(scratch->buffer, sizeof(scratch->buffer), - "base level %d max bytes base %" PRIu64 " ", base_level_, - level_max_bytes_[base_level_]); + if (level_multiplier_ != 0.0) { + len = snprintf( + scratch->buffer, sizeof(scratch->buffer), + "base level %d level multiplier %.2f max bytes base %" PRIu64 " ", + base_level_, level_multiplier_, level_max_bytes_[base_level_]); + } } len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files["); @@ -2517,7 +2521,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // No compaction from L1+ needs to be scheduled. base_level_ = num_levels_ - 1; } else { - uint64_t base_bytes_max = options.max_bytes_for_level_base; + uint64_t l0_size = 0; + for (const auto& f : files_[0]) { + l0_size += f->fd.GetFileSize(); + } + + uint64_t base_bytes_max = + std::max(options.max_bytes_for_level_base, l0_size); uint64_t base_bytes_min = static_cast( base_bytes_max / options.max_bytes_for_level_multiplier); @@ -2557,11 +2567,33 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, } } + level_multiplier_ = options.max_bytes_for_level_multiplier; + assert(base_level_size > 0); + if (l0_size > base_level_size && + (l0_size > options.max_bytes_for_level_base || + static_cast(files_[0].size() / 2) >= + options.level0_file_num_compaction_trigger)) { + // We adjust the base level according to actual L0 size, and adjust + // the level multiplier accordingly, when: + // 1. the L0 size is larger than level size base, or + // 2. 
number of L0 files reaches twice the L0->L1 compaction trigger + // We don't do this otherwise to keep the LSM-tree structure stable + // unless the L0 compaction is backlogged. + base_level_size = l0_size; + if (base_level_ == num_levels_ - 1) { + level_multiplier_ = 1.0; + } else { + level_multiplier_ = std::pow( + static_cast(max_level_size) / + static_cast(base_level_size), + 1.0 / static_cast(num_levels_ - base_level_ - 1)); + } + } + uint64_t level_size = base_level_size; for (int i = base_level_; i < num_levels_; i++) { if (i > base_level_) { - level_size = MultiplyCheckOverflow( - level_size, options.max_bytes_for_level_multiplier); + level_size = MultiplyCheckOverflow(level_size, level_multiplier_); } // Don't set any level below base_bytes_max. Otherwise, the LSM can // assume an hourglass shape where L1+ sizes are smaller than L0. This diff --git a/db/version_set.h b/db/version_set.h index bfa8fc781..e3bea32f4 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -309,6 +309,7 @@ class VersionStorageInfo { } int base_level() const { return base_level_; } + double level_multiplier() const { return level_multiplier_; } // REQUIRES: lock is held // Set the index that is used to offset into files_by_compaction_pri_ to find @@ -435,6 +436,8 @@ class VersionStorageInfo { // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; + double level_multiplier_; + // A list for the same set of files that are stored in files_, // but files in each level are now sorted based on file // size. The file with the largest size is at the front. 
diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 59ab521bf..505da7775 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -268,6 +268,93 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) { ASSERT_EQ(0, logger_->log_count); } +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 40000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 1U, "1", "2", 10000U); + Add(0, 2U, "1", "2", 10000U); + Add(0, 3U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should stay at the configured 5.0 + ASSERT_EQ(vstorage_.level_multiplier(), 5.0); + // Level size should be 40,000, 51,450, 257,250 + ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); + ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 10000U); + Add(0, 12U, "1", "2", 10000U); + Add(0, 13U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + 
ASSERT_LT(vstorage_.level_multiplier(), 3.6); + ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + Add(0, 13U, "1", "2", 5000U); + Add(0, 14U, "1", "2", 5000U); + Add(0, 15U, "1", "2", 5000U); + Add(0, 16U, "1", "2", 5000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_LT(vstorage_.level_multiplier(), 3.6); + ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { // Test whether the overlaps are detected as expected Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 940a6f6b7..9f71cce63 100644 --- a/include/rocksdb/advanced_options.h +++ 
b/include/rocksdb/advanced_options.h @@ -413,6 +413,7 @@ struct AdvancedColumnFamilyOptions { // of the level. // At the same time max_bytes_for_level_multiplier and // max_bytes_for_level_multiplier_additional are still satisfied. + // (When L0 is too large, we make some adjustment. See below.) // // With this option on, from an empty DB, we make last level the base level, // which means merging L0 data into the last level, until it exceeds @@ -451,6 +452,29 @@ struct AdvancedColumnFamilyOptions { // max_bytes_for_level_base, for a more predictable LSM tree shape. It is // useful to limit worse case space amplification. // + // + // If the compaction from L0 is lagging behind, a special mode will be turned + // on to prioritize write amplification over max_bytes_for_level_multiplier + // or max_bytes_for_level_base. L0 compaction lag is detected by looking + // at number of L0 files and total L0 size. If number of L0 files is at least + // the double of level0_file_num_compaction_trigger, or the total size is + // at least max_bytes_for_level_base, this mode is on. The target of L1 grows + // to the actual data size in L0, and the target for each level is then set + // so that each level will have the same level multiplier. + // + // For example, when L0 size is 100MB, the size of last level is 1600MB, + // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. + // Since L0 size is larger than max_bytes_for_level_base, this is a L0 + // compaction backlogged mode. So the L1 size is determined to be 100MB. + // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will + // be needed. The level multiplier will be calculated to be 4 and the three + // levels' target to be [100MB, 400MB, 1600MB]. + // + // In this mode, the number of levels will be no more than the normal mode, + // and the level multiplier will be lower. The write amplification will + // likely be reduced. 
+ // + // // max_bytes_for_level_multiplier_additional is ignored with this flag on. // // Turning this feature on or off for an existing DB can cause unexpected