From 70242636820b45c74ca1879d92318467336eda10 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 22 Oct 2018 10:18:51 -0700 Subject: [PATCH] Dynamic level to adjust level multiplier when write is too heavy (#4338) Summary: Level compaction usually performs poorly when the writes are so heavy that the level targets can't be guaranteed. With this improvement, we improve level_compaction_dynamic_level_bytes = true so that in the write heavy cases, the level multiplier can be slightly adjusted based on the size of L0. We keep the behavior the same if the number of L0 files is under 2X compaction trigger and the total size is less than options.max_bytes_for_level_base, so that unless write is so heavy that compaction cannot keep up, the behavior doesn't change. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4338 Differential Revision: D9636782 Pulled By: siying fbshipit-source-id: e27fc17a7c29c84b00064cc17536a01dacef7595 --- HISTORY.md | 1 + db/version_set.cc | 44 ++++++++++++--- db/version_set.h | 3 ++ db/version_set_test.cc | 87 ++++++++++++++++++++++++++++++ include/rocksdb/advanced_options.h | 24 +++++++++ 5 files changed, 153 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ab004fced..7ee49bbc5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ### New Features * Introduced `CacheAllocator`, which lets the user specify custom allocator for memory in block cache. * Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. +* With level_compaction_dynamic_level_bytes = true, level multiplier may be adjusted automatically when Level 0 to 1 compaction lags behind. ### Bug Fixes * Fix corner case where a write group leader blocked due to write stall blocks other writers in queue with WriteOptions::no_slowdown set. 
diff --git a/db/version_set.cc b/db/version_set.cc index 25e551f48..aec3469f1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1112,6 +1112,7 @@ VersionStorageInfo::VersionStorageInfo( compaction_style_(compaction_style), files_(new std::vector<FileMetaData*>[num_levels_]), base_level_(num_levels_ == 1 ? -1 : 1), + level_multiplier_(0.0), files_by_compaction_pri_(num_levels_), level0_non_overlapping_(false), next_file_to_compact_by_size_(num_levels_), @@ -2379,9 +2380,12 @@ const char* VersionStorageInfo::LevelSummary( int len = 0; if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { assert(base_level_ < static_cast<int>(level_max_bytes_.size())); - len = snprintf(scratch->buffer, sizeof(scratch->buffer), - "base level %d max bytes base %" PRIu64 " ", base_level_, - level_max_bytes_[base_level_]); + if (level_multiplier_ != 0.0) { + len = snprintf( + scratch->buffer, sizeof(scratch->buffer), + "base level %d level multiplier %.2f max bytes base %" PRIu64 " ", + base_level_, level_multiplier_, level_max_bytes_[base_level_]); + } } len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files["); @@ -2517,7 +2521,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // No compaction from L1+ needs to be scheduled. 
base_level_ = num_levels_ - 1; } else { - uint64_t base_bytes_max = options.max_bytes_for_level_base; + uint64_t l0_size = 0; + for (const auto& f : files_[0]) { + l0_size += f->fd.GetFileSize(); + } + + uint64_t base_bytes_max = + std::max(options.max_bytes_for_level_base, l0_size); uint64_t base_bytes_min = static_cast<uint64_t>( base_bytes_max / options.max_bytes_for_level_multiplier); @@ -2557,11 +2567,33 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, } } + level_multiplier_ = options.max_bytes_for_level_multiplier; + assert(base_level_size > 0); + if (l0_size > base_level_size && + (l0_size > options.max_bytes_for_level_base || + static_cast<int>(files_[0].size() / 2) >= + options.level0_file_num_compaction_trigger)) { + // We adjust the base level according to actual L0 size, and adjust + // the level multiplier accordingly, when: + // 1. the L0 size is larger than level size base, or + // 2. number of L0 files reaches twice the L0->L1 compaction trigger + // We don't do this otherwise to keep the LSM-tree structure stable + // unless the L0 compaction is backlogged. + base_level_size = l0_size; + if (base_level_ == num_levels_ - 1) { + level_multiplier_ = 1.0; + } else { + level_multiplier_ = std::pow( + static_cast<double>(max_level_size) / + static_cast<double>(base_level_size), + 1.0 / static_cast<double>(num_levels_ - base_level_ - 1)); + } + } + uint64_t level_size = base_level_size; for (int i = base_level_; i < num_levels_; i++) { if (i > base_level_) { - level_size = MultiplyCheckOverflow( - level_size, options.max_bytes_for_level_multiplier); + level_size = MultiplyCheckOverflow(level_size, level_multiplier_); } // Don't set any level below base_bytes_max. Otherwise, the LSM can // assume an hourglass shape where L1+ sizes are smaller than L0. 
This diff --git a/db/version_set.h b/db/version_set.h index bfa8fc781..e3bea32f4 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -309,6 +309,7 @@ class VersionStorageInfo { } int base_level() const { return base_level_; } + double level_multiplier() const { return level_multiplier_; } // REQUIRES: lock is held // Set the index that is used to offset into files_by_compaction_pri_ to find @@ -435,6 +436,8 @@ class VersionStorageInfo { // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; + double level_multiplier_; + // A list for the same set of files that are stored in files_, // but files in each level are now sorted based on file // size. The file with the largest size is at the front. diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 59ab521bf..505da7775 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -268,6 +268,93 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) { ASSERT_EQ(0, logger_->log_count); } +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 40000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 1U, "1", "2", 10000U); + Add(0, 2U, "1", "2", 10000U); + Add(0, 3U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // L0 is not backlogged, so the level multiplier stays at the configured 5.0 + ASSERT_EQ(vstorage_.level_multiplier(), 5.0); + // Level sizes should be 40,000, 51,450, 257,250 + ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); + ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); +} + 
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 10000U); + Add(0, 12U, "1", "2", 10000U); + Add(0, 13U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_LT(vstorage_.level_multiplier(), 3.6); + ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + Add(0, 13U, "1", "2", 5000U); + Add(0, 14U, "1", "2", 5000U); + Add(0, 15U, "1", "2", 5000U); + Add(0, 16U, "1", "2", 5000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_LT(vstorage_.level_multiplier(), 3.6); + 
ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { // Test whether the overlaps are detected as expected Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 940a6f6b7..9f71cce63 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -413,6 +413,7 @@ struct AdvancedColumnFamilyOptions { // of the level. // At the same time max_bytes_for_level_multiplier and // max_bytes_for_level_multiplier_additional are still satisfied. + // (When L0 is too large, we make some adjustment. See below.) // // With this option on, from an empty DB, we make last level the base level, // which means merging L0 data into the last level, until it exceeds @@ -451,6 +452,29 @@ struct AdvancedColumnFamilyOptions { // max_bytes_for_level_base, for a more predictable LSM tree shape. It is // useful to limit worse case space amplification. // + // + // If the compaction from L0 lags behind, a special mode will be turned + // on to prioritize write amplification against max_bytes_for_level_multiplier + // or max_bytes_for_level_base. Whether L0 compaction lags behind is decided + // by the number of L0 files and the total L0 size. If the number of L0 files + // is at least double level0_file_num_compaction_trigger, or the total size + // exceeds max_bytes_for_level_base, this mode is on. The target of L1 grows + // to the actual data size in L0, and the target for each level is then set + // so that each level will have the same level multiplier. 
+ // + // For example, when L0 size is 100MB, the size of last level is 1600MB, + // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. + // Since L0 size is larger than max_bytes_for_level_base, this is the L0 + // compaction backlogged mode, so the L1 size is determined to be 100MB. + // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will + // be needed. The level multiplier will be calculated to be 4 and the three + // levels' target to be [100MB, 400MB, 1600MB]. + // + // In this mode, the number of levels will be no more than in normal mode, + // and the level multiplier will be lower. The write amplification will + // likely be reduced. + // + // // max_bytes_for_level_multiplier_additional is ignored with this flag on. // // Turning this feature on or off for an existing DB can cause unexpected