From 8c3a180e833755dab7d443b96a29e72b3f448546 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Thu, 4 May 2017 18:03:22 -0700
Subject: [PATCH] Set lower-bound on dynamic level sizes

Summary:
Changed dynamic leveling to stop setting the base level's size bound below
`max_bytes_for_level_base`.

Behavior for a config where `max_bytes_for_level_base ==
level0_file_num_compaction_trigger * write_buffer_size` and the same amount
of data in L0 and the base level:

- Before #2027, compaction scoring would favor the base level due to dividing
  by a size smaller than `max_bytes_for_level_base`.
- After #2027, L0 and Lbase get equal scores. The disadvantage is L0 is often
  compacted before reaching the num-files trigger since `write_buffer_size`
  can be bigger than the dynamically chosen base-level size. This increases
  write-amp.
- After this diff, L0 and Lbase still get equal scores. Now it takes
  `level0_file_num_compaction_trigger` files of size `write_buffer_size` to
  trigger L0 compaction by size, fixing the write-amp problem above.

Closes https://github.com/facebook/rocksdb/pull/2123

Differential Revision: D4861570

Pulled By: ajkr

fbshipit-source-id: 467ddef56ed1f647c14d86bb018bcb044c39b964
---
 db/compaction_picker_test.cc | 11 ++++++-----
 db/version_set.cc            |  6 +++++-
 db/version_set_test.cc       | 10 +++++-----
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc
index 8354894c4..ba5eab5a9 100644
--- a/db/compaction_picker_test.cc
+++ b/db/compaction_picker_test.cc
@@ -1013,7 +1013,8 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
 
   // Set Last level size 50000
   // num_levels - 1 target 5000
-  // num_levels - 2 is base level with taret 500
+  // num_levels - 2 is base level with target 1000 (rounded up to
+  // max_bytes_for_level_base).
   Add(num_levels - 1, 10U, "400", "500", 50000);
 
   Add(0, 1U, "150", "200", 200);
@@ -1022,16 +1023,16 @@
   Add(0, 5U, "150", "200", 200);
   Add(0, 6U, "150", "200", 200);
   // num_levels - 3 is over target by 100 + 1000
-  Add(num_levels - 3, 7U, "400", "500", 300);
-  Add(num_levels - 3, 8U, "600", "700", 300);
+  Add(num_levels - 3, 7U, "400", "500", 550);
+  Add(num_levels - 3, 8U, "600", "700", 550);
   // num_levels - 2 is over target by 1100 + 200
   Add(num_levels - 2, 9U, "150", "200", 5200);
 
   UpdateVersionStorageInfo();
 
-  // Merging to the second last level: (5200 / 1600 + 1) * 1100
+  // Merging to the second last level: (5200 / 2100 + 1) * 1100
   // Merging to the last level: (50000 / 6300 + 1) * 1300
-  ASSERT_EQ(1600u + 4675u + 11617u,
+  ASSERT_EQ(2100u + 3823u + 11617u,
             vstorage_->estimated_compaction_needed_bytes());
 }
 
diff --git a/db/version_set.cc b/db/version_set.cc
index 957a88701..58dc25c7e 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2113,7 +2113,11 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
         level_size = MultiplyCheckOverflow(
             level_size, options.max_bytes_for_level_multiplier);
       }
-      level_max_bytes_[i] = level_size;
+      // Don't set any level below base_bytes_max. Otherwise, the LSM can
+      // assume an hourglass shape where L1+ sizes are smaller than L0. This
+      // causes compaction scoring, which depends on level sizes, to favor L1+
+      // at the expense of L0, which may fill up and stall.
+      level_max_bytes_[i] = std::max(level_size, base_bytes_max);
     }
   }
 }
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index 7208c7507..c8c8541f7 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -168,13 +168,13 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic) {
   Add(5, 2U, "3", "4", 550U);
   vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
   ASSERT_EQ(0, logger_->log_count);
-  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 210U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
   ASSERT_EQ(vstorage_.base_level(), 4);
 
   Add(4, 3U, "3", "4", 550U);
   vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
   ASSERT_EQ(0, logger_->log_count);
-  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 210U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
   ASSERT_EQ(vstorage_.base_level(), 4);
 
   Add(3, 4U, "3", "4", 250U);
@@ -182,7 +182,7 @@
   vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
   ASSERT_EQ(1, logger_->log_count);
   ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
-  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 201U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
   ASSERT_EQ(vstorage_.base_level(), 3);
 
   Add(1, 6U, "3", "4", 5U);
@@ -193,7 +193,7 @@
   ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
   ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
   ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
-  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 201U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
   ASSERT_EQ(vstorage_.base_level(), 1);
 }
 
@@ -231,7 +231,7 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
   ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
   ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
   ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
-  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 3U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
   ASSERT_EQ(vstorage_.base_level(), 2);
   ASSERT_EQ(0, logger_->log_count);
 }
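
For context on the new lower bound, here is a minimal standalone C++ sketch of
the clamping logic that the version_set.cc hunk introduces. The function name
ComputeDynamicLevelTargets and its parameters are illustrative only (not
RocksDB API), and the sketch skips the overflow-checked multiply
(MultiplyCheckOverflow) used in the real code. The example numbers follow the
updated MaxBytesForLevelDynamic expectations, which appear to use
max_bytes_for_level_base = 1000 and a level multiplier of 5.

    // Illustrative sketch only -- mirrors the patched loop in
    // VersionStorageInfo::CalculateBaseBytes: each level above the base level
    // grows by the multiplier, but no stored target may drop below
    // base_bytes_max (i.e. max_bytes_for_level_base).
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    std::vector<uint64_t> ComputeDynamicLevelTargets(uint64_t base_level_size,
                                                     uint64_t base_bytes_max,
                                                     double multiplier,
                                                     int num_dynamic_levels) {
      std::vector<uint64_t> targets;
      uint64_t level_size = base_level_size;
      for (int i = 0; i < num_dynamic_levels; i++) {
        if (i > 0 && level_size > 0) {
          // The real code uses MultiplyCheckOverflow() here.
          level_size = static_cast<uint64_t>(level_size * multiplier);
        }
        // The patch's key change: clamp each stored target to base_bytes_max
        // so L1+ never ends up smaller than L0's effective size (the
        // "hourglass" shape). level_size itself keeps growing from its
        // unclamped value, so higher levels are unaffected once they exceed
        // base_bytes_max.
        targets.push_back(std::max(level_size, base_bytes_max));
      }
      return targets;
    }

    int main() {
      // Matches the updated expectations in version_set_test.cc: a dynamically
      // chosen base-level size of 201 is now reported as 1000, while the next
      // level's target stays 201 * 5 = 1005.
      for (uint64_t target : ComputeDynamicLevelTargets(201, 1000, 5.0, 2)) {
        std::cout << target << "\n";  // prints 1000, then 1005
      }
      return 0;
    }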