From b3c43a5b999b1631f259c3f5b9a19c3606d9f5cf Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Thu, 6 Apr 2023 11:20:43 -0700 Subject: [PATCH] Drain unnecessary levels when `level_compaction_dynamic_level_bytes=true` (#11340) Summary: When a user migrates to level compaction + `level_compaction_dynamic_level_bytes=true`, or when a DB shrinks, there can be unnecessary levels in the DB. Before this PR, there was no way to remove these levels except via a manual compaction. These extra unnecessary levels make it harder to guarantee max_bytes_for_level_multiplier and can cause extra space amp. This PR boosts the compaction score of these levels so that RocksDB can automatically drain them. Together with https://github.com/facebook/rocksdb/issues/11321, this makes migration to `level_compaction_dynamic_level_bytes=true` automatic, without needing the user to do a one-time full manual compaction. Credit: this PR is modified from https://github.com/facebook/rocksdb/issues/3921. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11340 Test Plan: - New unit tests - `python3 tools/db_crashtest.py whitebox --simple` which randomly sets level_compaction_dynamic_level_bytes in each run. Reviewed By: ajkr Differential Revision: D44563884 Pulled By: cbi42 fbshipit-source-id: e20d3620bd73dff22be18c5a91a07f340740bcc8 --- HISTORY.md | 3 +- db/compact_files_test.cc | 5 +- db/compaction/compaction.h | 13 +- db/compaction/compaction_iterator.cc | 21 +-- db/compaction/tiered_compaction_test.cc | 1 + db/db_compaction_test.cc | 202 +++++++++++++++++------- db/db_impl/db_impl_open.cc | 3 + db/db_test.cc | 4 +- db/version_set.cc | 69 ++++++-- db/version_set.h | 6 + db/version_set_test.cc | 31 ++++ include/rocksdb/advanced_options.h | 6 +- 12 files changed, 269 insertions(+), 95 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index a48a77a6a..6bcc48efd 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,8 +6,9 @@ ### Behavior changes * Changed default block cache size from an 8MB to 32MB LRUCache, which increases the default number of cache shards from 16 to 64. This change is intended to minimize cache mutex contention under stress conditions. See https://github.com/facebook/rocksdb/wiki/Block-Cache for more information. -* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes`. +* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes` (#11321). * User-provided `ReadOptions` take effect for more reads of non-`CacheEntryRole::kDataBlock` blocks. +* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). Together with #11321, this makes migrating other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true` automatic. In addition, a live DB that becomes smaller will now have unnecessary levels drained, which can help reduce read and space amp. ### Bug Fixes * In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size.
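Before the per-file diffs, here is a condensed, self-contained sketch of the two pieces this PR adds: detecting the lowest unnecessary level while computing level target sizes, and boosting the compaction score of such levels so background compaction drains them. This is illustrative only; the function names, the `kScoreScale` value, and the simplified loop bounds are assumptions for clarity, while the real logic lives in `VersionStorageInfo::CalculateBaseBytes()` and `VersionStorageInfo::ComputeCompactionScore()` in `db/version_set.cc`.

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative stand-in; the real scale constant lives in db/version_set.cc.
constexpr double kScoreScale = 10.0;

// Walk target sizes from the last level upward, dividing by the level size
// multiplier. Once a level's target size drops to or below base_bytes_min,
// that level and every level above it are unnecessary: the data fits in
// fewer levels. Returns -1 if all levels are needed.
int LowestUnnecessaryLevel(uint64_t max_level_size, int num_levels,
                           uint64_t base_bytes_min, double multiplier) {
  uint64_t cur_level_size = max_level_size;
  int lowest_unnecessary_level = -1;
  for (int i = num_levels - 2; i >= 1; --i) {
    cur_level_size = static_cast<uint64_t>(cur_level_size / multiplier);
    if (lowest_unnecessary_level == -1 && cur_level_size <= base_bytes_min) {
      lowest_unnecessary_level = i;  // deepest unnecessary level
    }
  }
  return lowest_unnecessary_level;
}

// Boost a non-empty unnecessary level's score above the threshold that
// triggers compaction, with a slightly larger boost for levels closer to
// the top of the LSM, so they are drained one by one.
double MaybeBoostScore(double score, int level, uint64_t level_bytes,
                       int lowest_unnecessary_level) {
  if (level_bytes > 0 && level <= lowest_unnecessary_level) {
    score = std::max(
        score,
        kScoreScale * (1.001 + 0.001 * (lowest_unnecessary_level - level)));
  }
  return score;
}
```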
diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index ad94ad340..eecab196f 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -120,7 +120,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { TEST_F(CompactFilesTest, MultipleLevel) { Options options; options.create_if_missing = true; - options.level_compaction_dynamic_level_bytes = true; + // Otherwise background compaction can happen to + // drain unnecessary levels + options.level_compaction_dynamic_level_bytes = false; options.num_levels = 6; // Add listener FlushedFileCollector* collector = new FlushedFileCollector(); @@ -181,7 +183,6 @@ TEST_F(CompactFilesTest, MultipleLevel) { for (int invalid_output_level = 0; invalid_output_level < 5; invalid_output_level++) { s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); - std::cout << s.ToString() << std::endl; ASSERT_TRUE(s.IsInvalidArgument()); } diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index ee8639601..4fa1dce71 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -546,13 +546,16 @@ struct PerKeyPlacementContext { const Slice value; const SequenceNumber seq_num; - bool output_to_penultimate_level; + bool& output_to_penultimate_level; PerKeyPlacementContext(int _level, Slice _key, Slice _value, - SequenceNumber _seq_num) - : level(_level), key(_key), value(_value), seq_num(_seq_num) { - output_to_penultimate_level = false; - } + SequenceNumber _seq_num, + bool& _output_to_penultimate_level) + : level(_level), + key(_key), + value(_value), + seq_num(_seq_num), + output_to_penultimate_level(_output_to_penultimate_level) {} }; #endif /* !NDEBUG */ diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index fcd40e116..c2ac7f692 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -1201,17 +1201,7 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() { void CompactionIterator::DecideOutputLevel() { assert(compaction_->SupportsPerKeyPlacement()); -#ifndef NDEBUG - // Could be overridden by unittest - PerKeyPlacementContext context(level_, ikey_.user_key, value_, - ikey_.sequence); - TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", - &context); - output_to_penultimate_level_ = context.output_to_penultimate_level; -#else output_to_penultimate_level_ = false; -#endif // NDEBUG - // if the key is newer than the cutoff sequence or within the earliest // snapshot, it should output to the penultimate level. if (ikey_.sequence > preclude_last_level_min_seqno_ || @@ -1219,6 +1209,17 @@ void CompactionIterator::DecideOutputLevel() { output_to_penultimate_level_ = true; } +#ifndef NDEBUG + // Could be overridden by unittest + PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence, + output_to_penultimate_level_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", + &context); + if (ikey_.sequence > earliest_snapshot_) { + output_to_penultimate_level_ = true; + } +#endif // NDEBUG + if (output_to_penultimate_level_) { // If it's decided to output to the penultimate level, but unsafe to do so, // still output to the last level.
For example, moving the data from a lower diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index 5100570ee..919716f9f 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -1111,6 +1111,7 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { options.num_levels = kNumLevels; options.statistics = CreateDBStatistics(); options.max_subcompactions = 10; + options.preclude_last_level_data_seconds = 10000; DestroyAndReopen(options); auto cmp = options.comparator; diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 4f6ca802c..119935c66 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -9118,11 +9118,19 @@ TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytes) { options.level_compaction_dynamic_level_bytes = false; options.num_levels = 6; options.compression = kNoCompression; + options.max_bytes_for_level_base = 1 << 20; + options.max_bytes_for_level_multiplier = 10; DestroyAndReopen(options); // put files in L0, L1 and L2 WriteOptions write_opts; ASSERT_OK(db_->Put(write_opts, Key(1), "val1")); + Random rnd(33); + // Fill L2 with size larger than max_bytes_for_level_base, + // so the level above it won't be drained. + for (int i = 2; i <= (1 << 10); ++i) { + ASSERT_OK(db_->Put(write_opts, Key(i), rnd.RandomString(2 << 10))); + } ASSERT_OK(Flush()); MoveFilesToLevel(2); ASSERT_OK(db_->Put(write_opts, Key(2), "val2")); @@ -9157,63 +9165,11 @@ TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytes) { // newly flushed file is also pushed down options.level_compaction_dynamic_level_bytes = true; Reopen(options); - ASSERT_EQ("0,0,0,1,1,2", FilesPerLevel()); - verify_db(); -} - -TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytesIngestBehind) { - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleLevel; - options.allow_ingest_behind = true; - options.level_compaction_dynamic_level_bytes = false; - options.num_levels = 6; - options.compression = kNoCompression; - DestroyAndReopen(options); - - // put files in L0, L1 and L2 - WriteOptions write_opts; - ASSERT_OK(db_->Put(write_opts, Key(1), "val1")); - ASSERT_OK(Flush()); - MoveFilesToLevel(2); - ASSERT_OK(db_->Put(write_opts, Key(2), "val2")); - ASSERT_OK(Flush()); - MoveFilesToLevel(2); - ASSERT_OK(db_->Put(write_opts, Key(1), "new_val1")); - ASSERT_OK(Flush()); - MoveFilesToLevel(1); - ASSERT_OK(db_->Put(write_opts, Key(3), "val3")); - ASSERT_OK(Flush()); - ASSERT_EQ("1,1,2", FilesPerLevel()); - auto verify_db = [&]() { - ASSERT_EQ(Get(Key(1)), "new_val1"); - ASSERT_EQ(Get(Key(2)), "val2"); - ASSERT_EQ(Get(Key(3)), "val3"); - }; - verify_db(); - - options.level_compaction_dynamic_level_bytes = true; - Reopen(options); - // note that last level (L6) should be empty - ASSERT_EQ("1,0,0,1,2", FilesPerLevel()); - verify_db(); - - // turning the options on and off should both be safe - options.level_compaction_dynamic_level_bytes = false; - Reopen(options); - MoveFilesToLevel(1); - ASSERT_EQ("0,1,0,1,2", FilesPerLevel()); - verify_db(); - - // newly flushed file is also pushed down - options.level_compaction_dynamic_level_bytes = true; - Reopen(options); - ASSERT_EQ("0,0,1,1,2", FilesPerLevel()); - verify_db(); - - // files will be pushed down to last level (L6) - options.allow_ingest_behind = false; - Reopen(options); - ASSERT_EQ("0,0,0,1,1,2", FilesPerLevel()); + // Files in L1 should be trivially moved down during DB opening. 
+ // The file should be moved to L3, and then may be drained and compacted to + // L4. So we just check L1 and L2 here. + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_EQ(0, NumTableFilesAtLevel(2)); verify_db(); } @@ -9253,6 +9209,138 @@ TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytesUCToLC) { ASSERT_EQ(expected_lsm, FilesPerLevel(1)); } +TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterMultiplierChanged) { + // When the level size multiplier increases such that fewer levels become + // necessary, unnecessary levels should be drained. + const int kBaseLevelBytes = 256 << 10; // 256KB + const int kFileBytes = 64 << 10; // 64KB + const int kInitMultiplier = 2, kChangedMultiplier = 10; + const int kNumFiles = 32; + const int kNumLevels = 5; + const int kValueBytes = 1 << 10; // 1KB + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kInitMultiplier; + options.num_levels = kNumLevels; + Reopen(options); + + // Initially we set up the LSM to look roughly as follows: + // + // L0: empty + // L1: 256KB + // ... + // L4: 1MB + Random rnd(301); + for (int file = 0; file < kNumFiles; ++file) { + for (int i = 0; i < kFileBytes / kValueBytes; ++i) { + ASSERT_OK(Put(Key(file * kFileBytes / kValueBytes + i), + rnd.RandomString(kValueBytes))); + } + ASSERT_OK(Flush()); + } + + int init_num_nonempty = 0; + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++init_num_nonempty; + } + } + + // After increasing the multiplier and running compaction, fewer levels are + // needed to hold all the data. Unnecessary levels should be drained. + ASSERT_OK(db_->SetOptions({{"max_bytes_for_level_multiplier", + std::to_string(kChangedMultiplier)}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + int final_num_nonempty = 0; + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++final_num_nonempty; + } + } + ASSERT_GT(init_num_nonempty, final_num_nonempty); +} + +TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterDBBecomesSmall) { + // When the DB becomes smaller, e.g., after a large chunk of data is deleted + // by DeleteRange(), unnecessary levels should be drained. + const int kBaseLevelBytes = 256 << 10; // 256KB + const int kFileBytes = 64 << 10; // 64KB + const int kMultiplier = 2; + const int kNumFiles = 32; + const int kNumLevels = 5; + const int kValueBytes = 1 << 10; // 1KB + const int kDeleteFileNum = 8; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kMultiplier; + options.num_levels = kNumLevels; + Reopen(options); + + // Initially we set up the LSM to look roughly as follows: + // + // L0: empty + // L1: 256KB + // ...
+ // L4: 1MB + Random rnd(301); + for (int file = 0; file < kNumFiles; ++file) { + for (int i = 0; i < kFileBytes / kValueBytes; ++i) { + ASSERT_OK(Put(Key(file * kFileBytes / kValueBytes + i), + rnd.RandomString(kValueBytes))); + } + ASSERT_OK(Flush()); + if (file == kDeleteFileNum) { + // Ensure the DeleteRange() call below only deletes data from the last + // level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(kNumLevels - 1), kDeleteFileNum + 1); + } + } + + int init_num_nonempty = 0; + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++init_num_nonempty; + } + } + + // Disable auto compaction before the CompactRange() call below + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); + // Delete keys within the first kDeleteFileNum files' key ranges. + // This should reduce DB size enough such that there is now + // an unneeded level. + std::string begin = Key(0); + std::string end = Key(kDeleteFileNum * kFileBytes / kValueBytes); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), begin, end)); + Slice begin_slice = begin; + Slice end_slice = end; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_slice, &end_slice)); + int after_delete_range_nonempty = 0; + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++after_delete_range_nonempty; + } + } + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + int final_num_nonempty = 0; + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++final_num_nonempty; + } + } + ASSERT_GE(init_num_nonempty, after_delete_range_nonempty); + ASSERT_GT(after_delete_range_nonempty, final_num_nonempty); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 478384c91..1758f4cc6 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -555,6 +555,9 @@ Status DBImpl::Recover( !cfd->GetLatestMutableCFOptions()->disable_auto_compactions) { int to_level = cfd->ioptions()->num_levels - 1; // last level is reserved + // allow_ingest_behind does not support Level Compaction, + // and per_key_placement can cause an infinite compaction loop for Level + // Compaction. Adjust to_level here just to be safe.
if (cfd->ioptions()->allow_ingest_behind || cfd->ioptions()->preclude_last_level_data_seconds > 0) { to_level -= 1; diff --git a/db/db_test.cc b/db/db_test.cc index 05ee14fe2..7720cbbe4 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -353,7 +353,7 @@ TEST_F(DBTest, MixedSlowdownOptionsInQueue) { for (int i = 0; i < 2; ++i) { threads.emplace_back(write_no_slowdown_func); } - // Sleep for 2s to allow the threads to insert themselves into the + // Sleep for 3s to allow the threads to insert themselves into the // write queue env_->SleepForMicroseconds(3000000ULL); } @@ -424,7 +424,7 @@ TEST_F(DBTest, MixedSlowdownOptionsStop) { for (int i = 0; i < 2; ++i) { threads.emplace_back(write_no_slowdown_func); } - // Sleep for 2s to allow the threads to insert themselves into the + // Sleep for 3s to allow the threads to insert themselves into the // write queue env_->SleepForMicroseconds(3000000ULL); } diff --git a/db/version_set.cc b/db/version_set.cc index d7500ff08..8610e326f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3373,7 +3373,7 @@ void VersionStorageInfo::ComputeCompactionScore( } } } - } else { + } else { // level > 0 // Compute the ratio of current size to size limit. uint64_t level_bytes_no_compacting = 0; uint64_t level_total_bytes = 0; @@ -3383,21 +3383,36 @@ void VersionStorageInfo::ComputeCompactionScore( level_bytes_no_compacting += f->compensated_file_size; } } - if (!immutable_options.level_compaction_dynamic_level_bytes || - level_bytes_no_compacting < MaxBytesForLevel(level)) { + if (!immutable_options.level_compaction_dynamic_level_bytes) { score = static_cast<double>(level_bytes_no_compacting) / MaxBytesForLevel(level); } else { - // If there are a large mount of data being compacted down to the - // current level soon, we would de-prioritize compaction from - // a level where the incoming data would be a large ratio. We do - // it by dividing level size not by target level size, but - // the target size and the incoming compaction bytes. - score = static_cast<double>(level_bytes_no_compacting) / - (MaxBytesForLevel(level) + total_downcompact_bytes) * - kScoreScale; + if (level_bytes_no_compacting < MaxBytesForLevel(level)) { + score = static_cast<double>(level_bytes_no_compacting) / + MaxBytesForLevel(level); + } else { + // If there is a large amount of data being compacted down to the + // current level soon, we de-prioritize compaction from a level + // where the incoming data would be a large ratio. We do this by + // dividing the level size not by the target level size alone, but + // by the sum of the target size and the incoming compaction bytes. + score = static_cast<double>(level_bytes_no_compacting) / + (MaxBytesForLevel(level) + total_downcompact_bytes) * + kScoreScale; + } + // Drain unnecessary levels, but with lower priority compared to + // when L0 is eligible. Only non-empty levels can be unnecessary. + // If there are no unnecessary levels, lowest_unnecessary_level_ = -1.
+ if (level_bytes_no_compacting > 0 && + level <= lowest_unnecessary_level_) { + score = std::max( + score, kScoreScale * + (1.001 + 0.001 * (lowest_unnecessary_level_ - level))); + } } - if (level_total_bytes > MaxBytesForLevel(level)) { + if (level <= lowest_unnecessary_level_) { + total_downcompact_bytes += level_total_bytes; + } else if (level_total_bytes > MaxBytesForLevel(level)) { total_downcompact_bytes += static_cast<double>(level_total_bytes - MaxBytesForLevel(level)); } @@ -4470,6 +4485,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, } } } else { + assert(ioptions.compaction_style == kCompactionStyleLevel); uint64_t max_level_size = 0; int first_non_empty_level = -1; @@ -4494,11 +4510,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, level_max_bytes_[i] = std::numeric_limits<uint64_t>::max(); } + lowest_unnecessary_level_ = -1; if (max_level_size == 0) { // No data for L1 and up. L0 compacts to last level directly. // No compaction from L1+ needs to be scheduled. base_level_ = num_levels_ - 1; } else { + assert(first_non_empty_level >= 1); uint64_t base_bytes_max = options.max_bytes_for_level_base; uint64_t base_bytes_min = static_cast<uint64_t>( base_bytes_max / options.max_bytes_for_level_multiplier); @@ -4509,20 +4527,41 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, // Round up after dividing cur_level_size = static_cast<uint64_t>( cur_level_size / options.max_bytes_for_level_multiplier); + if (lowest_unnecessary_level_ == -1 && + cur_level_size <= base_bytes_min && + (ioptions.preclude_last_level_data_seconds == 0 || + i < num_levels_ - 2)) { + // When per_key_placement is enabled, the penultimate level is + // necessary. + lowest_unnecessary_level_ = i; + } } // Calculate base level and its size. uint64_t base_level_size; if (cur_level_size <= base_bytes_min) { + // If per_key_placement is not enabled, + // either there is only one non-empty level after level 0, + // which can be less than base_bytes_min AND necessary, + // or there is some unnecessary level. + assert(first_non_empty_level == num_levels_ - 1 || + ioptions.preclude_last_level_data_seconds > 0 || + lowest_unnecessary_level_ != -1); // Case 1. If we make target size of last level to be max_level_size, // target size of the first non-empty level would be smaller than // base_bytes_min. We set it be base_bytes_min. base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_INFO(ioptions.logger, - "More existing levels in DB than needed. " - "max_bytes_for_level_multiplier may not be guaranteed."); + if (base_level_ < num_levels_ - 1) { + ROCKS_LOG_INFO( + ioptions.logger, + "More existing levels in DB than needed: all non-zero " + "levels <= level %d are unnecessary. " + "max_bytes_for_level_multiplier may not be guaranteed.", + lowest_unnecessary_level_); + } } else { + assert(lowest_unnecessary_level_ == -1); // Find base level (where L0 data is compacted to). base_level_ = first_non_empty_level; while (base_level_ > 1 && cur_level_size > base_bytes_max) { diff --git a/db/version_set.h b/db/version_set.h index bc5b4177b..8d0633ea1 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -650,6 +650,12 @@ class VersionStorageInfo { // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; + // Applies to level compaction when + // `level_compaction_dynamic_level_bytes=true`. All non-empty levels <= + // lowest_unnecessary_level_ are not needed and will be drained automatically.
+ // -1 if there is no unnecessary level. + int lowest_unnecessary_level_; + double level_multiplier_; // A list for the same set of files that are stored in files_, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index a83fabcd0..0815d4cab 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -454,6 +454,37 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2)); } +TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // Create a few unnecessary levels. + // See if score is calculated correctly. + Add(5, 1U, "1", "2", 2000U); // target size 1010000 + Add(4, 2U, "1", "2", 200U); // target size 101000 + // Unnecessary levels + Add(3, 3U, "1", "2", 100U); // target size 10100 + // Level 2: target size 1010 + Add(1, 4U, "1", "2", + 10U); // target size 1000 = max(base_bytes_min + 1, base_bytes_max) + + UpdateVersionStorageInfo(); + + ASSERT_EQ(1, vstorage_.base_level()); + ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1)); + ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3)); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + + // Tests that levels 1 and 3 have the two highest compaction scores. + // Levels 1 and 3 are much smaller than their target sizes, so their + // high scores come from the boost for unnecessary levels, not from size. + ASSERT_EQ(1, vstorage_.CompactionScoreLevel(0)); + ASSERT_GT(vstorage_.CompactionScore(0), 10); + ASSERT_EQ(3, vstorage_.CompactionScoreLevel(1)); + ASSERT_GT(vstorage_.CompactionScore(1), 10); +} + TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { // Test whether the overlaps are detected as expected Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 07261bd06..630d1df95 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -592,7 +592,7 @@ struct AdvancedColumnFamilyOptions { // and max_bytes_for_level_base=10MB. // Target sizes of level 1 to 5 starts with: // [- - - - 10MB] - // with base level is level. Target sizes of level 1 to 4 are not applicable + // with the base level being level 5. Target sizes of level 1 to 4 are not applicable // because they will not be used. // Until the size of Level 5 grows to more than 10MB, say 11MB, we make // base target to level 4 and now the targets looks like: @@ -665,8 +665,8 @@ struct AdvancedColumnFamilyOptions { // computed based on this feature) in the LSM after a user migrates to turn // this feature on. This is especially likely when a user migrates from // leveled compaction with a smaller multiplier or from universal compaction. - // A full manual compaction is needed to drain these levels explicitly. - // + // RocksDB will gradually drain these unnecessary levels by compacting files + // down the LSM. // // Default: false bool level_compaction_dynamic_level_bytes = false;
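As a usage note, after this PR and #11321 migration amounts to flipping the option and letting background compaction run. Below is a minimal sketch against the public C++ API; the DB path is a placeholder, and polling `GetProperty` is just one illustrative way to observe the draining.

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  // Migrating an existing DB (e.g. from universal compaction, or from level
  // compaction with a smaller multiplier) to dynamic level sizes: just
  // enable the option and reopen. No one-time full manual compaction needed.
  options.compaction_style = rocksdb::kCompactionStyleLevel;
  options.level_compaction_dynamic_level_bytes = true;

  rocksdb::DB* db = nullptr;
  // On open, #11321 trivially moves existing levels down to the bottom of
  // the LSM; with #11340, background compaction then gradually drains any
  // levels that are no longer needed.
  rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/existing_db", &db);
  assert(s.ok());

  // Optionally observe the drain, e.g. via per-level file counts.
  std::string num_files;
  db->GetProperty("rocksdb.num-files-at-level1", &num_files);

  delete db;
  return 0;
}
```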