From a9565ccb26fb2b12b6bd384fc1d313fe24b7e81c Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 5 Jul 2022 10:10:37 -0700 Subject: [PATCH] Try to trivial move more than one files (#10190) Summary: In leveled compaction, try to trivial move more than one files if possible, up to 4 files or max_compaction_bytes. This is to allow higher write throughput for some use cases where data is loaded in sequential order, where applying compaction results is the bottleneck. When picking up a file to compact and it doesn't have overlapping files in the next level, try to expand to the next file if there is still no overlapping. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10190 Test Plan: Add some unit tests. For performance, Try to run ./db_bench_multi_move --benchmarks=fillseq --compression_type=lz4 --write_buffer_size=5000000 --num=100000000 --value_size=1000 -level_compaction_dynamic_level_bytes Together with https://github.com/facebook/rocksdb/pull/10188 , stalling will be eliminated in this benchmark. Reviewed By: jay-zhuang Differential Revision: D37230647 fbshipit-source-id: 42b260f545c46abc5d90335ac2bbfcd09602b549 --- HISTORY.md | 1 + db/compaction/compaction_picker_level.cc | 73 +++++++- db/compaction/compaction_picker_test.cc | 206 ++++++++++++++++++++++- db/db_block_cache_test.cc | 1 + db/db_range_del_test.cc | 2 + db/db_test2.cc | 2 +- db/version_set.cc | 9 + 7 files changed, 280 insertions(+), 14 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index cb2283664..8b2aaa130 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -77,6 +77,7 @@ ### Performance Improvements * Rather than doing total sort against all files in a level, SortFileByOverlappingRatio() to only find the top 50 files based on score. This can improve write throughput for the use cases where data is loaded in increasing key order and there are a lot of files in one LSM-tree, where applying compaction results is the bottleneck. 
* In leveled compaction, L0->L1 trivial move will allow more than one file to be moved in one compaction. This would allow L0 files to be moved down faster when data is loaded in sequential order, making slowdown or stop condition harder to hit. Also seek L0->L1 trivial move when only some files qualify. +* In leveled compaction, try to trivial move more than one files if possible, up to 4 files or max_compaction_bytes. This is to allow higher write throughput for some use cases where data is loaded in sequential order, where applying compaction results is the bottleneck. ## 7.3.0 (05/20/2022) ### Bug Fixes diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 1d19ae84f..3c82fa5f8 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -103,6 +103,10 @@ class LevelCompactionBuilder { // otherwise, returns false. bool PickIntraL0Compaction(); + // Return true if TrivialMove is extended. `start_index` is the index of + // the initial file picked, which should already be in `start_level_inputs_`. + bool TryExtendNonL0TrivialMove(int start_index); + // Picks a file from level_files to compact. // level_files is a vector of (level, file metadata) in ascending order of // level. If compact_to_next_level is true, compact the file to the next @@ -496,6 +500,55 @@ bool LevelCompactionBuilder::TryPickL0TrivialMove() { return false; } +bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) { + if (start_level_inputs_.size() == 1 && + (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) && + (mutable_cf_options_.compression_per_level.empty())) { + // Only file of `index`, and it is likely a trivial move. Try to + // expand if it is still a trivial move, but not beyond + // max_compaction_bytes or 4 files, so that we don't create too + // much compaction pressure for the next level. 
+ // Ignore if there is more than one DB path, as it would be hard + // to predict whether it is a trivial move. + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(start_level_); + const size_t kMaxMultiTrivialMove = 4; + FileMetaData* initial_file = start_level_inputs_.files[0]; + size_t total_size = initial_file->fd.GetFileSize(); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + for (int i = start_index + 1; + i < static_cast<int>(level_files.size()) && + start_level_inputs_.size() < kMaxMultiTrivialMove; + i++) { + FileMetaData* next_file = level_files[i]; + if (next_file->being_compacted) { + break; + } + vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest), + &(next_file->largest), + &output_level_inputs.files); + if (!output_level_inputs.empty()) { + break; + } + if (i < static_cast<int>(level_files.size()) - 1 && + compaction_picker_->icmp()->user_comparator()->Compare( + next_file->largest.user_key(), + level_files[i + 1]->smallest.user_key()) == 0) { + // Not a clean cut after adding the next file. Skip. + break; + } + total_size += next_file->fd.GetFileSize(); + if (total_size > mutable_cf_options_.max_compaction_bytes) { + break; + } + start_level_inputs_.files.push_back(next_file); + } + return start_level_inputs_.size() > 1; + } + return false; +} + bool LevelCompactionBuilder::PickFileToCompact() { // level 0 files are overlapping. So we cannot pick more // than one concurrent compactions at this level. 
This @@ -574,15 +627,19 @@ bool LevelCompactionBuilder::PickFileToCompact() { output_level_inputs.level = output_level_; vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, &output_level_inputs.files); - if (!output_level_inputs.empty() && - !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &output_level_inputs)) { - start_level_inputs_.clear(); - // The same reason as above to ensure the round-robin compaction - if (ioptions_.compaction_pri == kRoundRobin) { - return false; + if (output_level_inputs.empty()) { + if (TryExtendNonL0TrivialMove(index)) { + break; + } + } else { + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + if (ioptions_.compaction_pri == kRoundRobin) { + return false; + } + continue; } - continue; } base_index_ = index; break; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 5accd231a..96780ca1f 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -12,6 +12,7 @@ #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" #include "db/compaction/file_pri.h" +#include "rocksdb/advanced_options.h" #include "table/unique_id_impl.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -2383,6 +2384,197 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOff) { ASSERT_FALSE(compaction->IsTrivialMove()); } +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "200", 3001U); + Add(2, 3U, "301", "350", 3000U); + Add(2, 4U, "451", "400", 3000U); + Add(2, 5U, "551", 
"500", 3000U); + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 16U, "170", "180", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(4, compaction->num_input_files(0)); + ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "160", 3001U); + Add(2, 3U, "161", "179", 3000U); + Add(2, 4U, "220", "400", 3000U); + Add(2, 5U, "551", "500", 3000U); + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 
0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + // Even if consecutive files can be trivial moved, we don't pick them + // since in case trivial move can't be issued for a reason, we cannot + // fall back to normal compactions. + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "160", 3001U); + Add(2, 5U, "551", "500", 3000U); + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 4000U); + Add(2, 2U, "151", "160", 4001U); + Add(2, 3U, "161", "179", 4000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + 
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + // File 4 and 5 aren't clean cut, so only 2 and 3 are picked. + Add(2, 1U, "100", "150", 4000U); + Add(2, 2U, "151", "160", 4001U); + Add(2, 3U, "161", "179", 4000U); + Add(2, 4U, "180", "185", 4000U); + Add(2, 5U, "185", "190", 4000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "200", 3001U); + Add(2, 3U, "301", "350", 3000U); + 
Add(2, 4U, "451", "400", 3000U); + Add(2, 5U, "551", "500", 3000U); + file_map_[5U].first->being_compacted = true; + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 16U, "170", "180", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + // Since the next file is being compacted. Stopping at 3 and 4. + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); +} + TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { NewVersionStorage(6, kCompactionStyleLevel); mutable_cf_options_.max_compaction_bytes = 100000000000u; @@ -2397,7 +2589,11 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { Add(1 /* level */, 4U /* file_number */, "250" /* smallest */, "299" /* largest */, 700000000U /* file_size */); Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, - "199" /* largest */, 1U /* file_size */); + "199" /* largest */, 100U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "200" /* smallest */, + "240" /* largest */, 1U /* file_size */); + Add(2 /* level */, 7U /* file_number */, "260" /* smallest */, + "270" /* largest */, 1U /* file_size */); file_map_[5U].first->being_compacted = true; UpdateVersionStorageInfo(); @@ -2406,9 +2602,9 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(2U, 
compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); - ASSERT_EQ(0U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_files(1)); ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */)); @@ -2416,9 +2612,9 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); - ASSERT_EQ(0U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_files(1)); ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 0594c7950..dc7fedfd4 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -676,6 +676,7 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) { Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.max_compaction_bytes = 2000; BlockBasedTableOptions table_options; table_options.block_cache = NewLRUCache(1 << 25, 0, false); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 66fe2e892..7ecf3e4bd 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -401,6 +401,7 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { options.num_levels = 3; options.target_file_size_base = kFileBytes; options.target_file_size_multiplier = 1; + options.max_compaction_bytes = 1500; Reopen(options); Random rnd(301); @@ -1028,6 +1029,7 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { options.memtable_factory.reset( test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); 
options.target_file_size_base = kValueBytes; + options.max_compaction_bytes = 1500; // i == 0: CompactFiles // i == 1: CompactRange // i == 2: automatic compaction diff --git a/db/db_test2.cc b/db/db_test2.cc index 22d2fbcf9..a6d6813b8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2371,7 +2371,7 @@ TEST_F(DBTest2, MaxCompactionBytesTest) { GenerateNewRandomFile(&rnd); } CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,0,8", FilesPerLevel(0)); diff --git a/db/version_set.cc b/db/version_set.cc index f07f12841..be2f156d5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3180,6 +3180,15 @@ void SortFileByOverlappingRatio( std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(), [&](const Fsize& f1, const Fsize& f2) -> bool { + // If score is the same, pick file with smaller keys. + // This makes the algorithm more deterministic, and also + // help the trivial move case to have more files to + // extend. + if (file_to_order[f1.file->fd.GetNumber()] == + file_to_order[f2.file->fd.GetNumber()]) { + return icmp.Compare(f1.file->smallest, + f2.file->smallest) < 0; + } return file_to_order[f1.file->fd.GetNumber()] < file_to_order[f2.file->fd.GetNumber()]; });