From 92a9ccf1a63ecb883440f73df0a255a7b82ff1fd Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 11 Feb 2016 13:45:53 -0800 Subject: [PATCH] Add a new compaction priority that picks file whose overlapping ratio is smallest Summary: Add a new compaction priority as follows: for every file, we calculate the total size of the files overlapping with it in the next level, divided by the file's own size. The file with the smallest ratio will be picked first. My "db_bench --fillrandom" run shows about 5% less compaction than kOldestSmallestSeqFirst if --hard_pending_compaction_bytes_limit is set to a value that keeps the LSM tree in shape. If hard_pending_compaction_bytes_limit is not limited, the improvement is only 1% or 2%. Test Plan: Add a unit test Reviewers: andrewkr, kradhakrishnan, anthony, IslamAbdelRahman, yhchiang Reviewed By: yhchiang Subscribers: MarkCallaghan, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D54075 --- HISTORY.md | 2 + db/compaction_picker_test.cc | 81 ++++++++++++++++++++++++++++++++++++ db/db_compaction_test.cc | 8 +++- db/version_set.cc | 45 ++++++++++++++++++++ include/rocksdb/options.h | 7 ++++ 5 files changed, 141 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9a14a8bf4..abf86a430 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased +### New Features +* Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. 
## 4.5.0 (2/5/2016) ### Public API Changes diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index f3801ae9f..98d80eea8 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -487,6 +487,87 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { } #endif // ROCKSDB_LITE +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "179", 50000000U); + Add(2, 7U, "180", "220", 50000000U); + Add(2, 8U, "321", "400", 50000000U); // Overlaps with no file on level 3 + Add(2, 9U, "721", "800", 50000000U); + + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + Add(3, 30U, "750", "900", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Pick file 8 because it overlaps with 0 files on level 3. 
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "175", + 60000000U); // Overlaps with file 26, 27, total size 521M + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size + // 520M, the smallest overlapping + Add(2, 8U, "201", "300", + 60000000U); // Overlaps with file 28, 29, total size 521M + + Add(3, 26U, "100", "110", 261000000U); + Add(3, 26U, "150", "170", 261000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 261000000U); + Add(3, 30U, "321", "400", 261000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 7 because its overlapping ratio is the smallest. + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + // Files 7 and 8 overlap with the same file, but file 8 is larger, so its + // overlapping ratio is smaller and it will be picked. 
+ Add(2, 6U, "150", "175", 60000000U); // Overlaps with both files numbered 26 + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 27 + + Add(3, 26U, "160", "165", 260000000U); + Add(3, 26U, "166", "170", 260000000U); + Add(3, 27U, "180", "400", 260000000U); + Add(3, 28U, "401", "500", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because its overlapping ratio is the smallest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + // This test exhibits the bug where we don't properly reset parent_index in // PickCompaction() TEST_F(CompactionPickerTest, ParentIndexResetBug) { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index b900aa5f6..6dedccd82 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -2507,8 +2507,12 @@ TEST_P(CompactionPriTest, Test) { } } -INSTANTIATE_TEST_CASE_P(CompactionPriTest, CompactionPriTest, - ::testing::Values(0, 1, 2)); +INSTANTIATE_TEST_CASE_P( + CompactionPriTest, CompactionPriTest, + ::testing::Values(CompactionPri::kByCompensatedSize, + CompactionPri::kOldestLargestSeqFirst, + CompactionPri::kOldestSmallestSeqFirst, + CompactionPri::kMinOverlappingRatio)); #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 2e600f8a2..4cf493f91 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1377,6 +1377,47 @@ void VersionStorageInfo::UpdateNumNonEmptyLevels() { } } +namespace { +// Sort `temp` in ascending order of the ratio of overlapping size to file size +void SortFileByOverlappingRatio( + const InternalKeyComparator& icmp, const std::vector& files, + const std::vector& next_level_files, + std::vector* temp) { + std::unordered_map file_to_order; 
+ auto next_level_it = next_level_files.begin(); + + for (auto& file : files) { + uint64_t overlapping_bytes = 0; + // Skip files in the next level that are entirely smaller than the current file + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { + next_level_it++; + } + + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { + overlapping_bytes += (*next_level_it)->fd.file_size; + + if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { + // The next-level file extends past the largest key of the current file. + break; + } + next_level_it++; + } + + assert(file->fd.file_size != 0); + file_to_order[file->fd.GetNumber()] = + overlapping_bytes * 1024u / file->fd.file_size; + } + + std::sort(temp->begin(), temp->end(), + [&](const Fsize& f1, const Fsize& f2) -> bool { + return file_to_order[f1.file->fd.GetNumber()] < + file_to_order[f2.file->fd.GetNumber()]; + }); +} +} // namespace + void VersionStorageInfo::UpdateFilesByCompactionPri( const MutableCFOptions& mutable_cf_options) { if (compaction_style_ == kCompactionStyleFIFO || @@ -1419,6 +1460,10 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( return f1.file->smallest_seqno < f2.file->smallest_seqno; }); break; + case kMinOverlappingRatio: + SortFileByOverlappingRatio(*internal_comparator_, files_[level], + files_[level + 1], &temp); + break; default: assert(false); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0718b2342..c0fe0b81a 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -80,6 +80,9 @@ enum CompactionStyle : char { kCompactionStyleNone = 0x3, }; +// In level-based compaction, it determines which file from a level is +// picked to merge to the next level. We suggest trying +// kMinOverlappingRatio first when you tune your database. 
enum CompactionPri : char { // Slightly Priotize larger files by size compensated by #deletes kByCompensatedSize = 0x0, @@ -90,6 +93,10 @@ enum CompactionPri : char { // for the longest. If your updates are random across the key space, // write amplification is slightly better with this option. kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and their own size is the smallest. In many cases it can optimize write + // amplification. + kMinOverlappingRatio = 0x3, }; enum class WALRecoveryMode : char {