diff --git a/HISTORY.md b/HISTORY.md index 9a14a8bf4..abf86a430 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased +### New Features +* Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. ## 4.5.0 (2/5/2016) ### Public API Changes diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index f3801ae9f..98d80eea8 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -487,6 +487,87 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { } #endif // ROCKSDB_LITE +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "179", 50000000U); + Add(2, 7U, "180", "220", 50000000U); + Add(2, 8U, "321", "400", 50000000U); // File not overlapping + Add(2, 9U, "721", "800", 50000000U); + + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + Add(3, 30U, "750", "900", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Pick file 8 because it overlaps with 0 files on level 3. 
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "175", + 60000000U); // Overlaps with file 26, 27, total size 521M + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size + // 520M, the smallest overlapping + Add(2, 8U, "201", "300", + 60000000U); // Overlaps with file 28, 29, total size 521M + + Add(3, 26U, "100", "110", 261000000U); + Add(3, 26U, "150", "170", 261000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 261000000U); + Add(3, 30U, "321", "400", 261000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 7 because overlapping ratio is the smallest. + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + // file 7 and 8 overlap with the same file, but file 8 is larger so its + // overlapping ratio is smaller and it will be picked. 
+ Add(2, 6U, "150", "175", 60000000U); // Overlaps with file 26, 27 + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 27 + + Add(3, 26U, "160", "165", 260000000U); + Add(3, 26U, "166", "170", 260000000U); + Add(3, 27U, "180", "400", 260000000U); + Add(3, 28U, "401", "500", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the smallest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + // This test exhibits the bug where we don't properly reset parent_index in // PickCompaction() TEST_F(CompactionPickerTest, ParentIndexResetBug) { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index b900aa5f6..6dedccd82 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -2507,8 +2507,12 @@ TEST_P(CompactionPriTest, Test) { } } -INSTANTIATE_TEST_CASE_P(CompactionPriTest, CompactionPriTest, - ::testing::Values(0, 1, 2)); +INSTANTIATE_TEST_CASE_P( + CompactionPriTest, CompactionPriTest, + ::testing::Values(CompactionPri::kByCompensatedSize, + CompactionPri::kOldestLargestSeqFirst, + CompactionPri::kOldestSmallestSeqFirst, + CompactionPri::kMinOverlappingRatio)); #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 2e600f8a2..4cf493f91 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1377,6 +1377,47 @@ void VersionStorageInfo::UpdateNumNonEmptyLevels() { } } +namespace { +// Sort `temp` based on ratio of overlapping size over file size +void SortFileByOverlappingRatio( + const InternalKeyComparator& icmp, const std::vector& files, + const std::vector& next_level_files, + std::vector* temp) { + std::unordered_map file_to_order; 
+ auto next_level_it = next_level_files.begin(); + + for (auto& file : files) { + uint64_t overlapping_bytes = 0; + // Skip files in next level that lie entirely before the current file + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { + next_level_it++; + } + + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { + overlapping_bytes += (*next_level_it)->fd.file_size; + + if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { + // next level file crosses the large boundary of current file. + break; + } + next_level_it++; + } + + assert(file->fd.file_size != 0); + file_to_order[file->fd.GetNumber()] = + overlapping_bytes * 1024u / file->fd.file_size; + } + + std::sort(temp->begin(), temp->end(), + [&](const Fsize& f1, const Fsize& f2) -> bool { + return file_to_order[f1.file->fd.GetNumber()] < + file_to_order[f2.file->fd.GetNumber()]; + }); +} +} // namespace + void VersionStorageInfo::UpdateFilesByCompactionPri( const MutableCFOptions& mutable_cf_options) { if (compaction_style_ == kCompactionStyleFIFO || @@ -1419,6 +1460,10 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( return f1.file->smallest_seqno < f2.file->smallest_seqno; }); break; + case kMinOverlappingRatio: + SortFileByOverlappingRatio(*internal_comparator_, files_[level], + files_[level + 1], &temp); + break; default: assert(false); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0718b2342..c0fe0b81a 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -80,6 +80,9 @@ enum CompactionStyle : char { kCompactionStyleNone = 0x3, }; +// In level-based compaction, it determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. 
enum CompactionPri : char { // Slightly Priotize larger files by size compensated by #deletes kByCompensatedSize = 0x0, @@ -90,6 +93,10 @@ enum CompactionPri : char { // for the longest. If your updates are random across the key space, // write amplification is slightly better with this option. kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. It in many cases can optimize write + // amplification. + kMinOverlappingRatio = 0x3, }; enum class WALRecoveryMode : char {