From 23fa5b7789d6acd0c211d6bdd41448bbf1513bb6 Mon Sep 17 00:00:00 2001 From: Jay Zhuang Date: Thu, 6 Oct 2022 15:54:58 -0700 Subject: [PATCH] Use `sstableKeyCompare()` for compaction output boundary check (#10763) Summary: To make it consistent with the compaction picker which uses the `sstableKeyCompare()` to pick the overlap files. For example, without this change, it may cut L1 files like: ``` L1: [2-21] [22-30] L2: [1-10] [21-30] ``` Because "21" on L1 is smaller than "21" on L2. But for compaction, these 2 files are overlapped. `sstableKeyCompare()` also take range delete into consideration which may cut file for the same key. It also makes the `max_compaction_bytes` calculation more accurate for cases like above, the overlapped bytes was under estimated. Also make sure the 2 keys won't be splitted to 2 files because of reaching `max_compaction_bytes`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10763 Reviewed By: cbi42 Differential Revision: D39971904 Pulled By: cbi42 fbshipit-source-id: bcc309e9c3dc61a8f50667a6f633e6132c0154a8 --- db/compaction/compaction_job_test.cc | 156 ++++++++++++++++++++++++++- db/compaction/compaction_outputs.cc | 92 +++++++++++----- db/compaction/compaction_outputs.h | 5 + db/db_range_del_test.cc | 51 +++++++-- 4 files changed, 267 insertions(+), 37 deletions(-) diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 3241b0ceb..57580445f 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -1748,8 +1748,8 @@ class CompactionJobDynamicFileSizeTest }; TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { - // dynamic_file_size option should has no impact on cutting for max compaction - // bytes. + // dynamic_file_size option should have no impact on cutting for max + // compaction bytes. bool enable_dyanmic_file_size = GetParam(); cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; @@ -1800,12 +1800,21 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { {KeyStr("o", 2U, kTypeValue), "val"}}); AddMockFile(file5, 2); + // The expected output should be: + // L1: [c, h, j] [n] + // L2: [b ... e] [h ... k] [l ... o] + // It's better to have "j" in the first file, because anyway it's overlapping + // with the second file on L2. + // (Note: before this PR, it was cut at "h" because it's using the internal + // comparator which think L1 "h" with seqno 3 is smaller than L2 "h" with + // seqno 1, but actually they're overlapped with the compaction picker). + auto expected_file1 = mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"}, - {KeyStr("h", 3U, kTypeValue), "val"}}); + {KeyStr("h", 3U, kTypeValue), "val"}, + {KeyStr("j", 4U, kTypeValue), "val"}}); auto expected_file2 = - mock::MakeMockFile({{KeyStr("j", 4U, kTypeValue), "val"}, - {KeyStr("n", 6U, kTypeValue), "val3"}}); + mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}}); SetLastSequence(6U); @@ -1974,6 +1983,143 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) { } } +TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + NewDB(); + + // MockTable has 1 byte per entry by default and each file is 10 bytes. + // When the file size is smaller than 100, it won't cut file earlier to align + // with its grandparent boundary. + const size_t kKeyValueSize = 10000; + mock_table_factory_->SetKeyValueSize(kKeyValueSize); + + mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize; + + mock::KVVector file1; + for (int i = 0; i < 7; i++) { + file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), + "val" + std::to_string(100 - i)); + } + file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb"); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"}, + {KeyStr("b", 90U, kTypeValue), "valb"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"}, + {KeyStr("a", 88U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"}, + {KeyStr("a", 86U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"}, + {KeyStr("b", 84U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + mock::KVVector expected_file1; + mock::KVVector expected_file_disable_dynamic_file_size; + + for (int i = 0; i < 8; i++) { + expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), + "val" + std::to_string(100 - i)); + expected_file_disable_dynamic_file_size.emplace_back( + KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i)); + } + + // make sure `b` is cut in a separated file (so internally it's not using + // internal comparator, which will think the "b:90" (seqno 90) here is smaller + // than "b:85" on L2.) + auto expected_file2 = + mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}}); + + expected_file_disable_dynamic_file_size.emplace_back( + KeyStr("b", 90U, kTypeValue), "valb"); + + SetLastSequence(122U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + // Just keep all the history + std::vector snapshots; + for (int i = 80; i <= 100; i++) { + snapshots.emplace_back(i); + } + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}, snapshots); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size}, snapshots); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) { + // dynamic_file_size option should have no impact on cutting for max + // compaction bytes. + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + mutable_cf_options_.target_file_size_base = 80; + mutable_cf_options_.max_compaction_bytes = 20; + + auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"}, + {KeyStr("b", 103U, kTypeValue), "val"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"}, + {KeyStr("c", 101U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + for (int i = 0; i < 10; i++) { + auto file = + mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"}, + {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}}); + AddMockFile(file, 2); + } + + for (int i = 0; i < 10; i++) { + auto file = + mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"}, + {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}}); + AddMockFile(file, 2); + } + + auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"}, + {KeyStr("c", 59U, kTypeValue), "valc"}}); + + // "a" has 10 overlapped grandparent files (each size 10), which is far + // exceeded the `max_compaction_bytes`, but make sure 2 "a" are not separated, + // as splitting them won't help reducing the compaction size. + // also make sure "b" and "c" are cut separately. + mock::KVVector expected_file1 = + mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"}, + {KeyStr("a", 102U, kTypeValue), "val2"}}); + mock::KVVector expected_file2 = + mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}}); + mock::KVVector expected_file3 = + mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}}); + + SetLastSequence(122U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + // Just keep all the history + std::vector snapshots; + for (int i = 80; i <= 105; i++) { + snapshots.emplace_back(i); + } + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2, expected_file3}, snapshots); +} + INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest, CompactionJobDynamicFileSizeTest, testing::Bool()); diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index bb077015c..e74378e2a 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -84,25 +84,20 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( if (grandparents.empty()) { return curr_key_boundary_switched_num; } + assert(!internal_key.empty()); + InternalKey ikey; + ikey.DecodeFrom(internal_key); + assert(ikey.Valid()); - // TODO: here it uses the internal comparator but Compaction picker uses user - // comparator to get a clean cut with `GetOverlappingInputs()`, which means - // this function may cut files still be overlapped in compaction, for example - // current function can generate L1 files like: - // L1: [2-21] [22-30] - // L2: [1-10] [21-30] - // Because L1 `21` has higher seq_number, which is smaller than `21` on L2, - // it cuts in the first file. But for compaction picker L1 [2-21] file - // overlaps with both files on L2. Ideally it should cut to - // L1: [2-20] [21-30] - const InternalKeyComparator* icmp = - &compaction_->column_family_data()->internal_comparator(); + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + // Move the grandparent_index_ to the file containing the current user_key. + // If there are multiple files containing the same user_key, make sure the + // index points to the last file containing the key. while (grandparent_index_ < grandparents.size()) { if (being_grandparent_gap_) { - if (icmp->Compare(internal_key, - grandparents[grandparent_index_]->smallest.Encode()) < - 0) { + if (sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_]->smallest) < 0) { break; } if (seen_key_) { @@ -113,9 +108,16 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( } being_grandparent_gap_ = false; } else { - if (icmp->Compare(internal_key, - grandparents[grandparent_index_]->largest.Encode()) <= - 0) { + int cmp_result = sstableKeyCompare( + ucmp, ikey, grandparents[grandparent_index_]->largest); + // If it's same key, make sure grandparent_index_ is pointing to the last + // one. + if (cmp_result < 0 || + (cmp_result == 0 && + (grandparent_index_ == grandparents.size() - 1 || + sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_ + 1]->smallest) < + 0))) { break; } if (seen_key_) { @@ -132,13 +134,55 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( if (!seen_key_ && !being_grandparent_gap_) { assert(grandparent_overlapped_bytes_ == 0); grandparent_overlapped_bytes_ = - grandparents[grandparent_index_]->fd.GetFileSize(); + GetCurrentKeyGrandparentOverlappedBytes(internal_key); } seen_key_ = true; return curr_key_boundary_switched_num; } +uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes( + const Slice& internal_key) const { + // no overlap with any grandparent file + if (being_grandparent_gap_) { + return 0; + } + uint64_t overlapped_bytes = 0; + + const std::vector& grandparents = compaction_->grandparents(); + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + InternalKey ikey; + ikey.DecodeFrom(internal_key); +#ifndef NDEBUG + // make sure the grandparent_index_ is pointing to the last files containing + // the current key. + int cmp_result = + sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest); + assert( + cmp_result < 0 || + (cmp_result == 0 && + (grandparent_index_ == grandparents.size() - 1 || + sstableKeyCompare( + ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0))); + assert(sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_]->smallest) >= 0); +#endif + overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize(); + + // go backwards to find all overlapped files, one key can overlap multiple + // files. In the following example, if the current output key is `c`, and one + // compaction file was cut before `c`, current `c` can overlap with 3 files: + // [a b] [c... + // [b, b] [c, c] [c, c] [c, d] + for (int64_t i = static_cast(grandparent_index_) - 1; + i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0; + i--) { + overlapped_bytes += grandparents[i]->fd.GetFileSize(); + } + + return overlapped_bytes; +} + bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { assert(c_iter.Valid()); @@ -238,7 +282,7 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { if (compaction_->immutable_options()->compaction_style == kCompactionStyleLevel && compaction_->immutable_options()->level_compaction_dynamic_file_size && - current_output_file_size_ > + current_output_file_size_ >= ((compaction_->target_output_file_size() + 99) / 100) * (50 + std::min(grandparent_boundary_switched_num_ * 5, size_t{40}))) { @@ -297,13 +341,9 @@ Status CompactionOutputs::AddToOutput( return s; } // reset grandparent information - const std::vector& grandparents = - compaction_->grandparents(); - grandparent_overlapped_bytes_ = - being_grandparent_gap_ - ? 0 - : grandparents[grandparent_index_]->fd.GetFileSize(); grandparent_boundary_switched_num_ = 0; + grandparent_overlapped_bytes_ = + GetCurrentKeyGrandparentOverlappedBytes(key); } // Open output file if necessary diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 081a87d48..f40aa8215 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -227,6 +227,11 @@ class CompactionOutputs { // It returns how many boundaries it crosses by including current key. size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key); + // helper function to get the overlapped grandparent files size, it's only + // used for calculating the first key's overlap. + uint64_t GetCurrentKeyGrandparentOverlappedBytes( + const Slice& internal_key) const; + // Add current key from compaction_iterator to the output file. If needed // close and open new compaction output with the functions provided. Status AddToOutput(const CompactionIterator& c_iter, diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 3f982003c..8c3884863 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -1726,17 +1726,16 @@ TEST_F(DBRangeDelTest, OverlappedKeys) { ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(1, NumTableFilesAtLevel(0)); - // The key range is broken up into 4 SSTs to avoid a future big compaction + // The key range is broken up into three SSTs to avoid a future big compaction // with the grandparent ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, true /* disallow_trivial_move */)); - ASSERT_EQ(4, NumTableFilesAtLevel(1)); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, true /* disallow_trivial_move */)); - // L1->L2 compaction outputs to 2 files because there are 2 separated - // compactions: [0-4] and [5-9] - ASSERT_EQ(2, NumTableFilesAtLevel(2)); + // L1->L2 compaction size is limited to max_compaction_bytes + ASSERT_EQ(3, NumTableFilesAtLevel(2)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); } @@ -1985,6 +1984,40 @@ TEST_F(DBRangeDelTest, TombstoneFromCurrentLevel) { delete iter; } +class TombstoneTestSstPartitioner : public SstPartitioner { + public: + const char* Name() const override { return "SingleKeySstPartitioner"; } + + PartitionerResult ShouldPartition( + const PartitionerRequest& request) override { + if (cmp->Compare(*request.current_user_key, DBTestBase::Key(5)) == 0) { + return kRequired; + } else { + return kNotRequired; + } + } + + bool CanDoTrivialMove(const Slice& /*smallest_user_key*/, + const Slice& /*largest_user_key*/) override { + return false; + } + + const Comparator* cmp = BytewiseComparator(); +}; + +class TombstoneTestSstPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { + return "TombstoneTestSstPartitionerFactory"; + } + const char* Name() const override { return kClassName(); } + + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return std::unique_ptr(new TombstoneTestSstPartitioner()); + } +}; + TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) { // Verify that a range tombstone across file boundary covers keys from older // levels. Test set up: @@ -1998,11 +2031,17 @@ TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) { options.target_file_size_base = 2 * 1024; options.max_compaction_bytes = 2 * 1024; + // Make sure L1 files are split before "5" + auto factory = std::make_shared(); + options.sst_partitioner_factory = factory; + DestroyAndReopen(options); Random rnd(301); // L2 - ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 10))); + // the file should be smaller than max_compaction_bytes, otherwise the file + // will be cut before 7. + ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 9))); ASSERT_OK(db_->Flush(FlushOptions())); MoveFilesToLevel(2); ASSERT_EQ(1, NumTableFilesAtLevel(2));