diff --git a/HISTORY.md b/HISTORY.md index 45c4b7761..639a9074c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,9 @@ * Fixed an optimistic transaction validation bug caused by DBImpl::GetLatestSequenceForKey() returning non-latest seq for merge (#10724). * Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739). +### Performance Improvements +* Align the compaction output file boundaries with the next-level files, which can reduce compaction load by more than 10% for the default level compaction. The feature is enabled by default; to disable it, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller ones. + ## 7.7.0 (09/18/2022) ### Bug Fixes * Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB. diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 8b1e9aaef..9dffb0899 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -221,7 +221,7 @@ Compaction::Compaction( : input_vstorage_(vstorage), start_level_(_inputs[0].level), output_level_(_output_level), - max_output_file_size_(_target_file_size), + target_output_file_size_(_target_file_size), max_compaction_bytes_(_max_compaction_bytes), max_subcompactions_(_max_subcompactions), immutable_options_(_immutable_options), @@ -268,6 +268,14 @@ Compaction::Compaction( max_subcompactions_ = _mutable_db_options.max_subcompactions; } + // For non-bottommost levels, compaction tries to build files that match the + // target file size, but this is not guaranteed: a file can be up to 2x the target size. + max_output_file_size_ = + bottommost_level_ || grandparents_.empty() || + !_immutable_options.level_compaction_dynamic_file_size + ? target_output_file_size_ + : 2 * target_output_file_size_; + #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { assert(inputs_[i].level > inputs_[i - 1].level); diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index b624035e6..22c30c9c3 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -163,6 +163,9 @@ class Compaction { // Maximum size of files to build during this compaction.
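As a reading aid for the constructor change above, here is a minimal standalone sketch (illustrative names only, not RocksDB code) of how the effective maximum output file size is derived from the target size:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative only: mirrors the ternary in the Compaction constructor above.
// The 2x headroom is granted only when dynamic file size is enabled, the
// output is not the bottommost level, and there are grandparent files to
// align with.
uint64_t EffectiveMaxOutputFileSize(uint64_t target_output_file_size,
                                    bool bottommost_level,
                                    bool has_grandparents,
                                    bool dynamic_file_size_enabled) {
  return (bottommost_level || !has_grandparents || !dynamic_file_size_enabled)
             ? target_output_file_size
             : 2 * target_output_file_size;
}

int main() {
  // With the feature on and grandparents present, a non-bottommost output may
  // grow up to 2x the target before it is force-cut.
  assert(EffectiveMaxOutputFileSize(64 << 20, false, true, true) ==
         2 * (64ull << 20));
  // Bottommost outputs keep the plain target size.
  assert(EffectiveMaxOutputFileSize(64 << 20, true, true, true) ==
         (64ull << 20));
  return 0;
}
```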
uint64_t max_output_file_size() const { return max_output_file_size_; } + // Target output file size for this compaction + uint64_t target_output_file_size() const { return target_output_file_size_; } + // What compression for output CompressionType output_compression() const { return output_compression_; } @@ -412,6 +415,7 @@ class Compaction { const int start_level_; // the lowest level to be compacted const int output_level_; // levels to which output files are stored + uint64_t target_output_file_size_; uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; uint32_t max_subcompactions_; diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 311d29335..3241b0ceb 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -228,6 +228,9 @@ class CompactionJobTestBase : public testing::Test { test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); env_ = base_env; fs_ = env_->GetFileSystem(); + // Set defaults for the tests. + mutable_cf_options_.target_file_size_base = 1024 * 1024; + mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024; } void SetUp() override { @@ -415,8 +418,9 @@ class CompactionJobTestBase : public testing::Test { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); if (table_type_ == TableTypeForTest::kMockTable) { - assert(expected_results.size() == 1); - mock_table_factory_->AssertLatestFile(expected_results[0]); + ASSERT_EQ(compaction_job_stats_.num_output_files, + expected_results.size()); + mock_table_factory_->AssertLatestFiles(expected_results); } else { assert(table_type_ == TableTypeForTest::kBlockBasedTable); } @@ -426,7 +430,8 @@ class CompactionJobTestBase : public testing::Test { ASSERT_EQ(expected_output_file_num, output_files.size()); if (table_type_ == TableTypeForTest::kMockTable) { - assert(output_files.size() == 1); + assert(output_files.size() == + static_cast<size_t>(expected_output_file_num)); const FileMetaData* const output_file = output_files[0]; ASSERT_EQ(output_file->oldest_blob_file_number, expected_oldest_blob_file_numbers[0]); @@ -620,12 +625,22 @@ class CompactionJobTestBase : public testing::Test { num_input_files += level_files.size(); } + std::vector<FileMetaData*> grandparents; + // This should actually be the next non-empty level. + const int kGrandparentsLevel = output_level + 1; + if (kGrandparentsLevel < cf_options_.num_levels) { + grandparents = + cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel); + } + Compaction compaction( cfd->current()->storage_info(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), mutable_db_options_, - compaction_input_files, output_level, 1024 * 1024, 10 * 1024 * 1024, 0, - kNoCompression, cfd->GetLatestMutableCFOptions()->compression_opts, - Temperature::kUnknown, max_subcompactions, {}, true); + compaction_input_files, output_level, + mutable_cf_options_.target_file_size_base, + mutable_cf_options_.max_compaction_bytes, 0, kNoCompression, + cfd->GetLatestMutableCFOptions()->compression_opts, + Temperature::kUnknown, max_subcompactions, grandparents, true); compaction.SetInputVersion(cfd->current()); assert(db_options_.info_log); @@ -1721,6 +1736,246 @@ TEST_F(CompactionJobTest, ResultSerialization) { } } +class CompactionJobDynamicFileSizeTest + : public CompactionJobTestBase, + public ::testing::WithParamInterface<bool> { + public: + CompactionJobDynamicFileSizeTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_dynamic_file_size_test"), + BytewiseComparator(), [](uint64_t /*ts*/) {
return ""; }, + /*test_io_priority=*/false, TableTypeForTest::kMockTable) {} +}; + +TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { + // dynamic_file_size option should has no impact on cutting for max compaction + // bytes. + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + mutable_cf_options_.target_file_size_base = 80; + mutable_cf_options_.max_compaction_bytes = 21; + + auto file1 = mock::MakeMockFile({ + {KeyStr("c", 5U, kTypeValue), "val2"}, + {KeyStr("n", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"}, + {KeyStr("j", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + // Create three L2 files, each size 10. + // max_compaction_bytes 21 means the compaction output in L1 will + // be cut to at least two files. + auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"}, + {KeyStr("c", 1U, kTypeValue), "val"}, + {KeyStr("c1", 1U, kTypeValue), "val"}, + {KeyStr("c2", 1U, kTypeValue), "val"}, + {KeyStr("c3", 1U, kTypeValue), "val"}, + {KeyStr("c4", 1U, kTypeValue), "val"}, + {KeyStr("d", 1U, kTypeValue), "val"}, + {KeyStr("e", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("i", 1U, kTypeValue), "val"}, + {KeyStr("i1", 1U, kTypeValue), "val"}, + {KeyStr("i2", 1U, kTypeValue), "val"}, + {KeyStr("i3", 1U, kTypeValue), "val"}, + {KeyStr("i4", 1U, kTypeValue), "val"}, + {KeyStr("j", 1U, kTypeValue), "val"}, + {KeyStr("k", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"}, + {KeyStr("m", 1U, kTypeValue), "val"}, + {KeyStr("m1", 1U, kTypeValue), "val"}, + {KeyStr("m2", 1U, kTypeValue), "val"}, + {KeyStr("m3", 1U, kTypeValue), "val"}, + {KeyStr("m4", 1U, kTypeValue), "val"}, + {KeyStr("n", 1U, kTypeValue), "val"}, + {KeyStr("o", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto expected_file1 = + mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"}, + {KeyStr("h", 3U, kTypeValue), "val"}}); + auto expected_file2 = + mock::MakeMockFile({{KeyStr("j", 4U, kTypeValue), "val"}, + {KeyStr("n", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + // Make sure the grandparent level file size (10) qualifies skipping. + // Currently, it has to be > 1/8 of target file size. 
+ mutable_cf_options_.target_file_size_base = 70; + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("z", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"}, + {KeyStr("x", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"}, + {KeyStr("d", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("i", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"}, + {KeyStr("y", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto expected_file1 = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("c", 3U, kTypeValue), "val"}}); + auto expected_file2 = + mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"}, + {KeyStr("z", 6U, kTypeValue), "val3"}}); + + auto expected_file_disable_dynamic_file_size = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("c", 3U, kTypeValue), "val"}, + {KeyStr("x", 4U, kTypeValue), "val"}, + {KeyStr("z", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size}); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + NewDB(); + + // MockTable has 1 byte per entry by default and each file is 10 bytes. + // When the file size is smaller than 100, it won't cut file earlier to align + // with its grandparent boundary. 
+ const size_t kKeyValueSize = 10000; + mock_table_factory_->SetKeyValueSize(kKeyValueSize); + + mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize; + + mock::KVVector file1; + char ch = 'd'; + // Add value from d -> o + for (char i = 0; i < 12; i++) { + file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"}, + {KeyStr("s", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + // the 1st grandparent file should be skipped + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"}, + {KeyStr("e", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("j", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"}, + {KeyStr("n", 2U, kTypeValue), "val"}}); + AddMockFile(file6, 2); + + auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"}, + {KeyStr("t", 2U, kTypeValue), "val"}}); + AddMockFile(file7, 2); + + // The expected outputs are: + // L1: [d,e,f,g,h,i,j] [k,l,m,n,o,s] + // L2: [a, b] [c, e] [h, j] [k, n] [q, t] + // The first output cut earlier at "j", so it could be aligned with L2 files. + // If dynamic_file_size is not enabled, it will be cut based on the + // target_file_size + mock::KVVector expected_file1; + for (char i = 0; i < 7; i++) { + expected_file1.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + mock::KVVector expected_file2; + for (char i = 7; i < 12; i++) { + expected_file2.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val"); + + mock::KVVector expected_file_disable_dynamic_file_size1; + for (char i = 0; i < 10; i++) { + expected_file_disable_dynamic_file_size1.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + mock::KVVector expected_file_disable_dynamic_file_size2; + for (char i = 10; i < 12; i++) { + expected_file_disable_dynamic_file_size2.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + expected_file_disable_dynamic_file_size2.emplace_back( + KeyStr("s", 4U, kTypeValue), "val"); + + SetLastSequence(22U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size1, + expected_file_disable_dynamic_file_size2}); + } +} + +INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest, + CompactionJobDynamicFileSizeTest, testing::Bool()); class CompactionJobTimestampTest : public CompactionJobTestBase { public: diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index 677b47530..dfdaa20bf 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -76,14 +76,87 @@ IOStatus 
CompactionOutputs::WriterSyncClose(const Status& input_status, return io_s; } +size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( + const Slice& internal_key) { + size_t curr_key_boundary_switched_num = 0; + const std::vector<FileMetaData*>& grandparents = compaction_->grandparents(); + + if (grandparents.empty()) { + return curr_key_boundary_switched_num; + } + + // TODO: this uses the internal comparator, but the compaction picker uses + // the user comparator to get a clean cut with `GetOverlappingInputs()`, + // which means this function may cut files that still overlap in compaction. + // For example, it can generate L1 files like: + // L1: [2-21] [22-30] + // L2: [1-10] [21-30] + // Because the L1 `21` has a higher seq_number, it sorts before the `21` on + // L2, so the cut happens in the first file. But for the compaction picker, + // the L1 [2-21] file overlaps with both files on L2. Ideally it should cut + // to L1: [2-20] [21-30] + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + + while (grandparent_index_ < grandparents.size()) { + if (being_grandparent_gap_) { + if (icmp->Compare(internal_key, + grandparents[grandparent_index_]->smallest.Encode()) < + 0) { + break; + } + if (seen_key_) { + curr_key_boundary_switched_num++; + grandparent_overlapped_bytes_ += + grandparents[grandparent_index_]->fd.GetFileSize(); + grandparent_boundary_switched_num_++; + } + being_grandparent_gap_ = false; + } else { + if (icmp->Compare(internal_key, + grandparents[grandparent_index_]->largest.Encode()) <= + 0) { + break; + } + if (seen_key_) { + curr_key_boundary_switched_num++; + grandparent_boundary_switched_num_++; + } + being_grandparent_gap_ = true; + grandparent_index_++; + } + } + + // If the first key is in the middle of a grandparent file, add that file + // to the overlap. + if (!seen_key_ && !being_grandparent_gap_) { + assert(grandparent_overlapped_bytes_ == 0); + grandparent_overlapped_bytes_ = + grandparents[grandparent_index_]->fd.GetFileSize(); + } + + seen_key_ = true; + return curr_key_boundary_switched_num; +} + bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { assert(c_iter.Valid()); + // Always update grandparent information, such as the number of overlapped + // files, overlapped size, etc.
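To make the state machine above easier to follow, here is a simplified, self-contained sketch that walks integer keys across grandparent ranges in the same gap/inside-a-file fashion. All names and types below are illustrative stand-ins, not the real RocksDB classes:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct FakeGrandparent {
  int smallest;
  int largest;
  uint64_t file_size;
};

// Simplified model of UpdateGrandparentBoundaryInfo(): advance through the
// sorted grandparent files, toggling between "in a gap" and "inside a file",
// and count how many boundaries the given key crosses.
struct BoundaryTracker {
  size_t index = 0;
  bool in_gap = true;
  bool seen_key = false;
  uint64_t overlapped_bytes = 0;
  size_t boundary_switches = 0;

  size_t Update(int key, const std::vector<FakeGrandparent>& gp) {
    size_t crossed = 0;
    while (index < gp.size()) {
      if (in_gap) {
        if (key < gp[index].smallest) break;
        if (seen_key) {
          crossed++;
          overlapped_bytes += gp[index].file_size;
          boundary_switches++;
        }
        in_gap = false;
      } else {
        if (key <= gp[index].largest) break;
        if (seen_key) {
          crossed++;
          boundary_switches++;
        }
        in_gap = true;
        index++;
      }
    }
    // The first key landing inside a grandparent file counts that file's size.
    if (!seen_key && !in_gap) {
      overlapped_bytes = gp[index].file_size;
    }
    seen_key = true;
    return crossed;
  }
};

int main() {
  // Grandparents: [10, 20] and [30, 40], 100 bytes each.
  std::vector<FakeGrandparent> gp = {{10, 20, 100}, {30, 40, 100}};
  BoundaryTracker t;
  assert(t.Update(5, gp) == 0);   // before the first file, still in a gap
  assert(t.Update(15, gp) == 1);  // crossed into the first file
  assert(t.Update(35, gp) == 2);  // left file 1, entered file 2
  assert(t.overlapped_bytes == 200);
  return 0;
}
```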
+ const Slice& internal_key = c_iter.key(); + const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_; + size_t num_grandparent_boundaries_crossed = + UpdateGrandparentBoundaryInfo(internal_key); + + if (!HasBuilder()) { + return false; + } + // If there's user defined partitioner, check that first - if (HasBuilder() && partitioner_ && - partitioner_->ShouldPartition( - PartitionerRequest(last_key_for_partitioner_, c_iter.user_key(), - current_output_file_size_)) == kRequired) { + if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest( + last_key_for_partitioner_, c_iter.user_key(), + current_output_file_size_)) == kRequired) { return true; } @@ -92,12 +165,11 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { return false; } - // reach the target file size + // reach the max file size if (current_output_file_size_ >= compaction_->max_output_file_size()) { return true; } - const Slice& internal_key = c_iter.key(); const InternalKeyComparator* icmp = &compaction_->column_family_data()->internal_comparator(); @@ -111,32 +183,67 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { } } - // Update grandparent information - const std::vector<FileMetaData*>& grandparents = compaction_->grandparents(); - bool grandparant_file_switched = false; - // Scan to find the earliest grandparent file that contains key. - while (grandparent_index_ < grandparents.size() && - icmp->Compare(internal_key, - grandparents[grandparent_index_]->largest.Encode()) > - 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents[grandparent_index_]->fd.GetFileSize(); - grandparant_file_switched = true; + // Only check further if the current key is going to cross a grandparent + // file boundary (either the beginning or the end of a file). + if (num_grandparent_boundaries_crossed > 0) { + // Cut the file before the current key if the size of the current output + // file plus its overlapped grandparent files is bigger than + // max_compaction_bytes, which prevents a future compaction from the + // current output level from exceeding max_compaction_bytes. + if (grandparent_overlapped_bytes_ + current_output_file_size_ > + compaction_->max_compaction_bytes()) { + return true; } - assert(grandparent_index_ + 1 >= grandparents.size() || - icmp->Compare( - grandparents[grandparent_index_]->largest.Encode(), - grandparents[grandparent_index_ + 1]->smallest.Encode()) <= 0); - grandparent_index_++; - } - seen_key_ = true; - if (grandparant_file_switched && - overlapped_bytes_ + current_output_file_size_ > - compaction_->max_compaction_bytes()) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; + // Cut the file if including the key would add a skippable file on the + // grandparent level AND that file is reasonably big (at least 1/8 of the + // target file size). For example, if it's compacting the files L0 + L1: + // L0: [1, 21] + // L1: [3, 23] + // L2: [2, 4] [11, 15] [22, 24] + // Without this break, it will output as: + // L1: [1,3, 21,23] + // With this break, it will output as (assuming [11, 15] at L2 is bigger + // than 1/8 of target size): + // L1: [1,3] [21,23] + // Then future compactions won't include [11,15]. + // For random datasets (either evenly distributed or skewed), this condition + // rarely triggers, but if the user is adding 2 different datasets without + // any overlap, it is likely to happen.
+ // For more details, see PR #1963. + const size_t num_skippable_boundaries_crossed = + being_grandparent_gap_ ? 2 : 3; + if (compaction_->immutable_options()->compaction_style == + kCompactionStyleLevel && + compaction_->immutable_options()->level_compaction_dynamic_file_size && + num_grandparent_boundaries_crossed >= + num_skippable_boundaries_crossed && + grandparent_overlapped_bytes_ - previous_overlapped_bytes > + compaction_->target_output_file_size() / 8) { + return true; + } + + // Pre-cut the output file if it is approaching a certain size AND it is at + // the boundary of a grandparent file. This can reduce the future compaction + // size; the cost is producing smaller files. + // The pre-cut size threshold is based on how many grandparent boundaries + // the output has already seen. If it has seen no boundary at all, it + // pre-cuts at 50% of the target file size. Every boundary seen increases + // the threshold by 5%, capped at 90%, at which point it always cuts. + // The idea is that the more boundaries it has seen, the more likely it is + // to see another boundary (a file-cutting opportunity) before reaching the + // target file size. Tests show this generates larger files than a static + // threshold like 75% while giving a similar write-amplification + // improvement. + if (compaction_->immutable_options()->compaction_style == + kCompactionStyleLevel && + compaction_->immutable_options()->level_compaction_dynamic_file_size && + current_output_file_size_ > + ((compaction_->target_output_file_size() + 99) / 100) * + (50 + std::min(grandparent_boundary_switched_num_ * 5, + size_t{40}))) { + return true; + } } // check ttl file boundaries if there's any @@ -189,6 +296,14 @@ Status CompactionOutputs::AddToOutput( if (!s.ok()) { return s; } + // reset grandparent information + const std::vector<FileMetaData*>& grandparents = + compaction_->grandparents(); + grandparent_overlapped_bytes_ = + being_grandparent_gap_ + ? 0 + : grandparents[grandparent_index_]->fd.GetFileSize(); + grandparent_boundary_switched_num_ = 0; } // Open output file if necessary @@ -199,10 +314,9 @@ Status CompactionOutputs::AddToOutput( } } - Output& curr = current_output(); assert(builder_ != nullptr); const Slice& value = c_iter.value(); - s = curr.validator.Add(key, value); + s = current_output().validator.Add(key, value); if (!s.ok()) { return s; } diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 107044a0d..dbf5f348a 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -216,9 +216,11 @@ class CompactionOutputs { } } - uint64_t GetCurrentOutputFileSize() const { - return current_output_file_size_; - } + // Update tracked grandparent information, such as the grandparent index, + // whether the key is in the gap between 2 grandparent files, the accumulated + // grandparent file size, etc. + // It returns how many boundaries are crossed by including the current key. + size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key); // Add current key from compaction_iterator to the output file. If needed // close and open new compaction output with the functions provided. @@ -311,12 +313,21 @@ class CompactionOutputs { // An index that used to speed up ShouldStopBefore().
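The adaptive pre-cut threshold described in the comment above (50% base, plus 5% per grandparent boundary seen, capped at 90%) can be expressed as a small standalone helper. The function name below is illustrative, not part of the RocksDB API, but the arithmetic mirrors the check in the patch:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Returns the output-file size at which compaction would pre-cut, given how
// many grandparent boundaries the current output has already crossed.
// Mirrors: ((target + 99) / 100) * (50 + min(switches * 5, 40))
uint64_t PreCutSizeThreshold(uint64_t target_output_file_size,
                             size_t boundary_switches) {
  const uint64_t one_percent = (target_output_file_size + 99) / 100;
  const uint64_t pct = 50 + std::min<uint64_t>(boundary_switches * 5, 40);
  return one_percent * pct;
}

int main() {
  const uint64_t target = 100 << 20;  // e.g. a 100MB target file size
  // No boundary seen yet: pre-cut only at 50% of the target.
  assert(PreCutSizeThreshold(target, 0) == target / 100 * 50);
  // 4 boundaries seen: the threshold rises to 70%.
  assert(PreCutSizeThreshold(target, 4) == target / 100 * 70);
  // The threshold is capped at 90%, however many boundaries were seen.
  assert(PreCutSizeThreshold(target, 100) == target / 100 * 90);
  return 0;
}
```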
size_t grandparent_index_ = 0; + // Whether the current output key is in the gap between two grandparent + // files, i.e.: key > grandparents[grandparent_index_ - 1].largest && + // key < grandparents[grandparent_index_].smallest + bool being_grandparent_gap_ = true; + // The number of bytes overlapping between the current output and // grandparent files used in ShouldStopBefore(). - uint64_t overlapped_bytes_ = 0; + uint64_t grandparent_overlapped_bytes_ = 0; // A flag determines whether the key has been seen in ShouldStopBefore() bool seen_key_ = false; + + // For the current output file, the number of grandparent file boundaries + // it has crossed; roughly the number of overlapped files * 2. + size_t grandparent_boundary_switched_num_ = 0; }; // helper struct to concatenate the last level and penultimate level outputs diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 0b6962b64..7b4cc8dfd 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -1029,7 +1029,11 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { options.level0_file_num_compaction_trigger = kNumFilesPerLevel; options.memtable_factory.reset( test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); - options.target_file_size_base = kValueBytes; + // The max file size can be 2x the target file size, so set the target to half. + options.target_file_size_base = kValueBytes / 2; + // Disable dynamic_file_size, as it would cut the L1 output into more files + // (than kNumFilesPerLevel). + options.level_compaction_dynamic_file_size = false; options.max_compaction_bytes = 1500; // i == 0: CompactFiles // i == 1: CompactRange @@ -1100,6 +1104,12 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); options.target_file_size_base = kValueBytes; options.disable_auto_compactions = true; + // Disable it for now; otherwise the L1 files would be cut before data 1: + // L1: [0] [1,4] + // L2: [0,0] + // because the grandparent file lies between [0] and [1] and its size is + // more than 1/8 of the target size (4k).
+ options.level_compaction_dynamic_file_size = false; DestroyAndReopen(options); @@ -1715,17 +1725,17 @@ TEST_F(DBRangeDelTest, OverlappedKeys) { ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(1, NumTableFilesAtLevel(0)); - // The key range is broken up into three SSTs to avoid a future big compaction + // The key range is broken up into 4 SSTs to avoid a future big compaction // with the grandparent ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, true /* disallow_trivial_move */)); - ASSERT_EQ(3, NumTableFilesAtLevel(1)); + ASSERT_EQ(4, NumTableFilesAtLevel(1)); ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, true /* disallow_trivial_move */)); - ASSERT_EQ( - 3, NumTableFilesAtLevel( - 2)); // L1->L2 compaction size is limited to max_compaction_bytes + // L1->L2 compaction outputs 2 files because there are 2 separate + // compactions: [0-4] and [5-9]. + ASSERT_EQ(2, NumTableFilesAtLevel(2)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); } diff --git a/db/db_test2.cc b/db/db_test2.cc index a4aa41a2e..aeac004b8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2379,9 +2379,15 @@ TEST_F(DBTest2, MaxCompactionBytesTest) { } ASSERT_OK(dbfull()->TEST_WaitForCompact()); - // Output files to L1 are cut to three pieces, according to - // options.max_compaction_bytes - ASSERT_EQ("0,3,8", FilesPerLevel(0)); + // Output files to L1 are cut into 4 pieces, according to + // options.max_compaction_bytes (300K). + // There are 8 files on L2 (the grandparent level), each 100K. The first L1 + // output overlaps with grandparent files a and b, staying under 300K; the + // second overlaps with d and e, which is also under 300K. Including any + // extra grandparent file would make the future compaction larger than 300K. + // L1: [ 1 ] [ 2 ] [ 3 ] [ 4 ] + // L2: [a] [b] [c] [d] [e] [f] [g] [h] + ASSERT_EQ("0,4,8", FilesPerLevel(0)); } static void UniqueIdCallback(void* arg) { diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index cfce13d88..b552d66ae 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -648,6 +648,15 @@ struct AdvancedColumnFamilyOptions { // Default: false bool level_compaction_dynamic_level_bytes = false; + // Allows RocksDB to generate files that are not exactly the target_file_size + // (only for non-bottommost files), which can reduce write amplification from + // compaction. The file size can range from 0 to 2x target_file_size. + // Once enabled, non-bottommost compaction tries to cut files so that they + // align with the next level's (grandparent level's) file boundaries. + // + // Default: true + bool level_compaction_dynamic_file_size = true; + // Default: 10.
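Since this is the user-facing switch, a short usage sketch may help. This is a hypothetical snippet, not part of the patch, showing how an application would opt out of the new behavior (the same pattern the tests above use):

```cpp
#include <cassert>

#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  // The feature defaults to on for level compaction.
  assert(options.level_compaction_dynamic_file_size == true);

  // Opting out restores the strict target_file_size cutting behavior,
  // as several tests in this patch do.
  options.level_compaction_dynamic_file_size = false;

  // `options` would then be passed to rocksdb::DB::Open() as usual.
  return 0;
}
```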
// // Dynamically changeable through SetOptions() API diff --git a/options/cf_options.cc b/options/cf_options.cc index 53d705ecd..5ae6a86e4 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -550,6 +550,11 @@ static std::unordered_map level_compaction_dynamic_level_bytes), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"level_compaction_dynamic_file_size", + {offsetof(struct ImmutableCFOptions, + level_compaction_dynamic_file_size), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"optimize_filters_for_hits", {offsetof(struct ImmutableCFOptions, optimize_filters_for_hits), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -892,6 +897,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) bloom_locality(cf_options.bloom_locality), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), + level_compaction_dynamic_file_size( + cf_options.level_compaction_dynamic_file_size), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), diff --git a/options/cf_options.h b/options/cf_options.h index 47de8e7ae..da6b7252a 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -64,6 +64,8 @@ struct ImmutableCFOptions { bool level_compaction_dynamic_level_bytes; + bool level_compaction_dynamic_file_size; + int num_levels; bool optimize_filters_for_hits; diff --git a/options/options_helper.cc b/options/options_helper.cc index efb1d382e..4118f3961 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -301,6 +301,8 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->bloom_locality = ioptions.bloom_locality; cf_opts->level_compaction_dynamic_level_bytes = ioptions.level_compaction_dynamic_level_bytes; + cf_opts->level_compaction_dynamic_file_size = + ioptions.level_compaction_dynamic_file_size; cf_opts->num_levels = ioptions.num_levels; cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; cf_opts->force_consistency_checks = ioptions.force_consistency_checks; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 08e86e7fd..fe430718c 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -509,6 +509,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "experimental_mempurge_threshold=0.0001;" "optimize_filters_for_hits=false;" "level_compaction_dynamic_level_bytes=false;" + "level_compaction_dynamic_file_size=true;" "inplace_update_support=false;" "compaction_style=kCompactionStyleFIFO;" "compaction_pri=kMinOverlappingRatio;" diff --git a/options/options_test.cc b/options/options_test.cc index eb50994f5..e6a824c9b 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -2322,6 +2322,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, {"level_compaction_dynamic_level_bytes", "true"}, + {"level_compaction_dynamic_file_size", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, {"max_compaction_bytes", "21"}, @@ -2458,6 +2459,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, 
true); + ASSERT_EQ(new_cf_opt.level_compaction_dynamic_file_size, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16); diff --git a/table/mock_table.cc b/table/mock_table.cc index 03c594bc7..130889eaa 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -122,8 +122,12 @@ class MockTableBuilder : public TableBuilder { public: MockTableBuilder(uint32_t id, MockTableFileSystem* file_system, MockTableFactory::MockCorruptionMode corrupt_mode = - MockTableFactory::kCorruptNone) - : id_(id), file_system_(file_system), corrupt_mode_(corrupt_mode) { + MockTableFactory::kCorruptNone, + size_t key_value_size = 1) + : id_(id), + file_system_(file_system), + corrupt_mode_(corrupt_mode), + key_value_size_(key_value_size) { table_ = MakeMockFile({}); } @@ -171,7 +175,7 @@ class MockTableBuilder : public TableBuilder { uint64_t NumEntries() const override { return table_.size(); } - uint64_t FileSize() const override { return table_.size(); } + uint64_t FileSize() const override { return table_.size() * key_value_size_; } TableProperties GetTableProperties() const override { return TableProperties(); @@ -191,6 +195,7 @@ class MockTableBuilder : public TableBuilder { MockTableFileSystem* file_system_; int corrupt_mode_; KVVector table_; + size_t key_value_size_; }; InternalIterator* MockTableReader::NewIterator( @@ -260,7 +265,8 @@ TableBuilder* MockTableFactory::NewTableBuilder( Status s = GetAndWriteNextID(file, &id); assert(s.ok()); - return new MockTableBuilder(id, &file_system_, corrupt_mode_); + return new MockTableBuilder(id, &file_system_, corrupt_mode_, + key_value_size_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, @@ -303,22 +309,34 @@ void MockTableFactory::AssertSingleFile(const KVVector& file_contents) { ASSERT_EQ(file_contents, file_system_.files.begin()->second); } -void MockTableFactory::AssertLatestFile(const KVVector& file_contents) { - ASSERT_GE(file_system_.files.size(), 1U); - auto latest = file_system_.files.end(); - --latest; - - if (file_contents != latest->second) { - std::cout << "Wrong content! Content of latest file:" << std::endl; - for (const auto& kv : latest->second) { - ParsedInternalKey ikey; - std::string key, value; - std::tie(key, value) = kv; - ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); - std::cout << ikey.DebugString(true, false) << " -> " << value - << std::endl; +void MockTableFactory::AssertLatestFiles( + const std::vector& files_contents) { + ASSERT_GE(file_system_.files.size(), files_contents.size()); + auto it = file_system_.files.rbegin(); + for (auto expect = files_contents.rbegin(); expect != files_contents.rend(); + expect++, it++) { + ASSERT_TRUE(it != file_system_.files.rend()); + if (*expect != it->second) { + std::cout << "Wrong content! 
Content of file, expect:" << std::endl; + for (const auto& kv : *expect) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; + } + std::cout << "actual:" << std::endl; + for (const auto& kv : it->second) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; + } + FAIL(); } - FAIL(); } } diff --git a/table/mock_table.h b/table/mock_table.h index a1a7742ae..e4850d060 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -72,10 +72,12 @@ class MockTableFactory : public TableFactory { } void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } + + void SetKeyValueSize(size_t size) { key_value_size_ = size; } // This function will assert that only a single file exists and that the // contents are equal to file_contents void AssertSingleFile(const KVVector& file_contents); - void AssertLatestFile(const KVVector& file_contents); + void AssertLatestFiles(const std::vector& files_contents); private: Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const; @@ -84,6 +86,8 @@ class MockTableFactory : public TableFactory { mutable MockTableFileSystem file_system_; mutable std::atomic next_id_; MockCorruptionMode corrupt_mode_; + + size_t key_value_size_ = 1; }; } // namespace mock
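To summarize the mock-table change: the builder now reports a synthetic file size of entries multiplied by the configured `key_value_size_`, so tests can exercise size-based cutting without writing real data. Below is a minimal standalone model of that accounting; the class and names are illustrative stand-ins, not the real MockTableBuilder:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Minimal stand-in for MockTableBuilder's size accounting: every added entry
// contributes a fixed, configurable number of bytes to the reported file size.
class FakeMockBuilder {
 public:
  explicit FakeMockBuilder(size_t key_value_size = 1)
      : key_value_size_(key_value_size) {}

  void Add(std::string key, std::string value) {
    table_.emplace_back(std::move(key), std::move(value));
  }

  uint64_t NumEntries() const { return table_.size(); }
  uint64_t FileSize() const { return table_.size() * key_value_size_; }

 private:
  std::vector<std::pair<std::string, std::string>> table_;
  size_t key_value_size_;
};

int main() {
  // With SetKeyValueSize(10000), a 12-entry mock file reports 120000 bytes,
  // which is what lets tests like CutToAlignGrandparentBoundary hit
  // size-based cutting thresholds.
  FakeMockBuilder builder(/*key_value_size=*/10000);
  for (int i = 0; i < 12; i++) {
    builder.Add("key" + std::to_string(i), "val");
  }
  assert(builder.NumEntries() == 12);
  assert(builder.FileSize() == 12 * 10000);
  return 0;
}
```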