diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index a8f314a5b..021964e82 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -753,8 +753,10 @@ Compaction* CompactionPicker::CompactRange(
 
   // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
   // files that are created during the current compaction.
-  if (compact_range_options.bottommost_level_compaction ==
-          BottommostLevelCompaction::kForceOptimized &&
+  if ((compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kForceOptimized ||
+       compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kIfHaveCompactionFilter) &&
       max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
     assert(input_level == output_level);
     // inputs_shrunk holds a continuous subset of input files which were all
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 792118609..69face563 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -2614,7 +2614,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
   compact_options.target_level = 7;
   ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
 
-  ASSERT_EQ(trivial_move, 1);
+  ASSERT_GE(trivial_move, 1);
   ASSERT_EQ(non_trivial_move, 0);
 
   prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 3b1650412..1e9721949 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -7966,26 +7966,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
   auto start_idx = key_idx;
   GenerateNewFile(&rnd, &key_idx);
   GenerateNewFile(&rnd, &key_idx);
-  auto end_idx = key_idx - 1;
   ASSERT_EQ("1,1,2", FilesPerLevel(0));
 
-  // Next two CompactRange() calls are used to test exercise error paths within
-  // RefitLevel() before triggering a valid RefitLevel() call
-
-  // Trigger a refit to L1 first
-  {
-    std::string begin_string = Key(start_idx);
-    std::string end_string = Key(end_idx);
-    Slice begin(begin_string);
-    Slice end(end_string);
-
-    CompactRangeOptions cro;
-    cro.change_level = true;
-    cro.target_level = 1;
-    ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
-  }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));
+  // The next CompactRange() call is used to exercise error paths within
+  // RefitLevel() before triggering a valid RefitLevel() call
+  //
   // Try a refit from L2->L1 - this should fail and exercise error paths in
   // RefitLevel()
   {
@@ -8000,7 +7988,7 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
     cro.target_level = 1;
     ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
   }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));
 
   // Try a valid Refit request to ensure, the path is still working
   {
@@ -9637,6 +9625,134 @@ TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterDBBecomesSmall) {
   ASSERT_GT(after_delete_range_nonempty, final_num_nonempty);
 }
 
+TEST_F(DBCompactionTest, ManualCompactionCompactAllKeysInRange) {
+  // CompactRange() used to pre-compute the target level to compact to
+  // before running compactions. However, the files at the target level
+  // could be trivially moved down by some background compaction. This means
+  // some keys in the manual compaction key range may not be compacted
+  // during the manual compaction. This unit test covers this scenario.
+  // A fix has been applied for this scenario to always compact
+  // to the bottommost level.
+  const int kBaseLevelBytes = 8 << 20;  // 8MB
+  const int kMultiplier = 2;
+  Options options = CurrentOptions();
+  options.num_levels = 7;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.compaction_style = kCompactionStyleLevel;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.compression = kNoCompression;
+  options.target_file_size_base = 2 * kBaseLevelBytes;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // Populate L2 so that manual compaction will compact to at least L2.
+  // Otherwise, there is still a possibility of a race condition where
+  // the manual compaction thread believes the max non-empty level is L1
+  // while some auto compaction moves files from L1 to L2 concurrently.
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000), rnd.RandomString(100)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // one file in L1: [Key(5), Key(6)]
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(5), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(6), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 2)));
+  // We now do a manual compaction for the key range [Key(1), Key(6)].
+  // First it compacts the file [Key(1)] to L1.
+  // L1 will then have two files: [Key(1)] and [Key(5), Key(6)].
+  // After the L0 -> L1 manual compaction, an automatic compaction will
+  // trivially move both files from L1 to L2. The sync point dependency below
+  // makes the manual compaction wait for that auto-compaction to be picked
+  // before proceeding. The manual compaction should not stop at L1 but keep
+  // compacting into L2. With kForce specified, the expected outcome is that
+  // the manual compaction compacts down to L2, and L2 will contain 2 files:
+  // one for Key(1000) and one for Key(1), Key(5) and Key(6).
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "DBImpl::RunManualCompaction()::1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::string begin_str = Key(1);
+  std::string end_str = Key(6);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+
+  ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+}
+
+TEST_F(DBCompactionTest,
+       ManualCompactionCompactAllKeysInRangeDynamicLevelBytes) {
+  // Similar to the test above (ManualCompactionCompactAllKeysInRange), but
+  // with level_compaction_dynamic_level_bytes = true.
+  const int kBaseLevelBytes = 8 << 20;  // 8MB
+  const int kMultiplier = 2;
+  Options options = CurrentOptions();
+  options.num_levels = 7;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.compression = kNoCompression;
+  options.target_file_size_base = 2 * kBaseLevelBytes;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5),
+                     rnd.RandomString(3 * kBaseLevelBytes / 2)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+  // L6 now has one file with size ~ 3/2 * kBaseLevelBytes.
+  // L5 is the new base level, with target size ~ 3/4 * kBaseLevelBytes.
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(3), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(4), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(5);
+  ASSERT_EQ(1, NumTableFilesAtLevel(5));
+  // L5 now has one file with size ~ 2/3 * kBaseLevelBytes, which is below its
+  // target size.
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(2), rnd.RandomString(kBaseLevelBytes / 3)));
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "DBImpl::RunManualCompaction()::1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  // After compacting the file with [Key(1), Key(2)] to L5,
+  // L5 has size ~ 4/3 * kBaseLevelBytes > its target size.
+  // We let the manual compaction wait until an auto-compaction is
+  // picked before proceeding. The auto-compaction would
+  // trivially move both files in L5 down to L6. If manual compaction
+  // works correctly with kForce specified, it should rewrite the two files in
+  // L6 into a single file.
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  std::string begin_str = Key(1);
+  std::string end_str = Key(4);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+  ASSERT_EQ(2, NumTableFilesAtLevel(6));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index c40dc1df9..4e313af81 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1066,7 +1066,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
         std::numeric_limits<uint64_t>::max(), trim_ts);
   } else {
     int first_overlapped_level = kInvalidLevel;
-    int max_overlapped_level = kInvalidLevel;
     {
       SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
       Version* current_version = super_version->current;
@@ -1142,10 +1141,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                                                      begin, end);
         }
         if (overlap) {
-          if (first_overlapped_level == kInvalidLevel) {
-            first_overlapped_level = level;
-          }
-          max_overlapped_level = level;
+          first_overlapped_level = level;
+          break;
         }
       }
       CleanupSuperVersion(super_version);
@@ -1159,7 +1156,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
           end, exclusive, true /* disallow_trivial_move */,
          std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
           trim_ts);
-      final_output_level = max_overlapped_level;
+      final_output_level = first_overlapped_level;
     } else {
       assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
       uint64_t next_file_number = versions_->current_next_file_number();
@@ -1171,7 +1168,29 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
       int level = first_overlapped_level;
       final_output_level = level;
       int output_level = 0, base_level = 0;
-      while (level < max_overlapped_level || level == 0) {
+      for (;;) {
+        // Always allow L0 -> L1 compaction
+        if (level > 0) {
+          if (cfd->ioptions()->level_compaction_dynamic_level_bytes) {
+            assert(final_output_level < cfd->ioptions()->num_levels);
+            if (final_output_level + 1 == cfd->ioptions()->num_levels) {
+              break;
+            }
+          } else {
+            // TODO(cbi): there is still a race condition here where
+            // a background compaction may compact some file beyond
+            // current()->storage_info()->num_non_empty_levels() right after
+            // the check here. This should happen very infrequently and should
+            // not happen once a user populates the last level of the LSM.
+            InstrumentedMutexLock l(&mutex_);
+            // num_non_empty_levels may be lower after a compaction, so
+            // we check for >= here.
+            if (final_output_level + 1 >=
+                cfd->current()->storage_info()->num_non_empty_levels()) {
+              break;
+            }
+          }
+        }
         output_level = level + 1;
         if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
             level == 0) {
@@ -1203,17 +1222,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
         if (s.ok()) {
          assert(final_output_level > 0);
           // bottommost level intra-level compaction
-          // TODO(cbi): this preserves earlier behavior where if
-          // max_overlapped_level = 0 and bottommost_level_compaction is
-          // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
-          // and do not do intra-LBase compaction even when user configures
-          // compaction filter. We may want to still do a LBase -> LBase
-          // compaction in case there is some file in LBase that did not go
-          // through L0 -> LBase compaction, and hence did not go through
-          // compaction filter.
           if ((options.bottommost_level_compaction ==
                    BottommostLevelCompaction::kIfHaveCompactionFilter &&
-               max_overlapped_level != 0 &&
                (cfd->ioptions()->compaction_filter != nullptr ||
                 cfd->ioptions()->compaction_filter_factory != nullptr)) ||
               options.bottommost_level_compaction ==
@@ -1221,10 +1231,11 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
               options.bottommost_level_compaction ==
                   BottommostLevelCompaction::kForce) {
             // Use `next_file_number` as `max_file_num_to_ignore` to avoid
-            // rewriting newly compacted files when it is kForceOptimized.
+            // rewriting newly compacted files when it is kForceOptimized
+            // or kIfHaveCompactionFilter with compaction filter set.
             s = RunManualCompaction(
                 cfd, final_output_level, final_output_level, options, begin,
-                end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
+                end, exclusive, true /* disallow_trivial_move */,
                 next_file_number /* max_file_num_to_ignore */, trim_ts);
           }
         }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 91ded2890..52d64a900 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -2063,6 +2063,7 @@ class PinL0IndexAndFilterBlocksTest
     ASSERT_OK(Flush(1));
     // move this table to L1
     ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1, 1));
 
     // reset block cache
     table_options.block_cache = NewLRUCache(64 * 1024);
@@ -2220,7 +2221,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
   // this should be read from L1
   value = Get(1, "a");
   if (!disallow_preload_) {
-    // In inifinite max files case, there's a cache miss in executing Get()
+    // In infinite max files case, there's a cache miss in executing Get()
     // because index and filter are not prefetched before.
     ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
@@ -2248,12 +2249,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   } else {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   }
 
   // Bloom and index hit will happen when a Get() happens.
@@ -2262,12 +2263,12 @@
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   } else {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   }
 }
diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc
index b92cb794b..2214ece0d 100644
--- a/db/manual_compaction_test.cc
+++ b/db/manual_compaction_test.cc
@@ -286,9 +286,9 @@ TEST_F(ManualCompactionTest, SkipLevel) {
   filter->Reset();
   ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
   ASSERT_EQ(4, filter->NumKeys());
-  // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
-  // so finally the logged level for 1 is L1.
-  ASSERT_EQ(1, filter->KeyLevel("1"));
+  // 1 is first compacted from L0 to L1, and then the L1 intra-level
+  // compaction compacts [2, 4, 8] only.
+  ASSERT_EQ(0, filter->KeyLevel("1"));
   ASSERT_EQ(1, filter->KeyLevel("2"));
   ASSERT_EQ(1, filter->KeyLevel("4"));
   ASSERT_EQ(1, filter->KeyLevel("8"));
diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc
index 5914f8607..b18b25512 100644
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@@ -226,7 +226,8 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
     }
     ASSERT_OK(Flush());
   }
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Second to last level
+  MoveFilesToLevel(5);
 
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index ac9d19191..97999fcc1 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1817,15 +1817,18 @@ struct CompactionOptions {
 // For level based compaction, we can configure if we want to skip/force
 // bottommost level compaction.
 enum class BottommostLevelCompaction {
-  // Skip bottommost level compaction
+  // Skip bottommost level compaction.
   kSkip,
-  // Only compact bottommost level if there is a compaction filter
-  // This is the default option
+  // Only compact bottommost level if there is a compaction filter.
+  // This is the default option.
+  // Similar to kForceOptimized, when compacting the bottommost level,
+  // this avoids double-compacting files created in the same manual
+  // compaction.
   kIfHaveCompactionFilter,
-  // Always compact bottommost level
+  // Always compact bottommost level.
   kForce,
   // Always compact bottommost level but in bottommost level avoid
-  // double-compacting files created in the same compaction
+  // double-compacting files created in the same compaction.
   kForceOptimized,
 };
diff --git a/unreleased_history/behavior_changes/compact_range_to_bottom.md b/unreleased_history/behavior_changes/compact_range_to_bottom.md
new file mode 100644
index 000000000..480caed1e
--- /dev/null
+++ b/unreleased_history/behavior_changes/compact_range_to_bottom.md
@@ -0,0 +1,2 @@
+For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
+For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similarly to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)
\ No newline at end of file
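
To make the two behavior changes above concrete, here is a minimal, hypothetical usage sketch (not part of the patch; the DB path and key are made up for illustration, and only public RocksDB APIs are used):

```cpp
// Hypothetical illustration of the CompactRange() behavior changed by this
// patch; the path "/tmp/compact_range_demo" is an assumption for the example.
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleLevel;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/compact_range_demo", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  rocksdb::CompactRangeOptions cro;
  // With this patch, a full-range CompactRange() under Leveled Compaction
  // always tries to compact down to the last non-empty level, even if
  // concurrent auto compactions trivially move files while it runs.
  cro.bottommost_level_compaction =
      rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter;
  // If a compaction filter is configured, the bottommost-level pass now
  // skips files created by this same manual compaction (as kForceOptimized
  // already did), instead of rewriting them a second time.
  s = db->CompactRange(cro, nullptr /* begin */, nullptr /* end */);
  assert(s.ok());

  delete db;
  return 0;
}
```

Note that with `kIfHaveCompactionFilter` the bottommost-level rewrite only runs when a compaction filter (or filter factory) is configured; the sketch merely shows where the option is set.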