`CompactRange()` always compacts to bottommost level for leveled compaction (#11468)

Summary: currently for leveled compaction, the max output level of a call to `CompactRange()` is pre-computed before compacting each level. This max output level is the max level whose key range overlaps with the manual compaction key range. However, during manual compaction, files in the max output level may be compacted down further by some background compaction. When this background compaction is a trivial move, there is a race condition and the manual compaction may not be able to compact all keys in the specified key range. This PR updates `CompactRange()` to always compact to the bottommost level to make this race condition more unlikely (it can still happen, see more in comment here: 796f58f42a/db/db_impl/db_impl_compaction_flush.cc (L1180C29-L1184)). This PR also changes the behavior of CompactRange() when `bottommost_level_compaction=kIfHaveCompactionFilter` (the default option). The old behavior is that, if a compaction filter is provided, CompactRange() always does an intra-level compaction at the final output level for all files in the manual compaction key range. The only exception when `first_overlapped_level = 0` and `max_overlapped_level = 0`. It’s awkward to maintain the same behavior after this PR since we do not compute max_overlapped_level anymore. So the new behavior is similar to kForceOptimized: always does intra-level compaction at the bottommost level, but not including new files generated during this manual compaction. Several unit tests are updated to work with this new manual compaction behavior. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11468 Test Plan: Add new unit tests `DBCompactionTest.ManualCompactionCompactAllKeysInRange*` Reviewed By: ajkr Differential Revision: D46079619 Pulled By: cbi42 fbshipit-source-id: 19d844ba4ec8dc1a0b8af5d2f36ff15820c6e76f
2 years ago · e95cc1217d
parent 9f7877f246
commit e95cc1217d
9 changed files with 189 additions and 53 deletions
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@ -753,8 +753,10 @@ Compaction* CompactionPicker::CompactRange(

  // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
  // files that are created during the current compaction.
-  if (compact_range_options.bottommost_level_compaction ==
-          BottommostLevelCompaction::kForceOptimized &&
+  if ((compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kForceOptimized ||
+       compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kIfHaveCompactionFilter) &&
      max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
    assert(input_level == output_level);
    // inputs_shrunk holds a continuous subset of input files which were all
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@ -2614,7 +2614,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
  compact_options.target_level = 7;
  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));

-  ASSERT_EQ(trivial_move, 1);
+  ASSERT_GE(trivial_move, 1);
  ASSERT_EQ(non_trivial_move, 0);

  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@ -7966,26 +7966,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
  auto start_idx = key_idx;
  GenerateNewFile(&rnd, &key_idx);
  GenerateNewFile(&rnd, &key_idx);
-  auto end_idx = key_idx - 1;
  ASSERT_EQ("1,1,2", FilesPerLevel(0));

-  // Next two CompactRange() calls are used to test exercise error paths within
-  // RefitLevel() before triggering a valid RefitLevel() call
-
-  // Trigger a refit to L1 first
-  {
-    std::string begin_string = Key(start_idx);
-    std::string end_string = Key(end_idx);
-    Slice begin(begin_string);
-    Slice end(end_string);
-
-    CompactRangeOptions cro;
-    cro.change_level = true;
-    cro.target_level = 1;
-    ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
-  }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));

+  // The next CompactRange() call is used to test exercise error paths within
+  // RefitLevel() before triggering a valid RefitLevel() call
+  //
  // Try a refit from L2->L1 - this should fail and exercise error paths in
  // RefitLevel()
  {
@ -8000,7 +7988,7 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
    cro.target_level = 1;
    ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
  }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));

  // Try a valid Refit request to ensure, the path is still working
  {
@ -9637,6 +9625,134 @@ TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterDBBecomesSmall) {
  ASSERT_GT(after_delete_range_nonempty, final_num_nonempty);
 }

+TEST_F(DBCompactionTest, ManualCompactionCompactAllKeysInRange) {
+  // CompactRange() used to pre-compute target level to compact to
+  // before running compactions. However, the files at target level
+  // could be trivially moved down by some background compaction. This means
+  // some keys in the manual compaction key range may not be compacted
+  // during the manual compaction. This unit test tests this scenario.
+  // A fix has been applied for this scenario to always compact
+  // to the bottommost level.
+  const int kBaseLevelBytes = 8 << 20;  // 8MB
+  const int kMultiplier = 2;
+  Options options = CurrentOptions();
+  options.num_levels = 7;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.compaction_style = kCompactionStyleLevel;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.compression = kNoCompression;
+  options.target_file_size_base = 2 * kBaseLevelBytes;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // Populate L2 so that manual compaction will compact to at least L2.
+  // Otherwise, there is still a possibility of race condition where
+  // the manual compaction thread believes that max non-empty level is L1
+  // while there is some auto compaction that moves some files from L1 to L2.
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000), rnd.RandomString(100)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // one file in L1: [Key(5), Key(6)]
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(5), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(6), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 2)));
+  // We now do manual compaction for key range [Key(1), Key(6)].
+  // First it compacts file [Key(1)] to L1.
+  // L1 will have two files [Key(1)], and [Key(5), Key(6)].
+  // After L0 -> L1 manual compaction, an automatic compaction will trivially
+  // move both files from L1 to L2. Here the dependency makes manual compaction
+  // wait for auto-compaction to pick a compaction before proceeding. Manual
+  // compaction should not stop at L1 and keep compacting L2. With kForce
+  // specified, expected output is that manual compaction compacts to L2 and L2
+  // will contain 2 files: one for Key(1000) and one for Key(1), Key(5) and
+  // Key(6).
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "DBImpl::RunManualCompaction()::1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::string begin_str = Key(1);
+  std::string end_str = Key(6);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+
+  ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+}
+
+TEST_F(DBCompactionTest,
+       ManualCompactionCompactAllKeysInRangeDynamicLevelBytes) {
+  // Similar to the test above (ManualCompactionCompactAllKeysInRange), but with
+  // level_compaction_dynamic_level_bytes = true.
+  const int kBaseLevelBytes = 8 << 20;  // 8MB
+  const int kMultiplier = 2;
+  Options options = CurrentOptions();
+  options.num_levels = 7;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.compression = kNoCompression;
+  options.target_file_size_base = 2 * kBaseLevelBytes;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5),
+                     rnd.RandomString(3 * kBaseLevelBytes / 2)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+  // L6 now has one file with size ~ 3/2 * kBaseLevelBytes.
+  // L5 is the new base level, with target size ~ 3/4 * kBaseLevelBytes.
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(3), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(4), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(5);
+  ASSERT_EQ(1, NumTableFilesAtLevel(5));
+  // L5 now has one file with size ~ 2/3 * kBaseLevelBytes, which is below its
+  // target size.
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(2), rnd.RandomString(kBaseLevelBytes / 3)));
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "DBImpl::RunManualCompaction()::1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  // After compacting the file with [Key(1), Key(2)] to L5,
+  // L5 has size ~ 4/3 * kBaseLevelBytes > its target size.
+  // We let manual compaction wait for an auto-compaction to pick
+  // a compaction before proceeding. The auto-compaction would
+  // trivially move both files in L5 down to L6. If manual compaction
+  // works correctly with kForce specified, it should rewrite the two files in
+  // L6 into a single file.
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  std::string begin_str = Key(1);
+  std::string end_str = Key(4);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+  ASSERT_EQ(2, NumTableFilesAtLevel(6));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@ -1066,7 +1066,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                            std::numeric_limits<uint64_t>::max(), trim_ts);
  } else {
    int first_overlapped_level = kInvalidLevel;
-    int max_overlapped_level = kInvalidLevel;
    {
      SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
      Version* current_version = super_version->current;
@ -1142,10 +1141,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                                                                    begin, end);
        }
        if (overlap) {
-          if (first_overlapped_level == kInvalidLevel) {
-            first_overlapped_level = level;
-          }
-          max_overlapped_level = level;
+          first_overlapped_level = level;
+          break;
        }
      }
      CleanupSuperVersion(super_version);
@ -1159,7 +1156,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
            end, exclusive, true /* disallow_trivial_move */,
            std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
            trim_ts);
-        final_output_level = max_overlapped_level;
+        final_output_level = first_overlapped_level;
      } else {
        assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
        uint64_t next_file_number = versions_->current_next_file_number();
@ -1171,7 +1168,29 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
        int level = first_overlapped_level;
        final_output_level = level;
        int output_level = 0, base_level = 0;
-        while (level < max_overlapped_level || level == 0) {
+        for (;;) {
+          // Always allow L0 -> L1 compaction
+          if (level > 0) {
+            if (cfd->ioptions()->level_compaction_dynamic_level_bytes) {
+              assert(final_output_level < cfd->ioptions()->num_levels);
+              if (final_output_level + 1 == cfd->ioptions()->num_levels) {
+                break;
+              }
+            } else {
+              // TODO(cbi): there is still a race condition here where
+              //  if a background compaction compacts some file beyond
+              //  current()->storage_info()->num_non_empty_levels() right after
+              //  the check here.This should happen very infrequently and should
+              //  not happen once a user populates the last level of the LSM.
+              InstrumentedMutexLock l(&mutex_);
+              // num_non_empty_levels may be lower after a compaction, so
+              // we check for >= here.
+              if (final_output_level + 1 >=
+                  cfd->current()->storage_info()->num_non_empty_levels()) {
+                break;
+              }
+            }
+          }
          output_level = level + 1;
          if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
              level == 0) {
@ -1203,17 +1222,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
        if (s.ok()) {
          assert(final_output_level > 0);
          // bottommost level intra-level compaction
-          // TODO(cbi): this preserves earlier behavior where if
-          //  max_overlapped_level = 0 and bottommost_level_compaction is
-          //  kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
-          //  and do not do intra-LBase compaction even when user configures
-          //  compaction filter. We may want to still do a LBase -> LBase
-          //  compaction in case there is some file in LBase that did not go
-          //  through L0 -> LBase compaction, and hence did not go through
-          //  compaction filter.
          if ((options.bottommost_level_compaction ==
                   BottommostLevelCompaction::kIfHaveCompactionFilter &&
-               max_overlapped_level != 0 &&
               (cfd->ioptions()->compaction_filter != nullptr ||
                cfd->ioptions()->compaction_filter_factory != nullptr)) ||
              options.bottommost_level_compaction ==
@ -1221,10 +1231,11 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
              options.bottommost_level_compaction ==
                  BottommostLevelCompaction::kForce) {
            // Use `next_file_number` as `max_file_num_to_ignore` to avoid
-            // rewriting newly compacted files when it is kForceOptimized.
+            // rewriting newly compacted files when it is kForceOptimized
+            // or kIfHaveCompactionFilter with compaction filter set.
            s = RunManualCompaction(
                cfd, final_output_level, final_output_level, options, begin,
-                end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
+                end, exclusive, true /* disallow_trivial_move */,
                next_file_number /* max_file_num_to_ignore */, trim_ts);
          }
        }
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@ -2063,6 +2063,7 @@ class PinL0IndexAndFilterBlocksTest
    ASSERT_OK(Flush(1));
    // move this table to L1
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1, 1));

    // reset block cache
    table_options.block_cache = NewLRUCache(64 * 1024);
@ -2220,7 +2221,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
  // this should be read from L1
  value = Get(1, "a");
  if (!disallow_preload_) {
-    // In inifinite max files case, there's a cache miss in executing Get()
+    // In infinite max files case, there's a cache miss in executing Get()
    // because index and filter are not prefetched before.
    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
@ -2248,12 +2249,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }

  // Bloom and index hit will happen when a Get() happens.
@ -2262,12 +2263,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }
 }

--- a/db/manual_compaction_test.cc
+++ b/db/manual_compaction_test.cc
@ -286,9 +286,9 @@ TEST_F(ManualCompactionTest, SkipLevel) {
    filter->Reset();
    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
    ASSERT_EQ(4, filter->NumKeys());
-    // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
-    // so finally the logged level for 1 is L1.
-    ASSERT_EQ(1, filter->KeyLevel("1"));
+    // 1 is first compacted from L0 to L1, and then L1 intra level compaction
+    // compacts [2, 4, 8] only.
+    ASSERT_EQ(0, filter->KeyLevel("1"));
    ASSERT_EQ(1, filter->KeyLevel("2"));
    ASSERT_EQ(1, filter->KeyLevel("4"));
    ASSERT_EQ(1, filter->KeyLevel("8"));
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@ -226,7 +226,8 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
    }
    ASSERT_OK(Flush());
  }
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Second to last level
+  MoveFilesToLevel(5);
  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);

--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -1817,15 +1817,18 @@ struct CompactionOptions {
 // For level based compaction, we can configure if we want to skip/force
 // bottommost level compaction.
 enum class BottommostLevelCompaction {
-  // Skip bottommost level compaction
+  // Skip bottommost level compaction.
  kSkip,
-  // Only compact bottommost level if there is a compaction filter
-  // This is the default option
+  // Only compact bottommost level if there is a compaction filter.
+  // This is the default option.
+  // Similar to kForceOptimized, when compacting bottommost level, avoid
+  // double-compacting files
+  // created in the same manual compaction.
  kIfHaveCompactionFilter,
-  // Always compact bottommost level
+  // Always compact bottommost level.
  kForce,
  // Always compact bottommost level but in bottommost level avoid
-  // double-compacting files created in the same compaction
+  // double-compacting files created in the same compaction.
  kForceOptimized,
 };

--- a/unreleased_history/behavior_changes/compact_range_to_bottom.md
+++ b/unreleased_history/behavior_changes/compact_range_to_bottom.md
@ -0,0 +1,2 @@
+For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
+For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)