diff --git a/HISTORY.md b/HISTORY.md
index fbd993af4..ebb95663d 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 * For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes` (#11321).
 * User-provided `ReadOptions` take effect for more reads of non-`CacheEntryRole::kDataBlock` blocks.
 * For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). This together with #11321 makes it automatic to migrate other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have unnecessary levels drained which can help to reduce read and space amp.
+* If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB will now try to do a trivial move from L0 to L1 and then do an intra-L1 compaction, instead of an L0 to L1 compaction with trivial move disabled (#11375).
 
 ### Bug Fixes
 * In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size.
diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc
index 0b3f3dedc..596dfefc5 100644
--- a/db/db_compaction_filter_test.cc
+++ b/db/db_compaction_filter_test.cc
@@ -742,7 +742,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
   ASSERT_TRUE(filter->compaction_filter_created());
 }
 
-// Compaction filters aplies to all records, regardless snapshots.
+// Compaction filters apply to all records, regardless of snapshots.
 TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
   std::string five = std::to_string(5);
   Options options = CurrentOptions();
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index b214b201b..fb4c2d0ed 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -136,11 +136,12 @@ class DBCompactionTestWithParam
 
 class DBCompactionTestWithBottommostParam
     : public DBTestBase,
-      public testing::WithParamInterface<BottommostLevelCompaction> {
+      public testing::WithParamInterface<
+          std::tuple<BottommostLevelCompaction, bool>> {
  public:
   DBCompactionTestWithBottommostParam()
       : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
-    bottommost_level_compaction_ = GetParam();
+    bottommost_level_compaction_ = std::get<0>(GetParam());
   }
 
   BottommostLevelCompaction bottommost_level_compaction_;
@@ -7339,10 +7340,63 @@ TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
   Destroy(options_);
 }
 
+TEST_F(DBCompactionTest, SingleLevelUniversal) {
+  // Tests that manual compaction works with single level universal compaction.
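+  // With num_levels = 1 there is no lower level to move files into, so all
+  // L0 files are expected to end up in a single L0 output file.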
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.disable_auto_compactions = true;
+  options.num_levels = 1;
+  DestroyAndReopen(options);
+
+  Random rnd(31);
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(50)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+}
+
+TEST_F(DBCompactionTest, SingleOverlappingNonL0BottommostManualCompaction) {
+  // Tests that manual compact will rewrite bottommost level
+  // when there is only a single non-L0 level that overlaps with
+  // manual compaction range.
+  constexpr int kSstNum = 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  for (auto b : {BottommostLevelCompaction::kForce,
+                 BottommostLevelCompaction::kForceOptimized}) {
+    DestroyAndReopen(options);
+
+    // Generate some sst files on level 0 with sequence keys (no overlap)
+    for (int i = 0; i < kSstNum; i++) {
+      for (int j = 1; j < UCHAR_MAX; j++) {
+        auto key = std::string(kSstNum, '\0');
+        key[kSstNum - i] += static_cast<char>(j);
+        ASSERT_OK(Put(key, std::string(i % 1000, 'A')));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(4);
+    ASSERT_EQ(NumTableFilesAtLevel(4), kSstNum);
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = b;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    ASSERT_EQ(NumTableFilesAtLevel(4), 1);
+  }
+}
+
 TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
   constexpr int kSstNum = 10;
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  const bool dynamic_level = std::get<1>(GetParam());
+  options.level_compaction_dynamic_level_bytes = dynamic_level;
   DestroyAndReopen(options);
 
   // Generate some sst files on level 0 with sequence keys (no overlap)
@@ -7360,25 +7414,42 @@
 
   auto cro = CompactRangeOptions();
   cro.bottommost_level_compaction = bottommost_level_compaction_;
+  bool trivial_moved = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_moved = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // All bottommost_level_compaction options should allow l0 -> l1 trivial
+  // move.
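+  // (With kForce/kForceOptimized the moved files are then compacted within
+  // the bottommost level; the FilesPerLevel() checks below verify this.)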
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(trivial_moved);
   if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
       bottommost_level_compaction_ ==
           BottommostLevelCompaction::kForceOptimized) {
-    // Real compaction to compact all sst files from level 0 to 1 file on level
-    // 1
-    ASSERT_EQ("0,1", FilesPerLevel(0));
+    // bottommost level should go through intra-level compaction
+    // and end up with only 1 file
+    if (dynamic_level) {
+      ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel(0));
+    } else {
+      ASSERT_EQ("0,1", FilesPerLevel(0));
+    }
   } else {
-    // Just trivial move from level 0 -> 1
-    ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+    // Just trivial move from level 0 -> 1/base
+    if (dynamic_level) {
+      ASSERT_EQ("0,0,0,0,0,0," + std::to_string(kSstNum), FilesPerLevel(0));
+    } else {
+      ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+    }
   }
 }
 
 INSTANTIATE_TEST_CASE_P(
     DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
-    ::testing::Values(BottommostLevelCompaction::kSkip,
-                      BottommostLevelCompaction::kIfHaveCompactionFilter,
-                      BottommostLevelCompaction::kForce,
-                      BottommostLevelCompaction::kForceOptimized));
+    ::testing::Combine(
+        ::testing::Values(BottommostLevelCompaction::kSkip,
+                          BottommostLevelCompaction::kIfHaveCompactionFilter,
+                          BottommostLevelCompaction::kForce,
+                          BottommostLevelCompaction::kForceOptimized),
+        ::testing::Bool()));
 
 TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
   Options options = CurrentOptions();
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 226772bdc..28a6a4f31 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -734,13 +734,17 @@ class DBImpl : public DB {
   // max_file_num_to_ignore allows bottom level compaction to filter out newly
   // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
   // disable the filtering
+  // If `final_output_level` is not nullptr, it is set to the manual
+  // compaction's output level when the returned status is OK; when the
+  // returned status is not OK, it may or may not have been set.
   Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
                              int output_level,
                              const CompactRangeOptions& compact_range_options,
                              const Slice* begin, const Slice* end,
                              bool exclusive, bool disallow_trivial_move,
                              uint64_t max_file_num_to_ignore,
-                             const std::string& trim_ts);
+                             const std::string& trim_ts,
+                             int* final_output_level = nullptr);
 
   // Return an internal iterator over the current state of the database.
   // The keys of this iterator are internal keys (see format.h).
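Illustration (not part of this patch): a minimal sketch of the user-facing call whose behavior the HISTORY.md entry above describes. The DB path and option values are arbitrary; with kForce/kForceOptimized, an L0-only key range is now trivially moved to L1 and then compacted within L1 rather than rewritten by an L0 to L1 compaction with trivial move disabled.

#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/compact_range_demo", &db);
  assert(s.ok());

  // Request a full-range manual compaction that forces work on the bottommost
  // level; kForceOptimized avoids rewriting files that were just compacted
  // into that level.
  rocksdb::CompactRangeOptions cro;
  cro.bottommost_level_compaction =
      rocksdb::BottommostLevelCompaction::kForceOptimized;
  s = db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
  assert(s.ok());

  delete db;
  return 0;
}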
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index da43d609d..47ce69aeb 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1054,8 +1054,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
     }
     s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
                             final_output_level, options, begin, end, exclusive,
-                            false, std::numeric_limits<uint64_t>::max(),
-                            trim_ts);
+                            false /* disallow_trivial_move */,
+                            std::numeric_limits<uint64_t>::max(), trim_ts);
   } else {
     int first_overlapped_level = kInvalidLevel;
     int max_overlapped_level = kInvalidLevel;
@@ -1142,74 +1142,83 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
       CleanupSuperVersion(super_version);
     }
     if (s.ok() && first_overlapped_level != kInvalidLevel) {
-      // max_file_num_to_ignore can be used to filter out newly created SST
-      // files, useful for bottom level compaction in a manual compaction
-      uint64_t max_file_num_to_ignore = std::numeric_limits<uint64_t>::max();
-      uint64_t next_file_number = versions_->current_next_file_number();
-      final_output_level = max_overlapped_level;
-      int output_level;
-      for (int level = first_overlapped_level; level <= max_overlapped_level;
-           level++) {
-        bool disallow_trivial_move = false;
-        // in case the compaction is universal or if we're compacting the
-        // bottom-most level, the output level will be the same as input one.
-        // level 0 can never be the bottommost level (i.e. if all files are in
-        // level 0, we will compact to level 1)
-        if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
-            cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
-          output_level = level;
-        } else if (level == max_overlapped_level && level > 0) {
-          if (options.bottommost_level_compaction ==
-              BottommostLevelCompaction::kSkip) {
-            // Skip bottommost level compaction
-            continue;
-          } else if (options.bottommost_level_compaction ==
-                         BottommostLevelCompaction::kIfHaveCompactionFilter &&
-                     cfd->ioptions()->compaction_filter == nullptr &&
-                     cfd->ioptions()->compaction_filter_factory == nullptr) {
-            // Skip bottommost level compaction since we don't have a
-            // compaction filter
-            continue;
-          }
-          output_level = level;
-          // update max_file_num_to_ignore only for bottom level compaction
-          // because data in newly compacted files in middle levels may still
-          // need to be pushed down
-          max_file_num_to_ignore = next_file_number;
-        } else {
+      if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+          cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        assert(first_overlapped_level == 0);
+        s = RunManualCompaction(
+            cfd, first_overlapped_level, first_overlapped_level, options, begin,
+            end, exclusive, true /* disallow_trivial_move */,
+            std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+            trim_ts);
+        final_output_level = max_overlapped_level;
+      } else {
+        assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
+        uint64_t next_file_number = versions_->current_next_file_number();
+        // Start compaction from `first_overlapped_level`, one level down at a
+        // time, until output level >= max_overlapped_level.
+        // When max_overlapped_level == 0, we will still compact from L0 -> L1
+        // (or LBase), followed by a bottommost level intra-level compaction
+        // at L1 (or LBase), if applicable.
+        int level = first_overlapped_level;
+        final_output_level = level;
+        int output_level, base_level;
+        while (level < max_overlapped_level || level == 0) {
           output_level = level + 1;
-          if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
-              cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+          if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
               level == 0) {
             output_level = ColumnFamilyData::kCompactToBaseLevel;
           }
-          // if it's a BottommostLevel compaction and `kForce*` compaction is
-          // set, disallow trivial move
-          if (level == max_overlapped_level &&
-              (options.bottommost_level_compaction ==
-                   BottommostLevelCompaction::kForce ||
-               options.bottommost_level_compaction ==
-                   BottommostLevelCompaction::kForceOptimized)) {
-            disallow_trivial_move = true;
+          // Use max value for `max_file_num_to_ignore` to always compact
+          // files down.
+          s = RunManualCompaction(
+              cfd, level, output_level, options, begin, end, exclusive,
+              !trim_ts.empty() /* disallow_trivial_move */,
+              std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+              trim_ts,
+              output_level == ColumnFamilyData::kCompactToBaseLevel
+                  ? &base_level
+                  : nullptr);
+          if (!s.ok()) {
+            break;
           }
+          if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+            assert(base_level > 0);
+            level = base_level;
+          } else {
+            ++level;
+          }
+          final_output_level = level;
+          TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+          TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
         }
-        // trim_ts need real compaction to remove latest record
-        if (!trim_ts.empty()) {
-          disallow_trivial_move = true;
-        }
-        s = RunManualCompaction(cfd, level, output_level, options, begin, end,
-                                exclusive, disallow_trivial_move,
-                                max_file_num_to_ignore, trim_ts);
-        if (!s.ok()) {
-          break;
-        }
-        if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
-          final_output_level = cfd->NumberLevels() - 1;
-        } else if (output_level > final_output_level) {
-          final_output_level = output_level;
+        if (s.ok()) {
+          assert(final_output_level > 0);
+          // bottommost level intra-level compaction
+          // TODO(cbi): this preserves earlier behavior where if
+          // max_overlapped_level = 0 and bottommost_level_compaction is
+          // kIfHaveCompactionFilter, we only do an L0 -> LBase compaction
+          // and do not do intra-LBase compaction even when the user
+          // configures a compaction filter. We may want to still do a
+          // LBase -> LBase compaction in case there is some file in LBase
+          // that did not go through L0 -> LBase compaction, and hence did
+          // not go through the compaction filter.
+          if ((options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kIfHaveCompactionFilter &&
+               max_overlapped_level != 0 &&
+               (cfd->ioptions()->compaction_filter != nullptr ||
+                cfd->ioptions()->compaction_filter_factory != nullptr)) ||
+              options.bottommost_level_compaction ==
+                  BottommostLevelCompaction::kForceOptimized ||
+              options.bottommost_level_compaction ==
+                  BottommostLevelCompaction::kForce) {
+            // Use `next_file_number` as `max_file_num_to_ignore` to avoid
+            // rewriting newly compacted files when it is kForceOptimized.
+            s = RunManualCompaction(
+                cfd, final_output_level, final_output_level, options, begin,
+                end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
+                next_file_number /* max_file_num_to_ignore */, trim_ts);
+          }
         }
-        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
-        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
       }
     }
   }
@@ -1853,7 +1862,8 @@
 Status DBImpl::RunManualCompaction(
     ColumnFamilyData* cfd, int input_level, int output_level,
     const CompactRangeOptions& compact_range_options, const Slice* begin,
     const Slice* end, bool exclusive, bool disallow_trivial_move,
-    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+    int* final_output_level) {
   assert(input_level == ColumnFamilyData::kCompactAllLevels ||
          input_level >= 0);
@@ -2004,6 +2014,15 @@ Status DBImpl::RunManualCompaction(
     } else if (!scheduled) {
       if (compaction == nullptr) {
         manual.done = true;
+        if (final_output_level) {
+          // No compaction needed or there is a conflicting compaction.
+          // Still set `final_output_level` to the level where we would
+          // have compacted to.
+          *final_output_level = output_level;
+          if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+            *final_output_level = cfd->current()->storage_info()->base_level();
+          }
+        }
         bg_cv_.SignalAll();
         continue;
       }
@@ -2037,6 +2056,9 @@ Status DBImpl::RunManualCompaction(
       }
       scheduled = true;
       TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
+      if (final_output_level) {
+        *final_output_level = compaction->output_level();
+      }
     }
   }
 
diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index 11e7f49fa..f60a724eb 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -810,9 +810,10 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) {
   // We created 4 sst files in L0
   ASSERT_EQ("4", FilesPerLevel(0));
 
-  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file.
+  // Use kForceOptimized to not rewrite the new L1 file.
   CompactRangeOptions cro;
-  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
   ASSERT_EQ("0,1", FilesPerLevel(0));
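Rough sketch for illustration (not part of this patch; it assumes a debug, non-NDEBUG source build of RocksDB so that sync points are compiled in, and the DB path is arbitrary): the L0 -> L1 trivial move that kForce* now permits can be observed outside the test framework using the same sync point hooked by the SequenceKeysManualCompaction test above.

#include <atomic>
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "test_util/sync_point.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.disable_auto_compactions = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/trivial_move_demo", &db);
  assert(s.ok());

  // Create a few non-overlapping L0 files.
  for (int i = 0; i < 4; i++) {
    s = db->Put(rocksdb::WriteOptions(), "key" + std::to_string(i), "value");
    assert(s.ok());
    s = db->Flush(rocksdb::FlushOptions());
    assert(s.ok());
  }

  std::atomic<bool> trivial_moved{false};
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::BackgroundCompaction:TrivialMove",
      [&](void* /*arg*/) { trivial_moved = true; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  rocksdb::CompactRangeOptions cro;
  cro.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce;
  s = db->CompactRange(cro, nullptr, nullptr);
  assert(s.ok());
  // With this patch, the L0 files reach the next level via trivial move before
  // the intra-level compaction, so the sync point is expected to fire.
  assert(trivial_moved.load());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  delete db;
  return 0;
}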