Improve universal compaction picker for tiered compaction (#10467)

Summary:
The current universal compaction picker may trigger extra size amplification
compactions if there is more hot data on the penultimate level. Improve the
picker to skip the last level in the size amp calculation if tiered compaction
is enabled, which:
1. avoids extra, unnecessary size amp compactions;
2. is intended, since the cold tier (the last level) is typically not size
   constrained, so skipping size amp for the cold tier is the desired behavior.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10467

Test Plan: CI and added unittest

Reviewed By: siying

Differential Revision: D38391350

Pulled By: jay-zhuang

fbshipit-source-id: 103c0731c05e0a7e8f267e9e829d022328be25d2
main
Jay Zhuang 2 years ago committed by Facebook GitHub Bot
parent 563f574372
commit 375534752a
  1. 1
      HISTORY.md
  2. 142
      db/compaction/compaction_picker_test.cc
  3. 61
      db/compaction/compaction_picker_universal.cc
  4. 6
      include/rocksdb/advanced_options.h

@ -27,6 +27,7 @@
* To minimize the internal fragmentation caused by the variable size of the compressed blocks in `CompressedSecondaryCache`, the original block is split according to the jemalloc bin size in `Insert()` and then merged back in `Lookup()`.
* PosixLogger is removed and by default EnvLogger will be used for info logging. The behavior of the two loggers should be very similar when using the default Posix Env.
* Remove [min|max]_timestamp from VersionEdit for now since they are not tracked in MANIFEST anyway but consume two empty std::string (up to 64 bytes) for each file. Should they be added back in the future, we should store them more compactly.
* Improve universal tiered storage compaction picker to avoid extra major compaction triggered by size amplification. If `preclude_last_level_data_seconds` is enabled, the size amplification is calculated over non-last-level data only, which skips the last level and uses the penultimate level as the size base.
### Performance Improvements
* Instead of constructing `FragmentedRangeTombstoneList` during every read operation, it is now constructed once and stored in immutable memtables. This improves speed of querying range tombstones from immutable memtables.

@ -3175,6 +3175,148 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
}
// Verifies that a size amplification compaction can still be triggered with
// tiered compaction enabled (`preclude_last_level_data_seconds` > 0) when the
// last sorted run is not on the last level.
TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
  const uint64_t kFileSize = 100000;
  const int kNumLevels = 7;
  const int kLastLevel = kNumLevels - 1;

  ioptions_.compaction_style = kCompactionStyleUniversal;
  ioptions_.preclude_last_level_data_seconds = 1000;
  mutable_cf_options_.compaction_options_universal
      .max_size_amplification_percent = 200;
  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);

  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
  // Nothing is added to the last level (L6); the oldest sorted run is on L5.
  Add(0, 100U, "100", "300", 1 * kFileSize);
  Add(0, 101U, "200", "400", 1 * kFileSize);
  Add(4, 90U, "100", "600", 4 * kFileSize);
  Add(5, 80U, "200", "300", 2 * kFileSize);
  UpdateVersionStorageInfo();

  std::unique_ptr<Compaction> compaction(
      universal_compaction_picker.PickCompaction(
          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
          &log_buffer_));
  // The picker may return nullptr; fail cleanly instead of dereferencing it.
  ASSERT_TRUE(compaction.get() != nullptr);

  // Make sure it's a size amp compaction that includes all files.
  ASSERT_EQ(compaction->compaction_reason(),
            CompactionReason::kUniversalSizeAmplification);
  ASSERT_EQ(compaction->output_level(), kLastLevel);
  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
  ASSERT_EQ(compaction->input_levels(4)->num_files, 1);
  ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
}
// Verifies that the size amp calculation skips the last level (L6) when
// tiered compaction is enabled, so no size amp compaction is triggered and a
// size ratio compaction is picked instead.
TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
  const uint64_t kFileSize = 100000;
  const int kNumLevels = 7;
  const int kLastLevel = kNumLevels - 1;
  const int kPenultimateLevel = kLastLevel - 1;

  ioptions_.compaction_style = kCompactionStyleUniversal;
  ioptions_.preclude_last_level_data_seconds = 1000;
  mutable_cf_options_.compaction_options_universal
      .max_size_amplification_percent = 200;
  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);

  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
  // Data is placed on L0, the penultimate level (L5) and the last level (L6).
  Add(0, 100U, "100", "300", 1 * kFileSize);
  Add(0, 101U, "200", "400", 1 * kFileSize);
  Add(5, 90U, "100", "600", 4 * kFileSize);
  Add(6, 80U, "200", "300", 2 * kFileSize);
  UpdateVersionStorageInfo();

  std::unique_ptr<Compaction> compaction(
      universal_compaction_picker.PickCompaction(
          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
          &log_buffer_));
  // The picker may return nullptr; fail cleanly instead of dereferencing it.
  ASSERT_TRUE(compaction.get() != nullptr);

  // Internally, size amp compaction is evaluated before size ratio
  // compaction. Make sure the result is a size ratio compaction and not a
  // size amp compaction.
  ASSERT_EQ(compaction->compaction_reason(),
            CompactionReason::kUniversalSizeRatio);
  ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
  ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
  ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
}
// Tiered compaction is only supported when there are more than 2 levels
// (otherwise the penultimate level would be level 0, which may make things
// more complicated). With only 2 levels, level 1 is still treated as the last
// level for size amp compaction.
TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
  const uint64_t kFileSize = 100000;
  const int kNumLevels = 2;
  const int kLastLevel = kNumLevels - 1;

  ioptions_.compaction_style = kCompactionStyleUniversal;
  ioptions_.preclude_last_level_data_seconds = 1000;
  mutable_cf_options_.compaction_options_universal
      .max_size_amplification_percent = 200;
  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);

  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
  Add(0, 100U, "100", "300", 1 * kFileSize);
  Add(0, 101U, "200", "400", 1 * kFileSize);
  Add(0, 90U, "100", "600", 4 * kFileSize);
  Add(1, 80U, "200", "300", 2 * kFileSize);
  UpdateVersionStorageInfo();

  std::unique_ptr<Compaction> compaction(
      universal_compaction_picker.PickCompaction(
          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
          &log_buffer_));
  // The picker may return nullptr; fail cleanly instead of dereferencing it.
  ASSERT_TRUE(compaction.get() != nullptr);

  // Size amp compaction is still triggered even though
  // preclude_last_level_data_seconds is set.
  ASSERT_EQ(compaction->compaction_reason(),
            CompactionReason::kUniversalSizeAmplification);
  ASSERT_EQ(compaction->output_level(), kLastLevel);
  ASSERT_EQ(compaction->input_levels(0)->num_files, 3);
  ASSERT_EQ(compaction->input_levels(1)->num_files, 1);
}
// Verifies that a size amp compaction can still be triggered for tiered
// storage, but that it only covers non-last-level files.
TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
  const uint64_t kFileSize = 100000;
  const int kNumLevels = 7;
  const int kLastLevel = kNumLevels - 1;
  const int kPenultimateLevel = kLastLevel - 1;

  ioptions_.compaction_style = kCompactionStyleUniversal;
  ioptions_.preclude_last_level_data_seconds = 1000;
  mutable_cf_options_.compaction_options_universal
      .max_size_amplification_percent = 200;
  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);

  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
  Add(0, 100U, "100", "300", 3 * kFileSize);
  Add(0, 101U, "200", "400", 2 * kFileSize);
  Add(5, 90U, "100", "600", 2 * kFileSize);
  Add(6, 80U, "200", "300", 2 * kFileSize);
  UpdateVersionStorageInfo();

  std::unique_ptr<Compaction> compaction(
      universal_compaction_picker.PickCompaction(
          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
          &log_buffer_));
  // The picker may return nullptr; fail cleanly instead of dereferencing it.
  ASSERT_TRUE(compaction.get() != nullptr);

  // It's a size amp compaction, but it doesn't include the last level file
  // and it outputs to the penultimate level.
  ASSERT_EQ(compaction->compaction_reason(),
            CompactionReason::kUniversalSizeAmplification);
  ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
  ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
  ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
}
class PerKeyPlacementCompactionPickerTest
: public CompactionPickerTest,
public testing::WithParamInterface<bool> {

@ -106,6 +106,9 @@ class UniversalCompactionBuilder {
Compaction* PickCompactionToOldest(size_t start_index,
CompactionReason compaction_reason);
Compaction* PickCompactionWithSortedRunRange(
size_t start_index, size_t end_index, CompactionReason compaction_reason);
// Try to pick periodic compaction. The caller should only call it
// if there is at least one file marked for periodic compaction.
// null will be returned if no such a compaction can be formed
@ -811,8 +814,20 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
}
// size of the base sorted run for size amp calculation
uint64_t base_sr_size = sorted_runs_.back().size;
size_t sr_end_idx = sorted_runs_.size() - 1;
// If tiered compaction is enabled and the last sorted run is the last level
if (ioptions_.preclude_last_level_data_seconds > 0 &&
ioptions_.num_levels > 2 &&
sorted_runs_.back().level == ioptions_.num_levels - 1 &&
sorted_runs_.size() > 1) {
sr_end_idx = sorted_runs_.size() - 2;
base_sr_size = sorted_runs_[sr_end_idx].size;
}
// keep adding up all the remaining files
for (size_t loop = start_index; loop + 1 < sorted_runs_.size(); loop++) {
for (size_t loop = start_index; loop < sr_end_idx; loop++) {
sr = &sorted_runs_[loop];
if (sr->being_compacted) {
// TODO with incremental compaction is supported, we might want to
@ -832,23 +847,20 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
return nullptr;
}
// size of earliest file
uint64_t earliest_file_size = sorted_runs_.back().size;
// size amplification = percentage of additional size
if (candidate_size * 100 < ratio * earliest_file_size) {
if (candidate_size * 100 < ratio * base_sr_size) {
ROCKS_LOG_BUFFER(
log_buffer_,
"[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
" earliest-file-size %" PRIu64,
cf_name_.c_str(), candidate_size, earliest_file_size);
cf_name_.c_str(), candidate_size, base_sr_size);
return nullptr;
} else {
ROCKS_LOG_BUFFER(
log_buffer_,
"[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
" earliest-file-size %" PRIu64,
cf_name_.c_str(), candidate_size, earliest_file_size);
cf_name_.c_str(), candidate_size, base_sr_size);
}
// Since incremental compaction can't include more than second last
// level, it can introduce penalty, compared to full compaction. We
@ -860,7 +872,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
// This also prevent the case when compaction falls behind and we
// need to compact more levels for compactions to catch up.
if (mutable_cf_options_.compaction_options_universal.incremental) {
double fanout_threshold = static_cast<double>(earliest_file_size) /
double fanout_threshold = static_cast<double>(base_sr_size) /
static_cast<double>(candidate_size) * 1.8;
Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
if (picked != nullptr) {
@ -869,8 +881,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
return picked;
}
}
return PickCompactionToOldest(start_index,
CompactionReason::kUniversalSizeAmplification);
return PickCompactionWithSortedRunRange(
start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
}
Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
@ -1233,11 +1245,17 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
// Picks a compaction that spans the sorted runs from `start_index` through
// the final entry of `sorted_runs_`, by delegating to
// PickCompactionWithSortedRunRange() with that final index as the end.
Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
    size_t start_index, CompactionReason compaction_reason) {
  const size_t last_sorted_run_index = sorted_runs_.size() - 1;
  return PickCompactionWithSortedRunRange(start_index, last_sorted_run_index,
                                          compaction_reason);
}
Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
size_t start_index, size_t end_index, CompactionReason compaction_reason) {
assert(start_index < sorted_runs_.size());
// Estimate total file size
uint64_t estimated_total_size = 0;
for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
for (size_t loop = start_index; loop <= end_index; loop++) {
estimated_total_size += sorted_runs_[loop].size;
}
uint32_t path_id =
@ -1248,7 +1266,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
for (size_t i = 0; i < inputs.size(); ++i) {
inputs[i].level = start_level + static_cast<int>(i);
}
for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
for (size_t loop = start_index; loop <= end_index; loop++) {
auto& picking_sr = sorted_runs_[loop];
if (picking_sr.level == 0) {
FileMetaData* f = picking_sr.file;
@ -1279,12 +1297,19 @@ Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
file_num_buf);
}
// output files at the bottom most level, unless it's reserved
int output_level = vstorage_->num_levels() - 1;
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind) {
assert(output_level > 1);
output_level--;
int output_level;
if (end_index == sorted_runs_.size() - 1) {
// output files at the last level, unless it's reserved
output_level = vstorage_->num_levels() - 1;
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind) {
assert(output_level > 1);
output_level--;
}
} else {
// if it's not including all sorted_runs, it can only output to the level
// above the `end_index + 1` sorted_run.
output_level = sorted_runs_[end_index + 1].level - 1;
}
// We never check size for

@ -885,6 +885,12 @@ struct AdvancedColumnFamilyOptions {
// will be precluded from the last level.
// 0 means no key will be precluded from the last level.
//
// Note: when enabled, the universal size amplification calculation
// (controlled by option
// `compaction_options_universal.max_size_amplification_percent`) excludes
// the last level. The feature is designed for tiered storage, where the last
// level is typically the cold tier and is likely not size constrained, so
// size amp is only computed for the non-last levels.
//
// Default: 0 (disable the feature)
uint64_t preclude_last_level_data_seconds = 0;

Loading…
Cancel
Save