From f1b9f804e9500390497f46b1626dd2a8ee64e496 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 21 Sep 2015 17:16:31 -0700 Subject: [PATCH] Add a mode to always pick the oldest file to compact for each level Summary: Add options.compaction_pri, which specifies the policy about which file to compact first. kCompactionPriByLargestSeq will compact oldest files first. Verified the behavior in db_bench but did not write unit tests yet. Also need to make it settable through option string and dynamically changeable. Test Plan: Will write unit tests Reviewers: igor, rven, anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, MarkCallaghan Reviewed By: yhchiang Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D45951 --- db/compaction_picker.cc | 7 +------ db/compaction_picker_test.cc | 2 +- db/db_bench.cc | 6 ++++++ db/version_builder_test.cc | 2 +- db/version_set.cc | 36 ++++++++++++++++++++++++------------ db/version_set.h | 27 +++++++++++++++------------ include/rocksdb/options.h | 12 ++++++++++++ util/mutable_cf_options.h | 3 +++ util/options.cc | 4 ++++ 9 files changed, 67 insertions(+), 32 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 26f396c93..27935085d 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -1028,7 +1028,7 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage, // Pick the largest file in this level that is not already // being compacted - const std::vector& file_size = vstorage->FilesBySize(level); + const std::vector& file_size = vstorage->FilesByCompactionPri(level); const std::vector& level_files = vstorage->LevelFiles(level); // record the first file that is not yet compacted @@ -1039,11 +1039,6 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage, int index = file_size[i]; auto* f = level_files[index]; - assert((i == file_size.size() - 1) || - (i >= VersionStorageInfo::kNumberFilesToSort - 1) || - (f->compensated_file_size >= - level_files[file_size[i + 1]]->compensated_file_size)); - // do not pick a file to compact if it is being compacted // from n-1 level. if (f->being_compacted) { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 888865fea..bb4afb510 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -117,7 +117,7 @@ class CompactionPickerTest : public testing::Test { void UpdateVersionStorageInfo() { vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); - vstorage_->UpdateFilesBySize(); + vstorage_->UpdateFilesByCompactionPri(mutable_cf_options_); vstorage_->UpdateNumNonEmptyLevels(); vstorage_->GenerateFileIndexer(); vstorage_->GenerateLevelFilesBrief(); diff --git a/db/db_bench.cc b/db/db_bench.cc index c02457e22..8ed4ff517 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -314,6 +314,10 @@ static rocksdb::CompactionStyle FLAGS_compaction_style_e; DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style, "style of compaction: level-based vs universal"); +static rocksdb::CompactionPri FLAGS_compaction_pri_e; +DEFINE_int32(compaction_pri, (int32_t)rocksdb::Options().compaction_style, + "priority of files to compaction: by size or by data age"); + DEFINE_int32(universal_size_ratio, 0, "Percentage flexibility while comparing file size" " (for universal compaction only)."); @@ -2248,6 +2252,7 @@ class Benchmark { options.max_subcompactions = static_cast(FLAGS_subcompactions); options.max_background_flushes = FLAGS_max_background_flushes; options.compaction_style = FLAGS_compaction_style_e; + options.compaction_pri = FLAGS_compaction_pri_e; if (FLAGS_prefix_size != 0) { options.prefix_extractor.reset( NewFixedPrefixTransform(FLAGS_prefix_size)); @@ -3957,6 +3962,7 @@ int main(int argc, char** argv) { if (FLAGS_statistics) { dbstats = rocksdb::CreateDBStatistics(); } + FLAGS_compaction_pri_e = (rocksdb::CompactionPri)FLAGS_compaction_pri; std::vector fanout = rocksdb::StringSplit( FLAGS_max_bytes_for_level_multiplier_additional, ','); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 5312fd1ba..66230eef4 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -77,7 +77,7 @@ class VersionBuilderTest : public testing::Test { } void UpdateVersionStorageInfo() { - vstorage_.UpdateFilesBySize(); + vstorage_.UpdateFilesByCompactionPri(mutable_cf_options_); vstorage_.UpdateNumNonEmptyLevels(); vstorage_.GenerateFileIndexer(); vstorage_.GenerateLevelFilesBrief(); diff --git a/db/version_set.cc b/db/version_set.cc index fd3105539..91471c49d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -782,7 +782,7 @@ VersionStorageInfo::VersionStorageInfo( compaction_style_(compaction_style), files_(new std::vector[num_levels_]), base_level_(num_levels_ == 1 ? -1 : 1), - files_by_size_(num_levels_), + files_by_compaction_pri_(num_levels_), level0_non_overlapping_(false), next_file_to_compact_by_size_(num_levels_), compaction_score_(num_levels_), @@ -923,7 +923,7 @@ void Version::PrepareApply( UpdateAccumulatedStats(update_stats); storage_info_.UpdateNumNonEmptyLevels(); storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options); - storage_info_.UpdateFilesBySize(); + storage_info_.UpdateFilesByCompactionPri(mutable_cf_options); storage_info_.GenerateFileIndexer(); storage_info_.GenerateLevelFilesBrief(); storage_info_.GenerateLevel0NonOverlapping(); @@ -1227,7 +1227,6 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > second.file->compensated_file_size); } - } // anonymous namespace void VersionStorageInfo::AddFile(int level, FileMetaData* f) { @@ -1245,7 +1244,7 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f) { // following functions called: // 1. UpdateNumNonEmptyLevels(); // 2. CalculateBaseBytes(); -// 3. UpdateFilesBySize(); +// 3. UpdateFilesByCompactionPri(); // 4. GenerateFileIndexer(); // 5. GenerateLevelFilesBrief(); // 6. GenerateLevel0NonOverlapping(); @@ -1297,7 +1296,8 @@ void VersionStorageInfo::UpdateNumNonEmptyLevels() { } } -void VersionStorageInfo::UpdateFilesBySize() { +void VersionStorageInfo::UpdateFilesByCompactionPri( + const MutableCFOptions& mutable_cf_options) { if (compaction_style_ == kCompactionStyleFIFO || compaction_style_ == kCompactionStyleUniversal) { // don't need this @@ -1306,8 +1306,8 @@ void VersionStorageInfo::UpdateFilesBySize() { // No need to sort the highest level because it is never compacted. for (int level = 0; level < num_levels() - 1; level++) { const std::vector& files = files_[level]; - auto& files_by_size = files_by_size_[level]; - assert(files_by_size.size() == 0); + auto& files_by_compaction_pri = files_by_compaction_pri_[level]; + assert(files_by_compaction_pri.size() == 0); // populate a temp vector for sorting based on size std::vector temp(files.size()); @@ -1321,16 +1321,28 @@ void VersionStorageInfo::UpdateFilesBySize() { if (num > temp.size()) { num = temp.size(); } - std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), - CompareCompensatedSizeDescending); + switch (mutable_cf_options.compaction_pri) { + case kCompactionPriByCompensatedSize: + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareCompensatedSizeDescending); + break; + case kCompactionPriByLargestSeq: + std::sort(temp.begin(), temp.end(), + [this](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->largest_seqno < f2.file->largest_seqno; + }); + break; + default: + assert(false); + } assert(temp.size() == files.size()); - // initialize files_by_size_ + // initialize files_by_compaction_pri_ for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); + files_by_compaction_pri.push_back(temp[i].index); } next_file_to_compact_by_size_[level] = 0; - assert(files_[level].size() == files_by_size_[level].size()); + assert(files_[level].size() == files_by_compaction_pri_[level].size()); } } diff --git a/db/version_set.h b/db/version_set.h index 7707bb1ac..2930f0a92 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -132,8 +132,9 @@ class VersionStorageInfo { // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(); + // record results in files_by_compaction_pri_. The largest files are listed + // first. + void UpdateFilesByCompactionPri(const MutableCFOptions& mutable_cf_options); void GenerateLevel0NonOverlapping(); bool level0_non_overlapping() const { @@ -226,9 +227,9 @@ class VersionStorageInfo { } // REQUIRES: This version has been saved (see VersionSet::SaveTo) - const std::vector& FilesBySize(int level) const { + const std::vector& FilesByCompactionPri(int level) const { assert(finalized_); - return files_by_size_[level]; + return files_by_compaction_pri_[level]; } // REQUIRES: This version has been saved (see VersionSet::SaveTo) @@ -242,7 +243,7 @@ class VersionStorageInfo { int base_level() const { return base_level_; } // REQUIRES: lock is held - // Set the index that is used to offset into files_by_size_ to find + // Set the index that is used to offset into files_by_compaction_pri_ to find // the next compaction candidate file. void SetNextCompactionIndex(int level, int index) { next_file_to_compact_by_size_[level] = index; @@ -259,7 +260,7 @@ class VersionStorageInfo { return file_indexer_; } - // Only the first few entries of files_by_size_ are sorted. + // Only the first few entries of files_by_compaction_pri_ are sorted. // There is no need to sort all the files because it is likely // that on a running system, we need to look at only the first // few largest files because a new version is created every few @@ -299,7 +300,8 @@ class VersionStorageInfo { uint64_t GetEstimatedActiveKeys() const; - // re-initializes the index that is used to offset into files_by_size_ + // re-initializes the index that is used to offset into + // files_by_compaction_pri_ // to find the next compaction candidate file. void ResetNextCompactionIndex(int level) { next_file_to_compact_by_size_[level] = 0; @@ -351,16 +353,16 @@ class VersionStorageInfo { // but files in each level are now sorted based on file // size. The file with the largest size is at the front. // This vector stores the index of the file from files_. - std::vector> files_by_size_; + std::vector> files_by_compaction_pri_; // If true, means that files in L0 have keys with non overlapping ranges bool level0_non_overlapping_; - // An index into files_by_size_ that specifies the first + // An index into files_by_compaction_pri_ that specifies the first // file that is not yet compacted std::vector next_file_to_compact_by_size_; - // Only the first few entries of files_by_size_ are sorted. + // Only the first few entries of files_by_compaction_pri_ are sorted. // There is no need to sort all the files because it is likely // that on a running system, we need to look at only the first // few largest files because a new version is created every few @@ -513,8 +515,9 @@ class Version { void UpdateAccumulatedStats(bool update_stats); // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(); + // record results in files_by_compaction_pri_. The largest files are listed + // first. + void UpdateFilesByCompactionPri(); ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs Logger* info_log_; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index f92e5cd66..b89d5d247 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -80,6 +80,13 @@ enum CompactionStyle : char { kCompactionStyleNone = 0x3, }; +enum CompactionPri : char { + // Slightly Priotize larger files by size compensated by #deletes + kCompactionPriByCompensatedSize = 0x0, + // First compact files whose data is oldest. + kCompactionPriByLargestSeq = 0x1, +}; + enum class WALRecoveryMode : char { // Original levelDB recovery // We tolerate incomplete record in trailing data on all logs @@ -547,6 +554,11 @@ struct ColumnFamilyOptions { // The compaction style. Default: kCompactionStyleLevel CompactionStyle compaction_style; + // If level compaction_style = kCompactionStyleLevel, for each level, + // which files are prioritized to be picked to compact. + // Default: kCompactionPriByCompensatedSize + CompactionPri compaction_pri; + // If true, compaction will verify checksum on every read that happens // as part of compaction // diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 3ce5ce36e..94c40195d 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -31,6 +31,7 @@ struct MutableCFOptions { options.level0_file_num_compaction_trigger), level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), level0_stop_writes_trigger(options.level0_stop_writes_trigger), + compaction_pri(options.compaction_pri), max_grandparent_overlap_factor(options.max_grandparent_overlap_factor), expanded_compaction_factor(options.expanded_compaction_factor), source_compaction_factor(options.source_compaction_factor), @@ -66,6 +67,7 @@ struct MutableCFOptions { level0_file_num_compaction_trigger(0), level0_slowdown_writes_trigger(0), level0_stop_writes_trigger(0), + compaction_pri(kCompactionPriByCompensatedSize), max_grandparent_overlap_factor(0), expanded_compaction_factor(0), source_compaction_factor(0), @@ -117,6 +119,7 @@ struct MutableCFOptions { int level0_file_num_compaction_trigger; int level0_slowdown_writes_trigger; int level0_stop_writes_trigger; + CompactionPri compaction_pri; int max_grandparent_overlap_factor; int expanded_compaction_factor; int source_compaction_factor; diff --git a/util/options.cc b/util/options.cc index 5644ce06c..8d8a1e23f 100644 --- a/util/options.cc +++ b/util/options.cc @@ -109,6 +109,7 @@ ColumnFamilyOptions::ColumnFamilyOptions() disable_auto_compactions(false), purge_redundant_kvs_while_flush(true), compaction_style(kCompactionStyleLevel), + compaction_pri(kCompactionPriByCompensatedSize), verify_checksums_in_compaction(true), filter_deletes(false), max_sequential_skip_in_iterations(8), @@ -170,6 +171,7 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) disable_auto_compactions(options.disable_auto_compactions), purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), compaction_style(options.compaction_style), + compaction_pri(options.compaction_pri), verify_checksums_in_compaction(options.verify_checksums_in_compaction), compaction_options_universal(options.compaction_options_universal), compaction_options_fifo(options.compaction_options_fifo), @@ -492,6 +494,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { verify_checksums_in_compaction); Header(log, " Options.compaction_style: %d", compaction_style); + Header(log, " Options.compaction_pri: %d", + compaction_pri); Header(log, " Options.compaction_options_universal.size_ratio: %u", compaction_options_universal.size_ratio); Header(log, "Options.compaction_options_universal.min_merge_width: %u",