From 04c11b867df9190da204e38357a14d20296fb244 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Mon, 2 Apr 2018 21:57:28 -0700 Subject: [PATCH] Level Compaction with TTL Summary: Level Compaction with TTL. As of today, a file could exist in the LSM tree without going through the compaction process for a really long time if there are no updates to the data in the file's key range. For example, in certain use cases, the keys are not actually "deleted"; instead they are just set to empty values. There might not be any more writes to this "deleted" key range, and if so, such data could remain in the LSM for a really long time resulting in wasted space. Introducing a TTL could solve this problem. Files (and, in turn, data) older than TTL will be scheduled for compaction when there is no other background work. This will make the data go through the regular compaction process and get rid of old unwanted data. This also has the (good) side-effect of all the data in the non-bottommost level being newer than ttl, and all data in the bottommost level older than ttl. It could lead to more writes while reducing space. This functionality can be controlled by the newly introduced column family option -- ttl. TODO for later: - Make ttl mutable - Extend TTL to Universal compaction as well? (TTL is already supported in FIFO) - Maybe deprecate CompactionOptionsFIFO.ttl in favor of this new ttl option. Closes https://github.com/facebook/rocksdb/pull/3591 Differential Revision: D7275442 Pulled By: sagar0 fbshipit-source-id: dcba484717341200d419b0953dafcdf9eb2f0267 --- HISTORY.md | 1 + db/compaction_picker.cc | 91 +++++++++++++++++++++++------- db/db_compaction_test.cc | 52 +++++++++++++++++ db/db_impl_open.cc | 7 +-- db/version_set.cc | 31 ++++++++++ db/version_set.h | 13 +++++ include/rocksdb/advanced_options.h | 7 +++ include/rocksdb/listener.h | 1 + options/cf_options.cc | 3 +- options/cf_options.h | 2 + options/options.cc | 4 +- options/options_helper.cc | 5 +- options/options_settable_test.cc | 1 + util/testutil.cc | 1 + 14 files changed, 191 insertions(+), 28 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b6e479d16..f4454dd84 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. ### New Features +* * Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data. ### Bug Fixes * Fsync after writing global seq number to the ingestion file in ExternalSstFileIngestionJob. diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 9e20b63f7..713da4266 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -941,6 +941,9 @@ void CompactionPicker::UnregisterCompaction(Compaction* c) { bool LevelCompactionPicker::NeedsCompaction( const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { return true; } @@ -1010,6 +1013,8 @@ class LevelCompactionBuilder { // If there is any file marked for compaction, put put it into inputs. void PickFilesMarkedForCompaction(); + void PickExpiredTtlFiles(); + const std::string& cf_name_; VersionStorageInfo* vstorage_; CompactionPicker* compaction_picker_; @@ -1080,6 +1085,42 @@ void LevelCompactionBuilder::PickFilesMarkedForCompaction() { start_level_inputs_.files.clear(); } +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + void LevelCompactionBuilder::SetupInitialFiles() { // Find the compactions by size on all levels. bool skipped_l0_to_base = false; @@ -1133,30 +1174,38 @@ void LevelCompactionBuilder::SetupInitialFiles() { if (start_level_inputs_.empty()) { is_manual_ = true; parent_index_ = base_index_ = -1; + PickFilesMarkedForCompaction(); - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = - vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); } else { - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + + assert(start_level_inputs_.empty()); + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; } } } diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 57beaa407..70c2ea937 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -3106,6 +3106,58 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { } } +TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 1024; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + // Overwrite previous keys with smaller, but predictable, values. + ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,0,0,2", FilesPerLevel()); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + // Just do a siimple write + flush so that the Ttl expired files get + // compacted. + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // All non-L0 files are deleted, as they contained only deleted data. + ASSERT_EQ("1", FilesPerLevel()); +} + TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual // compaction only triggers flush after it's sure stall won't be triggered for diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 6e05af70a..78e871c58 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -174,17 +174,16 @@ static Status ValidateOptions( "universal and level compaction styles. "); } } - if (cfd.options.compaction_options_fifo.ttl > 0) { + if (cfd.options.ttl > 0 || cfd.options.compaction_options_fifo.ttl > 0) { if (db_options.max_open_files != -1) { return Status::NotSupported( - "FIFO Compaction with TTL is only supported when files are always " + "TTL is only supported when files are always " "kept open (set max_open_files = -1). "); } if (cfd.options.table_factory->Name() != BlockBasedTableFactory().Name()) { return Status::NotSupported( - "FIFO Compaction with TTL is only supported in " - "Block-Based Table format. "); + "TTL is only supported in Block-Based Table format. "); } } } diff --git a/db/version_set.cc b/db/version_set.cc index d2bfcc8de..b66b3436e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1667,6 +1667,9 @@ void VersionStorageInfo::ComputeCompactionScore( } ComputeFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction(); + if (immutable_cf_options.ttl > 0) { + ComputeExpiredTtlFiles(immutable_cf_options); + } EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -1693,6 +1696,34 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() { } } +void VersionStorageInfo::ComputeExpiredTtlFiles( + const ImmutableCFOptions& ioptions) { + assert(ioptions.ttl > 0); + + expired_ttl_files_.clear(); + + int64_t _current_time; + auto status = ioptions.env->GetCurrentTime(&_current_time); + if (!status.ok()) { + return; + } + const uint64_t current_time = static_cast(_current_time); + + for (int level = 0; level < num_levels() - 1; level++) { + for (auto f : files_[level]) { + if (!f->being_compacted && f->fd.table_reader != nullptr && + f->fd.table_reader->GetTableProperties() != nullptr) { + auto creation_time = + f->fd.table_reader->GetTableProperties()->creation_time; + if (creation_time > 0 && + creation_time < (current_time - ioptions.ttl)) { + expired_ttl_files_.emplace_back(level, f); + } + } + } + } +} + namespace { // used to sort files by size diff --git a/db/version_set.h b/db/version_set.h index bf92b200c..3783da314 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -135,6 +135,10 @@ class VersionStorageInfo { // ComputeCompactionScore() void ComputeFilesMarkedForCompaction(); + // This computes ttl_expired_files_ and is called by + // ComputeCompactionScore() + void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions); + // This computes bottommost_files_marked_for_compaction_ and is called by // ComputeCompactionScore() or UpdateOldestSnapshot(). // @@ -286,6 +290,13 @@ class VersionStorageInfo { return files_marked_for_compaction_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& ExpiredTtlFiles() const { + assert(finalized_); + return expired_ttl_files_; + } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) // REQUIRES: DB mutex held during access const autovector>& @@ -446,6 +457,8 @@ class VersionStorageInfo { // ComputeCompactionScore() autovector> files_marked_for_compaction_; + autovector> expired_ttl_files_; + // These files are considered bottommost because none of their keys can exist // at lower levels. They are not necessarily all in the same level. The marked // ones are eligible for compaction because they contain duplicate key diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 12cb6a317..24b65d93e 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -570,6 +570,13 @@ struct AdvancedColumnFamilyOptions { // Default: false bool report_bg_io_stats = false; + // Non-bottom-level files older than TTL will go through the compaction + // process. This needs max_open_files to be set to -1. + // Enabled only for level compaction for now. + // + // Default: 0 (disabled) + uint64_t ttl = 0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index ad2df66f8..62f15d520 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -80,6 +80,7 @@ enum class CompactionReason { // [Level] Automatic compaction within bottommost level to cleanup duplicate // versions of same user key, usually due to a released snapshot. kBottommostFiles, + kTtl, }; enum class FlushReason : int { diff --git a/options/cf_options.cc b/options/cf_options.cc index 227c662e4..c7e12f587 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -74,7 +74,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, row_cache(db_options.row_cache), max_subcompactions(db_options.max_subcompactions), memtable_insert_with_hint_prefix_extractor( - cf_options.memtable_insert_with_hint_prefix_extractor.get()) {} + cf_options.memtable_insert_with_hint_prefix_extractor.get()), + ttl(cf_options.ttl) {} // Multiple two operands. If they overflow, return op1. uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { diff --git a/options/cf_options.h b/options/cf_options.h index 40db7c0ea..fec659652 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -118,6 +118,8 @@ struct ImmutableCFOptions { uint32_t max_subcompactions; const SliceTransform* memtable_insert_with_hint_prefix_extractor; + + uint64_t ttl; }; struct MutableCFOptions { diff --git a/options/options.cc b/options/options.cc index 0ebf15473..2a6af76b3 100644 --- a/options/options.cc +++ b/options/options.cc @@ -85,7 +85,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) optimize_filters_for_hits(options.optimize_filters_for_hits), paranoid_file_checks(options.paranoid_file_checks), force_consistency_checks(options.force_consistency_checks), - report_bg_io_stats(options.report_bg_io_stats) { + report_bg_io_stats(options.report_bg_io_stats), + ttl(options.ttl) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -321,6 +322,7 @@ void ColumnFamilyOptions::Dump(Logger* log) const { force_consistency_checks); ROCKS_LOG_HEADER(log, " Options.report_bg_io_stats: %d", report_bg_io_stats); + ROCKS_LOG_HEADER(log, " Options.ttl: %d", ttl); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index afcad5bf4..9088b589e 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -1819,7 +1819,10 @@ std::unordered_map {offset_of(&ColumnFamilyOptions::compaction_options_universal), OptionType::kCompactionOptionsUniversal, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, compaction_options_universal)}}}; + offsetof(struct MutableCFOptions, compaction_options_universal)}}, + {"ttl", + {offset_of(&ColumnFamilyOptions::ttl), OptionType::kUInt64T, + OptionVerificationType::kNormal, false, 0}}}; std::unordered_map OptionsHelper::fifo_compaction_options_type_info = { diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 11f708fe7..08ae53986 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -437,6 +437,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "hard_pending_compaction_bytes_limit=0;" "disable_auto_compactions=false;" "report_bg_io_stats=true;" + "ttl=60;" "compaction_options_fifo={max_table_files_size=3;ttl=100;allow_" "compaction=false;};", new_options)); diff --git a/util/testutil.cc b/util/testutil.cc index 1aa4bce75..eff624e91 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -349,6 +349,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { // uint64_t options static const uint64_t uint_max = static_cast(UINT_MAX); + cf_opt->ttl = uint_max + rnd->Uniform(10000); cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); cf_opt->max_compaction_bytes =