From 6ce0b2ca34c0d73d17bd322e355a78542b2dfe50 Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Wed, 13 Jul 2022 20:54:49 -0700
Subject: [PATCH] Tiered Compaction: per key placement support (#9964)

Summary:
Support per_key_placement for last-level compaction, which will be used for
tiered compaction.
* the compaction iterator reports which level a key should be output to;
* the compaction gets the output level information and checks if it's safe to
  output the data to the penultimate level;
* all compaction output files will be installed.
* extra internal compaction stats added for the penultimate level.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9964

Test Plan:
* Unit tests
* db_bench, no significant difference: https://gist.github.com/jay-zhuang/3645f8fb97ec0ab47c10704bb39fd6e4
* microbench manual compaction, no significant difference: https://gist.github.com/jay-zhuang/ba679b3e89e24992615ee9eef310e6dd
* ran db_stress multiple times (not covering the new feature); looks good (internal: https://fburl.com/sandcastle/9w84pp2m)

Reviewed By: ajkr

Differential Revision: D36249494

Pulled By: jay-zhuang

fbshipit-source-id: a96da57c8031c1df83e4a7a8567b657a112b80a3
---
 CMakeLists.txt                            | 5 +
 Makefile                                  | 3 +
 TARGETS                                   | 14 +
 db/compaction/compaction.cc               | 83 +-
 db/compaction/compaction.h                | 70 +-
 db/compaction/compaction_iterator.cc      | 53 +-
 db/compaction/compaction_iterator.h       | 29 +
 db/compaction/compaction_iterator_test.cc | 126 ++
 db/compaction/compaction_job.cc           | 1781 +++------------------
 db/compaction/compaction_job.h            | 119 +-
 db/compaction/compaction_job_test.cc      | 88 +-
 db/compaction/compaction_outputs.cc       | 314 ++++
 db/compaction/compaction_outputs.h        | 328 ++++
 db/compaction/compaction_picker.cc        | 28 +-
 db/compaction/compaction_picker.h         | 3 +-
 db/compaction/compaction_picker_test.cc   | 189 ++-
 db/compaction/compaction_service_job.cc   | 825 ++++++++++
 db/compaction/compaction_state.cc         | 46 +
 db/compaction/compaction_state.h          | 42 +
 db/compaction/subcompaction_state.cc      | 223 +++
 db/compaction/subcompaction_state.h       | 255 +++
 db/compaction/tiered_compaction_test.cc   | 1253 +++++++++++++++
 db/db_compaction_test.cc                  | 165 +-
 db/internal_stats.h                       | 158 +-
 include/rocksdb/compaction_job_stats.h    | 2 +
 src.mk                                    | 5 +
 26 files changed, 4507 insertions(+), 1700 deletions(-)
 create mode 100644 db/compaction/compaction_outputs.cc
 create mode 100644 db/compaction/compaction_outputs.h
 create mode 100644 db/compaction/compaction_service_job.cc
 create mode 100644 db/compaction/compaction_state.cc
 create mode 100644 db/compaction/compaction_state.h
 create mode 100644 db/compaction/subcompaction_state.cc
 create mode 100644 db/compaction/subcompaction_state.h
 create mode 100644 db/compaction/tiered_compaction_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index de32c6f2d..f2bf831a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -627,7 +627,11 @@ set(SOURCES
   db/compaction/compaction_picker_fifo.cc
   db/compaction/compaction_picker_level.cc
   db/compaction/compaction_picker_universal.cc
+  db/compaction/compaction_service_job.cc
+  db/compaction/compaction_state.cc
+  db/compaction/compaction_outputs.cc
   db/compaction/sst_partitioner.cc
+  db/compaction/subcompaction_state.cc
   db/convenience.cc
   db/db_filesnapshot.cc
   db/db_impl/compacted_db_impl.cc
@@ -1231,6 +1235,7 @@ if(WITH_TESTS)
     db/compaction/compaction_iterator_test.cc
     db/compaction/compaction_picker_test.cc
     db/compaction/compaction_service_test.cc
+    db/compaction/tiered_compaction_test.cc
     db/comparator_db_test.cc
     db/corruption_test.cc
     db/cuckoo_table_db_test.cc
diff --git
a/Makefile b/Makefile index 85b607ca9..a79ab7143 100644 --- a/Makefile +++ b/Makefile @@ -1783,6 +1783,9 @@ write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unpre timestamped_snapshot_test: $(OBJ_DIR)/utilities/transactions/timestamped_snapshot_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +tiered_compaction_test: $(OBJ_DIR)/db/compaction/tiered_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 3aa61918c..48904020c 100644 --- a/TARGETS +++ b/TARGETS @@ -38,11 +38,15 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", "db/compaction/compaction_job.cc", + "db/compaction/compaction_outputs.cc", "db/compaction/compaction_picker.cc", "db/compaction/compaction_picker_fifo.cc", "db/compaction/compaction_picker_level.cc", "db/compaction/compaction_picker_universal.cc", + "db/compaction/compaction_service_job.cc", + "db/compaction/compaction_state.cc", "db/compaction/sst_partitioner.cc", + "db/compaction/subcompaction_state.cc", "db/convenience.cc", "db/db_filesnapshot.cc", "db/db_impl/compacted_db_impl.cc", @@ -368,11 +372,15 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", "db/compaction/compaction_job.cc", + "db/compaction/compaction_outputs.cc", "db/compaction/compaction_picker.cc", "db/compaction/compaction_picker_fifo.cc", "db/compaction/compaction_picker_level.cc", "db/compaction/compaction_picker_universal.cc", + "db/compaction/compaction_service_job.cc", + "db/compaction/compaction_state.cc", "db/compaction/sst_partitioner.cc", + "db/compaction/subcompaction_state.cc", "db/convenience.cc", "db/db_filesnapshot.cc", "db/db_impl/compacted_db_impl.cc", @@ -5764,6 +5772,12 @@ cpp_unittest_wrapper(name="thread_local_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="tiered_compaction_test", + srcs=["db/compaction/tiered_compaction_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="timer_queue_test", srcs=["util/timer_queue_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 997b0c714..885da2138 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -77,11 +77,11 @@ void Compaction::SetInputVersion(Version* _input_version) { void Compaction::GetBoundaryKeys( VersionStorageInfo* vstorage, const std::vector& inputs, Slice* smallest_user_key, - Slice* largest_user_key) { + Slice* largest_user_key, int exclude_level) { bool initialized = false; const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); for (size_t i = 0; i < inputs.size(); ++i) { - if (inputs[i].files.empty()) { + if (inputs[i].files.empty() || inputs[i].level == exclude_level) { continue; } if (inputs[i].level == 0) { @@ -257,7 +257,9 @@ Compaction::Compaction( _blob_garbage_collection_age_cutoff < 0 || _blob_garbage_collection_age_cutoff > 1 ? 
            mutable_cf_options()->blob_garbage_collection_age_cutoff
-          : _blob_garbage_collection_age_cutoff) {
+          : _blob_garbage_collection_age_cutoff),
+      penultimate_level_(EvaluatePenultimateLevel(
+          immutable_options_, start_level_, output_level_)) {
   MarkFilesBeingCompacted(true);
   if (is_manual_compaction_) {
     compaction_reason_ = CompactionReason::kManualCompaction;
   }
@@ -303,6 +305,18 @@ Compaction::Compaction(
       }
     }
   }
+
+  PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+  if (!SupportsPerKeyPlacement()) {
+    return;
+  }
+
+  GetBoundaryKeys(input_vstorage_, inputs_,
+                  &penultimate_level_smallest_user_key_,
+                  &penultimate_level_largest_user_key_, number_levels_ - 1);
 }

 Compaction::~Compaction() {
@@ -314,6 +328,37 @@ Compaction::~Compaction() {
   }
 }

+bool Compaction::SupportsPerKeyPlacement() const {
+  return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+
+bool Compaction::OverlapPenultimateLevelOutputRange(
+    const Slice& smallest_key, const Slice& largest_key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->Compare(smallest_key, penultimate_level_largest_user_key_) <=
+             0 &&
+         ucmp->Compare(largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->Compare(key, penultimate_level_smallest_user_key_) >= 0 &&
+         ucmp->Compare(key, penultimate_level_largest_user_key_) <= 0;
+}
+
 bool Compaction::InputCompressionMatchesOutput() const {
   int base_level = input_vstorage_->base_level();
   bool matches =
@@ -677,8 +722,36 @@ uint64_t Compaction::MinInputFileOldestAncesterTime(
   return min_oldest_ancester_time;
 }

-int Compaction::GetInputBaseLevel() const {
-  return input_vstorage_->base_level();
+int Compaction::EvaluatePenultimateLevel(
+    const ImmutableOptions& immutable_options, const int start_level,
+    const int output_level) {
+  // TODO: currently the per_key_placement feature only supports level and
+  // universal compaction
+  if (immutable_options.compaction_style != kCompactionStyleLevel &&
+      immutable_options.compaction_style != kCompactionStyleUniversal) {
+    return kInvalidLevel;
+  }
+  if (output_level != immutable_options.num_levels - 1) {
+    return kInvalidLevel;
+  }
+
+  int penultimate_level = output_level - 1;
+  assert(penultimate_level < immutable_options.num_levels);
+  if (penultimate_level <= 0 || penultimate_level < start_level) {
+    return kInvalidLevel;
+  }
+
+  // TODO: a public option like `options.preclude_last_level_data_seconds`
+  // will be added for the per_key_placement feature and checked here.
+  // Currently, it is only set by unit tests.
+  bool supports_per_key_placement = false;
+  TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+                           &supports_per_key_placement);
+  if (!supports_per_key_placement) {
+    return kInvalidLevel;
+  }
+
+  return penultimate_level;
 }

 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index a8fa25066..bd204b122 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -302,7 +302,25 @@ class Compaction {

   Slice GetLargestUserKey() const { return largest_user_key_; }

-  int GetInputBaseLevel() const;
+  // Return true if the compaction supports per_key_placement
+  bool SupportsPerKeyPlacement() const;
+
+  // Get the per_key_placement penultimate output level, which is
+  // `last_level - 1` if the per_key_placement feature is supported.
+  // Otherwise, return -1.
+  int GetPenultimateLevel() const;
+
+  // Return true if the given range overlaps with the penultimate level
+  // output range.
+  bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+                                          const Slice& largest_key) const;
+
+  // Return true if the key is within the penultimate level output range for
+  // the per_key_placement feature, in which case it is safe to place the key
+  // on the penultimate level. Different compaction strategies have different
+  // rules. If per_key_placement is not supported, always return false.
+  // TODO: currently it doesn't support moving data from the last level to the
+  // penultimate level
+  bool WithinPenultimateLevelOutputRange(const Slice& key) const;

   CompactionReason compaction_reason() const { return compaction_reason_; }

@@ -339,6 +357,15 @@ class Compaction {
     return notify_on_compaction_completion_;
   }

+  static constexpr int kInvalidLevel = -1;
+  // Evaluate the penultimate output level. If the compaction supports the
+  // per_key_placement feature, it returns the penultimate level number.
+  // Otherwise, it returns kInvalidLevel (-1), which means
+  // output_to_penultimate_level is not supported.
+  static int EvaluatePenultimateLevel(const ImmutableOptions& immutable_options,
+                                      const int start_level,
+                                      const int output_level);
+
  private:
   // mark (or clear) all files that are being compacted
   void MarkFilesBeingCompacted(bool mark_as_compacted);

@@ -346,7 +373,18 @@ class Compaction {
   // get the smallest and largest key present in files to be compacted
   static void GetBoundaryKeys(VersionStorageInfo* vstorage,
                               const std::vector<CompactionInputFiles>& inputs,
-                              Slice* smallest_key, Slice* largest_key);
+                              Slice* smallest_key, Slice* largest_key,
+                              int exclude_level = -1);
+
+  // Populate the penultimate level output range, which is used to determine
+  // if a key is safe to output to the penultimate level (for details see
+  // `Compaction::WithinPenultimateLevelOutputRange()`).
+  // TODO: Currently the penultimate level output range is the min/max keys of
+  // the non-last-level input files, which is only correct if no key is moved
+  // from the last level to the penultimate level. A more complicated per-key
+  // placement that moves data from the last level to the penultimate level
+  // needs an extra check.
+  void PopulatePenultimateLevelOutputRange();

   // Get the atomic file boundaries for all files in the compaction. Necessary
   // in order to avoid the scenario described in
@@ -444,7 +482,35 @@ class Compaction {

   // Blob garbage collection age cutoff.
   double blob_garbage_collection_age_cutoff_;
+
+  // Only set when the per_key_placement feature is enabled; -1
+  // (kInvalidLevel) means not supported.
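+  // Worked example (hypothetical setup, since the feature is currently gated
+  // by the test-only sync point above): with num_levels == 7, an L4 -> L6
+  // compaction gets penultimate_level_ == 5, while an L0 -> L1 compaction, or
+  // any compaction whose output is not the last level, gets kInvalidLevel.
+  // See EvaluatePenultimateLevel() above.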
+  const int penultimate_level_;
+
+  // Key range for penultimate level output
+  Slice penultimate_level_smallest_user_key_;
+  Slice penultimate_level_largest_user_key_;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+  const int level;
+  const Slice key;
+  const Slice value;
+  const SequenceNumber seq_num;
+
+  bool output_to_penultimate_level;
+
+  PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+                         SequenceNumber _seq_num)
+      : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+    output_to_penultimate_level = false;
+  }
+};
+#endif /* !NDEBUG */

 // Return sum of sizes of all files in `files`.
 extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index 395b8be42..54c58887c 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -1075,6 +1075,52 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
   }
 }

+void CompactionIterator::DecideOutputLevel() {
+#ifndef NDEBUG
+  // TODO: this will be set by sequence number or key range; for now, it is
+  // only set by unit tests
+  PerKeyPlacementContext context(level_, ikey_.user_key, value_,
+                                 ikey_.sequence);
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+                           &context);
+  output_to_penultimate_level_ = context.output_to_penultimate_level;
+#endif /* !NDEBUG */
+
+  // If the key is newer than the earliest snapshot, it has to be output to
+  // the penultimate level.
+  if (ikey_.sequence > earliest_snapshot_) {
+    output_to_penultimate_level_ = true;
+  }
+
+  if (output_to_penultimate_level_) {
+    // If it's decided to output to the penultimate level, but it is unsafe to
+    // do so, still output to the last level. For example, moving the data
+    // from a lower level to a higher level outside of the higher-level input
+    // key range is considered unsafe, because the key may conflict with
+    // higher-level SSTs not from this compaction.
+    // TODO: add a statistic for declined output_to_penultimate_level
+    bool safe_to_penultimate_level =
+        compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
+    if (!safe_to_penultimate_level) {
+      output_to_penultimate_level_ = false;
+      // This can happen when `bottommost_temperature` is disabled/enabled
+      // while holding a snapshot. When `bottommost_temperature` is not set
+      // (==kUnknown), the data newer than any snapshot is pushed to the last
+      // level, but when the per_key_placement feature is enabled on the fly,
+      // the data newer than the snapshot has to be moved to the penultimate
+      // level, which may or may not be safe. So the user needs to make sure
+      // all snapshots are released before enabling the
+      // `bottommost_temperature` feature.
+      // We will migrate the feature to `last_level_temperature` and maybe
+      // make it not dynamically changeable.
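+      // Worked example (hypothetical numbers): with a live snapshot at seq 5
+      // and a key at seq 6 that falls outside the penultimate-level output
+      // range, the key can be placed on neither level safely, so the check
+      // below fails the compaction with Status::Corruption (exercised by the
+      // ConflictWithSnapshot unit test in this patch).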
+      if (ikey_.sequence > earliest_snapshot_) {
+        status_ = Status::Corruption(
+            "Unsafe to store Seq later than snapshot in the last level if "
+            "per_key_placement is enabled");
+      }
+    }
+  }
+}
+
 void CompactionIterator::PrepareOutput() {
   if (valid_) {
     if (ikey_.type == kTypeValue) {
@@ -1083,6 +1129,10 @@ void CompactionIterator::PrepareOutput() {
       GarbageCollectBlobIfNeeded();
     }

+    if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+      DecideOutputLevel();
+    }
+
     // Zeroing out the sequence number leads to better compression.
     // If this is the bottommost level (no files in lower levels)
     // and the earliest snapshot is larger than this seqno
@@ -1097,7 +1147,8 @@
     if (valid_ && compaction_ != nullptr &&
         !compaction_->allow_ingest_behind() && bottommost_level_ &&
         DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
-        ikey_.type != kTypeMerge && current_key_committed_) {
+        ikey_.type != kTypeMerge && current_key_committed_ &&
+        !output_to_penultimate_level_) {
       if (ikey_.type == kTypeDeletion ||
           (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
         ROCKS_LOG_FATAL(
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index 64145420a..9c4dbab60 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -105,6 +105,10 @@ class CompactionIterator {
     virtual bool DoesInputReferenceBlobFiles() const = 0;

     virtual const Compaction* real_compaction() const = 0;
+
+    virtual bool SupportsPerKeyPlacement() const = 0;
+
+    virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
   };

   class RealCompaction : public CompactionProxy {
@@ -163,6 +167,16 @@
     const Compaction* real_compaction() const override { return compaction_; }

+    bool SupportsPerKeyPlacement() const override {
+      return compaction_->SupportsPerKeyPlacement();
+    }
+
+    // Check if the key is within the penultimate level output range, to see
+    // if it's safe to output to the penultimate level for the
+    // per_key_placement feature.
+    bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+      return compaction_->WithinPenultimateLevelOutputRange(key);
+    }
+
    private:
     const Compaction* compaction_;
   };
@@ -227,6 +241,12 @@ class CompactionIterator {
   const Slice& user_key() const { return current_user_key_; }
   const CompactionIterationStats& iter_stats() const { return iter_stats_; }
   uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  // Whether the current key should be placed on the penultimate level; only
+  // valid if per_key_placement is supported
+  bool output_to_penultimate_level() const {
+    return output_to_penultimate_level_;
+  }
+
   Status InputStatus() const { return input_.status(); }

  private:
   // Processes the input stream to find the next output
@@ -235,6 +255,10 @@
   // Do final preparations before presenting the output to the callee.
   void PrepareOutput();

+  // Decide whether the current key should be output to the last level or the
+  // penultimate level; only called for compactions that support per-key
+  // placement
+  void DecideOutputLevel();
+
   // Passes the output value to the blob file builder (if any), and replaces it
   // with the corresponding blob reference if it has been actually written to a
   // blob file (i.e. if it passed the value size check). Returns true if the
@@ -417,6 +441,11 @@
   // just been zeroed out during bottommost compaction.
   bool last_key_seq_zeroed_{false};

+  // True if the current key should be output to the penultimate level if
+  // possible; the compaction logic makes the final decision on which level
+  // to output to.
+  bool output_to_penultimate_level_{false};
+
   void AdvanceInputIter() { input_.Next(); }

   void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc
index 2f0250804..ccfe0728e 100644
--- a/db/compaction/compaction_iterator_test.cc
+++ b/db/compaction/compaction_iterator_test.cc
@@ -180,11 +180,21 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {

   const Compaction* real_compaction() const override { return nullptr; }

+  bool SupportsPerKeyPlacement() const override {
+    return supports_per_key_placement;
+  }
+
+  bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+    return (!key.starts_with("unsafe_pb"));
+  }
+
   bool key_not_exists_beyond_output_level = false;

   bool is_bottommost_level = false;

   bool is_allow_ingest_behind = false;
+
+  bool supports_per_key_placement = false;
 };

 // A simplified snapshot checker which assumes each snapshot has a global
@@ -254,6 +264,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
     compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
     compaction_proxy_->key_not_exists_beyond_output_level =
         key_not_exists_beyond_output_level;
+    compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
     compaction.reset(compaction_proxy_);
   }
   bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
@@ -295,6 +306,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {

   virtual bool AllowIngestBehind() const { return false; }

+  virtual bool SupportsPerKeyPlacement() const { return false; }
+
   void RunTest(
       const std::vector<std::string>& input_keys,
       const std::vector<std::string>& input_values,
@@ -756,6 +769,119 @@ TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
 INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
                         testing::Values(true, false));

+class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
+ public:
+  bool SupportsPerKeyPlacement() const override { return true; }
+};
+
+TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  latest_cold_seq = 5;
+
+  InitIterators(
+      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
+       test::KeyStr("c", 5, kTypeValue)},
+      {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  // the first 2 keys are hot, which should have
+  // `output_to_penultimate_level()==true` and their seq nums not zeroed out
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  // `c` is cold data, which should be output to the bottommost level
+  ASSERT_EQ(test::KeyStr("c", 0, kTypeValue),
+            c_iter_->key().ToString());
+  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
+  AddSnapshot(5);
+
+  InitIterators(
+      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
+       test::KeyStr("b", 5, kTypeValue)},
+      {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  // The first key and the tombstone are within the snapshot, so they should
+  // be output to the penultimate level (and their seq nums cannot be zeroed
+  // out).
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  // `b` is not protected by the snapshot, so its sequence number is zeroed
+  // out and it should be output to the bottommost level
+  ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
+  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  latest_cold_seq = 6;
+
+  AddSnapshot(5);
+
+  InitIterators({test::KeyStr("a", 7, kTypeValue),
+                 test::KeyStr("unsafe_pb", 6, kTypeValue),
+                 test::KeyStr("c", 5, kTypeValue)},
+                {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
+                kMaxSequenceNumber, nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  // The 2nd key is unsafe to output_to_penultimate_level, but it's within a
+  // snapshot, so for the per_key_placement feature it has to be output to
+  // the penultimate level, which is a corruption. We should never see such a
+  // case, as data with a seq num within a snapshot should always come from a
+  // higher compaction input level, which makes it safe to
+  // output_to_penultimate_level.
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->status().IsCorruption());
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
+                        PerKeyPlacementCompIteratorTest,
+                        testing::Values(true, false));
+
 // Tests how CompactionIterator works together with SnapshotChecker.
class CompactionIteratorWithSnapshotCheckerTest : public CompactionIteratorTest { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 3be0b63d7..d609ac0b7 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -11,34 +11,24 @@ #include #include -#include -#include #include -#include #include -#include #include #include #include "db/blob/blob_counting_iterator.h" #include "db/blob/blob_file_addition.h" #include "db/blob/blob_file_builder.h" -#include "db/blob/blob_garbage_meter.h" #include "db/builder.h" #include "db/compaction/clipping_iterator.h" +#include "db/compaction/compaction_state.h" #include "db/db_impl/db_impl.h" -#include "db/db_iter.h" #include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/history_trimming_iterator.h" -#include "db/log_reader.h" #include "db/log_writer.h" -#include "db/memtable.h" -#include "db/memtable_list.h" -#include "db/merge_context.h" #include "db/merge_helper.h" -#include "db/output_validator.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" #include "file/filename.h" @@ -48,30 +38,21 @@ #include "logging/log_buffer.h" #include "logging/logging.h" #include "monitoring/iostats_context_imp.h" -#include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" #include "options/configurable_helper.h" #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/utilities/options_type.h" -#include "table/block_based/block.h" -#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/unique_id_impl.h" #include "test_util/sync_point.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/mutexlock.h" -#include "util/random.h" #include "util/stop_watch.h" -#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -121,328 +102,6 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) { } } -// Maintains state for each sub-compaction -struct CompactionJob::SubcompactionState { - const Compaction* compaction; - std::unique_ptr c_iter; - - // The boundaries of the key-range this compaction is interested in. No two - // subcompactions may have overlapping key-ranges. 
- // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded - Slice *start, *end; - - // The return status of this subcompaction - Status status; - - // The return IO Status of this subcompaction - IOStatus io_status; - - // Files produced by this subcompaction - struct Output { - Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, - bool _enable_order_check, bool _enable_hash, bool _finished = false, - uint64_t precalculated_hash = 0) - : meta(std::move(_meta)), - validator(_icmp, _enable_order_check, _enable_hash, - precalculated_hash), - finished(_finished) {} - FileMetaData meta; - OutputValidator validator; - bool finished; - std::shared_ptr table_properties; - }; - - // State kept for output being generated - std::vector outputs; - std::vector blob_file_additions; - std::unique_ptr blob_garbage_meter; - std::unique_ptr outfile; - std::unique_ptr builder; - - Output* current_output() { - if (outputs.empty()) { - // This subcompaction's output could be empty if compaction was aborted - // before this subcompaction had a chance to generate any output files. - // When subcompactions are executed sequentially this is more likely and - // will be particularly likely for the later subcompactions to be empty. - // Once they are run in parallel however it should be much rarer. - return nullptr; - } else { - return &outputs.back(); - } - } - - // Some identified files with old oldest ancester time and the range should be - // isolated out so that the output file(s) in that range can be merged down - // for TTL and clear the timestamps for the range. - std::vector files_to_cut_for_ttl; - int cur_files_to_cut_for_ttl = -1; - int next_files_to_cut_for_ttl = 0; - - uint64_t current_output_file_size = 0; - - // State during the subcompaction - uint64_t total_bytes = 0; - uint64_t num_output_records = 0; - CompactionJobStats compaction_job_stats; - uint64_t approx_size = 0; - // An index that used to speed up ShouldStopBefore(). - size_t grandparent_index = 0; - // The number of bytes overlapping between the current output and - // grandparent files used in ShouldStopBefore(). - uint64_t overlapped_bytes = 0; - // A flag determines whether the key has been seen in ShouldStopBefore() - bool seen_key = false; - // sub compaction job id, which is used to identify different sub-compaction - // within the same compaction job. - const uint32_t sub_job_id; - - // Notify on sub-compaction completion only if listener was notified on - // sub-compaction begin. - bool notify_on_subcompaction_completion = false; - - // A flag determines if this subcompaction has been split by the cursor - bool is_split = false; - // We also maintain the output split key for each subcompaction to avoid - // repetitive comparison in ShouldStopBefore() - const InternalKey* local_output_split_key = nullptr; - - SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size, - uint32_t _sub_job_id) - : compaction(c), - start(_start), - end(_end), - approx_size(size), - sub_job_id(_sub_job_id) { - assert(compaction != nullptr); - const InternalKeyComparator* icmp = - &compaction->column_family_data()->internal_comparator(); - const InternalKey* output_split_key = compaction->GetOutputSplitKey(); - // Invalid output_split_key indicates that we do not need to split - if (output_split_key != nullptr) { - // We may only split the output when the cursor is in the range. 
Split - if ((end == nullptr || icmp->user_comparator()->Compare( - ExtractUserKey(output_split_key->Encode()), - ExtractUserKey(*end)) < 0) && - (start == nullptr || icmp->user_comparator()->Compare( - ExtractUserKey(output_split_key->Encode()), - ExtractUserKey(*start)) > 0)) { - local_output_split_key = output_split_key; - } - } - } - - // Adds the key and value to the builder - // If paranoid is true, adds the key-value to the paranoid hash - Status AddToBuilder(const Slice& key, const Slice& value) { - auto curr = current_output(); - assert(builder != nullptr); - assert(curr != nullptr); - Status s = curr->validator.Add(key, value); - if (!s.ok()) { - return s; - } - builder->Add(key, value); - return Status::OK(); - } - - void FillFilesToCutForTtl(); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) { - const InternalKeyComparator* icmp = - &compaction->column_family_data()->internal_comparator(); - const std::vector& grandparents = compaction->grandparents(); - - // Invalid local_output_split_key indicates that we do not need to split - if (local_output_split_key != nullptr && !is_split) { - // Split occurs when the next key is larger than/equal to the cursor - if (icmp->Compare(internal_key, local_output_split_key->Encode()) >= 0) { - is_split = true; - return true; - } - } - bool grandparant_file_switched = false; - // Scan to find earliest grandparent file that contains key. - while (grandparent_index < grandparents.size() && - icmp->Compare(internal_key, - grandparents[grandparent_index]->largest.Encode()) > - 0) { - if (seen_key) { - overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize(); - grandparant_file_switched = true; - } - assert(grandparent_index + 1 >= grandparents.size() || - icmp->Compare( - grandparents[grandparent_index]->largest.Encode(), - grandparents[grandparent_index + 1]->smallest.Encode()) <= 0); - grandparent_index++; - } - seen_key = true; - - if (grandparant_file_switched && overlapped_bytes + curr_file_size > - compaction->max_compaction_bytes()) { - // Too much overlap for current output; start new output - overlapped_bytes = 0; - return true; - } - - if (!files_to_cut_for_ttl.empty()) { - if (cur_files_to_cut_for_ttl != -1) { - // Previous key is inside the range of a file - if (icmp->Compare(internal_key, - files_to_cut_for_ttl[cur_files_to_cut_for_ttl] - ->largest.Encode()) > 0) { - next_files_to_cut_for_ttl = cur_files_to_cut_for_ttl + 1; - cur_files_to_cut_for_ttl = -1; - return true; - } - } else { - // Look for the key position - while (next_files_to_cut_for_ttl < - static_cast(files_to_cut_for_ttl.size())) { - if (icmp->Compare(internal_key, - files_to_cut_for_ttl[next_files_to_cut_for_ttl] - ->smallest.Encode()) >= 0) { - if (icmp->Compare(internal_key, - files_to_cut_for_ttl[next_files_to_cut_for_ttl] - ->largest.Encode()) <= 0) { - // With in the current file - cur_files_to_cut_for_ttl = next_files_to_cut_for_ttl; - return true; - } - // Beyond the current file - next_files_to_cut_for_ttl++; - } else { - // Still fall into the gap - break; - } - } - } - } - - return false; - } - - Status ProcessOutFlowIfNeeded(const Slice& key, const Slice& value) { - if (!blob_garbage_meter) { - return Status::OK(); - } - - return blob_garbage_meter->ProcessOutFlow(key, value); - } -}; - -void CompactionJob::SubcompactionState::FillFilesToCutForTtl() { - if (compaction->immutable_options()->compaction_style != - 
CompactionStyle::kCompactionStyleLevel || - compaction->immutable_options()->compaction_pri != - CompactionPri::kMinOverlappingRatio || - compaction->mutable_cf_options()->ttl == 0 || - compaction->num_input_levels() < 2 || compaction->bottommost_level()) { - return; - } - - // We define new file with oldest ancestor time to be younger than 1/4 TTL, - // and an old one to be older than 1/2 TTL time. - int64_t temp_current_time; - auto get_time_status = compaction->immutable_options()->clock->GetCurrentTime( - &temp_current_time); - if (!get_time_status.ok()) { - return; - } - uint64_t current_time = static_cast(temp_current_time); - if (current_time < compaction->mutable_cf_options()->ttl) { - return; - } - uint64_t old_age_thres = - current_time - compaction->mutable_cf_options()->ttl / 2; - - const std::vector& olevel = - *(compaction->inputs(compaction->num_input_levels() - 1)); - for (FileMetaData* file : olevel) { - // Worth filtering out by start and end? - uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); - // We put old files if they are not too small to prevent a flood - // of small files. - if (oldest_ancester_time < old_age_thres && - file->fd.GetFileSize() > - compaction->mutable_cf_options()->target_file_size_base / 2) { - files_to_cut_for_ttl.push_back(file); - } - } -} - -// Maintains state for the entire compaction -struct CompactionJob::CompactionState { - Compaction* const compaction; - - // REQUIRED: subcompaction states are stored in order of increasing - // key-range - std::vector sub_compact_states; - Status status; - - size_t num_output_files = 0; - uint64_t total_bytes = 0; - size_t num_blob_output_files = 0; - uint64_t total_blob_bytes = 0; - uint64_t num_output_records = 0; - - explicit CompactionState(Compaction* c) : compaction(c) {} - - Slice SmallestUserKey() { - for (const auto& sub_compact_state : sub_compact_states) { - if (!sub_compact_state.outputs.empty() && - sub_compact_state.outputs[0].finished) { - return sub_compact_state.outputs[0].meta.smallest.user_key(); - } - } - // If there is no finished output, return an empty slice. - return Slice(nullptr, 0); - } - - Slice LargestUserKey() { - for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend(); - ++it) { - if (!it->outputs.empty() && it->current_output()->finished) { - assert(it->current_output() != nullptr); - return it->current_output()->meta.largest.user_key(); - } - } - // If there is no finished output, return an empty slice. - return Slice(nullptr, 0); - } -}; - -void CompactionJob::AggregateStatistics() { - assert(compact_); - - for (SubcompactionState& sc : compact_->sub_compact_states) { - auto& outputs = sc.outputs; - - if (!outputs.empty() && !outputs.back().meta.fd.file_size) { - // An error occurred, so ignore the last output. 
- outputs.pop_back(); - } - - compact_->num_output_files += outputs.size(); - compact_->total_bytes += sc.total_bytes; - - const auto& blobs = sc.blob_file_additions; - - compact_->num_blob_output_files += blobs.size(); - - for (const auto& blob : blobs) { - compact_->total_blob_bytes += blob.GetTotalBlobBytes(); - } - - compact_->num_output_records += sc.num_output_records; - - compaction_job_stats_->Add(sc.compaction_job_stats); - } -} - CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, const MutableDBOptions& mutable_db_options, const FileOptions& file_options, @@ -471,8 +130,8 @@ CompactionJob::CompactionJob( stats_(stats), bottommost_level_(false), write_hint_(Env::WLTH_NOT_SET), - job_id_(job_id), compaction_job_stats_(compaction_job_stats), + job_id_(job_id), dbname_(dbname), db_id_(db_id), db_session_id_(db_session_id), @@ -577,12 +236,11 @@ void CompactionJob::Prepare() { StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); GenSubcompactionBoundaries(); } - assert(sizes_.size() == boundaries_.size() + 1); for (size_t i = 0; i <= boundaries_.size(); i++) { Slice* start = i == 0 ? nullptr : &boundaries_[i - 1]; Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i]; - compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i], + compact_->sub_compact_states.emplace_back(c, start, end, static_cast(i)); } RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, @@ -590,10 +248,8 @@ void CompactionJob::Prepare() { } else { constexpr Slice* start = nullptr; constexpr Slice* end = nullptr; - constexpr uint64_t size = 0; - compact_->sub_compact_states.emplace_back(c, start, end, size, - /*sub_job_id*/ 0); + compact_->sub_compact_states.emplace_back(c, start, end, /*sub_job_id*/ 0); } } @@ -722,15 +378,10 @@ void CompactionJob::GenSubcompactionBoundaries() { } if (sum >= mean) { boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit)); - sizes_.emplace_back(sum); subcompactions--; sum = 0; } } - sizes_.emplace_back(sum + ranges.back().size); - } else { - // Only one range so its size is the total sum of sizes computed above - sizes_.emplace_back(sum); } } @@ -762,16 +413,17 @@ Status CompactionJob::Run() { thread.join(); } - compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; - compaction_stats_.cpu_micros = 0; - for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { - compaction_stats_.cpu_micros += - compact_->sub_compact_states[i].compaction_job_stats.cpu_micros; + compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros); + + for (auto& state : compact_->sub_compact_states) { + compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros); + state.RemoveLastEmptyOutput(); } - RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_TIME, + compaction_stats_.stats.micros); RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, - compaction_stats_.cpu_micros); + compaction_stats_.stats.cpu_micros); TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); @@ -787,7 +439,7 @@ Status CompactionJob::Run() { break; } - if (!state.blob_file_additions.empty()) { + if (state.Current().HasBlobFileAdditions()) { wrote_new_blob_files = true; } } @@ -819,9 +471,9 @@ Status CompactionJob::Run() { } if (status.ok()) { thread_pool.clear(); - std::vector files_output; + std::vector files_output; for (const auto& state : compact_->sub_compact_states) { - for (const auto& output : state.outputs) { + for 
(const auto& output : state.GetOutputs()) { files_output.emplace_back(&output); } } @@ -903,7 +555,7 @@ Status CompactionJob::Run() { TablePropertiesCollection tp; for (const auto& state : compact_->sub_compact_states) { - for (const auto& output : state.outputs) { + for (const auto& output : state.GetOutputs()) { auto fn = TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); @@ -913,7 +565,7 @@ Status CompactionJob::Run() { compact_->compaction->SetOutputTableProperties(std::move(tp)); // Finish up all book-keeping to unify the subcompaction results - AggregateStatistics(); + compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); UpdateCompactionStats(); RecordCompactionIOStats(); @@ -935,8 +587,9 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { ColumnFamilyData* cfd = compact_->compaction->column_family_data(); assert(cfd); - cfd->internal_stats()->AddCompactionStats( - compact_->compaction->output_level(), thread_pri_, compaction_stats_); + int output_level = compact_->compaction->output_level(); + cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_, + compaction_stats_); if (status.ok()) { status = InstallCompactionResults(mutable_cf_options); @@ -947,7 +600,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); - const auto& stats = compaction_stats_; + const auto& stats = compaction_stats_.stats; double read_write_amp = 0.0; double write_amp = 0.0; @@ -1009,6 +662,18 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { blob_files.back()->GetBlobFileNumber()); } + if (compaction_stats_.has_penultimate_level_output) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] has Penultimate Level output: %" PRIu64 + ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64, + column_family_name.c_str(), + compaction_stats_.penultimate_level_stats.bytes_written, + compact_->compaction->GetPenultimateLevel(), + compaction_stats_.penultimate_level_stats.num_output_files, + compaction_stats_.penultimate_level_stats.num_output_records); + } + UpdateCompactionJobStats(stats); auto stream = event_logger_->LogToBuffer(log_buffer_, 8192); @@ -1017,16 +682,16 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { << "compaction_time_micros" << stats.micros << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" << compact_->compaction->output_level() << "num_output_files" - << compact_->num_output_files << "total_output_size" - << compact_->total_bytes; + << stats.num_output_files << "total_output_size" + << stats.bytes_written; - if (compact_->num_blob_output_files > 0) { - stream << "num_blob_output_files" << compact_->num_blob_output_files - << "total_blob_output_size" << compact_->total_blob_bytes; + if (stats.num_output_files_blob > 0) { + stream << "num_blob_output_files" << stats.num_output_files_blob + << "total_blob_output_size" << stats.bytes_written_blob; } stream << "num_input_records" << stats.num_input_records - << "num_output_records" << compact_->num_output_records + << "num_output_records" << stats.num_output_records << "num_subcompactions" << compact_->sub_compact_states.size() << "output_compression" << CompressionTypeToString(compact_->compaction->output_compression()); @@ -1060,217 +725,22 @@ Status CompactionJob::Install(const MutableCFOptions& 
mutable_cf_options) { stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber(); } - CleanupCompaction(); - return status; -} - -#ifndef ROCKSDB_LITE -CompactionServiceJobStatus -CompactionJob::ProcessKeyValueCompactionWithCompactionService( - SubcompactionState* sub_compact) { - assert(sub_compact); - assert(sub_compact->compaction); - assert(db_options_.compaction_service); - - const Compaction* compaction = sub_compact->compaction; - CompactionServiceInput compaction_input; - compaction_input.output_level = compaction->output_level(); - compaction_input.db_id = db_id_; - - const std::vector& inputs = - *(compact_->compaction->inputs()); - for (const auto& files_per_level : inputs) { - for (const auto& file : files_per_level.files) { - compaction_input.input_files.emplace_back( - MakeTableFileName(file->fd.GetNumber())); - } - } - compaction_input.column_family.name = - compaction->column_family_data()->GetName(); - compaction_input.column_family.options = - compaction->column_family_data()->GetLatestCFOptions(); - compaction_input.db_options = - BuildDBOptions(db_options_, mutable_db_options_copy_); - compaction_input.snapshots = existing_snapshots_; - compaction_input.has_begin = sub_compact->start; - compaction_input.begin = - compaction_input.has_begin ? sub_compact->start->ToString() : ""; - compaction_input.has_end = sub_compact->end; - compaction_input.end = - compaction_input.has_end ? sub_compact->end->ToString() : ""; - compaction_input.approx_size = sub_compact->approx_size; - - std::string compaction_input_binary; - Status s = compaction_input.Write(&compaction_input_binary); - if (!s.ok()) { - sub_compact->status = s; - return CompactionServiceJobStatus::kFailure; - } - - std::ostringstream input_files_oss; - bool is_first_one = true; - for (const auto& file : compaction_input.input_files) { - input_files_oss << (is_first_one ? 
"" : ", ") << file; - is_first_one = false; - } - - ROCKS_LOG_INFO( - db_options_.info_log, - "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", - compaction_input.column_family.name.c_str(), job_id_, - compaction_input.output_level, input_files_oss.str().c_str()); - CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, - GetCompactionId(sub_compact), thread_pri_); - CompactionServiceJobStatus compaction_status = - db_options_.compaction_service->StartV2(info, compaction_input_binary); - switch (compaction_status) { - case CompactionServiceJobStatus::kSuccess: - break; - case CompactionServiceJobStatus::kFailure: - sub_compact->status = Status::Incomplete( - "CompactionService failed to start compaction job."); - ROCKS_LOG_WARN(db_options_.info_log, - "[%s] [JOB %d] Remote compaction failed to start.", - compaction_input.column_family.name.c_str(), job_id_); - return compaction_status; - case CompactionServiceJobStatus::kUseLocal: - ROCKS_LOG_INFO( - db_options_.info_log, - "[%s] [JOB %d] Remote compaction fallback to local by API Start.", - compaction_input.column_family.name.c_str(), job_id_); - return compaction_status; - default: - assert(false); // unknown status - break; - } - - ROCKS_LOG_INFO(db_options_.info_log, - "[%s] [JOB %d] Waiting for remote compaction...", - compaction_input.column_family.name.c_str(), job_id_); - std::string compaction_result_binary; - compaction_status = db_options_.compaction_service->WaitForCompleteV2( - info, &compaction_result_binary); - - if (compaction_status == CompactionServiceJobStatus::kUseLocal) { - ROCKS_LOG_INFO(db_options_.info_log, - "[%s] [JOB %d] Remote compaction fallback to local by API " - "WaitForComplete.", - compaction_input.column_family.name.c_str(), job_id_); - return compaction_status; + if (compaction_stats_.has_penultimate_level_output) { + InternalStats::CompactionStats& pl_stats = + compaction_stats_.penultimate_level_stats; + stream << "penultimate_level_num_output_files" << pl_stats.num_output_files; + stream << "penultimate_level_bytes_written" << pl_stats.bytes_written; + stream << "penultimate_level_num_output_records" + << pl_stats.num_output_records; + stream << "penultimate_level_num_output_files_blob" + << pl_stats.num_output_files_blob; + stream << "penultimate_level_bytes_written_blob" + << pl_stats.bytes_written_blob; } - CompactionServiceResult compaction_result; - s = CompactionServiceResult::Read(compaction_result_binary, - &compaction_result); - - if (compaction_status == CompactionServiceJobStatus::kFailure) { - if (s.ok()) { - if (compaction_result.status.ok()) { - sub_compact->status = Status::Incomplete( - "CompactionService failed to run the compaction job (even though " - "the internal status is okay)."); - } else { - // set the current sub compaction status with the status returned from - // remote - sub_compact->status = compaction_result.status; - } - } else { - sub_compact->status = Status::Incomplete( - "CompactionService failed to run the compaction job (and no valid " - "result is returned)."); - compaction_result.status.PermitUncheckedError(); - } - ROCKS_LOG_WARN(db_options_.info_log, - "[%s] [JOB %d] Remote compaction failed.", - compaction_input.column_family.name.c_str(), job_id_); - return compaction_status; - } - - if (!s.ok()) { - sub_compact->status = s; - compaction_result.status.PermitUncheckedError(); - return CompactionServiceJobStatus::kFailure; - } - sub_compact->status = compaction_result.status; - - std::ostringstream output_files_oss; - is_first_one = 
true; - for (const auto& file : compaction_result.output_files) { - output_files_oss << (is_first_one ? "" : ", ") << file.file_name; - is_first_one = false; - } - - ROCKS_LOG_INFO(db_options_.info_log, - "[%s] [JOB %d] Receive remote compaction result, output path: " - "%s, files: %s", - compaction_input.column_family.name.c_str(), job_id_, - compaction_result.output_path.c_str(), - output_files_oss.str().c_str()); - - if (!s.ok()) { - sub_compact->status = s; - return CompactionServiceJobStatus::kFailure; - } - - for (const auto& file : compaction_result.output_files) { - uint64_t file_num = versions_->NewFileNumber(); - auto src_file = compaction_result.output_path + "/" + file.file_name; - auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, - file_num, compaction->output_path_id()); - s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); - if (!s.ok()) { - sub_compact->status = s; - return CompactionServiceJobStatus::kFailure; - } - - FileMetaData meta; - uint64_t file_size; - s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); - if (!s.ok()) { - sub_compact->status = s; - return CompactionServiceJobStatus::kFailure; - } - meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, - file.smallest_seqno, file.largest_seqno); - meta.smallest.DecodeFrom(file.smallest_internal_key); - meta.largest.DecodeFrom(file.largest_internal_key); - meta.oldest_ancester_time = file.oldest_ancester_time; - meta.file_creation_time = file.file_creation_time; - meta.marked_for_compaction = file.marked_for_compaction; - meta.unique_id = file.unique_id; - - auto cfd = compaction->column_family_data(); - sub_compact->outputs.emplace_back(std::move(meta), - cfd->internal_comparator(), false, false, - true, file.paranoid_hash); - } - sub_compact->compaction_job_stats = compaction_result.stats; - sub_compact->num_output_records = compaction_result.num_output_records; - sub_compact->approx_size = compaction_input.approx_size; // is this used? 
- sub_compact->total_bytes = compaction_result.total_bytes; - RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); - RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, - compaction_result.bytes_written); - return CompactionServiceJobStatus::kSuccess; -} - -void CompactionJob::BuildSubcompactionJobInfo( - SubcompactionState* sub_compact, - SubcompactionJobInfo* subcompaction_job_info) const { - Compaction* c = compact_->compaction; - ColumnFamilyData* cfd = c->column_family_data(); - - subcompaction_job_info->cf_id = cfd->GetID(); - subcompaction_job_info->cf_name = cfd->GetName(); - subcompaction_job_info->status = sub_compact->status; - subcompaction_job_info->thread_id = env_->GetThreadID(); - subcompaction_job_info->job_id = job_id_; - subcompaction_job_info->subcompaction_job_id = sub_compact->sub_job_id; - subcompaction_job_info->base_input_level = c->start_level(); - subcompaction_job_info->output_level = c->output_level(); - subcompaction_job_info->stats = sub_compact->compaction_job_stats; + CleanupCompaction(); + return status; } -#endif // !ROCKSDB_LITE void CompactionJob::NotifyOnSubcompactionBegin( SubcompactionState* sub_compact) { @@ -1291,9 +761,11 @@ void CompactionJob::NotifyOnSubcompactionBegin( sub_compact->notify_on_subcompaction_completion = true; SubcompactionJobInfo info{}; - BuildSubcompactionJobInfo(sub_compact, &info); + sub_compact->BuildSubcompactionJobInfo(info); + info.job_id = static_cast(job_id_); + info.thread_id = env_->GetThreadID(); - for (auto listener : db_options_.listeners) { + for (const auto& listener : db_options_.listeners) { listener->OnSubcompactionBegin(info); } info.status.PermitUncheckedError(); @@ -1319,9 +791,11 @@ void CompactionJob::NotifyOnSubcompactionCompleted( } SubcompactionJobInfo info{}; - BuildSubcompactionJobInfo(sub_compact, &info); + sub_compact->BuildSubcompactionJobInfo(info); + info.job_id = static_cast(job_id_); + info.thread_id = env_->GetThreadID(); - for (auto listener : db_options_.listeners) { + for (const auto& listener : db_options_.listeners) { listener->OnSubcompactionCompleted(info); } #else @@ -1369,8 +843,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { NotifyOnSubcompactionBegin(sub_compact); - CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), - existing_snapshots_); + auto range_del_agg = std::make_unique( + &cfd->internal_comparator(), existing_snapshots_); // TODO: since we already use C++17, should use // std::optional instead. @@ -1396,7 +870,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr raw_input(versions_->MakeInputIterator( - read_options, sub_compact->compaction, &range_del_agg, + read_options, sub_compact->compaction, range_del_agg.get(), file_options_for_read_, (start == nullptr) ? 
      std::optional<Slice>{} : std::optional<Slice>{*start},
@@ -1429,9 +903,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   std::unique_ptr<InternalIterator> blob_counter;

   if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
-    sub_compact->blob_garbage_meter = std::make_unique<BlobGarbageMeter>();
-    blob_counter = std::make_unique<BlobCountingIterator>(
-        input, sub_compact->blob_garbage_meter.get());
+    BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
+    blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
     input = blob_counter.get();
   }

@@ -1480,6 +953,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   std::vector<std::string> blob_file_paths;

+  // TODO: BlobDB needs to support output_to_penultimate_level compaction,
+  // which needs 2 builders, so it may need to move to `CompactionOutputs`
   std::unique_ptr<BlobFileBuilder> blob_file_builder(
       (mutable_cf_options->enable_blob_files &&
        sub_compact->compaction->output_level() >=
           mutable_cf_options, &file_options_, job_id_, cfd->GetID(),
           cfd->GetName(), Env::IOPriority::IO_LOW, write_hint_, io_tracer_,
           blob_callback_, BlobFileCreationReason::kCompaction,
-          &blob_file_paths, &sub_compact->blob_file_additions)
+          &blob_file_paths,
+          sub_compact->Current().GetBlobFileAdditionsPtr())
           : nullptr);

   TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
      reinterpret_cast<void*>(
          const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));

-  Status status;
   const std::string* const full_history_ts_low =
       full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
   const SequenceNumber job_snapshot_seq =
       job_context_ ? job_context_->GetJobSnapshotSequence() : kMaxSequenceNumber;
-  sub_compact->c_iter.reset(new CompactionIterator(
+
+  auto c_iter = std::make_unique<CompactionIterator>(
       input, cfd->user_comparator(), &merge, versions_->LastSequence(),
       &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
       snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
       /*expect_valid_internal_key=*/true, range_del_agg.get(),
       blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
       sub_compact->compaction, compaction_filter, shutting_down_,
-      db_options_.info_log, full_history_ts_low));
-  auto c_iter = sub_compact->c_iter.get();
+      db_options_.info_log, full_history_ts_low);
   c_iter->SeekToFirst();
+
+  // Assign the range-delete aggregator to the target output level, which
+  // makes sure it only outputs to a single level
+  sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
   if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) {
     sub_compact->FillFilesToCutForTtl();
     // ShouldStopBefore() maintains state based on keys processed so far. The
     // compaction loop always calls it on the "next" key, thus won't tell it the
     // first key. So we do that here.
-    sub_compact->ShouldStopBefore(c_iter->key(),
-                                  sub_compact->current_output_file_size);
+    sub_compact->ShouldStopBefore(c_iter->key());
   }

   const auto& c_iter_stats = c_iter->iter_stats();

-  std::unique_ptr<SstPartitioner> partitioner =
-      sub_compact->compaction->output_level() == 0
-          ? nullptr
-          : sub_compact->compaction->CreateSstPartitioner();
-  std::string last_key_for_partitioner;
+  // Define the open and close functions for the compaction files, which will
+  // be used to open/close output files when needed.
+  const CompactionFileOpenFunc open_file_func =
+      [this, sub_compact](CompactionOutputs& outputs) {
+        return this->OpenCompactionOutputFile(sub_compact, outputs);
+      };
+  const CompactionFileCloseFunc close_file_func =
+      [this, sub_compact](CompactionOutputs& outputs, const Status& status,
+                          const Slice& next_table_min_key) {
+        return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+                                                next_table_min_key);
+      };
+
+  Status status;
   while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
     // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
     // returns true.
-    const Slice& key = c_iter->key();
-    const Slice& value = c_iter->value();

     assert(!end ||
            cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
@@ -1548,88 +1033,33 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       RecordCompactionIOStats();
     }

-    // Open output file if necessary
-    if (sub_compact->builder == nullptr) {
-      status = OpenCompactionOutputFile(sub_compact);
-      if (!status.ok()) {
-        break;
-      }
-    }
-    status = sub_compact->AddToBuilder(key, value);
-    if (!status.ok()) {
-      break;
-    }
-
-    status = sub_compact->ProcessOutFlowIfNeeded(key, value);
-    if (!status.ok()) {
-      break;
-    }
-
-    const ParsedInternalKey& ikey = c_iter->ikey();
-    status = sub_compact->current_output()->meta.UpdateBoundaries(
-        key, value, ikey.sequence, ikey.type);
+    // Add the current compaction_iterator key to the target compaction
+    // output. If an output file needs to be closed or opened, this calls the
+    // `open_file_func` and `close_file_func` defined above.
+    // TODO: it would be better to move the compaction file open/close into
+    // `CompactionOutputs`, which owns the output file information.
+    status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func);
     if (!status.ok()) {
       break;
     }
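+    // Illustrative sketch of the per-key flow expected inside AddToOutput()
+    // (the real implementation is in db/compaction/subcompaction_state.cc,
+    // added by this patch; helper names not appearing elsewhere in this diff
+    // are assumptions):
+    //
+    //   CompactionOutputs& outputs = Current();  // level chosen by c_iter's
+    //                                            // per-key placement decision
+    //   if (outputs.IsPendingClose()) {          // a cut point was requested
+    //     s = close_func(outputs, input_status, c_iter.key());
+    //     if (s.ok()) s = open_func(outputs);
+    //   } else if (!outputs.HasBuilder()) {      // first key for this output
+    //     s = open_func(outputs);
+    //   }
+    //   if (s.ok()) s = outputs.AddToOutput(c_iter);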
-    sub_compact->current_output_file_size =
-        sub_compact->builder->EstimatedFileSize();
-    sub_compact->num_output_records++;
-
-    // Close output file if it is big enough. Two possibilities determine it's
-    // time to close it: (1) the current key should be this file's last key, (2)
-    // the next key should not be in this file.
-    //
-    // TODO(aekmekji): determine if file should be closed earlier than this
-    // during subcompactions (i.e. if output size, estimated by input size, is
-    // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
-    // and 0.6MB instead of 1MB and 0.2MB)
-    bool output_file_ended = false;
-    if (sub_compact->compaction->output_level() != 0 &&
-        sub_compact->current_output_file_size >=
-            sub_compact->compaction->max_output_file_size()) {
-      // (1) this key terminates the file. For historical reasons, the iterator
-      // status before advancing will be given to FinishCompactionOutputFile().
-      output_file_ended = true;
-    }
     TEST_SYNC_POINT_CALLBACK(
         "CompactionJob::Run():PausingManualCompaction:2",
         reinterpret_cast<void*>(
            const_cast<std::atomic<int>*>(&manual_compaction_canceled_)));
-    if (partitioner.get()) {
-      last_key_for_partitioner.assign(c_iter->user_key().data_,
-                                      c_iter->user_key().size_);
-    }

     c_iter->Next();
     if (c_iter->status().IsManualCompactionPaused()) {
       break;
     }
-    if (!output_file_ended && c_iter->Valid()) {
-      if (((partitioner.get() &&
-            partitioner->ShouldPartition(PartitionerRequest(
-                last_key_for_partitioner, c_iter->user_key(),
-                sub_compact->current_output_file_size)) == kRequired) ||
-           (sub_compact->compaction->output_level() != 0 &&
-            sub_compact->ShouldStopBefore(
-                c_iter->key(), sub_compact->current_output_file_size))) &&
-          sub_compact->builder != nullptr) {
-        // (2) this key belongs to the next file. For historical reasons, the
-        // iterator status after advancing will be given to
-        // FinishCompactionOutputFile().
-        output_file_ended = true;
-      }
-    }
-    if (output_file_ended) {
-      const Slice* next_key = nullptr;
-      if (c_iter->Valid()) {
-        next_key = &c_iter->key();
-      }
-      CompactionIterationStats range_del_out_stats;
-      status = FinishCompactionOutputFile(input->status(), sub_compact,
-                                          &range_del_agg, &range_del_out_stats,
-                                          next_key);
-      RecordDroppedKeys(range_del_out_stats,
-                        &sub_compact->compaction_job_stats);
+
+    // TODO: support earlier file cut for the penultimate level files, maybe
+    // by moving `ShouldStopBefore()` into the `CompactionOutputs` class.
+    // Currently the penultimate level output is only cut when it reaches the
+    // size limit.
+    if (!sub_compact->Current().IsPendingClose() &&
+        sub_compact->compaction->output_level() != 0 &&
+        !sub_compact->compaction->SupportsPerKeyPlacement() &&
+        sub_compact->ShouldStopBefore(c_iter->key())) {
+      sub_compact->Current().SetPendingClose();
     }
   }

@@ -1684,23 +1114,12 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
     status = c_iter->status();
   }

-  if (status.ok() && sub_compact->builder == nullptr &&
-      sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) {
-    // handle subcompaction containing only range deletions
-    status = OpenCompactionOutputFile(sub_compact);
-  }
-
-  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
-  // close the output file.
-  if (sub_compact->builder != nullptr) {
-    CompactionIterationStats range_del_out_stats;
-    Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg,
-                                          &range_del_out_stats);
-    if (!s.ok() && status.ok()) {
-      status = s;
-    }
-    RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
-  }
+  // Close the output files. The open-file function is also passed in: if the
+  // subcompaction contains only range deletions, no output file has been
+  // opened yet, so a new one must be created to hold them.
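+  // A minimal sketch of the expected close logic (names beyond
+  // HasBuilder()/HasRangeDel(), which appear elsewhere in this diff, are
+  // assumptions):
+  //
+  //   for (CompactionOutputs& outputs : {normal, penultimate}) {
+  //     if (!outputs.HasBuilder() && outputs.HasRangeDel()) {
+  //       s = open_func(outputs);  // open a file just to hold the range-dels
+  //     }
+  //     if (outputs.HasBuilder()) {
+  //       s = close_func(outputs, s, /*next_table_min_key=*/Slice());
+  //     }
+  //   }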
+ status = sub_compact->CloseCompactionFiles(status, open_file_func, + close_file_func); if (blob_file_builder) { if (status.ok()) { @@ -1709,6 +1128,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { blob_file_builder->Abandon(status); } blob_file_builder.reset(); + sub_compact->Current().UpdateBlobStats(); } sub_compact->compaction_job_stats.cpu_micros = @@ -1733,8 +1153,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } #ifdef ROCKSDB_ASSERT_STATUS_CHECKED if (!status.ok()) { - if (sub_compact->c_iter) { - sub_compact->c_iter->status().PermitUncheckedError(); + if (c_iter) { + c_iter->status().PermitUncheckedError(); } if (input) { input->status().PermitUncheckedError(); @@ -1742,7 +1162,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } #endif // ROCKSDB_ASSERT_STATUS_CHECKED - sub_compact->c_iter.reset(); blob_counter.reset(); clip.reset(); raw_input.reset(); @@ -1750,7 +1169,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { NotifyOnSubcompactionCompleted(sub_compact); } -uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) { +uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const { return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id; } @@ -1793,204 +1212,50 @@ void CompactionJob::RecordDroppedKeys( Status CompactionJob::FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - CompactionRangeDelAggregator* range_del_agg, - CompactionIterationStats* range_del_out_stats, - const Slice* next_table_min_key /* = nullptr */) { + CompactionOutputs& outputs, const Slice& next_table_min_key) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_SYNC_FILE); assert(sub_compact != nullptr); - assert(sub_compact->outfile); - assert(sub_compact->builder != nullptr); - assert(sub_compact->current_output() != nullptr); + assert(outputs.HasBuilder()); - uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber(); + FileMetaData* meta = outputs.GetMetaData(); + uint64_t output_number = meta->fd.GetNumber(); assert(output_number != 0); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - const Comparator* ucmp = cfd->user_comparator(); std::string file_checksum = kUnknownFileChecksum; std::string file_checksum_func_name = kUnknownFileChecksumFuncName; // Check for iterator errors Status s = input_status; - auto meta = &sub_compact->current_output()->meta; - assert(meta != nullptr); - if (s.ok()) { - Slice lower_bound_guard, upper_bound_guard; - std::string smallest_user_key; - const Slice *lower_bound, *upper_bound; - bool lower_bound_from_sub_compact = false; - if (sub_compact->outputs.size() == 1) { - // For the first output table, include range tombstones before the min key - // but after the subcompaction boundary. - lower_bound = sub_compact->start; - lower_bound_from_sub_compact = true; - } else if (meta->smallest.size() > 0) { - // For subsequent output tables, only include range tombstones from min - // key onwards since the previous file was extended to contain range - // tombstones falling before min key. 
- smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/); - lower_bound_guard = Slice(smallest_user_key); - lower_bound = &lower_bound_guard; - } else { - lower_bound = nullptr; - } - if (next_table_min_key != nullptr) { - // This may be the last file in the subcompaction in some cases, so we - // need to compare the end key of subcompaction with the next file start - // key. When the end key is chosen by the subcompaction, we know that - // it must be the biggest key in output file. Therefore, it is safe to - // use the smaller key as the upper bound of the output file, to ensure - // that there is no overlapping between different output files. - upper_bound_guard = ExtractUserKey(*next_table_min_key); - if (sub_compact->end != nullptr && - ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) { - upper_bound = sub_compact->end; - } else { - upper_bound = &upper_bound_guard; - } - } else { - // This is the last file in the subcompaction, so extend until the - // subcompaction ends. - upper_bound = sub_compact->end; - } - auto earliest_snapshot = kMaxSequenceNumber; - if (existing_snapshots_.size() > 0) { - earliest_snapshot = existing_snapshots_[0]; - } - bool has_overlapping_endpoints; - if (upper_bound != nullptr && meta->largest.size() > 0) { - has_overlapping_endpoints = - ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0; - } else { - has_overlapping_endpoints = false; - } - // The end key of the subcompaction must be bigger or equal to the upper - // bound. If the end of subcompaction is null or the upper bound is null, - // it means that this file is the last file in the compaction. So there - // will be no overlapping between this file and others. - assert(sub_compact->end == nullptr || - upper_bound == nullptr || - ucmp->Compare(*upper_bound , *sub_compact->end) <= 0); - auto it = range_del_agg->NewIterator(lower_bound, upper_bound, - has_overlapping_endpoints); - // Position the range tombstone output iterator. There may be tombstone - // fragments that are entirely out of range, so make sure that we do not - // include those. - if (lower_bound != nullptr) { - it->Seek(*lower_bound); - } else { - it->SeekToFirst(); + // Add range tombstones + auto earliest_snapshot = kMaxSequenceNumber; + if (existing_snapshots_.size() > 0) { + earliest_snapshot = existing_snapshots_[0]; + } + if (s.ok()) { + CompactionIterationStats range_del_out_stats; + // if the compaction supports per_key_placement, only output range dels to + // the penultimate level. + // Note: Use `bottommost_level_ = true` for both bottommost and + // output_to_penultimate_level compaction here, as it's only used to decide + // if range dels could be dropped. + if (outputs.HasRangeDel()) { + s = outputs.AddRangeDels(sub_compact->start, sub_compact->end, + range_del_out_stats, bottommost_level_, + cfd->internal_comparator(), earliest_snapshot, + next_table_min_key); } + RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1"); - for (; it->Valid(); it->Next()) { - auto tombstone = it->Tombstone(); - if (upper_bound != nullptr) { - int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_); - if ((has_overlapping_endpoints && cmp < 0) || - (!has_overlapping_endpoints && cmp <= 0)) { - // Tombstones starting after upper_bound only need to be included in - // the next table. 
If the current SST ends before upper_bound, i.e., - // `has_overlapping_endpoints == false`, we can also skip over range - // tombstones that start exactly at upper_bound. Such range tombstones - // will be included in the next file and are not relevant to the point - // keys or endpoints of the current file. - break; - } - } + } - if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) { - // TODO(andrewkr): tombstones that span multiple output files are - // counted for each compaction output file, so lots of double counting. - range_del_out_stats->num_range_del_drop_obsolete++; - range_del_out_stats->num_record_drop_obsolete++; - continue; - } + const uint64_t current_entries = outputs.NumEntries(); + + s = outputs.Finish(s); - auto kv = tombstone.Serialize(); - assert(lower_bound == nullptr || - ucmp->Compare(*lower_bound, kv.second) < 0); - // Range tombstone is not supported by output validator yet. - sub_compact->builder->Add(kv.first.Encode(), kv.second); - InternalKey smallest_candidate = std::move(kv.first); - if (lower_bound != nullptr && - ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) { - // Pretend the smallest key has the same user key as lower_bound - // (the max key in the previous table or subcompaction) in order for - // files to appear key-space partitioned. - // - // When lower_bound is chosen by a subcompaction, we know that - // subcompactions over smaller keys cannot contain any keys at - // lower_bound. We also know that smaller subcompactions exist, because - // otherwise the subcompaction woud be unbounded on the left. As a - // result, we know that no other files on the output level will contain - // actual keys at lower_bound (an output file may have a largest key of - // lower_bound@kMaxSequenceNumber, but this only indicates a large range - // tombstone was truncated). Therefore, it is safe to use the - // tombstone's sequence number, to ensure that keys at lower_bound at - // lower levels are covered by truncated tombstones. - // - // If lower_bound was chosen by the smallest data key in the file, - // choose lowest seqnum so this file's smallest internal key comes after - // the previous file's largest. The fake seqnum is OK because the read - // path's file-picking code only considers user key. - smallest_candidate = InternalKey( - *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0, - kTypeRangeDeletion); - } - InternalKey largest_candidate = tombstone.SerializeEndKey(); - if (upper_bound != nullptr && - ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) { - // Pretend the largest key has the same user key as upper_bound (the - // min key in the following table or subcompaction) in order for files - // to appear key-space partitioned. - // - // Choose highest seqnum so this file's largest internal key comes - // before the next file's/subcompaction's smallest. The fake seqnum is - // OK because the read path's file-picking code only considers the user - // key portion. - // - // Note Seek() also creates InternalKey with (user_key, - // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of - // kTypeRangeDeletion (0xF), so the range tombstone comes before the - // Seek() key in InternalKey's ordering. So Seek() will look in the - // next file for the user key. 
- largest_candidate = - InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); - } -#ifndef NDEBUG - SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; - if (meta->smallest.size() > 0) { - smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode()); - } -#endif - meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, - tombstone.seq_, - cfd->internal_comparator()); - // The smallest key in a file is used for range tombstone truncation, so - // it cannot have a seqnum of 0 (unless the smallest data key in a file - // has a seqnum of 0). Otherwise, the truncated tombstone may expose - // deleted keys at lower levels. - assert(smallest_ikey_seqnum == 0 || - ExtractInternalKeyFooter(meta->smallest.Encode()) != - PackSequenceAndType(0, kTypeRangeDeletion)); - } - } - const uint64_t current_entries = sub_compact->builder->NumEntries(); - if (s.ok()) { - s = sub_compact->builder->Finish(); - } else { - sub_compact->builder->Abandon(); - } - IOStatus io_s = sub_compact->builder->io_status(); - if (s.ok()) { - s = io_s; - } - const uint64_t current_bytes = sub_compact->builder->FileSize(); if (s.ok()) { - meta->fd.file_size = current_bytes; - meta->marked_for_compaction = sub_compact->builder->NeedCompact(); // With accurate smallest and largest key, we can get a slightly more // accurate oldest ancester time. // This makes oldest ancester time in manifest more accurate than in @@ -2010,25 +1275,16 @@ Status CompactionJob::FinishCompactionOutputFile( } } } - sub_compact->current_output()->finished = true; - sub_compact->total_bytes += current_bytes; // Finish and check for file errors - if (s.ok()) { - StopWatch sw(db_options_.clock, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - io_s = sub_compact->outfile->Sync(db_options_.use_fsync); - } - if (s.ok() && io_s.ok()) { - io_s = sub_compact->outfile->Close(); - } + IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_, + db_options_.use_fsync); + if (s.ok() && io_s.ok()) { - // Add the checksum information to file metadata. - meta->file_checksum = sub_compact->outfile->GetFileChecksum(); - meta->file_checksum_func_name = - sub_compact->outfile->GetFileChecksumFuncName(); file_checksum = meta->file_checksum; file_checksum_func_name = meta->file_checksum_func_name; } + if (s.ok()) { s = io_s; } @@ -2038,11 +1294,10 @@ Status CompactionJob::FinishCompactionOutputFile( // "normal" status, it does not also need to be checked sub_compact->io_status.PermitUncheckedError(); } - sub_compact->outfile.reset(); TableProperties tp; if (s.ok()) { - tp = sub_compact->builder->GetTableProperties(); + tp = outputs.GetTableProperties(); } if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { @@ -2067,21 +1322,20 @@ Status CompactionJob::FinishCompactionOutputFile( // Also need to remove the file from outputs, or it will be added to the // VersionEdit. - assert(!sub_compact->outputs.empty()); - sub_compact->outputs.pop_back(); + outputs.RemoveLastOutput(); meta = nullptr; } if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { // Output to event logger and fire events. - sub_compact->current_output()->table_properties = - std::make_shared(tp); + outputs.UpdateTableProperties(); ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 - " keys, %" PRIu64 " bytes%s", + " keys, %" PRIu64 " bytes%s, temperature: %s", cfd->GetName().c_str(), job_id_, output_number, - current_entries, current_bytes, - meta->marked_for_compaction ? 
" (need compaction)" : ""); + current_entries, meta->fd.file_size, + meta->marked_for_compaction ? " (need compaction)" : "", + temperature_to_string[meta->temperature].c_str()); } std::string fname; FileDescriptor output_fd; @@ -2117,16 +1371,14 @@ Status CompactionJob::FinishCompactionOutputFile( // compaction output file (similarly to how flush works when full)? s = Status::SpaceLimit("Max allowed space was reached"); TEST_SYNC_POINT( - "CompactionJob::FinishCompactionOutputFile:" - "MaxAllowedSpaceReached"); + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached"); InstrumentedMutexLock l(db_mutex_); db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); } } #endif - sub_compact->builder.reset(); - sub_compact->current_output_file_size = 0; + outputs.ResetBuilder(); return s; } @@ -2141,11 +1393,23 @@ Status CompactionJob::InstallCompactionResults( { Compaction::InputLevelSummaryBuffer inputs_summary; - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", - compaction->column_family_data()->GetName().c_str(), - job_id_, compaction->InputLevelSummary(&inputs_summary), - compact_->total_bytes + compact_->total_blob_bytes); + if (compaction_stats_.has_penultimate_level_output) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64 + " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compaction_stats_.penultimate_level_stats.bytes_written, + compaction_stats_.stats.bytes_written, + compaction_stats_.TotalBytesWritten()); + } else { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), + job_id_, compaction->InputLevelSummary(&inputs_summary), + compaction_stats_.TotalBytesWritten()); + } } VersionEdit* const edit = compaction->edit(); @@ -2157,16 +1421,14 @@ Status CompactionJob::InstallCompactionResults( std::unordered_map blob_total_garbage; for (const auto& sub_compact : compact_->sub_compact_states) { - for (const auto& out : sub_compact.outputs) { - edit->AddFile(compaction->output_level(), out.meta); - } + sub_compact.AddOutputsEdit(edit); - for (const auto& blob : sub_compact.blob_file_additions) { + for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) { edit->AddBlobFile(blob); } - if (sub_compact.blob_garbage_meter) { - const auto& flows = sub_compact.blob_garbage_meter->flows(); + if (sub_compact.Current().GetBlobGarbageMeter()) { + const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows(); for (const auto& pair : flows) { const uint64_t blob_file_number = pair.first; @@ -2227,10 +1489,10 @@ void CompactionJob::RecordCompactionIOStats() { IOSTATS_RESET(bytes_written); } -Status CompactionJob::OpenCompactionOutputFile( - SubcompactionState* sub_compact) { +Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, + CompactionOutputs& outputs) { assert(sub_compact != nullptr); - assert(sub_compact->builder == nullptr); + // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); std::string fname = GetTableFileName(file_number); @@ -2252,7 +1514,8 @@ Status CompactionJob::OpenCompactionOutputFile( // Pass temperature of botommost files to FileSystem. 
FileOptions fo_copy = file_options_; Temperature temperature = sub_compact->compaction->output_temperature(); - if (temperature == Temperature::kUnknown && bottommost_level_) { + if (temperature == Temperature::kUnknown && bottommost_level_ && + !sub_compact->IsCurrentPenultimateLevel()) { temperature = sub_compact->compaction->mutable_cf_options()->bottommost_temperature; } @@ -2328,12 +1591,11 @@ Status CompactionJob::OpenCompactionOutputFile( s.ToString().c_str()); return s; } - sub_compact->outputs.emplace_back( - std::move(meta), cfd->internal_comparator(), - /*enable_order_check=*/ - sub_compact->compaction->mutable_cf_options() - ->check_flush_compaction_key_order, - /*enable_hash=*/paranoid_file_checks_); + + outputs.AddOutput(std::move(meta), cfd->internal_comparator(), + sub_compact->compaction->mutable_cf_options() + ->check_flush_compaction_key_order, + paranoid_file_checks_); } writable_file->SetIOPriority(GetRateLimiterPriority()); @@ -2343,7 +1605,7 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->OutputFilePreallocationSize())); const auto& listeners = sub_compact->compaction->immutable_options()->listeners; - sub_compact->outfile.reset(new WritableFileWriter( + outputs.AssignFileWriter(new WritableFileWriter( std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_, db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); @@ -2358,33 +1620,16 @@ Status CompactionJob::OpenCompactionOutputFile( oldest_ancester_time, 0 /* oldest_key_time */, current_time, db_id_, db_session_id_, sub_compact->compaction->max_output_file_size(), file_number); - sub_compact->builder.reset( - NewTableBuilder(tboptions, sub_compact->outfile.get())); + + outputs.NewBuilder(tboptions); + LogFlush(db_options_.info_log); return s; } void CompactionJob::CleanupCompaction() { for (SubcompactionState& sub_compact : compact_->sub_compact_states) { - const auto& sub_status = sub_compact.status; - - if (sub_compact.builder != nullptr) { - // May happen if we get a shutdown call in the middle of compaction - sub_compact.builder->Abandon(); - sub_compact.builder.reset(); - } else { - assert(!sub_status.ok() || sub_compact.outfile == nullptr); - } - for (const auto& out : sub_compact.outputs) { - // If this file was inserted into the table cache then remove - // them here because this compaction was not committed. - if (!sub_status.ok()) { - TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber()); - } - } - // TODO: sub_compact.io_status is not checked like status. Not sure if thats - // intentional. So ignoring the io_status as of now. 
- sub_compact.io_status.PermitUncheckedError(); + sub_compact.Cleanup(table_cache_.get()); } delete compact_; compact_ = nullptr; @@ -2405,37 +1650,28 @@ void CompactionJob::UpdateCompactionStats() { assert(compact_); Compaction* compaction = compact_->compaction; - compaction_stats_.num_input_files_in_non_output_levels = 0; - compaction_stats_.num_input_files_in_output_level = 0; + compaction_stats_.stats.num_input_files_in_non_output_levels = 0; + compaction_stats_.stats.num_input_files_in_output_level = 0; for (int input_level = 0; input_level < static_cast(compaction->num_input_levels()); ++input_level) { if (compaction->level(input_level) != compaction->output_level()) { UpdateCompactionInputStatsHelper( - &compaction_stats_.num_input_files_in_non_output_levels, - &compaction_stats_.bytes_read_non_output_levels, input_level); + &compaction_stats_.stats.num_input_files_in_non_output_levels, + &compaction_stats_.stats.bytes_read_non_output_levels, input_level); } else { UpdateCompactionInputStatsHelper( - &compaction_stats_.num_input_files_in_output_level, - &compaction_stats_.bytes_read_output_level, input_level); + &compaction_stats_.stats.num_input_files_in_output_level, + &compaction_stats_.stats.bytes_read_output_level, input_level); } } assert(compaction_job_stats_); - compaction_stats_.bytes_read_blob = + compaction_stats_.stats.bytes_read_blob = compaction_job_stats_->total_blob_bytes_read; - compaction_stats_.num_output_files = - static_cast(compact_->num_output_files); - compaction_stats_.num_output_files_blob = - static_cast(compact_->num_blob_output_files); - compaction_stats_.bytes_written = compact_->total_bytes; - compaction_stats_.bytes_written_blob = compact_->total_blob_bytes; - - if (compaction_stats_.num_input_records > compact_->num_output_records) { - compaction_stats_.num_dropped_records = - compaction_stats_.num_input_records - compact_->num_output_records; - } + compaction_stats_.stats.num_dropped_records = + compaction_stats_.DroppedRecords(); } void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, @@ -2448,7 +1684,7 @@ void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, for (size_t i = 0; i < num_input_files; ++i) { const auto* file_meta = compaction->input(input_level, i); *bytes_read += file_meta->fd.GetFileSize(); - compaction_stats_.num_input_records += + compaction_stats_.stats.num_input_records += static_cast(file_meta->num_entries); } } @@ -2471,7 +1707,7 @@ void CompactionJob::UpdateCompactionJobStats( // output information compaction_job_stats_->total_output_bytes = stats.bytes_written; compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; - compaction_job_stats_->num_output_records = compact_->num_output_records; + compaction_job_stats_->num_output_records = stats.num_output_records; compaction_job_stats_->num_output_files = stats.num_output_files; compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; @@ -2540,617 +1776,4 @@ Env::IOPriority CompactionJob::GetRateLimiterPriority() { return Env::IO_LOW; } -#ifndef ROCKSDB_LITE -std::string CompactionServiceCompactionJob::GetTableFileName( - uint64_t file_number) { - return MakeTableFileName(output_path_, file_number); -} - -void CompactionServiceCompactionJob::RecordCompactionIOStats() { - compaction_result_->bytes_read += IOSTATS(bytes_read); - compaction_result_->bytes_written += IOSTATS(bytes_written); - CompactionJob::RecordCompactionIOStats(); -} - -CompactionServiceCompactionJob::CompactionServiceCompactionJob( - int 
job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const MutableDBOptions& mutable_db_options, const FileOptions& file_options, - VersionSet* versions, const std::atomic* shutting_down, - LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - std::shared_ptr table_cache, EventLogger* event_logger, - const std::string& dbname, const std::shared_ptr& io_tracer, - const std::atomic& manual_compaction_canceled, - const std::string& db_id, const std::string& db_session_id, - const std::string& output_path, - const CompactionServiceInput& compaction_service_input, - CompactionServiceResult* compaction_service_result) - : CompactionJob( - job_id, compaction, db_options, mutable_db_options, file_options, - versions, shutting_down, log_buffer, nullptr, output_directory, - nullptr, stats, db_mutex, db_error_handler, existing_snapshots, - kMaxSequenceNumber, nullptr, nullptr, table_cache, event_logger, - compaction->mutable_cf_options()->paranoid_file_checks, - compaction->mutable_cf_options()->report_bg_io_stats, dbname, - &(compaction_service_result->stats), Env::Priority::USER, io_tracer, - manual_compaction_canceled, db_id, db_session_id, - compaction->column_family_data()->GetFullHistoryTsLow()), - output_path_(output_path), - compaction_input_(compaction_service_input), - compaction_result_(compaction_service_result) {} - -Status CompactionServiceCompactionJob::Run() { - AutoThreadOperationStageUpdater stage_updater( - ThreadStatus::STAGE_COMPACTION_RUN); - - auto* c = compact_->compaction; - assert(c->column_family_data() != nullptr); - assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( - compact_->compaction->level()) > 0); - - write_hint_ = - c->column_family_data()->CalculateSSTWriteHint(c->output_level()); - bottommost_level_ = c->bottommost_level(); - - Slice begin = compaction_input_.begin; - Slice end = compaction_input_.end; - compact_->sub_compact_states.emplace_back( - c, compaction_input_.has_begin ? &begin : nullptr, - compaction_input_.has_end ? 
&end : nullptr, compaction_input_.approx_size, - /*sub_job_id*/ 0); - - log_buffer_->FlushBufferToLog(); - LogCompaction(); - const uint64_t start_micros = db_options_.clock->NowMicros(); - // Pick the only sub-compaction we should have - assert(compact_->sub_compact_states.size() == 1); - SubcompactionState* sub_compact = compact_->sub_compact_states.data(); - - ProcessKeyValueCompaction(sub_compact); - - compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; - compaction_stats_.cpu_micros = sub_compact->compaction_job_stats.cpu_micros; - - RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); - RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, - compaction_stats_.cpu_micros); - - Status status = sub_compact->status; - IOStatus io_s = sub_compact->io_status; - - if (io_status_.ok()) { - io_status_ = io_s; - } - - if (status.ok()) { - constexpr IODebugContext* dbg = nullptr; - - if (output_directory_) { - io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg, - DirFsyncOptions()); - } - } - if (io_status_.ok()) { - io_status_ = io_s; - } - if (status.ok()) { - status = io_s; - } - if (status.ok()) { - // TODO: Add verify_table() - } - - // Finish up all book-keeping to unify the subcompaction results - AggregateStatistics(); - UpdateCompactionStats(); - RecordCompactionIOStats(); - - LogFlush(db_options_.info_log); - compact_->status = status; - compact_->status.PermitUncheckedError(); - - // Build compaction result - compaction_result_->output_level = compact_->compaction->output_level(); - compaction_result_->output_path = output_path_; - for (const auto& output_file : sub_compact->outputs) { - auto& meta = output_file.meta; - compaction_result_->output_files.emplace_back( - MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, - meta.fd.largest_seqno, meta.smallest.Encode().ToString(), - meta.largest.Encode().ToString(), meta.oldest_ancester_time, - meta.file_creation_time, output_file.validator.GetHash(), - meta.marked_for_compaction, meta.unique_id); - } - compaction_result_->num_output_records = sub_compact->num_output_records; - compaction_result_->total_bytes = sub_compact->total_bytes; - - return status; -} - -void CompactionServiceCompactionJob::CleanupCompaction() { - CompactionJob::CleanupCompaction(); -} - -// Internal binary format for the input and result data -enum BinaryFormatVersion : uint32_t { - kOptionsString = 1, // Use string format similar to Option string format -}; - -static std::unordered_map cfd_type_info = { - {"name", - {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"options", - {offsetof(struct ColumnFamilyDescriptor, options), - OptionType::kConfigurable, OptionVerificationType::kNormal, - OptionTypeFlags::kNone, - [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, void* addr) { - auto cf_options = static_cast(addr); - return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), - value, cf_options); - }, - [](const ConfigOptions& opts, const std::string& /*name*/, - const void* addr, std::string* value) { - const auto cf_options = static_cast(addr); - std::string result; - auto status = - GetStringFromColumnFamilyOptions(opts, *cf_options, &result); - *value = "{" + result + "}"; - return status; - }, - [](const ConfigOptions& opts, const std::string& name, const void* addr1, - const void* addr2, std::string* mismatch) { - const auto this_one = static_cast(addr1); 
- const auto that_one = static_cast(addr2); - auto this_conf = CFOptionsAsConfigurable(*this_one); - auto that_conf = CFOptionsAsConfigurable(*that_one); - std::string mismatch_opt; - bool result = - this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); - if (!result) { - *mismatch = name + "." + mismatch_opt; - } - return result; - }}}, -}; - -static std::unordered_map cs_input_type_info = { - {"column_family", - OptionTypeInfo::Struct( - "column_family", &cfd_type_info, - offsetof(struct CompactionServiceInput, column_family), - OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, - {"db_options", - {offsetof(struct CompactionServiceInput, db_options), - OptionType::kConfigurable, OptionVerificationType::kNormal, - OptionTypeFlags::kNone, - [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, void* addr) { - auto options = static_cast(addr); - return GetDBOptionsFromString(opts, DBOptions(), value, options); - }, - [](const ConfigOptions& opts, const std::string& /*name*/, - const void* addr, std::string* value) { - const auto options = static_cast(addr); - std::string result; - auto status = GetStringFromDBOptions(opts, *options, &result); - *value = "{" + result + "}"; - return status; - }, - [](const ConfigOptions& opts, const std::string& name, const void* addr1, - const void* addr2, std::string* mismatch) { - const auto this_one = static_cast(addr1); - const auto that_one = static_cast(addr2); - auto this_conf = DBOptionsAsConfigurable(*this_one); - auto that_conf = DBOptionsAsConfigurable(*that_one); - std::string mismatch_opt; - bool result = - this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); - if (!result) { - *mismatch = name + "." + mismatch_opt; - } - return result; - }}}, - {"snapshots", OptionTypeInfo::Vector( - offsetof(struct CompactionServiceInput, snapshots), - OptionVerificationType::kNormal, OptionTypeFlags::kNone, - {0, OptionType::kUInt64T})}, - {"input_files", OptionTypeInfo::Vector( - offsetof(struct CompactionServiceInput, input_files), - OptionVerificationType::kNormal, OptionTypeFlags::kNone, - {0, OptionType::kEncodedString})}, - {"output_level", - {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"db_id", - {offsetof(struct CompactionServiceInput, db_id), - OptionType::kEncodedString}}, - {"has_begin", - {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"begin", - {offsetof(struct CompactionServiceInput, begin), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"has_end", - {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"end", - {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"approx_size", - {offsetof(struct CompactionServiceInput, approx_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, -}; - -static std::unordered_map - cs_output_file_type_info = { - {"file_name", - {offsetof(struct CompactionServiceOutputFile, file_name), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"smallest_seqno", - {offsetof(struct CompactionServiceOutputFile, smallest_seqno), - OptionType::kUInt64T, OptionVerificationType::kNormal, - 
OptionTypeFlags::kNone}}, - {"largest_seqno", - {offsetof(struct CompactionServiceOutputFile, largest_seqno), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"smallest_internal_key", - {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"largest_internal_key", - {offsetof(struct CompactionServiceOutputFile, largest_internal_key), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"oldest_ancester_time", - {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"file_creation_time", - {offsetof(struct CompactionServiceOutputFile, file_creation_time), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"paranoid_hash", - {offsetof(struct CompactionServiceOutputFile, paranoid_hash), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"marked_for_compaction", - {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"unique_id", - OptionTypeInfo::Array( - offsetof(struct CompactionServiceOutputFile, unique_id), - OptionVerificationType::kNormal, OptionTypeFlags::kNone, - {0, OptionType::kUInt64T})}, -}; - -static std::unordered_map - compaction_job_stats_type_info = { - {"elapsed_micros", - {offsetof(struct CompactionJobStats, elapsed_micros), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"cpu_micros", - {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"num_input_records", - {offsetof(struct CompactionJobStats, num_input_records), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_blobs_read", - {offsetof(struct CompactionJobStats, num_blobs_read), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_input_files", - {offsetof(struct CompactionJobStats, num_input_files), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_input_files_at_output_level", - {offsetof(struct CompactionJobStats, num_input_files_at_output_level), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_output_records", - {offsetof(struct CompactionJobStats, num_output_records), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_output_files", - {offsetof(struct CompactionJobStats, num_output_files), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_output_files_blob", - {offsetof(struct CompactionJobStats, num_output_files_blob), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"is_full_compaction", - {offsetof(struct CompactionJobStats, is_full_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"is_manual_compaction", - {offsetof(struct CompactionJobStats, is_manual_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_input_bytes", - {offsetof(struct CompactionJobStats, total_input_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, - 
OptionTypeFlags::kNone}}, - {"total_blob_bytes_read", - {offsetof(struct CompactionJobStats, total_blob_bytes_read), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_output_bytes", - {offsetof(struct CompactionJobStats, total_output_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_output_bytes_blob", - {offsetof(struct CompactionJobStats, total_output_bytes_blob), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_records_replaced", - {offsetof(struct CompactionJobStats, num_records_replaced), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_input_raw_key_bytes", - {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_input_raw_value_bytes", - {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_input_deletion_records", - {offsetof(struct CompactionJobStats, num_input_deletion_records), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_expired_deletion_records", - {offsetof(struct CompactionJobStats, num_expired_deletion_records), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_corrupt_keys", - {offsetof(struct CompactionJobStats, num_corrupt_keys), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"file_write_nanos", - {offsetof(struct CompactionJobStats, file_write_nanos), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"file_range_sync_nanos", - {offsetof(struct CompactionJobStats, file_range_sync_nanos), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"file_fsync_nanos", - {offsetof(struct CompactionJobStats, file_fsync_nanos), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"file_prepare_write_nanos", - {offsetof(struct CompactionJobStats, file_prepare_write_nanos), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"smallest_output_key_prefix", - {offsetof(struct CompactionJobStats, smallest_output_key_prefix), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"largest_output_key_prefix", - {offsetof(struct CompactionJobStats, largest_output_key_prefix), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_single_del_fallthru", - {offsetof(struct CompactionJobStats, num_single_del_fallthru), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_single_del_mismatch", - {offsetof(struct CompactionJobStats, num_single_del_mismatch), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, -}; - -namespace { -// this is a helper struct to serialize and deserialize class Status, because -// Status's members are not public. -struct StatusSerializationAdapter { - uint8_t code; - uint8_t subcode; - uint8_t severity; - std::string message; - - StatusSerializationAdapter() {} - explicit StatusSerializationAdapter(const Status& s) { - code = s.code(); - subcode = s.subcode(); - severity = s.severity(); - auto msg = s.getState(); - message = msg ? 
msg : ""; - } - - Status GetStatus() { - return Status(static_cast(code), - static_cast(subcode), - static_cast(severity), message); - } -}; -} // namespace - -static std::unordered_map - status_adapter_type_info = { - {"code", - {offsetof(struct StatusSerializationAdapter, code), - OptionType::kUInt8T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"subcode", - {offsetof(struct StatusSerializationAdapter, subcode), - OptionType::kUInt8T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"severity", - {offsetof(struct StatusSerializationAdapter, severity), - OptionType::kUInt8T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"message", - {offsetof(struct StatusSerializationAdapter, message), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, -}; - -static std::unordered_map cs_result_type_info = { - {"status", - {offsetof(struct CompactionServiceResult, status), - OptionType::kCustomizable, OptionVerificationType::kNormal, - OptionTypeFlags::kNone, - [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, void* addr) { - auto status_obj = static_cast(addr); - StatusSerializationAdapter adapter; - Status s = OptionTypeInfo::ParseType( - opts, value, status_adapter_type_info, &adapter); - *status_obj = adapter.GetStatus(); - return s; - }, - [](const ConfigOptions& opts, const std::string& /*name*/, - const void* addr, std::string* value) { - const auto status_obj = static_cast(addr); - StatusSerializationAdapter adapter(*status_obj); - std::string result; - Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info, - &adapter, &result); - *value = "{" + result + "}"; - return s; - }, - [](const ConfigOptions& opts, const std::string& /*name*/, - const void* addr1, const void* addr2, std::string* mismatch) { - const auto status1 = static_cast(addr1); - const auto status2 = static_cast(addr2); - - StatusSerializationAdapter adatper1(*status1); - StatusSerializationAdapter adapter2(*status2); - return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info, - &adatper1, &adapter2, mismatch); - }}}, - {"output_files", - OptionTypeInfo::Vector( - offsetof(struct CompactionServiceResult, output_files), - OptionVerificationType::kNormal, OptionTypeFlags::kNone, - OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0, - OptionVerificationType::kNormal, - OptionTypeFlags::kNone))}, - {"output_level", - {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt, - OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"output_path", - {offsetof(struct CompactionServiceResult, output_path), - OptionType::kEncodedString, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"num_output_records", - {offsetof(struct CompactionServiceResult, num_output_records), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_bytes", - {offsetof(struct CompactionServiceResult, total_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"bytes_read", - {offsetof(struct CompactionServiceResult, bytes_read), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"bytes_written", - {offsetof(struct CompactionServiceResult, bytes_written), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"stats", OptionTypeInfo::Struct( - "stats", &compaction_job_stats_type_info, - offsetof(struct 
CompactionServiceResult, stats), - OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, -}; - -Status CompactionServiceInput::Read(const std::string& data_str, - CompactionServiceInput* obj) { - if (data_str.size() <= sizeof(BinaryFormatVersion)) { - return Status::InvalidArgument("Invalid CompactionServiceInput string"); - } - auto format_version = DecodeFixed32(data_str.data()); - if (format_version == kOptionsString) { - ConfigOptions cf; - cf.invoke_prepare_options = false; - cf.ignore_unknown_options = true; - return OptionTypeInfo::ParseType( - cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, - obj); - } else { - return Status::NotSupported( - "Compaction Service Input data version not supported: " + - std::to_string(format_version)); - } -} - -Status CompactionServiceInput::Write(std::string* output) { - char buf[sizeof(BinaryFormatVersion)]; - EncodeFixed32(buf, kOptionsString); - output->append(buf, sizeof(BinaryFormatVersion)); - ConfigOptions cf; - cf.invoke_prepare_options = false; - return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); -} - -Status CompactionServiceResult::Read(const std::string& data_str, - CompactionServiceResult* obj) { - if (data_str.size() <= sizeof(BinaryFormatVersion)) { - return Status::InvalidArgument("Invalid CompactionServiceResult string"); - } - auto format_version = DecodeFixed32(data_str.data()); - if (format_version == kOptionsString) { - ConfigOptions cf; - cf.invoke_prepare_options = false; - cf.ignore_unknown_options = true; - return OptionTypeInfo::ParseType( - cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, - obj); - } else { - return Status::NotSupported( - "Compaction Service Result data version not supported: " + - std::to_string(format_version)); - } -} - -Status CompactionServiceResult::Write(std::string* output) { - char buf[sizeof(BinaryFormatVersion)]; - EncodeFixed32(buf, kOptionsString); - output->append(buf, sizeof(BinaryFormatVersion)); - ConfigOptions cf; - cf.invoke_prepare_options = false; - return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); -} - -#ifndef NDEBUG -bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { - std::string mismatch; - return TEST_Equals(other, &mismatch); -} - -bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, - std::string* mismatch) { - ConfigOptions cf; - cf.invoke_prepare_options = false; - return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, - mismatch); -} - -bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { - std::string mismatch; - return TEST_Equals(other, &mismatch); -} - -bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, - std::string* mismatch) { - ConfigOptions cf; - cf.invoke_prepare_options = false; - return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, - mismatch); -} -#endif // NDEBUG -#endif // !ROCKSDB_LITE - } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index a8f0e4eeb..4c3ab4206 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -20,6 +20,7 @@ #include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/compaction/compaction_iterator.h" +#include "db/compaction/compaction_outputs.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" #include "db/job_context.h" @@ -47,6 +48,7 @@ namespace ROCKSDB_NAMESPACE { class 
Arena;
+class CompactionState;
 class ErrorHandler;
 class MemTable;
 class SnapshotChecker;
@@ -56,11 +58,91 @@ class Version;
 class VersionEdit;
 class VersionSet;

+class SubcompactionState;
+
 // CompactionJob is responsible for executing the compaction. Each (manual or
 // automated) compaction corresponds to a CompactionJob object, and usually
 // goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
 // will divide the compaction into subcompactions and execute them in parallel
 // if needed.
+//
+// CompactionJob has 2 main stats:
+// 1. CompactionJobStats compaction_job_stats_
+//    CompactionJobStats is a public data structure which is part of the
+//    compaction event listener API, through which RocksDB shares the job
+//    stats with the user.
+//    Internally it's an aggregation of all the compaction_job_stats from each
+//    `SubcompactionState`:
+//                                          +------------------------+
+//                                          | SubcompactionState     |
+//                                          |                        |
+//                               +--------->| compaction_job_stats   |
+//                               |          |                        |
+//                               |          +------------------------+
+//  +------------------------+   |
+//  | CompactionJob          |   |          +------------------------+
+//  |                        |   |          | SubcompactionState     |
+//  | compaction_job_stats +-----+          |                        |
+//  |                        |   +--------->| compaction_job_stats   |
+//  |                        |   |          |                        |
+//  +------------------------+   |          +------------------------+
+//                               |
+//                               |          +------------------------+
+//                               |          | SubcompactionState     |
+//                               |          |                        |
+//                               +--------->+ compaction_job_stats   |
+//                               |          |                        |
+//                               |          +------------------------+
+//                               |
+//                               |          +------------------------+
+//                               |          |         ...            |
+//                               +--------->+                        |
+//                                          +------------------------+
+//
+// 2. CompactionStatsFull compaction_stats_
+//    `CompactionStatsFull` holds the internal stats about the compaction,
+//    which are eventually sent to `ColumnFamilyData::internal_stats_` and
+//    used for logging and public metrics.
+//    Internally, it's an aggregation of stats_ from each `SubcompactionState`.
+//    It has 2 parts: the normal stats about the main compaction information
+//    and the penultimate level output stats.
+//    `SubcompactionState` maintains the CompactionOutputs for the normal
+//    output and for the penultimate level output, if it exists; the per-level
+//    stats are stored with the outputs.
+// +---------------------------+ +// | SubcompactionState | +// | | +// | +----------------------+ | +// | | CompactionOutputs | | +// | | (normal output) | | +// +---->| stats_ | | +// | | +----------------------+ | +// | | | +// | | +----------------------+ | +// +--------------------------------+ | | | CompactionOutputs | | +// | CompactionJob | | | | (penultimate_level) | | +// | | +--------->| stats_ | | +// | compaction_stats_ | | | | +----------------------+ | +// | +-------------------------+ | | | | | +// | |stats (normal) |------|----+ +---------------------------+ +// | +-------------------------+ | | | +// | | | | +// | +-------------------------+ | | | +---------------------------+ +// | |penultimate_level_stats +------+ | | SubcompactionState | +// | +-------------------------+ | | | | | +// | | | | | +----------------------+ | +// | | | | | | CompactionOutputs | | +// +--------------------------------+ | | | | (normal output) | | +// | +---->| stats_ | | +// | | +----------------------+ | +// | | | +// | | +----------------------+ | +// | | | CompactionOutputs | | +// | | | (penultimate_level) | | +// +--------->| stats_ | | +// | +----------------------+ | +// | | +// +---------------------------+ + class CompactionJob { public: CompactionJob( @@ -107,11 +189,6 @@ class CompactionJob { IOStatus io_status() const { return io_status_; } protected: - struct SubcompactionState; - // CompactionJob state - struct CompactionState; - - void AggregateStatistics(); void UpdateCompactionStats(); void LogCompaction(); virtual void RecordCompactionIOStats(); @@ -122,7 +199,7 @@ class CompactionJob { void ProcessKeyValueCompaction(SubcompactionState* sub_compact); CompactionState* compact_; - InternalStats::CompactionStats compaction_stats_; + InternalStats::CompactionStatsFull compaction_stats_; const ImmutableDBOptions& db_options_; const MutableDBOptions mutable_db_options_copy_; LogBuffer* log_buffer_; @@ -135,6 +212,8 @@ class CompactionJob { IOStatus io_status_; + CompactionJobStats* compaction_job_stats_; + private: friend class CompactionJobTestBase; @@ -150,15 +229,14 @@ class CompactionJob { // update the thread status for starting a compaction. 
void ReportStartedCompaction(Compaction* compaction); - void AllocateCompactionOutputFileNumbers(); - Status FinishCompactionOutputFile( - const Status& input_status, SubcompactionState* sub_compact, - CompactionRangeDelAggregator* range_del_agg, - CompactionIterationStats* range_del_out_stats, - const Slice* next_table_min_key = nullptr); + Status FinishCompactionOutputFile(const Status& input_status, + SubcompactionState* sub_compact, + CompactionOutputs& outputs, + const Slice& next_table_min_key); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); - Status OpenCompactionOutputFile(SubcompactionState* sub_compact); + Status OpenCompactionOutputFile(SubcompactionState* sub_compact, + CompactionOutputs& outputs); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, @@ -167,20 +245,12 @@ class CompactionJob { void UpdateCompactionInputStatsHelper( int* num_files, uint64_t* bytes_read, int input_level); -#ifndef ROCKSDB_LITE - void BuildSubcompactionJobInfo( - SubcompactionState* sub_compact, - SubcompactionJobInfo* subcompaction_job_info) const; -#endif // ROCKSDB_LITE - void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact); void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact); uint32_t job_id_; - CompactionJobStats* compaction_job_stats_; - // DBImpl state const std::string& dbname_; const std::string db_id_; @@ -222,14 +292,12 @@ class CompactionJob { bool measure_io_stats_; // Stores the Slices that designate the boundaries for each subcompaction std::vector boundaries_; - // Stores the approx size of keys covered in the range of each subcompaction - std::vector sizes_; Env::Priority thread_pri_; std::string full_history_ts_low_; std::string trim_ts_; BlobFileCompletionCallback* blob_callback_; - uint64_t GetCompactionId(SubcompactionState* sub_compact); + uint64_t GetCompactionId(SubcompactionState* sub_compact) const; // Get table file name in where it's outputting to, which should also be in // `output_directory_`. 
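For orientation, a rough sketch of the `CompactionStatsFull` aggregate that `compaction_stats_` now holds. The real definition is in db/internal_stats.h, which this patch changes but which is not shown in this excerpt; everything beyond the member and method names that appear elsewhere in this diff (`stats`, `penultimate_level_stats`, `has_penultimate_level_output`, `TotalBytesWritten()`, `DroppedRecords()`) is an assumption:

// Sketch only; the member layout and method bodies are assumptions based on
// how this diff uses them.
struct CompactionStatsFull {
  // Stats for the compaction's output (last) level.
  InternalStats::CompactionStats stats;
  // Extra stats for the penultimate level, set when per-key placement
  // routed some output there.
  bool has_penultimate_level_output = false;
  InternalStats::CompactionStats penultimate_level_stats;

  // Total bytes written by this compaction across both output levels.
  uint64_t TotalBytesWritten() const {
    uint64_t written = stats.bytes_written;
    if (has_penultimate_level_output) {
      written += penultimate_level_stats.bytes_written;
    }
    return written;
  }

  // Records dropped by the compaction: input records minus all output
  // records, clamped at zero.
  uint64_t DroppedRecords() const {
    uint64_t output = stats.num_output_records;
    if (has_penultimate_level_output) {
      output += penultimate_level_stats.num_output_records;
    }
    return stats.num_input_records > output
               ? stats.num_input_records - output
               : 0;
  }
};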
@@ -265,7 +333,6 @@ struct CompactionServiceInput { std::string begin; bool has_end = false; std::string end; - uint64_t approx_size = 0; // serialization interface to read and write the object static Status Read(const std::string& data_str, CompactionServiceInput* obj); @@ -357,7 +424,7 @@ class CompactionServiceCompactionJob : private CompactionJob { const std::string& dbname, const std::shared_ptr& io_tracer, const std::atomic& manual_compaction_canceled, const std::string& db_id, const std::string& db_session_id, - const std::string& output_path, + std::string output_path, const CompactionServiceInput& compaction_service_input, CompactionServiceResult* compaction_service_result); diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 2079d313c..369f0b267 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -482,6 +482,17 @@ class CompactionJobTestBase : public testing::Test { cfd_ = versions_->GetColumnFamilySet()->GetDefault(); } + void RunLastLevelCompaction( + const std::vector>& input_files, + std::function&& verify_func, + const std::vector& snapshots = {}) { + const int kLastLevel = cf_options_.num_levels - 1; + verify_per_key_placement_ = std::move(verify_func); + mock::KVVector empty_map; + RunCompaction(input_files, empty_map, snapshots, kMaxSequenceNumber, + kLastLevel, false); + } + void RunCompaction( const std::vector>& input_files, const mock::KVVector& expected_results, @@ -571,6 +582,12 @@ class CompactionJobTestBase : public testing::Test { if (check_get_priority) { CheckGetRateLimiterPriority(compaction_job); } + + if (verify_per_key_placement_) { + // Verify per_key_placement compaction + assert(compaction.SupportsPerKeyPlacement()); + verify_per_key_placement_(compaction); + } } void CheckGetRateLimiterPriority(CompactionJob& compaction_job) { @@ -620,6 +637,7 @@ class CompactionJobTestBase : public testing::Test { std::string full_history_ts_low_; const std::function encode_u64_ts_; bool test_io_priority_; + std::function verify_per_key_placement_; }; // TODO(icanadi) Make it simpler once we mock out VersionSet @@ -1311,6 +1329,75 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) { /* expected_oldest_blob_file_number */ 19); } +TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { + cf_options_.bottommost_temperature = Temperature::kCold; + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + *supports_per_key_placement = true; + }); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + NewDB(); + + // Add files on different levels that may overlap + auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}}); + AddMockFile(file0_1); + + auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"}, + {KeyStr("f", 11U, kTypeValue), "val"}}); + AddMockFile(file1_1, 1); + auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"}, + {KeyStr("k", 13U, kTypeValue), "val"}}); + AddMockFile(file1_2, 1); + auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"}, + {KeyStr("u", 15U, kTypeValue), "val"}}); + AddMockFile(file1_3, 1); + + auto file2_1 = 
mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"}, + {KeyStr("h", 9U, kTypeValue), "val"}}); + AddMockFile(file2_1, 2); + auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"}, + {KeyStr("p", 7U, kTypeValue), "val"}}); + AddMockFile(file2_2, 2); + + auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"}, + {KeyStr("k", 3U, kTypeValue), "val"}}); + AddMockFile(file3_1, 3); + auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"}, + {KeyStr("x", 5U, kTypeValue), "val"}}); + AddMockFile(file3_2, 3); + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto files0 = cfd->current()->storage_info()->LevelFiles(0); + auto files1 = cfd->current()->storage_info()->LevelFiles(1); + auto files2 = cfd->current()->storage_info()->LevelFiles(2); + auto files3 = cfd->current()->storage_info()->LevelFiles(3); + + RunLastLevelCompaction( + {files0, files1, files2, files3}, /*verify_func=*/[&](Compaction& comp) { + for (char c = 'a'; c <= 'z'; c++) { + std::string c_str; + c_str = c; + const Slice key(c_str); + if (c == 'a') { + ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key)); + } else { + ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key)); + } + } + }); +} + TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) { db_options_.enforce_single_del_contracts = false; NewDB(); @@ -1360,7 +1447,6 @@ TEST_F(CompactionJobTest, InputSerialization) { if (input.has_end) { input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); } - input.approx_size = rnd64.Uniform(UINT64_MAX); std::string output; ASSERT_OK(input.Write(&output)); diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc new file mode 100644 index 000000000..346df5aec --- /dev/null +++ b/db/compaction/compaction_outputs.cc @@ -0,0 +1,314 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction/compaction_outputs.h" + +#include "db/builder.h" + +namespace ROCKSDB_NAMESPACE { + +void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { + builder_.reset(NewTableBuilder(tboptions, file_writer_.get())); +} + +Status CompactionOutputs::Finish(const Status& intput_status) { + FileMetaData* meta = GetMetaData(); + assert(meta != nullptr); + Status s = intput_status; + if (s.ok()) { + s = builder_->Finish(); + } else { + builder_->Abandon(); + } + Status io_s = builder_->io_status(); + if (s.ok()) { + s = io_s; + } else { + io_s.PermitUncheckedError(); + } + const uint64_t current_bytes = builder_->FileSize(); + if (s.ok()) { + meta->fd.file_size = current_bytes; + meta->marked_for_compaction = builder_->NeedCompact(); + } + current_output().finished = true; + stats_.bytes_written += current_bytes; + stats_.num_output_files = outputs_.size(); + + return s; +} + +IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, + SystemClock* clock, + Statistics* statistics, + bool use_fsync) { + IOStatus io_s; + if (input_status.ok()) { + StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS); + io_s = file_writer_->Sync(use_fsync); + } + if (input_status.ok() && io_s.ok()) { + io_s = file_writer_->Close(); + } + + if (input_status.ok() && io_s.ok()) { + FileMetaData* meta = GetMetaData(); + meta->file_checksum = file_writer_->GetFileChecksum(); + meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName(); + } + + file_writer_.reset(); + + return io_s; +} + +Status CompactionOutputs::AddToOutput( + const CompactionIterator& c_iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + Status s; + const Slice& key = c_iter.key(); + + if (!pending_close_ && c_iter.Valid() && partitioner_ && HasBuilder() && + partitioner_->ShouldPartition( + PartitionerRequest(last_key_for_partitioner_, c_iter.user_key(), + current_output_file_size_)) == kRequired) { + pending_close_ = true; + } + + if (pending_close_) { + s = close_file_func(*this, c_iter.InputStatus(), key); + pending_close_ = false; + } + if (!s.ok()) { + return s; + } + + // Open output file if necessary + if (!HasBuilder()) { + s = open_file_func(*this); + } + if (!s.ok()) { + return s; + } + + Output& curr = current_output(); + assert(builder_ != nullptr); + const Slice& value = c_iter.value(); + s = curr.validator.Add(key, value); + if (!s.ok()) { + return s; + } + builder_->Add(key, value); + + stats_.num_output_records++; + current_output_file_size_ = builder_->EstimatedFileSize(); + + if (blob_garbage_meter_) { + s = blob_garbage_meter_->ProcessOutFlow(key, value); + } + + if (!s.ok()) { + return s; + } + + const ParsedInternalKey& ikey = c_iter.ikey(); + s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence, + ikey.type); + + // Close output file if it is big enough. Two possibilities determine it's + // time to close it: (1) the current key should be this file's last key, (2) + // the next key should not be in this file. + // + // TODO(aekmekji): determine if file should be closed earlier than this + // during subcompactions (i.e. 
if output size, estimated by input size, is + // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB + // and 0.6MB instead of 1MB and 0.2MB) + if (compaction_->output_level() != 0 && + current_output_file_size_ >= compaction_->max_output_file_size()) { + pending_close_ = true; + } + + if (partitioner_) { + last_key_for_partitioner_.assign(c_iter.user_key().data_, + c_iter.user_key().size_); + } + + return s; +} + +Status CompactionOutputs::AddRangeDels( + const Slice* comp_start, const Slice* comp_end, + CompactionIterationStats& range_del_out_stats, bool bottommost_level, + const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot, + const Slice& next_table_min_key) { + assert(HasRangeDel()); + FileMetaData& meta = current_output().meta; + const Comparator* ucmp = icmp.user_comparator(); + + Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; + const Slice *lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; + + size_t output_size = outputs_.size(); + if (output_size == 1) { + // For the first output table, include range tombstones before the min + // key but after the subcompaction boundary. + lower_bound = comp_start; + lower_bound_from_sub_compact = true; + } else if (meta.smallest.size() > 0) { + // For subsequent output tables, only include range tombstones from min + // key onwards since the previous file was extended to contain range + // tombstones falling before min key. + smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); + lower_bound = &lower_bound_guard; + } else { + lower_bound = nullptr; + } + if (!next_table_min_key.empty()) { + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. + upper_bound_guard = ExtractUserKey(next_table_min_key); + if (comp_end != nullptr && + ucmp->Compare(upper_bound_guard, *comp_end) >= 0) { + upper_bound = comp_end; + } else { + upper_bound = &upper_bound_guard; + } + } else { + // This is the last file in the subcompaction, so extend until the + // subcompaction ends. + upper_bound = comp_end; + } + bool has_overlapping_endpoints; + if (upper_bound != nullptr && meta.largest.size() > 0) { + has_overlapping_endpoints = + ucmp->Compare(meta.largest.user_key(), *upper_bound) == 0; + } else { + has_overlapping_endpoints = false; + } + + // The end key of the subcompaction must be bigger or equal to the upper + // bound. If the end of subcompaction is null or the upper bound is null, + // it means that this file is the last file in the compaction. So there + // will be no overlapping between this file and others. + assert(comp_end == nullptr || upper_bound == nullptr || + ucmp->Compare(*upper_bound, *comp_end) <= 0); + auto it = range_del_agg_->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. 
+  if (lower_bound != nullptr) {
+    it->Seek(*lower_bound);
+  } else {
+    it->SeekToFirst();
+  }
+  for (; it->Valid(); it->Next()) {
+    auto tombstone = it->Tombstone();
+    if (upper_bound != nullptr) {
+      int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
+      if ((has_overlapping_endpoints && cmp < 0) ||
+          (!has_overlapping_endpoints && cmp <= 0)) {
+        // Tombstones starting after upper_bound only need to be included in
+        // the next table. If the current SST ends before upper_bound, i.e.,
+        // `has_overlapping_endpoints == false`, we can also skip over range
+        // tombstones that start exactly at upper_bound. Such range
+        // tombstones will be included in the next file and are not relevant
+        // to the point keys or endpoints of the current file.
+        break;
+      }
+    }
+
+    if (bottommost_level && tombstone.seq_ <= earliest_snapshot) {
+      // TODO(andrewkr): tombstones that span multiple output files are
+      // counted for each compaction output file, so lots of double
+      // counting.
+      range_del_out_stats.num_range_del_drop_obsolete++;
+      range_del_out_stats.num_record_drop_obsolete++;
+      continue;
+    }
+
+    auto kv = tombstone.Serialize();
+    assert(lower_bound == nullptr ||
+           ucmp->Compare(*lower_bound, kv.second) < 0);
+    // Range tombstone is not supported by output validator yet.
+    builder_->Add(kv.first.Encode(), kv.second);
+    InternalKey smallest_candidate = std::move(kv.first);
+    if (lower_bound != nullptr &&
+        ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
+      // Pretend the smallest key has the same user key as lower_bound
+      // (the max key in the previous table or subcompaction) in order for
+      // files to appear key-space partitioned.
+      //
+      // When lower_bound is chosen by a subcompaction, we know that
+      // subcompactions over smaller keys cannot contain any keys at
+      // lower_bound. We also know that smaller subcompactions exist,
+      // because otherwise the subcompaction would be unbounded on the left.
+      // As a result, we know that no other files on the output level will
+      // contain actual keys at lower_bound (an output file may have a
+      // largest key of lower_bound@kMaxSequenceNumber, but this only
+      // indicates a large range tombstone was truncated). Therefore, it is
+      // safe to use the tombstone's sequence number, to ensure that keys at
+      // lower_bound at lower levels are covered by truncated tombstones.
+      //
+      // If lower_bound was chosen by the smallest data key in the file,
+      // choose lowest seqnum so this file's smallest internal key comes
+      // after the previous file's largest. The fake seqnum is OK because
+      // the read path's file-picking code only considers user key.
+      smallest_candidate = InternalKey(
+          *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
+          kTypeRangeDeletion);
+    }
+    InternalKey largest_candidate = tombstone.SerializeEndKey();
+    if (upper_bound != nullptr &&
+        ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
+      // Pretend the largest key has the same user key as upper_bound (the
+      // min key in the following table or subcompaction) in order for files
+      // to appear key-space partitioned.
+      //
+      // Choose highest seqnum so this file's largest internal key comes
+      // before the next file's/subcompaction's smallest. The fake seqnum is
+      // OK because the read path's file-picking code only considers the
+      // user key portion.
+      //
+      // Note Seek() also creates InternalKey with (user_key,
+      // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+      // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+      // Seek() key in InternalKey's ordering. So Seek() will look in the
+      // next file for the user key.
+      largest_candidate =
+          InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+    }
+#ifndef NDEBUG
+    SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+    if (meta.smallest.size() > 0) {
+      smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
+    }
+#endif
+    meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+                                  tombstone.seq_, icmp);
+    // The smallest key in a file is used for range tombstone truncation, so
+    // it cannot have a seqnum of 0 (unless the smallest data key in a file
+    // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+    // deleted keys at lower levels.
+    assert(smallest_ikey_seqnum == 0 ||
+           ExtractInternalKeyFooter(meta.smallest.Encode()) !=
+               PackSequenceAndType(0, kTypeRangeDeletion));
+  }
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
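The boundary-truncation rules above are subtle, so here is a rough standalone distillation of how the smallest/largest candidates are clamped. Plain std::string comparison stands in for the user comparator, and the Boundary type is a hypothetical simplification, not code from this patch:

    #include <cstdint>
    #include <string>
    #include <utility>

    // A boundary is (user_key, seqnum). For the same user key, a higher
    // seqnum sorts *before* a lower one in RocksDB's internal-key order.
    using Boundary = std::pair<std::string, uint64_t>;
    constexpr uint64_t kMaxSeq = UINT64_MAX;

    // Clamp a tombstone's start key to the file's lower bound.
    Boundary SmallestCandidate(const std::string& tombstone_start, uint64_t seq,
                               const std::string* lower_bound,
                               bool lower_bound_from_sub_compact) {
      if (lower_bound != nullptr && tombstone_start <= *lower_bound) {
        // Keep the tombstone's seqnum when the bound came from the
        // subcompaction; otherwise use 0 so this file's smallest internal
        // key sorts after the previous file's largest.
        return {*lower_bound, lower_bound_from_sub_compact ? seq : 0};
      }
      return {tombstone_start, seq};
    }

    // Clamp a tombstone's end key to the file's upper bound.
    Boundary LargestCandidate(const std::string& tombstone_end, uint64_t seq,
                              const std::string* upper_bound) {
      if (upper_bound != nullptr && *upper_bound <= tombstone_end) {
        // kMaxSeq makes this file's largest internal key sort before the
        // next file's smallest key with the same user key.
        return {*upper_bound, kMaxSeq};
      }
      return {tombstone_end, seq};
    }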
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
new file mode 100644
index 000000000..bfacc4b45
--- /dev/null
+++ b/db/compaction/compaction_outputs.h
@@ -0,0 +1,328 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionOutputs;
+using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
+using CompactionFileCloseFunc =
+    std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+
+// Files produced by a subcompaction; most of these functions are used by the
+// compaction_job Open/Close compaction file functions.
+class CompactionOutputs {
+ public:
+  // compaction output file
+  struct Output {
+    Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
+           bool _enable_order_check, bool _enable_hash, bool _finished,
+           uint64_t precalculated_hash)
+        : meta(std::move(_meta)),
+          validator(_icmp, _enable_order_check, _enable_hash,
+                    precalculated_hash),
+          finished(_finished) {}
+    FileMetaData meta;
+    OutputValidator validator;
+    bool finished;
+    std::shared_ptr<const TableProperties> table_properties;
+  };
+
+  CompactionOutputs() = delete;
+
+  explicit CompactionOutputs(const Compaction* compaction,
+                             const bool is_penultimate_level)
+      : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+    partitioner_ = compaction->output_level() == 0
+                       ? nullptr
+                       : compaction->CreateSstPartitioner();
+  }
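A usage sketch for the two callback aliases above (illustrative only; the real wiring lives in compaction_job.cc, but the callee signatures match the OpenCompactionOutputFile()/FinishCompactionOutputFile() declarations shown earlier in this patch):

    CompactionFileOpenFunc open_file = [&](CompactionOutputs& outputs) {
      return OpenCompactionOutputFile(sub_compact, outputs);
    };
    CompactionFileCloseFunc close_file = [&](CompactionOutputs& outputs,
                                             const Status& status,
                                             const Slice& next_key) {
      return FinishCompactionOutputFile(status, sub_compact, outputs,
                                        next_key);
    };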
+
+  // Add generated output to the list
+  void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
+                 bool enable_order_check, bool enable_hash,
+                 bool finished = false, uint64_t precalculated_hash = 0) {
+    outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
+                          enable_hash, finished, precalculated_hash);
+  }
+
+  // Set new table builder for the current output
+  void NewBuilder(const TableBuilderOptions& tboptions);
+
+  // Assign a new WritableFileWriter to the current output
+  void AssignFileWriter(WritableFileWriter* writer) {
+    file_writer_.reset(writer);
+  }
+
+  // TODO: Remove it when remote compaction supports tiered compaction
+  void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+  void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+
+  // TODO: Move the BlobDB builder into CompactionOutputs
+  const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
+    if (is_penultimate_level_) {
+      assert(blob_file_additions_.empty());
+    }
+    return blob_file_additions_;
+  }
+
+  std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
+    assert(!is_penultimate_level_);
+    return &blob_file_additions_;
+  }
+
+  bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
+
+  BlobGarbageMeter* CreateBlobGarbageMeter() {
+    assert(!is_penultimate_level_);
+    blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
+    return blob_garbage_meter_.get();
+  }
+
+  BlobGarbageMeter* GetBlobGarbageMeter() const {
+    if (is_penultimate_level_) {
+      // blobdb doesn't support per_key_placement yet
+      assert(blob_garbage_meter_ == nullptr);
+      return nullptr;
+    }
+    return blob_garbage_meter_.get();
+  }
+
+  void UpdateBlobStats() {
+    assert(!is_penultimate_level_);
+    stats_.num_output_files_blob = blob_file_additions_.size();
+    for (const auto& blob : blob_file_additions_) {
+      stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+    }
+  }
+
+  // Finish the current output file
+  Status Finish(const Status& input_status);
+
+  // Update output table properties from table builder
+  void UpdateTableProperties() {
+    current_output().table_properties =
+        std::make_shared<TableProperties>(GetTableProperties());
+  }
+
+  IOStatus WriterSyncClose(const Status& input_status, SystemClock* clock,
+                           Statistics* statistics, bool use_fsync);
+
+  TableProperties GetTableProperties() {
+    return builder_->GetTableProperties();
+  }
+
+  Slice SmallestUserKey() const {
+    if (!outputs_.empty() && outputs_[0].finished) {
+      return outputs_[0].meta.smallest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  Slice LargestUserKey() const {
+    if (!outputs_.empty() && outputs_.back().finished) {
+      return outputs_.back().meta.largest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  // Remove the last output file if it is empty, since there is no need to
+  // keep it.
+  void RemoveLastEmptyOutput() {
+    if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+      // An error occurred, so ignore the last output.
+      outputs_.pop_back();
+    }
+  }
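A note on the blob accessors above: only the normal (last level) CompactionOutputs may carry blob files, because BlobDB does not yet support per_key_placement. A hypothetical caller would be expected to respect that asymmetry, roughly:

    CompactionOutputs normal(compaction, /*is_penultimate_level=*/false);
    CompactionOutputs proximal(compaction, /*is_penultimate_level=*/true);
    normal.GetBlobFileAdditionsPtr();    // OK
    proximal.GetBlobFileAdditionsPtr();  // would trip the assertion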
+
+  // Remove the last output; for example, when the last output has no data
+  // (no entries and no range-dels), its file_size might still not be 0,
+  // since the file contains SST metadata.
+  void RemoveLastOutput() {
+    assert(!outputs_.empty());
+    outputs_.pop_back();
+  }
+
+  bool HasBuilder() const { return builder_ != nullptr; }
+
+  FileMetaData* GetMetaData() { return &current_output().meta; }
+
+  bool HasOutput() const { return !outputs_.empty(); }
+
+  uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+  void ResetBuilder() {
+    builder_.reset();
+    current_output_file_size_ = 0;
+  }
+
+  // Add range-dels from the aggregator to the current output file
+  Status AddRangeDels(const Slice* comp_start, const Slice* comp_end,
+                      CompactionIterationStats& range_del_out_stats,
+                      bool bottommost_level, const InternalKeyComparator& icmp,
+                      SequenceNumber earliest_snapshot,
+                      const Slice& next_table_min_key);
+
+  // Whether the current file is already pending close
+  bool IsPendingClose() const { return pending_close_; }
+
+  // Current file should close before adding a new key
+  void SetPendingClose() { pending_close_ = true; }
+
+  // Whether the outputs have range deletions; range deletions are also data
+  bool HasRangeDel() const {
+    return range_del_agg_ && !range_del_agg_->IsEmpty();
+  }
+
+ private:
+  friend class SubcompactionState;
+
+  void Cleanup() {
+    if (builder_ != nullptr) {
+      // May happen if we get a shutdown call in the middle of compaction
+      builder_->Abandon();
+      builder_.reset();
+    }
+  }
+
+  uint64_t GetCurrentOutputFileSize() const {
+    return current_output_file_size_;
+  }
+
+  // Add the current key from the compaction_iterator to the output file. If
+  // needed, close and open a new compaction output with the functions
+  // provided.
+  Status AddToOutput(const CompactionIterator& c_iter,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func);
+
+  // Close the current output. `open_file_func` is needed for creating a new
+  // file for a range-dels-only output file.
+  Status CloseOutput(const Status& curr_status,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func) {
+    Status status = curr_status;
+    // handle subcompaction containing only range deletions
+    if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
+      status = open_file_func(*this);
+    }
+    if (HasBuilder()) {
+      const Slice empty_key{};
+      Status s = close_file_func(*this, status, empty_key);
+      if (!s.ok() && status.ok()) {
+        status = s;
+      }
+    }
+
+    return status;
+  }
+
+  // This subcompaction's output could be empty if compaction was aborted
+  // before this subcompaction had a chance to generate any output files.
+  // When subcompactions are executed sequentially this is more likely, and
+  // the later subcompactions are particularly likely to be empty. Once they
+  // are run in parallel, however, it should be much rarer.
+  // It's the caller's responsibility to make sure it's not empty.
+  Output& current_output() {
+    assert(!outputs_.empty());
+    return outputs_.back();
+  }
+
+  // Assign the range_del_agg to the target output level. There's only one
+  // range-del-aggregator per compaction output; for
+  // output_to_penultimate_level compactions it is only assigned to the
+  // penultimate level.
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    assert(range_del_agg_ == nullptr);
+    range_del_agg_ = std::move(range_del_agg);
+  }
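How the two private hooks above fit together, as a hedged sketch of the caller's loop (the actual loop lives in ProcessKeyValueCompaction(); `open_file` and `close_file` are callbacks of the alias types defined at the top of this header):

    while (status.ok() && c_iter->Valid()) {
      status = outputs.AddToOutput(*c_iter, open_file, close_file);
      if (status.ok()) {
        c_iter->Next();
      }
    }
    // Flushes any trailing file, including a range-dels-only file for a
    // subcompaction whose only data is tombstones.
    status = outputs.CloseOutput(status, open_file, close_file);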
+
+  const Compaction* compaction_;
+
+  // The current file is pending close; `close_file_func()` needs to run
+  // before a new key can be added.
+  bool pending_close_ = false;
+
+  // current output builder and writer
+  std::unique_ptr<TableBuilder> builder_;
+  std::unique_ptr<WritableFileWriter> file_writer_;
+  uint64_t current_output_file_size_ = 0;
+
+  // all the compaction outputs so far
+  std::vector<Output> outputs_;
+
+  // BlobDB info
+  std::vector<BlobFileAddition> blob_file_additions_;
+  std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
+
+  // Basic compaction output stats for this level's outputs
+  InternalStats::CompactionOutputsStats stats_;
+
+  // Indicates if this CompactionOutputs object is for the penultimate level;
+  // should always be false if the per_key_placement feature is not enabled.
+  const bool is_penultimate_level_;
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
+
+  // partitioner information
+  std::string last_key_for_partitioner_;
+  std::unique_ptr<SstPartitioner> partitioner_;
+};
+
+// Helper struct to concatenate the last level and penultimate level outputs,
+// which could be replaced by std::ranges::join_view() in C++20
+struct OutputIterator {
+ public:
+  explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
+                          const std::vector<CompactionOutputs::Output>& b)
+      : a_(a), b_(b) {
+    within_a = !a_.empty();
+    idx_ = 0;
+  }
+
+  OutputIterator begin() { return *this; }
+
+  OutputIterator end() { return *this; }
+
+  size_t size() { return a_.size() + b_.size(); }
+
+  const CompactionOutputs::Output& operator*() const {
+    return within_a ? a_[idx_] : b_[idx_];
+  }
+
+  OutputIterator& operator++() {
+    idx_++;
+    if (within_a && idx_ >= a_.size()) {
+      within_a = false;
+      idx_ = 0;
+    }
+    assert(within_a || idx_ <= b_.size());
+    return *this;
+  }
+
+  bool operator!=(const OutputIterator& /*rhs*/) const {
+    return within_a || idx_ < b_.size();
+  }
+
+ private:
+  const std::vector<CompactionOutputs::Output>& a_;
+  const std::vector<CompactionOutputs::Output>& b_;
+  bool within_a;
+  size_t idx_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 6bef17238..cc13d044a 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -214,13 +214,13 @@ void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
 }
 
 void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
-                                InternalKey* smallest,
-                                InternalKey* largest) const {
+                                InternalKey* smallest, InternalKey* largest,
+                                int exclude_level) const {
   InternalKey current_smallest;
   InternalKey current_largest;
   bool initialized = false;
   for (const auto& in : inputs) {
-    if (in.empty()) {
+    if (in.empty() || in.level == exclude_level) {
       continue;
     }
     GetRange(in, &current_smallest, &current_largest);
@@ -293,6 +293,12 @@ bool CompactionPicker::RangeOverlapWithCompaction(
       // Overlap
       return true;
     }
+    if (c->SupportsPerKeyPlacement()) {
+      if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
+                                                largest_user_key)) {
+        return true;
+      }
+    }
   }
   // Did not overlap with any running compaction in level `level`
   return false;
@@ -301,9 +307,11 @@ bool CompactionPicker::FilesRangeOverlapWithCompaction(
     const std::vector<CompactionInputFiles>& inputs, int level) const {
   bool is_empty = true;
+  int start_level = -1;
   for (auto& in : inputs) {
     if (!in.empty()) {
       is_empty = false;
+      start_level = in.level;  // inputs are sorted by level
       break;
     }
   }
@@ -313,7 +321,19 @@ bool CompactionPicker::FilesRangeOverlapWithCompaction(
 
   InternalKey smallest, largest;
-  GetRange(inputs, &smallest, &largest);
+  GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
+  int penultimate_level =
+      Compaction::EvaluatePenultimateLevel(ioptions_, start_level, level);
+  if (penultimate_level != Compaction::kInvalidLevel) {
+    InternalKey penultimate_smallest, penultimate_largest;
+    GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
+    if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
+                                   penultimate_largest.user_key(),
+                                   penultimate_level)) {
+      return true;
+    }
+  }
+
   return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
                                     level);
 }
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 7afbf437d..389ba8174 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -154,7 +154,8 @@ class CompactionPicker {
   // in *smallest, *largest.
   // REQUIRES: inputs is not empty (has at least one file on entry)
   void GetRange(const std::vector<CompactionInputFiles>& inputs,
-                InternalKey* smallest, InternalKey* largest) const;
+                InternalKey* smallest, InternalKey* largest,
+                int exclude_level) const;
 
   int NumberLevels() const { return ioptions_.num_levels; }
 
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 96780ca1f..6f214b3a3 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -430,8 +430,7 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
 #ifndef ROCKSDB_LITE
 TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
   NewVersionStorage(1, kCompactionStyleUniversal);
-  UniversalCompactionPicker universal_compaction_picker(
-      ioptions_, &icmp_);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
   UpdateVersionStorageInfo();
   // must return false when there are no files.
   ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
             false);
@@ -3048,6 +3047,192 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
   ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
 }
 
+class PerKeyPlacementCompactionPickerTest
+    : public CompactionPickerTest,
+      public testing::WithParamInterface<bool> {
+ public:
+  PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {}
+
+  void SetUp() override { enable_per_key_placement_ = GetParam(); }
+
+ protected:
+  bool enable_per_key_placement_ = false;
+};
+
+TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(5, 40U, "200", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(
+      enable_per_key_placement_,
+      level_compaction_picker.FilesRangeOverlapWithCompaction(input_files, 6));
+}
+
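This test and the three siblings that follow all exercise the same new rule in FilesRangeOverlapWithCompaction(): a candidate compaction must now avoid not only the output range of a running compaction, but also the range that compaction may write to the penultimate level. A standalone distillation of that rule, with hypothetical helper types and string keys in place of InternalKey:

    #include <string>

    struct Range {
      std::string smallest;
      std::string largest;
    };

    bool Overlap(const Range& a, const Range& b) {
      return !(a.largest < b.smallest || b.largest < a.smallest);
    }

    // Per-key placement extends the overlap check: a running compaction may
    // install files both at its output level and, when per_key_placement is
    // enabled, at the penultimate level, so both ranges must be consulted.
    bool RangesOverlapWithCompaction(const Range& candidate,
                                     const Range& running_output,
                                     const Range* running_penultimate) {
      if (Overlap(candidate, running_output)) {
        return true;
      }
      return running_penultimate != nullptr &&
             Overlap(candidate, *running_penultimate);
    }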
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(40);
+  input_set.insert(41);
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(
+      enable_per_key_placement_,
+      level_compaction_picker.FilesRangeOverlapWithCompaction(input_files, 5));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       OverlapWithNormalCompactionUniversal) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(5, 40U, "200", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 6));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, 
kCompactionStyleUniversal); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set input_set; + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + std::vector input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(40); + input_set.insert(41); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5)); +} + +INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest, + PerKeyPlacementCompactionPickerTest, ::testing::Bool()); + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc new file mode 100644 index 000000000..eeb936878 --- /dev/null +++ b/db/compaction/compaction_service_job.cc @@ -0,0 +1,825 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_job.h" +#include "db/compaction/compaction_state.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "options/options_helper.h" +#include "rocksdb/utilities/options_type.h" + +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { +class SubcompactionState; + +CompactionServiceJobStatus +CompactionJob::ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + assert(db_options_.compaction_service); + + const Compaction* compaction = sub_compact->compaction; + CompactionServiceInput compaction_input; + compaction_input.output_level = compaction->output_level(); + compaction_input.db_id = db_id_; + + const std::vector& inputs = + *(compact_->compaction->inputs()); + for (const auto& files_per_level : inputs) { + for (const auto& file : files_per_level.files) { + compaction_input.input_files.emplace_back( + MakeTableFileName(file->fd.GetNumber())); + } + } + compaction_input.column_family.name = + compaction->column_family_data()->GetName(); + compaction_input.column_family.options = + compaction->column_family_data()->GetLatestCFOptions(); + compaction_input.db_options = + BuildDBOptions(db_options_, mutable_db_options_copy_); + compaction_input.snapshots = existing_snapshots_; + compaction_input.has_begin = sub_compact->start; + compaction_input.begin = + compaction_input.has_begin ? 
sub_compact->start->ToString() : ""; + compaction_input.has_end = sub_compact->end; + compaction_input.end = + compaction_input.has_end ? sub_compact->end->ToString() : ""; + + std::string compaction_input_binary; + Status s = compaction_input.Write(&compaction_input_binary); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + std::ostringstream input_files_oss; + bool is_first_one = true; + for (const auto& file : compaction_input.input_files) { + input_files_oss << (is_first_one ? "" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, + GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->StartV2(info, compaction_input_binary); + switch (compaction_status) { + case CompactionServiceJobStatus::kSuccess: + break; + case CompactionServiceJobStatus::kFailure: + sub_compact->status = Status::Incomplete( + "CompactionService failed to start compaction job."); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed to start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + case CompactionServiceJobStatus::kUseLocal: + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + default: + assert(false); // unknown status + break; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Waiting for remote compaction...", + compaction_input.column_family.name.c_str(), job_id_); + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForCompleteV2( + info, &compaction_result_binary); + + if (compaction_status == CompactionServiceJobStatus::kUseLocal) { + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API " + "WaitForComplete.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + + if (compaction_status == CompactionServiceJobStatus::kFailure) { + if (s.ok()) { + if (compaction_result.status.ok()) { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (even though " + "the internal status is okay)."); + } else { + // set the current sub compaction status with the status returned from + // remote + sub_compact->status = compaction_result.status; + } + } else { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (and no valid " + "result is returned)."); + compaction_result.status.PermitUncheckedError(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return CompactionServiceJobStatus::kFailure; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const 
auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? "" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + meta.unique_id = file.unique_id; + + auto cfd = compaction->column_family_data(); + sub_compact->Current().AddOutput(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->Current().SetNumOutputRecords( + compaction_result.num_output_records); + sub_compact->Current().SetTotalBytes(compaction_result.total_bytes); + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); + RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, + compaction_result.bytes_written); + return CompactionServiceJobStatus::kSuccess; +} + +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +void CompactionServiceCompactionJob::RecordCompactionIOStats() { + compaction_result_->bytes_read += IOSTATS(bytes_read); + compaction_result_->bytes_written += IOSTATS(bytes_written); + CompactionJob::RecordCompactionIOStats(); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::atomic& manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, log_buffer, 
nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, + std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr, + std::move(table_cache), event_logger, + compaction->mutable_cf_options()->paranoid_file_checks, + compaction->mutable_cf_options()->report_bg_io_stats, dbname, + &(compaction_service_result->stats), Env::Priority::USER, io_tracer, + manual_compaction_canceled, db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), + output_path_(std::move(output_path)), + compaction_input_(compaction_service_input), + compaction_result_(compaction_service_result) {} + +Status CompactionServiceCompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + Slice begin = compaction_input_.begin; + Slice end = compaction_input_.end; + compact_->sub_compact_states.emplace_back( + c, compaction_input_.has_begin ? &begin : nullptr, + compaction_input_.has_end ? &end : nullptr, /*sub_job_id*/ 0); + + log_buffer_->FlushBufferToLog(); + LogCompaction(); + const uint64_t start_micros = db_options_.clock->NowMicros(); + // Pick the only sub-compaction we should have + assert(compact_->sub_compact_states.size() == 1); + SubcompactionState* sub_compact = compact_->sub_compact_states.data(); + + ProcessKeyValueCompaction(sub_compact); + + compaction_stats_.stats.micros = + db_options_.clock->NowMicros() - start_micros; + compaction_stats_.stats.cpu_micros = + sub_compact->compaction_job_stats.cpu_micros; + + RecordTimeToHistogram(stats_, COMPACTION_TIME, + compaction_stats_.stats.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.stats.cpu_micros); + + Status status = sub_compact->status; + IOStatus io_s = sub_compact->io_status; + + if (io_status_.ok()) { + io_status_ = io_s; + } + + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg, + DirFsyncOptions()); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + // TODO: Add verify_table() + } + + // Finish up all book-keeping to unify the subcompaction results + compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); + UpdateCompactionStats(); + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + compact_->status = status; + compact_->status.PermitUncheckedError(); + + // Build compaction result + compaction_result_->output_level = compact_->compaction->output_level(); + compaction_result_->output_path = output_path_; + for (const auto& output_file : sub_compact->GetOutputs()) { + auto& meta = output_file.meta; + compaction_result_->output_files.emplace_back( + MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.smallest.Encode().ToString(), + meta.largest.Encode().ToString(), meta.oldest_ancester_time, + meta.file_creation_time, output_file.validator.GetHash(), + meta.marked_for_compaction, meta.unique_id); + } + InternalStats::CompactionStatsFull compaction_stats; + sub_compact->AggregateCompactionStats(compaction_stats); + 
compaction_result_->num_output_records = + compaction_stats.stats.num_output_records; + compaction_result_->total_bytes = compaction_stats.TotalBytesWritten(); + + return status; +} + +void CompactionServiceCompactionJob::CleanupCompaction() { + CompactionJob::CleanupCompaction(); +} + +// Internal binary format for the input and result data +enum BinaryFormatVersion : uint32_t { + kOptionsString = 1, // Use string format similar to Option string format +}; + +static std::unordered_map cfd_type_info = { + {"name", + {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options", + {offsetof(struct ColumnFamilyDescriptor, options), + OptionType::kConfigurable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto cf_options = static_cast(addr); + return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), + value, cf_options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto cf_options = static_cast(addr); + std::string result; + auto status = + GetStringFromColumnFamilyOptions(opts, *cf_options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = CFOptionsAsConfigurable(*this_one); + auto that_conf = CFOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." + mismatch_opt; + } + return result; + }}}, +}; + +static std::unordered_map cs_input_type_info = { + {"column_family", + OptionTypeInfo::Struct( + "column_family", &cfd_type_info, + offsetof(struct CompactionServiceInput, column_family), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"db_options", + {offsetof(struct CompactionServiceInput, db_options), + OptionType::kConfigurable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto options = static_cast(addr); + return GetDBOptionsFromString(opts, DBOptions(), value, options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto options = static_cast(addr); + std::string result; + auto status = GetStringFromDBOptions(opts, *options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = DBOptionsAsConfigurable(*this_one); + auto that_conf = DBOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, + {"snapshots", OptionTypeInfo::Vector( + offsetof(struct CompactionServiceInput, snapshots), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, + {"input_files", OptionTypeInfo::Vector( + offsetof(struct CompactionServiceInput, input_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kEncodedString})}, + {"output_level", + {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"db_id", + {offsetof(struct CompactionServiceInput, db_id), + OptionType::kEncodedString}}, + {"has_begin", + {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"begin", + {offsetof(struct CompactionServiceInput, begin), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"has_end", + {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"end", + {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + cs_output_file_type_info = { + {"file_name", + {offsetof(struct CompactionServiceOutputFile, file_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_seqno", + {offsetof(struct CompactionServiceOutputFile, smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_seqno", + {offsetof(struct CompactionServiceOutputFile, largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_internal_key", + {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_internal_key", + {offsetof(struct CompactionServiceOutputFile, largest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"oldest_ancester_time", + {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct CompactionServiceOutputFile, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_hash", + {offsetof(struct CompactionServiceOutputFile, paranoid_hash), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"marked_for_compaction", + {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"unique_id", + OptionTypeInfo::Array( + offsetof(struct CompactionServiceOutputFile, unique_id), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, +}; + +static std::unordered_map + compaction_job_stats_type_info = { + {"elapsed_micros", + {offsetof(struct CompactionJobStats, elapsed_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + 
+         {offsetof(struct CompactionJobStats, num_input_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_blobs_read",
+         {offsetof(struct CompactionJobStats, num_blobs_read),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files",
+         {offsetof(struct CompactionJobStats, num_input_files),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files_at_output_level",
+         {offsetof(struct CompactionJobStats, num_input_files_at_output_level),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_records",
+         {offsetof(struct CompactionJobStats, num_output_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files",
+         {offsetof(struct CompactionJobStats, num_output_files),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files_blob",
+         {offsetof(struct CompactionJobStats, num_output_files_blob),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"is_full_compaction",
+         {offsetof(struct CompactionJobStats, is_full_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"is_manual_compaction",
+         {offsetof(struct CompactionJobStats, is_manual_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_input_bytes",
+         {offsetof(struct CompactionJobStats, total_input_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_blob_bytes_read",
+         {offsetof(struct CompactionJobStats, total_blob_bytes_read),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_output_bytes",
+         {offsetof(struct CompactionJobStats, total_output_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_output_bytes_blob",
+         {offsetof(struct CompactionJobStats, total_output_bytes_blob),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_records_replaced",
+         {offsetof(struct CompactionJobStats, num_records_replaced),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_input_raw_key_bytes",
+         {offsetof(struct CompactionJobStats, total_input_raw_key_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_input_raw_value_bytes",
+         {offsetof(struct CompactionJobStats, total_input_raw_value_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_deletion_records",
+         {offsetof(struct CompactionJobStats, num_input_deletion_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_expired_deletion_records",
+         {offsetof(struct CompactionJobStats, num_expired_deletion_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_corrupt_keys",
+         {offsetof(struct CompactionJobStats, num_corrupt_keys),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_write_nanos",
+         {offsetof(struct CompactionJobStats, file_write_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_range_sync_nanos",
+         {offsetof(struct CompactionJobStats, file_range_sync_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_fsync_nanos",
+         {offsetof(struct CompactionJobStats, file_fsync_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_prepare_write_nanos",
+         {offsetof(struct CompactionJobStats, file_prepare_write_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_output_key_prefix",
+         {offsetof(struct CompactionJobStats, smallest_output_key_prefix),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_output_key_prefix",
+         {offsetof(struct CompactionJobStats, largest_output_key_prefix),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_single_del_fallthru",
+         {offsetof(struct CompactionJobStats, num_single_del_fallthru),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_single_del_mismatch",
+         {offsetof(struct CompactionJobStats, num_single_del_mismatch),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+namespace {
+// this is a helper struct to serialize and deserialize class Status, because
+// Status's members are not public.
+struct StatusSerializationAdapter {
+  uint8_t code;
+  uint8_t subcode;
+  uint8_t severity;
+  std::string message;
+
+  StatusSerializationAdapter() = default;
+  explicit StatusSerializationAdapter(const Status& s) {
+    code = s.code();
+    subcode = s.subcode();
+    severity = s.severity();
+    auto msg = s.getState();
+    message = msg ? msg : "";
+  }
+
+  Status GetStatus() const {
+    return Status{static_cast<Status::Code>(code),
+                  static_cast<Status::SubCode>(subcode),
+                  static_cast<Status::Severity>(severity), message};
+  }
+};
+}  // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    status_adapter_type_info = {
+        {"code",
+         {offsetof(struct StatusSerializationAdapter, code),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"subcode",
+         {offsetof(struct StatusSerializationAdapter, subcode),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"severity",
+         {offsetof(struct StatusSerializationAdapter, severity),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"message",
+         {offsetof(struct StatusSerializationAdapter, message),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
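Because Status keeps its members private, the adapter above is the only way the generic serializer can see them. A usage sketch (DemoStatusRoundTrip is a hypothetical helper, not in the patch):

    Status DemoStatusRoundTrip(const Status& in, Status* out) {
      ConfigOptions opts;
      StatusSerializationAdapter adapter(in);  // snapshot code/subcode/severity/message
      std::string encoded;
      Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
                                               &adapter, &encoded);
      if (s.ok()) {
        StatusSerializationAdapter decoded;
        s = OptionTypeInfo::ParseType(opts, encoded, status_adapter_type_info,
                                      &decoded);
        if (s.ok()) {
          *out = decoded.GetStatus();  // rebuild an equivalent Status
        }
      }
      return s;
    }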
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+    {"status",
+     {offsetof(struct CompactionServiceResult, status),
+      OptionType::kCustomizable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto status_obj = static_cast<Status*>(addr);
+        StatusSerializationAdapter adapter;
+        Status s = OptionTypeInfo::ParseType(
+            opts, value, status_adapter_type_info, &adapter);
+        *status_obj = adapter.GetStatus();
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto status_obj = static_cast<const Status*>(addr);
+        StatusSerializationAdapter adapter(*status_obj);
+        std::string result;
+        Status s = OptionTypeInfo::SerializeType(
+            opts, status_adapter_type_info, &adapter, &result);
+        *value = "{" + result + "}";
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr1, const void* addr2, std::string* mismatch) {
+        const auto status1 = static_cast<const Status*>(addr1);
+        const auto status2 = static_cast<const Status*>(addr2);
+
+        StatusSerializationAdapter adapter1(*status1);
+        StatusSerializationAdapter adapter2(*status2);
+        return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                             &adapter1, &adapter2, mismatch);
+      }}},
+    {"output_files",
+     OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+         offsetof(struct CompactionServiceResult, output_files),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+         OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+                                OptionVerificationType::kNormal,
+                                OptionTypeFlags::kNone))},
+    {"output_level",
+     {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"output_path",
+     {offsetof(struct CompactionServiceResult, output_path),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"num_output_records",
+     {offsetof(struct CompactionServiceResult, num_output_records),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"total_bytes",
+     {offsetof(struct CompactionServiceResult, total_bytes),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_read",
+     {offsetof(struct CompactionServiceResult, bytes_read),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_written",
+     {offsetof(struct CompactionServiceResult, bytes_written),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"stats", OptionTypeInfo::Struct(
+                  "stats", &compaction_job_stats_type_info,
+                  offsetof(struct CompactionServiceResult, stats),
+                  OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
+Status CompactionServiceInput::Read(const std::string& data_str,
+                                    CompactionServiceInput* obj) {
+  if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+    return Status::InvalidArgument("Invalid CompactionServiceInput string");
+  }
+  auto format_version = DecodeFixed32(data_str.data());
+  if (format_version == kOptionsString) {
+    ConfigOptions cf;
+    cf.invoke_prepare_options = false;
+    cf.ignore_unknown_options = true;
+    return OptionTypeInfo::ParseType(
+        cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
+        obj);
+  } else {
+    return Status::NotSupported(
+        "Compaction Service Input data version not supported: " +
+        std::to_string(format_version));
+  }
+}
+
+Status CompactionServiceInput::Write(std::string* output) {
+  char buf[sizeof(BinaryFormatVersion)];
+  EncodeFixed32(buf, kOptionsString);
+  output->append(buf, sizeof(BinaryFormatVersion));
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
+}
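The payload layout is therefore a fixed 4-byte little-endian format version followed by an options string; a reader that sees an unknown version fails with Status::NotSupported rather than misparsing. A caller-side sketch (DemoShipInput is a hypothetical helper, not in the patch):

    Status DemoShipInput(CompactionServiceInput& input) {
      std::string payload;
      Status s = input.Write(&payload);  // 4-byte version header + option string
      if (!s.ok()) {
        return s;
      }
      // On the remote compactor, decode it back into a struct.
      CompactionServiceInput remote_input;
      return CompactionServiceInput::Read(payload, &remote_input);
    }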
+
+Status CompactionServiceResult::Read(const std::string& data_str,
+                                     CompactionServiceResult* obj) {
+  if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+    return Status::InvalidArgument("Invalid CompactionServiceResult string");
+  }
+  auto format_version = DecodeFixed32(data_str.data());
+  if (format_version == kOptionsString) {
+    ConfigOptions cf;
+    cf.invoke_prepare_options = false;
+    cf.ignore_unknown_options = true;
+    return OptionTypeInfo::ParseType(
+        cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
+        obj);
+  } else {
+    return Status::NotSupported(
+        "Compaction Service Result data version not supported: " +
+        std::to_string(format_version));
+  }
+}
+
+Status CompactionServiceResult::Write(std::string* output) {
+  char buf[sizeof(BinaryFormatVersion)];
+  EncodeFixed32(buf, kOptionsString);
+  output->append(buf, sizeof(BinaryFormatVersion));
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
+}
+
+#ifndef NDEBUG
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
+  std::string mismatch;
+  return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
+                                          std::string* mismatch) {
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
+                                       mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
+  std::string mismatch;
+  return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
+                                         std::string* mismatch) {
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
+                                       mismatch);
+}
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/db/compaction/compaction_state.cc b/db/compaction/compaction_state.cc
new file mode 100644
index 000000000..ee4b0c189
--- /dev/null
+++ b/db/compaction/compaction_state.cc
@@ -0,0 +1,46 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+  for (const auto& sub_compact_state : sub_compact_states) {
+    Slice smallest = sub_compact_state.SmallestUserKey();
+    if (!smallest.empty()) {
+      return smallest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+  for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+       ++it) {
+    Slice largest = it->LargestUserKey();
+    if (!largest.empty()) {
+      return largest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats,
+    CompactionJobStats& compaction_job_stats) {
+  for (const auto& sc : sub_compact_states) {
+    sc.AggregateCompactionStats(compaction_stats);
+    compaction_job_stats.Add(sc.compaction_job_stats);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_state.h b/db/compaction/compaction_state.h
new file mode 100644
index 000000000..cc5b66c68
--- /dev/null
+++ b/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used for compaction_job and compaction_service_job, which
+// hold the list of sub_compact_states and the aggregated information for the
+// compaction.
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+  Compaction* const compaction;
+
+  // REQUIRED: subcompaction states are stored in order of increasing key-range
+  std::vector<SubcompactionState> sub_compact_states;
+  Status status;
+
+  void AggregateCompactionStats(
+      InternalStats::CompactionStatsFull& compaction_stats,
+      CompactionJobStats& compaction_job_stats);
+
+  explicit CompactionState(Compaction* c) : compaction(c) {}
+
+  Slice SmallestUserKey();
+
+  Slice LargestUserKey();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
new file mode 100644
index 000000000..89914f479
--- /dev/null
+++ b/db/compaction/subcompaction_state.cc
@@ -0,0 +1,223 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats) const {
+  compaction_stats.stats.Add(compaction_outputs_.stats_);
+  if (HasPenultimateLevelOutputs()) {
+    compaction_stats.has_penultimate_level_output = true;
+    compaction_stats.penultimate_level_stats.Add(
+        penultimate_level_outputs_.stats_);
+  }
+}
+
+void SubcompactionState::FillFilesToCutForTtl() {
+  if (compaction->immutable_options()->compaction_style !=
+          CompactionStyle::kCompactionStyleLevel ||
+      compaction->immutable_options()->compaction_pri !=
+          CompactionPri::kMinOverlappingRatio ||
+      compaction->mutable_cf_options()->ttl == 0 ||
+      compaction->num_input_levels() < 2 || compaction->bottommost_level()) {
+    return;
+  }
+
+  // We define a new file as one whose oldest ancestor time is younger than
+  // 1/4 TTL, and an old one as older than 1/2 TTL.
+  int64_t temp_current_time;
+  auto get_time_status =
+      compaction->immutable_options()->clock->GetCurrentTime(
+          &temp_current_time);
+  if (!get_time_status.ok()) {
+    return;
+  }
+  auto current_time = static_cast<uint64_t>(temp_current_time);
+  if (current_time < compaction->mutable_cf_options()->ttl) {
+    return;
+  }
+  uint64_t old_age_thres =
+      current_time - compaction->mutable_cf_options()->ttl / 2;
+
+  const std::vector<FileMetaData*>& olevel =
+      *(compaction->inputs(compaction->num_input_levels() - 1));
+  for (FileMetaData* file : olevel) {
+    // Worth filtering out by start and end?
+    uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+    // We only pick old files that are not too small, to prevent a flood
+    // of small files.
+    if (oldest_ancester_time < old_age_thres &&
+        file->fd.GetFileSize() >
+            compaction->mutable_cf_options()->target_file_size_base / 2) {
+      files_to_cut_for_ttl_.push_back(file);
+    }
+  }
+}
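To make those thresholds concrete, a worked example with assumed option values (the numbers are illustrative only, not from the patch):

    // Assume ttl = 7 days and target_file_size_base = 64 MB. Then:
    //   old_age_thres = current_time - ttl / 2        -> now minus 3.5 days
    //   size cutoff   = target_file_size_base / 2     -> 32 MB
    // A file in the last input level is added to files_to_cut_for_ttl_ only
    // if its oldest_ancester_time is older than 3.5 days AND the file is
    // larger than 32 MB; the size check prevents a flood of small outputs.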
+
+OutputIterator SubcompactionState::GetOutputs() const {
+  return OutputIterator(penultimate_level_outputs_.outputs_,
+                        compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+  penultimate_level_outputs_.Cleanup();
+  compaction_outputs_.Cleanup();
+
+  if (!status.ok()) {
+    for (const auto& out : GetOutputs()) {
+      // If this file was inserted into the table cache then remove it here
+      // because this compaction was not committed.
+      TableCache::Evict(cache, out.meta.fd.GetNumber());
+    }
+  }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if that's
+  // intentional. So ignoring the io_status as of now.
+  io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.SmallestUserKey();
+    Slice b = penultimate_level_outputs_.SmallestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) > 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.SmallestUserKey();
+  }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.LargestUserKey();
+    Slice b = penultimate_level_outputs_.LargestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) < 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.LargestUserKey();
+  }
+}
+
+bool SubcompactionState::ShouldStopBefore(const Slice& internal_key) {
+  uint64_t curr_file_size = Current().GetCurrentOutputFileSize();
+  const InternalKeyComparator* icmp =
+      &compaction->column_family_data()->internal_comparator();
+
+  // Invalid local_output_split_key indicates that we do not need to split
+  if (local_output_split_key_ != nullptr && !is_split_) {
+    // Split occurs when the next key is larger than/equal to the cursor
+    if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
+      is_split_ = true;
+      return true;
+    }
+  }
+
+  const std::vector<FileMetaData*>& grandparents = compaction->grandparents();
+  bool grandparent_file_switched = false;
+  // Scan to find the earliest grandparent file that contains key.
+  while (grandparent_index_ < grandparents.size() &&
+         icmp->Compare(internal_key,
+                       grandparents[grandparent_index_]->largest.Encode()) >
+             0) {
+    if (seen_key_) {
+      overlapped_bytes_ += grandparents[grandparent_index_]->fd.GetFileSize();
+      grandparent_file_switched = true;
+    }
+    assert(grandparent_index_ + 1 >= grandparents.size() ||
+           icmp->Compare(
+               grandparents[grandparent_index_]->largest.Encode(),
+               grandparents[grandparent_index_ + 1]->smallest.Encode()) <= 0);
+    grandparent_index_++;
+  }
+  seen_key_ = true;
+
+  if (grandparent_file_switched &&
+      overlapped_bytes_ + curr_file_size > compaction->max_compaction_bytes()) {
+    // Too much overlap for current output; start new output
+    overlapped_bytes_ = 0;
+    return true;
+  }
+
+  if (!files_to_cut_for_ttl_.empty()) {
+    if (cur_files_to_cut_for_ttl_ != -1) {
+      // Previous key is inside the range of a file
+      if (icmp->Compare(internal_key,
+                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+                            ->largest.Encode()) > 0) {
+        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+        cur_files_to_cut_for_ttl_ = -1;
+        return true;
+      }
+    } else {
+      // Look for the key position
+      while (next_files_to_cut_for_ttl_ <
+             static_cast<int>(files_to_cut_for_ttl_.size())) {
+        if (icmp->Compare(internal_key,
+                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                              ->smallest.Encode()) >= 0) {
+          if (icmp->Compare(internal_key,
+                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                                ->largest.Encode()) <= 0) {
+            // Within the current file
+            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+            return true;
+          }
+          // Beyond the current file
+          next_files_to_cut_for_ttl_++;
+        } else {
+          // Still fall into the gap
+          break;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+Status SubcompactionState::AddToOutput(
+    const CompactionIterator& iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  // update target output first
+  is_current_penultimate_level_ = iter.output_to_penultimate_level();
+  current_outputs_ = is_current_penultimate_level_
+                         ? &penultimate_level_outputs_
+                         : &compaction_outputs_;
+  if (is_current_penultimate_level_) {
+    has_penultimate_level_outputs_ = true;
+  }
+
+  return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
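AddToOutput() is the per-key routing point of the whole feature. A condensed sketch of how a compaction loop might drive it, assuming an already-built iterator and file open/close callbacks (DemoProcessKeyValues is hypothetical; the real loop in compaction_job.cc does much more):

    Status DemoProcessKeyValues(SubcompactionState& sub,
                                CompactionIterator& iter,
                                const CompactionFileOpenFunc& open_file,
                                const CompactionFileCloseFunc& close_file) {
      Status s;
      while (s.ok() && iter.Valid()) {
        // Routes to penultimate_level_outputs_ or compaction_outputs_ based
        // on iter.output_to_penultimate_level(), then appends the key/value.
        s = sub.AddToOutput(iter, open_file, close_file);
        if (s.ok()) {
          iter.Next();
        }
      }
      // Close both output groups even on error so open files get finalized.
      return sub.CloseCompactionFiles(s, open_file, close_file);
    }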
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
new file mode 100644
index 000000000..6774ffd15
--- /dev/null
+++ b/db/compaction/subcompaction_state.h
@@ -0,0 +1,255 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction.
+// It contains 2 `CompactionOutputs`:
+//  1. one for the normal output files
+//  2. another for the penultimate level outputs
+// A `current` pointer maintains the current output group: when calling
+// `AddToOutput()`, it checks the output of the current compaction_iterator
+// key and points `current` to the target output group. By default, it points
+// to the normal compaction_outputs; if the compaction_iterator key should be
+// placed on the penultimate level, `current` is changed to point to
+// `penultimate_level_outputs`.
+// Later operations use `Current()` to get the target group.
+//
+//  +----------+          +-----------------------------+      +---------+
+//  | *current |--------> | compaction_outputs          |----->| output  |
+//  +----------+          +-----------------------------+      +---------+
+//       |                                                     | output  |
+//       |                                                     +---------+
+//       |                                                     |  ...    |
+//       |
+//       |                +-----------------------------+      +---------+
+//       +--------------> | penultimate_level_outputs   |----->| output  |
+//                        +-----------------------------+      +---------+
+//                                                             |  ...    |
+
+class SubcompactionState {
+ public:
+  const Compaction* compaction;
+
+  // The boundaries of the key-range this compaction is interested in. No two
+  // sub-compactions may have overlapping key-ranges.
+  // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+  const Slice *start, *end;
+
+  // The return status of this sub-compaction
+  Status status;
+
+  // The return IO Status of this sub-compaction
+  IOStatus io_status;
+
+  // Notify on sub-compaction completion only if listener was notified on
+  // sub-compaction begin.
+  bool notify_on_subcompaction_completion = false;
+
+  // compaction job stats for this sub-compaction
+  CompactionJobStats compaction_job_stats;
+
+  // sub-compaction job id, which is used to identify different
+  // sub-compactions within the same compaction job.
+  const uint32_t sub_job_id;
+
+  Slice SmallestUserKey() const;
+
+  Slice LargestUserKey() const;
+
+  // Get all outputs from the subcompaction. For per_key_placement compaction,
+  // it returns both the last level outputs and penultimate level outputs.
+  OutputIterator GetOutputs() const;
+
+  // Assign the range-del aggregator. Each range_del can only be assigned to
+  // one output level; for per_key_placement, that is the penultimate level.
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    if (compaction->SupportsPerKeyPlacement()) {
+      penultimate_level_outputs_.AssignRangeDelAggregator(
+          std::move(range_del_agg));
+    } else {
+      compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+    }
+  }
+
+  void RemoveLastEmptyOutput() {
+    compaction_outputs_.RemoveLastEmptyOutput();
+    penultimate_level_outputs_.RemoveLastEmptyOutput();
+  }
+
+#ifndef ROCKSDB_LITE
+  void BuildSubcompactionJobInfo(
+      SubcompactionJobInfo& subcompaction_job_info) const {
+    const Compaction* c = compaction;
+    const ColumnFamilyData* cfd = c->column_family_data();
+
+    subcompaction_job_info.cf_id = cfd->GetID();
+    subcompaction_job_info.cf_name = cfd->GetName();
+    subcompaction_job_info.status = status;
+    subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+    subcompaction_job_info.base_input_level = c->start_level();
+    subcompaction_job_info.output_level = c->output_level();
+    subcompaction_job_info.stats = compaction_job_stats;
+  }
+#endif  // !ROCKSDB_LITE
+
+  SubcompactionState() = delete;
+  SubcompactionState(const SubcompactionState&) = delete;
+  SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+  SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
+                     uint32_t _sub_job_id)
+      : compaction(c),
+        start(_start),
+        end(_end),
+        sub_job_id(_sub_job_id),
+        compaction_outputs_(c, /*is_penultimate_level=*/false),
+        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+    assert(compaction != nullptr);
+    const InternalKeyComparator* icmp =
+        &compaction->column_family_data()->internal_comparator();
+    const InternalKey* output_split_key = compaction->GetOutputSplitKey();
+    // Invalid output_split_key indicates that we do not need to split
+    if (output_split_key != nullptr) {
+      // We may only split the output when the cursor is in the range. Split
+      if ((end == nullptr || icmp->user_comparator()->Compare(
+                                 ExtractUserKey(output_split_key->Encode()),
+                                 ExtractUserKey(*end)) < 0) &&
+          (start == nullptr || icmp->user_comparator()->Compare(
+                                   ExtractUserKey(output_split_key->Encode()),
+                                   ExtractUserKey(*start)) > 0)) {
+        local_output_split_key_ = output_split_key;
+      }
+    }
+  }
+
+  SubcompactionState(SubcompactionState&& state) noexcept
+      : compaction(state.compaction),
+        start(state.start),
+        end(state.end),
+        status(std::move(state.status)),
+        io_status(std::move(state.io_status)),
+        notify_on_subcompaction_completion(
+            state.notify_on_subcompaction_completion),
+        compaction_job_stats(std::move(state.compaction_job_stats)),
+        sub_job_id(state.sub_job_id),
+        files_to_cut_for_ttl_(std::move(state.files_to_cut_for_ttl_)),
+        cur_files_to_cut_for_ttl_(state.cur_files_to_cut_for_ttl_),
+        next_files_to_cut_for_ttl_(state.next_files_to_cut_for_ttl_),
+        grandparent_index_(state.grandparent_index_),
+        overlapped_bytes_(state.overlapped_bytes_),
+        seen_key_(state.seen_key_),
+        compaction_outputs_(std::move(state.compaction_outputs_)),
+        penultimate_level_outputs_(
+            std::move(state.penultimate_level_outputs_)),
+        is_current_penultimate_level_(state.is_current_penultimate_level_),
+        has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
+    current_outputs_ = is_current_penultimate_level_
+                           ? &penultimate_level_outputs_
+                           : &compaction_outputs_;
+  }
+
+  bool HasPenultimateLevelOutputs() const {
+    return has_penultimate_level_outputs_ ||
+           penultimate_level_outputs_.HasRangeDel();
+  }
+
+  void FillFilesToCutForTtl();
+
+  // Returns true iff we should stop building the current output
+  // before processing "internal_key".
+  bool ShouldStopBefore(const Slice& internal_key);
+
+  bool IsCurrentPenultimateLevel() const {
+    return is_current_penultimate_level_;
+  }
+
+  // Add all the new files from this compaction to version_edit
+  void AddOutputsEdit(VersionEdit* out_edit) const {
+    for (const auto& file : penultimate_level_outputs_.outputs_) {
+      out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+    }
+    for (const auto& file : compaction_outputs_.outputs_) {
+      out_edit->AddFile(compaction->output_level(), file.meta);
+    }
+  }
+
+  void Cleanup(Cache* cache);
+
+  void AggregateCompactionStats(
+      InternalStats::CompactionStatsFull& compaction_stats) const;
+
+  CompactionOutputs& Current() const {
+    assert(current_outputs_);
+    return *current_outputs_;
+  }
+
+  // Add compaction_iterator key/value to the `Current` output group.
+  Status AddToOutput(const CompactionIterator& iter,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func);
+
+  // Close all compaction output files, both output_to_penultimate_level
+  // outputs and normal outputs.
+  Status CloseCompactionFiles(const Status& curr_status,
+                              const CompactionFileOpenFunc& open_file_func,
+                              const CompactionFileCloseFunc& close_file_func) {
+    // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+    // close the output file.
+    Status s = penultimate_level_outputs_.CloseOutput(
+        curr_status, open_file_func, close_file_func);
+    s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
+    return s;
+  }
+
+ private:
+  // Some identified files with an old oldest ancester time; the range should
+  // be isolated out so that the output file(s) in that range can be merged
+  // down for TTL and the timestamps for the range cleared.
+  std::vector<FileMetaData*> files_to_cut_for_ttl_;
+  int cur_files_to_cut_for_ttl_ = -1;
+  int next_files_to_cut_for_ttl_ = 0;
+
+  // An index used to speed up ShouldStopBefore().
+  size_t grandparent_index_ = 0;
+  // The number of bytes overlapping between the current output and
+  // grandparent files used in ShouldStopBefore().
+  uint64_t overlapped_bytes_ = 0;
+  // A flag that determines whether the key has been seen in ShouldStopBefore()
+  bool seen_key_ = false;
+
+  // A flag that determines if this subcompaction has been split by the cursor
+  bool is_split_ = false;
+
+  // We also maintain the output split key for each subcompaction to avoid
+  // repetitive comparison in ShouldStopBefore()
+  const InternalKey* local_output_split_key_ = nullptr;
+
+  // State kept for output being generated
+  CompactionOutputs compaction_outputs_;
+  CompactionOutputs penultimate_level_outputs_;
+  CompactionOutputs* current_outputs_ = &compaction_outputs_;
+  bool is_current_penultimate_level_ = false;
+  bool has_penultimate_level_outputs_ = false;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
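Since a subcompaction can now produce files for two levels, installation fans out as well: AddOutputsEdit() records penultimate-level files under compaction->GetPenultimateLevel() and the rest under compaction->output_level(). A minimal install-time sketch (DemoInstallOutputs is hypothetical):

    void DemoInstallOutputs(const std::vector<SubcompactionState>& subs,
                            VersionEdit* edit) {
      for (const auto& sub : subs) {
        // Each subcompaction may add files to both output levels.
        sub.AddOutputsEdit(edit);
      }
    }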
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
new file mode 100644
index 000000000..1dc0fd232
--- /dev/null
+++ b/db/compaction/tiered_compaction_test.cc
@@ -0,0 +1,1253 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#if !defined(ROCKSDB_LITE)
+
+class TieredCompactionTest : public DBTestBase {
+ public:
+  TieredCompactionTest()
+      : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
+        kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
+        kBasicPerKeyPlacementCompStats(
+            CompactionReason::kUniversalSizeAmplification, 1),
+        kBasicFlushStats(CompactionReason::kFlush, 1) {
+    kBasicCompStats.micros = kHasValue;
+    kBasicCompStats.cpu_micros = kHasValue;
+    kBasicCompStats.bytes_read_non_output_levels = kHasValue;
+    kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
+    kBasicCompStats.num_input_records = kHasValue;
+    kBasicCompStats.num_dropped_records = kHasValue;
+
+    kBasicPerLevelStats.num_output_records = kHasValue;
+    kBasicPerLevelStats.bytes_written = kHasValue;
+    kBasicPerLevelStats.num_output_files = kHasValue;
+
+    kBasicPerKeyPlacementCompStats.micros = kHasValue;
+    kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
+    kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
+
+    kBasicFlushStats.micros = kHasValue;
+    kBasicFlushStats.cpu_micros = kHasValue;
+    kBasicFlushStats.bytes_written = kHasValue;
+    kBasicFlushStats.num_output_files = kHasValue;
+  }
+
+ protected:
+  static constexpr uint8_t kHasValue = 1;
+
+  InternalStats::CompactionStats kBasicCompStats;
+  InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
+  InternalStats::CompactionOutputsStats kBasicPerLevelStats;
+  InternalStats::CompactionStats kBasicFlushStats;
+
+  void SetUp() override {
+    SyncPoint::GetInstance()->SetCallBack(
+        "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+          auto supports_per_key_placement = static_cast<bool*>(arg);
+          *supports_per_key_placement = true;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+#ifndef ROCKSDB_LITE
+  uint64_t GetSstSizeHelper(Temperature temperature) {
+    std::string prop;
+    EXPECT_TRUE(dbfull()->GetProperty(
+        DB::Properties::kLiveSstFilesSizeAtTemperature +
+            std::to_string(static_cast<int>(temperature)),
+        &prop));
+    return static_cast<uint64_t>(std::atoi(prop.c_str()));
+  }
+#endif  // ROCKSDB_LITE
+
+  const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    return internal_stats->TEST_GetCompactionStats();
+  }
+
+  const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    return internal_stats->TEST_GetPerKeyPlacementCompactionStats();
+  }
+
+  // Verify the compaction stats; the stats are only roughly compared.
+  void VerifyCompactionStats(
+      const std::vector<InternalStats::CompactionStats>& expect_stats,
+      const InternalStats::CompactionStats& expect_pl_stats) {
+    const std::vector<InternalStats::CompactionStats>& stats =
+        GetCompactionStats();
+    const size_t kLevels = expect_stats.size();
+    ASSERT_EQ(kLevels, stats.size());
+
+    for (auto it = stats.begin(), expect = expect_stats.begin();
+         it != stats.end(); it++, expect++) {
+      VerifyCompactionStats(*it, *expect);
+    }
+
+    const InternalStats::CompactionStats& pl_stats =
+        GetPerKeyPlacementCompactionStats();
+    VerifyCompactionStats(pl_stats, expect_pl_stats);
+  }
+
+  void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
+                     InternalStats::CompactionStats& pl_stats) {
+    ASSERT_OK(dbfull()->ResetStats());
+    for (auto& level_stats : stats) {
+      level_stats.Clear();
+    }
+    pl_stats.Clear();
+  }
+
+ private:
+  void CompareStats(uint64_t val, uint64_t expect) {
+    if (expect > 0) {
+      ASSERT_TRUE(val > 0);
+    } else {
+      ASSERT_EQ(val, 0);
+    }
+  }
+
+  void VerifyCompactionStats(
+      const InternalStats::CompactionStats& stats,
+      const InternalStats::CompactionStats& expect_stats) {
+    CompareStats(stats.micros, expect_stats.micros);
+    CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
+    CompareStats(stats.bytes_read_non_output_levels,
+                 expect_stats.bytes_read_non_output_levels);
+    CompareStats(stats.bytes_read_output_level,
+                 expect_stats.bytes_read_output_level);
+    CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
+    CompareStats(stats.bytes_written, expect_stats.bytes_written);
+    CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
+    CompareStats(stats.num_input_files_in_non_output_levels,
+                 expect_stats.num_input_files_in_non_output_levels);
+    CompareStats(stats.num_input_files_in_output_level,
+                 expect_stats.num_input_files_in_output_level);
+    CompareStats(stats.num_output_files, expect_stats.num_output_files);
+    CompareStats(stats.num_output_files_blob,
+                 expect_stats.num_output_files_blob);
+    CompareStats(stats.num_input_records, expect_stats.num_input_records);
+    CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
+    CompareStats(stats.num_output_records, expect_stats.num_output_records);
+    ASSERT_EQ(stats.count, expect_stats.count);
+    for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
+         i++) {
+      ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
+    }
+  }
+};
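All of the tests below steer placement through sync points rather than real temperature logic: SetUp() forces SupportsPerKeyPlacement() to true, and each test installs a CompactionIterator::PrepareOutput.context callback that decides hot vs. cold per key. Condensed from the first test below (nothing here is new API):

    // Mark everything newer than a chosen sequence number as hot, so it is
    // placed on the penultimate level; older data goes to the last level.
    std::atomic_uint64_t latest_cold_seq{0};
    SyncPoint::GetInstance()->SetCallBack(
        "CompactionIterator::PrepareOutput.context", [&](void* arg) {
          auto context = static_cast<PerKeyPlacementContext*>(arg);
          context->output_to_penultimate_level =
              context->seq_num > latest_cold_seq;
        });
    SyncPoint::GetInstance()->EnableProcessing();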
+
+TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+  std::vector<SequenceNumber> seq_history;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+  InternalStats::CompactionStats expect_pl_stats;
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+    expect_stats[0].Add(kBasicFlushStats);
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // The penultimate-level file temperature is not cold, so all data is output
+  // to the penultimate level.
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // basic compaction stats are still counted to the last level
+  expect_stats[kLastLevel].Add(kBasicCompStats);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward to split the data into 2 levels, so we should
+  // have both last-level stats and output_to_penultimate_level stats.
+  latest_cold_seq = seq_history[0];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.num_dropped_records = 0;
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Delete all cold data, so all data will be on the penultimate level.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward again with a range delete; take a snapshot to
+  // keep the range dels in both cold and hot SSTs.
+  auto snap = db_->GetSnapshot();
+  latest_cold_seq = seq_history[2];
+  std::string start = Key(25), end = Key(35);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // verify data
+  std::string value;
+  for (int i = 0; i < kNumKeys; i++) {
+    if (i < 10 || (i >= 25 && i < 35)) {
+      ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+    } else {
+      ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    }
+  }
+
+  // range delete all hot data
+  start = Key(30);
+  end = Key(130);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped because of snapshot
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  // Releasing the snapshot and compacting again should remove all hot data.
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 2 range dels are dropped
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      3);
+
+  // Move the cold_seq backward; for example, the user may change the
+  // hot/cold data setting, but it won't impact the existing cold data, as
+  // the sequence numbers are zeroed out.
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_F(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+  auto cmp = options.comparator;
+
+  port::Mutex mutex;
+  std::string hot_start = Key(10);
+  std::string hot_end = Key(50);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        MutexLock l(&mutex);
+        context->output_to_penultimate_level =
+            cmp->Compare(context->key, hot_start) >= 0 &&
+            cmp->Compare(context->key, hot_end) < 0;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+  InternalStats::CompactionStats expect_pl_stats;
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+    expect_stats[0].Add(kBasicFlushStats);
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.Add(kBasicPerLevelStats);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // change to all cold, no output_to_penultimate_level output
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(100);
+    hot_end = Key(200);
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.num_dropped_records = 0;
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Change to all hot: universal compaction supports moving data up a level
+  // if it's within the compaction level range.
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(0);
+    hot_end = Key(100);
+  }
+
+  // No data is moved from the cold tier to the hot tier, because there are no
+  // input files from L5 or higher; it's not safe to move data to the
+  // output_to_penultimate_level level.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // Add 2 keys in a higher level, but in separate files; the keys within that
+  // range should be moved up to the penultimate level.
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(50), "value" + std::to_string(0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+
+  // Add an SST with a key range covering all the data we want to move from
+  // the last level to the penultimate level.
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Put(Key(99), "value" + std::to_string(0)));
+  ASSERT_OK(Flush());
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Change to only 1 key cold, to test that compaction can stop even if it
+  // matches the size-amp compaction threshold.
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(1);
+    hot_end = Key(1000);
+  }
+
+  // generate files just enough to trigger compaction
+  for (int i = 0; i < kNumTrigger - 1; i++) {
+    for (int j = 0; j < 1000; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(
+      true));  // make sure the compaction is able to finish
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  auto opts = db_->GetOptions();
+  auto max_size_amp =
+      opts.compaction_options_universal.max_size_amplification_percent / 100;
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown),
+            GetSstSizeHelper(Temperature::kCold) * max_size_amp);
+
+  // delete all cold data
+  ASSERT_OK(Delete(Key(0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Range-delete overlapping both hot and cold data, with a snapshot to make
+  // sure the range del is preserved.
+  auto snap = db_->GetSnapshot();
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(50);
+    hot_end = Key(100);
+  }
+  std::string start = Key(1), end = Key(70);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped until the snapshot is released
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  // verify data
+  std::string value;
+  for (int i = 0; i < kNumKeys; i++) {
+    if (i < 70) {
+      ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+    } else {
+      ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    }
+  }
+
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // range del is dropped
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      1);
+}
+
+TEST_F(TieredCompactionTest, LevelColdRangeDelete) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  auto options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  MoveFilesToLevel(kNumLevels - 1);
+
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  auto snap = db_->GetSnapshot();
+
+  std::string start = Key(10);
+  std::string end = Key(50);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+
+  // Keys 20->30 will be marked as cold data, but they cannot be placed on the
+  // cold tier (bottommost); otherwise, they would be "deleted" by the range
+  // del in the output_to_penultimate_level level. Verify that these data are
+  // still queryable.
+  for (int i = 20; i < 30; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  // make the range tombstone and data after that cold
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+
+  // add some hot data, just for the test
+  for (int i = 30; i < 40; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  std::string value;
+  for (int i = 0; i < kNumKeys; i++) {
+    auto s = db_->Get(ReadOptions(), Key(i), &value);
+    if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) {
+      ASSERT_TRUE(s.IsNotFound());
+    } else {
+      ASSERT_OK(s);
+    }
+  }
+
+  db_->ReleaseSnapshot(snap);
+}
+
+// Test SST partitioner cut after every single key
+class SingleKeySstPartitioner : public SstPartitioner {
+ public:
+  const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& /*request*/) override {
+    return kRequired;
+  }
+
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+};
+
+class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
+  }
+};
+
+TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 3;
+  const int kNumKeys = 10;
+
+  auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+  auto options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.sst_partitioner_factory = factory;
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(kNumLevels - 1);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ("0,0,10", FilesPerLevel());
+
+  auto snap = db_->GetSnapshot();
+
+  // only range delete
+  std::string start = Key(3);
+  std::string end = Key(5);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown),
+            0);  // tombstone has no size, even if it's in the hot tier
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ("0,1,10",
+            FilesPerLevel());  // one file is at the penultimate level which
+                               // only contains a range delete
+
+  // Add 2 hot keys, each in a new SST; they will be placed in the same level
+  // as the range del, but they don't overlap with the range del, so make sure
+  // the range del still stays there.
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Put(Key(0), "new value" + std::to_string(0)));
+  auto snap2 = db_->GetSnapshot();
+  ASSERT_OK(Put(Key(6), "new value" + std::to_string(6)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,2,10",
+            FilesPerLevel());  // one file is at the penultimate level
+                               // which only contains a range delete
+  std::vector<LiveFileMetaData> live_file_meta;
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  bool found_sst_with_del = false;
+  uint64_t sst_with_del_num = 0;
+  for (const auto& meta : live_file_meta) {
+    if (meta.num_deletions > 0) {
+      // found an SST with del, which has 2 entries: one for data, one for
+      // the range del
+      ASSERT_EQ(meta.level,
+                kNumLevels - 2);  // output to the penultimate level
+      ASSERT_EQ(meta.num_entries, 2);
+      ASSERT_EQ(meta.num_deletions, 1);
+      found_sst_with_del = true;
+      sst_with_del_num = meta.file_number;
+    }
+  }
+  ASSERT_TRUE(found_sst_with_del);
+
+  // release the first snapshot and compact, which should drop the range del;
+  // the newly inserted keys `0` and `6` are still hot data and will be placed
+  // on the penultimate level
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,2,7", FilesPerLevel());
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  found_sst_with_del = false;
+  for (const auto& meta : live_file_meta) {
+    // check for a new SST with del (the old one may not yet be deleted after
+    // compaction)
+    if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+      found_sst_with_del = true;
+    }
+  }
+  ASSERT_FALSE(found_sst_with_del);
+
+  // Now make all data cold: key 0 will be moved to the last level, but key 6
+  // is still protected by snap2, so it is kept at the penultimate level
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,1,8", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  db_->ReleaseSnapshot(snap2);
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,8", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_F(TieredCompactionTest, UniversalRangeDelete) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 10;
+
+  auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+
+  auto options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.statistics = CreateDBStatistics();
+  options.sst_partitioner_factory = factory;
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  // compact to the penultimate level with 10 files
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // make all data cold
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // a range del that is considered hot data, but it will be merged and
+  // deleted together with the last-level data
+  std::string start = Key(3);
+  std::string end = Key(5);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
+
+  // a range del with a snapshot should be preserved in the penultimate level
+  auto snap = db_->GetSnapshot();
+
+  start = Key(6);
+  end = Key(8);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel());
+
+  // Add 2 hot keys, each in a new SST. They will be placed in the same level
+  // as the range del, but they don't overlap with it.
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Put(Key(4), "new value" + std::to_string(0)));
+  auto snap2 = db_->GetSnapshot();
+  ASSERT_OK(Put(Key(9), "new value" + std::to_string(6)));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel());
+  // find the SST with the range del
+  std::vector<LiveFileMetaData> live_file_meta;
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  bool found_sst_with_del = false;
+  uint64_t sst_with_del_num = 0;
+  for (const auto& meta : live_file_meta) {
+    if (meta.num_deletions > 0) {
+      // found an SST with del, which has 2 entries: one for data, one for
+      // the range del
+      ASSERT_EQ(meta.level,
+                kNumLevels - 2);  // the output_to_penultimate_level level
+      ASSERT_EQ(meta.num_entries, 2);
+      ASSERT_EQ(meta.num_deletions, 1);
+      found_sst_with_del = true;
+      sst_with_del_num = meta.file_number;
+    }
+  }
+  ASSERT_TRUE(found_sst_with_del);
+
+  // release the first snapshot, which should drop the range del; the data on
+  // the same level is still hot
+  db_->ReleaseSnapshot(snap);
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel());
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  // no range del should be found in any SST
+  found_sst_with_del = false;
+  for (const auto& meta : live_file_meta) {
+    // check for a new SST with del (the old one may not yet be deleted after
+    // compaction)
+    if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+      found_sst_with_del = true;
+    }
+  }
+  ASSERT_FALSE(found_sst_with_del);
+
+  // make all data cold, but key 9 is still protected by snap2
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  db_->ReleaseSnapshot(snap2);
+
+  // release the snapshot; everything goes to the bottommost level
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
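+  // The callback below treats any key with a sequence number greater than
+  // latest_cold_seq as hot, i.e. it is routed to the penultimate level.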
+  std::vector<SequenceNumber> seq_history;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+  InternalStats::CompactionStats expect_pl_stats;
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    expect_stats[0].Add(kBasicFlushStats);
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // non-last-level compaction doesn't support per_key_placement
+  ASSERT_EQ("0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  expect_stats[1].Add(kBasicCompStats);
+  expect_stats[1].Add(kBasicPerLevelStats);
+  expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  MoveFilesToLevel(kLastLevel);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // the data should be all hot, and it's a last-level compaction, but all
+  // sequence numbers have been zeroed out, so the keys are still treated as
+  // old (cold) data.
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.num_dropped_records = 0;
+  last_stats.bytes_read_non_output_levels = 0;
+  last_stats.num_input_files_in_non_output_levels = 0;
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Add new data, which is all hot and overrides all existing data
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // after compaction, all data is hot
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  for (int level = 2; level < kNumLevels - 1; level++) {
+    expect_stats[level].bytes_moved = kHasValue;
+  }
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // move the cold_seq forward and try to split the data into cold and hot,
+  // but in this case it's unsafe to split the data
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  seq_history.clear();
+
+  // Add new data again
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // Try to split the last-level cold data into hot and cold, which is not
+  // supported
+  latest_cold_seq = seq_history[0];
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  auto comp_stats = kBasicCompStats;
+  comp_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  const int bottommost_level = 5;
+  expect_stats[bottommost_level].Add(comp_stats);
+  expect_stats[bottommost_level].Add(
+      comp_stats);  // the bottommost level has 2 compactions
+  expect_stats[bottommost_level].Add(kBasicPerLevelStats);
+  expect_stats[bottommost_level].bytes_read_output_level = kHasValue;
+  expect_stats[bottommost_level].num_input_files_in_output_level = kHasValue;
+
+  for (int level = 2; level < bottommost_level; level++) {
+    expect_stats[level].bytes_moved = kHasValue;
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // manually move all data (cold) to the last level
+  MoveFilesToLevel(kLastLevel);
+  seq_history.clear();
+  // Add new data once again
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  latest_cold_seq = seq_history[0];
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // delete all cold data
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  MoveFilesToLevel(kLastLevel);
+
+  // move the cold_seq forward again, with a range delete; take a snapshot to
+  // keep the range dels in the bottommost level
+  auto snap = db_->GetSnapshot();
+  latest_cold_seq = seq_history[2];
+  std::string start = Key(25), end = Key(35);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  // add one small key and one large key in the input level, to make sure it's
+  // able to move hot data to the input level within that range
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Put(Key(100), "value" + std::to_string(0)));
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
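+  // At this point one hot file sits on the penultimate level and the cold
+  // data (including the snapshot-pinned range del) sits on the bottommost
+  // level, hence the non-zero sizes for both temperatures.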
+  // verify data
+  std::string value;
+  for (int i = 1; i < 130; i++) {
+    if (i < 10 || (i >= 25 && i < 35)) {
+      ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+    } else {
+      ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    }
+  }
+
+  // delete all hot data
+  ASSERT_OK(Delete(Key(0)));
+  start = Key(30);
+  end = Key(101);  // the range [101, 130] is cold because it was not in the
+                   // input range of the previous compaction
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped because of the snapshot
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  db_->ReleaseSnapshot(snap);
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 3 range dels are dropped; the first one is double counted, as expected,
+  // because it was spread across 2 SST files
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      3);
+
+  // move the cold_seq backward, which may happen when the user changes the
+  // setting. The hot data won't move up; this just makes sure it still runs
+  // fine, because:
+  // 1. sequence numbers are zeroed out, so there is no time information
+  // 2. leveled compaction only supports moving data up within the
+  //    higher-level input range
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_F(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  auto options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kCold;
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+  auto cmp = options.comparator;
+
+  port::Mutex mutex;
+  std::string hot_start = Key(10);
+  std::string hot_end = Key(50);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        MutexLock l(&mutex);
+        context->output_to_penultimate_level =
+            cmp->Compare(context->key, hot_start) >= 0 &&
+            cmp->Compare(context->key, hot_end) < 0;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all cold
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(100);
+    hot_end = Key(200);
+  }
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all hot, but leveled compaction only supports moving cold data
+  // to hot within its higher-level input range.
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(0);
+    hot_end = Key(100);
+  }
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // with mixed hot/cold data
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(50);
+    hot_end = Key(100);
+  }
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Put(Key(100), "value" + std::to_string(100)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // delete all hot data, but with a snapshot to keep the range del
+  auto snap = db_->GetSnapshot();
+  std::string start = Key(50);
+  std::string end = Key(100);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped because of the snapshot
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  // releasing the snapshot and compacting again should remove all hot data
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      1);
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index e48e8f2d5..1d615a425 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -30,10 +30,100 @@ namespace ROCKSDB_NAMESPACE {
 
 // SYNC_POINT is not supported in released Windows mode.
 #if !defined(ROCKSDB_LITE)
 
+class CompactionStatsCollector : public EventListener {
+ public:
+  CompactionStatsCollector()
+      : compaction_completed_(
+            static_cast<int>(CompactionReason::kNumOfReasons)) {
+    for (auto& v : compaction_completed_) {
+      v.store(0);
+    }
+  }
+
+  ~CompactionStatsCollector() override {}
+
+  void OnCompactionCompleted(DB* /* db */,
+                             const CompactionJobInfo& info) override {
+    int k = static_cast<int>(info.compaction_reason);
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    assert(k >= 0 && k < num_of_reasons);
+    compaction_completed_[k]++;
+  }
+
+  void OnExternalFileIngested(
+      DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+    compaction_completed_[k]++;
+  }
+
+  void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kFlush);
+    compaction_completed_[k]++;
+  }
+
+  int NumberOfCompactions(CompactionReason reason) const {
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    int k = static_cast<int>(reason);
+    assert(k >= 0 && k < num_of_reasons);
+    return compaction_completed_.at(k).load();
+  }
+
+ private:
+  std::vector<std::atomic<int>> compaction_completed_;
+};
+
 class DBCompactionTest : public DBTestBase {
  public:
   DBCompactionTest()
       : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
+
+ protected:
+#ifndef ROCKSDB_LITE
+  uint64_t GetSstSizeHelper(Temperature temperature) {
+    std::string prop;
+    EXPECT_TRUE(dbfull()->GetProperty(
+        DB::Properties::kLiveSstFilesSizeAtTemperature +
+            std::to_string(static_cast<uint8_t>(temperature)),
+        &prop));
+    return static_cast<uint64_t>(std::atoi(prop.c_str()));
+  }
+#endif  // ROCKSDB_LITE
+
+  /*
+   * Verifies compaction stats of cfd are valid.
+   *
+   * For each level of cfd, its compaction stats are valid if
+   * 1) sum(stat.counts) == stat.count, and
+   * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+   */
+  void VerifyCompactionStats(ColumnFamilyData& cfd,
+                             const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+    InternalStats* internal_stats_ptr = cfd.internal_stats();
+    ASSERT_NE(internal_stats_ptr, nullptr);
+    const std::vector<InternalStats::CompactionStats>& comp_stats =
+        internal_stats_ptr->TEST_GetCompactionStats();
+    const int num_of_reasons =
+        static_cast<int>(CompactionReason::kNumOfReasons);
+    std::vector<int> counts(num_of_reasons, 0);
+    // Count the number of compactions caused by each CompactionReason across
+    // all levels.
+    for (const auto& stat : comp_stats) {
+      int sum = 0;
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += stat.counts[i];
+        sum += stat.counts[i];
+      }
+      ASSERT_EQ(sum, stat.count);
+    }
+    // Verify InternalStats bookkeeping matches that of
+    // CompactionStatsCollector, assuming that all compactions complete.
+    for (int i = 0; i < num_of_reasons; i++) {
+      ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
+                counts[i]);
+    }
+#endif /* NDEBUG */
+  }
 };
 
 class DBCompactionTestWithParam
@@ -110,47 +200,6 @@ class FlushedFileCollector : public EventListener {
   std::mutex mutex_;
 };
 
-class CompactionStatsCollector : public EventListener {
-public:
-  CompactionStatsCollector()
-      : compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) {
-    for (auto& v : compaction_completed_) {
-      v.store(0);
-    }
-  }
-
-  ~CompactionStatsCollector() override {}
-
-  void OnCompactionCompleted(DB* /* db */,
-                             const CompactionJobInfo& info) override {
-    int k = static_cast<int>(info.compaction_reason);
-    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
-    assert(k >= 0 && k < num_of_reasons);
-    compaction_completed_[k]++;
-  }
-
-  void OnExternalFileIngested(
-      DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
-    int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
-    compaction_completed_[k]++;
-  }
-
-  void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
-    int k = static_cast<int>(CompactionReason::kFlush);
-    compaction_completed_[k]++;
-  }
-
-  int NumberOfCompactions(CompactionReason reason) const {
-    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
-    int k = static_cast<int>(reason);
-    assert(k >= 0 && k < num_of_reasons);
-    return compaction_completed_.at(k).load();
-  }
-
-private:
-  std::vector<std::atomic<int>> compaction_completed_;
-};
-
 class SstStatsCollector : public EventListener {
  public:
   SstStatsCollector() : num_ssts_creation_started_(0) {}
@@ -247,40 +296,6 @@ void VerifyCompactionResult(
 #endif
 }
 
-/*
- * Verifies compaction stats of cfd are valid.
- *
- * For each level of cfd, its compaction stats are valid if
- * 1) sum(stat.counts) == stat.count, and
- * 2) stat.counts[i] == collector.NumberOfCompactions(i)
- */
-void VerifyCompactionStats(ColumnFamilyData& cfd,
-                           const CompactionStatsCollector& collector) {
-#ifndef NDEBUG
-  InternalStats* internal_stats_ptr = cfd.internal_stats();
-  ASSERT_NE(internal_stats_ptr, nullptr);
-  const std::vector<InternalStats::CompactionStats>& comp_stats =
-      internal_stats_ptr->TEST_GetCompactionStats();
-  const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
-  std::vector<int> counts(num_of_reasons, 0);
-  // Count the number of compactions caused by each CompactionReason across
-  // all levels.
-  for (const auto& stat : comp_stats) {
-    int sum = 0;
-    for (int i = 0; i < num_of_reasons; i++) {
-      counts[i] += stat.counts[i];
-      sum += stat.counts[i];
-    }
-    ASSERT_EQ(sum, stat.count);
-  }
-  // Verify InternalStats bookkeeping matches that of CompactionStatsCollector,
-  // assuming that all compactions complete.
-  for (int i = 0; i < num_of_reasons; i++) {
-    ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]);
-  }
-#endif /* NDEBUG */
-}
-
 const SstFileMetaData* PickFileRandomly(
     const ColumnFamilyMetaData& cf_meta,
     Random* rand,
diff --git a/db/internal_stats.h b/db/internal_stats.h
index f1ede3f82..73c1f29e7 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -20,8 +20,6 @@
 #include "rocksdb/system_clock.h"
 #include "util/hash_containers.h"
 
-class ColumnFamilyData;
-
 namespace ROCKSDB_NAMESPACE {
 
 template <class Stats>
@@ -140,6 +138,23 @@ class InternalStats {
 
   InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
 
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
   // Per level compaction stats.  comp_stats_[level] stores the stats for
   // compactions that produced data for the specified "level".
   struct CompactionStats {
@@ -184,11 +199,14 @@
     // (num input entries - num output entries) for compaction levels N and N+1
     uint64_t num_dropped_records;
 
+    // Total output entries from compaction
+    uint64_t num_output_records;
+
     // Number of compactions done
     int count;
 
     // Number of compactions done per CompactionReason
-    int counts[static_cast<int>(CompactionReason::kNumOfReasons)];
+    int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
 
     explicit CompactionStats()
         : micros(0),
@@ -205,6 +223,7 @@
           num_output_files_blob(0),
           num_input_records(0),
           num_dropped_records(0),
+          num_output_records(0),
           count(0) {
       int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
       for (int i = 0; i < num_of_reasons; i++) {
@@ -227,6 +246,7 @@
           num_output_files_blob(0),
           num_input_records(0),
          num_dropped_records(0),
+          num_output_records(0),
           count(c) {
       int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
       for (int i = 0; i < num_of_reasons; i++) {
@@ -240,7 +260,7 @@
       }
     }
 
-    explicit CompactionStats(const CompactionStats& c)
+    CompactionStats(const CompactionStats& c)
         : micros(c.micros),
          cpu_micros(c.cpu_micros),
          bytes_read_non_output_levels(c.bytes_read_non_output_levels),
@@ -256,6 +276,7 @@
           num_output_files_blob(c.num_output_files_blob),
           num_input_records(c.num_input_records),
           num_dropped_records(c.num_dropped_records),
+          num_output_records(c.num_output_records),
           count(c.count) {
      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
      for (int i = 0; i < num_of_reasons; i++) {
@@ -279,6 +300,7 @@
       num_output_files_blob = c.num_output_files_blob;
       num_input_records = c.num_input_records;
       num_dropped_records = c.num_dropped_records;
+      num_output_records = c.num_output_records;
       count = c.count;
 
       int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
@@ -303,6 +325,7 @@
       this->num_output_files_blob = 0;
       this->num_input_records = 0;
       this->num_dropped_records = 0;
+      this->num_output_records = 0;
       this->count = 0;
       int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
       for (int i = 0; i < num_of_reasons; i++) {
@@ -327,6 +350,7 @@
       this->num_output_files_blob += c.num_output_files_blob;
       this->num_input_records += c.num_input_records;
       this->num_dropped_records += c.num_dropped_records;
+      this->num_output_records += c.num_output_records;
       this->count += c.count;
       int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
       for (int i = 0; i < num_of_reasons; i++) {
@@ -334,6 +358,15 @@
         counts[i] += c.counts[i];
       }
     }
 
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_files += static_cast<int>(stats.num_output_files);
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files_blob +=
+          static_cast<int>(stats.num_output_files_blob);
+    }
+
     void Subtract(const CompactionStats& c) {
       this->micros -= c.micros;
       this->cpu_micros -= c.cpu_micros;
@@ -351,12 +384,70 @@
       this->num_output_files_blob -= c.num_output_files_blob;
       this->num_input_records -= c.num_input_records;
       this->num_dropped_records -= c.num_dropped_records;
+      this->num_output_records -= c.num_output_records;
       this->count -= c.count;
       int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
       for (int i = 0; i < num_of_reasons; i++) {
         counts[i] -= c.counts[i];
       }
     }
+
+    void ResetCompactionReason(CompactionReason reason) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      assert(count == 1);  // only supports updating one compaction reason
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      assert(r >= 0 && r < num_of_reasons);
+      counts[r] = 1;
+    }
+  };
+
+  // Compaction stats.  For a per_key_placement compaction, it includes stats
+  // for 2 output levels: the last level and the penultimate level.
+  struct CompactionStatsFull {
+    // the stats for the target primary output level
+    CompactionStats stats;
+
+    // stats for the penultimate level output, if it exists
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+
+    explicit CompactionStatsFull(CompactionReason reason, int c)
+        : stats(reason, c), penultimate_level_stats(reason, c){};
+
+    uint64_t TotalBytesWritten() const {
+      uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+      if (has_penultimate_level_output) {
+        bytes_written += penultimate_level_stats.bytes_written +
+                         penultimate_level_stats.bytes_written_blob;
+      }
+      return bytes_written;
+    }
+
+    uint64_t DroppedRecords() {
+      uint64_t output_records = stats.num_output_records;
+      if (has_penultimate_level_output) {
+        output_records += penultimate_level_stats.num_output_records;
+      }
+      if (stats.num_input_records > output_records) {
+        return stats.num_input_records - output_records;
+      }
+      return 0;
+    }
+
+    void SetMicros(uint64_t val) {
+      stats.micros = val;
+      penultimate_level_stats.micros = val;
+    }
+
+    void AddCpuMicros(uint64_t val) {
+      stats.cpu_micros += val;
+      penultimate_level_stats.cpu_micros += val;
+    }
+  };
 
   // For use with CacheEntryStatsCollector
@@ -403,6 +494,7 @@
     for (auto& comp_stat : comp_stats_) {
       comp_stat.Clear();
     }
+    per_key_placement_comp_stats_.Clear();
     for (auto& h : file_read_latency_) {
       h.Clear();
     }
@@ -419,6 +511,15 @@
     comp_stats_by_pri_[thread_pri].Add(stats);
   }
 
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStatsFull& comp_stats_full) {
+    AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+    if (comp_stats_full.has_penultimate_level_output) {
+      per_key_placement_comp_stats_.Add(
+          comp_stats_full.penultimate_level_stats);
+    }
+  }
+
   void IncBytesMoved(int level, uint64_t amount) {
     comp_stats_[level].bytes_moved += amount;
   }
@@ -479,6 +580,10 @@
     return comp_stats_;
   }
 
+  const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
+    return per_key_placement_comp_stats_;
+  }
+
   void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
 
   // Store a mapping from the user-facing DB::Properties string to our
@@ -518,6 +623,7 @@
   // Per-ColumnFamily/level compaction stats
   std::vector<CompactionStats> comp_stats_;
   std::vector<CompactionStats> comp_stats_by_pri_;
+  CompactionStats per_key_placement_comp_stats_;
   std::vector<HistogramImpl> file_read_latency_;
   HistogramImpl blob_file_read_latency_;
 
@@ -749,6 +855,23 @@
   InternalStats(int /*num_levels*/, SystemClock* /*clock*/,
                 ColumnFamilyData* /*cfd*/) {}
 
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
   struct CompactionStats {
     uint64_t micros;
     uint64_t cpu_micros;
@@ -764,6 +887,7 @@
     int num_output_files_blob;
     uint64_t num_input_records;
     uint64_t num_dropped_records;
+    uint64_t num_output_records;
     int count;
 
     explicit CompactionStats() {}
@@ -774,12 +898,38 @@
 
     void Add(const CompactionStats& /*c*/) {}
 
+    void Add(const CompactionOutputsStats& /*c*/) {}
+
     void Subtract(const CompactionStats& /*c*/) {}
   };
 
+  struct CompactionStatsFull {
+    // the stats for the target primary output level (per level stats)
+    CompactionStats stats;
+
+    // stats for the output_to_penultimate_level level (per level stats)
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull(){};
+
+    explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){};
+
+    uint64_t TotalBytesWritten() const { return 0; }
+
+    uint64_t DroppedRecords() { return 0; }
+
+    void SetMicros(uint64_t /*val*/){};
+
+    void AddCpuMicros(uint64_t /*val*/){};
+  };
+
   void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
                           const CompactionStats& /*stats*/) {}
 
+  void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+                          const CompactionStatsFull& /*unmerged_stats*/) {}
+
   void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
 
   void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h
index 626f3202f..0d6fbd028 100644
--- a/include/rocksdb/compaction_job_stats.h
+++ b/include/rocksdb/compaction_job_stats.h
@@ -102,5 +102,7 @@
 
   // number of single-deletes which meet something other than a put
   uint64_t num_single_del_mismatch;
+
+  // TODO: Add output_to_penultimate_level output information
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index b7a5325b3..e892ca899 100644
--- a/src.mk
+++ b/src.mk
@@ -33,7 +33,11 @@ LIB_SOURCES = \
   db/compaction/compaction_picker_fifo.cc \
   db/compaction/compaction_picker_level.cc \
   db/compaction/compaction_picker_universal.cc \
+  db/compaction/compaction_service_job.cc \
+  db/compaction/compaction_state.cc \
+  db/compaction/compaction_outputs.cc \
   db/compaction/sst_partitioner.cc \
+  db/compaction/subcompaction_state.cc \
   db/convenience.cc \
   db/db_filesnapshot.cc \
   db/db_impl/compacted_db_impl.cc \
@@ -433,6 +437,7 @@ TEST_MAIN_SOURCES = \
   db/compaction/compaction_job_stats_test.cc \
   db/compaction/compaction_picker_test.cc \
   db/compaction/compaction_service_test.cc \
+  db/compaction/tiered_compaction_test.cc \
  db/comparator_db_test.cc \
  db/corruption_test.cc \
  db/cuckoo_table_db_test.cc \
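For reference, a minimal usage sketch (not part of the patch) of the
tiered-placement option and the per-temperature size property that the tests
above rely on. The database path and key are made up for illustration, and
error handling is reduced to asserts:

  #include <cassert>
  #include <string>

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"

  int main() {
    rocksdb::Options options;
    options.create_if_missing = true;
    // Data compacted into the bottommost level is written as "cold"; with
    // per-key placement, hot keys can stay on the penultimate level.
    options.bottommost_temperature = rocksdb::Temperature::kCold;

    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/tiered_demo", &db);
    assert(s.ok());

    s = db->Put(rocksdb::WriteOptions(), "key", "value");
    assert(s.ok());
    s = db->Flush(rocksdb::FlushOptions());
    assert(s.ok());
    s = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
    assert(s.ok());

    // Per-temperature live SST size, the same property that the tests'
    // GetSstSizeHelper() queries.
    std::string cold_size;
    db->GetProperty(rocksdb::DB::Properties::kLiveSstFilesSizeAtTemperature +
                        std::to_string(static_cast<uint8_t>(
                            rocksdb::Temperature::kCold)),
                    &cold_size);

    delete db;
    return 0;
  }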