// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include "db/version_set.h" #include "memory/arena.h" #include "options/cf_options.h" #include "rocksdb/sst_partitioner.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { // The file contains class Compaction, as well as some helper functions // and data structures used by the class. // Utility for comparing sstable boundary keys. Returns -1 if either a or b is // null which provides the property that a==null indicates a key that is less // than any key and b==null indicates a key that is greater than any key. Note // that the comparison is performed primarily on the user-key portion of the // key. If the user-keys compare equal, an additional test is made to sort // range tombstone sentinel keys before other keys with the same user-key. The // result is that 2 user-keys will compare equal if they differ purely on // their sequence number and value, but the range tombstone sentinel for that // user-key will compare not equal. This is necessary because the range // tombstone sentinel key is set as the largest key for an sstable even though // that key never appears in the database. We don't want adjacent sstables to // be considered overlapping if they are separated by the range tombstone // sentinel. int sstableKeyCompare(const Comparator* user_cmp, const Slice&, const Slice&); inline int sstableKeyCompare(const Comparator* user_cmp, const Slice& a, const InternalKey& b) { return sstableKeyCompare(user_cmp, a, b.Encode()); } inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, const Slice& b) { return sstableKeyCompare(user_cmp, a.Encode(), b); } inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, const InternalKey& b) { return sstableKeyCompare(user_cmp, a.Encode(), b.Encode()); } int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, const InternalKey& b); int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, const InternalKey* b); // An AtomicCompactionUnitBoundary represents a range of keys [smallest, // largest] that exactly spans one ore more neighbouring SSTs on the same // level. Every pair of SSTs in this range "overlap" (i.e., the largest // user key of one file is the smallest user key of the next file). These // boundaries are propagated down to RangeDelAggregator during compaction // to provide safe truncation boundaries for range tombstones. struct AtomicCompactionUnitBoundary { const InternalKey* smallest = nullptr; const InternalKey* largest = nullptr; }; // The structure that manages compaction input files associated // with the same physical level. struct CompactionInputFiles { int level; std::vector files; std::vector atomic_compaction_unit_boundaries; inline bool empty() const { return files.empty(); } inline size_t size() const { return files.size(); } inline void clear() { files.clear(); } inline FileMetaData* operator[](size_t i) const { return files[i]; } }; class Version; class ColumnFamilyData; class VersionStorageInfo; class CompactionFilter; // A Compaction encapsulates metadata about a compaction. class Compaction { public: Compaction(VersionStorageInfo* input_version, const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, std::vector inputs, int output_level, uint64_t target_file_size, uint64_t max_compaction_bytes, uint32_t output_path_id, CompressionType compression, CompressionOptions compression_opts, Temperature output_temperature, uint32_t max_subcompactions, std::vector grandparents, bool manual_compaction = false, const std::string& trim_ts = "", double score = -1, bool deletion_compaction = false, bool l0_files_might_overlap = true, CompactionReason compaction_reason = CompactionReason::kUnknown, BlobGarbageCollectionPolicy blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kUseDefault, double blob_garbage_collection_age_cutoff = -1); // The type of the penultimate level output range enum class PenultimateOutputRangeType : int { kNotSupported, // it cannot output to the penultimate level kFullRange, // any data could be output to the penultimate level kNonLastRange, // only the keys within non_last_level compaction inputs can // be outputted to the penultimate level kDisabled, // no data can be outputted to the penultimate level }; // No copying allowed Compaction(const Compaction&) = delete; void operator=(const Compaction&) = delete; ~Compaction(); // Returns the level associated to the specified compaction input level. // If compaction_input_level is not specified, then input_level is set to 0. int level(size_t compaction_input_level = 0) const { return inputs_[compaction_input_level].level; } int start_level() const { return start_level_; } // Outputs will go to this level int output_level() const { return output_level_; } // Returns the number of input levels in this compaction. size_t num_input_levels() const { return inputs_.size(); } // Return the object that holds the edits to the descriptor done // by this compaction. VersionEdit* edit() { return &edit_; } // Returns the number of input files associated to the specified // compaction input level. // The function will return 0 if when "compaction_input_level" < 0 // or "compaction_input_level" >= "num_input_levels()". size_t num_input_files(size_t compaction_input_level) const { if (compaction_input_level < inputs_.size()) { return inputs_[compaction_input_level].size(); } return 0; } // Returns input version of the compaction Version* input_version() const { return input_version_; } // Returns the ColumnFamilyData associated with the compaction. ColumnFamilyData* column_family_data() const { return cfd_; } // Returns the file meta data of the 'i'th input file at the // specified compaction input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and // < "input_levels()" FileMetaData* input(size_t compaction_input_level, size_t i) const { assert(compaction_input_level < inputs_.size()); return inputs_[compaction_input_level][i]; } const std::vector* boundaries( size_t compaction_input_level) const { assert(compaction_input_level < inputs_.size()); return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; } // Returns the list of file meta data of the specified compaction // input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and // < "input_levels()" const std::vector* inputs( size_t compaction_input_level) const { assert(compaction_input_level < inputs_.size()); return &inputs_[compaction_input_level].files; } const std::vector* inputs() { return &inputs_; } // Returns the LevelFilesBrief of the specified compaction input level. const LevelFilesBrief* input_levels(size_t compaction_input_level) const { return &input_levels_[compaction_input_level]; } // Maximum size of files to build during this compaction. uint64_t max_output_file_size() const { return max_output_file_size_; } // Target output file size for this compaction uint64_t target_output_file_size() const { return target_output_file_size_; } // What compression for output CompressionType output_compression() const { return output_compression_; } // What compression options for output const CompressionOptions& output_compression_opts() const { return output_compression_opts_; } // Whether need to write output file to second DB path. uint32_t output_path_id() const { return output_path_id_; } // Is this a trivial compaction that can be implemented by just // moving a single input file to the next level (no merging or splitting) bool IsTrivialMove() const; // The split user key in the output level if this compaction is required to // split the output files according to the existing cursor in the output // level under round-robin compaction policy. Empty indicates no required // splitting key const InternalKey* GetOutputSplitKey() const { return output_split_key_; } // If true, then the compaction can be done by simply deleting input files. bool deletion_compaction() const { return deletion_compaction_; } // Add all inputs to this compaction as delete operations to *edit. void AddInputDeletions(VersionEdit* edit); // Returns true if the available information we have guarantees that // the input "user_key" does not exist in any level beyond `output_level()`. bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, std::vector* level_ptrs) const; // Returns true if the user key range [begin_key, end_key) does not exist // in any level beyond `output_level()`. // Used for checking range tombstones, so we assume begin_key < end_key. // begin_key and end_key should include timestamp if enabled. bool KeyRangeNotExistsBeyondOutputLevel( const Slice& begin_key, const Slice& end_key, std::vector* level_ptrs) const; // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. // // Requirement: DB mutex held void ReleaseCompactionFiles(Status status); // Returns the summary of the compaction in "output" with maximum "len" // in bytes. The caller is responsible for the memory management of // "output". void Summary(char* output, int len); // Return the score that was used to pick this compaction run. double score() const { return score_; } // Is this compaction creating a file in the bottom most level? bool bottommost_level() const { return bottommost_level_; } // Is the compaction compact to the last level bool is_last_level() const { return output_level_ == immutable_options_.num_levels - 1; } // Does this compaction include all sst files? bool is_full_compaction() const { return is_full_compaction_; } // Was this compaction triggered manually by the client? bool is_manual_compaction() const { return is_manual_compaction_; } std::string trim_ts() const { return trim_ts_; } // Used when allow_trivial_move option is set in // Universal compaction. If all the input files are // non overlapping, then is_trivial_move_ variable // will be set true, else false void set_is_trivial_move(bool trivial_move) { is_trivial_move_ = trivial_move; } // Used when allow_trivial_move option is set in // Universal compaction. Returns true, if the input files // are non-overlapping and can be trivially moved. bool is_trivial_move() const { return is_trivial_move_; } // How many total levels are there? int number_levels() const { return number_levels_; } // Return the ImmutableOptions that should be used throughout the compaction // procedure const ImmutableOptions* immutable_options() const { return &immutable_options_; } // Return the MutableCFOptions that should be used throughout the compaction // procedure const MutableCFOptions* mutable_cf_options() const { return &mutable_cf_options_; } // Returns the size in bytes that the output file should be preallocated to. // In level compaction, that is max_file_size_. In universal compaction, that // is the sum of all input file sizes. uint64_t OutputFilePreallocationSize() const; void SetInputVersion(Version* input_version); struct InputLevelSummaryBuffer { char buffer[128]; }; const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const; uint64_t CalculateTotalInputSize() const; // In case of compaction error, reset the nextIndex that is used // to pick up the next file to be compacted from files_by_size_ void ResetNextCompactionIndex(); // Create a CompactionFilter from compaction_filter_factory std::unique_ptr CreateCompactionFilter() const; // Create a SstPartitioner from sst_partitioner_factory std::unique_ptr CreateSstPartitioner() const; // Is the input level corresponding to output_level_ empty? bool IsOutputLevelEmpty() const; // Should this compaction be broken up into smaller ones run in parallel? bool ShouldFormSubcompactions() const; // Returns true iff at least one input file references a blob file. // // PRE: input version has been set. bool DoesInputReferenceBlobFiles() const; // test function to validate the functionality of IsBottommostLevel() // function -- determines if compaction with inputs and storage is bottommost static bool TEST_IsBottommostLevel( int output_level, VersionStorageInfo* vstorage, const std::vector& inputs); TablePropertiesCollection GetOutputTableProperties() const { return output_table_properties_; } void SetOutputTableProperties(TablePropertiesCollection tp) { output_table_properties_ = std::move(tp); } Slice GetSmallestUserKey() const { return smallest_user_key_; } Slice GetLargestUserKey() const { return largest_user_key_; } Slice GetPenultimateLevelSmallestUserKey() const { return penultimate_level_smallest_user_key_; } Slice GetPenultimateLevelLargestUserKey() const { return penultimate_level_largest_user_key_; } PenultimateOutputRangeType GetPenultimateOutputRangeType() const { return penultimate_output_range_type_; } // Return true if the compaction supports per_key_placement bool SupportsPerKeyPlacement() const; // Get per_key_placement penultimate output level, which is `last_level - 1` // if per_key_placement feature is supported. Otherwise, return -1. int GetPenultimateLevel() const; // Return true if the given range is overlap with penultimate level output // range. // Both smallest_key and largest_key include timestamps if user-defined // timestamp is enabled. bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key, const Slice& largest_key) const; // Return true if the key is within penultimate level output range for // per_key_placement feature, which is safe to place the key to the // penultimate level. different compaction strategy has different rules. // If per_key_placement is not supported, always return false. // TODO: currently it doesn't support moving data from the last level to the // penultimate level // key includes timestamp if user-defined timestamp is enabled. bool WithinPenultimateLevelOutputRange(const Slice& key) const; CompactionReason compaction_reason() const { return compaction_reason_; } const std::vector& grandparents() const { return grandparents_; } uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } Temperature output_temperature() const { return output_temperature_; } uint32_t max_subcompactions() const { return max_subcompactions_; } bool enable_blob_garbage_collection() const { return enable_blob_garbage_collection_; } double blob_garbage_collection_age_cutoff() const { return blob_garbage_collection_age_cutoff_; } // start and end are sub compact range. Null if no boundary. // This is used to filter out some input files' ancester's time range. uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, const InternalKey* end) const; // Return the minimum epoch number among // input files' associated with this compaction uint64_t MinInputFileEpochNumber() const; // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of // compaction begin and compaction completion callbacks match. void SetNotifyOnCompactionCompleted() { notify_on_compaction_completion_ = true; } bool ShouldNotifyOnCompactionCompleted() const { return notify_on_compaction_completion_; } static constexpr int kInvalidLevel = -1; // Evaluate penultimate output level. If the compaction supports // per_key_placement feature, it returns the penultimate level number. // Otherwise, it's set to kInvalidLevel (-1), which means // output_to_penultimate_level is not supported. // Note: even the penultimate level output is supported (PenultimateLevel != // kInvalidLevel), some key range maybe unsafe to be outputted to the // penultimate level. The safe key range is populated by // `PopulatePenultimateLevelOutputRange()`. // Which could potentially disable all penultimate level output. static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage, const ImmutableOptions& immutable_options, const int start_level, const int output_level); private: // mark (or clear) all files that are being compacted void MarkFilesBeingCompacted(bool mark_as_compacted); // get the smallest and largest key present in files to be compacted static void GetBoundaryKeys(VersionStorageInfo* vstorage, const std::vector& inputs, Slice* smallest_key, Slice* largest_key, int exclude_level = -1); // populate penultimate level output range, which will be used to determine if // a key is safe to output to the penultimate level (details see // `Compaction::WithinPenultimateLevelOutputRange()`. void PopulatePenultimateLevelOutputRange(); // Get the atomic file boundaries for all files in the compaction. Necessary // in order to avoid the scenario described in // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and // plumb down appropriate key boundaries to RangeDelAggregator during // compaction. static std::vector PopulateWithAtomicBoundaries( VersionStorageInfo* vstorage, std::vector inputs); // helper function to determine if compaction with inputs and storage is // bottommost static bool IsBottommostLevel( int output_level, VersionStorageInfo* vstorage, const std::vector& inputs); static bool IsFullCompaction(VersionStorageInfo* vstorage, const std::vector& inputs); VersionStorageInfo* input_vstorage_; const int start_level_; // the lowest level to be compacted const int output_level_; // levels to which output files are stored uint64_t target_output_file_size_; uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; uint32_t max_subcompactions_; const ImmutableOptions immutable_options_; const MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit edit_; const int number_levels_; ColumnFamilyData* cfd_; Arena arena_; // Arena used to allocate space for file_levels_ const uint32_t output_path_id_; CompressionType output_compression_; CompressionOptions output_compression_opts_; Temperature output_temperature_; // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; // should it split the output file using the compact cursor? const InternalKey* output_split_key_; // L0 files in LSM-tree might be overlapping. But the compaction picking // logic might pick a subset of the files that aren't overlapping. if // that is the case, set the value to false. Otherwise, set it true. bool l0_files_might_overlap_; // Compaction input files organized by level. Constant after construction const std::vector inputs_; // A copy of inputs_, organized more closely in memory autovector input_levels_; // State used to check for number of overlapping grandparent files // (grandparent == "output_level_ + 1") std::vector grandparents_; const double score_; // score that was used to pick this compaction. // Is this compaction creating a file in the bottom most level? const bool bottommost_level_; // Does this compaction include all sst files? const bool is_full_compaction_; // Is this compaction requested by the client? const bool is_manual_compaction_; // The data with timestamp > trim_ts_ will be removed const std::string trim_ts_; // True if we can do trivial move in Universal multi level // compaction bool is_trivial_move_; // Does input compression match the output compression? bool InputCompressionMatchesOutput() const; // table properties of output files TablePropertiesCollection output_table_properties_; // smallest user keys in compaction // includes timestamp if user-defined timestamp is enabled. Slice smallest_user_key_; // largest user keys in compaction // includes timestamp if user-defined timestamp is enabled. Slice largest_user_key_; // Reason for compaction CompactionReason compaction_reason_; // Notify on compaction completion only if listener was notified on compaction // begin. bool notify_on_compaction_completion_; // Enable/disable GC collection for blobs during compaction. bool enable_blob_garbage_collection_; // Blob garbage collection age cutoff. double blob_garbage_collection_age_cutoff_; // only set when per_key_placement feature is enabled, -1 (kInvalidLevel) // means not supported. const int penultimate_level_; // Key range for penultimate level output // includes timestamp if user-defined timestamp is enabled. // penultimate_output_range_type_ shows the range type Slice penultimate_level_smallest_user_key_; Slice penultimate_level_largest_user_key_; PenultimateOutputRangeType penultimate_output_range_type_ = PenultimateOutputRangeType::kNotSupported; }; #ifndef NDEBUG // Helper struct only for tests, which contains the data to decide if a key // should be output to the penultimate level. // TODO: remove this when the public feature knob is available struct PerKeyPlacementContext { const int level; const Slice key; const Slice value; const SequenceNumber seq_num; bool& output_to_penultimate_level; PerKeyPlacementContext(int _level, Slice _key, Slice _value, SequenceNumber _seq_num, bool& _output_to_penultimate_level) : level(_level), key(_key), value(_value), seq_num(_seq_num), output_to_penultimate_level(_output_to_penultimate_level) {} }; #endif /* !NDEBUG */ // Return sum of sizes of all files in `files`. extern uint64_t TotalFileSize(const std::vector& files); } // namespace ROCKSDB_NAMESPACE