// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "db/compaction_iteration_stats.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "db/version_edit.h"
#include "include/rocksdb/comparator.h"
#include "include/rocksdb/types.h"
#include "table/internal_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/table_builder.h"
#include "util/kv_map.h"

namespace rocksdb {

// RangeDelMaps maintain position across calls to ShouldDelete. The caller may
// wish to specify a mode to optimize positioning the iterator during the next
// call to ShouldDelete. The non-kFullScan modes are only available when
// deletion collapsing is enabled.
//
// For example, if we invoke Next() on an iterator, kForwardTraversal should be
// specified to advance one-by-one through deletions until one is found with
// its interval containing the key. This will typically be faster than doing a
// full binary search (kBinarySearch).
enum class RangeDelPositioningMode {
  kFullScan,  // used iff collapse_deletions_ == false
  kForwardTraversal,
  kBackwardTraversal,
  kBinarySearch,
};

// TruncatedRangeTombstones are a slight generalization of regular
// RangeTombstones that can represent truncations caused by SST boundaries.
// Instead of using user keys to represent the start and end keys, they instead
// use internal keys, whose sequence number indicates the sequence number of
// the smallest/largest SST key (in the case where a tombstone is untruncated,
// the sequence numbers will be kMaxSequenceNumber for both start and end
// keys). Like RangeTombstones, TruncatedRangeTombstones are also
// end-key-exclusive.
struct TruncatedRangeTombstone {
  TruncatedRangeTombstone(const ParsedInternalKey& sk,
                          const ParsedInternalKey& ek, SequenceNumber s)
      : start_key_(sk), end_key_(ek), seq_(s) {}

  RangeTombstone Tombstone() const {
    // The RangeTombstone returned here can cover less than the
    // TruncatedRangeTombstone when its end key has a seqnum that is not
    // kMaxSequenceNumber. Since this method is only used by RangeDelIterators
    // (which in turn are only used during flush/compaction), we avoid this
    // problem by using truncation boundaries spanning multiple SSTs, which
    // are selected in a way that guarantees a clean break at the end key.
    assert(end_key_.sequence == kMaxSequenceNumber);
    return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_);
  }

  ParsedInternalKey start_key_;
  ParsedInternalKey end_key_;
  SequenceNumber seq_;
};
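
// Example (an illustrative sketch, not part of this header's API; the keys and
// seqnum below are made up): an untruncated tombstone carries
// kMaxSequenceNumber in both internal-key bounds, so Tombstone() can recover
// the original user-key range.
//
//   TruncatedRangeTombstone t(
//       ParsedInternalKey("a", kMaxSequenceNumber, kTypeRangeDeletion),
//       ParsedInternalKey("e", kMaxSequenceNumber, kTypeRangeDeletion),
//       100 /* seq */);
//   RangeTombstone rt = t.Tombstone();  // covers user keys [a, e) at seq 100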
// A RangeDelIterator iterates over range deletion tombstones.
class RangeDelIterator {
 public:
  virtual ~RangeDelIterator() = default;

  virtual bool Valid() const = 0;
  virtual void Next() = 0;
  // NOTE: the Slice passed to this method must be a user key.
  virtual void Seek(const Slice& target) = 0;
  virtual void Seek(const ParsedInternalKey& target) = 0;
  virtual RangeTombstone Tombstone() const = 0;
};

// A RangeDelMap keeps track of range deletion tombstones within a snapshot
// stripe.
//
// RangeDelMaps are used internally by RangeDelAggregator. They are not
// intended to be used directly.
class RangeDelMap {
 public:
  virtual ~RangeDelMap() = default;

  virtual bool ShouldDelete(const ParsedInternalKey& parsed,
                            RangeDelPositioningMode mode) = 0;
  virtual bool IsRangeOverlapped(const ParsedInternalKey& start,
                                 const ParsedInternalKey& end) = 0;
  virtual void InvalidatePosition() = 0;

  virtual size_t Size() const = 0;
  bool IsEmpty() const { return Size() == 0; }

  virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0;
  virtual std::unique_ptr<RangeDelIterator> NewIterator() = 0;
};

// A RangeDelAggregator aggregates range deletion tombstones as they are
// encountered in memtables/SST files. It provides methods that check whether a
// key is covered by range tombstones or write the relevant tombstones to a new
// SST file.
class RangeDelAggregator {
 public:
  // @param snapshots These are used to organize the tombstones into snapshot
  //    stripes, where a stripe is the seqnum range between consecutive
  //    snapshots, including the higher snapshot and excluding the lower one.
  //    Currently, this is used by ShouldDelete() to prevent deletion of keys
  //    that are covered by range tombstones in other snapshot stripes. This
  //    constructor is used for writes (flush/compaction). All DB snapshots are
  //    provided such that no keys are removed that are uncovered according to
  //    any DB snapshot.
  // Note this overload does not lazily initialize Rep.
  RangeDelAggregator(const InternalKeyComparator& icmp,
                     const std::vector<SequenceNumber>& snapshots,
                     bool collapse_deletions = true);

  // @param upper_bound Similar to snapshots above, except with a single
  //    snapshot, which allows us to store the snapshot on the stack and defer
  //    initialization of heap-allocating members (in Rep) until the first
  //    range deletion is encountered. This constructor is used in case of
  //    reads (get/iterator), for which only the user snapshot (upper_bound) is
  //    provided such that the seqnum space is divided into two stripes. Only
  //    the older stripe will be used by ShouldDelete().
  RangeDelAggregator(const InternalKeyComparator& icmp,
                     SequenceNumber upper_bound,
                     bool collapse_deletions = false);

  // Returns whether the key should be deleted, which is the case when it is
  // covered by a range tombstone residing in the same snapshot stripe.
  // @param mode If collapse_deletions_ is true, this dictates how we will find
  //    the deletion whose interval contains this key. Otherwise, its value
  //    must be kFullScan, indicating a linear scan from the beginning.
  bool ShouldDelete(
      const ParsedInternalKey& parsed,
      RangeDelPositioningMode mode = RangeDelPositioningMode::kFullScan) {
    if (rep_ == nullptr) {
      return false;
    }
    return ShouldDeleteImpl(parsed, mode);
  }
  bool ShouldDelete(
      const Slice& internal_key,
      RangeDelPositioningMode mode = RangeDelPositioningMode::kFullScan) {
    if (rep_ == nullptr) {
      return false;
    }
    return ShouldDeleteImpl(internal_key, mode);
  }
  bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
                        RangeDelPositioningMode mode);
  bool ShouldDeleteImpl(const Slice& internal_key,
                        RangeDelPositioningMode mode);

  // Checks whether range deletions cover any keys between `start` and `end`,
  // inclusive.
  //
  // @param start User key representing beginning of range to check for
  //    overlap.
  // @param end User key representing end of range to check for overlap. This
  //    argument is inclusive, so the existence of a range deletion covering
  //    `end` causes this to return true.
  bool IsRangeOverlapped(const Slice& start, const Slice& end);
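
  // Example (a minimal read-path sketch; `icmp`, `read_seq`, and `ikey` are
  // illustrative names, not declared in this header): a read constructs the
  // aggregator with a single upper bound, adds the relevant tombstones, and
  // queries each key it encounters. With the default
  // collapse_deletions = false, only the default kFullScan mode is valid.
  //
  //   RangeDelAggregator range_del_agg(icmp, read_seq /* upper_bound */);
  //   // ... range_del_agg.AddTombstones(...) for each memtable/SST consulted
  //   if (range_del_agg.ShouldDelete(ikey)) {
  //     // `ikey` is covered by a tombstone at or below `read_seq`; skip it.
  //   }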
  // Adds tombstones to the tombstone aggregation structure maintained by this
  // object. Tombstones are truncated to smallest and largest. If smallest (or
  // largest) is null, it is not used for truncation. When adding range
  // tombstones present in an sstable, smallest and largest should be set to
  // the smallest and largest keys from the sstable file metadata. Note that
  // tombstone end keys are exclusive while largest is inclusive.
  // @return non-OK status if any of the tombstone keys are corrupted.
  Status AddTombstones(std::unique_ptr<InternalIterator> input,
                       const InternalKey* smallest = nullptr,
                       const InternalKey* largest = nullptr);

  // Resets iterators maintained across calls to ShouldDelete(). This may be
  // called when the tombstones change, or the owner may call it explicitly,
  // e.g., if it's an iterator that just seeked to an arbitrary position. The
  // effect of invalidation is that the following call to ShouldDelete() will
  // binary search for its tombstone.
  void InvalidateRangeDelMapPositions();

  bool IsEmpty();

  bool AddFile(uint64_t file_number);

  // Create a new iterator over the range deletion tombstones in all of the
  // snapshot stripes in this aggregator. Tombstones are presented in start key
  // order. Tombstones with the same start key are presented in arbitrary
  // order.
  //
  // The iterator is invalidated after any call to AddTombstones. It is the
  // caller's responsibility to avoid using invalid iterators.
  std::unique_ptr<RangeDelIterator> NewIterator();

 private:
  // Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e.,
  // their seqnums are greater than the next smaller snapshot's seqnum, and the
  // corresponding index into the list of snapshots. Each entry is lazily
  // initialized.
  typedef std::map<SequenceNumber,
                   std::pair<std::unique_ptr<RangeDelMap>, size_t>>
      StripeMap;

  struct Rep {
    std::vector<SequenceNumber> snapshots_;
    StripeMap stripe_map_;
    PinnedIteratorsManager pinned_iters_mgr_;
    std::list<std::string> pinned_slices_;
    std::set<uint64_t> added_files_;
  };

  // Initializes rep_ lazily. This aggregator object is constructed for every
  // read, so expensive members should only be created when necessary, i.e.,
  // once the first range deletion is encountered.
  void InitRep(const std::vector<SequenceNumber>& snapshots);

  std::unique_ptr<RangeDelMap> NewRangeDelMap();
  RangeDelMap* GetRangeDelMapIfExists(SequenceNumber seq);
  RangeDelMap& GetRangeDelMap(SequenceNumber seq);

  SequenceNumber upper_bound_;
  std::unique_ptr<Rep> rep_;

  const InternalKeyComparator& icmp_;
  // collapse range deletions so they're binary searchable
  const bool collapse_deletions_;
};

}  // namespace rocksdb
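
// Example (a compaction-side sketch; `icmp`, `existing_snapshots`,
// `range_del_iter`, `meta`, and `ikey` are illustrative names, not declared in
// this header): flush/compaction constructs the aggregator with all DB
// snapshots, registers each input file's tombstones truncated to that file's
// boundaries, and later drains the surviving tombstones in start-key order.
//
//   rocksdb::RangeDelAggregator range_del_agg(icmp, existing_snapshots,
//                                             true /* collapse_deletions */);
//   rocksdb::Status s = range_del_agg.AddTombstones(
//       std::move(range_del_iter), &meta.smallest, &meta.largest);
//   // While merging keys:
//   if (range_del_agg.ShouldDelete(
//           ikey, rocksdb::RangeDelPositioningMode::kForwardTraversal)) {
//     // drop the key; it is covered by a tombstone in its snapshot stripe
//   }
//   // When building the output file:
//   auto tombstone_iter = range_del_agg.NewIterator();
//   for (; tombstone_iter->Valid(); tombstone_iter->Next()) {
//     rocksdb::RangeTombstone t = tombstone_iter->Tombstone();
//     // write t to the output table's range deletion meta-block
//   }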