Truncate range tombstones by leveraging InternalKeys (#4432)

Summary:
To more accurately truncate range tombstones at SST boundaries,
we now represent them in RangeDelAggregator using InternalKeys, which
are end-key-exclusive as they were before this change.

During compaction, "atomic compaction unit boundaries" (the range of
keys contained in neighbouring and overlapping SSTs) are propagated down
to RangeDelAggregator to truncate range tombstones at those boundaries
instead. See https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and https://github.com/facebook/rocksdb/pull/4432#discussion_r221138683
for motivating examples.
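
As a rough, self-contained sketch (not the code in this PR), the grouping of a
level's files into atomic compaction units can be pictured as below; file
boundaries are simplified to plain user-key strings, whereas the real change
compares full InternalKeys with sstableKeyCompare:

// Illustrative sketch only (not part of this PR): group a sorted, non-L0
// level's files into atomic compaction units by chaining files whose largest
// boundary key matches the next file's smallest boundary key.
#include <string>
#include <vector>

struct FileBounds {
  std::string smallest;
  std::string largest;
};

// Returns one [smallest, largest] boundary per unit of overlapping files.
std::vector<FileBounds> AtomicUnits(const std::vector<FileBounds>& files) {
  std::vector<FileBounds> units;
  for (const auto& f : files) {
    if (!units.empty() && units.back().largest == f.smallest) {
      // This file starts where the previous unit ends: extend the unit.
      units.back().largest = f.largest;
    } else {
      // No shared boundary key: start a new atomic compaction unit.
      units.push_back(f);
    }
  }
  return units;
}

Range tombstones in a file are then truncated to their unit's boundary rather
than to the single file's boundary, which is what avoids the scenarios in the
linked discussions.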
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4432

Differential Revision: D10263952

Pulled By: abhimadan

fbshipit-source-id: 2fe85ff8a02b3a6a2de2edfe708012797a7bd579
Commit 3a4bd36fed (parent 283a700f5d) on main
Author: Abhishek Madan, committed by Facebook GitHub Bot
Changed files:
  1. db/compaction.cc (82)
  2. db/compaction.h (45)
  3. db/compaction_job.cc (9)
  4. db/range_del_aggregator.cc (183)
  5. db/range_del_aggregator.h (36)
  6. db/range_del_aggregator_test.cc (210)
  7. db/table_cache.cc (18)
  8. db/table_cache.h (4)
  9. db/version_set.cc (77)

@ -23,6 +23,43 @@
namespace rocksdb {
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b) {
auto c = user_cmp->Compare(a.user_key(), b.user_key());
if (c != 0) {
return c;
}
auto a_footer = ExtractInternalKeyFooter(a.Encode());
auto b_footer = ExtractInternalKeyFooter(b.Encode());
if (a_footer == kRangeTombstoneSentinel) {
if (b_footer != kRangeTombstoneSentinel) {
return -1;
}
} else if (b_footer == kRangeTombstoneSentinel) {
return 1;
}
return 0;
}
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
const InternalKey& b) {
if (a == nullptr) {
return -1;
}
return sstableKeyCompare(user_cmp, *a, b);
}
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey* b) {
if (b == nullptr) {
return -1;
}
return sstableKeyCompare(user_cmp, a, *b);
}
uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
uint64_t sum = 0;
for (size_t i = 0; i < files.size() && files[i]; i++) {
@ -81,6 +118,49 @@ void Compaction::GetBoundaryKeys(
}
}
std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
for (size_t i = 0; i < inputs.size(); i++) {
if (inputs[i].level == 0 || inputs[i].files.empty()) {
continue;
}
inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
AtomicCompactionUnitBoundary cur_boundary;
size_t first_atomic_idx = 0;
auto add_unit_boundary = [&](size_t to) {
if (first_atomic_idx == to) return;
for (size_t k = first_atomic_idx; k < to; k++) {
inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
}
first_atomic_idx = to;
};
for (size_t j = 0; j < inputs[i].files.size(); j++) {
const auto* f = inputs[i].files[j];
if (j == 0) {
// First file in a level.
cur_boundary.smallest = &f->smallest;
cur_boundary.largest = &f->largest;
} else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
0) {
// SSTs overlap but the end key of the previous file was not
// artificially extended by a range tombstone. Extend the current
// boundary.
cur_boundary.largest = &f->largest;
} else {
// Atomic compaction unit has ended.
add_unit_boundary(j);
cur_boundary.smallest = &f->smallest;
cur_boundary.largest = &f->largest;
}
}
add_unit_boundary(inputs[i].files.size());
assert(inputs[i].files.size() ==
inputs[i].atomic_compaction_unit_boundaries.size());
}
return inputs;
}
// helper function to determine if compaction is creating files at the
// bottommost level
bool Compaction::IsBottommostLevel(
@ -155,7 +235,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage,
output_compression_(_compression),
output_compression_opts_(_compression_opts),
deletion_compaction_(_deletion_compaction),
inputs_(std::move(_inputs)),
inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
grandparents_(std::move(_grandparents)),
score_(_score),
bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),

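The fragment below is a sketch (not part of the diff) of the ordering
implemented by sstableKeyCompare above; it assumes RocksDB's internal headers
from the tree this change applies to (db/dbformat.h and the declarations added
to db/compaction.h), and BoundaryOrderingExample is a hypothetical name:

// Sketch only; assumes RocksDB internal headers are available.
#include <cassert>
#include "db/compaction.h"
#include "db/dbformat.h"

namespace rocksdb {

void BoundaryOrderingExample() {
  const Comparator* ucmp = BytewiseComparator();
  // A range-tombstone sentinel boundary for user key "k"...
  InternalKey sentinel("k", kMaxSequenceNumber, kTypeRangeDeletion);
  // ...and ordinary point keys with the same user key.
  InternalKey point_a("k", 42 /* seq */, kTypeValue);
  InternalKey point_b("k", 7 /* seq */, kTypeValue);
  // The sentinel sorts strictly before any real key with the same user key,
  // so a file whose largest key is the sentinel does not "overlap" a file
  // whose smallest key is a real "k" entry.
  assert(sstableKeyCompare(ucmp, sentinel, point_a) < 0);
  assert(sstableKeyCompare(ucmp, point_a, sentinel) > 0);
  // Keys that differ only in sequence number (and type) compare equal.
  assert(sstableKeyCompare(ucmp, point_a, point_b) == 0);
}

}  // namespace rocksdb
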
@ -15,11 +15,43 @@
namespace rocksdb {
// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
// null which provides the property that a==null indicates a key that is less
// than any key and b==null indicates a key that is greater than any key. Note
// that the comparison is performed primarily on the user-key portion of the
// key. If the user-keys compare equal, an additional test is made to sort
// range tombstone sentinel keys before other keys with the same user-key. The
// result is that 2 user-keys will compare equal if they differ purely on
// their sequence number and value, but the range tombstone sentinel for that
// user-key will compare not equal. This is necessary because the range
// tombstone sentinel key is set as the largest key for an sstable even though
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey* b);
// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
// largest] that exactly spans one or more neighbouring SSTs on the same
// level. Every pair of SSTs in this range "overlap" (i.e., the largest
// user key of one file is the smallest user key of the next file). These
// boundaries are propagated down to RangeDelAggregator during compaction
// to provide safe truncation boundaries for range tombstones.
struct AtomicCompactionUnitBoundary {
const InternalKey* smallest = nullptr;
const InternalKey* largest = nullptr;
};
// The structure that manages compaction input files associated
// with the same physical level.
struct CompactionInputFiles {
int level;
std::vector<FileMetaData*> files;
std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
inline bool empty() const { return files.empty(); }
inline size_t size() const { return files.size(); }
inline void clear() { files.clear(); }
@ -96,6 +128,12 @@ class Compaction {
return inputs_[compaction_input_level][i];
}
const std::vector<AtomicCompactionUnitBoundary>* boundaries(
size_t compaction_input_level) const {
assert(compaction_input_level < inputs_.size());
return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
}
// Returns the list of file meta data of the specified compaction
// input level.
// REQUIREMENT: "compaction_input_level" must be >= 0 and
@ -262,6 +300,13 @@ class Compaction {
const std::vector<CompactionInputFiles>& inputs,
Slice* smallest_key, Slice* largest_key);
// Get the atomic file boundaries for all files in the compaction. Necessary
// in order to avoid the scenario described in
// https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb
// down appropriate key boundaries to RangeDelAggregator during compaction.
static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
// helper function to determine if compaction with inputs and storage is
// bottommost
static bool IsBottommostLevel(

@ -1212,10 +1212,11 @@ Status CompactionJob::FinishCompactionOutputFile(
for (; it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
if (upper_bound != nullptr &&
ucmp->Compare(*upper_bound, tombstone.start_key_) <= 0) {
// Tombstones starting at upper_bound or later only need to be included
// in the next table. Break because subsequent tombstones will start
// even later.
ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
// Tombstones starting after upper_bound only need to be included in the
// next table (if the SSTs overlap, then upper_bound is contained in
// this SST and hence must be covered). Break because subsequent
// tombstones will start even later.
break;
}

@ -10,20 +10,39 @@
namespace rocksdb {
namespace {
struct TombstoneStartKeyComparator {
TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
explicit TombstoneStartKeyComparator(const InternalKeyComparator* c)
: cmp(c) {}
bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
bool operator()(const TruncatedRangeTombstone& a,
const TruncatedRangeTombstone& b) const {
return cmp->Compare(a.start_key_, b.start_key_) < 0;
}
const Comparator* cmp;
const InternalKeyComparator* cmp;
};
struct ParsedInternalKeyComparator {
explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
: cmp(c) {}
bool operator()(const ParsedInternalKey& a,
const ParsedInternalKey& b) const {
return cmp->Compare(a, b) < 0;
}
const InternalKeyComparator* cmp;
};
} // namespace
// An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete
// queries.
class UncollapsedRangeDelMap : public RangeDelMap {
typedef std::multiset<RangeTombstone, TombstoneStartKeyComparator> Rep;
typedef std::multiset<TruncatedRangeTombstone, TombstoneStartKeyComparator>
Rep;
class Iterator : public RangeDelIterator {
const Rep& rep_;
@ -35,48 +54,57 @@ class UncollapsedRangeDelMap : public RangeDelMap {
void Next() override { iter_++; }
void Seek(const Slice&) override {
fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek unimplemented\n");
fprintf(stderr,
"UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n");
abort();
}
void Seek(const ParsedInternalKey&) override {
fprintf(stderr,
"UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) "
"unimplemented\n");
abort();
}
RangeTombstone Tombstone() const override { return *iter_; }
RangeTombstone Tombstone() const override { return iter_->Tombstone(); }
};
Rep rep_;
const Comparator* ucmp_;
const InternalKeyComparator* icmp_;
public:
UncollapsedRangeDelMap(const Comparator* ucmp)
: rep_(TombstoneStartKeyComparator(ucmp)), ucmp_(ucmp) {}
explicit UncollapsedRangeDelMap(const InternalKeyComparator* icmp)
: rep_(TombstoneStartKeyComparator(icmp)), icmp_(icmp) {}
bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) override {
(void)mode;
assert(mode == RangeDelPositioningMode::kFullScan);
for (const auto& tombstone : rep_) {
if (ucmp_->Compare(parsed.user_key, tombstone.start_key_) < 0) {
if (icmp_->Compare(parsed, tombstone.start_key_) < 0) {
break;
}
if (parsed.sequence < tombstone.seq_ &&
ucmp_->Compare(parsed.user_key, tombstone.end_key_) < 0) {
icmp_->Compare(parsed, tombstone.end_key_) < 0) {
return true;
}
}
return false;
}
bool IsRangeOverlapped(const Slice& start, const Slice& end) override {
bool IsRangeOverlapped(const ParsedInternalKey& start,
const ParsedInternalKey& end) override {
for (const auto& tombstone : rep_) {
if (ucmp_->Compare(start, tombstone.end_key_) < 0 &&
ucmp_->Compare(tombstone.start_key_, end) <= 0 &&
ucmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) {
if (icmp_->Compare(start, tombstone.end_key_) < 0 &&
icmp_->Compare(tombstone.start_key_, end) <= 0 &&
icmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) {
return true;
}
}
return false;
}
void AddTombstone(RangeTombstone tombstone) override {
void AddTombstone(TruncatedRangeTombstone tombstone) override {
rep_.emplace(tombstone);
}
@ -126,7 +154,9 @@ class UncollapsedRangeDelMap : public RangeDelMap {
// compared against the map entry g → 3 and determined to be uncovered. By
// contrast, the key h @ 2 would be determined to be covered.
class CollapsedRangeDelMap : public RangeDelMap {
typedef std::map<Slice, SequenceNumber, stl_wrappers::LessOfComparator> Rep;
typedef std::map<ParsedInternalKey, SequenceNumber,
ParsedInternalKeyComparator>
Rep;
class Iterator : public RangeDelIterator {
void MaybeSeekPastSentinel() {
@ -148,7 +178,12 @@ class CollapsedRangeDelMap : public RangeDelMap {
MaybeSeekPastSentinel();
}
void Seek(const Slice& target) override {
void Seek(const Slice&) override {
fprintf(stderr, "CollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n");
abort();
}
void Seek(const ParsedInternalKey& target) override {
iter_ = rep_.upper_bound(target);
if (iter_ != rep_.begin()) {
iter_--;
@ -161,8 +196,8 @@ class CollapsedRangeDelMap : public RangeDelMap {
assert(std::next(iter_) != rep_.end());
assert(iter_->second != 0);
RangeTombstone tombstone;
tombstone.start_key_ = iter_->first;
tombstone.end_key_ = std::next(iter_)->first;
tombstone.start_key_ = iter_->first.user_key;
tombstone.end_key_ = std::next(iter_)->first.user_key;
tombstone.seq_ = iter_->second;
return tombstone;
}
@ -170,12 +205,12 @@ class CollapsedRangeDelMap : public RangeDelMap {
Rep rep_;
Rep::iterator iter_;
const Comparator* ucmp_;
const InternalKeyComparator* icmp_;
public:
explicit CollapsedRangeDelMap(const Comparator* ucmp)
: rep_(stl_wrappers::LessOfComparator(ucmp)),
ucmp_(ucmp) {
explicit CollapsedRangeDelMap(const InternalKeyComparator* icmp)
: rep_(ParsedInternalKeyComparator(icmp)),
icmp_(icmp) {
InvalidatePosition();
}
@ -194,29 +229,29 @@ class CollapsedRangeDelMap : public RangeDelMap {
case RangeDelPositioningMode::kForwardTraversal:
assert(iter_ != rep_.end());
if (iter_ == rep_.begin() &&
ucmp_->Compare(parsed.user_key, iter_->first) < 0) {
icmp_->Compare(parsed, iter_->first) < 0) {
// before start of deletion intervals
return false;
}
while (std::next(iter_) != rep_.end() &&
ucmp_->Compare(std::next(iter_)->first, parsed.user_key) <= 0) {
icmp_->Compare(std::next(iter_)->first, parsed) <= 0) {
++iter_;
}
break;
case RangeDelPositioningMode::kBackwardTraversal:
assert(iter_ != rep_.end());
while (iter_ != rep_.begin() &&
ucmp_->Compare(parsed.user_key, iter_->first) < 0) {
icmp_->Compare(parsed, iter_->first) < 0) {
--iter_;
}
if (iter_ == rep_.begin() &&
ucmp_->Compare(parsed.user_key, iter_->first) < 0) {
icmp_->Compare(parsed, iter_->first) < 0) {
// before start of deletion intervals
return false;
}
break;
case RangeDelPositioningMode::kBinarySearch:
iter_ = rep_.upper_bound(parsed.user_key);
iter_ = rep_.upper_bound(parsed);
if (iter_ == rep_.begin()) {
// before start of deletion intervals
return false;
@ -225,21 +260,22 @@ class CollapsedRangeDelMap : public RangeDelMap {
break;
}
assert(iter_ != rep_.end() &&
ucmp_->Compare(iter_->first, parsed.user_key) <= 0);
icmp_->Compare(iter_->first, parsed) <= 0);
assert(std::next(iter_) == rep_.end() ||
ucmp_->Compare(parsed.user_key, std::next(iter_)->first) < 0);
icmp_->Compare(parsed, std::next(iter_)->first) < 0);
return parsed.sequence < iter_->second;
}
bool IsRangeOverlapped(const Slice&, const Slice&) override {
bool IsRangeOverlapped(const ParsedInternalKey&,
const ParsedInternalKey&) override {
// Unimplemented because the only client of this method, file ingestion,
// uses uncollapsed maps.
fprintf(stderr, "CollapsedRangeDelMap::IsRangeOverlapped unimplemented");
abort();
}
void AddTombstone(RangeTombstone t) override {
if (ucmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) {
void AddTombstone(TruncatedRangeTombstone t) override {
if (icmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) {
// The tombstone covers no keys. Nothing to do.
return;
}
@ -272,7 +308,8 @@ class CollapsedRangeDelMap : public RangeDelMap {
end_seq = prev_seq();
Rep::iterator pit;
if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() &&
ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) {
icmp_->Compare(pit->first, t.start_key_) == 0 &&
std::prev(pit)->second == t.seq_) {
// The new tombstone starts at the end of an existing tombstone with an
// identical seqno:
//
@ -297,7 +334,7 @@ class CollapsedRangeDelMap : public RangeDelMap {
}
// Look at all the existing transitions that overlap the new tombstone.
while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) {
while (it != rep_.end() && icmp_->Compare(it->first, t.end_key_) < 0) {
if (t.seq_ >= it->second) {
// The transition is to an existing tombstone that the new tombstone
// covers. Save the covered tombstone's seqno. We'll need to return to
@ -343,12 +380,14 @@ class CollapsedRangeDelMap : public RangeDelMap {
if (t.seq_ == prev_seq()) {
// The new tombstone is unterminated in the map.
if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) {
if (it != rep_.end() && t.seq_ == it->second &&
icmp_->Compare(it->first, t.end_key_) == 0) {
// The new tombstone ends at the start of another tombstone with an
// identical seqno. Merge the tombstones by removing the existing
// tombstone's start key.
rep_.erase(it);
} else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) {
} else if (end_seq == prev_seq() ||
(it != rep_.end() && end_seq == it->second)) {
// The new tombstone is implicitly ended because its end point is
// contained within an existing tombstone with the same seqno:
//
@ -363,7 +402,8 @@ class CollapsedRangeDelMap : public RangeDelMap {
// Install one that returns to the last seqno we covered. Because end
// keys are exclusive, if there's an existing transition at t.end_key_,
// it takes precedence over the transition that we install here.
rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry
rep_.emplace(t.end_key_,
end_seq); // emplace is a noop if existing entry
}
} else {
// The new tombstone is implicitly ended because its end point is covered
@ -416,9 +456,9 @@ void RangeDelAggregator::InitRep(const std::vector<SequenceNumber>& snapshots) {
std::unique_ptr<RangeDelMap> RangeDelAggregator::NewRangeDelMap() {
RangeDelMap* tombstone_map;
if (collapse_deletions_) {
tombstone_map = new CollapsedRangeDelMap(icmp_.user_comparator());
tombstone_map = new CollapsedRangeDelMap(&icmp_);
} else {
tombstone_map = new UncollapsedRangeDelMap(icmp_.user_comparator());
tombstone_map = new UncollapsedRangeDelMap(&icmp_);
}
return std::unique_ptr<RangeDelMap>(tombstone_map);
}
@ -429,8 +469,9 @@ bool RangeDelAggregator::ShouldDeleteImpl(const Slice& internal_key,
ParsedInternalKey parsed;
if (!ParseInternalKey(internal_key, &parsed)) {
assert(false);
return false;
}
return ShouldDelete(parsed, mode);
return ShouldDeleteImpl(parsed, mode);
}
bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
@ -452,8 +493,10 @@ bool RangeDelAggregator::IsRangeOverlapped(const Slice& start,
if (rep_ == nullptr) {
return false;
}
ParsedInternalKey start_ikey(start, kMaxSequenceNumber, kMaxValue);
ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
for (const auto& stripe : rep_->stripe_map_) {
if (stripe.second->IsRangeOverlapped(start, end)) {
if (stripe.second->IsRangeOverlapped(start_ikey, end_ikey)) {
return true;
}
}
@ -492,40 +535,49 @@ Status RangeDelAggregator::AddTombstones(
if (!parsed) {
return Status::Corruption("Unable to parse range tombstone InternalKey");
}
RangeTombstone tombstone;
Slice end_user_key;
if (input->IsValuePinned()) {
tombstone = RangeTombstone(parsed_key, input->value());
end_user_key = input->value();
} else {
// The tombstone map holds slices into the iterator's memory. Make a
// copy of the value if it is not pinned.
rep_->pinned_slices_.emplace_back(input->value().data(),
input->value().size());
tombstone = RangeTombstone(parsed_key, rep_->pinned_slices_.back());
end_user_key = rep_->pinned_slices_.back();
}
ParsedInternalKey start_key(parsed_key.user_key, kMaxSequenceNumber,
kMaxValue);
ParsedInternalKey end_key(end_user_key, kMaxSequenceNumber, kMaxValue);
// Truncate the tombstone to the range [smallest, largest].
if (smallest != nullptr) {
if (icmp_.user_comparator()->Compare(
tombstone.start_key_, smallest->user_key()) < 0) {
tombstone.start_key_ = smallest->user_key();
ParsedInternalKey parsed_smallest;
if (ParseInternalKey(smallest->Encode(), &parsed_smallest) &&
icmp_.Compare(start_key, parsed_smallest) < 0) {
start_key.user_key = parsed_smallest.user_key;
start_key.sequence = parsed_smallest.sequence;
}
}
if (largest != nullptr) {
// To safely truncate the range tombstone's end key, it must extend past
// the largest key in the sstable (which may have been extended to the
// smallest key in the next sstable), and largest must be a tombstone
// sentinel key. A range tombstone may straddle two sstables and not be
// the tombstone sentinel key in the first sstable if a user-key also
// straddles the sstables (possible if there is a snapshot between the
// two versions of the user-key), in which case we cannot truncate the
// range tombstone.
if (icmp_.user_comparator()->Compare(tombstone.end_key_,
largest->user_key()) > 0 &&
GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber) {
tombstone.end_key_ = largest->user_key();
ParsedInternalKey parsed_largest;
if (ParseInternalKey(largest->Encode(), &parsed_largest) &&
icmp_.Compare(end_key, parsed_largest) > 0) {
end_key.user_key = parsed_largest.user_key;
if (parsed_largest.sequence != kMaxSequenceNumber) {
// The same user key straddles two adjacent sstables. To make sure we
// can truncate to a range that includes the largest point key in the
// first sstable, set the tombstone end key's sequence number to 1
// less than the largest key.
assert(parsed_largest.sequence != 0);
end_key.sequence = parsed_largest.sequence - 1;
} else {
// The SST file boundary was artificially extended by a range tombstone.
// We will not see any entries in this SST with this user key, so we
// can leave the seqnum at kMaxSequenceNumber.
}
}
}
auto seq = tombstone.seq_;
GetRangeDelMap(seq).AddTombstone(std::move(tombstone));
TruncatedRangeTombstone tombstone(start_key, end_key, parsed_key.sequence);
GetRangeDelMap(parsed_key.sequence).AddTombstone(std::move(tombstone));
input->Next();
}
if (!first_iter) {
@ -604,6 +656,11 @@ class MergingRangeDelIter : public RangeDelIterator {
}
void Seek(const Slice& target) override {
ParsedInternalKey ikey(target, kMaxSequenceNumber, kMaxValue);
Seek(ikey);
}
void Seek(const ParsedInternalKey& target) override {
heap_.clear();
for (auto& iter : iters_) {
iter->Seek(target);

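A simplified, self-contained sketch of the end-key adjustment in AddTombstones
above (IKey, kMaxSeq, and TruncateEndKey are stand-in names, not RocksDB's):
when the largest boundary key carries a real sequence number, the truncated
end key keeps that user key but drops one below its sequence number, so the
boundary key itself stays covered while keys that belong to the next file do
not.

// Self-contained sketch of the truncation rule described above.
#include <cstdint>
#include <string>

constexpr uint64_t kMaxSeq = (uint64_t{1} << 56) - 1;  // stand-in for kMaxSequenceNumber

struct IKey {
  std::string user_key;
  uint64_t seq;
};

// End keys are exclusive: a key is covered iff it sorts strictly before the
// end key (equal user keys sort by *descending* sequence number).
IKey TruncateEndKey(const IKey& end, const IKey& largest) {
  bool end_past_largest =
      end.user_key > largest.user_key ||
      (end.user_key == largest.user_key && end.seq < largest.seq);
  if (!end_past_largest) {
    return end;  // Tombstone already ends within the file; nothing to do.
  }
  if (largest.seq == kMaxSeq) {
    // The boundary is a range-tombstone sentinel: no real key with this user
    // key lives in the file, so the exclusive end can sit above all of them.
    return {largest.user_key, kMaxSeq};
  }
  // The same user key straddles two files: end one seqno below the boundary,
  // so largest.user_key at largest.seq (in this file) is still covered while
  // lower-seqno versions (in the next file) are not.
  return {largest.user_key, largest.seq - 1};
}

For example, truncating a tombstone ending at "f" against largest = ("e", 3)
yields end key ("e", 2): "e"@3 in this file remains covered, while "e"@2 and
older versions in the next file are excluded, matching the
OverlappingLargestKeyTruncateBelowTombstone test further down.
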
@ -40,6 +40,35 @@ enum class RangeDelPositioningMode {
kBinarySearch,
};
// TruncatedRangeTombstones are a slight generalization of regular
// RangeTombstones that can represent truncations caused by SST boundaries.
// Instead of using user keys to represent the start and end keys, they instead
// use internal keys, whose sequence number indicates the sequence number of
// the smallest/largest SST key (in the case where a tombstone is untruncated,
// the sequence numbers will be kMaxSequenceNumber for both start and end
// keys). Like RangeTombstones, TruncatedRangeTombstones are also
// end-key-exclusive.
struct TruncatedRangeTombstone {
TruncatedRangeTombstone(const ParsedInternalKey& sk,
const ParsedInternalKey& ek, SequenceNumber s)
: start_key_(sk), end_key_(ek), seq_(s) {}
RangeTombstone Tombstone() const {
// The RangeTombstone returned here can cover less than the
// TruncatedRangeTombstone when its end key has a seqnum that is not
// kMaxSequenceNumber. Since this method is only used by RangeDelIterators
// (which in turn are only used during flush/compaction), we avoid this
// problem by using truncation boundaries spanning multiple SSTs, which
// are selected in a way that guarantees a clean break at the end key.
assert(end_key_.sequence == kMaxSequenceNumber);
return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_);
}
ParsedInternalKey start_key_;
ParsedInternalKey end_key_;
SequenceNumber seq_;
};
// A RangeDelIterator iterates over range deletion tombstones.
class RangeDelIterator {
public:
@ -47,7 +76,9 @@ class RangeDelIterator {
virtual bool Valid() const = 0;
virtual void Next() = 0;
// NOTE: the Slice passed to this method must be a user key.
virtual void Seek(const Slice& target) = 0;
virtual void Seek(const ParsedInternalKey& target) = 0;
virtual RangeTombstone Tombstone() const = 0;
};
@ -62,13 +93,14 @@ class RangeDelMap {
virtual bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) = 0;
virtual bool IsRangeOverlapped(const Slice& start, const Slice& end) = 0;
virtual bool IsRangeOverlapped(const ParsedInternalKey& start,
const ParsedInternalKey& end) = 0;
virtual void InvalidatePosition() = 0;
virtual size_t Size() const = 0;
bool IsEmpty() const { return Size() == 0; }
virtual void AddTombstone(RangeTombstone tombstone) = 0;
virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0;
virtual std::unique_ptr<RangeDelIterator> NewIterator() = 0;
};

@ -27,6 +27,12 @@ enum Direction {
kReverse,
};
struct AddTombstonesArgs {
const std::vector<RangeTombstone> tombstones;
const InternalKey* smallest;
const InternalKey* largest;
};
static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
void AddTombstones(RangeDelAggregator* range_del_agg,
@ -54,8 +60,7 @@ void VerifyRangeDelIter(
RangeDelIterator* range_del_iter,
const std::vector<RangeTombstone>& expected_range_dels) {
size_t i = 0;
for (; range_del_iter->Valid() && i < expected_range_dels.size();
range_del_iter->Next(), i++) {
for (; range_del_iter->Valid(); range_del_iter->Next(), i++) {
VerifyTombstonesEq(expected_range_dels[i], range_del_iter->Tombstone());
}
ASSERT_EQ(expected_range_dels.size(), i);
@ -63,22 +68,26 @@ void VerifyRangeDelIter(
}
void VerifyRangeDels(
const std::vector<RangeTombstone>& range_dels_in,
const std::vector<AddTombstonesArgs>& all_args,
const std::vector<ExpectedPoint>& expected_points,
const std::vector<RangeTombstone>& expected_collapsed_range_dels,
const InternalKey* smallest = nullptr, const InternalKey* largest = nullptr,
const InternalKeyComparator& icmp = bytewise_icmp) {
// Test same result regardless of which order the range deletions are added
// and regardless of collapsed mode.
for (bool collapsed : {false, true}) {
for (Direction dir : {kForward, kReverse}) {
RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, collapsed);
std::vector<RangeTombstone> all_range_dels;
std::vector<RangeTombstone> range_dels = range_dels_in;
if (dir == kReverse) {
std::reverse(range_dels.begin(), range_dels.end());
for (const auto& args : all_args) {
std::vector<RangeTombstone> range_dels = args.tombstones;
if (dir == kReverse) {
std::reverse(range_dels.begin(), range_dels.end());
}
all_range_dels.insert(all_range_dels.end(), range_dels.begin(),
range_dels.end());
AddTombstones(&range_del_agg, range_dels, args.smallest, args.largest);
}
AddTombstones(&range_del_agg, range_dels, smallest, largest);
auto mode = RangeDelPositioningMode::kFullScan;
if (collapsed) {
@ -90,38 +99,45 @@ void VerifyRangeDels(
parsed_key.user_key = expected_point.begin;
parsed_key.sequence = expected_point.seq;
parsed_key.type = kTypeValue;
ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
std::string ikey;
AppendInternalKey(&ikey, parsed_key);
ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode));
if (parsed_key.sequence > 0) {
--parsed_key.sequence;
ikey.clear();
AppendInternalKey(&ikey, parsed_key);
if (expected_point.expectAlive) {
ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode));
} else {
ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode));
ASSERT_TRUE(range_del_agg.ShouldDelete(ikey, mode));
}
}
}
if (collapsed) {
range_dels = expected_collapsed_range_dels;
VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
} else if (smallest == nullptr && largest == nullptr) {
all_range_dels = expected_collapsed_range_dels;
VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels);
} else if (all_args.size() == 1 && all_args[0].smallest == nullptr &&
all_args[0].largest == nullptr) {
// Tombstones in an uncollapsed map are presented in start key
// order. Tombstones with the same start key are presented in
// insertion order. We don't handle tombstone truncation here, so the
// verification is only performed if no truncation was requested.
std::stable_sort(range_dels.begin(), range_dels.end(),
std::stable_sort(all_range_dels.begin(), all_range_dels.end(),
[&](const RangeTombstone& a, const RangeTombstone& b) {
return icmp.user_comparator()->Compare(
a.start_key_, b.start_key_) < 0;
});
VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels);
}
}
}
RangeDelAggregator range_del_agg(icmp, {} /* snapshots */,
false /* collapse_deletions */);
AddTombstones(&range_del_agg, range_dels_in);
for (const auto& args : all_args) {
AddTombstones(&range_del_agg, args.tombstones, args.smallest, args.largest);
}
for (size_t i = 1; i < expected_points.size(); ++i) {
bool overlapped = range_del_agg.IsRangeOverlapped(
expected_points[i - 1].begin, expected_points[i].begin);
@ -138,65 +154,64 @@ void VerifyRangeDels(
TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}, {}); }
TEST_F(RangeDelAggregatorTest, SameStartAndEnd) {
VerifyRangeDels({{"a", "a", 5}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {});
VerifyRangeDels({{{{"a", "a", 5}}}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {});
}
TEST_F(RangeDelAggregatorTest, Single) {
VerifyRangeDels({{"a", "b", 10}}, {{" ", 0}, {"a", 10}, {"b", 0}},
VerifyRangeDels({{{{"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 0}},
{{"a", "b", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlapAboveLeft) {
VerifyRangeDels({{"a", "c", 10}, {"b", "d", 5}},
VerifyRangeDels({{{{"a", "c", 10}, {"b", "d", 5}}}},
{{" ", 0}, {"a", 10}, {"c", 5}, {"d", 0}},
{{"a", "c", 10}, {"c", "d", 5}});
}
TEST_F(RangeDelAggregatorTest, OverlapAboveRight) {
VerifyRangeDels({{"a", "c", 5}, {"b", "d", 10}},
VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 10}}}},
{{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}},
{{"a", "b", 5}, {"b", "d", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlapAboveMiddle) {
VerifyRangeDels({{"a", "d", 5}, {"b", "c", 10}},
VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 10}}}},
{{" ", 0}, {"a", 5}, {"b", 10}, {"c", 5}, {"d", 0}},
{{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}});
}
TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) {
VerifyRangeDels({{"d", "a", 5}, {"c", "b", 10}},
VerifyRangeDels({{{{"d", "a", 5}, {"c", "b", 10}}}},
{{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}},
{{"d", "c", 5}, {"c", "b", 10}, {"b", "a", 5}},
nullptr /* smallest */, nullptr /* largest */,
InternalKeyComparator(ReverseBytewiseComparator()));
}
TEST_F(RangeDelAggregatorTest, OverlapFully) {
VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}},
VerifyRangeDels({{{{"a", "d", 10}, {"b", "c", 5}}}},
{{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlapPoint) {
VerifyRangeDels({{"a", "b", 5}, {"b", "c", 10}},
VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 10}}}},
{{" ", 0}, {"a", 5}, {"b", 10}, {"c", 0}},
{{"a", "b", 5}, {"b", "c", 10}});
}
TEST_F(RangeDelAggregatorTest, SameStartKey) {
VerifyRangeDels({{"a", "c", 5}, {"a", "b", 10}},
VerifyRangeDels({{{{"a", "c", 5}, {"a", "b", 10}}}},
{{" ", 0}, {"a", 10}, {"b", 5}, {"c", 0}},
{{"a", "b", 10}, {"b", "c", 5}});
}
TEST_F(RangeDelAggregatorTest, SameEndKey) {
VerifyRangeDels({{"a", "d", 5}, {"b", "d", 10}},
VerifyRangeDels({{{{"a", "d", 5}, {"b", "d", 10}}}},
{{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}},
{{"a", "b", 5}, {"b", "d", 10}});
}
TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) {
VerifyRangeDels({{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}},
VerifyRangeDels({{{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}}},
{{" ", 0},
{"a", 5},
{"b", 0},
@ -209,25 +224,25 @@ TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) {
}
TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) {
VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}},
VerifyRangeDels({{{{"a", "b", 5}, {"a", "b", 5}}}},
{{" ", 0}, {"a", 5}, {"b", 0}},
{{"a", "b", 5}});
}
TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) {
VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}},
VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 5}}}},
{{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}},
{{"a", "c", 5}});
}
TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) {
VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}},
VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 5}}}},
{{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}},
{{"a", "d", 5}});
}
TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) {
VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}},
VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 5}}}},
{{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}},
{{"a", "d", 5}});
}
@ -236,27 +251,27 @@ TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) {
// larger one when VerifyRangeDels() runs them in reverse
TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) {
VerifyRangeDels(
{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}},
{{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}}},
{{" ", 0}, {"a", 20}, {"f", 15}, {"g", 0}},
{{"a", "f", 20}, {"f", "g", 15}});
}
TEST_F(RangeDelAggregatorTest, CoverMultipleFromRight) {
VerifyRangeDels(
{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}},
{{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}}},
{{" ", 0}, {"b", 5}, {"c", 20}, {"h", 0}},
{{"b", "c", 5}, {"c", "h", 20}});
}
TEST_F(RangeDelAggregatorTest, CoverMultipleFully) {
VerifyRangeDels(
{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}},
{{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}}},
{{" ", 0}, {"a", 20}, {"h", 0}}, {{"a", "h", 20}});
}
TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) {
VerifyRangeDels(
{{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}},
{{{{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}}},
{{" ", 0}, {"a", 5}, {"b", 15}, {"d", 10}, {"e", 20}, {"g", 5}, {"h", 0}},
{{"a", "b", 5},
{"b", "d", 15},
@ -321,31 +336,132 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) {
}
TEST_F(RangeDelAggregatorTest, TruncateTombstones) {
const InternalKey smallest("b", 1, kTypeRangeDeletion);
const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{"a", "c", 10}, {"d", "f", 10}},
{{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
{{"a", 10, true}, // truncated
{"b", 10, false}, // not truncated
{"d", 10, false}, // not truncated
{"e", 10, true}}, // truncated
{{"b", "c", 10}, {"d", "e", 10}},
&smallest, &largest);
{{"b", "c", 10}, {"d", "e", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateTombstones) {
const InternalKey smallest("b", 1, kTypeRangeDeletion);
TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateBelowTombstone) {
const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest(
"e", 3, // could happen if "e" is in consecutive sstables
kTypeValue);
VerifyRangeDels(
{{"a", "c", 10}, {"d", "f", 10}},
{{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
{{"a", 10, true}, // truncated
{"b", 10, false}, // not truncated
{"d", 10, false}, // not truncated
{"e", 10, false}, // not truncated
{"e", 2, true}}, // truncated here
{{"b", "c", 10}, {"d", "e", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateAboveTombstone) {
const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest(
"e", 15, // could happen if "e" is in consecutive sstables
kTypeValue);
VerifyRangeDels(
{{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
{{"a", 10, true}, // truncated
{"b", 10, false}, // not truncated
{"d", 10, false}, // not truncated
{"e", kMaxSequenceNumber, true}}, // truncated
{{"b", "c", 10}, {"d", "e", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateBelowTombstone) {
const InternalKey smallest("b", 5, kTypeValue);
const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
{{"a", 10, true}, // truncated
{"b", 10, true}, // truncated
{"b", 6, false}, // not truncated; start boundary moved
{"d", 10, false}, // not truncated
{"e", kMaxSequenceNumber, true}}, // truncated
{{"b", "c", 10}, {"d", "e", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateAboveTombstone) {
const InternalKey smallest("b", 15, kTypeValue);
const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
{{"a", 10, true}, // truncated
{"b", 15, true}, // truncated
{"b", 10, false}, // not truncated
{"d", 10, false}, // not truncated
{"e", 10, false}}, // not truncated
{{"b", "c", 10}, {"d", "f", 10}},
&smallest, &largest);
{"e", kMaxSequenceNumber, true}}, // truncated
{{"b", "c", 10}, {"d", "e", 10}});
}
TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapAboveTombstone) {
const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest1("c", 20, kTypeValue);
const InternalKey smallest2("c", 10, kTypeValue);
const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{{{"b", "d", 5}}, &smallest1, &largest1},
{{{"b", "d", 5}}, &smallest2, &largest2}},
{{"b", 5, false}, // not truncated
{"c", 5, false}}, // not truncated
{{"b", "c", 5}, {"c", "d", 5}}); // not collapsed due to boundaries
}
TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapBelowTombstone) {
const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest1("c", 20, kTypeValue);
const InternalKey smallest2("c", 10, kTypeValue);
const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{{{"b", "d", 30}}, &smallest1, &largest1},
{{{"b", "d", 30}}, &smallest2, &largest2}},
{{"b", 30, false}, // not truncated
{"c", 30, false}, // not truncated
{"c", 19, true}, // truncated here (keys in this range should not exist)
{"c", 11, false}}, // not truncated again
{{"b", "c", 30}, {"c", "d", 30}}); // not collapsed due to boundaries
}
TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapContainsTombstone) {
const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest1("c", 20, kTypeValue);
const InternalKey smallest2("c", 10, kTypeValue);
const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{{{"b", "d", 15}}, &smallest1, &largest1},
{{{"b", "d", 15}}, &smallest2, &largest2}},
{{"b", 15, false}, // not truncated
{"c", 15, true}, // truncated (keys in this range should not exist)
{"c", 11, false}}, // not truncated here
{{"b", "c", 15}, {"c", "d", 15}}); // not collapsed due to boundaries
}
TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneAbove) {
const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest("a", 20, kTypeValue);
VerifyRangeDels(
{{{{"a", "b", 35}}, &smallest, &largest}},
{{"a", 40, true}, // not truncated
{"a", 35, false}}, // not truncated
{{"a", "a", 35}}); // empty tombstone but can't occur during a compaction
}
TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneBelow) {
const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion);
const InternalKey largest("a", 20, kTypeValue);
VerifyRangeDels(
{{{{"a", "b", 15}}, &smallest, &largest}},
{{"a", 20, true}, // truncated here
{"a", 15, true}}, // truncated
{{"a", "a", 15}}); // empty tombstone but can't occur during a compaction
}
} // namespace rocksdb

@ -183,7 +183,9 @@ InternalIterator* TableCache::NewIterator(
const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
bool for_compaction, Arena* arena, bool skip_filters, int level) {
bool for_compaction, Arena* arena, bool skip_filters, int level,
const InternalKey* smallest_compaction_key,
const InternalKey* largest_compaction_key) {
PERF_TIMER_GUARD(new_table_iterator_nanos);
Status s;
@ -266,10 +268,16 @@ InternalIterator* TableCache::NewIterator(
s = range_del_iter->status();
}
if (s.ok()) {
s = range_del_agg->AddTombstones(
std::move(range_del_iter),
&file_meta.smallest,
&file_meta.largest);
const InternalKey* smallest = &file_meta.smallest;
const InternalKey* largest = &file_meta.largest;
if (smallest_compaction_key != nullptr) {
smallest = smallest_compaction_key;
}
if (largest_compaction_key != nullptr) {
largest = largest_compaction_key;
}
s = range_del_agg->AddTombstones(std::move(range_del_iter), smallest,
largest);
}
}
}

@ -56,7 +56,9 @@ class TableCache {
const SliceTransform* prefix_extractor = nullptr,
TableReader** table_reader_ptr = nullptr,
HistogramImpl* file_read_hist = nullptr, bool for_compaction = false,
Arena* arena = nullptr, bool skip_filters = false, int level = -1);
Arena* arena = nullptr, bool skip_filters = false, int level = -1,
const InternalKey* smallest_compaction_key = nullptr,
const InternalKey* largest_compaction_key = nullptr);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value) repeatedly until

@ -451,6 +451,7 @@ bool SomeFileOverlapsRange(
}
namespace {
class LevelIterator final : public InternalIterator {
public:
LevelIterator(TableCache* table_cache, const ReadOptions& read_options,
@ -459,7 +460,9 @@ class LevelIterator final : public InternalIterator {
const LevelFilesBrief* flevel,
const SliceTransform* prefix_extractor, bool should_sample,
HistogramImpl* file_read_hist, bool for_compaction,
bool skip_filters, int level, RangeDelAggregator* range_del_agg)
bool skip_filters, int level, RangeDelAggregator* range_del_agg,
const std::vector<AtomicCompactionUnitBoundary>*
compaction_boundaries = nullptr)
: table_cache_(table_cache),
read_options_(read_options),
env_options_(env_options),
@ -473,7 +476,8 @@ class LevelIterator final : public InternalIterator {
file_index_(flevel_->num_files),
level_(level),
range_del_agg_(range_del_agg),
pinned_iters_mgr_(nullptr) {
pinned_iters_mgr_(nullptr),
compaction_boundaries_(compaction_boundaries) {
// Empty level is not supported.
assert(flevel_ != nullptr && flevel_->num_files > 0);
}
@ -540,12 +544,18 @@ class LevelIterator final : public InternalIterator {
sample_file_read_inc(file_meta.file_metadata);
}
const InternalKey* smallest_compaction_key = nullptr;
const InternalKey* largest_compaction_key = nullptr;
if (compaction_boundaries_ != nullptr) {
smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
}
return table_cache_->NewIterator(
read_options_, env_options_, icomparator_, *file_meta.file_metadata,
range_del_agg_, prefix_extractor_,
nullptr /* don't need reference to table */,
file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_,
level_);
level_, smallest_compaction_key, largest_compaction_key);
}
TableCache* table_cache_;
@ -565,6 +575,10 @@ class LevelIterator final : public InternalIterator {
RangeDelAggregator* range_del_agg_;
IteratorWrapper file_iter_; // May be nullptr
PinnedIteratorsManager* pinned_iters_mgr_;
// To be propagated to RangeDelAggregator in order to safely truncate range
// tombstones.
const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
};
void LevelIterator::Seek(const Slice& target) {
@ -2134,60 +2148,6 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
true /* within_interval */);
}
namespace {
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
// null which provides the property that a==null indicates a key that is less
// than any key and b==null indicates a key that is greater than any key. Note
// that the comparison is performed primarily on the user-key portion of the
// key. If the user-keys compare equal, an additional test is made to sort
// range tombstone sentinel keys before other keys with the same user-key. The
// result is that 2 user-keys will compare equal if they differ purely on
// their sequence number and value, but the range tombstone sentinel for that
// user-key will compare not equal. This is necessary because the range
// tombstone sentinel key is set as the largest key for an sstable even though
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey& a, const InternalKey& b) {
auto c = user_cmp->Compare(a.user_key(), b.user_key());
if (c != 0) {
return c;
}
auto a_footer = ExtractInternalKeyFooter(a.Encode());
auto b_footer = ExtractInternalKeyFooter(b.Encode());
if (a_footer == kRangeTombstoneSentinel) {
if (b_footer != kRangeTombstoneSentinel) {
return -1;
}
} else if (b_footer == kRangeTombstoneSentinel) {
return 1;
}
return 0;
}
int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey* a, const InternalKey& b) {
if (a == nullptr) {
return -1;
}
return sstableKeyCompare(user_cmp, *a, b);
}
int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey& a, const InternalKey* b) {
if (b == nullptr) {
return -1;
}
return sstableKeyCompare(user_cmp, a, *b);
}
} // namespace
// Store in "*inputs" all files in "level" that overlap [begin,end]
// Employ binary search to find at least one file that overlaps the
// specified range. From that file, iterate backwards and
@ -4248,7 +4208,8 @@ InternalIterator* VersionSet::MakeInputIterator(
false /* should_sample */,
nullptr /* no per level latency histogram */,
true /* for_compaction */, false /* skip_filters */,
static_cast<int>(which) /* level */, range_del_agg);
static_cast<int>(which) /* level */, range_del_agg,
c->boundaries(which));
}
}
}
