Truncate range tombstones by leveraging InternalKeys (#4432)

Summary:
To more accurately truncate range tombstones at SST boundaries,
we now represent them in RangeDelAggregator using InternalKeys, which
are end-key-exclusive as they were before this change.

During compaction, "atomic compaction unit boundaries" (the range of
keys contained in neighbouring and overlapping SSTs) are propagated down
to RangeDelAggregator to truncate range tombstones at those boundaries
instead. See https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and https://github.com/facebook/rocksdb/pull/4432#discussion_r221138683
for motivating examples.
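
As an illustration (a minimal standalone sketch with simplified stand-in types, not RocksDB's InternalKey or RangeDelAggregator API), sequence-number-aware endpoints are what make truncation safe: an end key of (user_key, seq - 1) still covers the boundary point key (user_key, seq) in this SST, while the older version of the same user key in the next SST stays uncovered.

#include <cassert>
#include <cstdint>
#include <string>

// Stand-in for an internal key: user key plus sequence number. Internal-key
// order is user key ascending, then sequence number descending.
struct IKey {
  std::string user_key;
  uint64_t seq;
};

bool IKeyLess(const IKey& a, const IKey& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key;
  return a.seq > b.seq;  // higher seqno sorts first
}

// A range tombstone with internal-key endpoints; end is exclusive.
struct Tombstone {
  IKey start, end;
  uint64_t seq;
};

bool Covers(const Tombstone& t, const IKey& k) {
  return !IKeyLess(k, t.start) && IKeyLess(k, t.end) && k.seq < t.seq;
}

int main() {
  // ["d", "f")@10 truncated at an SST whose largest key is the point key
  // "e"@3, i.e. user key "e" straddles into the next SST: clamp the end key
  // to ("e", 3 - 1).
  Tombstone t{{"d", UINT64_MAX}, {"e", 2}, 10};
  assert(Covers(t, {"e", 3}));   // boundary point key in this SST: covered
  assert(!Covers(t, {"e", 2}));  // older "e" version in the next SST: alive
  return 0;
}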
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4432

Differential Revision: D10263952

Pulled By: abhimadan

fbshipit-source-id: 2fe85ff8a02b3a6a2de2edfe708012797a7bd579
Branch: main
Author: Abhishek Madan (committed by Facebook GitHub Bot)
parent 283a700f5d
commit 3a4bd36fed
9 changed files:
  db/compaction.cc (82 lines changed)
  db/compaction.h (45 lines changed)
  db/compaction_job.cc (9 lines changed)
  db/range_del_aggregator.cc (183 lines changed)
  db/range_del_aggregator.h (36 lines changed)
  db/range_del_aggregator_test.cc (210 lines changed)
  db/table_cache.cc (18 lines changed)
  db/table_cache.h (4 lines changed)
  db/version_set.cc (77 lines changed)

@@ -23,6 +23,43 @@
 namespace rocksdb {

+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b) {
+  auto c = user_cmp->Compare(a.user_key(), b.user_key());
+  if (c != 0) {
+    return c;
+  }
+  auto a_footer = ExtractInternalKeyFooter(a.Encode());
+  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  if (a_footer == kRangeTombstoneSentinel) {
+    if (b_footer != kRangeTombstoneSentinel) {
+      return -1;
+    }
+  } else if (b_footer == kRangeTombstoneSentinel) {
+    return 1;
+  }
+  return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b) {
+  if (a == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b) {
+  if (b == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, a, *b);
+}
+
 uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
   uint64_t sum = 0;
   for (size_t i = 0; i < files.size() && files[i]; i++) {

@@ -81,6 +118,49 @@ void Compaction::GetBoundaryKeys(
   }
 }

+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i].level == 0 || inputs[i].files.empty()) {
+      continue;
+    }
+    inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+    AtomicCompactionUnitBoundary cur_boundary;
+    size_t first_atomic_idx = 0;
+    auto add_unit_boundary = [&](size_t to) {
+      if (first_atomic_idx == to) return;
+      for (size_t k = first_atomic_idx; k < to; k++) {
+        inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+      }
+      first_atomic_idx = to;
+    };
+    for (size_t j = 0; j < inputs[i].files.size(); j++) {
+      const auto* f = inputs[i].files[j];
+      if (j == 0) {
+        // First file in a level.
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+                 0) {
+        // SSTs overlap but the end key of the previous file was not
+        // artificially extended by a range tombstone. Extend the current
+        // boundary.
+        cur_boundary.largest = &f->largest;
+      } else {
+        // Atomic compaction unit has ended.
+        add_unit_boundary(j);
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      }
+    }
+    add_unit_boundary(inputs[i].files.size());
+    assert(inputs[i].files.size() ==
+           inputs[i].atomic_compaction_unit_boundaries.size());
+  }
+  return inputs;
+}
+
 // helper function to determine if compaction is creating files at the
 // bottommost level
 bool Compaction::IsBottommostLevel(

@@ -155,7 +235,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage,
       output_compression_(_compression),
       output_compression_opts_(_compression_opts),
       deletion_compaction_(_deletion_compaction),
-      inputs_(std::move(_inputs)),
+      inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
       grandparents_(std::move(_grandparents)),
       score_(_score),
       bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),

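The sstableKeyCompare helper added above compares boundary keys primarily by user key, with one tie-break: the range tombstone sentinel sorts before any real key with the same user key. A hedged standalone sketch of that property (simplified types, not the actual InternalKey encoding):

#include <cassert>
#include <string>

// Simplified boundary key: a user key plus a flag marking the range tombstone
// sentinel (kMaxSequenceNumber, kTypeRangeDeletion) footer.
struct BoundaryKey {
  std::string user_key;
  bool is_sentinel;
};

// Mirrors sstableKeyCompare's contract: user keys dominate; on a tie, the
// sentinel compares less than a real key, so adjacent sstables separated only
// by the sentinel do not count as overlapping.
int BoundaryCompare(const BoundaryKey& a, const BoundaryKey& b) {
  if (a.user_key != b.user_key) {
    return a.user_key < b.user_key ? -1 : 1;
  }
  if (a.is_sentinel && !b.is_sentinel) return -1;
  if (!a.is_sentinel && b.is_sentinel) return 1;
  return 0;
}

int main() {
  // Two versions of "e" (differing only in seqno/value) compare equal...
  assert(BoundaryCompare({"e", false}, {"e", false}) == 0);
  // ...but the sentinel for "e" compares less than a real "e" key, so a file
  // ending at the sentinel does not overlap a file starting at a real "e".
  assert(BoundaryCompare({"e", true}, {"e", false}) == -1);
  return 0;
}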
@@ -15,11 +15,43 @@
 namespace rocksdb {

+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+  const InternalKey* smallest = nullptr;
+  const InternalKey* largest = nullptr;
+};
+
 // The structure that manages compaction input files associated
 // with the same physical level.
 struct CompactionInputFiles {
   int level;
   std::vector<FileMetaData*> files;
+  std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
   inline bool empty() const { return files.empty(); }
   inline size_t size() const { return files.size(); }
   inline void clear() { files.clear(); }

@@ -96,6 +128,12 @@ class Compaction {
     return inputs_[compaction_input_level][i];
   }

+  const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+  }
+
   // Returns the list of file meta data of the specified compaction
   // input level.
   // REQUIREMENT: "compaction_input_level" must be >= 0 and

@@ -262,6 +300,13 @@ class Compaction {
       const std::vector<CompactionInputFiles>& inputs,
       Slice* smallest_key, Slice* largest_key);

+  // Get the atomic file boundaries for all files in the compaction. Necessary
+  // in order to avoid the scenario described in
+  // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+  // plumb down appropriate key boundaries to RangeDelAggregator during
+  // compaction.
+  static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+      VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
   // helper function to determine if compaction with inputs and storage is
   // bottommost
   static bool IsBottommostLevel(

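To make the grouping concrete, here is a hedged sketch (hypothetical simplified types, not the FileMetaData plumbing above) of how consecutive files merge into one atomic compaction unit exactly when the previous file's largest boundary compares equal to the next file's smallest:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical simplified boundary: user key plus a sentinel flag standing in
// for the (kMaxSequenceNumber, kTypeRangeDeletion) footer.
struct Boundary {
  std::string user_key;
  bool is_sentinel;
};

struct File {
  Boundary smallest, largest;
};

// Same tie-break as sstableKeyCompare: sentinels sort before real keys with
// the same user key.
int BoundaryCompare(const Boundary& a, const Boundary& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key ? -1 : 1;
  if (a.is_sentinel != b.is_sentinel) return a.is_sentinel ? -1 : 1;
  return 0;
}

int main() {
  std::vector<File> files = {
      {{"a", false}, {"e", false}},  // largest "e" is a real point key...
      {{"e", false}, {"g", true}},   // ...so this file joins the same unit
      {{"g", false}, {"k", false}},  // previous largest was a sentinel: new unit
  };
  size_t unit = 0;
  for (size_t i = 0; i < files.size(); i++) {
    // A file extends the current unit iff the previous file's largest
    // boundary compares equal to this file's smallest boundary.
    if (i > 0 && BoundaryCompare(files[i - 1].largest, files[i].smallest) != 0) {
      unit++;
    }
    std::cout << "file " << i << " -> unit " << unit << "\n";
  }
  return 0;  // prints units 0, 0, 1
}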
@@ -1212,10 +1212,11 @@ Status CompactionJob::FinishCompactionOutputFile(
     for (; it->Valid(); it->Next()) {
       auto tombstone = it->Tombstone();
       if (upper_bound != nullptr &&
-          ucmp->Compare(*upper_bound, tombstone.start_key_) <= 0) {
-        // Tombstones starting at upper_bound or later only need to be included
-        // in the next table. Break because subsequent tombstones will start
-        // even later.
+          ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
+        // Tombstones starting after upper_bound only need to be included in the
+        // next table (if the SSTs overlap, then upper_bound is contained in
+        // this SST and hence must be covered). Break because subsequent
+        // tombstones will start even later.
         break;
       }

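A hedged sketch of the cut-off change above, with plain strings standing in for user keys:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Tombstone start keys arrive in ascending order, as in the loop above.
  std::vector<std::string> tombstone_starts = {"a", "c", "e", "g"};
  std::string upper_bound = "e";  // exclusive end of the current output file
  for (const auto& start : tombstone_starts) {
    // The old cut-off was `upper_bound <= start`, deferring a tombstone that
    // starts exactly at upper_bound. With truncation at atomic-unit
    // boundaries, upper_bound may itself be a key in this SST, so only
    // tombstones starting strictly after it can be left to the next table.
    if (upper_bound < start) {
      break;  // "g" only needs to appear in the next table
    }
    std::cout << "emit tombstone starting at " << start << "\n";  // a, c, e
  }
  return 0;
}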
@@ -10,20 +10,39 @@
 namespace rocksdb {

+namespace {
+
 struct TombstoneStartKeyComparator {
-  TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+  explicit TombstoneStartKeyComparator(const InternalKeyComparator* c)
+      : cmp(c) {}

-  bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+  bool operator()(const TruncatedRangeTombstone& a,
+                  const TruncatedRangeTombstone& b) const {
     return cmp->Compare(a.start_key_, b.start_key_) < 0;
   }

-  const Comparator* cmp;
+  const InternalKeyComparator* cmp;
 };

+struct ParsedInternalKeyComparator {
+  explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+      : cmp(c) {}
+
+  bool operator()(const ParsedInternalKey& a,
+                  const ParsedInternalKey& b) const {
+    return cmp->Compare(a, b) < 0;
+  }
+
+  const InternalKeyComparator* cmp;
+};
+
+}  // namespace
+
 // An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete
 // queries.
 class UncollapsedRangeDelMap : public RangeDelMap {
-  typedef std::multiset<RangeTombstone, TombstoneStartKeyComparator> Rep;
+  typedef std::multiset<TruncatedRangeTombstone, TombstoneStartKeyComparator>
+      Rep;

   class Iterator : public RangeDelIterator {
     const Rep& rep_;

@@ -35,48 +54,57 @@ class UncollapsedRangeDelMap : public RangeDelMap {
     void Next() override { iter_++; }

     void Seek(const Slice&) override {
-      fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek unimplemented\n");
+      fprintf(stderr,
+              "UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n");
+      abort();
+    }
+
+    void Seek(const ParsedInternalKey&) override {
+      fprintf(stderr,
+              "UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) "
+              "unimplemented\n");
       abort();
     }

-    RangeTombstone Tombstone() const override { return *iter_; }
+    RangeTombstone Tombstone() const override { return iter_->Tombstone(); }
   };

   Rep rep_;
-  const Comparator* ucmp_;
+  const InternalKeyComparator* icmp_;

  public:
-  UncollapsedRangeDelMap(const Comparator* ucmp)
-      : rep_(TombstoneStartKeyComparator(ucmp)), ucmp_(ucmp) {}
+  explicit UncollapsedRangeDelMap(const InternalKeyComparator* icmp)
+      : rep_(TombstoneStartKeyComparator(icmp)), icmp_(icmp) {}

   bool ShouldDelete(const ParsedInternalKey& parsed,
                     RangeDelPositioningMode mode) override {
     (void)mode;
     assert(mode == RangeDelPositioningMode::kFullScan);
     for (const auto& tombstone : rep_) {
-      if (ucmp_->Compare(parsed.user_key, tombstone.start_key_) < 0) {
+      if (icmp_->Compare(parsed, tombstone.start_key_) < 0) {
         break;
       }
       if (parsed.sequence < tombstone.seq_ &&
-          ucmp_->Compare(parsed.user_key, tombstone.end_key_) < 0) {
+          icmp_->Compare(parsed, tombstone.end_key_) < 0) {
         return true;
       }
     }
     return false;
   }

-  bool IsRangeOverlapped(const Slice& start, const Slice& end) override {
+  bool IsRangeOverlapped(const ParsedInternalKey& start,
+                         const ParsedInternalKey& end) override {
     for (const auto& tombstone : rep_) {
-      if (ucmp_->Compare(start, tombstone.end_key_) < 0 &&
-          ucmp_->Compare(tombstone.start_key_, end) <= 0 &&
-          ucmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) {
+      if (icmp_->Compare(start, tombstone.end_key_) < 0 &&
+          icmp_->Compare(tombstone.start_key_, end) <= 0 &&
+          icmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) {
         return true;
       }
     }
     return false;
   }

-  void AddTombstone(RangeTombstone tombstone) override {
+  void AddTombstone(TruncatedRangeTombstone tombstone) override {
     rep_.emplace(tombstone);
   }

@@ -126,7 +154,9 @@ class UncollapsedRangeDelMap : public RangeDelMap {
 // compared against the map entry g → 3 and determined to be uncovered. By
 // contrast, the key h @ 2 would be determined to be covered.
 class CollapsedRangeDelMap : public RangeDelMap {
-  typedef std::map<Slice, SequenceNumber, stl_wrappers::LessOfComparator> Rep;
+  typedef std::map<ParsedInternalKey, SequenceNumber,
+                   ParsedInternalKeyComparator>
+      Rep;

   class Iterator : public RangeDelIterator {
     void MaybeSeekPastSentinel() {

@@ -148,7 +178,12 @@ class CollapsedRangeDelMap : public RangeDelMap {
       MaybeSeekPastSentinel();
     }

-    void Seek(const Slice& target) override {
+    void Seek(const Slice&) override {
+      fprintf(stderr,
+              "CollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n");
+      abort();
+    }
+
+    void Seek(const ParsedInternalKey& target) override {
       iter_ = rep_.upper_bound(target);
       if (iter_ != rep_.begin()) {
         iter_--;

@@ -161,8 +196,8 @@ class CollapsedRangeDelMap : public RangeDelMap {
       assert(std::next(iter_) != rep_.end());
       assert(iter_->second != 0);
       RangeTombstone tombstone;
-      tombstone.start_key_ = iter_->first;
-      tombstone.end_key_ = std::next(iter_)->first;
+      tombstone.start_key_ = iter_->first.user_key;
+      tombstone.end_key_ = std::next(iter_)->first.user_key;
       tombstone.seq_ = iter_->second;
       return tombstone;
     }

@@ -170,12 +205,12 @@ class CollapsedRangeDelMap : public RangeDelMap {
   Rep rep_;
   Rep::iterator iter_;
-  const Comparator* ucmp_;
+  const InternalKeyComparator* icmp_;

  public:
-  explicit CollapsedRangeDelMap(const Comparator* ucmp)
-      : rep_(stl_wrappers::LessOfComparator(ucmp)),
-        ucmp_(ucmp) {
+  explicit CollapsedRangeDelMap(const InternalKeyComparator* icmp)
+      : rep_(ParsedInternalKeyComparator(icmp)),
+        icmp_(icmp) {
     InvalidatePosition();
   }

@@ -194,29 +229,29 @@ class CollapsedRangeDelMap : public RangeDelMap {
       case RangeDelPositioningMode::kForwardTraversal:
         assert(iter_ != rep_.end());
         if (iter_ == rep_.begin() &&
-            ucmp_->Compare(parsed.user_key, iter_->first) < 0) {
+            icmp_->Compare(parsed, iter_->first) < 0) {
           // before start of deletion intervals
           return false;
         }
         while (std::next(iter_) != rep_.end() &&
-               ucmp_->Compare(std::next(iter_)->first, parsed.user_key) <= 0) {
+               icmp_->Compare(std::next(iter_)->first, parsed) <= 0) {
           ++iter_;
         }
         break;
       case RangeDelPositioningMode::kBackwardTraversal:
         assert(iter_ != rep_.end());
         while (iter_ != rep_.begin() &&
-               ucmp_->Compare(parsed.user_key, iter_->first) < 0) {
+               icmp_->Compare(parsed, iter_->first) < 0) {
           --iter_;
         }
         if (iter_ == rep_.begin() &&
-            ucmp_->Compare(parsed.user_key, iter_->first) < 0) {
+            icmp_->Compare(parsed, iter_->first) < 0) {
           // before start of deletion intervals
           return false;
         }
         break;
       case RangeDelPositioningMode::kBinarySearch:
-        iter_ = rep_.upper_bound(parsed.user_key);
+        iter_ = rep_.upper_bound(parsed);
         if (iter_ == rep_.begin()) {
           // before start of deletion intervals
           return false;

@@ -225,21 +260,22 @@ class CollapsedRangeDelMap : public RangeDelMap {
         break;
     }
     assert(iter_ != rep_.end() &&
-           ucmp_->Compare(iter_->first, parsed.user_key) <= 0);
+           icmp_->Compare(iter_->first, parsed) <= 0);
     assert(std::next(iter_) == rep_.end() ||
-           ucmp_->Compare(parsed.user_key, std::next(iter_)->first) < 0);
+           icmp_->Compare(parsed, std::next(iter_)->first) < 0);
     return parsed.sequence < iter_->second;
   }

-  bool IsRangeOverlapped(const Slice&, const Slice&) override {
+  bool IsRangeOverlapped(const ParsedInternalKey&,
+                         const ParsedInternalKey&) override {
     // Unimplemented because the only client of this method, file ingestion,
     // uses uncollapsed maps.
     fprintf(stderr, "CollapsedRangeDelMap::IsRangeOverlapped unimplemented");
     abort();
   }

-  void AddTombstone(RangeTombstone t) override {
-    if (ucmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) {
+  void AddTombstone(TruncatedRangeTombstone t) override {
+    if (icmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) {
       // The tombstone covers no keys. Nothing to do.
       return;
     }

@@ -272,7 +308,8 @@ class CollapsedRangeDelMap : public RangeDelMap {
     end_seq = prev_seq();
     Rep::iterator pit;
     if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() &&
-        ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) {
+        icmp_->Compare(pit->first, t.start_key_) == 0 &&
+        std::prev(pit)->second == t.seq_) {
       // The new tombstone starts at the end of an existing tombstone with an
       // identical seqno:
       //

@@ -297,7 +334,7 @@ class CollapsedRangeDelMap : public RangeDelMap {
     }

     // Look at all the existing transitions that overlap the new tombstone.
-    while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) {
+    while (it != rep_.end() && icmp_->Compare(it->first, t.end_key_) < 0) {
       if (t.seq_ >= it->second) {
         // The transition is to an existing tombstone that the new tombstone
         // covers. Save the covered tombstone's seqno. We'll need to return to

@@ -343,12 +380,14 @@ class CollapsedRangeDelMap : public RangeDelMap {
     if (t.seq_ == prev_seq()) {
       // The new tombstone is unterminated in the map.
-      if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) {
+      if (it != rep_.end() && t.seq_ == it->second &&
+          icmp_->Compare(it->first, t.end_key_) == 0) {
         // The new tombstone ends at the start of another tombstone with an
         // identical seqno. Merge the tombstones by removing the existing
         // tombstone's start key.
         rep_.erase(it);
-      } else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) {
+      } else if (end_seq == prev_seq() ||
+                 (it != rep_.end() && end_seq == it->second)) {
         // The new tombstone is implicitly ended because its end point is
         // contained within an existing tombstone with the same seqno:
         //

@@ -363,7 +402,8 @@ class CollapsedRangeDelMap : public RangeDelMap {
         // Install one that returns to the last seqno we covered. Because end
         // keys are exclusive, if there's an existing transition at t.end_key_,
         // it takes precedence over the transition that we install here.
-        rep_.emplace(t.end_key_, end_seq);  // emplace is a noop if existing entry
+        rep_.emplace(t.end_key_,
+                     end_seq);  // emplace is a noop if existing entry
       }
     } else {
       // The new tombstone is implicitly ended because its end point is covered

@@ -416,9 +456,9 @@ void RangeDelAggregator::InitRep(const std::vector<SequenceNumber>& snapshots) {
 std::unique_ptr<RangeDelMap> RangeDelAggregator::NewRangeDelMap() {
   RangeDelMap* tombstone_map;
   if (collapse_deletions_) {
-    tombstone_map = new CollapsedRangeDelMap(icmp_.user_comparator());
+    tombstone_map = new CollapsedRangeDelMap(&icmp_);
   } else {
-    tombstone_map = new UncollapsedRangeDelMap(icmp_.user_comparator());
+    tombstone_map = new UncollapsedRangeDelMap(&icmp_);
   }
   return std::unique_ptr<RangeDelMap>(tombstone_map);
 }

@@ -429,8 +469,9 @@ bool RangeDelAggregator::ShouldDeleteImpl(const Slice& internal_key,
   ParsedInternalKey parsed;
   if (!ParseInternalKey(internal_key, &parsed)) {
     assert(false);
+    return false;
   }
-  return ShouldDelete(parsed, mode);
+  return ShouldDeleteImpl(parsed, mode);
 }

 bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,

@@ -452,8 +493,10 @@ bool RangeDelAggregator::IsRangeOverlapped(const Slice& start,
   if (rep_ == nullptr) {
     return false;
   }
+  ParsedInternalKey start_ikey(start, kMaxSequenceNumber, kMaxValue);
+  ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
   for (const auto& stripe : rep_->stripe_map_) {
-    if (stripe.second->IsRangeOverlapped(start, end)) {
+    if (stripe.second->IsRangeOverlapped(start_ikey, end_ikey)) {
       return true;
     }
   }

@@ -492,40 +535,49 @@ Status RangeDelAggregator::AddTombstones(
     if (!parsed) {
       return Status::Corruption("Unable to parse range tombstone InternalKey");
     }
-    RangeTombstone tombstone;
+    Slice end_user_key;
     if (input->IsValuePinned()) {
-      tombstone = RangeTombstone(parsed_key, input->value());
+      end_user_key = input->value();
     } else {
       // The tombstone map holds slices into the iterator's memory. Make a
       // copy of the value if it is not pinned.
       rep_->pinned_slices_.emplace_back(input->value().data(),
                                         input->value().size());
-      tombstone = RangeTombstone(parsed_key, rep_->pinned_slices_.back());
+      end_user_key = rep_->pinned_slices_.back();
     }
+    ParsedInternalKey start_key(parsed_key.user_key, kMaxSequenceNumber,
+                                kMaxValue);
+    ParsedInternalKey end_key(end_user_key, kMaxSequenceNumber, kMaxValue);
     // Truncate the tombstone to the range [smallest, largest].
     if (smallest != nullptr) {
-      if (icmp_.user_comparator()->Compare(
-              tombstone.start_key_, smallest->user_key()) < 0) {
-        tombstone.start_key_ = smallest->user_key();
+      ParsedInternalKey parsed_smallest;
+      if (ParseInternalKey(smallest->Encode(), &parsed_smallest) &&
+          icmp_.Compare(start_key, parsed_smallest) < 0) {
+        start_key.user_key = parsed_smallest.user_key;
+        start_key.sequence = parsed_smallest.sequence;
       }
     }
     if (largest != nullptr) {
-      // To safely truncate the range tombstone's end key, it must extend past
-      // the largest key in the sstable (which may have been extended to the
-      // smallest key in the next sstable), and largest must be a tombstone
-      // sentinel key. A range tombstone may straddle two sstables and not be
-      // the tombstone sentinel key in the first sstable if a user-key also
-      // straddles the sstables (possible if there is a snapshot between the
-      // two versions of the user-key), in which case we cannot truncate the
-      // range tombstone.
-      if (icmp_.user_comparator()->Compare(tombstone.end_key_,
-                                           largest->user_key()) > 0 &&
-          GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber) {
-        tombstone.end_key_ = largest->user_key();
+      ParsedInternalKey parsed_largest;
+      if (ParseInternalKey(largest->Encode(), &parsed_largest) &&
+          icmp_.Compare(end_key, parsed_largest) > 0) {
+        end_key.user_key = parsed_largest.user_key;
+        if (parsed_largest.sequence != kMaxSequenceNumber) {
+          // The same user key straddles two adjacent sstables. To make sure we
+          // can truncate to a range that includes the largest point key in the
+          // first sstable, set the tombstone end key's sequence number to 1
+          // less than the largest key.
+          assert(parsed_largest.sequence != 0);
+          end_key.sequence = parsed_largest.sequence - 1;
+        } else {
+          // The SST file boundary was artificially extended by a range
+          // tombstone. We will not see any entries in this SST with this user
+          // key, so we can leave the seqnum at kMaxSequenceNumber.
+        }
       }
     }
-    auto seq = tombstone.seq_;
-    GetRangeDelMap(seq).AddTombstone(std::move(tombstone));
+    TruncatedRangeTombstone tombstone(start_key, end_key, parsed_key.sequence);
+    GetRangeDelMap(parsed_key.sequence).AddTombstone(std::move(tombstone));
     input->Next();
   }
   if (!first_iter) {

@@ -604,6 +656,11 @@ class MergingRangeDelIter : public RangeDelIterator {
   }

   void Seek(const Slice& target) override {
+    ParsedInternalKey ikey(target, kMaxSequenceNumber, kMaxValue);
+    Seek(ikey);
+  }
+
+  void Seek(const ParsedInternalKey& target) override {
     heap_.clear();
     for (auto& iter : iters_) {
       iter->Seek(target);

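A minimal standalone sketch of the end-key clamping rule implemented above (stand-in types; the real code works on ParsedInternalKey and InternalKeyComparator):

#include <cstdint>
#include <string>

constexpr uint64_t kMaxSeq = UINT64_MAX;

struct PKey {  // stand-in for ParsedInternalKey
  std::string user_key;
  uint64_t seq;
};

// a < b in internal-key order: user key ascending, then seqno descending.
bool Less(const PKey& a, const PKey& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key;
  return a.seq > b.seq;
}

// Clamp a tombstone's exclusive end key to the file boundary `largest`.
PKey TruncateEnd(PKey end, const PKey& largest) {
  if (Less(largest, end)) {  // tombstone extends past the boundary
    end.user_key = largest.user_key;
    if (largest.seq != kMaxSeq) {
      // Boundary is a real point key straddling into the next file: keep it
      // covered by ending just below it in internal-key order.
      end.seq = largest.seq - 1;
    } else {
      // Boundary was artificially extended by a range tombstone sentinel; no
      // real entry with this user key exists in the file.
      end.seq = kMaxSeq;
    }
  }
  return end;
}

int main() {
  PKey end{"f", kMaxSeq};                     // untruncated end of ["d", "f")
  PKey clamped = TruncateEnd(end, {"e", 3});  // file's largest key is "e"@3
  // clamped == {"e", 2}: "e"@3 stays covered; "e"@2 in the next file does not.
  return clamped.seq == 2 ? 0 : 1;
}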
@@ -40,6 +40,35 @@ enum class RangeDelPositioningMode {
   kBinarySearch,
 };

+// TruncatedRangeTombstones are a slight generalization of regular
+// RangeTombstones that can represent truncations caused by SST boundaries.
+// Instead of using user keys to represent the start and end keys, they
+// instead use internal keys, whose sequence number indicates the sequence
+// number of the smallest/largest SST key (in the case where a tombstone is
+// untruncated, the sequence numbers will be kMaxSequenceNumber for both start
+// and end keys). Like RangeTombstones, TruncatedRangeTombstones are also
+// end-key-exclusive.
+struct TruncatedRangeTombstone {
+  TruncatedRangeTombstone(const ParsedInternalKey& sk,
+                          const ParsedInternalKey& ek, SequenceNumber s)
+      : start_key_(sk), end_key_(ek), seq_(s) {}
+
+  RangeTombstone Tombstone() const {
+    // The RangeTombstone returned here can cover less than the
+    // TruncatedRangeTombstone when its end key has a seqnum that is not
+    // kMaxSequenceNumber. Since this method is only used by RangeDelIterators
+    // (which in turn are only used during flush/compaction), we avoid this
+    // problem by using truncation boundaries spanning multiple SSTs, which
+    // are selected in a way that guarantees a clean break at the end key.
+    assert(end_key_.sequence == kMaxSequenceNumber);
+    return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_);
+  }
+
+  ParsedInternalKey start_key_;
+  ParsedInternalKey end_key_;
+  SequenceNumber seq_;
+};
+
 // A RangeDelIterator iterates over range deletion tombstones.
 class RangeDelIterator {
  public:

@@ -47,7 +76,9 @@ class RangeDelIterator {
   virtual bool Valid() const = 0;
   virtual void Next() = 0;
+  // NOTE: the Slice passed to this method must be a user key.
   virtual void Seek(const Slice& target) = 0;
+  virtual void Seek(const ParsedInternalKey& target) = 0;
   virtual RangeTombstone Tombstone() const = 0;
 };

@@ -62,13 +93,14 @@ class RangeDelMap {
   virtual bool ShouldDelete(const ParsedInternalKey& parsed,
                             RangeDelPositioningMode mode) = 0;
-  virtual bool IsRangeOverlapped(const Slice& start, const Slice& end) = 0;
+  virtual bool IsRangeOverlapped(const ParsedInternalKey& start,
+                                 const ParsedInternalKey& end) = 0;
   virtual void InvalidatePosition() = 0;

   virtual size_t Size() const = 0;
   bool IsEmpty() const { return Size() == 0; }

-  virtual void AddTombstone(RangeTombstone tombstone) = 0;
+  virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0;
   virtual std::unique_ptr<RangeDelIterator> NewIterator() = 0;
 };

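The assertion inside Tombstone() above exists because converting back to a user-key-only tombstone is lossy when the end key carries a real seqno. A hedged standalone sketch (stand-in types, not the declarations above):

#include <cassert>
#include <cstdint>
#include <string>

constexpr uint64_t kMaxSeq = UINT64_MAX;

struct UserRangeTombstone {  // end expressed as a user key only
  std::string start, end;
  uint64_t seq;
};

struct TruncatedTombstone {
  std::string start_user_key, end_user_key;
  uint64_t start_seq, end_seq, seq;

  UserRangeTombstone ToUserTombstone() const {
    // If end_seq were a real seqno (e.g. "e"@2), dropping it would widen the
    // tombstone to cover *all* versions of "e", deleting keys it should not.
    // Compaction picks truncation boundaries at atomic-unit edges, where the
    // end seqno is always kMaxSequenceNumber, so the conversion is lossless.
    assert(end_seq == kMaxSeq);
    return {start_user_key, end_user_key, seq};
  }
};

int main() {
  TruncatedTombstone t{"b", "e", kMaxSeq, kMaxSeq, 10};
  UserRangeTombstone u = t.ToUserTombstone();  // safe: clean break at "e"
  return u.seq == 10 ? 0 : 1;
}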
@@ -27,6 +27,12 @@ enum Direction {
   kReverse,
 };

+struct AddTombstonesArgs {
+  const std::vector<RangeTombstone> tombstones;
+  const InternalKey* smallest;
+  const InternalKey* largest;
+};
+
 static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());

 void AddTombstones(RangeDelAggregator* range_del_agg,

@@ -54,8 +60,7 @@ void VerifyRangeDelIter(
     RangeDelIterator* range_del_iter,
     const std::vector<RangeTombstone>& expected_range_dels) {
   size_t i = 0;
-  for (; range_del_iter->Valid() && i < expected_range_dels.size();
-       range_del_iter->Next(), i++) {
+  for (; range_del_iter->Valid(); range_del_iter->Next(), i++) {
     VerifyTombstonesEq(expected_range_dels[i], range_del_iter->Tombstone());
   }
   ASSERT_EQ(expected_range_dels.size(), i);

@@ -63,22 +68,26 @@ void VerifyRangeDelIter(
 }

 void VerifyRangeDels(
-    const std::vector<RangeTombstone>& range_dels_in,
+    const std::vector<AddTombstonesArgs>& all_args,
     const std::vector<ExpectedPoint>& expected_points,
     const std::vector<RangeTombstone>& expected_collapsed_range_dels,
-    const InternalKey* smallest = nullptr, const InternalKey* largest = nullptr,
     const InternalKeyComparator& icmp = bytewise_icmp) {
   // Test same result regardless of which order the range deletions are added
   // and regardless of collapsed mode.
   for (bool collapsed : {false, true}) {
     for (Direction dir : {kForward, kReverse}) {
       RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, collapsed);
+      std::vector<RangeTombstone> all_range_dels;

-      std::vector<RangeTombstone> range_dels = range_dels_in;
-      if (dir == kReverse) {
-        std::reverse(range_dels.begin(), range_dels.end());
+      for (const auto& args : all_args) {
+        std::vector<RangeTombstone> range_dels = args.tombstones;
+        if (dir == kReverse) {
+          std::reverse(range_dels.begin(), range_dels.end());
+        }
+        all_range_dels.insert(all_range_dels.end(), range_dels.begin(),
+                              range_dels.end());
+        AddTombstones(&range_del_agg, range_dels, args.smallest, args.largest);
       }
-      AddTombstones(&range_del_agg, range_dels, smallest, largest);

       auto mode = RangeDelPositioningMode::kFullScan;
       if (collapsed) {

@@ -90,38 +99,45 @@ void VerifyRangeDels(
         parsed_key.user_key = expected_point.begin;
         parsed_key.sequence = expected_point.seq;
         parsed_key.type = kTypeValue;
-        ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
+        std::string ikey;
+        AppendInternalKey(&ikey, parsed_key);
+        ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode));
         if (parsed_key.sequence > 0) {
           --parsed_key.sequence;
+          ikey.clear();
+          AppendInternalKey(&ikey, parsed_key);
           if (expected_point.expectAlive) {
-            ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
+            ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode));
           } else {
-            ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode));
+            ASSERT_TRUE(range_del_agg.ShouldDelete(ikey, mode));
           }
         }
       }

       if (collapsed) {
-        range_dels = expected_collapsed_range_dels;
-        VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
-      } else if (smallest == nullptr && largest == nullptr) {
+        all_range_dels = expected_collapsed_range_dels;
+        VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels);
+      } else if (all_args.size() == 1 && all_args[0].smallest == nullptr &&
+                 all_args[0].largest == nullptr) {
         // Tombstones in an uncollapsed map are presented in start key
         // order. Tombstones with the same start key are presented in
         // insertion order. We don't handle tombstone truncation here, so the
         // verification is only performed if no truncation was requested.
-        std::stable_sort(range_dels.begin(), range_dels.end(),
+        std::stable_sort(all_range_dels.begin(), all_range_dels.end(),
                          [&](const RangeTombstone& a, const RangeTombstone& b) {
                            return icmp.user_comparator()->Compare(
                                       a.start_key_, b.start_key_) < 0;
                          });
-        VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
+        VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels);
       }
     }
   }

   RangeDelAggregator range_del_agg(icmp, {} /* snapshots */,
                                    false /* collapse_deletions */);
-  AddTombstones(&range_del_agg, range_dels_in);
+  for (const auto& args : all_args) {
+    AddTombstones(&range_del_agg, args.tombstones, args.smallest, args.largest);
+  }
   for (size_t i = 1; i < expected_points.size(); ++i) {
     bool overlapped = range_del_agg.IsRangeOverlapped(
         expected_points[i - 1].begin, expected_points[i].begin);

@@ -138,65 +154,64 @@ void VerifyRangeDels(
 TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}, {}); }

 TEST_F(RangeDelAggregatorTest, SameStartAndEnd) {
-  VerifyRangeDels({{"a", "a", 5}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {});
+  VerifyRangeDels({{{{"a", "a", 5}}}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {});
 }

 TEST_F(RangeDelAggregatorTest, Single) {
-  VerifyRangeDels({{"a", "b", 10}}, {{" ", 0}, {"a", 10}, {"b", 0}},
+  VerifyRangeDels({{{{"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 0}},
                   {{"a", "b", 10}});
 }

 TEST_F(RangeDelAggregatorTest, OverlapAboveLeft) {
-  VerifyRangeDels({{"a", "c", 10}, {"b", "d", 5}},
+  VerifyRangeDels({{{{"a", "c", 10}, {"b", "d", 5}}}},
                   {{" ", 0}, {"a", 10}, {"c", 5}, {"d", 0}},
                   {{"a", "c", 10}, {"c", "d", 5}});
 }

 TEST_F(RangeDelAggregatorTest, OverlapAboveRight) {
-  VerifyRangeDels({{"a", "c", 5}, {"b", "d", 10}},
+  VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 10}}}},
                   {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}},
                   {{"a", "b", 5}, {"b", "d", 10}});
 }

 TEST_F(RangeDelAggregatorTest, OverlapAboveMiddle) {
-  VerifyRangeDels({{"a", "d", 5}, {"b", "c", 10}},
+  VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 10}}}},
                   {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 5}, {"d", 0}},
                   {{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}});
 }

 TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) {
-  VerifyRangeDels({{"d", "a", 5}, {"c", "b", 10}},
+  VerifyRangeDels({{{{"d", "a", 5}, {"c", "b", 10}}}},
                   {{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}},
                   {{"d", "c", 5}, {"c", "b", 10}, {"b", "a", 5}},
-                  nullptr /* smallest */, nullptr /* largest */,
                   InternalKeyComparator(ReverseBytewiseComparator()));
 }

 TEST_F(RangeDelAggregatorTest, OverlapFully) {
-  VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}},
+  VerifyRangeDels({{{{"a", "d", 10}, {"b", "c", 5}}}},
                   {{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}});
 }

 TEST_F(RangeDelAggregatorTest, OverlapPoint) {
-  VerifyRangeDels({{"a", "b", 5}, {"b", "c", 10}},
+  VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 10}}}},
                   {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 0}},
                   {{"a", "b", 5}, {"b", "c", 10}});
 }

 TEST_F(RangeDelAggregatorTest, SameStartKey) {
-  VerifyRangeDels({{"a", "c", 5}, {"a", "b", 10}},
+  VerifyRangeDels({{{{"a", "c", 5}, {"a", "b", 10}}}},
                   {{" ", 0}, {"a", 10}, {"b", 5}, {"c", 0}},
                   {{"a", "b", 10}, {"b", "c", 5}});
 }

 TEST_F(RangeDelAggregatorTest, SameEndKey) {
-  VerifyRangeDels({{"a", "d", 5}, {"b", "d", 10}},
+  VerifyRangeDels({{{{"a", "d", 5}, {"b", "d", 10}}}},
                   {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}},
                   {{"a", "b", 5}, {"b", "d", 10}});
 }

 TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) {
-  VerifyRangeDels({{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}},
+  VerifyRangeDels({{{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}}},
                   {{" ", 0},
                    {"a", 5},
                    {"b", 0},

@@ -209,25 +224,25 @@ TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) {
 }

 TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) {
-  VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}},
+  VerifyRangeDels({{{{"a", "b", 5}, {"a", "b", 5}}}},
                   {{" ", 0}, {"a", 5}, {"b", 0}},
                   {{"a", "b", 5}});
 }

 TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) {
-  VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}},
+  VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 5}}}},
                   {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}},
                   {{"a", "c", 5}});
 }

 TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) {
-  VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}},
+  VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 5}}}},
                   {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}},
                   {{"a", "d", 5}});
 }

 TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) {
-  VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}},
+  VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 5}}}},
                   {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}},
                   {{"a", "d", 5}});
 }

@@ -236,27 +251,27 @@ TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) {
 // larger one when VerifyRangeDels() runs them in reverse
 TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) {
   VerifyRangeDels(
-      {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}},
+      {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}}},
       {{" ", 0}, {"a", 20}, {"f", 15}, {"g", 0}},
       {{"a", "f", 20}, {"f", "g", 15}});
 }

 TEST_F(RangeDelAggregatorTest, CoverMultipleFromRight) {
   VerifyRangeDels(
-      {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}},
+      {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}}},
       {{" ", 0}, {"b", 5}, {"c", 20}, {"h", 0}},
       {{"b", "c", 5}, {"c", "h", 20}});
 }

 TEST_F(RangeDelAggregatorTest, CoverMultipleFully) {
   VerifyRangeDels(
-      {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}},
+      {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}}},
       {{" ", 0}, {"a", 20}, {"h", 0}}, {{"a", "h", 20}});
 }

 TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) {
   VerifyRangeDels(
-      {{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}},
+      {{{{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}}},
       {{" ", 0}, {"a", 5}, {"b", 15}, {"d", 10}, {"e", 20}, {"g", 5}, {"h", 0}},
       {{"a", "b", 5},
        {"b", "d", 15},

@@ -321,31 +336,132 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) {
 }

 TEST_F(RangeDelAggregatorTest, TruncateTombstones) {
-  const InternalKey smallest("b", 1, kTypeRangeDeletion);
+  const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion);
   const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
   VerifyRangeDels(
-      {{"a", "c", 10}, {"d", "f", 10}},
+      {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
       {{"a", 10, true},   // truncated
        {"b", 10, false},  // not truncated
        {"d", 10, false},  // not truncated
        {"e", 10, true}},  // truncated
-      {{"b", "c", 10}, {"d", "e", 10}},
-      &smallest, &largest);
+      {{"b", "c", 10}, {"d", "e", 10}});
 }

-TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateTombstones) {
-  const InternalKey smallest("b", 1, kTypeRangeDeletion);
-  const InternalKey largest(
-      "e", 3,  // could happen if "e" is in consecutive sstables
-      kTypeValue);
-  VerifyRangeDels(
-      {{"a", "c", 10}, {"d", "f", 10}},
-      {{"a", 10, true},   // truncated
-       {"b", 10, false},  // not truncated
-       {"d", 10, false},  // not truncated
-       {"e", 10, false}},  // not truncated
-      {{"b", "c", 10}, {"d", "f", 10}},
-      &smallest, &largest);
+TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateBelowTombstone) {
+  const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest(
+      "e", 3,  // could happen if "e" is in consecutive sstables
+      kTypeValue);
+  VerifyRangeDels(
+      {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
+      {{"a", 10, true},   // truncated
+       {"b", 10, false},  // not truncated
+       {"d", 10, false},  // not truncated
+       {"e", 10, false},  // not truncated
+       {"e", 2, true}},   // truncated here
+      {{"b", "c", 10}, {"d", "e", 10}});
+}
+
+TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateAboveTombstone) {
+  const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest(
+      "e", 15,  // could happen if "e" is in consecutive sstables
+      kTypeValue);
+  VerifyRangeDels(
+      {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
+      {{"a", 10, true},   // truncated
+       {"b", 10, false},  // not truncated
+       {"d", 10, false},  // not truncated
+       {"e", kMaxSequenceNumber, true}},  // truncated
+      {{"b", "c", 10}, {"d", "e", 10}});
+}
+
+TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateBelowTombstone) {
+  const InternalKey smallest("b", 5, kTypeValue);
+  const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
+  VerifyRangeDels(
+      {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
+      {{"a", 10, true},   // truncated
+       {"b", 10, true},   // truncated
+       {"b", 6, false},   // not truncated; start boundary moved
+       {"d", 10, false},  // not truncated
+       {"e", kMaxSequenceNumber, true}},  // truncated
+      {{"b", "c", 10}, {"d", "e", 10}});
+}
+
+TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateAboveTombstone) {
+  const InternalKey smallest("b", 15, kTypeValue);
+  const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
+  VerifyRangeDels(
+      {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}},
+      {{"a", 10, true},   // truncated
+       {"b", 15, true},   // truncated
+       {"b", 10, false},  // not truncated
+       {"d", 10, false},  // not truncated
+       {"e", kMaxSequenceNumber, true}},  // truncated
+      {{"b", "c", 10}, {"d", "e", 10}});
+}
+
+TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapAboveTombstone) {
+  const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest1("c", 20, kTypeValue);
+  const InternalKey smallest2("c", 10, kTypeValue);
+  const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion);
+  VerifyRangeDels(
+      {{{{"b", "d", 5}}, &smallest1, &largest1},
+       {{{"b", "d", 5}}, &smallest2, &largest2}},
+      {{"b", 5, false},   // not truncated
+       {"c", 5, false}},  // not truncated
+      {{"b", "c", 5}, {"c", "d", 5}});  // not collapsed due to boundaries
+}
+
+TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapBelowTombstone) {
+  const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest1("c", 20, kTypeValue);
+  const InternalKey smallest2("c", 10, kTypeValue);
+  const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion);
+  VerifyRangeDels(
+      {{{{"b", "d", 30}}, &smallest1, &largest1},
+       {{{"b", "d", 30}}, &smallest2, &largest2}},
+      {{"b", 30, false},  // not truncated
+       {"c", 30, false},  // not truncated
+       {"c", 19, true},   // truncated here (keys in this range should not exist)
+       {"c", 11, false}},  // not truncated again
+      {{"b", "c", 30}, {"c", "d", 30}});  // not collapsed due to boundaries
+}
+
+TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapContainsTombstone) {
+  const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest1("c", 20, kTypeValue);
+  const InternalKey smallest2("c", 10, kTypeValue);
+  const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion);
+  VerifyRangeDels(
+      {{{{"b", "d", 15}}, &smallest1, &largest1},
+       {{{"b", "d", 15}}, &smallest2, &largest2}},
+      {{"b", 15, false},  // not truncated
+       {"c", 15, true},   // truncated (keys in this range should not exist)
+       {"c", 11, false}},  // not truncated here
+      {{"b", "c", 15}, {"c", "d", 15}});  // not collapsed due to boundaries
+}
+
+TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneAbove) {
+  const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest("a", 20, kTypeValue);
+  VerifyRangeDels(
+      {{{{"a", "b", 35}}, &smallest, &largest}},
+      {{"a", 40, true},    // not truncated
+       {"a", 35, false}},  // not truncated
+      {{"a", "a", 35}});  // empty tombstone but can't occur during a compaction
+}
+
+TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneBelow) {
+  const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest("a", 20, kTypeValue);
+  VerifyRangeDels(
+      {{{{"a", "b", 15}}, &smallest, &largest}},
+      {{"a", 20, true},   // truncated here
+       {"a", 15, true}},  // truncated
+      {{"a", "a", 15}});  // empty tombstone but can't occur during a compaction
 }

 }  // namespace rocksdb

@@ -183,7 +183,9 @@ InternalIterator* TableCache::NewIterator(
     const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
     RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
     TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
-    bool for_compaction, Arena* arena, bool skip_filters, int level) {
+    bool for_compaction, Arena* arena, bool skip_filters, int level,
+    const InternalKey* smallest_compaction_key,
+    const InternalKey* largest_compaction_key) {
   PERF_TIMER_GUARD(new_table_iterator_nanos);

   Status s;

@@ -266,10 +268,16 @@ InternalIterator* TableCache::NewIterator(
         s = range_del_iter->status();
       }
       if (s.ok()) {
-        s = range_del_agg->AddTombstones(
-            std::move(range_del_iter),
-            &file_meta.smallest,
-            &file_meta.largest);
+        const InternalKey* smallest = &file_meta.smallest;
+        const InternalKey* largest = &file_meta.largest;
+        if (smallest_compaction_key != nullptr) {
+          smallest = smallest_compaction_key;
+        }
+        if (largest_compaction_key != nullptr) {
+          largest = largest_compaction_key;
+        }
+        s = range_del_agg->AddTombstones(std::move(range_del_iter), smallest,
+                                         largest);
       }
     }
   }

@@ -56,7 +56,9 @@ class TableCache {
       const SliceTransform* prefix_extractor = nullptr,
       TableReader** table_reader_ptr = nullptr,
       HistogramImpl* file_read_hist = nullptr, bool for_compaction = false,
-      Arena* arena = nullptr, bool skip_filters = false, int level = -1);
+      Arena* arena = nullptr, bool skip_filters = false, int level = -1,
+      const InternalKey* smallest_compaction_key = nullptr,
+      const InternalKey* largest_compaction_key = nullptr);

   // If a seek to internal key "k" in specified file finds an entry,
   // call (*handle_result)(arg, found_key, found_value) repeatedly until

@@ -451,6 +451,7 @@ bool SomeFileOverlapsRange(
 }

 namespace {
+
 class LevelIterator final : public InternalIterator {
  public:
   LevelIterator(TableCache* table_cache, const ReadOptions& read_options,

@@ -459,7 +460,9 @@ class LevelIterator final : public InternalIterator {
                 const LevelFilesBrief* flevel,
                 const SliceTransform* prefix_extractor, bool should_sample,
                 HistogramImpl* file_read_hist, bool for_compaction,
-                bool skip_filters, int level, RangeDelAggregator* range_del_agg)
+                bool skip_filters, int level, RangeDelAggregator* range_del_agg,
+                const std::vector<AtomicCompactionUnitBoundary>*
+                    compaction_boundaries = nullptr)
       : table_cache_(table_cache),
         read_options_(read_options),
         env_options_(env_options),

@@ -473,7 +476,8 @@ class LevelIterator final : public InternalIterator {
         file_index_(flevel_->num_files),
         level_(level),
         range_del_agg_(range_del_agg),
-        pinned_iters_mgr_(nullptr) {
+        pinned_iters_mgr_(nullptr),
+        compaction_boundaries_(compaction_boundaries) {
     // Empty level is not supported.
     assert(flevel_ != nullptr && flevel_->num_files > 0);
   }

@@ -540,12 +544,18 @@ class LevelIterator final : public InternalIterator {
       sample_file_read_inc(file_meta.file_metadata);
     }

+    const InternalKey* smallest_compaction_key = nullptr;
+    const InternalKey* largest_compaction_key = nullptr;
+    if (compaction_boundaries_ != nullptr) {
+      smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+      largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+    }
     return table_cache_->NewIterator(
         read_options_, env_options_, icomparator_, *file_meta.file_metadata,
         range_del_agg_, prefix_extractor_,
         nullptr /* don't need reference to table */,
         file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_,
-        level_);
+        level_, smallest_compaction_key, largest_compaction_key);
   }

   TableCache* table_cache_;

@@ -565,6 +575,10 @@ class LevelIterator final : public InternalIterator {
   RangeDelAggregator* range_del_agg_;
   IteratorWrapper file_iter_;  // May be nullptr
   PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // To be propagated to RangeDelAggregator in order to safely truncate range
+  // tombstones.
+  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
 };

 void LevelIterator::Seek(const Slice& target) {

@@ -2134,60 +2148,6 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
                           true /* within_interval */);
 }

-namespace {
-
-const uint64_t kRangeTombstoneSentinel =
-    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
-
-// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
-// null which provides the property that a==null indicates a key that is less
-// than any key and b==null indicates a key that is greater than any key. Note
-// that the comparison is performed primarily on the user-key portion of the
-// key. If the user-keys compare equal, an additional test is made to sort
-// range tombstone sentinel keys before other keys with the same user-key. The
-// result is that 2 user-keys will compare equal if they differ purely on
-// their sequence number and value, but the range tombstone sentinel for that
-// user-key will compare not equal. This is necessary because the range
-// tombstone sentinel key is set as the largest key for an sstable even though
-// that key never appears in the database. We don't want adjacent sstables to
-// be considered overlapping if they are separated by the range tombstone
-// sentinel.
-int sstableKeyCompare(const Comparator* user_cmp,
-                      const InternalKey& a, const InternalKey& b) {
-  auto c = user_cmp->Compare(a.user_key(), b.user_key());
-  if (c != 0) {
-    return c;
-  }
-  auto a_footer = ExtractInternalKeyFooter(a.Encode());
-  auto b_footer = ExtractInternalKeyFooter(b.Encode());
-  if (a_footer == kRangeTombstoneSentinel) {
-    if (b_footer != kRangeTombstoneSentinel) {
-      return -1;
-    }
-  } else if (b_footer == kRangeTombstoneSentinel) {
-    return 1;
-  }
-  return 0;
-}
-
-int sstableKeyCompare(const Comparator* user_cmp,
-                      const InternalKey* a, const InternalKey& b) {
-  if (a == nullptr) {
-    return -1;
-  }
-  return sstableKeyCompare(user_cmp, *a, b);
-}
-
-int sstableKeyCompare(const Comparator* user_cmp,
-                      const InternalKey& a, const InternalKey* b) {
-  if (b == nullptr) {
-    return -1;
-  }
-  return sstableKeyCompare(user_cmp, a, *b);
-}
-
-}  // namespace
-
 // Store in "*inputs" all files in "level" that overlap [begin,end]
 // Employ binary search to find at least one file that overlaps the
 // specified range. From that file, iterate backwards and

@@ -4248,7 +4208,8 @@ InternalIterator* VersionSet::MakeInputIterator(
             false /* should_sample */,
             nullptr /* no per level latency histogram */,
             true /* for_compaction */, false /* skip_filters */,
-            static_cast<int>(which) /* level */, range_del_agg);
+            static_cast<int>(which) /* level */, range_del_agg,
+            c->boundaries(which));
       }
     }
   }
