Handle tombstones at the same seqno in the CollapsedRangeDelMap (#4424)

Summary:
The CollapsedRangeDelMap was entirely mishandling tombstones at the same
sequence number when the tombstones did not have identical start and end
keys. Such tombstones are common since 90fc40690, which causes
tombstones to be split during compactions.

For example, if the tombstone [a, c) @ 1 lies across a compaction
boundary at b, it will be split into [a, b) @ 1 and [b, c) @ 1. Without
this patch, the collapsed range deletion map would look like this:

  a -> 1
  b -> 1
  c -> 0

Notice how the b -> 1 entry is redundant. When the tombstones overlap,
the problem is even worse. Consider tombstones [a, c) @ 1 and [b, d) @
1, which produces this map without this patch:

  a -> 1
  b -> 1
  c -> 0
  d -> 0

This map is corrupt, as a map can never contain adjacent sentinel (zero)
entries. When the iterator advances from b to c, it will notice that c
is a sentinel enty and skip to d--but d is also a sentinel entry! Asking
what tombstone this iterator points to will trigger an assertion, as it
is not pointing to a valid tombstone.

/cc ajkr
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4424

Differential Revision: D10039248

Pulled By: abhimadan

fbshipit-source-id: 6d737c1e88d60e80cf27286726627ba44463e7f4
main
Nikhil Benesch 6 years ago committed by Facebook Github Bot
parent 31d46993cc
commit 17edc82a4b
  1. 60
      db/range_del_aggregator.cc
  2. 24
      db/range_del_aggregator_test.cc

@ -269,22 +269,36 @@ class CollapsedRangeDelMap : public RangeDelMap {
// 2: c--- OR 2: c--- OR 2: c--- OR 2: c------
// 1: A--C 1: 1: A------ 1: C------
// ^ ^ ^ ^
// Insert a new transition at the new tombstone's start point, or raise
// the existing transition at that point to the new tombstone's seqno.
end_seq = prev_seq();
rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry
Rep::iterator pit;
if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() &&
ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) {
// The new tombstone starts at the end of an existing tombstone with an
// identical seqno:
//
// 3:
// 2: A--C---
// 1:
// ^
// Merge the tombstones by removing the existing tombstone's end key.
it = rep_.erase(std::prev(it));
} else {
// Insert a new transition at the new tombstone's start point, or raise
// the existing transition at that point to the new tombstone's seqno.
rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry
}
} else {
// The new tombstone's start point is covered by an existing tombstone:
//
// 3: A----- OR 3: C------
// 2: c--- 2: c------
// ^ ^
// 3: A----- OR 3: C------ OR
// 2: c--- 2: c------ 2: C------
// ^ ^ ^
// Do nothing.
}
// Look at all the existing transitions that overlap the new tombstone.
while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) {
if (t.seq_ > it->second) {
if (t.seq_ >= it->second) {
// The transition is to an existing tombstone that the new tombstone
// covers. Save the covered tombstone's seqno. We'll need to return to
// it if the new tombstone ends before the existing tombstone.
@ -328,15 +342,29 @@ class CollapsedRangeDelMap : public RangeDelMap {
}
if (t.seq_ == prev_seq()) {
// The new tombstone is unterminated in the map:
//
// 3: OR 3: --G OR 3: --G K--
// 2: C-------k 2: G---k 2: G---k
// ^ ^ ^
// End it now, returning to the last seqno we covered. Because end keys
// are exclusive, if there's an existing transition at t.end_key_, it
// takes precedence over the transition that we install here.
rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry
// The new tombstone is unterminated in the map.
if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) {
// The new tombstone ends at the start of another tombstone with an
// identical seqno. Merge the tombstones by removing the existing
// tombstone's start key.
rep_.erase(it);
} else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) {
// The new tombstone is implicitly ended because its end point is
// contained within an existing tombstone with the same seqno:
//
// 2: ---k--N
// ^
} else {
// The new tombstone needs an explicit end point.
//
// 3: OR 3: --G OR 3: --G K--
// 2: C-------k 2: G---k 2: G---k
// ^ ^ ^
// Install one that returns to the last seqno we covered. Because end
// keys are exclusive, if there's an existing transition at t.end_key_,
// it takes precedence over the transition that we install here.
rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry
}
} else {
// The new tombstone is implicitly ended because its end point is covered
// by an existing tombstone with a higher seqno.

@ -208,6 +208,30 @@ TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) {
{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}});
}
TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) {
VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}},
{{" ", 0}, {"a", 5}, {"b", 0}},
{{"a", "b", 5}});
}
TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) {
VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}},
{{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}},
{{"a", "c", 5}});
}
TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) {
VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}},
{{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}},
{{"a", "d", 5}});
}
TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) {
VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}},
{{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}},
{{"a", "d", 5}});
}
// Note the Cover* tests also test cases where tombstones are inserted under a
// larger one when VerifyRangeDels() runs them in reverse
TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) {

Loading…
Cancel
Save