Consider subcompaction boundaries when updating file boundaries for range deletion

Summary:
Adjusted AddToBuilder() to take lower_bound and upper_bound, which serve two purposes: (1) only range deletions overlapping the interval [lower_bound, upper_bound) are added to the output file, and (2) the output file's boundaries are not extended before lower_bound or after upper_bound. Our computation of lower_bound/upper_bound considers both the subcompaction boundaries and the previous/next files within the subcompaction.
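
The contract can be pictured with a small user-key-only sketch (illustration only, not code from this change; the Tombstone/Emit/Extend names are made up, and the real implementation operates on InternalKey/Slice as shown in the diff below): a tombstone is written to the output file only if it overlaps [lower_bound, upper_bound), and the file's boundaries are clamped to those bounds.

    #include <algorithm>
    #include <cassert>
    #include <string>

    // Hypothetical, user-key-only model of the AddToBuilder() contract.
    struct Tombstone {
      std::string start_key;  // inclusive
      std::string end_key;    // exclusive
    };

    // A tombstone is emitted iff it overlaps [lower_bound, upper_bound);
    // nullptr means unbounded on that side.
    bool Emit(const Tombstone& t, const std::string* lower_bound,
              const std::string* upper_bound) {
      if (upper_bound != nullptr && *upper_bound <= t.start_key) return false;
      if (lower_bound != nullptr && t.end_key <= *lower_bound) return false;
      return true;
    }

    // Emitted tombstones may extend the file's boundaries, but never past the
    // given bounds.
    void Extend(const Tombstone& t, const std::string* lower_bound,
                const std::string* upper_bound, std::string* smallest,
                std::string* largest) {
      std::string lo = t.start_key, hi = t.end_key;
      if (lower_bound != nullptr) lo = std::max(lo, *lower_bound);
      if (upper_bound != nullptr) hi = std::min(hi, *upper_bound);
      if (smallest->empty() || lo < *smallest) *smallest = lo;
      if (largest->empty() || *largest < hi) *largest = hi;
    }

    int main() {
      Tombstone t{"b", "m"};
      std::string lower = "d", upper = "k", smallest, largest;
      assert(Emit(t, &lower, &upper));
      Extend(t, &lower, &upper, &smallest, &largest);
      assert(smallest == "d" && largest == "k");  // clamped to the bounds
      return 0;
    }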

Test cases are here (level subcompactions: https://gist.github.com/ajkr/63c7eae3e9667c5ebdc0a7efb74ac332, universal subcompactions: https://gist.github.com/ajkr/5a62af77c4ebe4052a1955c496d51fdb), but they can't be included in this diff because they depend on committing the API first. They fail before this change and pass after it.
Closes https://github.com/facebook/rocksdb/pull/1501

Reviewed By: yhchiang

Differential Revision: D4171685

Pulled By: ajkr

fbshipit-source-id: ee99db8
Branch: main
Author: Andrew Kryczka, committed by Facebook GitHub Bot
Commit: ec2f64794b (parent 800e51553e)
Changed files:
  db/builder.cc              (4 changes)
  db/compaction_job.cc       (33 changes)
  db/range_del_aggregator.cc (83 changes)
  db/range_del_aggregator.h  (17 changes)

db/builder.cc
@@ -146,8 +146,8 @@ Status BuildTable(
       }
     }
     // nullptr for table_{min,max} so all range tombstones will be flushed
-    range_del_agg->AddToBuilder(builder, true /* extend_before_min_key */,
-                                nullptr /* next_table_min_key*/, meta);
+    range_del_agg->AddToBuilder(builder, nullptr /* lower_bound */,
+                                nullptr /* upper_bound */, meta);
     // Finish and check for builder errors
     bool empty = builder->NumEntries() == 0;

db/compaction_job.cc
@@ -972,14 +972,33 @@ Status CompactionJob::FinishCompactionOutputFile(
   Status s = input_status;
   auto meta = &sub_compact->current_output()->meta;
   if (s.ok()) {
-    // For the first output table, include range tombstones before the min key
-    // boundary. For subsequent output tables, this is unnecessary because we
-    // extend each file's max key boundary up until the next file's min key when
-    // range tombstones fall in the gap.
-    range_del_agg->AddToBuilder(
-        sub_compact->builder.get(),
-        sub_compact->outputs.size() == 1 /* extend_before_min_key */,
-        next_table_min_key, meta, bottommost_level_);
+    Slice lower_bound_guard, upper_bound_guard;
+    const Slice *lower_bound, *upper_bound;
+    if (sub_compact->outputs.size() == 1) {
+      // For the first output table, include range tombstones before the min key
+      // but after the subcompaction boundary.
+      lower_bound = sub_compact->start;
+    } else if (meta->smallest.size() > 0) {
+      // For subsequent output tables, only include range tombstones from min
+      // key onwards since the previous file was extended to contain range
+      // tombstones falling before min key.
+      lower_bound_guard = meta->smallest.user_key();
+      lower_bound = &lower_bound_guard;
+    } else {
+      lower_bound = nullptr;
+    }
+    if (next_table_min_key != nullptr) {
+      // This isn't the last file in the subcompaction, so extend until the next
+      // file starts.
+      upper_bound_guard = ExtractUserKey(*next_table_min_key);
+      upper_bound = &upper_bound_guard;
+    } else {
+      // This is the last file in the subcompaction, so extend until the
+      // subcompaction ends.
+      upper_bound = sub_compact->end;
+    }
+    range_del_agg->AddToBuilder(sub_compact->builder.get(), lower_bound,
+                                upper_bound, meta, bottommost_level_);
   }
   const uint64_t current_entries = sub_compact->builder->NumEntries();
   meta->marked_for_compaction = sub_compact->builder->NeedCompact();
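
For illustration, the bound-selection logic above can be restated over simplified types (a sketch only: the FileBounds/ChooseBounds names and the std::optional/std::string stand-ins for SubcompactionState and Slice are invented here; only the branching mirrors the diff):

    #include <optional>
    #include <string>

    // Hypothetical stand-in for the bounds passed to AddToBuilder();
    // an empty optional corresponds to a nullptr (unbounded) side.
    struct FileBounds {
      std::optional<std::string> lower_bound;
      std::optional<std::string> upper_bound;
    };

    FileBounds ChooseBounds(bool is_first_output_file,
                            const std::optional<std::string>& file_smallest_key,
                            const std::optional<std::string>& subcompaction_start,
                            const std::optional<std::string>& subcompaction_end,
                            const std::optional<std::string>& next_table_min_key) {
      FileBounds b;
      if (is_first_output_file) {
        // First file: take tombstones starting from the subcompaction's start
        // boundary.
        b.lower_bound = subcompaction_start;
      } else {
        // Later files: the previous file was already extended to cover
        // everything before this file's smallest key (if any).
        b.lower_bound = file_smallest_key;
      }
      if (next_table_min_key.has_value()) {
        // Not the last file: extend only up to where the next file begins.
        b.upper_bound = next_table_min_key;
      } else {
        // Last file: extend up to the subcompaction's end boundary.
        b.upper_bound = subcompaction_end;
      }
      return b;
    }

    int main() {
      // Middle file of a subcompaction over ["d", "p"): bounded below by its own
      // smallest key and above by where the next file starts.
      FileBounds b = ChooseBounds(/*is_first_output_file=*/false,
                                  /*file_smallest_key=*/std::string("f"),
                                  /*subcompaction_start=*/std::string("d"),
                                  /*subcompaction_end=*/std::string("p"),
                                  /*next_table_min_key=*/std::string("j"));
      return (b.lower_bound == "f" && b.upper_bound == "j") ? 0 : 1;
    }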

db/range_del_aggregator.cc
@@ -113,9 +113,8 @@ RangeDelAggregator::TombstoneMap& RangeDelAggregator::GetTombstoneMap(
 // tombstones are known to be available, without the code duplication we have
 // in ShouldAddTombstones(). It'll also allow us to move the table-modifying
 // code into more coherent places: CompactionJob and BuildTable().
-void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
-                                      bool extend_before_min_key,
-                                      const Slice* next_table_min_key,
-                                      FileMetaData* meta,
-                                      bool bottommost_level /* = false */) {
+void RangeDelAggregator::AddToBuilder(
+    TableBuilder* builder, const Slice* lower_bound, const Slice* upper_bound,
+    FileMetaData* meta,
+    bool bottommost_level /* = false */) {
   auto stripe_map_iter = stripe_map_.begin();
@@ -132,20 +131,20 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
   while (stripe_map_iter != stripe_map_.end()) {
     for (const auto& start_key_and_tombstone : stripe_map_iter->second) {
       const auto& tombstone = start_key_and_tombstone.second;
-      if (next_table_min_key != nullptr &&
-          icmp_.user_comparator()->Compare(*next_table_min_key,
-                                           tombstone.start_key_) < 0) {
-        // Tombstones starting after next_table_min_key only need to be included
-        // in the next table.
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           tombstone.start_key_) <= 0) {
+        // Tombstones starting at upper_bound or later only need to be included
+        // in the next table. Break because subsequent tombstones will start
+        // even later.
         break;
       }
-      if (!extend_before_min_key && meta->smallest.size() != 0 &&
+      if (lower_bound != nullptr &&
           icmp_.user_comparator()->Compare(tombstone.end_key_,
-                                           meta->smallest.user_key()) < 0) {
-        // Tombstones ending before this table's smallest key can conditionally
-        // be excluded, e.g., when this table is a non-first compaction output,
-        // we know such tombstones are included in the previous table. In that
-        // case extend_before_min_key would be false.
+                                           *lower_bound) <= 0) {
+        // Tombstones ending before or at lower_bound only need to be included
+        // in the prev table. Continue because subsequent tombstones may still
+        // overlap [lower_bound, upper_bound).
         continue;
       }
@@ -153,35 +152,49 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
       builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
       if (!first_added) {
         first_added = true;
-        if (extend_before_min_key &&
-            (meta->smallest.size() == 0 ||
-             icmp_.Compare(ikey_and_end_key.first, meta->smallest) < 0)) {
-          meta->smallest = ikey_and_end_key.first;
-        }
-      }
-      auto end_ikey = tombstone.SerializeEndKey();
-      if (meta->largest.size() == 0 ||
-          icmp_.Compare(meta->largest, end_ikey) < 0) {
-        if (next_table_min_key != nullptr &&
-            icmp_.Compare(*next_table_min_key, end_ikey.Encode()) < 0) {
-          // Pretend the largest key has the same user key as the min key in the
-          // following table in order for files to appear key-space partitioned.
-          // Choose highest seqnum so this file's largest comes before the next
-          // file's smallest. The fake seqnum is OK because the read path's
-          // file-picking code only considers the user key portion.
-          ParsedInternalKey parsed;
-          ParseInternalKey(*next_table_min_key, &parsed);
-          meta->largest = InternalKey(parsed.user_key, kMaxSequenceNumber,
-                                      kTypeRangeDeletion);
-        } else {
-          meta->largest = std::move(end_ikey);
-        }
-      }
+        InternalKey smallest_candidate = std::move(ikey_and_end_key.first);
+        if (lower_bound != nullptr &&
+            icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
+                                             *lower_bound) <= 0) {
+          // Pretend the smallest key has the same user key as lower_bound
+          // (the max key in the previous table or subcompaction) in order for
+          // files to appear key-space partitioned.
+          //
+          // Choose lowest seqnum so this file's smallest internal key comes
+          // after the previous file's/subcompaction's largest. The fake seqnum
+          // is OK because the read path's file-picking code only considers user
+          // key.
+          smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+        }
+        if (meta->smallest.size() == 0 ||
+            icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
+          meta->smallest = std::move(smallest_candidate);
+        }
+      }
+      InternalKey largest_candidate = tombstone.SerializeEndKey();
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           largest_candidate.user_key()) <= 0) {
+        // Pretend the largest key has the same user key as upper_bound (the
+        // min key in the following table or subcompaction) in order for files
+        // to appear key-space partitioned.
+        //
+        // Choose highest seqnum so this file's largest internal key comes
+        // before the next file's/subcompaction's smallest. The fake seqnum is
+        // OK because the read path's file-picking code only considers the user
+        // key portion.
+        //
+        // Note Seek() also creates InternalKey with (user_key,
+        // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+        // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+        // Seek() key in InternalKey's ordering. So Seek() will look in the
+        // next file for the user key.
+        largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber,
+                                        kTypeRangeDeletion);
+      }
+      if (meta->largest.size() == 0 ||
+          icmp_.Compare(meta->largest, largest_candidate) < 0) {
+        meta->largest = std::move(largest_candidate);
+      }
       meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_);
       meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_);
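
The "fake seqnum" comments above can be sanity-checked against a simplified model of internal key ordering (illustration only; ModelInternalKey and InternalKeyLess are invented here, and the real InternalKeyComparator compares an encoded (sequence << 8 | type) trailer in descending order after the user key; the type constants mirror the values named in the comments):

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <tuple>

    // Simplified model of an internal key: (user_key, sequence, type).
    // Ordering: ascending user key, then descending (sequence, type).
    struct ModelInternalKey {
      std::string user_key;
      uint64_t sequence;
      uint8_t type;
    };

    constexpr uint64_t kMaxSequenceNumber = (1ull << 56) - 1;
    constexpr uint8_t kTypeDeletion = 0x7;       // also used by Seek()'s lookup key
    constexpr uint8_t kTypeRangeDeletion = 0xF;

    bool InternalKeyLess(const ModelInternalKey& a, const ModelInternalKey& b) {
      if (a.user_key != b.user_key) return a.user_key < b.user_key;
      // A larger (sequence, type) trailer sorts earlier in internal key order.
      return std::make_tuple(a.sequence, a.type) >
             std::make_tuple(b.sequence, b.type);
    }

    int main() {
      const std::string k = "boundary_user_key";
      // Fake largest of file N and fake smallest of file N+1 at the same
      // user-key boundary: the largest (seq = max) orders before the smallest
      // (seq = 0), so the files still appear key-space partitioned.
      ModelInternalKey largest_of_prev{k, kMaxSequenceNumber, kTypeRangeDeletion};
      ModelInternalKey smallest_of_next{k, 0, kTypeRangeDeletion};
      assert(InternalKeyLess(largest_of_prev, smallest_of_next));
      // A Seek(k) lookup key orders after the fake largest, so the read path
      // continues into the next file for this user key.
      ModelInternalKey seek_key{k, kMaxSequenceNumber, kTypeDeletion};
      assert(InternalKeyLess(largest_of_prev, seek_key));
      return 0;
    }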

db/range_del_aggregator.h
@@ -56,19 +56,24 @@ class RangeDelAggregator {
-  // @param extend_before_min_key If true, the range of tombstones to be added
-  //    to the TableBuilder starts from the beginning of the key-range;
-  //    otherwise, it starts from meta->smallest.
-  // @param next_table_min_key If nullptr, the range of tombstones to be added
-  //    to the TableBuilder ends at the end of the key-range; otherwise, it
-  //    ends at next_table_min_key.
+  // @param lower_bound/upper_bound Any range deletion with [start_key, end_key)
+  //    that overlaps the target range [*lower_bound, *upper_bound) is added to
+  //    the builder. If lower_bound is nullptr, the target range extends
+  //    infinitely to the left. If upper_bound is nullptr, the target range
+  //    extends infinitely to the right. If both are nullptr, the target range
+  //    extends infinitely in both directions, i.e., all range deletions are
+  //    added to the builder.
   // @param meta The file's metadata. We modify the begin and end keys according
   //    to the range tombstones added to this file such that the read path does
   //    not miss range tombstones that cover gaps before/after/between files in
-  //    a level.
+  //    a level. lower_bound/upper_bound above constrain how far file boundaries
+  //    can be extended.
   // @param bottommost_level If true, we will filter out any tombstones
   //    belonging to the oldest snapshot stripe, because all keys potentially
   //    covered by this tombstone are guaranteed to have been deleted by
   //    compaction.
-  void AddToBuilder(TableBuilder* builder, bool extend_before_min_key,
-                    const Slice* next_table_min_key, FileMetaData* meta,
+  void AddToBuilder(TableBuilder* builder, const Slice* lower_bound,
+                    const Slice* upper_bound, FileMetaData* meta,
                     bool bottommost_level = false);
   Arena* GetArena() { return &arena_; }
   bool IsEmpty();
