Consider subcompaction boundaries when updating file boundaries for range deletion

Summary:
Adjusted AddToBuilder() to take lower_bound and upper_bound, which serve two purposes: (1) only range deletions overlapping with the interval [lower_bound, upper_bound) will be added to the output file, and (2) the output file's boundaries will not be extended before lower_bound or after upper_bound. Our computation of lower_bound/upper_bound considers both subcompaction boundaries and previous/next files within the subcompaction.

Test cases are available (level subcompactions: https://gist.github.com/ajkr/63c7eae3e9667c5ebdc0a7efb74ac332, and universal subcompactions: https://gist.github.com/ajkr/5a62af77c4ebe4052a1955c496d51fdb) but can't be included in this diff because they depend on the API being committed first. They fail before this change and pass after.
Closes https://github.com/facebook/rocksdb/pull/1501

Reviewed By: yhchiang

Differential Revision: D4171685

Pulled By: ajkr

fbshipit-source-id: ee99db8
main
Andrew Kryczka 8 years ago committed by Facebook Github Bot
parent 800e51553e
commit ec2f64794b
  1. 4
      db/builder.cc
  2. 33
      db/compaction_job.cc
  3. 83
      db/range_del_aggregator.cc
  4. 17
      db/range_del_aggregator.h

@ -146,8 +146,8 @@ Status BuildTable(
} }
} }
// nullptr for table_{min,max} so all range tombstones will be flushed // nullptr for table_{min,max} so all range tombstones will be flushed
range_del_agg->AddToBuilder(builder, true /* extend_before_min_key */, range_del_agg->AddToBuilder(builder, nullptr /* lower_bound */,
nullptr /* next_table_min_key*/, meta); nullptr /* upper_bound */, meta);
// Finish and check for builder errors // Finish and check for builder errors
bool empty = builder->NumEntries() == 0; bool empty = builder->NumEntries() == 0;

@ -972,14 +972,33 @@ Status CompactionJob::FinishCompactionOutputFile(
Status s = input_status; Status s = input_status;
auto meta = &sub_compact->current_output()->meta; auto meta = &sub_compact->current_output()->meta;
if (s.ok()) { if (s.ok()) {
Slice lower_bound_guard, upper_bound_guard;
const Slice *lower_bound, *upper_bound;
if (sub_compact->outputs.size() == 1) {
// For the first output table, include range tombstones before the min key // For the first output table, include range tombstones before the min key
// boundary. For subsequent output tables, this is unnecessary because we // but after the subcompaction boundary.
// extend each file's max key boundary up until the next file's min key when lower_bound = sub_compact->start;
// range tombstones fall in the gap. } else if (meta->smallest.size() > 0) {
range_del_agg->AddToBuilder( // For subsequent output tables, only include range tombstones from min
sub_compact->builder.get(), // key onwards since the previous file was extended to contain range
sub_compact->outputs.size() == 1 /* extend_before_min_key */, // tombstones falling before min key.
next_table_min_key, meta, bottommost_level_); lower_bound_guard = meta->smallest.user_key();
lower_bound = &lower_bound_guard;
} else {
lower_bound = nullptr;
}
if (next_table_min_key != nullptr) {
// This isn't the last file in the subcompaction, so extend until the next
// file starts.
upper_bound_guard = ExtractUserKey(*next_table_min_key);
upper_bound = &upper_bound_guard;
} else {
// This is the last file in the subcompaction, so extend until the
// subcompaction ends.
upper_bound = sub_compact->end;
}
range_del_agg->AddToBuilder(sub_compact->builder.get(), lower_bound,
upper_bound, meta, bottommost_level_);
} }
const uint64_t current_entries = sub_compact->builder->NumEntries(); const uint64_t current_entries = sub_compact->builder->NumEntries();
meta->marked_for_compaction = sub_compact->builder->NeedCompact(); meta->marked_for_compaction = sub_compact->builder->NeedCompact();

@ -113,9 +113,8 @@ RangeDelAggregator::TombstoneMap& RangeDelAggregator::GetTombstoneMap(
// tombstones are known to be available, without the code duplication we have // tombstones are known to be available, without the code duplication we have
// in ShouldAddTombstones(). It'll also allow us to move the table-modifying // in ShouldAddTombstones(). It'll also allow us to move the table-modifying
// code into more coherent places: CompactionJob and BuildTable(). // code into more coherent places: CompactionJob and BuildTable().
void RangeDelAggregator::AddToBuilder(TableBuilder* builder, void RangeDelAggregator::AddToBuilder(
bool extend_before_min_key, TableBuilder* builder, const Slice* lower_bound, const Slice* upper_bound,
const Slice* next_table_min_key,
FileMetaData* meta, FileMetaData* meta,
bool bottommost_level /* = false */) { bool bottommost_level /* = false */) {
auto stripe_map_iter = stripe_map_.begin(); auto stripe_map_iter = stripe_map_.begin();
@ -132,20 +131,20 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
while (stripe_map_iter != stripe_map_.end()) { while (stripe_map_iter != stripe_map_.end()) {
for (const auto& start_key_and_tombstone : stripe_map_iter->second) { for (const auto& start_key_and_tombstone : stripe_map_iter->second) {
const auto& tombstone = start_key_and_tombstone.second; const auto& tombstone = start_key_and_tombstone.second;
if (next_table_min_key != nullptr && if (upper_bound != nullptr &&
icmp_.user_comparator()->Compare(*next_table_min_key, icmp_.user_comparator()->Compare(*upper_bound,
tombstone.start_key_) < 0) { tombstone.start_key_) <= 0) {
// Tombstones starting after next_table_min_key only need to be included // Tombstones starting at upper_bound or later only need to be included
// in the next table. // in the next table. Break because subsequent tombstones will start
// even later.
break; break;
} }
if (!extend_before_min_key && meta->smallest.size() != 0 && if (lower_bound != nullptr &&
icmp_.user_comparator()->Compare(tombstone.end_key_, icmp_.user_comparator()->Compare(tombstone.end_key_,
meta->smallest.user_key()) < 0) { *lower_bound) <= 0) {
// Tombstones ending before this table's smallest key can conditionally // Tombstones ending before or at lower_bound only need to be included
// be excluded, e.g., when this table is a non-first compaction output, // in the prev table. Continue because subsequent tombstones may still
// we know such tombstones are included in the previous table. In that // overlap [lower_bound, upper_bound).
// case extend_before_min_key would be false.
continue; continue;
} }
@ -153,35 +152,49 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second); builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
if (!first_added) { if (!first_added) {
first_added = true; first_added = true;
if (extend_before_min_key && InternalKey smallest_candidate = std::move(ikey_and_end_key.first);;
(meta->smallest.size() == 0 || if (lower_bound != nullptr &&
icmp_.Compare(ikey_and_end_key.first, meta->smallest) < 0)) { icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
meta->smallest = ikey_and_end_key.first; *lower_bound) <= 0) {
} // Pretend the smallest key has the same user key as lower_bound
} // (the max key in the previous table or subcompaction) in order for
auto end_ikey = tombstone.SerializeEndKey(); // files to appear key-space partitioned.
if (meta->largest.size() == 0 || //
icmp_.Compare(meta->largest, end_ikey) < 0) { // Choose lowest seqnum so this file's smallest internal key comes
if (next_table_min_key != nullptr && // after the previous file's/subcompaction's largest. The fake seqnum
icmp_.Compare(*next_table_min_key, end_ikey.Encode()) < 0) { // is OK because the read path's file-picking code only considers user
// Pretend the largest key has the same user key as the min key in the // key.
// following table in order for files to appear key-space partitioned. smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
// Choose highest seqnum so this file's largest comes before the next }
// file's smallest. The fake seqnum is OK because the read path's if (meta->smallest.size() == 0 ||
// file-picking code only considers the user key portion. icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
meta->smallest = std::move(smallest_candidate);
}
}
InternalKey largest_candidate = tombstone.SerializeEndKey();
if (upper_bound != nullptr &&
icmp_.user_comparator()->Compare(*upper_bound,
largest_candidate.user_key()) <= 0) {
// Pretend the largest key has the same user key as upper_bound (the
// min key in the following table or subcompaction) in order for files
// to appear key-space partitioned.
//
// Choose highest seqnum so this file's largest internal key comes
// before the next file's/subcompaction's smallest. The fake seqnum is
// OK because the read path's file-picking code only considers the user
// key portion.
// //
// Note Seek() also creates InternalKey with (user_key, // Note Seek() also creates InternalKey with (user_key,
// kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
// kTypeRangeDeletion (0xF), so the range tombstone comes before the // kTypeRangeDeletion (0xF), so the range tombstone comes before the
// Seek() key in InternalKey's ordering. So Seek() will look in the // Seek() key in InternalKey's ordering. So Seek() will look in the
// next file for the user key. // next file for the user key.
ParsedInternalKey parsed; largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber,
ParseInternalKey(*next_table_min_key, &parsed);
meta->largest = InternalKey(parsed.user_key, kMaxSequenceNumber,
kTypeRangeDeletion); kTypeRangeDeletion);
} else {
meta->largest = std::move(end_ikey);
} }
if (meta->largest.size() == 0 ||
icmp_.Compare(meta->largest, largest_candidate) < 0) {
meta->largest = std::move(largest_candidate);
} }
meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_); meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_);
meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_); meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_);

@ -56,19 +56,24 @@ class RangeDelAggregator {
// @param extend_before_min_key If true, the range of tombstones to be added // @param extend_before_min_key If true, the range of tombstones to be added
// to the TableBuilder starts from the beginning of the key-range; // to the TableBuilder starts from the beginning of the key-range;
// otherwise, it starts from meta->smallest. // otherwise, it starts from meta->smallest.
// @param next_table_min_key If nullptr, the range of tombstones to be added // @param lower_bound/upper_bound Any range deletion with [start_key, end_key)
// to the TableBuilder ends at the end of the key-range; otherwise, it // that overlaps the target range [*lower_bound, *upper_bound) is added to
// ends at next_table_min_key. // the builder. If lower_bound is nullptr, the target range extends
// infinitely to the left. If upper_bound is nullptr, the target range
// extends infinitely to the right. If both are nullptr, the target range
// extends infinitely in both directions, i.e., all range deletions are
// added to the builder.
// @param meta The file's metadata. We modify the begin and end keys according // @param meta The file's metadata. We modify the begin and end keys according
// to the range tombstones added to this file such that the read path does // to the range tombstones added to this file such that the read path does
// not miss range tombstones that cover gaps before/after/between files in // not miss range tombstones that cover gaps before/after/between files in
// a level. // a level. lower_bound/upper_bound above constrain how far file boundaries
// can be extended.
// @param bottommost_level If true, we will filter out any tombstones // @param bottommost_level If true, we will filter out any tombstones
// belonging to the oldest snapshot stripe, because all keys potentially // belonging to the oldest snapshot stripe, because all keys potentially
// covered by this tombstone are guaranteed to have been deleted by // covered by this tombstone are guaranteed to have been deleted by
// compaction. // compaction.
void AddToBuilder(TableBuilder* builder, bool extend_before_min_key, void AddToBuilder(TableBuilder* builder, const Slice* lower_bound,
const Slice* next_table_min_key, FileMetaData* meta, const Slice* upper_bound, FileMetaData* meta,
bool bottommost_level = false); bool bottommost_level = false);
Arena* GetArena() { return &arena_; } Arena* GetArena() { return &arena_; }
bool IsEmpty(); bool IsEmpty();

Loading…
Cancel
Save