diff --git a/db/builder.cc b/db/builder.cc
index 2e142a239..4c5e27b11 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -146,8 +146,8 @@ Status BuildTable(
       }
     }
     // nullptr for table_{min,max} so all range tombstones will be flushed
-    range_del_agg->AddToBuilder(builder, true /* extend_before_min_key */,
-                                nullptr /* next_table_min_key*/, meta);
+    range_del_agg->AddToBuilder(builder, nullptr /* lower_bound */,
+                                nullptr /* upper_bound */, meta);

     // Finish and check for builder errors
     bool empty = builder->NumEntries() == 0;
diff --git a/db/compaction_job.cc b/db/compaction_job.cc
index 04bd3a734..091c91856 100644
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@@ -972,14 +972,33 @@ Status CompactionJob::FinishCompactionOutputFile(
   Status s = input_status;
   auto meta = &sub_compact->current_output()->meta;
   if (s.ok()) {
-    // For the first output table, include range tombstones before the min key
-    // boundary. For subsequent output tables, this is unnecessary because we
-    // extend each file's max key boundary up until the next file's min key when
-    // range tombstones fall in the gap.
-    range_del_agg->AddToBuilder(
-        sub_compact->builder.get(),
-        sub_compact->outputs.size() == 1 /* extend_before_min_key */,
-        next_table_min_key, meta, bottommost_level_);
+    Slice lower_bound_guard, upper_bound_guard;
+    const Slice *lower_bound, *upper_bound;
+    if (sub_compact->outputs.size() == 1) {
+      // For the first output table, include range tombstones before the min key
+      // but after the subcompaction boundary.
+      lower_bound = sub_compact->start;
+    } else if (meta->smallest.size() > 0) {
+      // For subsequent output tables, only include range tombstones from min
+      // key onwards since the previous file was extended to contain range
+      // tombstones falling before min key.
+      lower_bound_guard = meta->smallest.user_key();
+      lower_bound = &lower_bound_guard;
+    } else {
+      lower_bound = nullptr;
+    }
+    if (next_table_min_key != nullptr) {
+      // This isn't the last file in the subcompaction, so extend until the next
+      // file starts.
+      upper_bound_guard = ExtractUserKey(*next_table_min_key);
+      upper_bound = &upper_bound_guard;
+    } else {
+      // This is the last file in the subcompaction, so extend until the
+      // subcompaction ends.
+      upper_bound = sub_compact->end;
+    }
+    range_del_agg->AddToBuilder(sub_compact->builder.get(), lower_bound,
+                                upper_bound, meta, bottommost_level_);
   }
   const uint64_t current_entries = sub_compact->builder->NumEntries();
   meta->marked_for_compaction = sub_compact->builder->NeedCompact();
diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index ad68eee09..64b2dcdf4 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -113,11 +113,10 @@ RangeDelAggregator::TombstoneMap& RangeDelAggregator::GetTombstoneMap(
 // tombstones are known to be available, without the code duplication we have
 // in ShouldAddTombstones(). It'll also allow us to move the table-modifying
 // code into more coherent places: CompactionJob and BuildTable().
-void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
-                                      bool extend_before_min_key,
-                                      const Slice* next_table_min_key,
-                                      FileMetaData* meta,
-                                      bool bottommost_level /* = false */) {
+void RangeDelAggregator::AddToBuilder(
+    TableBuilder* builder, const Slice* lower_bound, const Slice* upper_bound,
+    FileMetaData* meta,
+    bool bottommost_level /* = false */) {
   auto stripe_map_iter = stripe_map_.begin();
   assert(stripe_map_iter != stripe_map_.end());
   if (bottommost_level) {
@@ -132,20 +131,20 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
   while (stripe_map_iter != stripe_map_.end()) {
     for (const auto& start_key_and_tombstone : stripe_map_iter->second) {
       const auto& tombstone = start_key_and_tombstone.second;
-      if (next_table_min_key != nullptr &&
-          icmp_.user_comparator()->Compare(*next_table_min_key,
-                                           tombstone.start_key_) < 0) {
-        // Tombstones starting after next_table_min_key only need to be included
-        // in the next table.
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           tombstone.start_key_) <= 0) {
+        // Tombstones starting at upper_bound or later only need to be included
+        // in the next table. Break because subsequent tombstones will start
+        // even later.
         break;
       }
-      if (!extend_before_min_key && meta->smallest.size() != 0 &&
+      if (lower_bound != nullptr &&
           icmp_.user_comparator()->Compare(tombstone.end_key_,
-                                           meta->smallest.user_key()) < 0) {
-        // Tombstones ending before this table's smallest key can conditionally
-        // be excluded, e.g., when this table is a non-first compaction output,
-        // we know such tombstones are included in the previous table. In that
-        // case extend_before_min_key would be false.
+                                           *lower_bound) <= 0) {
+        // Tombstones ending before or at lower_bound only need to be included
+        // in the prev table. Continue because subsequent tombstones may still
+        // overlap [lower_bound, upper_bound).
         continue;
       }

@@ -153,35 +152,49 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
       builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
       if (!first_added) {
         first_added = true;
-        if (extend_before_min_key &&
-            (meta->smallest.size() == 0 ||
-             icmp_.Compare(ikey_and_end_key.first, meta->smallest) < 0)) {
-          meta->smallest = ikey_and_end_key.first;
+        InternalKey smallest_candidate = std::move(ikey_and_end_key.first);
+        if (lower_bound != nullptr &&
+            icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
+                                             *lower_bound) <= 0) {
+          // Pretend the smallest key has the same user key as lower_bound
+          // (the max key in the previous table or subcompaction) in order for
+          // files to appear key-space partitioned.
+          //
+          // Choose lowest seqnum so this file's smallest internal key comes
+          // after the previous file's/subcompaction's largest. The fake seqnum
+          // is OK because the read path's file-picking code only considers user
+          // key.
+          smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+        }
+        if (meta->smallest.size() == 0 ||
+            icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
+          meta->smallest = std::move(smallest_candidate);
         }
       }
-      auto end_ikey = tombstone.SerializeEndKey();
+      InternalKey largest_candidate = tombstone.SerializeEndKey();
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           largest_candidate.user_key()) <= 0) {
+        // Pretend the largest key has the same user key as upper_bound (the
+        // min key in the following table or subcompaction) in order for files
+        // to appear key-space partitioned.
+        //
+        // Choose highest seqnum so this file's largest internal key comes
+        // before the next file's/subcompaction's smallest. The fake seqnum is
+        // OK because the read path's file-picking code only considers the user
+        // key portion.
+        //
+        // Note Seek() also creates InternalKey with (user_key,
+        // kMaxSequenceNumber), but with kValueTypeForSeek (0x7) instead of
+        // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+        // Seek() key in InternalKey's ordering. So Seek() will look in the
+        // next file for the user key.
+        largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber,
+                                        kTypeRangeDeletion);
+      }
       if (meta->largest.size() == 0 ||
-          icmp_.Compare(meta->largest, end_ikey) < 0) {
-        if (next_table_min_key != nullptr &&
-            icmp_.Compare(*next_table_min_key, end_ikey.Encode()) < 0) {
-          // Pretend the largest key has the same user key as the min key in the
-          // following table in order for files to appear key-space partitioned.
-          // Choose highest seqnum so this file's largest comes before the next
-          // file's smallest. The fake seqnum is OK because the read path's
-          // file-picking code only considers the user key portion.
-          //
-          // Note Seek() also creates InternalKey with (user_key,
-          // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
-          // kTypeRangeDeletion (0xF), so the range tombstone comes before the
-          // Seek() key in InternalKey's ordering. So Seek() will look in the
-          // next file for the user key.
-          ParsedInternalKey parsed;
-          ParseInternalKey(*next_table_min_key, &parsed);
-          meta->largest = InternalKey(parsed.user_key, kMaxSequenceNumber,
-                                      kTypeRangeDeletion);
-        } else {
-          meta->largest = std::move(end_ikey);
-        }
+          icmp_.Compare(meta->largest, largest_candidate) < 0) {
+        meta->largest = std::move(largest_candidate);
       }
       meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_);
       meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_);
diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h
index 543085047..d1b11873e 100644
--- a/db/range_del_aggregator.h
+++ b/db/range_del_aggregator.h
@@ -56,19 +56,21 @@ class RangeDelAggregator {
-  // @param extend_before_min_key If true, the range of tombstones to be added
-  //    to the TableBuilder starts from the beginning of the key-range;
-  //    otherwise, it starts from meta->smallest.
-  // @param next_table_min_key If nullptr, the range of tombstones to be added
-  //    to the TableBuilder ends at the end of the key-range; otherwise, it
-  //    ends at next_table_min_key.
+  // @param lower_bound/upper_bound Any range deletion with [start_key, end_key)
+  //    that overlaps the target range [*lower_bound, *upper_bound) is added to
+  //    the builder. If lower_bound is nullptr, the target range extends
+  //    infinitely to the left. If upper_bound is nullptr, the target range
+  //    extends infinitely to the right. If both are nullptr, the target range
+  //    extends infinitely in both directions, i.e., all range deletions are
+  //    added to the builder.
   // @param meta The file's metadata. We modify the begin and end keys according
   //    to the range tombstones added to this file such that the read path does
   //    not miss range tombstones that cover gaps before/after/between files in
-  //    a level.
+  //    a level. lower_bound/upper_bound above constrain how far file boundaries
+  //    can be extended.
   // @param bottommost_level If true, we will filter out any tombstones
   //    belonging to the oldest snapshot stripe, because all keys potentially
   //    covered by this tombstone are guaranteed to have been deleted by
   //    compaction.
-  void AddToBuilder(TableBuilder* builder, bool extend_before_min_key,
-                    const Slice* next_table_min_key, FileMetaData* meta,
+  void AddToBuilder(TableBuilder* builder, const Slice* lower_bound,
+                    const Slice* upper_bound, FileMetaData* meta,
                     bool bottommost_level = false);
   Arena* GetArena() { return &arena_; }
   bool IsEmpty();
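
A note on the two interval checks in the AddToBuilder() loop above: together, the break condition (upper_bound <= start_key) and the continue condition (end_key <= lower_bound) keep exactly the tombstones whose half-open range [start_key, end_key) overlaps the target [*lower_bound, *upper_bound). The standalone sketch below is not part of the patch; the helpers KeptByLoop and Overlaps are invented for illustration, and std::string with operator< stands in for Slice and the user comparator.

// Standalone sketch: shows that the loop's break/continue conditions reduce
// to the canonical half-open interval-overlap test.
#include <cassert>
#include <string>

// Mirrors the two conditions in the patch.
bool KeptByLoop(const std::string& start_key, const std::string& end_key,
                const std::string& lower_bound, const std::string& upper_bound) {
  if (upper_bound <= start_key) return false;  // the loop's `break` case
  if (end_key <= lower_bound) return false;    // the loop's `continue` case
  return true;
}

// Canonical overlap test for half-open ranges [a, b) and [c, d).
bool Overlaps(const std::string& a, const std::string& b,
              const std::string& c, const std::string& d) {
  return a < d && c < b;
}

int main() {
  // Tombstone ["b", "d") vs target ["c", "e"): overlaps, so it is kept.
  assert(KeptByLoop("b", "d", "c", "e") == Overlaps("b", "d", "c", "e"));
  // Tombstone ["b", "c"): end_key == lower_bound, excluded; the previous
  // output file already contains it.
  assert(!KeptByLoop("b", "c", "c", "e"));
  // Tombstone ["e", "f"): start_key == upper_bound, excluded; the next
  // output file will contain it.
  assert(!KeptByLoop("e", "f", "c", "e"));
  return 0;
}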
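The boundary-faking logic depends on RocksDB's internal-key ordering: an internal key is the user key plus an 8-byte trailer packing (seqno << 8 | type), and keys with equal user keys sort by decreasing trailer. The sketch below is standalone, not RocksDB source; MiniInternalKey is a made-up stand-in assuming that encoding. It checks the two claims made in the comments: the fake largest key (upper_bound, kMaxSequenceNumber, kTypeRangeDeletion) sorts before a Seek() key with the same user key (type 0x7), and before the next file's fake smallest key (upper_bound, 0, kTypeRangeDeletion), so adjacent output files still appear key-space partitioned.

// Standalone sketch of the internal-key ordering argument.
#include <cassert>
#include <cstdint>
#include <string>

namespace {

constexpr uint64_t kMaxSequenceNumber = (1ull << 56) - 1;
constexpr uint8_t kValueTypeForSeek = 0x7;   // type tag used by Seek() keys
constexpr uint8_t kTypeRangeDeletion = 0xF;  // type tag of range tombstones

struct MiniInternalKey {
  std::string user_key;
  uint64_t seq;
  uint8_t type;
  // RocksDB packs seqno into the high 56 bits, type into the low 8 bits.
  uint64_t Trailer() const { return (seq << 8) | type; }
};

// Mirrors InternalKeyComparator: user key ascending, then trailer descending.
bool Before(const MiniInternalKey& a, const MiniInternalKey& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key;
  return a.Trailer() > b.Trailer();
}

}  // namespace

int main() {
  const std::string ub = "f";  // hypothetical upper_bound user key
  MiniInternalKey fake_largest{ub, kMaxSequenceNumber, kTypeRangeDeletion};
  MiniInternalKey seek_key{ub, kMaxSequenceNumber, kValueTypeForSeek};
  MiniInternalKey next_fake_smallest{ub, 0, kTypeRangeDeletion};

  // 0xF > 0x7 in the type byte, so the tombstone-extended boundary sorts
  // first and Seek(ub) moves on to the next file.
  assert(Before(fake_largest, seek_key));
  // kMaxSequenceNumber > 0, so this file's largest still precedes the next
  // file's smallest: the files appear key-space partitioned.
  assert(Before(fake_largest, next_fake_smallest));
  return 0;
}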