@@ -113,11 +113,10 @@ RangeDelAggregator::TombstoneMap& RangeDelAggregator::GetTombstoneMap(
 // tombstones are known to be available, without the code duplication we have
 // in ShouldAddTombstones(). It'll also allow us to move the table-modifying
 // code into more coherent places: CompactionJob and BuildTable().
-void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
-                                      bool extend_before_min_key,
-                                      const Slice* next_table_min_key,
-                                      FileMetaData* meta,
-                                      bool bottommost_level /* = false */) {
+void RangeDelAggregator::AddToBuilder(
+    TableBuilder* builder, const Slice* lower_bound, const Slice* upper_bound,
+    FileMetaData* meta,
+    bool bottommost_level /* = false */) {
   auto stripe_map_iter = stripe_map_.begin();
   assert(stripe_map_iter != stripe_map_.end());
   if (bottommost_level) {
@@ -132,20 +131,20 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
   while (stripe_map_iter != stripe_map_.end()) {
     for (const auto& start_key_and_tombstone : stripe_map_iter->second) {
       const auto& tombstone = start_key_and_tombstone.second;
-      if (next_table_min_key != nullptr &&
-          icmp_.user_comparator()->Compare(*next_table_min_key,
-                                           tombstone.start_key_) < 0) {
-        // Tombstones starting after next_table_min_key only need to be included
-        // in the next table.
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           tombstone.start_key_) <= 0) {
+        // Tombstones starting at upper_bound or later only need to be included
+        // in the next table. Break because subsequent tombstones will start
+        // even later.
         break;
       }
-      if (!extend_before_min_key && meta->smallest.size() != 0 &&
+      if (lower_bound != nullptr &&
           icmp_.user_comparator()->Compare(tombstone.end_key_,
-                                           meta->smallest.user_key()) < 0) {
-        // Tombstones ending before this table's smallest key can conditionally
-        // be excluded, e.g., when this table is a non-first compaction output,
-        // we know such tombstones are included in the previous table. In that
-        // case extend_before_min_key would be false.
+                                           *lower_bound) <= 0) {
+        // Tombstones ending before or at lower_bound only need to be included
+        // in the prev table. Continue because subsequent tombstones may still
+        // overlap [lower_bound, upper_bound).
         continue;
       }
@@ -153,35 +152,49 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
       builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
       if (!first_added) {
         first_added = true;
-        if (extend_before_min_key &&
-            (meta->smallest.size() == 0 ||
-             icmp_.Compare(ikey_and_end_key.first, meta->smallest) < 0)) {
-          meta->smallest = ikey_and_end_key.first;
+        InternalKey smallest_candidate = std::move(ikey_and_end_key.first);
+        if (lower_bound != nullptr &&
+            icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
+                                             *lower_bound) <= 0) {
+          // Pretend the smallest key has the same user key as lower_bound
+          // (the max key in the previous table or subcompaction) in order for
+          // files to appear key-space partitioned.
+          //
+          // Choose lowest seqnum so this file's smallest internal key comes
+          // after the previous file's/subcompaction's largest. The fake seqnum
+          // is OK because the read path's file-picking code only considers user
+          // key.
+          smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+        }
+        if (meta->smallest.size() == 0 ||
+            icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
+          meta->smallest = std::move(smallest_candidate);
+        }
       }
-      auto end_ikey = tombstone.SerializeEndKey();
+      InternalKey largest_candidate = tombstone.SerializeEndKey();
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           largest_candidate.user_key()) <= 0) {
+        // Pretend the largest key has the same user key as upper_bound (the
+        // min key in the following table or subcompaction) in order for files
+        // to appear key-space partitioned.
+        //
+        // Choose highest seqnum so this file's largest internal key comes
+        // before the next file's/subcompaction's smallest. The fake seqnum is
+        // OK because the read path's file-picking code only considers the user
+        // key portion.
+        //
+        // Note Seek() also creates InternalKey with (user_key,
+        // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+        // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+        // Seek() key in InternalKey's ordering. So Seek() will look in the
+        // next file for the user key.
+        largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber,
+                                        kTypeRangeDeletion);
+      }
       if (meta->largest.size() == 0 ||
-          icmp_.Compare(meta->largest, end_ikey) < 0) {
-        if (next_table_min_key != nullptr &&
-            icmp_.Compare(*next_table_min_key, end_ikey.Encode()) < 0) {
-          // Pretend the largest key has the same user key as the min key in the
-          // following table in order for files to appear key-space partitioned.
-          // Choose highest seqnum so this file's largest comes before the next
-          // file's smallest. The fake seqnum is OK because the read path's
-          // file-picking code only considers the user key portion.
-          //
-          // Note Seek() also creates InternalKey with (user_key,
-          // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
-          // kTypeRangeDeletion (0xF), so the range tombstone comes before the
-          // Seek() key in InternalKey's ordering. So Seek() will look in the
-          // next file for the user key.
-          ParsedInternalKey parsed;
-          ParseInternalKey(*next_table_min_key, &parsed);
-          meta->largest = InternalKey(parsed.user_key, kMaxSequenceNumber,
-                                      kTypeRangeDeletion);
-        } else {
-          meta->largest = std::move(end_ikey);
-        }
+          icmp_.Compare(meta->largest, largest_candidate) < 0) {
+        meta->largest = std::move(largest_candidate);
       }
       meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_);
       meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_);