From cae540ebefe57c5c2e56b650574511dbfcc20efb Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Mon, 29 Oct 2018 19:21:24 -0700
Subject: [PATCH] Fix range tombstones written to more files than necessary
 (#4592)

Summary:
When there's a gap between files, we do not need to output tombstones starting
at the next output file's begin key to the current output file.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/4592

Differential Revision: D12808627

Pulled By: ajkr

fbshipit-source-id: 77c8b2e7523a95b1cd6611194144092c06acb505
---
 db/compaction_job.cc    | 27 +++++++++---
 db/db_range_del_test.cc | 94 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/db/compaction_job.cc b/db/compaction_job.cc
index a3c5488c3..d4aa28bad 100644
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@@ -1209,15 +1209,28 @@ Status CompactionJob::FinishCompactionOutputFile(
     if (lower_bound != nullptr) {
       it->Seek(*lower_bound);
     }
+
+    bool has_overlapping_endpoints;
+    if (upper_bound != nullptr && meta->largest.size() > 0) {
+      has_overlapping_endpoints =
+          ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
+    } else {
+      has_overlapping_endpoints = false;
+    }
     for (; it->Valid(); it->Next()) {
       auto tombstone = it->Tombstone();
-      if (upper_bound != nullptr &&
-          ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
-        // Tombstones starting after upper_bound only need to be included in the
-        // next table (if the SSTs overlap, then upper_bound is contained in
-        // this SST and hence must be covered). Break because subsequent
-        // tombstones will start even later.
-        break;
+      if (upper_bound != nullptr) {
+        int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
+        if ((has_overlapping_endpoints && cmp < 0) ||
+            (!has_overlapping_endpoints && cmp <= 0)) {
+          // Tombstones starting after upper_bound only need to be included in
+          // the next table. If the current SST ends before upper_bound, i.e.,
+          // `has_overlapping_endpoints == false`, we can also skip over range
+          // tombstones that start exactly at upper_bound. Such range tombstones
+          // will be included in the next file and are not relevant to the point
+          // keys or endpoints of the current file.
+          break;
+        }
       }
       if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 4e97651b7..508149a34 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -1389,6 +1389,100 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
   db_->ReleaseSnapshot(snapshot);
 }
 
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+  // Adapted from
+  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+  // Regression test for issue where range tombstone was written to more files
+  // than necessary when it began exactly at the begin key in the next
+  // compaction output file.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  // Have a bit of slack in the size limits but we enforce them more strictly
+  // when manually flushing/compacting.
+  options.max_compaction_bytes = 2 * kFileBytes;
+  options.target_file_size_base = 2 * kFileBytes;
+  options.write_buffer_size = 2 * kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  for (char first_char : {'a', 'b', 'c'}) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      std::string key(1, first_char);
+      key.append(Key(i));
+      std::string value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(Put(key, value));
+    }
+    db_->Flush(FlushOptions());
+    MoveFilesToLevel(2);
+  }
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+  // Populate the memtable lightly while spanning the whole key-space. The
+  // setting of `max_compaction_bytes` will cause the L0->L1 compaction to
+  // output multiple files to prevent a large L1->L2 compaction later.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "c" + Key(1), "d"));
+  // Our compaction output file cutting logic currently only considers point
+  // keys. So, in order for the range tombstone to have a chance at landing at
+  // the start of a new file, we need a point key at the range tombstone's
+  // start.
+  // TODO(ajkr): remove this `Put` after file cutting accounts for range
+  // tombstones (#3977).
+  ASSERT_OK(Put("c" + Key(1), "value"));
+  db_->Flush(FlushOptions());
+
+  // Ensure the manual L0->L1 compaction cuts the outputs before the range
+  // tombstone and that the range tombstone is only placed in the second SST.
+  std::string begin_key_storage("c" + Key(1));
+  Slice begin_key(begin_key_storage);
+  std::string end_key_storage("d");
+  Slice end_key(end_key_storage);
+  dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
+                              &end_key /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> all_metadata;
+  std::vector<LiveFileMetaData> l1_metadata;
+  db_->GetLiveFilesMetaData(&all_metadata);
+  for (const auto& metadata : all_metadata) {
+    if (metadata.level == 1) {
+      l1_metadata.push_back(metadata);
+    }
+  }
+  std::sort(l1_metadata.begin(), l1_metadata.end(),
+            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+              return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+                     0;
+            });
+  ASSERT_EQ("a", l1_metadata[0].smallestkey);
+  ASSERT_EQ("a", l1_metadata[0].largestkey);
+  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+  ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+  TablePropertiesCollection all_table_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+  int64_t num_range_deletions = 0;
+  for (const auto& name_and_table_props : all_table_props) {
+    const auto& name = name_and_table_props.first;
+    const auto& table_props = name_and_table_props.second;
+    // The range tombstone should only be output to the second L1 SST.
+    if (name.size() >= l1_metadata[1].name.size() &&
+        name.substr(name.size() - l1_metadata[1].name.size())
+                .compare(l1_metadata[1].name) == 0) {
+      ASSERT_EQ(1, table_props->num_range_deletions);
+      ++num_range_deletions;
+    } else {
+      ASSERT_EQ(0, table_props->num_range_deletions);
+    }
+  }
+  ASSERT_EQ(1, num_range_deletions);
+}
+
 #endif  // ROCKSDB_LITE
 }  // namespace rocksdb
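
Note: the new cut condition in FinishCompactionOutputFile can be read as a
standalone predicate over (cmp, has_overlapping_endpoints). The sketch below
is an illustration only, not code from the patch: TombstoneBelongsToNextFile
is a hypothetical helper, and std::string::compare stands in for the user
comparator under the assumption of bytewise key ordering.

    #include <cassert>
    #include <string>

    // Returns true iff a tombstone starting at `tombstone_start` can be left
    // out of the current output file and written only to the next one.
    // `upper_bound` is the current file's exclusive upper boundary;
    // `has_overlapping_endpoints` is true iff the file's largest point key
    // equals `upper_bound`.
    bool TombstoneBelongsToNextFile(const std::string& tombstone_start,
                                    const std::string& upper_bound,
                                    bool has_overlapping_endpoints) {
      // Mirrors `cmp = ucmp->Compare(*upper_bound, tombstone.start_key_)`:
      // negative when the tombstone starts after upper_bound.
      int cmp = upper_bound.compare(tombstone_start);
      // When endpoints overlap, a tombstone starting exactly at upper_bound
      // still covers this file's largest key, so only strictly later starts
      // may be deferred; otherwise starts at upper_bound can be deferred too.
      return (has_overlapping_endpoints && cmp < 0) ||
             (!has_overlapping_endpoints && cmp <= 0);
    }

    int main() {
      // Gap between files: a tombstone starting at the next file's begin key
      // ("c") need not be written to a file that ends before "c".
      assert(TombstoneBelongsToNextFile("c", "c", false));
      // Overlapping endpoints: the same tombstone covers this file's largest
      // key, so it must stay in the current file.
      assert(!TombstoneBelongsToNextFile("c", "c", true));
      // A tombstone starting strictly after upper_bound always goes to the
      // next file.
      assert(TombstoneBelongsToNextFile("d", "c", true));
      return 0;
    }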