From 6a82b68788d0fcf2b7a720edde7e443b71a93d78 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Tue, 17 Jan 2023 12:47:44 -0800
Subject: [PATCH] Avoid counting extra range tombstone compensated size in
 `AddRangeDels()` (#11091)

Summary:
In `CompactionOutputs::AddRangeDels()`, range tombstones with the same start
and end key but different sequence numbers all contribute to compensated range
tombstone size. This PR removes this redundancy.

This PR also includes a fix from https://github.com/facebook/rocksdb/issues/11067
where a range tombstone that is not within a file's range was being added to the
file. This fixes an assertion failure for `icmp.Compare(start, end) <= 0` in
VersionSet::ApproximateSize() when calculating compensated range tombstone size.
Assertions and a comment/essay were added to reason that no such range tombstone
will be added after this fix.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11091

Test Plan:
- Added unit tests
- Stress test with small key range: `python3 tools/db_crashtest.py blackbox --simple --max_key=100 --interval=600 --write_buffer_size=262144 --target_file_size_base=256 --max_bytes_for_level_base=262144 --block_size=128 --value_size_mult=33 --subcompactions=10`

Reviewed By: ajkr

Differential Revision: D42521588

Pulled By: cbi42

fbshipit-source-id: 5bda3fe38997995314e1f7592319af12b69bc4f8
---
 db/builder.cc                       |  29 ++--
 db/compaction/compaction_outputs.cc | 220 ++++++++++++++++++++++++----
 db/db_range_del_test.cc             | 115 +++++++++++++++
 db/version_edit.h                   |   1 +
 4 files changed, 323 insertions(+), 42 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index cb7769e96..4cea24bfd 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -176,10 +176,10 @@ Status BuildTable(
     builder = NewTableBuilder(tboptions, file_writer.get());
   }

+  auto ucmp = tboptions.internal_comparator.user_comparator();
   MergeHelper merge(
-      env, tboptions.internal_comparator.user_comparator(),
-      ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
-      true /* internal key corruption is not ok */,
+      env, ucmp, ioptions.merge_operator.get(), compaction_filter.get(),
+      ioptions.logger, true /* internal key corruption is not ok */,
       snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);

   std::unique_ptr<BlobFileBuilder> blob_file_builder(
@@ -197,9 +197,8 @@ Status BuildTable(

   const std::atomic<bool> kManualCompactionCanceledFalse{false};
   CompactionIterator c_iter(
-      iter, tboptions.internal_comparator.user_comparator(), &merge,
-      kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
-      job_snapshot, snapshot_checker, env,
+      iter, ucmp, &merge, kMaxSequenceNumber, &snapshots,
+      earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
       ShouldReportDetailedTime(env, ioptions.stats),
       true /* internal key corruption is not ok */, range_del_agg.get(),
       blob_file_builder.get(), ioptions.allow_data_in_errors,
@@ -242,6 +241,7 @@ Status BuildTable(

   if (s.ok()) {
     auto range_del_it = range_del_agg->NewIterator();
+    Slice last_tombstone_start_user_key{};
     for (range_del_it->SeekToFirst(); range_del_it->Valid();
          range_del_it->Next()) {
       auto tombstone = range_del_it->Tombstone();
@@ -251,12 +251,17 @@ Status BuildTable(
       meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
                                      tboptions.internal_comparator);
       if (version) {
-        SizeApproximationOptions approx_opts;
-        approx_opts.files_size_error_margin = 0.1;
-        meta->compensated_range_deletion_size += versions->ApproximateSize(
-            approx_opts, version, kv.first.Encode(), tombstone_end.Encode(),
-            0 /* start_level */, -1 /* end_level */,
-            TableReaderCaller::kFlush);
+        if (last_tombstone_start_user_key.empty() ||
+            ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
+                                          tombstone.start_key_) < 0) {
+          SizeApproximationOptions approx_opts;
+          approx_opts.files_size_error_margin = 0.1;
+          meta->compensated_range_deletion_size += versions->ApproximateSize(
+              approx_opts, version, kv.first.Encode(), tombstone_end.Encode(),
+              0 /* start_level */, -1 /* end_level */,
+              TableReaderCaller::kFlush);
+        }
+        last_tombstone_start_user_key = tombstone.start_key_;
       }
     }
   }
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 7bcce37ed..eba3208ac 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -399,6 +399,17 @@ Status CompactionOutputs::AddRangeDels(
   const Slice *lower_bound, *upper_bound;
   bool lower_bound_from_sub_compact = false;

+  // The following example does not happen since
+  // CompactionOutput::ShouldStopBefore() always returns false for the first
+  // point key. But we should consider removing this dependency. Suppose for the
+  // first compaction output file,
+  // - next_table_min_key.user_key == comp_start_user_key
+  // - no point key is in the output file
+  // - there is a range tombstone @seqno to be added that covers
+  //   comp_start_user_key
+  // Then meta.smallest will be set to comp_start_user_key@seqno
+  // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber,
+  // which violates the assumption that meta.smallest should be <= meta.largest.
   size_t output_size = outputs_.size();
   if (output_size == 1) {
     // For the first output table, include range tombstones before the min
@@ -459,20 +470,34 @@ Status CompactionOutputs::AddRangeDels(
   } else {
     it->SeekToFirst();
   }
+  Slice last_tombstone_start_user_key{};
   for (; it->Valid(); it->Next()) {
     auto tombstone = it->Tombstone();
     if (upper_bound != nullptr) {
       int cmp =
           ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
-      if ((has_overlapping_endpoints && cmp < 0) ||
-          (!has_overlapping_endpoints && cmp <= 0)) {
-        // Tombstones starting after upper_bound only need to be included in
-        // the next table. If the current SST ends before upper_bound, i.e.,
-        // `has_overlapping_endpoints == false`, we can also skip over range
-        // tombstones that start exactly at upper_bound. Such range
-        // tombstones will be included in the next file and are not relevant
-        // to the point keys or endpoints of the current file.
+      // Tombstones starting after upper_bound only need to be included in
+      // the next table.
+      // If the current SST ends before upper_bound, i.e.,
+      // `has_overlapping_endpoints == false`, we can also skip over range
+      // tombstones that start exactly at upper_bound. Such range
+      // tombstones will be included in the next file and are not relevant
+      // to the point keys or endpoints of the current file.
+      // If the current SST ends at the same user key as upper_bound,
+      // i.e., `has_overlapping_endpoints == true`, AND the tombstone has
+      // the same start key as upper_bound, i.e., cmp == 0, then
+      // the tombstone is relevant only if the tombstone's sequence number
+      // is no larger than this file's largest key's sequence number. This
+      // is because the upper bound to truncate this file's range tombstone
+      // will be meta.largest in this case, and any tombstone that starts after
+      // it will not be relevant.
+      if (cmp < 0) {
         break;
+      } else if (cmp == 0) {
+        if (!has_overlapping_endpoints ||
+            tombstone.seq_ < GetInternalKeySeqno(meta.largest.Encode())) {
+          break;
+        }
       }
     }
@@ -571,15 +596,14 @@ Status CompactionOutputs::AddRangeDels(
           InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
     }
   }
-#ifndef NDEBUG
-    SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
-    if (meta.smallest.size() > 0) {
-      smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
-    }
-#endif
     meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
                                   tombstone.seq_, icmp);
     if (!bottommost_level) {
+      bool start_user_key_changed =
+          last_tombstone_start_user_key.empty() ||
+          ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
+                                        tombstone.start_key_) < 0;
+      last_tombstone_start_user_key = tombstone.start_key_;
       // Range tombstones are truncated at file boundaries
       if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
         tombstone_start = meta.smallest;
@@ -587,23 +611,159 @@ Status CompactionOutputs::AddRangeDels(
       if (icmp.Compare(tombstone_end, meta.largest) > 0) {
         tombstone_end = meta.largest;
       }
-      SizeApproximationOptions approx_opts;
-      approx_opts.files_size_error_margin = 0.1;
-      auto approximate_covered_size =
-          compaction_->input_version()->version_set()->ApproximateSize(
-              approx_opts, compaction_->input_version(),
-              tombstone_start.Encode(), tombstone_end.Encode(),
-              compaction_->output_level() + 1 /* start_level */,
-              -1 /* end_level */, kCompaction);
-      meta.compensated_range_deletion_size += approximate_covered_size;
+      // This assertion validates invariant (2) in the comment below.
+      assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
+      if (start_user_key_changed) {
+        // If tombstone_start >= tombstone_end, then either no key range is
+        // covered, or they have the same user key. If they have the same
+        // user key, then the internal key range should only be within this
+        // level, and no keys from older levels are covered.
+        if (ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+                                          tombstone_end.user_key()) < 0) {
+          SizeApproximationOptions approx_opts;
+          approx_opts.files_size_error_margin = 0.1;
+          auto approximate_covered_size =
+              compaction_->input_version()->version_set()->ApproximateSize(
+                  approx_opts, compaction_->input_version(),
+                  tombstone_start.Encode(), tombstone_end.Encode(),
+                  compaction_->output_level() + 1 /* start_level */,
+                  -1 /* end_level */, kCompaction);
+          meta.compensated_range_deletion_size += approximate_covered_size;
+        }
+      }
     }
-    // The smallest key in a file is used for range tombstone truncation, so
-    // it cannot have a seqnum of 0 (unless the smallest data key in a file
-    // has a seqnum of 0). Otherwise, the truncated tombstone may expose
-    // deleted keys at lower levels.
-    assert(smallest_ikey_seqnum == 0 ||
-           ExtractInternalKeyFooter(meta.smallest.Encode()) !=
-               PackSequenceAndType(0, kTypeRangeDeletion));
+    // TODO: show invariants that ensure all necessary range tombstones are
+    // added and that file boundaries ensure no coverage is lost.
+    // Each range tombstone with internal key range [tombstone_start,
+    // tombstone_end] is being added to the current compaction output file here.
+    // The range tombstone is going to be truncated at range [meta.smallest,
+    // meta.largest] during reading/scanning. We should maintain invariants
+    // (1) meta.smallest <= meta.largest, and
+    // (2) [tombstone_start, tombstone_end] and [meta.smallest, meta.largest]
+    // overlap, as there is no point in adding a range tombstone with a range
+    // outside the file's range.
+    // Since `tombstone_end` is always some user_key@kMaxSeqno, it is okay to
+    // use either open or closed range. We use closed ranges here to make
+    // reasoning easier, and it is more consistent with an ongoing work that
+    // tries to simplify this method.
+    //
+    // There are two cases:
+    // Case 1. Output file has no point key:
+    // First we show this case only happens when the entire compaction output
+    // is range tombstone only. This is true if CompactionIterator does not
+    // emit any point key. Suppose CompactionIterator emits some point key.
+    // Based on the assumption that CompactionOutputs::ShouldStopBefore()
+    // always returns false for the first point key, the first compaction
+    // output file always contains a point key. Each new compaction output
+    // file is created if there is a point key for which ShouldStopBefore()
+    // returns true, and the point key would be added to the new compaction
+    // output file. So each new compaction file always contains a point key.
+    // So Case 1 only happens when CompactionIterator does not emit any
+    // point key.
+    //
+    // To show (1) meta.smallest <= meta.largest:
+    // Since the compaction output is range tombstone only, `lower_bound` and
+    // `upper_bound` are either null or comp_start/end_user_key respectively.
+    // According to how UpdateBoundariesForRange() is implemented, it blindly
+    // updates meta.smallest and meta.largest to smallest_candidate and
+    // largest_candidate the first time it is called. Subsequently, it
+    // compares the input parameters with meta.smallest and meta.largest and
+    // only updates them when the input is smaller/larger. So we only need to
+    // show smallest_candidate <= largest_candidate the first time
+    // UpdateBoundariesForRange() is called. Here we show something stronger:
+    // smallest_candidate.user_key < largest_candidate.user_key always holds
+    // for Case 1.
+    // We assume comp_start_user_key < comp_end_user_key, if provided.
+    // We assume that tombstone_start < tombstone_end. This assumption is
+    // based on the fact that each fragment in FragmentedTombstoneList has
+    // start_key < end_key (user_key) and that
+    // FragmentedTombstoneIterator::Tombstone() returns the pair
+    // (start_key@tombstone_seqno with op_type kTypeRangeDeletion, end_key).
+    // The logic in this loop sets smallest_candidate to
+    // max(tombstone_start.user_key, comp_start_user_key)@tombstone.seq_ with
+    // op_type kTypeRangeDeletion, and largest_candidate to
+    // min(tombstone_end.user_key, comp_end_user_key)@kMaxSequenceNumber with
+    // op_type kTypeRangeDeletion. When a bound is null, there is no
+    // truncation on that end. To show that smallest_candidate.user_key <
+    // largest_candidate.user_key, it suffices to show
+    // tombstone_start.user_key < comp_end_user_key (if not null) AND
+    // comp_start_user_key (if not null) < tombstone_end.user_key.
+    // Since the file has no point key, `has_overlapping_endpoints` is false.
+    // In the first sanity check of this for-loop, we compare
+    // tombstone_start.user_key against upper_bound = comp_end_user_key,
+    // and only proceed if tombstone_start.user_key < comp_end_user_key.
+    // We assume FragmentedTombstoneIterator::Seek(k) lands
+    // on a tombstone with end_key > k. So the call it->Seek(*lower_bound)
+    // above implies comp_start_user_key < tombstone_end.user_key.
+    //
+    // To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
+    // meta.largest] overlap (after the call to UpdateBoundariesForRange()):
+    // In the proof for (1) we have shown that
+    // smallest_candidate <= largest_candidate. Since tombstone_start <=
+    // smallest_candidate <= largest_candidate <= tombstone_end, for (2) to
+    // hold, it suffices to show that [smallest_candidate, largest_candidate]
+    // overlaps with [meta.smallest, meta.largest], too.
+    // Given meta.smallest <= meta.largest shown above, we need to show
+    // that it is impossible to have largest_candidate < meta.smallest or
+    // meta.largest < smallest_candidate. If the above
+    // meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate)
+    // updates meta.largest or meta.smallest, then the two ranges overlap.
+    // So we assume meta.UpdateBoundariesForRange(smallest_candidate,
+    // largest_candidate) did not update meta.smallest nor meta.largest, which
+    // means meta.smallest < smallest_candidate and largest_candidate <
+    // meta.largest.
+    //
+    // Case 2. Output file has >= 1 point key. This means meta.smallest and
+    // meta.largest are not empty when AddRangeDels() is called.
+    // To show (1) meta.smallest <= meta.largest:
+    // We can assume meta.smallest <= meta.largest when AddRangeDels() is
+    // called; this follows from how UpdateBoundariesForRange() is
+    // implemented, where it takes the min or max to update meta.smallest or
+    // meta.largest.
+    //
+    // To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
+    // meta.largest] overlap (after the call to UpdateBoundariesForRange()):
+    // When smallest_candidate <= largest_candidate, the proof in Case 1
+    // applies, so we only need to show (2) holds when smallest_candidate >
+    // largest_candidate. When both bounds are either null or from the
+    // subcompaction boundary, the proof in Case 1 applies, so we only need to
+    // show (2) holds when at least one bound is from a point key (either
+    // meta.smallest for lower bound or next_table_min_key for upper bound).
+    //
+    // Suppose the lower bound is meta.smallest.user_key.
+    // The call it->Seek(*lower_bound) implies tombstone_end.user_key >
+    // meta.smallest.user_key. We have smallest_candidate.user_key =
+    // max(tombstone_start.user_key, meta.smallest.user_key). For
+    // smallest_candidate to be > largest_candidate, we need
+    // largest_candidate.user_key = upper_bound = smallest_candidate.user_key,
+    // where tombstone_end is truncated to largest_candidate.
+    // Subcase 1:
+    // Suppose largest_candidate.user_key = comp_end_user_key (there is no
+    // next point key). Subcompaction ensures any point key from this
+    // subcompaction has a user_key < comp_end_user_key, so 1)
+    // meta.smallest.user_key < comp_end_user_key, 2)
+    // `has_overlapping_endpoints` is false, and the first if condition in
+    // this for-loop ensures tombstone_start.user_key < comp_end_user_key. So
+    // smallest_candidate.user_key < largest_candidate.user_key, and this
+    // subcase cannot happen when smallest_candidate > largest_candidate.
+    // Subcase 2:
+    // Suppose largest_candidate.user_key = next_table_min_key.user_key.
+    // The first if condition in this for-loop together with
+    // smallest_candidate.user_key = next_table_min_key.user_key =
+    // upper_bound implies `has_overlapping_endpoints` is true (so
+    // meta.largest.user_key = upper_bound) and
+    // tombstone.seq_ < meta.largest.seqno. So
+    // tombstone_start < meta.largest < tombstone_end.
+    //
+    // Suppose the lower bound is comp_start_user_key and the upper bound is
+    // next_table_min_key. The call it->Seek(*lower_bound) implies we have
+    // tombstone_end.user_key > comp_start_user_key. So
+    // tombstone_end.user_key > smallest_candidate.user_key. For
+    // smallest_candidate to be > largest_candidate, we need
+    // tombstone_start.user_key = largest_candidate.user_key = upper_bound =
+    // next_table_min_key.user_key. This means `has_overlapping_endpoints` is
+    // true (so meta.largest.user_key = upper_bound) and tombstone.seq_ <
+    // meta.largest.seqno. So tombstone_start < meta.largest < tombstone_end.
   }
   return Status::OK();
 }
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 19f7e95ba..bfabc42fb 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -2909,6 +2909,121 @@ TEST_F(DBRangeDelTest, RangetombesoneCompensateFilesizePersistDuringReopen) {
   ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
 }

+TEST_F(DBRangeDelTest, SingleKeyFile) {
+  // Test for a bug fix where a range tombstone could be added to an SST
+  // file even though it is not within the file's key range.
+  // Create 3 files in L0 and then L1 where all keys have the same user key
+  // `Key(2)`. The middle file will contain Key(2)@6 and Key(2)@5. Before the
+  // fix, the range tombstone [Key(2), Key(5))@2 would be added to this file
+  // during compaction, but it is not in this file's key range.
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.target_file_size_base = 1 << 10;
+  opts.level_compaction_dynamic_file_size = false;
+  DestroyAndReopen(opts);
+
+  // prevent range tombstone drop
+  std::vector<const Snapshot*> snapshots;
+  snapshots.push_back(db_->GetSnapshot());
+
+  // Write a key to the bottommost file so the compactions below
+  // are not bottommost compactions and will calculate
+  // compensated range tombstone size. Before the bug fix, an assert would
+  // fail during this process.
+  Random rnd(301);
+  ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(5)));
+  snapshots.push_back(db_->GetSnapshot());
+  std::vector<std::string> values;
+
+  values.push_back(rnd.RandomString(8 << 10));
+  ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10)));
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10)));
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10)));
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  CompactRangeOptions co;
+  co.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd(),
+      0, 1, co, nullptr, nullptr, true, true,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/));
+
+  for (const auto s : snapshots) {
+    db_->ReleaseSnapshot(s);
+  }
+}
+
+TEST_F(DBRangeDelTest, DoubleCountRangeTombstoneCompensatedSize) {
+  // Test for a bug fix: if a file has multiple range tombstones with the
+  // same start and end key but different sequence numbers, we should only
+  // calculate compensated range tombstone size for one of them.
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+
+  std::vector<std::string> values;
+  Random rnd(301);
+  // file in L2
+  ASSERT_OK(Put(Key(1), rnd.RandomString(1 << 10)));
+  ASSERT_OK(Put(Key(2), rnd.RandomString(1 << 10)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  uint64_t l2_size = 0;
+  ASSERT_OK(Size(Key(1), Key(3), 0 /* cf */, &l2_size));
+  ASSERT_GT(l2_size, 0);
+
+  // file in L1
+  ASSERT_OK(Put(Key(3), rnd.RandomString(1 << 10)));
+  ASSERT_OK(Put(Key(4), rnd.RandomString(1 << 10)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  uint64_t l1_size = 0;
+  ASSERT_OK(Size(Key(3), Key(5), 0 /* cf */, &l1_size));
+  ASSERT_GT(l1_size, 0);
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(5)));
+  // so that the range tombstone above is not dropped
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(5)));
+  ASSERT_OK(Flush());
+  // Range deletion compensated size computed during flush time
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[0].size(), 1);
+  // instead of 2 * (l1_size + l2_size)
+  ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size,
+            l1_size + l2_size);
+
+  // Range deletion compensated size computed during compaction time
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+  db_->ReleaseSnapshot(snapshot);
+}
+
 #endif  // ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/version_edit.h b/db/version_edit.h
index 8be5f4f52..1029ca28a 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -282,6 +282,7 @@ struct FileMetaData {
     if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
       largest = end;
     }
+    assert(icmp.Compare(smallest, largest) <= 0);
     fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
     fd.largest_seqno = std::max(fd.largest_seqno, seqno);
   }
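
Illustrative note (not part of the patch): both the flush path in db/builder.cc and the compaction path in CompactionOutputs::AddRangeDels() now remember the start user key of the previously processed tombstone fragment and only charge compensated size when the start key advances, since fragments that differ only in sequence number cover the same key range. Below is a minimal standalone sketch of that pattern; `TombstoneFragment`, `CompensatedRangeDelSize`, and the `approximate_size` callback are hypothetical stand-ins, not RocksDB APIs.

```cpp
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// One fragment from a fragmented range tombstone list. Fragments are ordered
// by start key, and fragments with identical [start_key, end_key) but
// different sequence numbers appear as consecutive entries.
struct TombstoneFragment {
  std::string start_key;
  std::string end_key;
  uint64_t seqno;
};

// Sum the approximate data size covered by the fragments, charging each
// distinct start user key only once. `approximate_size` stands in for
// VersionSet::ApproximateSize() over [start, end).
uint64_t CompensatedRangeDelSize(
    const std::vector<TombstoneFragment>& fragments,
    const std::function<uint64_t(const std::string& start,
                                 const std::string& end)>& approximate_size) {
  uint64_t total = 0;
  std::string last_start_key;
  bool seen_any = false;
  for (const TombstoneFragment& frag : fragments) {
    // Mirrors the `last_tombstone_start_user_key` check in the patch: only
    // count when the start key strictly advances.
    if (!seen_any || last_start_key < frag.start_key) {
      total += approximate_size(frag.start_key, frag.end_key);
    }
    seen_any = true;
    last_start_key = frag.start_key;
  }
  return total;
}
```

For example, two fragments [Key(1), Key(5))@10 and [Key(1), Key(5))@9 would contribute the covered size once rather than twice, which is what DoubleCountRangeTombstoneCompensatedSize asserts by expecting `l1_size + l2_size` instead of `2 * (l1_size + l2_size)`.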
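
The comment added at the top of AddRangeDels() relies on internal key ordering: keys sort by ascending user key and then by descending sequence number, so for a single user key, key@kMaxSequenceNumber sorts before key@seqno. A small sketch of that ordering; the comparator below is a simplified stand-in for InternalKeyComparator, and the kMaxSequenceNumber constant mirrors RocksDB's value.

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Simplified stand-in for InternalKeyComparator::Compare(): ascending user
// key, then descending sequence number (newer entries sort first).
int CompareInternal(const std::string& ukey_a, uint64_t seq_a,
                    const std::string& ukey_b, uint64_t seq_b) {
  if (ukey_a != ukey_b) {
    return ukey_a < ukey_b ? -1 : 1;
  }
  if (seq_a == seq_b) {
    return 0;
  }
  return seq_a > seq_b ? -1 : 1;  // larger seqno compares smaller
}

int main() {
  const uint64_t kMaxSequenceNumber = (1ull << 56) - 1;
  // For an output file whose only entries share one user key, picking
  // smallest = key@seqno and largest = key@kMaxSequenceNumber would be
  // inverted: key@kMaxSequenceNumber sorts before key@seqno.
  assert(CompareInternal("key", kMaxSequenceNumber, "key", 5) < 0);
  return 0;
}
```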
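
The invariants essay argues that a tombstone added to an output file, once truncated to [meta.smallest, meta.largest], never becomes an inverted range. A hypothetical helper (user-key strings stand in for internal keys; `TruncateToFileRange` is not a RocksDB function) expressing the two invariants as assertions:

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <utility>

// Truncate a tombstone's [start, end] range to a file's [smallest, largest]
// range, asserting the invariants from the AddRangeDels() comment:
// (1) the file range itself is well formed, and
// (2) the tombstone overlaps the file range, so truncation cannot produce an
//     inverted range -- the property checked by the new
//     assert(icmp.Compare(tombstone_start, tombstone_end) <= 0).
std::pair<std::string, std::string> TruncateToFileRange(
    const std::string& tombstone_start, const std::string& tombstone_end,
    const std::string& file_smallest, const std::string& file_largest) {
  assert(file_smallest <= file_largest);  // invariant (1)
  std::string start = std::max(tombstone_start, file_smallest);
  std::string end = std::min(tombstone_end, file_largest);
  assert(start <= end);  // invariant (2)
  return {start, end};
}
```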