optimize file ingestion checks for range deletion overlap

Summary:
Before we were checking every file in the level which was unnecessary. We can piggyback onto the code for checking point-key overlap, which already opens all the files that could possibly contain overlapping range deletions. This PR makes us check just the range deletions from those files, so no extra ones will be opened.
Closes https://github.com/facebook/rocksdb/pull/3179

Differential Revision: D6358125

Pulled By: ajkr

fbshipit-source-id: 00e200770fdb8f3cc6b1b2da232b755e4ba36279
main
Andrew Kryczka 7 years ago committed by Facebook Github Bot
parent 022c598abb
commit 1bdb44de95
  1. 44
      db/external_sst_file_basic_test.cc
  2. 89
      db/external_sst_file_ingestion_job.cc
  3. 7
      db/external_sst_file_ingestion_job.h
  4. 23
      db/range_del_aggregator.cc
  5. 9
      db/range_del_aggregator.h
  6. 34
      db/range_del_aggregator_test.cc
  7. 16
      db/version_set.cc
  8. 4
      db/version_set.h

@ -558,8 +558,10 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
} }
TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
int kNumLevels = 7;
Options options = CurrentOptions(); Options options = CurrentOptions();
options.disable_auto_compactions = true; options.disable_auto_compactions = true;
options.num_levels = kNumLevels;
Reopen(options); Reopen(options);
std::map<std::string, std::string> true_data; std::map<std::string, std::string> true_data;
@ -567,43 +569,65 @@ TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
// prevent range deletions from being dropped due to becoming obsolete. // prevent range deletions from being dropped due to becoming obsolete.
const Snapshot* snapshot = db_->GetSnapshot(); const Snapshot* snapshot = db_->GetSnapshot();
// range del [0, 50) in L0 file, [50, 100) in memtable // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
for (int i = 0; i < 2; i++) { for (int i = 0; i < 3; i++) {
if (i == 1) { if (i != 0) {
db_->Flush(FlushOptions()); db_->Flush(FlushOptions());
if (i == 1) {
MoveFilesToLevel(kNumLevels - 1);
}
} }
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(50 * i), Key(50 * (i + 1)))); Key(50 * i), Key(50 * (i + 1))));
} }
ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(0));
ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
// overlaps with L0 file but not memtable, so flush is skipped // overlaps with L0 file but not memtable, so flush is skipped and file is
// ingested into L0
SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber(); SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
ASSERT_OK(GenerateAndAddExternalFile(
options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
file_id++, &true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
ASSERT_EQ(2, NumTableFilesAtLevel(0));
ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
// overlaps with L6 file but not memtable or L0 file, so flush is skipped and
// file is ingested into L5
ASSERT_OK(GenerateAndAddExternalFile( ASSERT_OK(GenerateAndAddExternalFile(
options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue}, options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
file_id++, &true_data)); file_id++, &true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_EQ(2, NumTableFilesAtLevel(0));
ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
// overlaps with memtable, so flush is triggered (thus file count increases by // ingested file overlaps with memtable, so flush is triggered before the file
// two at this step). // is ingested such that the ingested data is considered newest. So L0 file
// count increases by two.
ASSERT_OK(GenerateAndAddExternalFile( ASSERT_OK(GenerateAndAddExternalFile(
options, {50, 90}, {ValueType::kTypeValue, ValueType::kTypeValue}, options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
file_id++, &true_data)); file_id++, &true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
ASSERT_EQ(4, NumTableFilesAtLevel(0)); ASSERT_EQ(4, NumTableFilesAtLevel(0));
ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
// snapshot unneeded now that both range deletions are persisted // snapshot unneeded now that all range deletions are persisted
db_->ReleaseSnapshot(snapshot); db_->ReleaseSnapshot(snapshot);
// overlaps with nothing, so places at bottom level and skips incrementing // overlaps with nothing, so places at bottom level and skips incrementing
// seqnum. // seqnum.
ASSERT_OK(GenerateAndAddExternalFile( ASSERT_OK(GenerateAndAddExternalFile(
options, {101, 125}, {ValueType::kTypeValue, ValueType::kTypeValue}, options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
file_id++, &true_data)); file_id++, &true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
ASSERT_EQ(4, NumTableFilesAtLevel(0)); ASSERT_EQ(4, NumTableFilesAtLevel(0));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

@ -376,6 +376,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables( Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables(
SuperVersion* sv, bool* overlap) { SuperVersion* sv, bool* overlap) {
*overlap = false;
// Create an InternalIterator over all memtables // Create an InternalIterator over all memtables
Arena arena; Arena arena;
ReadOptions ro; ReadOptions ro;
@ -391,26 +392,33 @@ Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables(
memtable_range_del_iters.push_back(active_range_del_iter); memtable_range_del_iters.push_back(active_range_del_iter);
} }
sv->imm->AddRangeTombstoneIterators(ro, &memtable_range_del_iters); sv->imm->AddRangeTombstoneIterators(ro, &memtable_range_del_iters);
std::unique_ptr<InternalIterator> memtable_range_del_iter(NewMergingIterator( RangeDelAggregator range_del_agg(cfd_->internal_comparator(),
&cfd_->internal_comparator(), {} /* snapshots */,
memtable_range_del_iters.empty() ? nullptr : &memtable_range_del_iters[0], false /* collapse_deletions */);
static_cast<int>(memtable_range_del_iters.size())));
Status status; Status status;
*overlap = false; {
std::unique_ptr<InternalIterator> memtable_range_del_iter(
NewMergingIterator(&cfd_->internal_comparator(),
memtable_range_del_iters.empty()
? nullptr
: &memtable_range_del_iters[0],
static_cast<int>(memtable_range_del_iters.size())));
status = range_del_agg.AddTombstones(std::move(memtable_range_del_iter));
}
if (status.ok()) {
for (IngestedFileInfo& f : files_to_ingest_) { for (IngestedFileInfo& f : files_to_ingest_) {
status = status = IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(),
IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(), overlap); overlap);
if (!status.ok() || *overlap == true) { if (!status.ok() || *overlap == true) {
break; break;
} }
status = IngestedFileOverlapWithRangeDeletions( if (range_del_agg.IsRangeOverlapped(f.smallest_user_key,
&f, memtable_range_del_iter.get(), overlap); f.largest_user_key)) {
if (!status.ok() || *overlap == true) { *overlap = true;
break; break;
} }
} }
}
return status; return status;
} }
@ -575,34 +583,6 @@ Status ExternalSstFileIngestionJob::IngestedFileOverlapWithIteratorRange(
return iter->status(); return iter->status();
} }
Status ExternalSstFileIngestionJob::IngestedFileOverlapWithRangeDeletions(
const IngestedFileInfo* file_to_ingest, InternalIterator* range_del_iter,
bool* overlap) {
auto* vstorage = cfd_->current()->storage_info();
auto* ucmp = vstorage->InternalComparator()->user_comparator();
*overlap = false;
if (range_del_iter != nullptr) {
for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
range_del_iter->Next()) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(range_del_iter->key(), &parsed_key)) {
return Status::Corruption("corrupted range deletion key: " +
range_del_iter->key().ToString());
}
RangeTombstone range_del(parsed_key, range_del_iter->value());
if (ucmp->Compare(range_del.start_key_,
file_to_ingest->largest_user_key) <= 0 &&
ucmp->Compare(file_to_ingest->smallest_user_key,
range_del.end_key_) <= 0) {
*overlap = true;
break;
}
}
}
return Status::OK();
}
bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
const IngestedFileInfo* file_to_ingest, int level) { const IngestedFileInfo* file_to_ingest, int level) {
if (level == 0) { if (level == 0) {
@ -639,23 +619,26 @@ Status ExternalSstFileIngestionJob::IngestedFileOverlapWithLevel(
ro.total_order_seek = true; ro.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(),
&arena); &arena);
// Files are opened lazily when the iterator needs them, thus range deletions
// are also added lazily to the aggregator. We need to check for range
// deletion overlap only in the case where there's no point-key overlap. Then,
// we've already opened the file with range containing the ingested file's
// begin key, and iterated through all files until the one containing the
// ingested file's end key. So any files maybe containing range deletions
// overlapping the ingested file must have been opened and had their range
// deletions added to the aggregator.
RangeDelAggregator range_del_agg(cfd_->internal_comparator(),
{} /* snapshots */,
false /* collapse_deletions */);
sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder, lvl, sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder, lvl,
nullptr /* range_del_agg */); &range_del_agg);
ScopedArenaIterator level_iter(merge_iter_builder.Finish()); ScopedArenaIterator level_iter(merge_iter_builder.Finish());
std::vector<InternalIterator*> level_range_del_iters;
sv->current->AddRangeDelIteratorsForLevel(ro, env_options_, lvl,
&level_range_del_iters);
std::unique_ptr<InternalIterator> level_range_del_iter(NewMergingIterator(
&cfd_->internal_comparator(),
level_range_del_iters.empty() ? nullptr : &level_range_del_iters[0],
static_cast<int>(level_range_del_iters.size())));
Status status = IngestedFileOverlapWithIteratorRange( Status status = IngestedFileOverlapWithIteratorRange(
file_to_ingest, level_iter.get(), overlap_with_level); file_to_ingest, level_iter.get(), overlap_with_level);
if (status.ok() && *overlap_with_level == false) { if (status.ok() && *overlap_with_level == false &&
status = IngestedFileOverlapWithRangeDeletions( range_del_agg.IsRangeOverlapped(file_to_ingest->smallest_user_key,
file_to_ingest, level_range_del_iter.get(), overlap_with_level); file_to_ingest->largest_user_key)) {
*overlap_with_level = true;
} }
return status; return status;
} }

@ -139,13 +139,6 @@ class ExternalSstFileIngestionJob {
const IngestedFileInfo* file_to_ingest, InternalIterator* iter, const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
bool* overlap); bool* overlap);
// Check if `file_to_ingest` key range overlaps with any range deletions
// specified by `iter`.
// REQUIRES: Mutex held
Status IngestedFileOverlapWithRangeDeletions(
const IngestedFileInfo* file_to_ingest, InternalIterator* range_del_iter,
bool* overlap);
// Check if `file_to_ingest` key range overlap with level // Check if `file_to_ingest` key range overlap with level
// REQUIRES: Mutex held // REQUIRES: Mutex held
Status IngestedFileOverlapWithLevel(SuperVersion* sv, Status IngestedFileOverlapWithLevel(SuperVersion* sv,

@ -140,6 +140,29 @@ bool RangeDelAggregator::ShouldDeleteImpl(
return parsed.sequence < tombstone_map_iter->second.seq_; return parsed.sequence < tombstone_map_iter->second.seq_;
} }
bool RangeDelAggregator::IsRangeOverlapped(const Slice& start,
const Slice& end) {
// so far only implemented for non-collapsed mode since file ingestion (only
// client) doesn't use collapsing
assert(!collapse_deletions_);
if (rep_ == nullptr) {
return false;
}
for (const auto& seqnum_and_tombstone_map : rep_->stripe_map_) {
for (const auto& start_key_and_tombstone :
seqnum_and_tombstone_map.second.raw_map) {
const auto& tombstone = start_key_and_tombstone.second;
if (icmp_.user_comparator()->Compare(start, tombstone.end_key_) < 0 &&
icmp_.user_comparator()->Compare(tombstone.start_key_, end) <= 0 &&
icmp_.user_comparator()->Compare(tombstone.start_key_,
tombstone.end_key_) < 0) {
return true;
}
}
}
return false;
}
bool RangeDelAggregator::ShouldAddTombstones( bool RangeDelAggregator::ShouldAddTombstones(
bool bottommost_level /* = false */) { bool bottommost_level /* = false */) {
// TODO(andrewkr): can we just open a file and throw it away if it ends up // TODO(andrewkr): can we just open a file and throw it away if it ends up

@ -92,6 +92,15 @@ class RangeDelAggregator {
bool ShouldDeleteImpl(const Slice& internal_key, bool ShouldDeleteImpl(const Slice& internal_key,
RangePositioningMode mode = kFullScan); RangePositioningMode mode = kFullScan);
// Checks whether range deletions cover any keys between `start` and `end`,
// inclusive.
//
// @param start User key representing beginning of range to check for overlap.
// @param end User key representing end of range to check for overlap. This
// argument is inclusive, so the existence of a range deletion covering
// `end` causes this to return true.
bool IsRangeOverlapped(const Slice& start, const Slice& end);
bool ShouldAddTombstones(bool bottommost_level = false); bool ShouldAddTombstones(bool bottommost_level = false);
// Adds tombstones to the tombstone aggregation structure maintained by this // Adds tombstones to the tombstone aggregation structure maintained by this

@ -28,9 +28,9 @@ enum Direction {
void VerifyRangeDels(const std::vector<RangeTombstone>& range_dels, void VerifyRangeDels(const std::vector<RangeTombstone>& range_dels,
const std::vector<ExpectedPoint>& expected_points) { const std::vector<ExpectedPoint>& expected_points) {
auto icmp = InternalKeyComparator(BytewiseComparator());
// Test same result regardless of which order the range deletions are added. // Test same result regardless of which order the range deletions are added.
for (Direction dir : {kForward, kReverse}) { for (Direction dir : {kForward, kReverse}) {
auto icmp = InternalKeyComparator(BytewiseComparator());
RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, true); RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, true);
std::vector<std::string> keys, values; std::vector<std::string> keys, values;
for (const auto& range_del : range_dels) { for (const auto& range_del : range_dels) {
@ -62,6 +62,27 @@ void VerifyRangeDels(const std::vector<RangeTombstone>& range_dels,
} }
} }
} }
RangeDelAggregator range_del_agg(icmp, {} /* snapshots */,
false /* collapse_deletions */);
std::vector<std::string> keys, values;
for (const auto& range_del : range_dels) {
auto key_and_value = range_del.Serialize();
keys.push_back(key_and_value.first.Encode().ToString());
values.push_back(key_and_value.second.ToString());
}
std::unique_ptr<test::VectorIterator> range_del_iter(
new test::VectorIterator(keys, values));
range_del_agg.AddTombstones(std::move(range_del_iter));
for (size_t i = 1; i < expected_points.size(); ++i) {
bool overlapped = range_del_agg.IsRangeOverlapped(
expected_points[i - 1].begin, expected_points[i].begin);
if (expected_points[i - 1].seq > 0 || expected_points[i].seq > 0) {
ASSERT_TRUE(overlapped);
} else {
ASSERT_FALSE(overlapped);
}
}
} }
} // anonymous namespace } // anonymous namespace
@ -112,9 +133,14 @@ TEST_F(RangeDelAggregatorTest, SameEndKey) {
} }
TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) {
VerifyRangeDels( VerifyRangeDels({{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, {{" ", 0},
{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, {"a", 5},
{{" ", 0}, {"a", 5}, {"b", 0}, {"c", 10}, {"d", 0}, {"e", 15}, {"f", 0}}); {"b", 0},
{"c", 10},
{"d", 0},
{"da", 0},
{"e", 15},
{"f", 0}});
} }
// Note the Cover* tests also test cases where tombstones are inserted under a // Note the Cover* tests also test cases where tombstones are inserted under a

@ -874,22 +874,6 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
} }
} }
void Version::AddRangeDelIteratorsForLevel(
const ReadOptions& read_options, const EnvOptions& soptions, int level,
std::vector<InternalIterator*>* range_del_iters) {
range_del_iters->clear();
for (size_t i = 0; i < storage_info_.LevelFilesBrief(level).num_files; i++) {
const auto& file = storage_info_.LevelFilesBrief(level).files[i];
auto* range_del_iter = cfd_->table_cache()->NewRangeTombstoneIterator(
read_options, soptions, cfd_->internal_comparator(), file.fd,
cfd_->internal_stats()->GetFileReadHist(level),
false /* skip_filters */, level);
if (range_del_iter != nullptr) {
range_del_iters->push_back(range_del_iter);
}
}
}
VersionStorageInfo::VersionStorageInfo( VersionStorageInfo::VersionStorageInfo(
const InternalKeyComparator* internal_comparator, const InternalKeyComparator* internal_comparator,
const Comparator* user_comparator, int levels, const Comparator* user_comparator, int levels,

@ -525,10 +525,6 @@ class Version {
MergeIteratorBuilder* merger_iter_builder, MergeIteratorBuilder* merger_iter_builder,
int level, RangeDelAggregator* range_del_agg); int level, RangeDelAggregator* range_del_agg);
void AddRangeDelIteratorsForLevel(
const ReadOptions& read_options, const EnvOptions& soptions, int level,
std::vector<InternalIterator*>* range_del_iters);
// Lookup the value for key. If found, store it in *val and // Lookup the value for key. If found, store it in *val and
// return OK. Else return a non-OK status. // return OK. Else return a non-OK status.
// Uses *operands to store merge_operator operations to apply later. // Uses *operands to store merge_operator operations to apply later.

Loading…
Cancel
Save