Fix memtable-only iterator regression (#10705)

Summary:
When there is a single memtable without range tombstones and no SST files in the database, DBIter should wrap the memtable iterator directly. Currently we create a merging iterator on top of the memtable iterator and have DBIter wrap around that instead. This extra layer causes an iterator performance regression, which this PR fixes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10705

Test Plan:
- `make check`
- Performance:
  - Set up: `./db_bench -benchmarks=filluniquerandom -write_buffer_size=$((1 << 30)) -num=10000`
  - Benchmark: `./db_bench -benchmarks=seekrandom -use_existing_db=true -avoid_flush_during_recovery=true -write_buffer_size=$((1 << 30)) -num=10000 -threads=16 -duration=60 -seek_nexts=$seek_nexts`
```
seek_nexts    main op/sec    #10705      RocksDB v7.6
0             5746568        5749033     5786180
30            2411690        3006466     2837699
1000          102556         128902      124667
```

Reviewed By: ajkr

Differential Revision: D39644221

Pulled By: cbi42

fbshipit-source-id: 8063ff611ba31b0e5670041da3927c8c54b2097d
main
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent 9e01de9066
commit 749b849a34
  1. 68
      db/arena_wrapped_db_iter.cc
  2. 2
      db/arena_wrapped_db_iter.h
  3. 3
      db/column_family.cc
  4. 28
      db/db_impl/db_impl.cc
  5. 1
      db/db_range_del_test.cc
  6. 54
      db/memtable_list.cc
  7. 6
      db/memtable_list.h
  8. 67
      db/version_set.cc
  9. 41
      table/merging_iterator.cc
  10. 26
      table/merging_iterator.h

@ -58,48 +58,60 @@ Status ArenaWrappedDBIter::Refresh() {
uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
while (true) {
if (sv_number_ != cur_sv_number) {
Env* env = db_iter_->env();
db_iter_->~DBIter();
arena_.~Arena();
new (&arena_) Arena();
auto reinit_internal_iter = [&]() {
Env* env = db_iter_->env();
db_iter_->~DBIter();
arena_.~Arena();
new (&arena_) Arena();
SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
if (read_callback_) {
read_callback_->Refresh(latest_seq);
}
Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
sv->current, latest_seq,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
allow_refresh_);
SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
if (read_callback_) {
read_callback_->Refresh(latest_seq);
}
Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
sv->current, latest_seq,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
allow_refresh_);
InternalIterator* internal_iter = db_impl_->NewInternalIterator(
read_options_, cfd_, sv, &arena_, latest_seq,
/* allow_unprepared_value */ true, /* db_iter */ this);
SetIterUnderDBIter(internal_iter);
InternalIterator* internal_iter = db_impl_->NewInternalIterator(
read_options_, cfd_, sv, &arena_, latest_seq,
/* allow_unprepared_value */ true, /* db_iter */ this);
SetIterUnderDBIter(internal_iter);
};
while (true) {
if (sv_number_ != cur_sv_number) {
reinit_internal_iter();
break;
} else {
SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
// Refresh range-tombstones in MemTable
if (!read_options_.ignore_range_deletions) {
assert(memtable_range_tombstone_iter_ != nullptr);
if (memtable_range_tombstone_iter_ != nullptr) {
SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
auto t = sv->mem->NewRangeTombstoneIterator(
read_options_, latest_seq, false /* immutable_memtable */);
delete *memtable_range_tombstone_iter_;
if (t == nullptr || t->empty()) {
SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
auto t = sv->mem->NewRangeTombstoneIterator(
read_options_, latest_seq, false /* immutable_memtable */);
if (!t || t->empty()) {
if (memtable_range_tombstone_iter_) {
*memtable_range_tombstone_iter_ = nullptr;
}
delete t;
} else { // current mutable memtable has range tombstones
if (!memtable_range_tombstone_iter_) {
delete t;
cfd_->ReturnThreadLocalSuperVersion(sv);
// The memtable under DBIter did not have range tombstone before
// refresh.
reinit_internal_iter();
break;
} else {
delete *memtable_range_tombstone_iter_;
*memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator>(t),
&cfd_->internal_comparator(), nullptr, nullptr);
}
cfd_->ReturnThreadLocalSuperVersion(sv);
}
cfd_->ReturnThreadLocalSuperVersion(sv);
}
// Refresh latest sequence number
db_iter_->set_sequence(latest_seq);

@ -107,6 +107,8 @@ class ArenaWrappedDBIter : public Iterator {
ReadCallback* read_callback_;
bool expose_blob_index_ = false;
bool allow_refresh_ = true;
// If this is nullptr, it means the mutable memtable does not contain range
// tombstone when added under this DBIter.
TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr;
};

@ -1144,7 +1144,8 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
merge_iter_builder.AddIterator(
super_version->mem->NewIterator(read_opts, &arena));
super_version->imm->AddIterators(read_opts, &merge_iter_builder);
super_version->imm->AddIterators(read_opts, &merge_iter_builder,
false /* add_range_tombstone_iter */);
ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
auto read_seq = super_version->current->version_set()->LastSequence();

@ -1803,31 +1803,31 @@ InternalIterator* DBImpl::NewInternalIterator(
&cfd->internal_comparator(), arena,
!read_options.total_order_seek &&
super_version->mutable_cf_options.prefix_extractor != nullptr);
// Collect iterator for mutable mem
merge_iter_builder.AddIterator(
super_version->mem->NewIterator(read_options, arena));
// Collect iterator for mutable memtable
auto mem_iter = super_version->mem->NewIterator(read_options, arena);
Status s;
if (!read_options.ignore_range_deletions) {
TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
auto range_del_iter = super_version->mem->NewRangeTombstoneIterator(
read_options, sequence, false /* immutable_memtable */);
if (range_del_iter == nullptr || range_del_iter->empty()) {
delete range_del_iter;
merge_iter_builder.AddRangeTombstoneIterator(nullptr);
} else {
merge_iter_builder.AddRangeTombstoneIterator(
new TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
&cfd->ioptions()->internal_comparator, nullptr /* smallest */,
nullptr /* largest */));
mem_tombstone_iter = new TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
&cfd->ioptions()->internal_comparator, nullptr /* smallest */,
nullptr /* largest */);
}
merge_iter_builder.AddPointAndTombstoneIterator(mem_iter,
mem_tombstone_iter);
} else {
merge_iter_builder.AddIterator(mem_iter);
}
// Collect all needed child iterators for immutable memtables
if (s.ok()) {
super_version->imm->AddIterators(read_options, &merge_iter_builder);
if (!read_options.ignore_range_deletions) {
s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
merge_iter_builder);
}
super_version->imm->AddIterators(read_options, &merge_iter_builder,
!read_options.ignore_range_deletions);
}
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
if (s.ok()) {

@ -2303,7 +2303,6 @@ TEST_F(DBRangeDelTest, TombstoneOnlyLevel) {
InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
read_options, &merge_iter_builder, 1 /* level */, true);
// This is needed to make LevelIterator range tombstone aware
merge_iter_builder.AddIterator(level_iter);
auto miter = merge_iter_builder.Finish();
auto k = Key(3);
IterKey target;

@ -210,30 +210,6 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
return Status::OK();
}
// Adds one range tombstone iterator entry to `builder` for every memtable in
// memlist_, in memlist_ iteration order.
//
// read_opts: supplies the optional snapshot that bounds the read sequence
//            number and is forwarded to NewRangeTombstoneIterator().
// arena:     unused by this overload (the parameter name is commented out).
// builder:   receives exactly one AddRangeTombstoneIterator() call per
//            memtable; a nullptr is passed when the memtable has no usable
//            range tombstones.
//
// Always returns Status::OK().
Status MemTableListVersion::AddRangeTombstoneIterators(
const ReadOptions& read_opts, Arena* /*arena*/,
MergeIteratorBuilder& builder) {
// Except for snapshot read, using kMaxSequenceNumber is OK because these
// are immutable memtables.
SequenceNumber read_seq = read_opts.snapshot != nullptr
? read_opts.snapshot->GetSequenceNumber()
: kMaxSequenceNumber;
for (auto& m : memlist_) {
auto range_del_iter = m->NewRangeTombstoneIterator(
read_opts, read_seq, true /* immutable_memtable */);
if (range_del_iter == nullptr || range_del_iter->empty()) {
// Nothing to expose for this memtable: release the (possibly null)
// iterator and still register a nullptr entry with the builder.
delete range_del_iter;
builder.AddRangeTombstoneIterator(nullptr);
} else {
// Hand range_del_iter to a unique_ptr inside a TruncatedRangeDelIterator
// constructed with null smallest/largest arguments.
builder.AddRangeTombstoneIterator(new TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
&m->GetInternalKeyComparator(), nullptr /* smallest */,
nullptr /* largest */));
}
}
return Status::OK();
}
void MemTableListVersion::AddIterators(
const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
Arena* arena) {
@ -242,11 +218,33 @@ void MemTableListVersion::AddIterators(
}
}
void MemTableListVersion::AddIterators(
const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
void MemTableListVersion::AddIterators(const ReadOptions& options,
MergeIteratorBuilder* merge_iter_builder,
bool add_range_tombstone_iter) {
for (auto& m : memlist_) {
merge_iter_builder->AddIterator(
m->NewIterator(options, merge_iter_builder->GetArena()));
auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
if (!add_range_tombstone_iter || options.ignore_range_deletions) {
merge_iter_builder->AddIterator(mem_iter);
} else {
// Except for snapshot read, using kMaxSequenceNumber is OK because these
// are immutable memtables.
SequenceNumber read_seq = options.snapshot != nullptr
? options.snapshot->GetSequenceNumber()
: kMaxSequenceNumber;
TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
auto range_del_iter = m->NewRangeTombstoneIterator(
options, read_seq, true /* immutale_memtable */);
if (range_del_iter == nullptr || range_del_iter->empty()) {
delete range_del_iter;
} else {
mem_tombstone_iter = new TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
&m->GetInternalKeyComparator(), nullptr /* smallest */,
nullptr /* largest */);
}
merge_iter_builder->AddPointAndTombstoneIterator(mem_iter,
mem_tombstone_iter);
}
}
}

@ -111,15 +111,13 @@ class MemTableListVersion {
Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
RangeDelAggregator* range_del_agg);
Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
MergeIteratorBuilder& builder);
void AddIterators(const ReadOptions& options,
std::vector<InternalIterator*>* iterator_list,
Arena* arena);
void AddIterators(const ReadOptions& options,
MergeIteratorBuilder* merge_iter_builder);
MergeIteratorBuilder* merge_iter_builder,
bool add_range_tombstone_iter);
uint64_t GetTotalNumEntries() const;

@ -941,18 +941,18 @@ namespace {
class LevelIterator final : public InternalIterator {
public:
// @param read_options Must outlive this iterator.
LevelIterator(TableCache* table_cache, const ReadOptions& read_options,
const FileOptions& file_options,
const InternalKeyComparator& icomparator,
const LevelFilesBrief* flevel,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool should_sample, HistogramImpl* file_read_hist,
TableReaderCaller caller, bool skip_filters, int level,
RangeDelAggregator* range_del_agg,
const std::vector<AtomicCompactionUnitBoundary>*
compaction_boundaries = nullptr,
bool allow_unprepared_value = false,
MergeIteratorBuilder* merge_iter_builder = nullptr)
LevelIterator(
TableCache* table_cache, const ReadOptions& read_options,
const FileOptions& file_options, const InternalKeyComparator& icomparator,
const LevelFilesBrief* flevel,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool should_sample, HistogramImpl* file_read_hist,
TableReaderCaller caller, bool skip_filters, int level,
RangeDelAggregator* range_del_agg,
const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
nullptr,
bool allow_unprepared_value = false,
TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
: table_cache_(table_cache),
read_options_(read_options),
file_options_(file_options),
@ -975,10 +975,8 @@ class LevelIterator final : public InternalIterator {
to_return_sentinel_(false) {
// Empty level is not supported.
assert(flevel_ != nullptr && flevel_->num_files > 0);
if (merge_iter_builder && !read_options.ignore_range_deletions) {
// lazily initialize range_tombstone_iter_ together with file_iter_
merge_iter_builder->AddRangeTombstoneIterator(nullptr,
&range_tombstone_iter_);
if (range_tombstone_iter_ptr_) {
*range_tombstone_iter_ptr_ = &range_tombstone_iter_;
}
}
@ -1840,14 +1838,22 @@ InternalIterator* Version::TEST_GetLevelIterator(
int level, bool allow_unprepared_value) {
auto* arena = merge_iter_builder->GetArena();
auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
return new (mem) LevelIterator(
TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
auto level_iter = new (mem) LevelIterator(
cfd_->table_cache(), read_options, file_options_,
cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
allow_unprepared_value, merge_iter_builder);
allow_unprepared_value, &tombstone_iter_ptr);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(level_iter);
} else {
merge_iter_builder->AddPointAndTombstoneIterator(
level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
}
return level_iter;
}
uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
@ -1927,10 +1933,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
auto* arena = merge_iter_builder->GetArena();
if (level == 0) {
// Merge all level zero files together since they may overlap
TruncatedRangeDelIterator* iter = nullptr;
TruncatedRangeDelIterator* tombstone_iter = nullptr;
for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
const auto& file = storage_info_.LevelFilesBrief(0).files[i];
merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
auto table_iter = cfd_->table_cache()->NewIterator(
read_options, soptions, cfd_->internal_comparator(),
*file.file_metadata, /*range_del_agg=*/nullptr,
mutable_cf_options_.prefix_extractor, nullptr,
@ -1938,9 +1944,13 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
TableReaderCaller::kUserIterator, arena,
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value, &iter));
if (!read_options.ignore_range_deletions) {
merge_iter_builder->AddRangeTombstoneIterator(iter);
/*largest_compaction_key=*/nullptr, allow_unprepared_value,
&tombstone_iter);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(table_iter);
} else {
merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
tombstone_iter);
}
}
if (should_sample) {
@ -1957,14 +1967,21 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
// walks through the non-overlapping files in the level, opening them
// lazily.
auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
merge_iter_builder->AddIterator(new (mem) LevelIterator(
TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
auto level_iter = new (mem) LevelIterator(
cfd_->table_cache(), read_options, soptions,
cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
/*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
allow_unprepared_value, merge_iter_builder));
allow_unprepared_value, &tombstone_iter_ptr);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(level_iter);
} else {
merge_iter_builder->AddPointAndTombstoneIterator(
level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
}
}
}

@ -1295,22 +1295,40 @@ void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
}
}
void MergeIteratorBuilder::AddRangeTombstoneIterator(
TruncatedRangeDelIterator* iter, TruncatedRangeDelIterator*** iter_ptr) {
if (!use_merging_iter) {
void MergeIteratorBuilder::AddPointAndTombstoneIterator(
InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
TruncatedRangeDelIterator*** tombstone_iter_ptr) {
// tombstone_iter_ptr != nullptr means point_iter is a LevelIterator.
bool add_range_tombstone = tombstone_iter ||
!merge_iter->range_tombstone_iters_.empty() ||
tombstone_iter_ptr;
if (!use_merging_iter && (add_range_tombstone || first_iter)) {
use_merging_iter = true;
if (first_iter) {
merge_iter->AddIterator(first_iter);
first_iter = nullptr;
}
}
merge_iter->AddRangeTombstoneIterator(iter);
if (iter_ptr) {
// This is needed instead of setting to &range_tombstone_iters_[i] directly
// here since the memory address of range_tombstone_iters_[i] might change
// during vector resizing.
range_del_iter_ptrs_.emplace_back(
merge_iter->range_tombstone_iters_.size() - 1, iter_ptr);
if (use_merging_iter) {
merge_iter->AddIterator(point_iter);
if (add_range_tombstone) {
// If there was a gap, fill in nullptr as empty range tombstone iterators.
while (merge_iter->range_tombstone_iters_.size() <
merge_iter->children_.size() - 1) {
merge_iter->AddRangeTombstoneIterator(nullptr);
}
merge_iter->AddRangeTombstoneIterator(tombstone_iter);
}
if (tombstone_iter_ptr) {
// This is needed instead of setting to &range_tombstone_iters_[i]
// directly here since the memory address of range_tombstone_iters_[i]
// might change during vector resizing.
range_del_iter_ptrs_.emplace_back(
merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr);
}
} else {
first_iter = point_iter;
}
}
@ -1323,8 +1341,7 @@ InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) {
for (auto& p : range_del_iter_ptrs_) {
*(p.second) = &(merge_iter->range_tombstone_iters_[p.first]);
}
if (db_iter) {
assert(!merge_iter->range_tombstone_iters_.empty());
if (db_iter && !merge_iter->range_tombstone_iters_.empty()) {
// memtable is always the first level
db_iter->SetMemtableRangetombstoneIter(
&merge_iter->range_tombstone_iters_.front());

@ -38,6 +38,8 @@ extern InternalIterator* NewMergingIterator(
class MergingIterator;
// A builder class to build a merging iterator by adding iterators one by one.
// User should call only one of AddIterator() or AddPointAndTombstoneIterator()
// exclusively for the same builder.
class MergeIteratorBuilder {
public:
// comparator: the comparator used in merging comparator
@ -49,16 +51,20 @@ class MergeIteratorBuilder {
// Add iter to the merging iterator.
void AddIterator(InternalIterator* iter);
// Add a range tombstone iterator to underlying merge iterator.
// See MergingIterator::AddRangeTombstoneIterator() for more detail.
//
// If `iter_ptr` is not nullptr, *iter_ptr will be set to where the merging
// iterator stores `iter` when MergeIteratorBuilder::Finish() is called. This
// is used by level iterator to update range tombstone iters when switching to
// a different SST file.
void AddRangeTombstoneIterator(
TruncatedRangeDelIterator* iter,
TruncatedRangeDelIterator*** iter_ptr = nullptr);
// Add a point key iterator and a range tombstone iterator.
// `tombstone_iter_ptr` should be set only when `point_iter` is a
// LevelIterator.
// *tombstone_iter_ptr will be set to where the merging iterator stores
// `tombstone_iter` when MergeIteratorBuilder::Finish() is called. This is
// used by LevelIterator to update range tombstone iters when switching to a
// different SST file. If a single point iterator with a nullptr range
// tombstone iterator is provided, and the point iterator is not a level
// iterator, then this builder will return the point iterator directly,
// instead of creating a merging iterator on top of it. Internally, if all
// point iterators are not LevelIterator, then range tombstone iterator is
// only added to the merging iter if there is a non-null `tombstone_iter`.
void AddPointAndTombstoneIterator(
InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr);
// Get arena used to build the merging iterator. It is called when a child
// iterator needs to be allocated.

Loading…
Cancel
Save