Reduce iterator key comparison for upper/lower bound check (2nd attempt) (#5468)

Summary:
This is a second attempt for https://github.com/facebook/rocksdb/issues/5111, with the fix to redo iterate bounds check after `SeekXXX()`. This is because MyRocks may change iterate bounds between seek.

See https://github.com/facebook/rocksdb/issues/5111 for original benchmark result and discussion.

Closes https://github.com/facebook/rocksdb/issues/5463.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5468

Test Plan: Existing rocksdb tests, plus myrocks test `rocksdb.optimizer_loose_index_scans` and `rocksdb.group_min_max`.

Differential Revision: D15863332

fbshipit-source-id: ab4aba5899838591806b8673899bd465f3f53e18
main
Yi Wu 5 years ago committed by Facebook Github Bot
parent cfdf2116d3
commit 662ce62044
  1. 1
      HISTORY.md
  2. 9
      db/db_iter.cc
  3. 62
      db/db_iterator_test.cc
  4. 48
      db/version_set.cc
  5. 30
      table/block_based/block_based_table_reader.cc
  6. 15
      table/block_based/block_based_table_reader.h
  7. 25
      table/internal_iterator.h
  8. 22
      table/iterator_wrapper.h
  9. 24
      table/merging_iterator.cc

@ -24,6 +24,7 @@
* Reduce binary search when iterator reseek into the same data block.
* DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
* Merging iterator to avoid child iterator reseek for some cases
* Reduce iterator key comparision for upper/lower bound check.
* Log Writer will flush after finishing the whole record, rather than a fragment.
* Lower MultiGet batching API latency by reading data blocks from disk in parallel

@ -467,7 +467,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check)
is_key_seqnum_zero_ = (ikey_.sequence == 0);
if (iterate_upper_bound_ != nullptr &&
assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
break;
}
@ -859,7 +861,10 @@ void DBIter::PrevInternal() {
return;
}
if (iterate_lower_bound_ != nullptr &&
assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
user_comparator_.Compare(saved_key_.GetUserKey(),
*iterate_lower_bound_) >= 0);
if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
user_comparator_.Compare(saved_key_.GetUserKey(),
*iterate_lower_bound_) < 0) {
// We've iterated earlier than the user-specified lower bound.

@ -2759,6 +2759,68 @@ TEST_P(DBIteratorTest, AvoidReseekChildIterator) {
SyncPoint::GetInstance()->DisableProcessing();
}
// MyRocks may change iterate bounds before seek. Simply test to make sure such
// usage doesn't break iterator.
TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
Options options = CurrentOptions();
options.compression = CompressionType::kNoCompression;
BlockBasedTableOptions table_options;
table_options.block_size = 100;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
std::string value(50, 'v');
Reopen(options);
ASSERT_OK(Put("aaa", value));
ASSERT_OK(Flush());
ASSERT_OK(Put("bbb", "v"));
ASSERT_OK(Put("ccc", "v"));
ASSERT_OK(Put("ddd", "v"));
ASSERT_OK(Flush());
ASSERT_OK(Put("eee", "v"));
ASSERT_OK(Flush());
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
std::string ub1 = "e";
std::string ub2 = "c";
Slice ub(ub1);
ReadOptions read_opts1;
read_opts1.iterate_upper_bound = &ub;
Iterator* iter = NewIterator(read_opts1);
// Seek and iterate accross block boundary.
iter->Seek("b");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("bbb", iter->key());
ub = Slice(ub2);
iter->Seek("b");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("bbb", iter->key());
iter->Next();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
delete iter;
std::string lb1 = "a";
std::string lb2 = "c";
Slice lb(lb1);
ReadOptions read_opts2;
read_opts2.iterate_lower_bound = &lb;
iter = NewIterator(read_opts2);
iter->SeekForPrev("d");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("ccc", iter->key());
lb = Slice(lb2);
iter->SeekForPrev("d");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("ccc", iter->key());
iter->Prev();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
delete iter;
}
INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
testing::Values(true, false));

@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator {
void SeekToFirst() override;
void SeekToLast() override;
void Next() final override;
bool NextAndGetResult(Slice* ret_key) override;
bool NextAndGetResult(IterateResult* result) override;
void Prev() override;
bool Valid() const override { return file_iter_.Valid(); }
@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator {
assert(Valid());
return file_iter_.key();
}
Slice value() const override {
assert(Valid());
return file_iter_.value();
}
Status status() const override {
return file_iter_.iter() ? file_iter_.status() : Status::OK();
}
inline bool MayBeOutOfLowerBound() override {
assert(Valid());
return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
}
inline bool MayBeOutOfUpperBound() override {
assert(Valid());
return file_iter_.MayBeOutOfUpperBound();
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
if (file_iter_.iter()) {
file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
}
}
bool IsKeyPinned() const override {
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
file_iter_.iter() && file_iter_.IsKeyPinned();
}
bool IsValuePinned() const override {
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
file_iter_.iter() && file_iter_.IsValuePinned();
@ -955,6 +970,7 @@ class LevelIterator final : public InternalIterator {
smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
}
CheckMayBeOutOfLowerBound();
return table_cache_->NewIterator(
read_options_, env_options_, icomparator_, *file_meta.file_metadata,
range_del_agg_, prefix_extractor_,
@ -963,6 +979,19 @@ class LevelIterator final : public InternalIterator {
largest_compaction_key);
}
// Check if current file being fully within iterate_lower_bound.
//
// Note MyRocks may update iterate bounds between seek. To workaround it,
// we need to check and update may_be_out_of_lower_bound_ accordingly.
void CheckMayBeOutOfLowerBound() {
if (Valid() && read_options_.iterate_lower_bound != nullptr) {
may_be_out_of_lower_bound_ =
user_comparator_.Compare(
ExtractUserKey(file_smallest_key(file_index_)),
*read_options_.iterate_lower_bound) < 0;
}
}
TableCache* table_cache_;
const ReadOptions read_options_;
const EnvOptions& env_options_;
@ -976,6 +1005,7 @@ class LevelIterator final : public InternalIterator {
bool should_sample_;
TableReaderCaller caller_;
bool skip_filters_;
bool may_be_out_of_lower_bound_ = true;
size_t file_index_;
int level_;
RangeDelAggregator* range_del_agg_;
@ -1011,6 +1041,7 @@ void LevelIterator::Seek(const Slice& target) {
file_iter_.Seek(target);
}
SkipEmptyFileForward();
CheckMayBeOutOfLowerBound();
}
void LevelIterator::SeekForPrev(const Slice& target) {
@ -1024,6 +1055,7 @@ void LevelIterator::SeekForPrev(const Slice& target) {
file_iter_.SeekForPrev(target);
SkipEmptyFileBackward();
}
CheckMayBeOutOfLowerBound();
}
void LevelIterator::SeekToFirst() {
@ -1032,6 +1064,7 @@ void LevelIterator::SeekToFirst() {
file_iter_.SeekToFirst();
}
SkipEmptyFileForward();
CheckMayBeOutOfLowerBound();
}
void LevelIterator::SeekToLast() {
@ -1040,15 +1073,17 @@ void LevelIterator::SeekToLast() {
file_iter_.SeekToLast();
}
SkipEmptyFileBackward();
CheckMayBeOutOfLowerBound();
}
void LevelIterator::Next() { NextImpl(); }
bool LevelIterator::NextAndGetResult(Slice* ret_key) {
bool LevelIterator::NextAndGetResult(IterateResult* result) {
NextImpl();
bool is_valid = Valid();
if (is_valid) {
*ret_key = key();
result->key = key();
result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
}
return is_valid;
}
@ -4366,10 +4401,9 @@ Status VersionSet::Recover(
", last_sequence is %" PRIu64 ", log_number is %" PRIu64
",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
",min_log_number_to_keep is %" PRIu64 "\n",
manifest_path.c_str(), manifest_file_number_,
next_file_number_.load(), last_sequence_.load(), log_number,
prev_log_number_, column_family_set_->GetMaxColumnFamily(),
min_log_number_to_keep_2pc());
manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
last_sequence_.load(), log_number, prev_log_number_,
column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc());
for (auto cfd : *column_family_set_) {
if (cfd->IsDropped()) {

@ -2896,6 +2896,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
FindKeyForward();
}
CheckDataBlockWithinUpperBound();
CheckOutOfBound();
if (target) {
@ -2952,6 +2953,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
block_iter_.SeekForPrev(target);
FindKeyBackward();
CheckDataBlockWithinUpperBound();
assert(!block_iter_.Valid() ||
icomp_.Compare(target, block_iter_.key()) >= 0);
}
@ -2969,6 +2971,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
InitDataBlock();
block_iter_.SeekToLast();
FindKeyBackward();
CheckDataBlockWithinUpperBound();
}
template <class TBlockIter, typename TValue>
@ -2984,11 +2987,12 @@ void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
template <class TBlockIter, typename TValue>
bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
Slice* ret_key) {
IterateResult* result) {
Next();
bool is_valid = Valid();
if (is_valid) {
*ret_key = key();
result->key = key();
result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
}
return is_valid;
}
@ -3087,6 +3091,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
/*for_compaction=*/lookup_context_.caller ==
TableReaderCaller::kCompaction);
block_iter_points_to_real_block_ = true;
CheckDataBlockWithinUpperBound();
}
}
@ -3140,13 +3145,12 @@ void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
return;
}
// Whether next data block is out of upper bound, if there is one.
bool next_block_is_out_of_bound = false;
if (read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_) {
next_block_is_out_of_bound =
(user_comparator_.Compare(*read_options_.iterate_upper_bound,
const bool next_block_is_out_of_bound =
read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
assert(!next_block_is_out_of_bound ||
user_comparator_.Compare(*read_options_.iterate_upper_bound,
index_iter_->user_key()) <= 0);
}
ResetDataIter();
index_iter_->Next();
if (next_block_is_out_of_bound) {
@ -3210,6 +3214,16 @@ void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
}
}
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::CheckDataBlockWithinUpperBound() {
if (read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_) {
data_block_within_upper_bound_ =
(user_comparator_.Compare(*read_options_.iterate_upper_bound,
index_iter_->user_key()) > 0);
}
}
InternalIterator* BlockBasedTable::NewIterator(
const ReadOptions& read_options, const SliceTransform* prefix_extractor,
Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) {

@ -652,7 +652,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
void SeekToFirst() override;
void SeekToLast() override;
void Next() final override;
bool NextAndGetResult(Slice* ret_key) override;
bool NextAndGetResult(IterateResult* result) override;
void Prev() override;
bool Valid() const override {
return !is_out_of_bound_ &&
@ -702,6 +702,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
// Whether iterator invalidated for being out of bound.
bool IsOutOfBound() override { return is_out_of_bound_; }
inline bool MayBeOutOfUpperBound() override {
assert(Valid());
return !data_block_within_upper_bound_;
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
}
@ -768,6 +773,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
bool block_iter_points_to_real_block_;
// See InternalIteratorBase::IsOutOfBound().
bool is_out_of_bound_ = false;
// Whether current data block being fully within iterate upper bound.
bool data_block_within_upper_bound_ = false;
// True if we're standing at the first key of a block, and we haven't loaded
// that block yet. A call to value() will trigger loading the block.
bool is_at_first_key_from_index_ = false;
@ -802,6 +809,12 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
void FindBlockForward();
void FindKeyBackward();
void CheckOutOfBound();
// Check if data block is fully within iterate_upper_bound.
//
// Note MyRocks may update iterate bounds between seek. To workaround it,
// we need to check and update data_block_within_upper_bound_ accordingly.
void CheckDataBlockWithinUpperBound();
};
} // namespace rocksdb

@ -17,6 +17,11 @@ namespace rocksdb {
class PinnedIteratorsManager;
struct IterateResult {
Slice key;
bool may_be_out_of_upper_bound;
};
template <class TValue>
class InternalIteratorBase : public Cleanable {
public:
@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable {
// REQUIRES: Valid()
virtual void Next() = 0;
virtual bool NextAndGetResult(Slice* ret_key) {
// Moves to the next entry in the source, and return result. Iterator
// implementation should override this method to help methods inline better,
// or when MayBeOutOfUpperBound() is non-trivial.
// REQUIRES: Valid()
virtual bool NextAndGetResult(IterateResult* result) {
Next();
bool is_valid = Valid();
if (is_valid) {
*ret_key = key();
result->key = key();
// Default may_be_out_of_upper_bound to true to avoid unnecessary virtual
// call. If an implementation has non-trivial MayBeOutOfUpperBound(),
// it should also override NextAndGetResult().
result->may_be_out_of_upper_bound = true;
assert(MayBeOutOfUpperBound());
}
return is_valid;
}
@ -97,6 +111,13 @@ class InternalIteratorBase : public Cleanable {
// keys above the upper bound, IsOutOfBound() must return false.
virtual bool IsOutOfBound() { return false; }
// Keys return from this iterator can be smaller than iterate_lower_bound.
virtual bool MayBeOutOfLowerBound() { return true; }
// Keys return from this iterator can be larger or equal to
// iterate_upper_bound.
virtual bool MayBeOutOfUpperBound() { return true; }
// Pass the PinnedIteratorsManager to the Iterator, most Iterators dont
// communicate with PinnedIteratorsManager so default implementation is no-op
// but for Iterators that need to communicate with PinnedIteratorsManager

@ -56,7 +56,10 @@ class IteratorWrapperBase {
// Iterator interface methods
bool Valid() const { return valid_; }
Slice key() const { assert(Valid()); return key_; }
Slice key() const {
assert(Valid());
return result_.key;
}
TValue value() const {
assert(Valid());
return iter_->value();
@ -65,7 +68,7 @@ class IteratorWrapperBase {
Status status() const { assert(iter_); return iter_->status(); }
void Next() {
assert(iter_);
valid_ = iter_->NextAndGetResult(&key_);
valid_ = iter_->NextAndGetResult(&result_);
assert(!valid_ || iter_->status().ok());
}
void Prev() { assert(iter_); iter_->Prev(); Update(); }
@ -83,6 +86,16 @@ class IteratorWrapperBase {
void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); }
void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); }
bool MayBeOutOfLowerBound() {
assert(Valid());
return iter_->MayBeOutOfLowerBound();
}
bool MayBeOutOfUpperBound() {
assert(Valid());
return result_.may_be_out_of_upper_bound;
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {
assert(iter_);
iter_->SetPinnedItersMgr(pinned_iters_mgr);
@ -100,14 +113,15 @@ class IteratorWrapperBase {
void Update() {
valid_ = iter_->Valid();
if (valid_) {
key_ = iter_->key();
assert(iter_->status().ok());
result_.key = iter_->key();
result_.may_be_out_of_upper_bound = true;
}
}
InternalIteratorBase<TValue>* iter_;
IterateResult result_;
bool valid_;
Slice key_;
};
using IteratorWrapper = IteratorWrapperBase<Slice>;

@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator {
current_ = CurrentForward();
}
bool NextAndGetResult(IterateResult* result) override {
Next();
bool is_valid = Valid();
if (is_valid) {
result->key = key();
result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
}
return is_valid;
}
void Prev() override {
assert(Valid());
// Ensure that all children are positioned before key().
@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator {
return current_->value();
}
// Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
// from current child iterator. Potentially as long as one of child iterator
// report out of bound is not possible, we know current key is within bound.
bool MayBeOutOfLowerBound() override {
assert(Valid());
return current_->MayBeOutOfLowerBound();
}
bool MayBeOutOfUpperBound() override {
assert(Valid());
return current_->MayBeOutOfUpperBound();
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
for (auto& child : children_) {

Loading…
Cancel
Save