Revert "Merging iterator to avoid child iterator reseek for some cases (#5286)" (#5871)

Summary:
This reverts commit 9fad3e21eb.

Iterator verification in stress tests sometimes fails with the following assertion:
table/block_based/block_based_table_reader.cc:2973: void rocksdb::BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() [with TBlockIter = rocksdb::DataBlockIter; TValue = rocksdb::Slice]: Assertion `!next_block_is_out_of_bound || user_comparator_.Compare(*read_options_.iterate_upper_bound, index_iter_->user_key()) <= 0' failed.

The failure is likely linked to https://github.com/facebook/rocksdb/pull/5286 together with https://github.com/facebook/rocksdb/pull/5468: the former PR allows some child iterators' Seek() calls to be skipped, so the upper-bound condition there fails to be updated. Strictly speaking, the former PR was merged before the latter one, but the latter feels like the more important improvement, so I choose to revert the former for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5871

Differential Revision: D17689196

fbshipit-source-id: 4ded5be68f67bee2782d31a29cb72ea68f59dd8c
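For illustration only, here is a minimal, self-contained sketch (not RocksDB code; `ChildIter` and `MergingSeek` are made-up names) of the failure mode described in the summary: if the merging layer skips a child iterator's Seek() because the new target is not past the child's current key, and the caller has tightened the upper bound in between, the child keeps a stale out-of-bound decision.

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// Toy child iterator: the out-of-bound decision is evaluated only in Seek().
struct ChildIter {
  std::vector<std::string> keys;
  const std::string* upper_bound;  // caller-owned; may change between seeks
  size_t pos;
  bool out_of_bound;

  void Seek(const std::string& target) {
    pos = 0;
    while (pos < keys.size() && keys[pos] < target) ++pos;
    out_of_bound = pos < keys.size() && keys[pos] >= *upper_bound;
  }
  bool Valid() const { return pos < keys.size() && !out_of_bound; }
  const std::string& key() const { return keys[pos]; }
};

int main() {
  std::string upper_bound = "9";
  // Start in an invalid position so the first seek is always performed.
  ChildIter child{{"5", "9"}, &upper_bound, /*pos=*/2, /*out_of_bound=*/false};

  // Mimics the reverted fast path: skip the child Seek() when the child is
  // already positioned at or past the new (larger) target.
  auto MergingSeek = [&](const std::string& target, bool allow_skip) {
    if (!(allow_skip && child.Valid() && target <= child.key())) {
      child.Seek(target);
    }
  };

  MergingSeek("1", /*allow_skip=*/true);
  assert(child.Valid() && child.key() == "5");  // "5" is below bound "9"

  // The caller tightens the bound before the next seek.
  upper_bound = "3";

  MergingSeek("2", /*allow_skip=*/true);   // skipped: out_of_bound is stale
  std::cout << "with skip:    valid=" << child.Valid() << "\n";  // prints 1

  MergingSeek("2", /*allow_skip=*/false);  // full re-seek honors the new bound
  std::cout << "without skip: valid=" << child.Valid() << "\n";  // prints 0
  return 0;
}
```

This is only an analogy for the interaction the summary describes; the real bound handling lives in the block-based table iterator and is more involved.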
Branch: main
Author: sdong (committed by Facebook Github Bot)
Parent: 503a756e42
Commit: 846e05005d
Changed files (7):
  1. HISTORY.md (2 changed lines)
  2. db/db_iterator_test.cc (69 changed lines)
  3. db/version_set.cc (3 changed lines)
  4. table/block_based/block_based_table_reader.h (3 changed lines)
  5. table/internal_iterator.h (5 changed lines)
  6. table/iterator_wrapper.h (1 changed line)
  7. table/merging_iterator.cc (33 changed lines)

HISTORY.md
@@ -6,6 +6,7 @@
 * Fix bloom filter lookups by the MultiGet batching API when BlockBasedTableOptions::whole_key_filtering is false, by checking that a key is in the prefix_extractor domain and extracting the prefix before looking up.
 * Fix a bug in file ingestion caused by incorrect file number allocation when the number of column families involved in the ingestion exceeds 2.
 * Fix a bug when format_version=3, partitioned filters, and prefix search are used in conjunction. The bug could result in Seek(prefix) returning NotFound for an existing prefix.
+* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause wrong results when reseek happens with a different iterator upper bound.
 ### New Features
 * Introduced DBOptions::max_write_batch_group_size_bytes to configure a maximum limit on the number of bytes written in a single batch of WAL or memtable writes. It is followed when the leader write size is larger than 1/8 of this limit.
 * VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readahead size. For checksum verification before external SST file ingestion, a new option, IngestExternalFileOptions.verify_checksums_readahead_size, is added for this readahead setting.
@@ -108,7 +109,6 @@
 * Fix a bug caused by secondary not skipping the beginning of new MANIFEST.
 * On DB open, delete WAL trash files left behind in wal_dir
 ## 6.2.0 (4/30/2019)
 ### New Features
 * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`.
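As a usage sketch only (the option names are taken from the changelog entries above; the values and the DB path are illustrative), the new knobs can be set roughly like this:

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Cap how many bytes a single WAL/memtable write batch group may contain.
  options.max_write_batch_group_size_bytes = 1 << 20;  // 1 MB
  // Block writers instead of exceeding the pending-writeback budget.
  options.bytes_per_sync = 1 << 20;
  options.strict_bytes_per_sync = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_options_demo", &db);
  if (!s.ok()) {
    return 1;
  }

  // Readahead size used when checksums are verified before ingestion.
  rocksdb::IngestExternalFileOptions ifo;
  ifo.verify_checksums_before_ingest = true;
  ifo.verify_checksums_readahead_size = 2 << 20;  // 2 MB
  // s = db->IngestExternalFile({"/path/to/file.sst"}, ifo);  // illustrative

  delete db;
  return 0;
}
```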

db/db_iterator_test.cc
@@ -2692,75 +2692,6 @@ TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
   SyncPoint::GetInstance()->DisableProcessing();
 }
-TEST_P(DBIteratorTest, AvoidReseekChildIterator) {
-  Options options = CurrentOptions();
-  options.compression = CompressionType::kNoCompression;
-  BlockBasedTableOptions table_options;
-  table_options.block_size = 800;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(options);
-  Random rnd(301);
-  std::string random_str = RandomString(&rnd, 180);
-  ASSERT_OK(Put("1", random_str));
-  ASSERT_OK(Put("2", random_str));
-  ASSERT_OK(Put("3", random_str));
-  ASSERT_OK(Put("4", random_str));
-  ASSERT_OK(Put("8", random_str));
-  ASSERT_OK(Put("9", random_str));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("5", random_str));
-  ASSERT_OK(Put("6", random_str));
-  ASSERT_OK(Put("7", random_str));
-  ASSERT_OK(Flush());
-  // These two keys will be kept in memtable.
-  ASSERT_OK(Put("0", random_str));
-  ASSERT_OK(Put("8", random_str));
-  int num_iter_wrapper_seek = 0;
-  SyncPoint::GetInstance()->SetCallBack(
-      "IteratorWrapper::Seek:0",
-      [&](void* /*arg*/) { num_iter_wrapper_seek++; });
-  SyncPoint::GetInstance()->EnableProcessing();
-  {
-    std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
-    iter->Seek("1");
-    ASSERT_TRUE(iter->Valid());
-    // DBIter always wraps internal iterator with IteratorWrapper,
-    // and in merging iterator each child iterator will be wrapped
-    // with IteratorWrapper.
-    ASSERT_EQ(4, num_iter_wrapper_seek);
-    // child position: 1 and 5
-    num_iter_wrapper_seek = 0;
-    iter->Seek("2");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(3, num_iter_wrapper_seek);
-    // child position: 2 and 5
-    num_iter_wrapper_seek = 0;
-    iter->Seek("6");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(4, num_iter_wrapper_seek);
-    // child position: 8 and 6
-    num_iter_wrapper_seek = 0;
-    iter->Seek("7");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(3, num_iter_wrapper_seek);
-    // child position: 8 and 7
-    num_iter_wrapper_seek = 0;
-    iter->Seek("5");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(4, num_iter_wrapper_seek);
-  }
-  SyncPoint::GetInstance()->DisableProcessing();
-}
 // MyRocks may change iterate bounds before seek. Simply test to make sure such
 // usage doesn't break iterator.
 TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {

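The removed test above counted child-iterator seeks; the test kept below exercises the pattern its comment mentions. A rough sketch of that MyRocks-style usage (the DB path and keys are illustrative): ReadOptions::iterate_upper_bound is a pointer, so the caller mutates the bound it refers to between Seek() calls on the same iterator.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>

#include <memory>
#include <string>

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  if (!rocksdb::DB::Open(options, "/tmp/rocksdb_bound_demo", &db).ok()) {
    return 1;
  }

  // ReadOptions keeps a pointer to the bound, so the bound can be changed in
  // place between seeks without creating a new iterator.
  std::string bound_storage = "z";
  rocksdb::Slice upper_bound(bound_storage);
  rocksdb::ReadOptions ro;
  ro.iterate_upper_bound = &upper_bound;

  std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(ro));
  iter->Seek("a");  // first seek under the bound "z"

  // Tighten the bound, then seek again on the same iterator. The iterator is
  // expected to honor the new bound, not state cached by the first seek.
  bound_storage = "m";
  upper_bound = rocksdb::Slice(bound_storage);
  iter->Seek("b");

  iter.reset();  // release the iterator before closing the DB
  delete db;
  return 0;
}
```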
db/version_set.cc
@@ -861,8 +861,7 @@ class LevelIterator final : public InternalIterator {
                 bool skip_filters, int level, RangeDelAggregator* range_del_agg,
                 const std::vector<AtomicCompactionUnitBoundary>*
                     compaction_boundaries = nullptr)
-      : InternalIterator(false),
-        table_cache_(table_cache),
+      : table_cache_(table_cache),
         read_options_(read_options),
         env_options_(env_options),
         icomparator_(icomparator),

table/block_based/block_based_table_reader.h
@@ -615,8 +615,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
                           const SliceTransform* prefix_extractor,
                           BlockType block_type, TableReaderCaller caller,
                           size_t compaction_readahead_size = 0)
-      : InternalIteratorBase<TValue>(false),
-        table_(table),
+      : table_(table),
         read_options_(read_options),
         icomp_(icomp),
         user_comparator_(icomp.user_comparator()),

table/internal_iterator.h
@@ -25,8 +25,8 @@ struct IterateResult {
 template <class TValue>
 class InternalIteratorBase : public Cleanable {
  public:
-  InternalIteratorBase() : is_mutable_(true) {}
-  InternalIteratorBase(bool _is_mutable) : is_mutable_(_is_mutable) {}
+  InternalIteratorBase() {}
   // No copying allowed
   InternalIteratorBase(const InternalIteratorBase&) = delete;
   InternalIteratorBase& operator=(const InternalIteratorBase&) = delete;
@@ -148,7 +148,6 @@ class InternalIteratorBase : public Cleanable {
   virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) {
     return Status::NotSupported("");
   }
-  bool is_mutable() const { return is_mutable_; }
  protected:
   void SeekForPrevImpl(const Slice& target, const Comparator* cmp) {

table/iterator_wrapper.h
@@ -74,7 +74,6 @@ class IteratorWrapperBase {
   }
   void Prev() { assert(iter_); iter_->Prev(); Update(); }
   void Seek(const Slice& k) {
-    TEST_SYNC_POINT("IteratorWrapper::Seek:0");
     assert(iter_);
     iter_->Seek(k);
     Update();

table/merging_iterator.cc
@@ -109,37 +109,18 @@ class MergingIterator : public InternalIterator {
   }
   void Seek(const Slice& target) override {
-    bool is_increasing_reseek = false;
-    if (current_ != nullptr && direction_ == kForward && status_.ok() &&
-        !prefix_seek_mode_ && comparator_->Compare(target, key()) >= 0) {
-      is_increasing_reseek = true;
-    }
     ClearHeaps();
     status_ = Status::OK();
     for (auto& child : children_) {
-      // If upper bound never changes, we can skip Seek() for
-      // the !Valid() case too, but people do hack the code to change
-      // upper bound between Seek(), so it's not a good idea to break
-      // the API.
-      // If DBIter is used on top of merging iterator, we probably
-      // can skip mutable child iterators if they are invalid too,
-      // but it's a less clean API. We can optimize for it later if
-      // needed.
-      if (!is_increasing_reseek || !child.Valid() ||
-          comparator_->Compare(target, child.key()) > 0 ||
-          child.iter()->is_mutable()) {
-        PERF_TIMER_GUARD(seek_child_seek_time);
-        child.Seek(target);
-        PERF_COUNTER_ADD(seek_child_seek_count, 1);
-      }
-      {
-        // Strictly, we timed slightly more than min heap operation,
-        // but these operations are very cheap.
-        PERF_TIMER_GUARD(seek_min_heap_time);
-        AddToMinHeapOrCheckStatus(&child);
-      }
+      {
+        PERF_TIMER_GUARD(seek_child_seek_time);
+        child.Seek(target);
+      }
+      PERF_COUNTER_ADD(seek_child_seek_count, 1);
+      // Strictly, we timed slightly more than min heap operation,
+      // but these operations are very cheap.
+      PERF_TIMER_GUARD(seek_min_heap_time);
+      AddToMinHeapOrCheckStatus(&child);
     }
     direction_ = kForward;
     {
