diff --git a/db/db_iter.cc b/db/db_iter.cc
index e8643d727..38e995304 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -65,6 +65,7 @@ class DBIter: public Iterator {
         current_entry_is_merged_(false),
         statistics_(options.statistics) {
     RecordTick(statistics_, NO_ITERATORS, 1);
+    max_skip_ = options.max_sequential_skip_in_iterations;
   }
   virtual ~DBIter() {
     RecordTick(statistics_, NO_ITERATORS, -1);
@@ -129,6 +130,7 @@ class DBIter: public Iterator {
   bool valid_;
   bool current_entry_is_merged_;
   std::shared_ptr statistics_;
+  uint64_t max_skip_;
 
   // No copying allowed
   DBIter(const DBIter&);
@@ -188,12 +190,13 @@ void DBIter::FindNextUserEntry(bool skipping) {
   assert(iter_->Valid());
   assert(direction_ == kForward);
   current_entry_is_merged_ = false;
+  uint64_t num_skipped = 0;
   do {
     ParsedInternalKey ikey;
     if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
       if (skipping &&
           user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
-        // skip this entry
+        num_skipped++; // skip this entry
       } else {
         skipping = false;
         switch (ikey.type) {
@@ -202,6 +205,7 @@ void DBIter::FindNextUserEntry(bool skipping) {
             // they are hidden by this deletion.
             SaveKey(ikey.user_key, &saved_key_);
             skipping = true;
+            num_skipped = 0;
             break;
           case kTypeValue:
             valid_ = true;
@@ -220,7 +224,20 @@ void DBIter::FindNextUserEntry(bool skipping) {
         }
       }
     }
-    iter_->Next();
+    // If we have sequentially stepped over more than max_skip_ entries and
+    // still not found the next user-key, then it is better to seek so that
+    // we avoid too many key comparisons. We seek to the last occurrence of
+    // our current key by looking for sequence number 0.
+    if (skipping && num_skipped > max_skip_) {
+      num_skipped = 0;
+      std::string last_key;
+      AppendInternalKey(&last_key,
+        ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_->Next();
+    }
   } while (iter_->Valid());
   valid_ = false;
 }
@@ -342,6 +359,7 @@ void DBIter::Prev() {
 
 void DBIter::FindPrevUserEntry() {
   assert(direction_ == kReverse);
+  uint64_t num_skipped = 0;
 
   ValueType value_type = kTypeDeletion;
   if (iter_->Valid()) {
@@ -367,7 +385,22 @@ void DBIter::FindPrevUserEntry() {
           saved_value_.assign(raw_value.data(), raw_value.size());
         }
       }
-      iter_->Prev();
+      num_skipped++;
+      // If we have sequentially stepped over more than max_skip_ entries and
+      // still not found the prev user-key, then it is better to seek so that
+      // we avoid too many key comparisons. We seek to the first occurrence of
+      // our current key by looking for max sequence number.
+      if (num_skipped > max_skip_) {
+        num_skipped = 0;
+        std::string last_key;
+        AppendInternalKey(&last_key,
+          ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
+                            kValueTypeForSeek));
+        iter_->Seek(last_key);
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        iter_->Prev();
+      }
     } while (iter_->Valid());
   }
 
diff --git a/db/db_test.cc b/db/db_test.cc
index fce858d6c..3f2879027 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -69,6 +69,7 @@ class AtomicCounter {
     count_ = 0;
   }
 };
+
 }
 
 // Special Env used to delay background operations
@@ -1133,6 +1134,95 @@ TEST(DBTest, IterMulti) {
   } while (ChangeCompactOptions());
 }
 
+// Check that we can skip over a run of entries for the same user key
+// by using reseek rather than sequential scan
+TEST(DBTest, IterReseek) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  options.create_if_missing = true;
+  options.statistics = leveldb::CreateDBStatistics();
+  DestroyAndReopen(&options);
+
+  // insert two keys with same userkey and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put("a", "one"));
+  ASSERT_OK(Put("a", "two"));
+  ASSERT_OK(Put("b", "bone"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of three keys with same userkey and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put("a", "three"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of four keys with same userkey and verify
+  // that reseek is invoked.
+  ASSERT_OK(Put("a", "four"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing reverse iterator
+  // At this point, we have four versions of "a" and one version of "b".
+  // The reseek statistic is already at 1.
+  int num_reseeks = (int)options.statistics.get()->getTickerCount(
+                    NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // Insert another version of b. SeekToLast() must not reseek; Prev() does.
+  ASSERT_OK(Put("b", "btwo"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+
+  // insert two more versions of b. This makes a total of 4 versions
+  // of b and 4 versions of a, so SeekToLast() itself now reseeks once.
+  ASSERT_OK(Put("b", "bthree"));
+  ASSERT_OK(Put("b", "bfour"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2);
+  iter->Prev();
+
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+}
+
 TEST(DBTest, IterSmallAndLargeMix) {
   do {
     ASSERT_OK(Put("a", "va"));
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 6d028e256..0c8bdde3a 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -532,6 +532,13 @@ struct Options {
   // Default: false
   bool filter_deletes;
 
+  // Iterator Next()/Prev() step sequentially over entries that share the
+  // current user-key. This option caps how many such entries are stepped
+  // over one at a time: once more than this many have been skipped, the
+  // iterator issues a reseek instead.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
   // This is a factory that provides MemTableRep objects.
   // Default: a factory that provides a skip-list-based implementation of
   // MemTableRep.
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index e665278b0..5525a092b 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -58,6 +58,8 @@ enum Tickers {
   NUMBER_MULTIGET_KEYS_READ = 19,
   NUMBER_MULTIGET_BYTES_READ = 20,
 
+  // Number of delete records that were not required to be
+  // written to storage because the key does not exist
   NUMBER_FILTERED_DELETES = 21,
   NUMBER_MERGE_FAILURES = 22,
   SEQUENCE_NUMBER = 23,
@@ -68,9 +70,15 @@ enum Tickers {
   BLOOM_FILTER_PREFIX_CHECKED = 24,
   BLOOM_FILTER_PREFIX_USEFUL = 25,
 
-  TICKER_ENUM_MAX = 26
+  // Number of times we had to reseek inside an iteration to skip
+  // over a large number of entries with the same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION = 26,
+
+  TICKER_ENUM_MAX = 27
 };
 
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
 const std::vector> TickersNameMap = {
   { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
   { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
@@ -97,7 +105,8 @@ const std::vector> TickersNameMap = {
   { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
   { SEQUENCE_NUMBER, "rocksdb.sequence.number" },
   { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
-  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }
+  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
+  { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }
 };
 
 /**
diff --git a/util/options.cc b/util/options.cc
index 7907f7171..3ca71e5ff 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -81,11 +81,11 @@ Options::Options()
       bytes_per_sync(0),
       compaction_style(kCompactionStyleLevel),
       filter_deletes(false),
+      max_sequential_skip_in_iterations(8),
       memtable_factory(std::shared_ptr(new SkipListFactory)),
       compaction_filter_factory(
           std::shared_ptr(
             new DefaultCompactionFilterFactory())) {
-
   assert(memtable_factory.get() != nullptr);
 }
 
@@ -174,6 +174,9 @@ Options::Dump(Logger* log) const
     Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
         i, max_bytes_for_level_multiplier_additional[i]);
   }
+  // The option is a uint64_t; cast so the argument type matches %llu exactly.
+  Log(log," Options.max_sequential_skip_in_iterations: %llu",
+      static_cast<unsigned long long>(max_sequential_skip_in_iterations));
   Log(log," Options.expanded_compaction_factor: %d",
       expanded_compaction_factor);
   Log(log," Options.source_compaction_factor: %d",