An iterator may automatically invoke reseeks.

Summary:
An iterator invokes a reseek if the number of sequential skips over the
same userkey exceeds a configured number. This makes iter->Next()
faster (because of fewer key compares) if a large number of
adjacent internal keys in a table (sst or memtable) have the
same userkey.

Test Plan: Unit test DBTest.IterReseek.

Reviewers: emayanke, haobo, xjin

Reviewed By: xjin

CC: leveldb, xjin

Differential Revision: https://reviews.facebook.net/D11865
main
Dhruba Borthakur 12 years ago
parent de98c1d9aa
commit 197034e4c3
  1. 39
      db/db_iter.cc
  2. 90
      db/db_test.cc
  3. 7
      include/rocksdb/options.h
  4. 13
      include/rocksdb/statistics.h
  5. 4
      util/options.cc

@ -65,6 +65,7 @@ class DBIter: public Iterator {
current_entry_is_merged_(false), current_entry_is_merged_(false),
statistics_(options.statistics) { statistics_(options.statistics) {
RecordTick(statistics_, NO_ITERATORS, 1); RecordTick(statistics_, NO_ITERATORS, 1);
max_skip_ = options.max_sequential_skip_in_iterations;
} }
virtual ~DBIter() { virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1); RecordTick(statistics_, NO_ITERATORS, -1);
@ -129,6 +130,7 @@ class DBIter: public Iterator {
bool valid_; bool valid_;
bool current_entry_is_merged_; bool current_entry_is_merged_;
std::shared_ptr<Statistics> statistics_; std::shared_ptr<Statistics> statistics_;
uint64_t max_skip_;
// No copying allowed // No copying allowed
DBIter(const DBIter&); DBIter(const DBIter&);
@ -188,12 +190,13 @@ void DBIter::FindNextUserEntry(bool skipping) {
assert(iter_->Valid()); assert(iter_->Valid());
assert(direction_ == kForward); assert(direction_ == kForward);
current_entry_is_merged_ = false; current_entry_is_merged_ = false;
uint64_t num_skipped = 0;
do { do {
ParsedInternalKey ikey; ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) { if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (skipping && if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) { user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
// skip this entry num_skipped++; // skip this entry
} else { } else {
skipping = false; skipping = false;
switch (ikey.type) { switch (ikey.type) {
@ -202,6 +205,7 @@ void DBIter::FindNextUserEntry(bool skipping) {
// they are hidden by this deletion. // they are hidden by this deletion.
SaveKey(ikey.user_key, &saved_key_); SaveKey(ikey.user_key, &saved_key_);
skipping = true; skipping = true;
num_skipped = 0;
break; break;
case kTypeValue: case kTypeValue:
valid_ = true; valid_ = true;
@ -220,7 +224,20 @@ void DBIter::FindNextUserEntry(bool skipping) {
} }
} }
} }
iter_->Next(); // If we have sequentially iterated via numerous keys and still not
// found the next user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the last occurrence of
// our current key by looking for sequence number 0.
if (skipping && num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key,
ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
iter_->Next();
}
} while (iter_->Valid()); } while (iter_->Valid());
valid_ = false; valid_ = false;
} }
@ -342,6 +359,7 @@ void DBIter::Prev() {
void DBIter::FindPrevUserEntry() { void DBIter::FindPrevUserEntry() {
assert(direction_ == kReverse); assert(direction_ == kReverse);
uint64_t num_skipped = 0;
ValueType value_type = kTypeDeletion; ValueType value_type = kTypeDeletion;
if (iter_->Valid()) { if (iter_->Valid()) {
@ -367,7 +385,22 @@ void DBIter::FindPrevUserEntry() {
saved_value_.assign(raw_value.data(), raw_value.size()); saved_value_.assign(raw_value.data(), raw_value.size());
} }
} }
iter_->Prev(); num_skipped++;
// If we have sequentially iterated via numerous keys and still not
// found the prev user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the first occurrence of
// our current key by looking for max sequence number.
if (num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key,
ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
kValueTypeForSeek));
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
iter_->Prev();
}
} while (iter_->Valid()); } while (iter_->Valid());
} }

@ -69,6 +69,7 @@ class AtomicCounter {
count_ = 0; count_ = 0;
} }
}; };
} }
// Special Env used to delay background operations // Special Env used to delay background operations
@ -1133,6 +1134,95 @@ TEST(DBTest, IterMulti) {
} while (ChangeCompactOptions()); } while (ChangeCompactOptions());
} }
// Check that the iterator skips over a run of entries that share the same
// user key by issuing a reseek rather than a purely sequential scan.
// The test sets max_sequential_skip_in_iterations to 3 and watches the
// NUMBER_OF_RESEEKS_IN_ITERATION ticker after each iterator operation.
TEST(DBTest, IterReseek) {
Options options = CurrentOptions();
options.max_sequential_skip_in_iterations = 3;
options.create_if_missing = true;
// Statistics object whose reseek ticker is inspected throughout the test.
options.statistics = leveldb::CreateDBStatistics();
DestroyAndReopen(&options);
// Insert two entries with the same userkey "a" and verify that
// reseek is not invoked. For each of these test cases,
// verify that we can find the next key "b".
ASSERT_OK(Put("a", "one"));
ASSERT_OK(Put("a", "two"));
ASSERT_OK(Put("b", "bone"));
Iterator* iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
ASSERT_EQ(IterStatus(iter), "a->two");
iter->Next();
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
ASSERT_EQ(IterStatus(iter), "b->bone");
delete iter;
// Insert a third entry with userkey "a" (three in total) and verify
// that reseek is still not invoked.
ASSERT_OK(Put("a", "three"));
iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->three");
iter->Next();
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
ASSERT_EQ(IterStatus(iter), "b->bone");
delete iter;
// Insert a fourth entry with userkey "a" (four in total) and verify
// that Next() now invokes exactly one reseek.
ASSERT_OK(Put("a", "four"));
iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->four");
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
iter->Next();
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), 1);
ASSERT_EQ(IterStatus(iter), "b->bone");
delete iter;
// Testing reverse iterator
// At this point, we have four versions of "a" and one version of "b".
// The reseek statistics is already at 1.
// Capture the current ticker value so the reverse-iteration checks below
// are expressed relative to it.
int num_reseeks = (int)options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION);
// Insert another version of b (two in total). SeekToLast() is expected
// not to reseek; the following Prev(), which moves back over the versions
// of "a", is expected to add one reseek.
ASSERT_OK(Put("b", "btwo"));
iter = db_->NewIterator(ReadOptions());
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "b->btwo");
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks);
iter->Prev();
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1);
ASSERT_EQ(IterStatus(iter), "a->four");
delete iter;
// Insert two more versions of b. This makes a total of 4 versions
// of b and 4 versions of a.
ASSERT_OK(Put("b", "bthree"));
ASSERT_OK(Put("b", "bfour"));
iter = db_->NewIterator(ReadOptions());
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "b->bfour");
// With four versions of "b", SeekToLast() itself is expected to add one
// reseek on top of the num_reseeks + 1 reached above.
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2);
iter->Prev();
// The Prev() call above should have invoked one more reseek.
ASSERT_EQ(options.statistics.get()->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3);
ASSERT_EQ(IterStatus(iter), "a->four");
delete iter;
}
TEST(DBTest, IterSmallAndLargeMix) { TEST(DBTest, IterSmallAndLargeMix) {
do { do {
ASSERT_OK(Put("a", "va")); ASSERT_OK(Put("a", "va"));

@ -532,6 +532,13 @@ struct Options {
// Default: false // Default: false
bool filter_deletes; bool filter_deletes;
// An iteration->Next() sequentially skips over keys with the same
// user-key unless this option is set. This number specifies the number
// of keys (with the same userkey) that will be sequentially
// skipped before a reseek is issued.
// Default: 8
uint64_t max_sequential_skip_in_iterations;
// This is a factory that provides MemTableRep objects. // This is a factory that provides MemTableRep objects.
// Default: a factory that provides a skip-list-based implementation of // Default: a factory that provides a skip-list-based implementation of
// MemTableRep. // MemTableRep.

@ -58,6 +58,8 @@ enum Tickers {
NUMBER_MULTIGET_KEYS_READ = 19, NUMBER_MULTIGET_KEYS_READ = 19,
NUMBER_MULTIGET_BYTES_READ = 20, NUMBER_MULTIGET_BYTES_READ = 20,
// Number of deletes records that were not required to be
// written to storage because key does not exist
NUMBER_FILTERED_DELETES = 21, NUMBER_FILTERED_DELETES = 21,
NUMBER_MERGE_FAILURES = 22, NUMBER_MERGE_FAILURES = 22,
SEQUENCE_NUMBER = 23, SEQUENCE_NUMBER = 23,
@ -68,9 +70,15 @@ enum Tickers {
BLOOM_FILTER_PREFIX_CHECKED = 24, BLOOM_FILTER_PREFIX_CHECKED = 24,
BLOOM_FILTER_PREFIX_USEFUL = 25, BLOOM_FILTER_PREFIX_USEFUL = 25,
TICKER_ENUM_MAX = 26 // Number of times we had to reseek inside an iteration to skip
// over large number of keys with same userkey.
NUMBER_OF_RESEEKS_IN_ITERATION = 26,
TICKER_ENUM_MAX = 27
}; };
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
@ -97,7 +105,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" }, { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" }, { SEQUENCE_NUMBER, "rocksdb.sequence.number" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" } { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
{ NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }
}; };
/** /**

@ -81,11 +81,11 @@ Options::Options()
bytes_per_sync(0), bytes_per_sync(0),
compaction_style(kCompactionStyleLevel), compaction_style(kCompactionStyleLevel),
filter_deletes(false), filter_deletes(false),
max_sequential_skip_in_iterations(8),
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)), memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
compaction_filter_factory( compaction_filter_factory(
std::shared_ptr<CompactionFilterFactory>( std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())) { new DefaultCompactionFilterFactory())) {
assert(memtable_factory.get() != nullptr); assert(memtable_factory.get() != nullptr);
} }
@ -174,6 +174,8 @@ Options::Dump(Logger* log) const
Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
i, max_bytes_for_level_multiplier_additional[i]); i, max_bytes_for_level_multiplier_additional[i]);
} }
Log(log," Options.max_sequential_skip_in_iterations: %ld",
max_sequential_skip_in_iterations);
Log(log," Options.expanded_compaction_factor: %d", Log(log," Options.expanded_compaction_factor: %d",
expanded_compaction_factor); expanded_compaction_factor);
Log(log," Options.source_compaction_factor: %d", Log(log," Options.source_compaction_factor: %d",

Loading…
Cancel
Save