Less linear search in DBIter::Seek() when keys are overwritten a lot

Summary:
In one deployment we saw high latencies (presumably from slow iterator operations), and perf reported a lot of CPU time in this stack:

```
  rocksdb::MergingIterator::Next
  rocksdb::DBIter::FindNextUserEntryInternal
  rocksdb::DBIter::Seek
```

I think what's happening is:
1. we create a snapshot iterator,
2. we do lots of Put()s for the same key x; this creates lots of entries in the memtable,
3. we seek the iterator to a key slightly smaller than x,
4. the seek walks over lots of entries in the memtable for key x, skipping them because of their high sequence numbers (a sketch of this pattern follows the list).
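For concreteness, here is a minimal sketch of that access pattern against the public RocksDB API (the DB path and the iteration count are illustrative placeholders, not values from the deployment):

```
#include <memory>
#include <string>

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* raw = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/overwrite_demo", &raw);
  if (!s.ok()) return 1;
  std::unique_ptr<rocksdb::DB> db(raw);

  db->Put(rocksdb::WriteOptions(), "x", "0");

  // 1. Create a snapshot iterator; it pins a sequence number at creation.
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));

  // 2. Overwrite x many times; every Put adds a new memtable entry for x
  //    with a sequence number above the iterator's snapshot.
  for (int i = 1; i <= 100000; ++i) {
    db->Put(rocksdb::WriteOptions(), "x", std::to_string(i));
  }

  // 3.+4. Seek to a key slightly smaller than x. Before this change, DBIter
  //       stepped linearly over all the too-recent entries for x before
  //       yielding the snapshot-visible value.
  it->Seek("w");
  // it->key() == "x", it->value() == "0"
  return 0;
}
```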

CC IslamAbdelRahman
Closes https://github.com/facebook/rocksdb/pull/1413

Differential Revision: D4083879

Pulled By: IslamAbdelRahman

fbshipit-source-id: a83ddae
Mike Kolupaev, committed by Facebook GitHub Bot
commit 236d4c67e9 (parent cd7c4143d7)
Files changed:
1. .gitignore (1 line)
2. db/db_iter.cc (198 lines)
3. db/db_iterator_test.cc (58 lines)
4. db/memtable.cc (2 lines)
5. include/rocksdb/perf_context.h (15 lines)
6. util/perf_context.cc (8 lines)

.gitignore

```
@@ -37,6 +37,7 @@ util/build_version.cc
 build_tools/VALGRIND_LOGS/
 coverage/COVERAGE_REPORT
 .gdbhistory
+.gdb_history
 package/
 .phutil_module_cache
 unity.a
```

db/db_iter.cc

```
@@ -339,7 +339,7 @@ void DBIter::Next() {
 // saved_value_ => the merged value
 //
 // NOTE: In between, saved_key_ can point to a user key that has
-// a delete marker
+// a delete marker or a sequence number higher than sequence_
 //
 // The prefix_check parameter controls whether we check the iterated
 // keys against the prefix of the seeked key. Set to false when
```
```
@@ -356,90 +356,137 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
   assert(iter_->Valid());
   assert(direction_ == kForward);
   current_entry_is_merged_ = false;
+
+  // How many times in a row we have skipped an entry with user key less than
+  // or equal to saved_key_. We could skip these entries either because
+  // sequence numbers were too high or because skipping = true.
+  // What saved_key_ contains throughout this method:
+  //  - if skipping        : saved_key_ contains the key that we need to skip,
+  //                         and we haven't seen any keys greater than that,
+  //  - if num_skipped > 0 : saved_key_ contains the key that we have skipped
+  //                         num_skipped times, and we haven't seen any keys
+  //                         greater than that,
+  //  - none of the above  : saved_key_ can contain anything, it doesn't matter.
   uint64_t num_skipped = 0;
   do {
     ParsedInternalKey ikey;
-    if (ParseKey(&ikey)) {
-      if (iterate_upper_bound_ != nullptr &&
-          user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) {
-        break;
-      }
-      if (prefix_extractor_ && prefix_check &&
-          prefix_extractor_->Transform(ikey.user_key).compare(prefix_start_key_) != 0) {
-        break;
-      }
-      if (ikey.sequence <= sequence_) {
-        if (skipping &&
-            user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
-          num_skipped++;  // skip this entry
-          PERF_COUNTER_ADD(internal_key_skipped_count, 1);
-        } else {
-          switch (ikey.type) {
-            case kTypeDeletion:
-            case kTypeSingleDeletion:
-              // Arrange to skip all upcoming entries for this key since
-              // they are hidden by this deletion.
-              saved_key_.SetKey(
-                  ikey.user_key,
-                  !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
-              skipping = true;
-              num_skipped = 0;
-              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
-              break;
-            case kTypeValue:
-              saved_key_.SetKey(
-                  ikey.user_key,
-                  !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
-              if (range_del_agg_.ShouldDelete(ikey)) {
-                // Arrange to skip all upcoming entries for this key since
-                // they are hidden by this deletion.
-                skipping = true;
-                num_skipped = 0;
-                PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
-              } else {
-                valid_ = true;
-                return;
-              }
-              break;
-            case kTypeMerge:
-              saved_key_.SetKey(
-                  ikey.user_key,
-                  !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
-              if (range_del_agg_.ShouldDelete(ikey)) {
-                // Arrange to skip all upcoming entries for this key since
-                // they are hidden by this deletion.
-                skipping = true;
-                num_skipped = 0;
-                PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
-              } else {
-                // By now, we are sure the current ikey is going to yield a
-                // value
-                current_entry_is_merged_ = true;
-                valid_ = true;
-                MergeValuesNewToOld();  // Go to a different state machine
-                return;
-              }
-              break;
-            default:
-              assert(false);
-              break;
-          }
-        }
-      }
-    }
+    if (!ParseKey(&ikey)) {
+      // Skip corrupted keys.
+      iter_->Next();
+      continue;
+    }
+
+    if (iterate_upper_bound_ != nullptr &&
+        user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) {
+      break;
+    }
+
+    if (prefix_extractor_ && prefix_check &&
+        prefix_extractor_->Transform(ikey.user_key)
+                .compare(prefix_start_key_) != 0) {
+      break;
+    }
+
+    if (ikey.sequence <= sequence_) {
+      if (skipping &&
+          user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
+        num_skipped++;  // skip this entry
+        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+      } else {
+        skipping = false;  // ikey > saved_key_, i.e. saved_key_ is skipped
+        num_skipped = 0;
+        switch (ikey.type) {
+          case kTypeDeletion:
+          case kTypeSingleDeletion:
+            // Arrange to skip all upcoming entries for this key since
+            // they are hidden by this deletion.
+            saved_key_.SetKey(
+                ikey.user_key,
+                !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+            skipping = true;
+            PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            break;
+          case kTypeValue:
+            saved_key_.SetKey(
+                ikey.user_key,
+                !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+            if (range_del_agg_.ShouldDelete(ikey)) {
+              // Arrange to skip all upcoming entries for this key since
+              // they are hidden by this deletion.
+              skipping = true;
+              num_skipped = 0;
+              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            } else {
+              valid_ = true;
+              return;
+            }
+            break;
+          case kTypeMerge:
+            saved_key_.SetKey(
+                ikey.user_key,
+                !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+            if (range_del_agg_.ShouldDelete(ikey)) {
+              // Arrange to skip all upcoming entries for this key since
+              // they are hidden by this deletion.
+              skipping = true;
+              num_skipped = 0;
+              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            } else {
+              // By now, we are sure the current ikey is going to yield a
+              // value
+              current_entry_is_merged_ = true;
+              valid_ = true;
+              MergeValuesNewToOld();  // Go to a different state machine
+              return;
+            }
+            break;
+          default:
+            assert(false);
+            break;
+        }
+      }
+    } else {
+      // This key was inserted after our snapshot was taken.
+      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+
+      // Here saved_key_ may contain some old key, or the default empty key, or
+      // key assigned by some random other method. We don't care.
+      if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
+        num_skipped++;
+      } else {
+        saved_key_.SetKey(
+            ikey.user_key,
+            !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+        skipping = false;
+        num_skipped = 0;
+      }
+    }
+
-    // If we have sequentially iterated via numerous keys and still not
-    // found the next user-key, then it is better to seek so that we can
-    // avoid too many key comparisons. We seek to the last occurrence of
-    // our current key by looking for sequence number 0 and type deletion
-    // (the smallest type).
-    if (skipping && num_skipped > max_skip_) {
+    // If we have sequentially iterated via numerous equal keys, then it's
+    // better to seek so that we can avoid too many key comparisons.
+    if (num_skipped > max_skip_) {
       num_skipped = 0;
       std::string last_key;
-      AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
-                                                     kTypeDeletion));
+      if (skipping) {
+        // We're looking for the next user-key but all we see are the same
+        // user-key with decreasing sequence numbers. Fast forward to
+        // sequence number 0 and type deletion (the smallest type).
+        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
+                                                       kTypeDeletion));
+        // Don't set skipping = false because we may still see more user-keys
+        // equal to saved_key_.
+      } else {
+        // We saw multiple entries with this user key and sequence numbers
+        // higher than sequence_. Fast forward to sequence_.
+        // Note that this only covers a case when a higher key was overwritten
+        // many times since our snapshot was taken, not the case when a lot of
+        // different keys were inserted after our snapshot was taken.
+        AppendInternalKey(&last_key,
+                          ParsedInternalKey(saved_key_.GetKey(), sequence_,
+                                            kValueTypeForSeek));
+      }
       iter_->Seek(last_key);
       RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
     } else {
```
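The two fast-forward targets rely on how internal keys sort: ascending by user key, then descending by sequence number. A minimal sketch of building the two seek targets with the helpers from db/dbformat.h (`user_key` and `snapshot_seq` are placeholder variables, not names from the patch):

```
// Skipping the rest of a user key we already decided to hide: seek to its
// smallest possible internal key, i.e. sequence 0 and the smallest type, so
// the next entry the child iterator yields belongs to a larger user key.
std::string past_current_key;
AppendInternalKey(&past_current_key,
                  ParsedInternalKey(user_key, 0, kTypeDeletion));

// Skipping entries newer than the snapshot: seek to the largest internal key
// still visible at the snapshot, i.e. snapshot_seq with kValueTypeForSeek
// (the highest type value), jumping over everything written afterwards.
std::string newest_visible;
AppendInternalKey(&newest_visible,
                  ParsedInternalKey(user_key, snapshot_seq, kValueTypeForSeek));
```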
```
@@ -503,6 +550,7 @@ void DBIter::MergeValuesNewToOld() {
       // when complete, add result to operands and continue.
       merge_context_.PushOperand(iter_->value(),
                                  iter_->IsValuePinned() /* operand_pinned */);
+      PERF_COUNTER_ADD(internal_merge_count, 1);
     } else {
       assert(false);
     }
```
```
@@ -564,6 +612,11 @@ void DBIter::ReverseToBackward() {
   FindParseableKey(&ikey, kReverse);
   while (iter_->Valid() &&
          user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) > 0) {
+    if (ikey.sequence > sequence_) {
+      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+    } else {
+      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    }
     iter_->Prev();
     FindParseableKey(&ikey, kReverse);
   }
```
```
@@ -676,6 +729,7 @@ bool DBIter::FindValueForCurrentKey() {
         assert(merge_operator_ != nullptr);
         merge_context_.PushOperandBack(
             iter_->value(), iter_->IsValuePinned() /* operand_pinned */);
+        PERF_COUNTER_ADD(internal_merge_count, 1);
       }
       break;
     default:
```
```
@@ -760,6 +814,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
          ikey.type == kTypeMerge && !range_del_agg_.ShouldDelete(ikey)) {
     merge_context_.PushOperand(iter_->value(),
                                iter_->IsValuePinned() /* operand_pinned */);
+    PERF_COUNTER_ADD(internal_merge_count, 1);
     iter_->Next();
     FindParseableKey(&ikey, kForward);
   }
```
```
@@ -830,6 +885,11 @@ void DBIter::FindPrevUserKey() {
         ++num_skipped;
       }
     }
+    if (ikey.sequence > sequence_) {
+      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+    } else {
+      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    }
     iter_->Prev();
     FindParseableKey(&ikey, kReverse);
   }
```

db/db_iterator_test.cc

```
@@ -8,10 +8,10 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/db_test_util.h"
+#include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/iostats_context.h"
 #include "rocksdb/perf_context.h"
-#include "port/port.h"

 namespace rocksdb {
```
```
@@ -1778,6 +1778,62 @@ TEST_F(DBIteratorTest, ReadAhead) {
   delete iter;
 }

+// Insert a key, create a snapshot iterator, overwrite key lots of times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_F(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 3;
+  options.prefix_extractor = nullptr;
+  options.write_buffer_size = 1 << 27;  // big enough to avoid flush
+  options.statistics = rocksdb::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  // Insert.
+  ASSERT_OK(Put("b", "0"));
+
+  // Create iterator.
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+  // Insert a lot.
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+  }
+
+#ifndef ROCKSDB_LITE
+  // Check that memtable wasn't flushed.
+  std::string val;
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+  EXPECT_EQ("0", val);
+#endif
+
+  // Seek iterator to a smaller key.
+  perf_context.Reset();
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ("b", iter->key().ToString());
+  EXPECT_EQ("0", iter->value().ToString());
+
+  // Check that the seek didn't do too much work.
+  // Checks are not tight, just make sure that everything is well below 100.
+  EXPECT_LT(perf_context.internal_key_skipped_count, 4);
+  EXPECT_LT(perf_context.internal_recent_skipped_count, 8);
+  EXPECT_LT(perf_context.seek_on_memtable_count, 10);
+  EXPECT_LT(perf_context.next_on_memtable_count, 10);
+  EXPECT_LT(perf_context.prev_on_memtable_count, 10);
+
+  // Check that iterator did something like what we expect.
+  EXPECT_EQ(perf_context.internal_delete_skipped_count, 0);
+  EXPECT_EQ(perf_context.internal_merge_count, 0);
+  EXPECT_GE(perf_context.internal_recent_skipped_count, 2);
+  EXPECT_GE(perf_context.seek_on_memtable_count, 2);
+  EXPECT_EQ(1, options.statistics->getTickerCount(
+                   NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
```

db/memtable.cc

```
@@ -322,11 +322,13 @@ class MemTableIterator : public InternalIterator {
     valid_ = iter_->Valid();
   }
   virtual void Next() override {
+    PERF_COUNTER_ADD(next_on_memtable_count, 1);
     assert(Valid());
     iter_->Next();
     valid_ = iter_->Valid();
   }
   virtual void Prev() override {
+    PERF_COUNTER_ADD(prev_on_memtable_count, 1);
     assert(Valid());
     iter_->Prev();
     valid_ = iter_->Valid();
```

include/rocksdb/perf_context.h

```
@@ -45,8 +45,7 @@ struct PerfContext {
   // tombstones are not included in this counter, while previous updates
   // hidden by the tombstones will be included here.
   // 4. symmetric cases for Prev() and SeekToLast()
-  // We sometimes also skip entries of more recent updates than the snapshot
-  // we read from, but they are not included in this counter.
+  // internal_recent_skipped_count is not included in this counter.
   //
   uint64_t internal_key_skipped_count;
   // Total number of deletes and single deletes skipped over during iteration
@@ -57,6 +56,13 @@ struct PerfContext {
   // still older updates invalidated by the tombstones.
   //
   uint64_t internal_delete_skipped_count;
+  // How many times iterators skipped over internal keys that are more recent
+  // than the snapshot that iterator is using.
+  //
+  uint64_t internal_recent_skipped_count;
+  // How many values were fed into merge operator by iterators.
+  //
+  uint64_t internal_merge_count;

   uint64_t get_snapshot_time;       // total nanos spent on getting snapshot
   uint64_t get_from_memtable_time;  // total nanos spent on querying memtables
@@ -67,7 +73,12 @@ struct PerfContext {
   // total nanos spent on seeking memtable
   uint64_t seek_on_memtable_time;
   // number of seeks issued on memtable
+  // (including SeekForPrev but not SeekToFirst and SeekToLast)
   uint64_t seek_on_memtable_count;
+  // number of Next()s issued on memtable
+  uint64_t next_on_memtable_count;
+  // number of Prev()s issued on memtable
+  uint64_t prev_on_memtable_count;
   // total nanos spent on seeking child iters
   uint64_t seek_child_seek_time;
   // number of seek issued in child iterators
```
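A brief sketch of reading the new counters from application code. In this era of the API, `rocksdb::perf_context` is a thread-local object declared in `rocksdb/perf_context.h`; counting should be on at the default perf level, but the sketch enables it explicitly (the function name `CountSkips` is a placeholder):

```
#include <cstdint>

#include "rocksdb/iterator.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void CountSkips(rocksdb::Iterator* it) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
  rocksdb::perf_context.Reset();  // thread-local; counts only this thread
  it->Seek("a");
  // Entries newer than the iterator's snapshot that were stepped over:
  uint64_t recent = rocksdb::perf_context.internal_recent_skipped_count;
  // Values fed to the merge operator while positioning the iterator:
  uint64_t merges = rocksdb::perf_context.internal_merge_count;
  (void)recent;
  (void)merges;
}
```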

util/perf_context.cc

```
@@ -28,6 +28,8 @@ void PerfContext::Reset() {
   block_decompress_time = 0;
   internal_key_skipped_count = 0;
   internal_delete_skipped_count = 0;
+  internal_recent_skipped_count = 0;
+  internal_merge_count = 0;
   write_wal_time = 0;

   get_snapshot_time = 0;
@@ -37,6 +39,8 @@ void PerfContext::Reset() {
   get_from_output_files_time = 0;
   seek_on_memtable_time = 0;
   seek_on_memtable_count = 0;
+  next_on_memtable_count = 0;
+  prev_on_memtable_count = 0;
   seek_child_seek_time = 0;
   seek_child_seek_count = 0;
   seek_min_heap_time = 0;
@@ -80,6 +84,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const {
   PERF_CONTEXT_OUTPUT(block_decompress_time);
   PERF_CONTEXT_OUTPUT(internal_key_skipped_count);
   PERF_CONTEXT_OUTPUT(internal_delete_skipped_count);
+  PERF_CONTEXT_OUTPUT(internal_recent_skipped_count);
+  PERF_CONTEXT_OUTPUT(internal_merge_count);
   PERF_CONTEXT_OUTPUT(write_wal_time);
   PERF_CONTEXT_OUTPUT(get_snapshot_time);
   PERF_CONTEXT_OUTPUT(get_from_memtable_time);
@@ -88,6 +94,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const {
   PERF_CONTEXT_OUTPUT(get_from_output_files_time);
   PERF_CONTEXT_OUTPUT(seek_on_memtable_time);
   PERF_CONTEXT_OUTPUT(seek_on_memtable_count);
+  PERF_CONTEXT_OUTPUT(next_on_memtable_count);
+  PERF_CONTEXT_OUTPUT(prev_on_memtable_count);
   PERF_CONTEXT_OUTPUT(seek_child_seek_time);
   PERF_CONTEXT_OUTPUT(seek_child_seek_count);
   PERF_CONTEXT_OUTPUT(seek_min_heap_time);
```
