|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include <functional>
|
|
|
|
|
|
|
|
#include "db/arena_wrapped_db_iter.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/db_test_util.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/stack_trace.h"
|
|
|
|
#include "rocksdb/iostats_context.h"
|
|
|
|
#include "rocksdb/perf_context.h"
|
|
|
|
#include "table/block_based/flush_block_policy.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
// A dumb ReadCallback which saying every key is committed.
|
|
|
|
class DummyReadCallback : public ReadCallback {
|
|
|
|
public:
|
|
|
|
DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
|
|
|
|
bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
|
|
|
|
void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
|
|
|
|
};
|
|
|
|
|
|
|
|
// Test param:
|
|
|
|
// bool: whether to pass read_callback to NewIterator().
|
|
|
|
class DBIteratorTest : public DBTestBase,
|
|
|
|
public testing::WithParamInterface<bool> {
|
|
|
|
public:
|
|
|
|
DBIteratorTest() : DBTestBase("/db_iterator_test") {}
|
|
|
|
|
|
|
|
Iterator* NewIterator(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family = nullptr) {
|
|
|
|
if (column_family == nullptr) {
|
|
|
|
column_family = db_->DefaultColumnFamily();
|
|
|
|
}
|
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
|
|
|
|
SequenceNumber seq = read_options.snapshot != nullptr
|
|
|
|
? read_options.snapshot->GetSequenceNumber()
|
|
|
|
: db_->GetLatestSequenceNumber();
|
|
|
|
bool use_read_callback = GetParam();
|
|
|
|
DummyReadCallback* read_callback = nullptr;
|
|
|
|
if (use_read_callback) {
|
|
|
|
read_callback = new DummyReadCallback();
|
|
|
|
read_callback->SetSnapshot(seq);
|
|
|
|
InstrumentedMutexLock lock(&mutex_);
|
|
|
|
read_callbacks_.push_back(
|
|
|
|
std::unique_ptr<DummyReadCallback>(read_callback));
|
|
|
|
}
|
|
|
|
return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
InstrumentedMutex mutex_;
|
|
|
|
std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IteratorProperty) {
|
|
|
|
// The test needs to be changed if kPersistedTier is supported in iterator.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
Put(1, "1", "2");
|
|
|
|
Delete(1, "2");
|
|
|
|
ReadOptions ropt;
|
|
|
|
ropt.pin_data = false;
|
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("0", prop_value);
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("Iterator is not valid.", prop_value);
|
|
|
|
|
|
|
|
// Get internal key at which the iteration stopped (tombstone in this case).
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
|
|
|
|
ASSERT_EQ("2", prop_value);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, PersistedTierOnIterator) {
|
|
|
|
// The test needs to be changed if kPersistedTier is supported in iterator.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
ReadOptions ropt;
|
|
|
|
ropt.read_tier = kPersistedTier;
|
|
|
|
|
|
|
|
auto* iter = db_->NewIterator(ropt, handles_[1]);
|
|
|
|
ASSERT_TRUE(iter->status().IsNotSupported());
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
std::vector<Iterator*> iters;
|
|
|
|
ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, NonBlockingIteration) {
|
|
|
|
do {
|
|
|
|
ReadOptions non_blocking_opts, regular_opts;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
non_blocking_opts.read_tier = kBlockCacheTier;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
// write one kv to the database.
|
|
|
|
ASSERT_OK(Put(1, "a", "b"));
|
|
|
|
|
|
|
|
// scan using non-blocking iterator. We should find it because
|
|
|
|
// it is in memtable.
|
|
|
|
Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
|
|
|
|
int count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 1);
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// flush memtable to storage. Now, the key should not be in the
|
|
|
|
// memtable neither in the block cache.
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
|
|
|
|
// verify that a non-blocking iterator does not find any
|
|
|
|
// kvs. Neither does it do any IOs to storage.
|
|
|
|
uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
|
|
|
|
uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
|
|
|
|
iter = NewIterator(non_blocking_opts, handles_[1]);
|
|
|
|
count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 0);
|
|
|
|
ASSERT_TRUE(iter->status().IsIncomplete());
|
|
|
|
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
|
|
|
|
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// read in the specified block via a regular get
|
|
|
|
ASSERT_EQ(Get(1, "a"), "b");
|
|
|
|
|
|
|
|
// verify that we can find it via a non-blocking scan
|
|
|
|
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
|
|
|
|
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
|
|
|
|
iter = NewIterator(non_blocking_opts, handles_[1]);
|
|
|
|
count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 1);
|
|
|
|
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
|
|
|
|
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// This test verifies block cache behaviors, which is not used by plain
|
|
|
|
// table format.
|
|
|
|
} while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterSeekBeforePrev) {
|
|
|
|
ASSERT_OK(Put("a", "b"));
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("0", "f"));
|
|
|
|
ASSERT_OK(Put("1", "h"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("2", "j"));
|
|
|
|
auto iter = NewIterator(ReadOptions());
|
|
|
|
iter->Seek(Slice("c"));
|
|
|
|
iter->Prev();
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
iter->Prev();
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterReseekNewUpperBound) {
|
|
|
|
Random rnd(301);
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1024;
|
|
|
|
table_options.block_size_deviation = 50;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("a", RandomString(&rnd, 400)));
|
|
|
|
ASSERT_OK(Put("aabb", RandomString(&rnd, 400)));
|
|
|
|
ASSERT_OK(Put("aaef", RandomString(&rnd, 400)));
|
|
|
|
ASSERT_OK(Put("b", RandomString(&rnd, 400)));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ReadOptions opts;
|
|
|
|
Slice ub = Slice("aa");
|
|
|
|
opts.iterate_upper_bound = &ub;
|
|
|
|
auto iter = NewIterator(opts);
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ub = Slice("b");
|
|
|
|
iter->Seek(Slice("aabc"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "aaef");
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) {
|
|
|
|
ASSERT_OK(Put("a", "b"));
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("0", "f"));
|
|
|
|
ASSERT_OK(Put("1", "h"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("2", "j"));
|
|
|
|
auto iter = NewIterator(ReadOptions());
|
|
|
|
iter->SeekForPrev(Slice("0"));
|
|
|
|
iter->Next();
|
|
|
|
iter->SeekForPrev(Slice("1"));
|
|
|
|
iter->Next();
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
std::string MakeLongKey(size_t length, char c) {
|
|
|
|
return std::string(length, c);
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterLongKeys) {
|
|
|
|
ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
|
|
|
|
ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
|
|
|
|
ASSERT_OK(Put("a", "b"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
|
|
|
|
ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
|
|
|
|
ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
|
|
|
|
auto iter = NewIterator(ReadOptions());
|
|
|
|
|
|
|
|
// Create a key that needs to be skipped for Seq too new
|
|
|
|
iter->Seek(MakeLongKey(20, 0));
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
|
|
|
|
|
|
|
|
iter->SeekForPrev(MakeLongKey(127, 3));
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
iter = NewIterator(ReadOptions());
|
|
|
|
iter->Seek(MakeLongKey(50, 1));
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterNextWithNewerSeq) {
|
|
|
|
ASSERT_OK(Put("0", "0"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("a", "b"));
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
ASSERT_OK(Put("d", "e"));
|
|
|
|
auto iter = NewIterator(ReadOptions());
|
|
|
|
|
|
|
|
// Create a key that needs to be skipped for Seq too new
|
|
|
|
for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
|
|
|
|
i++) {
|
|
|
|
ASSERT_OK(Put("b", "f"));
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->b");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->d");
|
|
|
|
iter->SeekForPrev(Slice("b"));
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->b");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->d");
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterPrevWithNewerSeq) {
|
|
|
|
ASSERT_OK(Put("0", "0"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("a", "b"));
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
ASSERT_OK(Put("d", "e"));
|
|
|
|
auto iter = NewIterator(ReadOptions());
|
|
|
|
|
|
|
|
// Create a key that needs to be skipped for Seq too new
|
|
|
|
for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
|
|
|
|
i++) {
|
|
|
|
ASSERT_OK(Put("b", "f"));
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Seek(Slice("d"));
|
|
|
|
ASSERT_EQ(IterStatus(iter), "d->e");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->d");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->b");
|
|
|
|
iter->Prev();
|
|
|
|
iter->SeekForPrev(Slice("d"));
|
|
|
|
ASSERT_EQ(IterStatus(iter), "d->e");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->d");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->b");
|
|
|
|
iter->Prev();
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) {
|
|
|
|
ASSERT_OK(Put("0", "0"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("a", "b"));
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
ASSERT_OK(Put("e", "f"));
|
|
|
|
auto iter = NewIterator(ReadOptions());
|
|
|
|
auto iter2 = NewIterator(ReadOptions());
|
|
|
|
iter->Seek(Slice("c"));
|
|
|
|
iter2->SeekForPrev(Slice("d"));
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->d");
|
|
|
|
ASSERT_EQ(IterStatus(iter2), "c->d");
|
|
|
|
|
|
|
|
// Create a key that needs to be skipped for Seq too new
|
|
|
|
for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
|
|
|
|
i++) {
|
|
|
|
ASSERT_OK(Put("b", "f"));
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->b");
|
|
|
|
iter->Prev();
|
|
|
|
iter2->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter2), "a->b");
|
|
|
|
iter2->Prev();
|
|
|
|
delete iter;
|
|
|
|
delete iter2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterEmpty) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->SeekForPrev("foo");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterSingle) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "a", "va"));
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->Seek("");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekForPrev("");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->Seek("a");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekForPrev("a");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->Seek("b");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekForPrev("b");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterMulti) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "a", "va"));
|
|
|
|
ASSERT_OK(Put(1, "b", "vb"));
|
|
|
|
ASSERT_OK(Put(1, "c", "vc"));
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->Seek("");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Seek("a");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Seek("ax");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->SeekForPrev("d");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->SeekForPrev("c");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->SeekForPrev("bx");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
|
|
|
|
iter->Seek("b");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->Seek("z");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekForPrev("b");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->SeekForPrev("");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
// Switch from reverse to forward
|
|
|
|
iter->SeekToLast();
|
|
|
|
iter->Prev();
|
|
|
|
iter->Prev();
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
|
|
|
|
// Switch from forward to reverse
|
|
|
|
iter->SeekToFirst();
|
|
|
|
iter->Next();
|
|
|
|
iter->Next();
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
|
|
|
|
// Make sure iter stays at snapshot
|
|
|
|
ASSERT_OK(Put(1, "a", "va2"));
|
|
|
|
ASSERT_OK(Put(1, "a2", "va3"));
|
|
|
|
ASSERT_OK(Put(1, "b", "vb2"));
|
|
|
|
ASSERT_OK(Put(1, "c", "vc2"));
|
|
|
|
ASSERT_OK(Delete(1, "b"));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->vb");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check that we can skip over a run of user keys
|
|
|
|
// by using reseek rather than sequential scan
|
|
|
|
TEST_P(DBIteratorTest, IterReseek) {
|
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
options_override.skip_policy = kSkipNoSnapshot;
|
|
|
|
Options options = CurrentOptions(options_override);
|
|
|
|
options.max_sequential_skip_in_iterations = 3;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
// insert three keys with same userkey and verify that
|
|
|
|
// reseek is not invoked. For each of these test cases,
|
|
|
|
// verify that we can find the next key "b".
|
|
|
|
ASSERT_OK(Put(1, "a", "zero"));
|
|
|
|
ASSERT_OK(Put(1, "a", "one"));
|
|
|
|
ASSERT_OK(Put(1, "a", "two"));
|
|
|
|
ASSERT_OK(Put(1, "b", "bone"));
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->two");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->bone");
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// insert a total of three keys with same userkey and verify
|
|
|
|
// that reseek is still not invoked.
|
|
|
|
ASSERT_OK(Put(1, "a", "three"));
|
|
|
|
iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->three");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->bone");
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// insert a total of four keys with same userkey and verify
|
|
|
|
// that reseek is invoked.
|
|
|
|
ASSERT_OK(Put(1, "a", "four"));
|
|
|
|
iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->four");
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->bone");
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// Testing reverse iterator
|
|
|
|
// At this point, we have three versions of "a" and one version of "b".
|
|
|
|
// The reseek statistics is already at 1.
|
|
|
|
int num_reseeks = static_cast<int>(
|
|
|
|
TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
|
|
|
|
// Insert another version of b and assert that reseek is not invoked
|
|
|
|
ASSERT_OK(Put(1, "b", "btwo"));
|
|
|
|
iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->btwo");
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
|
|
|
|
num_reseeks);
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
|
|
|
|
num_reseeks + 1);
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->four");
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// insert two more versions of b. This makes a total of 4 versions
|
|
|
|
// of b and 4 versions of a.
|
|
|
|
ASSERT_OK(Put(1, "b", "bthree"));
|
|
|
|
ASSERT_OK(Put(1, "b", "bfour"));
|
|
|
|
iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->bfour");
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
|
|
|
|
num_reseeks + 2);
|
|
|
|
iter->Prev();
|
|
|
|
|
|
|
|
// the previous Prev call should have invoked reseek
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
|
|
|
|
num_reseeks + 3);
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->four");
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "a", "va"));
|
|
|
|
ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
|
|
|
|
ASSERT_OK(Put(1, "c", "vc"));
|
|
|
|
ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
|
|
|
|
ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
|
|
|
|
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->vc");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->va");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "(invalid)");
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterMultiWithDelete) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "ka", "va"));
|
|
|
|
ASSERT_OK(Put(1, "kb", "vb"));
|
|
|
|
ASSERT_OK(Put(1, "kc", "vc"));
|
|
|
|
ASSERT_OK(Delete(1, "kb"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
|
|
|
|
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
iter->Seek("kc");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "kc->vc");
|
|
|
|
if (!CurrentOptions().merge_operator) {
|
|
|
|
// TODO: merge operator does not support backward iteration yet
|
|
|
|
if (kPlainTableAllBytesPrefix != option_config_ &&
|
|
|
|
kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
|
|
|
|
kHashLinkList != option_config_ &&
|
|
|
|
kHashSkipList != option_config_) { // doesn't support SeekToLast
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "ka->va");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterPrevMaxSkip) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
ASSERT_OK(Put(1, "key1", "v1"));
|
|
|
|
ASSERT_OK(Put(1, "key2", "v2"));
|
|
|
|
ASSERT_OK(Put(1, "key3", "v3"));
|
|
|
|
ASSERT_OK(Put(1, "key4", "v4"));
|
|
|
|
ASSERT_OK(Put(1, "key5", "v5"));
|
|
|
|
}
|
|
|
|
|
|
|
|
VerifyIterLast("key5->v5", 1);
|
|
|
|
|
|
|
|
ASSERT_OK(Delete(1, "key5"));
|
|
|
|
VerifyIterLast("key4->v4", 1);
|
|
|
|
|
|
|
|
ASSERT_OK(Delete(1, "key4"));
|
|
|
|
VerifyIterLast("key3->v3", 1);
|
|
|
|
|
|
|
|
ASSERT_OK(Delete(1, "key3"));
|
|
|
|
VerifyIterLast("key2->v2", 1);
|
|
|
|
|
|
|
|
ASSERT_OK(Delete(1, "key2"));
|
|
|
|
VerifyIterLast("key1->v1", 1);
|
|
|
|
|
|
|
|
ASSERT_OK(Delete(1, "key1"));
|
|
|
|
VerifyIterLast("(invalid)", 1);
|
|
|
|
} while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterWithSnapshot) {
|
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
options_override.skip_policy = kSkipNoSnapshot;
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
|
|
|
|
ASSERT_OK(Put(1, "key1", "val1"));
|
|
|
|
ASSERT_OK(Put(1, "key2", "val2"));
|
|
|
|
ASSERT_OK(Put(1, "key3", "val3"));
|
|
|
|
ASSERT_OK(Put(1, "key4", "val4"));
|
|
|
|
ASSERT_OK(Put(1, "key5", "val5"));
|
|
|
|
|
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
|
|
|
ReadOptions options;
|
|
|
|
options.snapshot = snapshot;
|
|
|
|
Iterator* iter = NewIterator(options, handles_[1]);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(1, "key0", "val0"));
|
|
|
|
// Put more values after the snapshot
|
|
|
|
ASSERT_OK(Put(1, "key100", "val100"));
|
|
|
|
ASSERT_OK(Put(1, "key101", "val101"));
|
|
|
|
|
|
|
|
iter->Seek("key5");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key5->val5");
|
|
|
|
if (!CurrentOptions().merge_operator) {
|
|
|
|
// TODO: merge operator does not support backward iteration yet
|
|
|
|
if (kPlainTableAllBytesPrefix != option_config_ &&
|
|
|
|
kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
|
|
|
|
kHashLinkList != option_config_ && kHashSkipList != option_config_) {
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key4->val4");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key3->val3");
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key4->val4");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key5->val5");
|
|
|
|
}
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!CurrentOptions().merge_operator) {
|
|
|
|
// TODO(gzh): merge operator does not support backward iteration yet
|
|
|
|
if (kPlainTableAllBytesPrefix != option_config_ &&
|
|
|
|
kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
|
|
|
|
kHashLinkList != option_config_ && kHashSkipList != option_config_) {
|
|
|
|
iter->SeekForPrev("key1");
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key1->val1");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key2->val2");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key3->val3");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key2->val2");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "key1->val1");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IteratorPinsRef) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
Put(1, "foo", "hello");
|
|
|
|
|
|
|
|
// Get iterator that will yield the current contents of the DB.
|
|
|
|
Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
|
|
|
|
// Write to force compactions
|
|
|
|
Put(1, "foo", "newvalue1");
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
// 100K values
|
|
|
|
ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
|
|
|
|
}
|
|
|
|
Put(1, "foo", "newvalue2");
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo", iter->key().ToString());
|
|
|
|
ASSERT_EQ("hello", iter->value().ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
delete iter;
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
|
|
|
|
Put(1, "foo", "delete-cf-then-delete-iter");
|
|
|
|
Put(1, "hello", "value2");
|
|
|
|
|
|
|
|
ColumnFamilyHandle* cf = handles_[1];
|
|
|
|
ReadOptions ro;
|
|
|
|
|
|
|
|
auto* iter = db_->NewIterator(ro, cf);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
|
|
|
|
|
|
|
|
// delete CF handle
|
|
|
|
db_->DestroyColumnFamilyHandle(cf);
|
|
|
|
handles_.erase(std::begin(handles_) + 1);
|
|
|
|
|
|
|
|
// delete Iterator after CF handle is deleted
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "hello->value2");
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
|
|
|
|
Put(1, "foo", "drop-cf-then-delete-iter");
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ColumnFamilyHandle* cf = handles_[1];
|
|
|
|
|
|
|
|
auto* iter = db_->NewIterator(ro, cf);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
|
|
|
|
|
|
|
|
// drop and delete CF
|
|
|
|
db_->DropColumnFamily(cf);
|
|
|
|
db_->DestroyColumnFamilyHandle(cf);
|
|
|
|
handles_.erase(std::begin(handles_) + 1);
|
|
|
|
|
|
|
|
// delete Iterator after CF handle is dropped
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
// SetOptions not defined in ROCKSDB LITE
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
TEST_P(DBIteratorTest, DBIteratorBoundTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("a", "0"));
|
|
|
|
ASSERT_OK(Put("foo", "bar"));
|
|
|
|
ASSERT_OK(Put("foo1", "bar1"));
|
|
|
|
ASSERT_OK(Put("g1", "0"));
|
|
|
|
|
|
|
|
// testing basic case with no iterate_upper_bound and no prefix_extractor
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.iterate_upper_bound = nullptr;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->Seek("foo");
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
|
|
|
|
|
|
|
|
iter->SeekForPrev("g1");
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// testing iterate_upper_bound and forward iterator
|
|
|
|
// to make sure it stops at bound
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
// iterate_upper_bound points beyond the last expected entry
|
|
|
|
Slice prefix("foo2");
|
|
|
|
ro.iterate_upper_bound = &prefix;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->Seek("foo");
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(("foo1")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
// should stop here...
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
}
|
|
|
|
// Testing SeekToLast with iterate_upper_bound set
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
|
|
|
|
Slice prefix("foo");
|
|
|
|
ro.iterate_upper_bound = &prefix;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("a")), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// prefix is the first letter of the key
|
|
|
|
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
|
|
|
|
ASSERT_OK(Put("a", "0"));
|
|
|
|
ASSERT_OK(Put("foo", "bar"));
|
|
|
|
ASSERT_OK(Put("foo1", "bar1"));
|
|
|
|
ASSERT_OK(Put("g1", "0"));
|
|
|
|
|
|
|
|
// testing with iterate_upper_bound and prefix_extractor
|
|
|
|
// Seek target and iterate_upper_bound are not is same prefix
|
|
|
|
// This should be an error
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
Slice upper_bound("g");
|
|
|
|
ro.iterate_upper_bound = &upper_bound;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->Seek("foo");
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo", iter->key().ToString());
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo1", iter->key().ToString());
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
}
|
|
|
|
|
|
|
|
// testing that iterate_upper_bound prevents iterating over deleted items
|
|
|
|
// if the bound has already reached
|
|
|
|
{
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("a", "0"));
|
|
|
|
ASSERT_OK(Put("b", "0"));
|
|
|
|
ASSERT_OK(Put("b1", "0"));
|
|
|
|
ASSERT_OK(Put("c", "0"));
|
|
|
|
ASSERT_OK(Put("d", "0"));
|
|
|
|
ASSERT_OK(Put("e", "0"));
|
|
|
|
ASSERT_OK(Delete("c"));
|
|
|
|
ASSERT_OK(Delete("d"));
|
|
|
|
|
|
|
|
// base case with no bound
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.iterate_upper_bound = nullptr;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->Seek("b");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("b")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(("b1")), 0);
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
iter->Next();
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 2);
|
|
|
|
|
|
|
|
// now testing with iterate_bound
|
|
|
|
Slice prefix("c");
|
|
|
|
ro.iterate_upper_bound = &prefix;
|
|
|
|
|
|
|
|
iter.reset(NewIterator(ro));
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
iter->Seek("b");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("b")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(("b1")), 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
// the iteration should stop as soon as the bound key is reached
|
|
|
|
// even though the key is deleted
|
|
|
|
// hence internal_delete_skipped_count should be 0
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("a", "0"));
|
|
|
|
ASSERT_OK(Put("z", "0"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("foo1", "bar1"));
|
|
|
|
ASSERT_OK(Put("foo2", "bar2"));
|
|
|
|
ASSERT_OK(Put("foo3", "bar3"));
|
|
|
|
ASSERT_OK(Put("foo4", "bar4"));
|
|
|
|
|
|
|
|
{
|
|
|
|
std::string up_str = "foo5";
|
|
|
|
Slice up(up_str);
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.iterate_upper_bound = &up;
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->Seek("foo1");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
|
|
|
|
|
|
|
|
uint64_t prev_block_cache_hit =
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_HIT);
|
|
|
|
uint64_t prev_block_cache_miss =
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_MISS);
|
|
|
|
|
|
|
|
ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
|
|
|
|
|
|
|
|
iter->Seek("foo4");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
|
|
|
|
ASSERT_EQ(prev_block_cache_hit,
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_HIT));
|
|
|
|
ASSERT_EQ(prev_block_cache_miss,
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_MISS));
|
|
|
|
|
|
|
|
iter->Seek("foo2");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
|
|
|
|
ASSERT_EQ(prev_block_cache_hit,
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_HIT));
|
|
|
|
ASSERT_EQ(prev_block_cache_miss,
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_MISS));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
|
|
|
|
for (auto format_version : {2, 3, 4}) {
|
|
|
|
int upper_bound_hits = 0;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockBasedTableIterator:out_of_bound",
|
|
|
|
[&upper_bound_hits](void*) { upper_bound_hits++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.format_version = format_version;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("foo1", "bar1"));
|
|
|
|
ASSERT_OK(Put("foo2", "bar2"));
|
|
|
|
ASSERT_OK(Put("foo4", "bar4"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
Slice ub("foo3");
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.iterate_upper_bound = &ub;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
|
|
|
|
ASSERT_EQ(upper_bound_hits, 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
|
|
|
|
ASSERT_EQ(upper_bound_hits, 0);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_EQ(upper_bound_hits, 1);
|
|
|
|
}
|
|
|
|
}
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
|
|
|
|
// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
|
|
|
|
// they don't do unnecessary block reads.
|
|
|
|
TEST_P(DBIteratorTest, IndexWithFirstKey) {
|
|
|
|
for (int tailing = 0; tailing < 2; ++tailing) {
|
|
|
|
SCOPED_TRACE("tailing = " + std::to_string(tailing));
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
Statistics* stats = options.statistics.get();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
|
|
|
|
table_options.index_shortening =
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
table_options.block_cache =
|
|
|
|
NewLRUCache(8000); // fits all blocks and their cache metadata overhead
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Merge("a1", "x1"));
|
|
|
|
ASSERT_OK(Merge("b1", "y1"));
|
|
|
|
ASSERT_OK(Merge("c0", "z1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Merge("a2", "x2"));
|
|
|
|
ASSERT_OK(Merge("b2", "y2"));
|
|
|
|
ASSERT_OK(Merge("c0", "z2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Merge("a3", "x3"));
|
|
|
|
ASSERT_OK(Merge("b3", "y3"));
|
|
|
|
ASSERT_OK(Merge("c3", "z3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Block cache is not important for this test.
|
|
|
|
// We use BLOCK_CACHE_DATA_* counters just because they're the most readily
|
|
|
|
// available way of counting block accesses.
|
|
|
|
|
|
|
|
ReadOptions ropt;
|
|
|
|
ropt.tailing = tailing;
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ropt));
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ropt.read_tier = ReadTier::kBlockCacheTier;
|
|
|
|
std::unique_ptr<Iterator> nonblocking_iter(NewIterator(ropt));
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
iter->Seek("b10");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ("b2", iter->key().ToString());
|
|
|
|
EXPECT_EQ("y2", iter->value().ToString());
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
// The cache-only iterator should succeed too, using the blocks pulled into
|
|
|
|
// the cache by the previous iterator.
|
|
|
|
nonblocking_iter->Seek("b10");
|
|
|
|
ASSERT_TRUE(nonblocking_iter->Valid());
|
|
|
|
EXPECT_EQ("b2", nonblocking_iter->key().ToString());
|
|
|
|
EXPECT_EQ("y2", nonblocking_iter->value().ToString());
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// ... but it shouldn't be able to step forward since the next block is
|
|
|
|
// not in cache yet.
|
|
|
|
nonblocking_iter->Next();
|
|
|
|
ASSERT_FALSE(nonblocking_iter->Valid());
|
|
|
|
ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
|
|
|
|
|
|
|
|
// ... nor should a seek to the next key succeed.
|
|
|
|
nonblocking_iter->Seek("b20");
|
|
|
|
ASSERT_FALSE(nonblocking_iter->Valid());
|
|
|
|
ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ("b3", iter->key().ToString());
|
|
|
|
EXPECT_EQ("y3", iter->value().ToString());
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// After the blocking iterator loaded the next block, the nonblocking
|
|
|
|
// iterator's seek should succeed.
|
|
|
|
nonblocking_iter->Seek("b20");
|
|
|
|
ASSERT_TRUE(nonblocking_iter->Valid());
|
|
|
|
EXPECT_EQ("b3", nonblocking_iter->key().ToString());
|
|
|
|
EXPECT_EQ("y3", nonblocking_iter->value().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
|
|
|
|
iter->Seek("c0");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ("c0", iter->key().ToString());
|
|
|
|
EXPECT_EQ("z1,z2", iter->value().ToString());
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ("c3", iter->key().ToString());
|
|
|
|
EXPECT_EQ("z3", iter->value().ToString());
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
|
|
|
|
iter.reset();
|
|
|
|
|
|
|
|
// Enable iterate_upper_bound and check that iterator is not trying to read
|
|
|
|
// blocks that are fully above upper bound.
|
|
|
|
std::string ub = "b3";
|
|
|
|
Slice ub_slice(ub);
|
|
|
|
ropt.iterate_upper_bound = &ub_slice;
|
|
|
|
iter.reset(NewIterator(ropt));
|
|
|
|
|
|
|
|
iter->Seek("b2");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ("b2", iter->key().ToString());
|
|
|
|
EXPECT_EQ("y2", iter->value().ToString());
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
6 years ago
|
|
|
Statistics* stats = options.statistics.get();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
|
|
|
|
table_options.index_shortening =
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
table_options.block_cache = NewLRUCache(1000); // fits all blocks
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Merge("a", "x1"));
|
|
|
|
ASSERT_OK(Merge("c", "y1"));
|
|
|
|
ASSERT_OK(Merge("e", "z1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Merge("c", "y2"));
|
|
|
|
ASSERT_OK(Merge("e", "z2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Get() between blocks shouldn't read any blocks.
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("b"));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Get() of an existing key shouldn't read any unnecessary blocks when there's
|
|
|
|
// only one key per block.
|
|
|
|
|
|
|
|
ASSERT_EQ("y1,y2", Get("c"));
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
ASSERT_EQ("x1", Get("a"));
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
|
|
|
|
MultiGet({"b", "e"}));
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary
|
|
|
|
// return the biggest key which is smaller than the seek key.
|
|
|
|
TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.merge_operator = MergeOperators::CreatePutOperator();
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// write three entries with different keys using Merge()
|
|
|
|
WriteOptions wopts;
|
|
|
|
db_->Merge(wopts, "1", "data1");
|
|
|
|
db_->Merge(wopts, "2", "data2");
|
|
|
|
db_->Merge(wopts, "3", "data3");
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
|
|
|
|
|
|
|
|
it->Seek("2");
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ("2", it->key().ToString());
|
|
|
|
|
|
|
|
it->Prev();
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ("1", it->key().ToString());
|
|
|
|
|
|
|
|
it->SeekForPrev("1");
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ("1", it->key().ToString());
|
|
|
|
|
|
|
|
it->Next();
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ("2", it->key().ToString());
|
|
|
|
}
|
|
|
|
|
|
|
|
class DBIteratorTestForPinnedData : public DBIteratorTest {
|
|
|
|
public:
|
|
|
|
enum TestConfig {
|
|
|
|
NORMAL,
|
|
|
|
CLOSE_AND_OPEN,
|
|
|
|
COMPACT_BEFORE_READ,
|
|
|
|
FLUSH_EVERY_1000,
|
|
|
|
MAX
|
|
|
|
};
|
|
|
|
DBIteratorTestForPinnedData() : DBIteratorTest() {}
|
|
|
|
void PinnedDataIteratorRandomized(TestConfig run_config) {
|
|
|
|
// Generate Random data
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
int puts = 100000;
|
|
|
|
int key_pool = static_cast<int>(puts * 0.7);
|
|
|
|
int key_size = 100;
|
|
|
|
int val_size = 1000;
|
|
|
|
int seeks_percentage = 20; // 20% of keys will be used to test seek()
|
|
|
|
int delete_percentage = 20; // 20% of keys will be deleted
|
|
|
|
int merge_percentage = 20; // 20% of keys will be added using Merge()
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.use_delta_encoding = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.merge_operator = MergeOperators::CreatePutOperator();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::vector<std::string> generated_keys(key_pool);
|
|
|
|
for (int i = 0; i < key_pool; i++) {
|
|
|
|
generated_keys[i] = RandomString(&rnd, key_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::map<std::string, std::string> true_data;
|
|
|
|
std::vector<std::string> random_keys;
|
|
|
|
std::vector<std::string> deleted_keys;
|
|
|
|
for (int i = 0; i < puts; i++) {
|
|
|
|
auto& k = generated_keys[rnd.Next() % key_pool];
|
|
|
|
auto v = RandomString(&rnd, val_size);
|
|
|
|
|
|
|
|
// Insert data to true_data map and to DB
|
|
|
|
true_data[k] = v;
|
|
|
|
if (rnd.PercentTrue(merge_percentage)) {
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), k, v));
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(Put(k, v));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pick random keys to be used to test Seek()
|
|
|
|
if (rnd.PercentTrue(seeks_percentage)) {
|
|
|
|
random_keys.push_back(k);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete some random keys
|
|
|
|
if (rnd.PercentTrue(delete_percentage)) {
|
|
|
|
deleted_keys.push_back(k);
|
|
|
|
true_data.erase(k);
|
|
|
|
ASSERT_OK(Delete(k));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (run_config == TestConfig::FLUSH_EVERY_1000) {
|
|
|
|
if (i && i % 1000 == 0) {
|
|
|
|
Flush();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (run_config == TestConfig::CLOSE_AND_OPEN) {
|
|
|
|
Close();
|
|
|
|
Reopen(options);
|
|
|
|
} else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
|
|
|
|
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.pin_data = true;
|
|
|
|
auto iter = NewIterator(ro);
|
|
|
|
|
|
|
|
{
|
|
|
|
// Test Seek to random keys
|
|
|
|
std::vector<Slice> keys_slices;
|
|
|
|
std::vector<std::string> true_keys;
|
|
|
|
for (auto& k : random_keys) {
|
|
|
|
iter->Seek(k);
|
|
|
|
if (!iter->Valid()) {
|
|
|
|
ASSERT_EQ(true_data.lower_bound(k), true_data.end());
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(
|
|
|
|
iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
keys_slices.push_back(iter->key());
|
|
|
|
true_keys.push_back(true_data.lower_bound(k)->first);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < keys_slices.size(); i++) {
|
|
|
|
ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Test SeekForPrev to random keys
|
|
|
|
std::vector<Slice> keys_slices;
|
|
|
|
std::vector<std::string> true_keys;
|
|
|
|
for (auto& k : random_keys) {
|
|
|
|
iter->SeekForPrev(k);
|
|
|
|
if (!iter->Valid()) {
|
|
|
|
ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(
|
|
|
|
iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
keys_slices.push_back(iter->key());
|
|
|
|
true_keys.push_back((--true_data.upper_bound(k))->first);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < keys_slices.size(); i++) {
|
|
|
|
ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Test iterating all data forward
|
|
|
|
std::vector<Slice> all_keys;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(
|
|
|
|
iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
all_keys.push_back(iter->key());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(all_keys.size(), true_data.size());
|
|
|
|
|
|
|
|
// Verify that all keys slices are valid
|
|
|
|
auto data_iter = true_data.begin();
|
|
|
|
for (size_t i = 0; i < all_keys.size(); i++) {
|
|
|
|
ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
|
|
|
|
data_iter++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Test iterating all data backward
|
|
|
|
std::vector<Slice> all_keys;
|
|
|
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(
|
|
|
|
iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
all_keys.push_back(iter->key());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(all_keys.size(), true_data.size());
|
|
|
|
|
|
|
|
// Verify that all keys slices are valid (backward)
|
|
|
|
auto data_iter = true_data.rbegin();
|
|
|
|
for (size_t i = 0; i < all_keys.size(); i++) {
|
|
|
|
ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
|
|
|
|
data_iter++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
|
|
|
|
PinnedDataIteratorRandomized(TestConfig::NORMAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) {
|
|
|
|
PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTestForPinnedData,
|
|
|
|
PinnedDataIteratorRandomizedCompactBeforeRead) {
|
|
|
|
PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
|
|
|
|
PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.use_delta_encoding = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::map<std::string, std::string> true_data;
|
|
|
|
|
|
|
|
// Generate 4 sst files in L2
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 1; i <= 1000; i++) {
|
|
|
|
std::string k = Key(i * 3);
|
|
|
|
std::string v = RandomString(&rnd, 100);
|
|
|
|
ASSERT_OK(Put(k, v));
|
|
|
|
true_data[k] = v;
|
|
|
|
if (i % 250 == 0) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(FilesPerLevel(0), "4");
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
ASSERT_EQ(FilesPerLevel(0), "0,4");
|
|
|
|
|
|
|
|
// Generate 4 sst files in L0
|
|
|
|
for (int i = 1; i <= 1000; i++) {
|
|
|
|
std::string k = Key(i * 2);
|
|
|
|
std::string v = RandomString(&rnd, 100);
|
|
|
|
ASSERT_OK(Put(k, v));
|
|
|
|
true_data[k] = v;
|
|
|
|
if (i % 250 == 0) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(FilesPerLevel(0), "4,4");
|
|
|
|
|
|
|
|
// Add some keys/values in memtables
|
|
|
|
for (int i = 1; i <= 1000; i++) {
|
|
|
|
std::string k = Key(i);
|
|
|
|
std::string v = RandomString(&rnd, 100);
|
|
|
|
ASSERT_OK(Put(k, v));
|
|
|
|
true_data[k] = v;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(FilesPerLevel(0), "4,4");
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.pin_data = true;
|
|
|
|
auto iter = NewIterator(ro);
|
|
|
|
|
|
|
|
std::vector<std::pair<Slice, std::string>> results;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
results.emplace_back(iter->key(), iter->value().ToString());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(results.size(), true_data.size());
|
|
|
|
auto data_iter = true_data.begin();
|
|
|
|
for (size_t i = 0; i < results.size(); i++, data_iter++) {
|
|
|
|
auto& kv = results[i];
|
|
|
|
ASSERT_EQ(kv.first, data_iter->first);
|
|
|
|
ASSERT_EQ(kv.second, data_iter->second);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.use_delta_encoding = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.merge_operator = MergeOperators::CreateUInt64AddOperator();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::string numbers[7];
|
|
|
|
for (int val = 0; val <= 6; val++) {
|
|
|
|
PutFixed64(numbers + val, val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// +1 all keys in range [ 0 => 999]
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
WriteOptions wo;
|
|
|
|
ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
|
|
|
|
}
|
|
|
|
|
|
|
|
// +2 all keys divisible by 2 in range [ 0 => 999]
|
|
|
|
for (int i = 0; i < 1000; i += 2) {
|
|
|
|
WriteOptions wo;
|
|
|
|
ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
|
|
|
|
}
|
|
|
|
|
|
|
|
// +3 all keys divisible by 5 in range [ 0 => 999]
|
|
|
|
for (int i = 0; i < 1000; i += 5) {
|
|
|
|
WriteOptions wo;
|
|
|
|
ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
|
|
|
|
}
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.pin_data = true;
|
|
|
|
auto iter = NewIterator(ro);
|
|
|
|
|
|
|
|
std::vector<std::pair<Slice, std::string>> results;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
results.emplace_back(iter->key(), iter->value().ToString());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(results.size(), 1000);
|
|
|
|
for (size_t i = 0; i < results.size(); i++) {
|
|
|
|
auto& kv = results[i];
|
|
|
|
ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
|
|
|
|
int expected_val = 1;
|
|
|
|
if (i % 2 == 0) {
|
|
|
|
expected_val += 2;
|
|
|
|
}
|
|
|
|
if (i % 5 == 0) {
|
|
|
|
expected_val += 3;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(kv.second, numbers[expected_val]);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.use_delta_encoding = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.write_buffer_size = 100000;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
std::map<std::string, std::string> true_data;
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
std::string k = RandomString(&rnd, 10);
|
|
|
|
std::string v = RandomString(&rnd, 1000);
|
|
|
|
ASSERT_OK(Put(k, v));
|
|
|
|
true_data[k] = v;
|
|
|
|
}
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.pin_data = true;
|
|
|
|
auto iter = NewIterator(ro);
|
|
|
|
|
|
|
|
// Delete 50% of the keys and update the other 50%
|
|
|
|
for (auto& kv : true_data) {
|
|
|
|
if (rnd.OneIn(2)) {
|
|
|
|
ASSERT_OK(Delete(kv.first));
|
|
|
|
} else {
|
|
|
|
std::string new_val = RandomString(&rnd, 1000);
|
|
|
|
ASSERT_OK(Put(kv.first, new_val));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::pair<Slice, std::string>> results;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
|
|
|
|
ASSERT_EQ("1", prop_value);
|
|
|
|
results.emplace_back(iter->key(), iter->value().ToString());
|
|
|
|
}
|
|
|
|
|
|
|
|
auto data_iter = true_data.begin();
|
|
|
|
for (size_t i = 0; i < results.size(); i++, data_iter++) {
|
|
|
|
auto& kv = results[i];
|
|
|
|
ASSERT_EQ(kv.first, data_iter->first);
|
|
|
|
ASSERT_EQ(kv.second, data_iter->second);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
class SliceTransformLimitedDomainGeneric : public SliceTransform {
|
|
|
|
const char* Name() const override {
|
|
|
|
return "SliceTransformLimitedDomainGeneric";
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice Transform(const Slice& src) const override {
|
|
|
|
return Slice(src.data(), 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool InDomain(const Slice& src) const override {
|
|
|
|
// prefix will be x????
|
|
|
|
return src.size() >= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool InRange(const Slice& dst) const override {
|
|
|
|
// prefix will be x????
|
|
|
|
return dst.size() == 1;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
// Enable prefix bloom for SST files
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("a1", "va1"));
|
|
|
|
ASSERT_OK(Put("a2", "va2"));
|
|
|
|
ASSERT_OK(Put("a3", "va3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("b1", "vb1"));
|
|
|
|
ASSERT_OK(Put("b2", "vb2"));
|
|
|
|
ASSERT_OK(Put("b3", "vb3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("b4", "vb4"));
|
|
|
|
ASSERT_OK(Put("d1", "vd1"));
|
|
|
|
ASSERT_OK(Put("d2", "vd2"));
|
|
|
|
ASSERT_OK(Put("d4", "vd4"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
|
|
|
|
iter->SeekForPrev("a4");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "a3");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "va3");
|
|
|
|
|
|
|
|
iter->SeekForPrev("c2");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "b3");
|
|
|
|
iter->SeekForPrev("d3");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "d2");
|
|
|
|
iter->SeekForPrev("b5");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "b4");
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.prefix_same_as_start = true;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
iter->SeekForPrev("c2");
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.prefix_extractor =
|
|
|
|
std::make_shared<SliceTransformLimitedDomainGeneric>();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
// Enable prefix bloom for SST files
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("a1", "va1"));
|
|
|
|
ASSERT_OK(Put("a2", "va2"));
|
|
|
|
ASSERT_OK(Put("a3", "va3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("b1", "vb1"));
|
|
|
|
ASSERT_OK(Put("b2", "vb2"));
|
|
|
|
ASSERT_OK(Put("b3", "vb3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("b4", "vb4"));
|
|
|
|
ASSERT_OK(Put("d1", "vd1"));
|
|
|
|
ASSERT_OK(Put("d2", "vd2"));
|
|
|
|
ASSERT_OK(Put("d4", "vd4"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
|
|
|
|
iter->SeekForPrev("a4");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "a3");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "va3");
|
|
|
|
|
|
|
|
iter->SeekForPrev("c2");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "b3");
|
|
|
|
iter->SeekForPrev("d3");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "d2");
|
|
|
|
iter->SeekForPrev("b5");
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "b4");
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.prefix_same_as_start = true;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
iter->SeekForPrev("c2");
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1; // every block will contain one entry
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.max_sequential_skip_in_iterations = 8;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Putting such deletes will force DBIter::Prev() to fallback to a Seek
|
|
|
|
for (int file_num = 0; file_num < 10; file_num++) {
|
|
|
|
ASSERT_OK(Delete("key4"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
// First File containing 5 blocks of puts
|
|
|
|
ASSERT_OK(Put("key1", "val1.0"));
|
|
|
|
ASSERT_OK(Put("key2", "val2.0"));
|
|
|
|
ASSERT_OK(Put("key3", "val3.0"));
|
|
|
|
ASSERT_OK(Put("key4", "val4.0"));
|
|
|
|
ASSERT_OK(Put("key5", "val5.0"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Second file containing 9 blocks of merge operands
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = false;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "key5");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "val5.0");
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "key4");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "val4.0");
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "key3");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "key2");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "key1");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.level0_slowdown_writes_trigger = (1 << 30);
|
|
|
|
options.level0_stop_writes_trigger = (1 << 30);
|
|
|
|
options.max_sequential_skip_in_iterations = 8;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const int kNumKeys = 500;
|
|
|
|
// Small number of merge operands to make sure that DBIter::Prev() don't
|
|
|
|
// fall back to Seek()
|
|
|
|
const int kNumMergeOperands = 3;
|
|
|
|
// Use value size that will make sure that every block contain 1 key
|
|
|
|
const int kValSize =
|
|
|
|
static_cast<int>(BlockBasedTableOptions().block_size) * 4;
|
|
|
|
// Percentage of keys that wont get merge operations
|
|
|
|
const int kNoMergeOpPercentage = 20;
|
|
|
|
// Percentage of keys that will be deleted
|
|
|
|
const int kDeletePercentage = 10;
|
|
|
|
|
|
|
|
// For half of the key range we will write multiple deletes first to
|
|
|
|
// force DBIter::Prev() to fall back to Seek()
|
|
|
|
for (int file_num = 0; file_num < 10; file_num++) {
|
|
|
|
for (int i = 0; i < kNumKeys; i += 2) {
|
|
|
|
ASSERT_OK(Delete(Key(i)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<std::string, std::string> true_data;
|
|
|
|
std::string gen_key;
|
|
|
|
std::string gen_val;
|
|
|
|
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
gen_key = Key(i);
|
|
|
|
gen_val = RandomString(&rnd, kValSize);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(gen_key, gen_val));
|
|
|
|
true_data[gen_key] = gen_val;
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Separate values and merge operands in different file so that we
|
|
|
|
// make sure that we don't merge them while flushing but actually
|
|
|
|
// merge them in the read path
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
if (rnd.PercentTrue(kNoMergeOpPercentage)) {
|
|
|
|
// Dont give merge operations for some keys
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int j = 0; j < kNumMergeOperands; j++) {
|
|
|
|
gen_key = Key(i);
|
|
|
|
gen_val = RandomString(&rnd, kValSize);
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
|
|
|
|
true_data[gen_key] += "," + gen_val;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
if (rnd.PercentTrue(kDeletePercentage)) {
|
|
|
|
gen_key = Key(i);
|
|
|
|
|
|
|
|
ASSERT_OK(Delete(gen_key));
|
|
|
|
true_data.erase(gen_key);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = false;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
auto data_iter = true_data.rbegin();
|
|
|
|
|
|
|
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
data_iter++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(data_iter, true_data.rend());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = false;
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
auto data_iter = true_data.rbegin();
|
|
|
|
|
|
|
|
int entries_right = 0;
|
|
|
|
std::string seek_key;
|
|
|
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
|
|
|
// Verify key/value of current position
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
|
|
|
|
bool restore_position_with_seek = rnd.Uniform(2);
|
|
|
|
if (restore_position_with_seek) {
|
|
|
|
seek_key = iter->key().ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Do some Next() operations the restore the iterator to orignal position
|
|
|
|
int next_count =
|
|
|
|
entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
|
|
|
|
for (int i = 0; i < next_count; i++) {
|
|
|
|
iter->Next();
|
|
|
|
data_iter--;
|
|
|
|
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (restore_position_with_seek) {
|
|
|
|
// Restore orignal position using Seek()
|
|
|
|
iter->Seek(seek_key);
|
|
|
|
for (int i = 0; i < next_count; i++) {
|
|
|
|
data_iter++;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
} else {
|
|
|
|
// Restore original position using Prev()
|
|
|
|
for (int i = 0; i < next_count; i++) {
|
|
|
|
iter->Prev();
|
|
|
|
data_iter++;
|
|
|
|
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
entries_right++;
|
|
|
|
data_iter++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(data_iter, true_data.rend());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
// Key 10 bytes / Value 10 bytes
|
|
|
|
ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::atomic<uint64_t> total_next(0);
|
|
|
|
std::atomic<uint64_t> total_next_found(0);
|
|
|
|
std::atomic<uint64_t> total_prev(0);
|
|
|
|
std::atomic<uint64_t> total_prev_found(0);
|
|
|
|
std::atomic<uint64_t> total_bytes(0);
|
|
|
|
|
|
|
|
std::vector<port::Thread> threads;
|
|
|
|
std::function<void()> reader_func_next = [&]() {
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
Iterator* iter = NewIterator(ReadOptions());
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
// Seek will bump ITER_BYTES_READ
|
|
|
|
uint64_t bytes = 0;
|
|
|
|
bytes += iter->key().size();
|
|
|
|
bytes += iter->value().size();
|
|
|
|
while (true) {
|
|
|
|
iter->Next();
|
|
|
|
total_next++;
|
|
|
|
|
|
|
|
if (!iter->Valid()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
total_next_found++;
|
|
|
|
bytes += iter->key().size();
|
|
|
|
bytes += iter->value().size();
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
total_bytes += bytes;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::function<void()> reader_func_prev = [&]() {
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
Iterator* iter = NewIterator(ReadOptions());
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
// Seek will bump ITER_BYTES_READ
|
|
|
|
uint64_t bytes = 0;
|
|
|
|
bytes += iter->key().size();
|
|
|
|
bytes += iter->value().size();
|
|
|
|
while (true) {
|
|
|
|
iter->Prev();
|
|
|
|
total_prev++;
|
|
|
|
|
|
|
|
if (!iter->Valid()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
total_prev_found++;
|
|
|
|
bytes += iter->key().size();
|
|
|
|
bytes += iter->value().size();
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
total_bytes += bytes;
|
|
|
|
};
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
threads.emplace_back(reader_func_next);
|
|
|
|
}
|
|
|
|
for (int i = 0; i < 15; i++) {
|
|
|
|
threads.emplace_back(reader_func_prev);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
|
|
|
|
(uint64_t)total_next_found);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
|
|
|
|
(uint64_t)total_prev_found);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), (uint64_t)total_bytes);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, ReadAhead) {
|
|
|
|
Options options;
|
|
|
|
env_->count_random_reads_ = true;
|
|
|
|
options.env = env_;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.write_buffer_size = 4 << 20;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1024;
|
|
|
|
table_options.no_block_cache = true;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
std::string value(1024, 'a');
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
Put(Key(i), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
Put(Key(i), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
Put(Key(i), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
ASSERT_EQ("1,1,1", FilesPerLevel());
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
env_->random_read_bytes_counter_ = 0;
|
|
|
|
options.statistics->setTickerCount(NO_FILE_OPENS, 0);
|
|
|
|
ReadOptions read_options;
|
|
|
|
auto* iter = NewIterator(read_options);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
|
|
|
|
size_t bytes_read = env_->random_read_bytes_counter_;
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
|
|
|
|
env_->random_read_bytes_counter_ = 0;
|
|
|
|
options.statistics->setTickerCount(NO_FILE_OPENS, 0);
|
|
|
|
read_options.readahead_size = 1024 * 10;
|
|
|
|
iter = NewIterator(read_options);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
|
|
|
|
size_t bytes_read_readahead = env_->random_read_bytes_counter_;
|
|
|
|
delete iter;
|
|
|
|
int64_t num_file_closes_readahead =
|
|
|
|
TestGetTickerCount(options, NO_FILE_CLOSES);
|
|
|
|
ASSERT_EQ(num_file_opens, num_file_opens_readahead);
|
|
|
|
ASSERT_EQ(num_file_closes, num_file_closes_readahead);
|
|
|
|
ASSERT_GT(bytes_read_readahead, bytes_read);
|
|
|
|
ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
|
|
|
|
|
|
|
|
// Verify correctness.
|
|
|
|
iter = NewIterator(read_options);
|
|
|
|
int count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_EQ(value, iter->value());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(100, count);
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
iter->Seek(Key(i));
|
|
|
|
ASSERT_EQ(value, iter->value());
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Insert a key, create a snapshot iterator, overwrite key lots of times,
|
|
|
|
// seek to a smaller key. Expect DBIter to fall back to a seek instead of
|
|
|
|
// going through all the overwrites linearly.
|
|
|
|
TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.max_sequential_skip_in_iterations = 3;
|
|
|
|
options.prefix_extractor = nullptr;
|
|
|
|
options.write_buffer_size = 1 << 27; // big enough to avoid flush
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Insert.
|
|
|
|
ASSERT_OK(Put("b", "0"));
|
|
|
|
|
|
|
|
// Create iterator.
|
|
|
|
ReadOptions ro;
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ro));
|
|
|
|
|
|
|
|
// Insert a lot.
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Check that memtable wasn't flushed.
|
|
|
|
std::string val;
|
|
|
|
ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
|
|
|
|
EXPECT_EQ("0", val);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Seek iterator to a smaller key.
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
iter->Seek("a");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ("b", iter->key().ToString());
|
|
|
|
EXPECT_EQ("0", iter->value().ToString());
|
|
|
|
|
|
|
|
// Check that the seek didn't do too much work.
|
|
|
|
// Checks are not tight, just make sure that everything is well below 100.
|
|
|
|
EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
|
|
|
|
EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
|
|
|
|
EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
|
|
|
|
EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
|
|
|
|
EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
|
|
|
|
|
|
|
|
// Check that iterator did something like what we expect.
|
|
|
|
EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
|
|
|
|
EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
|
|
|
|
EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
|
|
|
|
EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
|
|
|
|
EXPECT_EQ(1, options.statistics->getTickerCount(
|
|
|
|
NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, Refresh) {
|
|
|
|
ASSERT_OK(Put("x", "y"));
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter->Refresh();
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("c")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("m", "n"));
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("c")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter->Refresh();
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("c")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("m")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter.reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, RefreshWithSnapshot) {
|
|
|
|
ASSERT_OK(Put("x", "y"));
|
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
|
|
|
ReadOptions options;
|
|
|
|
options.snapshot = snapshot;
|
|
|
|
Iterator* iter = NewIterator(options);
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("c", "d"));
|
|
|
|
|
|
|
|
iter->Seek(Slice("a"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().compare(Slice("x")), 0);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
s = iter->Refresh();
|
|
|
|
ASSERT_TRUE(s.IsNotSupported());
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, CreationFailure) {
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
|
|
|
|
*(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Iterator* iter = NewIterator(ReadOptions());
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_TRUE(iter->status().IsCorruption());
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.max_sequential_skip_in_iterations = 3;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// write a bunch of kvs to the database.
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Put("y", "1"));
|
|
|
|
ASSERT_OK(Put("y1", "1"));
|
|
|
|
ASSERT_OK(Put("y2", "1"));
|
|
|
|
ASSERT_OK(Put("y3", "1"));
|
|
|
|
ASSERT_OK(Put("z", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Put("z", "1"));
|
|
|
|
ASSERT_OK(Put("bar", "1"));
|
|
|
|
ASSERT_OK(Put("foo", "1"));
|
|
|
|
|
|
|
|
std::string upper_bound = "x";
|
|
|
|
Slice ub_slice(upper_bound);
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.iterate_upper_bound = &ub_slice;
|
|
|
|
ro.max_skippable_internal_keys = 1000;
|
|
|
|
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo", iter->key().ToString());
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("bar", iter->key().ToString());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, TableFilter) {
|
expose a hook to skip tables during iteration
Summary:
As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true.
This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand.
We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in from hours to seconds.
FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin.
```diff
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1546,6 +1546,13 @@ struct ReadOptions {
// Default: false
bool ignore_range_deletions;
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
```
/cc danhhz
Closes https://github.com/facebook/rocksdb/pull/2265
Differential Revision: D5054262
Pulled By: yiwu-arbug
fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
7 years ago
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("b", "2"));
|
|
|
|
ASSERT_OK(Put("c", "3"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
ASSERT_OK(Put("d", "4"));
|
|
|
|
ASSERT_OK(Put("e", "5"));
|
|
|
|
ASSERT_OK(Put("f", "6"));
|
|
|
|
dbfull()->Flush(FlushOptions());
|
|
|
|
|
|
|
|
// Ensure the table_filter callback is called once for each table.
|
|
|
|
{
|
|
|
|
std::set<uint64_t> unseen{1, 2, 3};
|
expose a hook to skip tables during iteration
Summary:
As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true.
This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand.
We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in from hours to seconds.
FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin.
```diff
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1546,6 +1546,13 @@ struct ReadOptions {
// Default: false
bool ignore_range_deletions;
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
```
/cc danhhz
Closes https://github.com/facebook/rocksdb/pull/2265
Differential Revision: D5054262
Pulled By: yiwu-arbug
fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
7 years ago
|
|
|
ReadOptions opts;
|
|
|
|
opts.table_filter = [&](const TableProperties& props) {
|
|
|
|
auto it = unseen.find(props.num_entries);
|
|
|
|
if (it == unseen.end()) {
|
|
|
|
ADD_FAILURE() << "saw table properties with an unexpected "
|
|
|
|
<< props.num_entries << " entries";
|
expose a hook to skip tables during iteration
Summary:
As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true.
This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand.
We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in from hours to seconds.
FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin.
```diff
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1546,6 +1546,13 @@ struct ReadOptions {
// Default: false
bool ignore_range_deletions;
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
```
/cc danhhz
Closes https://github.com/facebook/rocksdb/pull/2265
Differential Revision: D5054262
Pulled By: yiwu-arbug
fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
7 years ago
|
|
|
} else {
|
|
|
|
unseen.erase(it);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
auto iter = NewIterator(opts);
|
expose a hook to skip tables during iteration
Summary:
As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true.
This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand.
We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in from hours to seconds.
FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin.
```diff
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1546,6 +1546,13 @@ struct ReadOptions {
// Default: false
bool ignore_range_deletions;
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
```
/cc danhhz
Closes https://github.com/facebook/rocksdb/pull/2265
Differential Revision: D5054262
Pulled By: yiwu-arbug
fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
7 years ago
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->1");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "b->2");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "c->3");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "d->4");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "e->5");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "f->6");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_TRUE(unseen.empty());
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure returning false in the table_filter hides the keys from that table
|
|
|
|
// during iteration.
|
|
|
|
{
|
|
|
|
ReadOptions opts;
|
|
|
|
opts.table_filter = [](const TableProperties& props) {
|
|
|
|
return props.num_entries != 2;
|
|
|
|
};
|
|
|
|
auto iter = NewIterator(opts);
|
expose a hook to skip tables during iteration
Summary:
As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true.
This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand.
We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in from hours to seconds.
FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin.
```diff
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1546,6 +1546,13 @@ struct ReadOptions {
// Default: false
bool ignore_range_deletions;
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
```
/cc danhhz
Closes https://github.com/facebook/rocksdb/pull/2265
Differential Revision: D5054262
Pulled By: yiwu-arbug
fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
7 years ago
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "a->1");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "d->4");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "e->5");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_EQ(IterStatus(iter), "f->6");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.max_sequential_skip_in_iterations = 3;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// write a bunch of kvs to the database.
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Put("y", "1"));
|
|
|
|
ASSERT_OK(Put("z", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Put("z", "1"));
|
|
|
|
ASSERT_OK(Put("bar", "1"));
|
|
|
|
ASSERT_OK(Put("foo", "1"));
|
|
|
|
ASSERT_OK(Put("foo", "2"));
|
|
|
|
|
|
|
|
ASSERT_OK(Put("foo", "3"));
|
|
|
|
ASSERT_OK(Put("foo", "4"));
|
|
|
|
ASSERT_OK(Put("foo", "5"));
|
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
|
|
|
ASSERT_OK(Put("foo", "6"));
|
|
|
|
|
|
|
|
std::string upper_bound = "x";
|
|
|
|
Slice ub_slice(upper_bound);
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.snapshot = snapshot;
|
|
|
|
ro.iterate_upper_bound = &ub_slice;
|
|
|
|
|
|
|
|
Iterator* iter = NewIterator(ro);
|
|
|
|
iter->SeekForPrev("goo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo", iter->key().ToString());
|
|
|
|
iter->Prev();
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bar", iter->key().ToString());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, SkipStatistics) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int skip_count = 0;
|
|
|
|
|
|
|
|
// write a bunch of kvs to the database.
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Put("b", "1"));
|
|
|
|
ASSERT_OK(Put("c", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("d", "1"));
|
|
|
|
ASSERT_OK(Put("e", "1"));
|
|
|
|
ASSERT_OK(Put("f", "1"));
|
|
|
|
ASSERT_OK(Put("a", "2"));
|
|
|
|
ASSERT_OK(Put("b", "2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Delete("d"));
|
|
|
|
ASSERT_OK(Delete("e"));
|
|
|
|
ASSERT_OK(Delete("f"));
|
|
|
|
|
|
|
|
Iterator* iter = NewIterator(ReadOptions());
|
|
|
|
int count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 3);
|
|
|
|
delete iter;
|
|
|
|
skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence
|
|
|
|
ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
|
|
|
|
|
|
|
|
iter = NewIterator(ReadOptions());
|
|
|
|
count = 0;
|
|
|
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 3);
|
|
|
|
delete iter;
|
|
|
|
skip_count += 8; // Same as above, but in reverse order
|
|
|
|
ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
|
|
|
|
|
|
|
|
ASSERT_OK(Put("aa", "1"));
|
|
|
|
ASSERT_OK(Put("ab", "1"));
|
|
|
|
ASSERT_OK(Put("ac", "1"));
|
|
|
|
ASSERT_OK(Put("ad", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Delete("ab"));
|
|
|
|
ASSERT_OK(Delete("ac"));
|
|
|
|
ASSERT_OK(Delete("ad"));
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
Slice prefix("b");
|
|
|
|
ro.iterate_upper_bound = &prefix;
|
|
|
|
|
|
|
|
iter = NewIterator(ro);
|
|
|
|
count = 0;
|
|
|
|
for(iter->Seek("aa"); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 1);
|
|
|
|
delete iter;
|
|
|
|
skip_count += 6; // 3 deletes + 3 original keys
|
|
|
|
ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
|
|
|
|
|
|
|
|
iter = NewIterator(ro);
|
|
|
|
count = 0;
|
|
|
|
for(iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 2);
|
|
|
|
delete iter;
|
|
|
|
// 3 deletes + 3 original keys + lower sequence of "a"
|
|
|
|
skip_count += 7;
|
|
|
|
ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ReadOptions ropts;
|
|
|
|
ropts.max_skippable_internal_keys = 2;
|
|
|
|
|
|
|
|
Put("1", "val_1");
|
|
|
|
// Add more tombstones than max_skippable_internal_keys so that Next() fails.
|
|
|
|
Delete("2");
|
|
|
|
Delete("3");
|
|
|
|
Delete("4");
|
|
|
|
Delete("5");
|
|
|
|
Put("6", "val_6");
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ropts));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key().ToString(), "1");
|
|
|
|
ASSERT_EQ(iter->value().ToString(), "val_1");
|
|
|
|
|
|
|
|
// This should fail as incomplete due to too many non-visible internal keys on
|
|
|
|
// the way to the next valid user key.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
ASSERT_TRUE(iter->status().IsIncomplete());
|
|
|
|
|
|
|
|
// Get the internal key at which Next() failed.
|
|
|
|
std::string prop_value;
|
|
|
|
ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
|
|
|
|
ASSERT_EQ("4", prop_value);
|
|
|
|
|
|
|
|
// Create a new iterator to seek to the internal key.
|
|
|
|
std::unique_ptr<Iterator> iter2(NewIterator(ropts));
|
|
|
|
iter2->Seek(prop_value);
|
|
|
|
ASSERT_TRUE(iter2->Valid());
|
|
|
|
ASSERT_OK(iter2->status());
|
|
|
|
|
|
|
|
ASSERT_EQ(iter2->key().ToString(), "6");
|
|
|
|
ASSERT_EQ(iter2->value().ToString(), "val_6");
|
|
|
|
}
|
|
|
|
|
Change and clarify the relationship between Valid(), status() and Seek*() for all iterators. Also fix some bugs
Summary:
Before this PR, Iterator/InternalIterator may simultaneously have non-ok status() and Valid() = true. That state means that the last operation failed, but the iterator is nevertheless positioned on some unspecified record. Likely intended uses of that are:
* If some sst files are corrupted, a normal iterator can be used to read the data from files that are not corrupted.
* When using read_tier = kBlockCacheTier, read the data that's in block cache, skipping over the data that is not.
However, this behavior wasn't documented well (and until recently the wiki on github had misleading incorrect information). In the code there's a lot of confusion about the relationship between status() and Valid(), and about whether Seek()/SeekToLast()/etc reset the status or not. There were a number of bugs caused by this confusion, both inside rocksdb and in the code that uses rocksdb (including ours).
This PR changes the convention to:
* If status() is not ok, Valid() always returns false.
* Any seek operation resets status. (Before the PR, it depended on iterator type and on particular error.)
This does sacrifice the two use cases listed above, but siying said it's ok.
Overview of the changes:
* A commit that adds missing status checks in MergingIterator. This fixes a bug that actually affects us, and we need it fixed. `DBIteratorTest.NonBlockingIterationBugRepro` explains the scenario.
* Changes to lots of iterator types to make all of them conform to the new convention. Some bug fixes along the way. By far the biggest changes are in DBIter, which is a big messy piece of code; I tried to make it less big and messy but mostly failed.
* A stress-test for DBIter, to gain some confidence that I didn't break it. It does a few million random operations on the iterator, while occasionally modifying the underlying data (like ForwardIterator does) and occasionally returning non-ok status from internal iterator.
To find the iterator types that needed changes I searched for "public .*Iterator" in the code. Here's an overview of all 27 iterator types:
Iterators that didn't need changes:
* status() is always ok(), or Valid() is always false: MemTableIterator, ModelIter, TestIterator, KVIter (2 classes with this name anonymous namespaces), LoggingForwardVectorIterator, VectorIterator, MockTableIterator, EmptyIterator, EmptyInternalIterator.
* Thin wrappers that always pass through Valid() and status(): ArenaWrappedDBIter, TtlIterator, InternalIteratorFromIterator.
Iterators with changes (see inline comments for details):
* DBIter - an overhaul:
- It used to silently skip corrupted keys (`FindParseableKey()`), which seems dangerous. This PR makes it just stop immediately after encountering a corrupted key, just like it would for other kinds of corruption. Let me know if there was actually some deeper meaning in this behavior and I should put it back.
- It had a few code paths silently discarding subiterator's status. The stress test caught a few.
- The backwards iteration code path was expecting the internal iterator's set of keys to be immutable. It's probably always true in practice at the moment, since ForwardIterator doesn't support backwards iteration, but this PR fixes it anyway. See added DBIteratorTest.ReverseToForwardBug for an example.
- Some parts of backwards iteration code path even did things like `assert(iter_->Valid())` after a seek, which is never a safe assumption.
- It used to not reset status on seek for some types of errors.
- Some simplifications and better comments.
- Some things got more complicated from the added error handling. I'm open to ideas for how to make it nicer.
* MergingIterator - check status after every operation on every subiterator, and in some places assert that valid subiterators have ok status.
* ForwardIterator - changed to the new convention, also slightly simplified.
* ForwardLevelIterator - fixed some bugs and simplified.
* LevelIterator - simplified.
* TwoLevelIterator - changed to the new convention. Also fixed a bug that would make SeekForPrev() sometimes silently ignore errors from first_level_iter_.
* BlockBasedTableIterator - minor changes.
* BlockIter - replaced `SetStatus()` with `Invalidate()` to make sure non-ok BlockIter is always invalid.
* PlainTableIterator - some seeks used to not reset status.
* CuckooTableIterator - tiny code cleanup.
* ManagedIterator - fixed some bugs.
* BaseDeltaIterator - changed to the new convention and fixed a bug.
* BlobDBIterator - seeks used to not reset status.
* KeyConvertingIterator - some small change.
Closes https://github.com/facebook/rocksdb/pull/3810
Differential Revision: D7888019
Pulled By: al13n321
fbshipit-source-id: 4aaf6d3421c545d16722a815b2fa2e7912bc851d
7 years ago
|
|
|
// Reproduces a former bug where iterator would skip some records when DBIter
|
|
|
|
// re-seeks subiterator with Incomplete status.
|
|
|
|
TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
// Make sure the sst file has more than one block.
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Two records in sst file, each in its own block.
|
|
|
|
Put("b", "");
|
|
|
|
Put("d", "");
|
|
|
|
Flush();
|
|
|
|
|
|
|
|
// Create a nonblocking iterator before writing to memtable.
|
|
|
|
ReadOptions ropt;
|
|
|
|
ropt.read_tier = kBlockCacheTier;
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ropt));
|
Change and clarify the relationship between Valid(), status() and Seek*() for all iterators. Also fix some bugs
Summary:
Before this PR, Iterator/InternalIterator may simultaneously have non-ok status() and Valid() = true. That state means that the last operation failed, but the iterator is nevertheless positioned on some unspecified record. Likely intended uses of that are:
* If some sst files are corrupted, a normal iterator can be used to read the data from files that are not corrupted.
* When using read_tier = kBlockCacheTier, read the data that's in block cache, skipping over the data that is not.
However, this behavior wasn't documented well (and until recently the wiki on github had misleading incorrect information). In the code there's a lot of confusion about the relationship between status() and Valid(), and about whether Seek()/SeekToLast()/etc reset the status or not. There were a number of bugs caused by this confusion, both inside rocksdb and in the code that uses rocksdb (including ours).
This PR changes the convention to:
* If status() is not ok, Valid() always returns false.
* Any seek operation resets status. (Before the PR, it depended on iterator type and on particular error.)
This does sacrifice the two use cases listed above, but siying said it's ok.
Overview of the changes:
* A commit that adds missing status checks in MergingIterator. This fixes a bug that actually affects us, and we need it fixed. `DBIteratorTest.NonBlockingIterationBugRepro` explains the scenario.
* Changes to lots of iterator types to make all of them conform to the new convention. Some bug fixes along the way. By far the biggest changes are in DBIter, which is a big messy piece of code; I tried to make it less big and messy but mostly failed.
* A stress-test for DBIter, to gain some confidence that I didn't break it. It does a few million random operations on the iterator, while occasionally modifying the underlying data (like ForwardIterator does) and occasionally returning non-ok status from internal iterator.
To find the iterator types that needed changes I searched for "public .*Iterator" in the code. Here's an overview of all 27 iterator types:
Iterators that didn't need changes:
* status() is always ok(), or Valid() is always false: MemTableIterator, ModelIter, TestIterator, KVIter (2 classes with this name anonymous namespaces), LoggingForwardVectorIterator, VectorIterator, MockTableIterator, EmptyIterator, EmptyInternalIterator.
* Thin wrappers that always pass through Valid() and status(): ArenaWrappedDBIter, TtlIterator, InternalIteratorFromIterator.
Iterators with changes (see inline comments for details):
* DBIter - an overhaul:
- It used to silently skip corrupted keys (`FindParseableKey()`), which seems dangerous. This PR makes it just stop immediately after encountering a corrupted key, just like it would for other kinds of corruption. Let me know if there was actually some deeper meaning in this behavior and I should put it back.
- It had a few code paths silently discarding subiterator's status. The stress test caught a few.
- The backwards iteration code path was expecting the internal iterator's set of keys to be immutable. It's probably always true in practice at the moment, since ForwardIterator doesn't support backwards iteration, but this PR fixes it anyway. See added DBIteratorTest.ReverseToForwardBug for an example.
- Some parts of backwards iteration code path even did things like `assert(iter_->Valid())` after a seek, which is never a safe assumption.
- It used to not reset status on seek for some types of errors.
- Some simplifications and better comments.
- Some things got more complicated from the added error handling. I'm open to ideas for how to make it nicer.
* MergingIterator - check status after every operation on every subiterator, and in some places assert that valid subiterators have ok status.
* ForwardIterator - changed to the new convention, also slightly simplified.
* ForwardLevelIterator - fixed some bugs and simplified.
* LevelIterator - simplified.
* TwoLevelIterator - changed to the new convention. Also fixed a bug that would make SeekForPrev() sometimes silently ignore errors from first_level_iter_.
* BlockBasedTableIterator - minor changes.
* BlockIter - replaced `SetStatus()` with `Invalidate()` to make sure non-ok BlockIter is always invalid.
* PlainTableIterator - some seeks used to not reset status.
* CuckooTableIterator - tiny code cleanup.
* ManagedIterator - fixed some bugs.
* BaseDeltaIterator - changed to the new convention and fixed a bug.
* BlobDBIterator - seeks used to not reset status.
* KeyConvertingIterator - some small change.
Closes https://github.com/facebook/rocksdb/pull/3810
Differential Revision: D7888019
Pulled By: al13n321
fbshipit-source-id: 4aaf6d3421c545d16722a815b2fa2e7912bc851d
7 years ago
|
|
|
|
|
|
|
// Overwrite a key in memtable many times to hit
|
|
|
|
// max_sequential_skip_in_iterations (which is 8 by default).
|
|
|
|
for (int i = 0; i < 20; ++i) {
|
|
|
|
Put("c", "");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load the second block in sst file into the block cache.
|
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter2(NewIterator(ReadOptions()));
|
Change and clarify the relationship between Valid(), status() and Seek*() for all iterators. Also fix some bugs
Summary:
Before this PR, Iterator/InternalIterator may simultaneously have non-ok status() and Valid() = true. That state means that the last operation failed, but the iterator is nevertheless positioned on some unspecified record. Likely intended uses of that are:
* If some sst files are corrupted, a normal iterator can be used to read the data from files that are not corrupted.
* When using read_tier = kBlockCacheTier, read the data that's in block cache, skipping over the data that is not.
However, this behavior wasn't documented well (and until recently the wiki on github had misleading incorrect information). In the code there's a lot of confusion about the relationship between status() and Valid(), and about whether Seek()/SeekToLast()/etc reset the status or not. There were a number of bugs caused by this confusion, both inside rocksdb and in the code that uses rocksdb (including ours).
This PR changes the convention to:
* If status() is not ok, Valid() always returns false.
* Any seek operation resets status. (Before the PR, it depended on iterator type and on particular error.)
This does sacrifice the two use cases listed above, but siying said it's ok.
Overview of the changes:
* A commit that adds missing status checks in MergingIterator. This fixes a bug that actually affects us, and we need it fixed. `DBIteratorTest.NonBlockingIterationBugRepro` explains the scenario.
* Changes to lots of iterator types to make all of them conform to the new convention. Some bug fixes along the way. By far the biggest changes are in DBIter, which is a big messy piece of code; I tried to make it less big and messy but mostly failed.
* A stress-test for DBIter, to gain some confidence that I didn't break it. It does a few million random operations on the iterator, while occasionally modifying the underlying data (like ForwardIterator does) and occasionally returning non-ok status from internal iterator.
To find the iterator types that needed changes I searched for "public .*Iterator" in the code. Here's an overview of all 27 iterator types:
Iterators that didn't need changes:
* status() is always ok(), or Valid() is always false: MemTableIterator, ModelIter, TestIterator, KVIter (2 classes with this name anonymous namespaces), LoggingForwardVectorIterator, VectorIterator, MockTableIterator, EmptyIterator, EmptyInternalIterator.
* Thin wrappers that always pass through Valid() and status(): ArenaWrappedDBIter, TtlIterator, InternalIteratorFromIterator.
Iterators with changes (see inline comments for details):
* DBIter - an overhaul:
- It used to silently skip corrupted keys (`FindParseableKey()`), which seems dangerous. This PR makes it just stop immediately after encountering a corrupted key, just like it would for other kinds of corruption. Let me know if there was actually some deeper meaning in this behavior and I should put it back.
- It had a few code paths silently discarding subiterator's status. The stress test caught a few.
- The backwards iteration code path was expecting the internal iterator's set of keys to be immutable. It's probably always true in practice at the moment, since ForwardIterator doesn't support backwards iteration, but this PR fixes it anyway. See added DBIteratorTest.ReverseToForwardBug for an example.
- Some parts of backwards iteration code path even did things like `assert(iter_->Valid())` after a seek, which is never a safe assumption.
- It used to not reset status on seek for some types of errors.
- Some simplifications and better comments.
- Some things got more complicated from the added error handling. I'm open to ideas for how to make it nicer.
* MergingIterator - check status after every operation on every subiterator, and in some places assert that valid subiterators have ok status.
* ForwardIterator - changed to the new convention, also slightly simplified.
* ForwardLevelIterator - fixed some bugs and simplified.
* LevelIterator - simplified.
* TwoLevelIterator - changed to the new convention. Also fixed a bug that would make SeekForPrev() sometimes silently ignore errors from first_level_iter_.
* BlockBasedTableIterator - minor changes.
* BlockIter - replaced `SetStatus()` with `Invalidate()` to make sure non-ok BlockIter is always invalid.
* PlainTableIterator - some seeks used to not reset status.
* CuckooTableIterator - tiny code cleanup.
* ManagedIterator - fixed some bugs.
* BaseDeltaIterator - changed to the new convention and fixed a bug.
* BlobDBIterator - seeks used to not reset status.
* KeyConvertingIterator - some small change.
Closes https://github.com/facebook/rocksdb/pull/3810
Differential Revision: D7888019
Pulled By: al13n321
fbshipit-source-id: 4aaf6d3421c545d16722a815b2fa2e7912bc851d
7 years ago
|
|
|
iter2->Seek("d");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finally seek the nonblocking iterator.
|
|
|
|
iter->Seek("a");
|
|
|
|
// With the bug, the status used to be OK, and the iterator used to point to
|
|
|
|
// "d".
|
|
|
|
EXPECT_TRUE(iter->status().IsIncomplete());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
|
|
|
|
Put("a", "");
|
|
|
|
Put("b", "");
|
|
|
|
Flush();
|
|
|
|
|
|
|
|
ReadOptions ropt;
|
|
|
|
Slice ub = "b";
|
|
|
|
ropt.iterate_upper_bound = &ub;
|
|
|
|
|
|
|
|
std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
|
|
|
|
it->SeekForPrev("a");
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
ASSERT_EQ("a", it->key().ToString());
|
|
|
|
it->Next();
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
it->SeekForPrev("a");
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ("a", it->key().ToString());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = CompressionType::kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 800;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::string random_str = RandomString(&rnd, 180);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("1", random_str));
|
|
|
|
ASSERT_OK(Put("2", random_str));
|
|
|
|
ASSERT_OK(Put("3", random_str));
|
|
|
|
ASSERT_OK(Put("4", random_str));
|
|
|
|
// A new block
|
|
|
|
ASSERT_OK(Put("5", random_str));
|
|
|
|
ASSERT_OK(Put("6", random_str));
|
|
|
|
ASSERT_OK(Put("7", random_str));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("8", random_str));
|
|
|
|
ASSERT_OK(Put("9", random_str));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
int num_find_file_in_level = 0;
|
|
|
|
int num_idx_blk_seek = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelIterator::Seek:BeforeFindFile",
|
|
|
|
[&](void* /*arg*/) { num_find_file_in_level++; });
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
|
|
|
|
iter->Seek("1");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(1, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("2");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(1, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("3");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(1, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(1, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("5");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(2, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("6");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(2, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("7");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(1, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(3, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("8");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(2, num_find_file_in_level);
|
|
|
|
// Still re-seek because "8" is the boundary key, which has
|
|
|
|
// the same user key as the seek key.
|
|
|
|
ASSERT_EQ(4, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Seek("5");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(3, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(5, num_idx_blk_seek);
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(3, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(5, num_idx_blk_seek);
|
|
|
|
|
|
|
|
// Seek backward never triggers the index block seek to be skipped
|
|
|
|
iter->Seek("5");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(3, num_find_file_in_level);
|
|
|
|
ASSERT_EQ(6, num_idx_blk_seek);
|
|
|
|
}
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
// MyRocks may change iterate bounds before seek. Simply test to make sure such
|
|
|
|
// usage doesn't break iterator.
|
|
|
|
TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = CompressionType::kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 100;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
std::string value(50, 'v');
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(Put("aaa", value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("bbb", "v"));
|
|
|
|
ASSERT_OK(Put("ccc", "v"));
|
|
|
|
ASSERT_OK(Put("ddd", "v"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("eee", "v"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
std::string ub1 = "e";
|
|
|
|
std::string ub2 = "c";
|
|
|
|
Slice ub(ub1);
|
|
|
|
ReadOptions read_opts1;
|
|
|
|
read_opts1.iterate_upper_bound = &ub;
|
|
|
|
Iterator* iter = NewIterator(read_opts1);
|
|
|
|
// Seek and iterate accross block boundary.
|
|
|
|
iter->Seek("b");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("bbb", iter->key());
|
|
|
|
ub = Slice(ub2);
|
|
|
|
iter->Seek("b");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("bbb", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
std::string lb1 = "a";
|
|
|
|
std::string lb2 = "c";
|
|
|
|
Slice lb(lb1);
|
|
|
|
ReadOptions read_opts2;
|
|
|
|
read_opts2.iterate_lower_bound = &lb;
|
|
|
|
iter = NewIterator(read_opts2);
|
|
|
|
iter->SeekForPrev("d");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("ccc", iter->key());
|
|
|
|
lb = Slice(lb2);
|
|
|
|
iter->SeekForPrev("d");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("ccc", iter->key());
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
|
|
|
|
ASSERT_OK(Put("aaa", "v"));
|
|
|
|
ASSERT_OK(Put("bbb", "v"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("ccc", "v"));
|
|
|
|
ASSERT_OK(Put("ddd", "v"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// Move both files to bottom level.
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
Slice lower_bound("b");
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.iterate_lower_bound = &lower_bound;
|
|
|
|
std::unique_ptr<Iterator> iter(NewIterator(read_opts));
|
|
|
|
iter->SeekForPrev("d");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("ccc", iter->key());
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("bbb", iter->key());
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
|
|
|
|
testing::Values(true, false));
|
|
|
|
|
|
|
|
// Tests how DBIter work with ReadCallback
|
|
|
|
class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
|
|
|
|
|
|
|
|
TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
|
|
|
|
class TestReadCallback : public ReadCallback {
|
|
|
|
public:
|
|
|
|
explicit TestReadCallback(SequenceNumber _max_visible_seq)
|
|
|
|
: ReadCallback(_max_visible_seq) {}
|
|
|
|
|
|
|
|
bool IsVisibleFullCheck(SequenceNumber seq) override {
|
|
|
|
return seq <= max_visible_seq_;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
ASSERT_OK(Put("foo", "v1"));
|
|
|
|
ASSERT_OK(Put("foo", "v2"));
|
|
|
|
ASSERT_OK(Put("foo", "v3"));
|
|
|
|
ASSERT_OK(Put("a", "va"));
|
|
|
|
ASSERT_OK(Put("z", "vz"));
|
|
|
|
SequenceNumber seq1 = db_->GetLatestSequenceNumber();
|
|
|
|
TestReadCallback callback1(seq1);
|
|
|
|
ASSERT_OK(Put("foo", "v4"));
|
|
|
|
ASSERT_OK(Put("foo", "v5"));
|
|
|
|
ASSERT_OK(Put("bar", "v7"));
|
|
|
|
|
|
|
|
SequenceNumber seq2 = db_->GetLatestSequenceNumber();
|
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
|
|
|
|
->cfd();
|
|
|
|
// The iterator are suppose to see data before seq1.
|
|
|
|
Iterator* iter =
|
|
|
|
dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
|
|
|
|
|
|
|
|
// Seek
|
|
|
|
// The latest value of "foo" before seq1 is "v3"
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
ASSERT_EQ("v3", iter->value());
|
|
|
|
// "bar" is not visible to the iterator. It will move on to the next key
|
|
|
|
// "foo".
|
|
|
|
iter->Seek("bar");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
ASSERT_EQ("v3", iter->value());
|
|
|
|
|
|
|
|
// Next
|
|
|
|
// Seek to "a"
|
|
|
|
iter->Seek("a");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("va", iter->value());
|
|
|
|
// "bar" is not visible to the iterator. It will move on to the next key
|
|
|
|
// "foo".
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
ASSERT_EQ("v3", iter->value());
|
|
|
|
|
|
|
|
// Prev
|
|
|
|
// Seek to "z"
|
|
|
|
iter->Seek("z");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("vz", iter->value());
|
|
|
|
// The previous key is "foo", which is visible to the iterator.
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
ASSERT_EQ("v3", iter->value());
|
|
|
|
// "bar" is not visible to the iterator. It will move on to the next key "a".
|
|
|
|
iter->Prev(); // skipping "bar"
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("a", iter->key());
|
|
|
|
ASSERT_EQ("va", iter->value());
|
|
|
|
|
|
|
|
// SeekForPrev
|
|
|
|
// The previous key is "foo", which is visible to the iterator.
|
|
|
|
iter->SeekForPrev("y");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
ASSERT_EQ("v3", iter->value());
|
|
|
|
// "bar" is not visible to the iterator. It will move on to the next key "a".
|
|
|
|
iter->SeekForPrev("bar");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("a", iter->key());
|
|
|
|
ASSERT_EQ("va", iter->value());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// Prev beyond max_sequential_skip_in_iterations
|
|
|
|
uint64_t num_versions =
|
|
|
|
CurrentOptions().max_sequential_skip_in_iterations + 10;
|
|
|
|
for (uint64_t i = 0; i < num_versions; i++) {
|
|
|
|
ASSERT_OK(Put("bar", ToString(i)));
|
|
|
|
}
|
|
|
|
SequenceNumber seq3 = db_->GetLatestSequenceNumber();
|
|
|
|
TestReadCallback callback2(seq3);
|
|
|
|
ASSERT_OK(Put("bar", "v8"));
|
|
|
|
SequenceNumber seq4 = db_->GetLatestSequenceNumber();
|
|
|
|
|
|
|
|
// The iterator is suppose to see data before seq3.
|
|
|
|
iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2);
|
|
|
|
// Seek to "z", which is visible.
|
|
|
|
iter->Seek("z");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("vz", iter->value());
|
|
|
|
// Previous key is "foo" and the last value "v5" is visible.
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
ASSERT_EQ("v5", iter->value());
|
|
|
|
// Since the number of values of "bar" is more than
|
|
|
|
// max_sequential_skip_in_iterations, Prev() will ultimately fallback to
|
|
|
|
// seek in forward direction. Here we test the fallback seek is correct.
|
|
|
|
// The last visible value should be (num_versions - 1), as "v8" is not
|
|
|
|
// visible.
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ("bar", iter->key());
|
|
|
|
ASSERT_EQ(ToString(num_versions - 1), iter->value());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|