[RocksDB] Generalize prefix-aware iterator to be used for more than one Seek

Summary: Added a prefix_seek flag in ReadOptions to indicate that Seek is prefix aware(might not return data with different prefix), and also not bound to a specific prefix. Multiple Seeks and range scans can be invoked on the same iterator. If a specific prefix is specified, this flag will be ignored. Just a quick prototype that works for PrefixHashRep, the new lockless memtable could be easily extended with this support too.

Test Plan: test it on Leaf

Reviewers: dhruba, kailiu, sdong, igor

Reviewed By: igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D13929
main
Haobo Xu 11 years ago
parent c2be2cba04
commit fd2044883a
  1. 4
      db/db_impl.cc
  2. 23
      db/memtable.cc
  3. 12
      db/memtable.h
  4. 89
      db/prefix_test.cc
  5. 6
      include/rocksdb/memtablerep.h
  6. 7
      include/rocksdb/options.h
  7. 80
      util/transformrep.cc

@ -2454,7 +2454,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
// Collect together all needed child iterators for mem // Collect together all needed child iterators for mem
std::vector<Iterator*> list; std::vector<Iterator*> list;
mem_->Ref(); mem_->Ref();
list.push_back(mem_->NewIterator(options.prefix)); list.push_back(mem_->NewIterator(options));
cleanup->mem.push_back(mem_); cleanup->mem.push_back(mem_);
@ -2464,7 +2464,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
for (unsigned int i = 0; i < immutables.size(); i++) { for (unsigned int i = 0; i < immutables.size(); i++) {
MemTable* m = immutables[i]; MemTable* m = immutables[i];
m->Ref(); m->Ref();
list.push_back(m->NewIterator(options.prefix)); list.push_back(m->NewIterator(options));
cleanup->mem.push_back(m); cleanup->mem.push_back(m);
} }

@ -84,11 +84,16 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator: public Iterator { class MemTableIterator: public Iterator {
public: public:
explicit MemTableIterator(MemTableRep* table) MemTableIterator(MemTableRep* table, const ReadOptions& options)
: iter_(table->GetIterator()) { } : iter_() {
if (options.prefix) {
MemTableIterator(MemTableRep* table, const Slice* prefix) iter_ = table->GetPrefixIterator(*options.prefix);
: iter_(table->GetPrefixIterator(*prefix)) { } } else if (options.prefix_seek) {
iter_ = table->GetDynamicPrefixIterator();
} else {
iter_ = table->GetIterator();
}
}
virtual bool Valid() const { return iter_->Valid(); } virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
@ -115,12 +120,8 @@ class MemTableIterator: public Iterator {
void operator=(const MemTableIterator&); void operator=(const MemTableIterator&);
}; };
Iterator* MemTable::NewIterator(const Slice* prefix) { Iterator* MemTable::NewIterator(const ReadOptions& options) {
if (prefix) { return new MemTableIterator(table_.get(), options);
return new MemTableIterator(table_.get(), prefix);
} else {
return new MemTableIterator(table_.get());
}
} }
port::RWMutex* MemTable::GetLock(const Slice& key) { port::RWMutex* MemTable::GetLock(const Slice& key) {

@ -65,10 +65,14 @@ class MemTable {
// iterator are internal keys encoded by AppendInternalKey in the // iterator are internal keys encoded by AppendInternalKey in the
// db/dbformat.{h,cc} module. // db/dbformat.{h,cc} module.
// //
// If a prefix is supplied, it is passed to the underlying MemTableRep as a // If options.prefix is supplied, it is passed to the underlying MemTableRep
// hint that the iterator only need to support access to keys with that // as a hint that the iterator only need to support access to keys with that
// prefix. // specific prefix.
Iterator* NewIterator(const Slice* prefix = nullptr); // If options.prefix is not supplied and options.prefix_seek is set, the
// iterator is not bound to a specific prefix. However, the semantics of
// Seek is changed - the result might only include keys with the same prefix
// as the seek-key.
Iterator* NewIterator(const ReadOptions& options = ReadOptions());
// Add an entry into memtable that maps key to value at the // Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type. // specified sequence number and with the specified type.

@ -75,8 +75,6 @@ class TestKeyComparator : public Comparator {
virtual void FindShortSuccessor(std::string* key) const {} virtual void FindShortSuccessor(std::string* key) const {}
private:
}; };
class PrefixTest { class PrefixTest {
@ -116,6 +114,93 @@ class PrefixTest {
Options options; Options options;
}; };
TEST(PrefixTest, DynamicPrefixIterator) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
// insert x random prefix, each with y continuous element.
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(sorted);
ASSERT_OK(db->Put(write_options, key, value));
}
}
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
if (FLAGS_use_prefix_hash_memtable) {
read_options.prefix_seek = true;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
for (auto prefix : prefixes) {
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
}
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 100;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
TEST(PrefixTest, PrefixHash) { TEST(PrefixTest, PrefixHash) {

@ -130,6 +130,12 @@ class MemTableRep {
return GetIterator(); return GetIterator();
} }
// Return an iterator that has a special Seek semantics. The result of
// a Seek might only include keys with the same prefix as the target key.
virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() {
return GetIterator();
}
protected: protected:
// When *key is an internal key concatenated with the value, returns the // When *key is an internal key concatenated with the value, returns the
// user key. // user key.

@ -649,6 +649,10 @@ struct ReadOptions {
// Default: true // Default: true
bool fill_cache; bool fill_cache;
// If this option is set and memtable implementation allows, Seek
// might only return keys with the same prefix as the seek-key
bool prefix_seek;
// If "snapshot" is non-nullptr, read as of the supplied snapshot // If "snapshot" is non-nullptr, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must // (which must belong to the DB that is being read and which must
// not have been released). If "snapshot" is nullptr, use an impliicit // not have been released). If "snapshot" is nullptr, use an impliicit
@ -678,13 +682,14 @@ struct ReadOptions {
ReadOptions() ReadOptions()
: verify_checksums(false), : verify_checksums(false),
fill_cache(true), fill_cache(true),
prefix_seek(false),
snapshot(nullptr), snapshot(nullptr),
prefix(nullptr), prefix(nullptr),
read_tier(kReadAllTier) { read_tier(kReadAllTier) {
} }
ReadOptions(bool cksum, bool cache) : ReadOptions(bool cksum, bool cache) :
verify_checksums(cksum), fill_cache(cache), verify_checksums(cksum), fill_cache(cache),
snapshot(nullptr), prefix(nullptr), prefix_seek(false), snapshot(nullptr), prefix(nullptr),
read_tier(kReadAllTier) { read_tier(kReadAllTier) {
} }
}; };

@ -51,10 +51,16 @@ class TransformRep : public MemTableRep {
virtual std::shared_ptr<MemTableRep::Iterator> GetIterator( virtual std::shared_ptr<MemTableRep::Iterator> GetIterator(
const Slice& slice) override; const Slice& slice) override;
virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
override {
return std::make_shared<DynamicPrefixIterator>(*this);
}
std::shared_ptr<MemTableRep::Iterator> GetTransformIterator( std::shared_ptr<MemTableRep::Iterator> GetTransformIterator(
const Slice& transformed); const Slice& transformed);
private: private:
friend class DynamicPrefixIterator;
typedef std::set<const char*, Compare> Bucket; typedef std::set<const char*, Compare> Bucket;
typedef std::unordered_map<Slice, std::shared_ptr<Bucket>> BucketMap; typedef std::unordered_map<Slice, std::shared_ptr<Bucket>> BucketMap;
@ -148,6 +154,80 @@ class TransformRep : public MemTableRep {
private: private:
const ReadLock l_; const ReadLock l_;
}; };
class DynamicPrefixIterator : public MemTableRep::Iterator {
private:
// the underlying memtable rep
const TransformRep& memtable_rep_;
// the result of a prefix seek
std::unique_ptr<MemTableRep::Iterator> bucket_iterator_;
public:
explicit DynamicPrefixIterator(const TransformRep& memtable_rep)
: memtable_rep_(memtable_rep) {}
virtual ~DynamicPrefixIterator() { };
// Returns true iff the iterator is positioned at a valid node.
virtual bool Valid() const {
return bucket_iterator_ && bucket_iterator_->Valid();
}
// Returns the key at the current position.
// REQUIRES: Valid()
virtual const char* key() const {
assert(Valid());
return bucket_iterator_->key();
}
// Advances to the next position.
// REQUIRES: Valid()
virtual void Next() {
assert(Valid());
bucket_iterator_->Next();
}
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() {
assert(Valid());
bucket_iterator_->Prev();
}
// Advance to the first entry with a key >= target within the
// same bucket as target
virtual void Seek(const char* target) {
Slice prefix = memtable_rep_.transform_->Transform(
memtable_rep_.UserKey(target));
ReadLock l(&memtable_rep_.rwlock_);
auto bucket = memtable_rep_.buckets_.find(prefix);
if (bucket == memtable_rep_.buckets_.end()) {
bucket_iterator_.reset(nullptr);
} else {
bucket_iterator_.reset(
new TransformIterator(bucket->second, memtable_rep_.GetLock(prefix)));
bucket_iterator_->Seek(target);
}
}
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToFirst() {
// Prefix iterator does not support total order.
// We simply set the iterator to invalid state
bucket_iterator_.reset(nullptr);
}
// Position at the last entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToLast() {
// Prefix iterator does not support total order.
// We simply set the iterator to invalid state
bucket_iterator_.reset(nullptr);
}
};
}; };
class PrefixHashRep : public TransformRep { class PrefixHashRep : public TransformRep {

Loading…
Cancel
Save