diff --git a/db/db_impl.cc b/db/db_impl.cc index dd526392b..6fd319698 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2454,7 +2454,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect together all needed child iterators for mem std::vector list; mem_->Ref(); - list.push_back(mem_->NewIterator(options.prefix)); + list.push_back(mem_->NewIterator(options)); cleanup->mem.push_back(mem_); @@ -2464,7 +2464,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, for (unsigned int i = 0; i < immutables.size(); i++) { MemTable* m = immutables[i]; m->Ref(); - list.push_back(m->NewIterator(options.prefix)); + list.push_back(m->NewIterator(options)); cleanup->mem.push_back(m); } diff --git a/db/memtable.cc b/db/memtable.cc index ab9ef46e3..291899c21 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -84,11 +84,16 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - explicit MemTableIterator(MemTableRep* table) - : iter_(table->GetIterator()) { } - - MemTableIterator(MemTableRep* table, const Slice* prefix) - : iter_(table->GetPrefixIterator(*prefix)) { } + MemTableIterator(MemTableRep* table, const ReadOptions& options) + : iter_() { + if (options.prefix) { + iter_ = table->GetPrefixIterator(*options.prefix); + } else if (options.prefix_seek) { + iter_ = table->GetDynamicPrefixIterator(); + } else { + iter_ = table->GetIterator(); + } + } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } @@ -115,12 +120,8 @@ class MemTableIterator: public Iterator { void operator=(const MemTableIterator&); }; -Iterator* MemTable::NewIterator(const Slice* prefix) { - if (prefix) { - return new MemTableIterator(table_.get(), prefix); - } else { - return new MemTableIterator(table_.get()); - } +Iterator* MemTable::NewIterator(const ReadOptions& options) { + return new MemTableIterator(table_.get(), options); } port::RWMutex* MemTable::GetLock(const Slice& key) { diff --git a/db/memtable.h b/db/memtable.h index 1a10f8088..93b9b7e2c 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -65,10 +65,14 @@ class MemTable { // iterator are internal keys encoded by AppendInternalKey in the // db/dbformat.{h,cc} module. // - // If a prefix is supplied, it is passed to the underlying MemTableRep as a - // hint that the iterator only need to support access to keys with that - // prefix. - Iterator* NewIterator(const Slice* prefix = nullptr); + // If options.prefix is supplied, it is passed to the underlying MemTableRep + // as a hint that the iterator only need to support access to keys with that + // specific prefix. + // If options.prefix is not supplied and options.prefix_seek is set, the + // iterator is not bound to a specific prefix. However, the semantics of + // Seek is changed - the result might only include keys with the same prefix + // as the seek-key. + Iterator* NewIterator(const ReadOptions& options = ReadOptions()); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 48238d52e..eede170f0 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -75,8 +75,6 @@ class TestKeyComparator : public Comparator { virtual void FindShortSuccessor(std::string* key) const {} - private: - }; class PrefixTest { @@ -116,6 +114,93 @@ class PrefixTest { Options options; }; +TEST(PrefixTest, DynamicPrefixIterator) { + + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } + + // insert x random prefix, each with y continuous element. + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); + + ASSERT_OK(db->Put(write_options, key, value)); + } + } + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + if (FLAGS_use_prefix_hash_memtable) { + read_options.prefix_seek = true; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); +} TEST(PrefixTest, PrefixHash) { diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 1b3a71cc3..d01c35a19 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -130,6 +130,12 @@ class MemTableRep { return GetIterator(); } + // Return an iterator that has a special Seek semantics. The result of + // a Seek might only include keys with the same prefix as the target key. + virtual std::shared_ptr GetDynamicPrefixIterator() { + return GetIterator(); + } + protected: // When *key is an internal key concatenated with the value, returns the // user key. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 08f6cf0c2..8eca47133 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -649,6 +649,10 @@ struct ReadOptions { // Default: true bool fill_cache; + // If this option is set and memtable implementation allows, Seek + // might only return keys with the same prefix as the seek-key + bool prefix_seek; + // If "snapshot" is non-nullptr, read as of the supplied snapshot // (which must belong to the DB that is being read and which must // not have been released). If "snapshot" is nullptr, use an impliicit @@ -678,13 +682,14 @@ struct ReadOptions { ReadOptions() : verify_checksums(false), fill_cache(true), + prefix_seek(false), snapshot(nullptr), prefix(nullptr), read_tier(kReadAllTier) { } ReadOptions(bool cksum, bool cache) : verify_checksums(cksum), fill_cache(cache), - snapshot(nullptr), prefix(nullptr), + prefix_seek(false), snapshot(nullptr), prefix(nullptr), read_tier(kReadAllTier) { } }; diff --git a/util/transformrep.cc b/util/transformrep.cc index 9d620f9bf..4c7df1321 100644 --- a/util/transformrep.cc +++ b/util/transformrep.cc @@ -51,10 +51,16 @@ class TransformRep : public MemTableRep { virtual std::shared_ptr GetIterator( const Slice& slice) override; + virtual std::shared_ptr GetDynamicPrefixIterator() + override { + return std::make_shared(*this); + } + std::shared_ptr GetTransformIterator( const Slice& transformed); private: + friend class DynamicPrefixIterator; typedef std::set Bucket; typedef std::unordered_map> BucketMap; @@ -148,6 +154,80 @@ class TransformRep : public MemTableRep { private: const ReadLock l_; }; + + + class DynamicPrefixIterator : public MemTableRep::Iterator { + private: + // the underlying memtable rep + const TransformRep& memtable_rep_; + // the result of a prefix seek + std::unique_ptr bucket_iterator_; + + public: + explicit DynamicPrefixIterator(const TransformRep& memtable_rep) + : memtable_rep_(memtable_rep) {} + + virtual ~DynamicPrefixIterator() { }; + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return bucket_iterator_ && bucket_iterator_->Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return bucket_iterator_->key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + bucket_iterator_->Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + bucket_iterator_->Prev(); + } + + // Advance to the first entry with a key >= target within the + // same bucket as target + virtual void Seek(const char* target) { + Slice prefix = memtable_rep_.transform_->Transform( + memtable_rep_.UserKey(target)); + + ReadLock l(&memtable_rep_.rwlock_); + auto bucket = memtable_rep_.buckets_.find(prefix); + if (bucket == memtable_rep_.buckets_.end()) { + bucket_iterator_.reset(nullptr); + } else { + bucket_iterator_.reset( + new TransformIterator(bucket->second, memtable_rep_.GetLock(prefix))); + bucket_iterator_->Seek(target); + } + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + bucket_iterator_.reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + bucket_iterator_.reset(nullptr); + } + }; }; class PrefixHashRep : public TransformRep {