From c2bd8f4824bda98db8699f1e08d6969cf21ef86f Mon Sep 17 00:00:00 2001
From: Tyler Harter
Date: Thu, 22 Aug 2013 16:06:50 -0700
Subject: [PATCH] Prefix scan: db_bench and bug fixes

Summary: If use_prefix_filters is set and read_range>1, then the random seeks
will set the prefix filter to be the prefix of the key which was randomly
selected as the target.

Still need to add statistics (perhaps in a separate diff).

Test Plan:
./db_bench --benchmarks=fillseq,prefixscanrandom --num=10000000
--statistics=1 --use_prefix_blooms=1 --use_prefix_api=1 --bloom_bits=10

Reviewers: dhruba

Reviewed By: dhruba

CC: leveldb, haobo

Differential Revision: https://reviews.facebook.net/D12273
---
 db/db_bench.cc               | 61 ++++++++++++++++++++++++++++++++++--
 db/table_cache.cc            | 18 +++++++++++
 db/table_cache.h             |  6 ++++
 db/version_set.cc            | 48 +++++++++++++++++++++++++---
 db/version_set.h             |  2 ++
 include/leveldb/statistics.h | 10 +++++-
 include/leveldb/status.h     |  2 +-
 table/filter_block.cc        | 11 +++++--
 table/table.cc               | 20 +++++++++---
 table/table.h                |  2 +-
 10 files changed, 164 insertions(+), 16 deletions(-)

diff --git a/db/db_bench.cc b/db/db_bench.cc
index 56ddd2356..99f1549a2 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -41,8 +41,9 @@
 //      readrandom            -- read N times in random order
 //      readmissing           -- read N missing keys in random order
 //      readhot               -- read N times in random order from 1% section of DB
-//      readwhilewriting -- 1 writer, N threads doing random reads
-//      readrandomwriterandom - N threads doing random-read, random-write
+//      readwhilewriting      -- 1 writer, N threads doing random reads
+//      readrandomwriterandom -- N threads doing random-read, random-write
+//      prefixscanrandom      -- prefix scan N times in random order
 //      updaterandom          -- N threads doing read-modify-write for random keys
 //      appendrandom          -- N threads doing read-modify-write with growing values
 //      mergerandom           -- same as updaterandom/appendrandom using merge operator
@@ -95,6 +96,13 @@ static long FLAGS_reads = -1;
 // When ==1 reads use ::Get, when >1 reads use an iterator
 static long FLAGS_read_range = 1;
 
+// Whether to place prefixes in blooms
+static bool FLAGS_use_prefix_blooms = false;
+
+// Whether to set ReadOptions.prefix for prefixscanrandom. If this is
+// true, use_prefix_blooms must also be true.
+static bool FLAGS_use_prefix_api = false;
+
 // Seed base for random number generators. When 0 it is deterministic.
 static long FLAGS_seed = 0;
@@ -631,6 +639,7 @@ class Benchmark {
  private:
   shared_ptr<Cache> cache_;
   const FilterPolicy* filter_policy_;
+  const SliceTransform* prefix_extractor_;
   DB* db_;
   long num_;
   int value_size_;
@@ -773,6 +782,7 @@ class Benchmark {
     filter_policy_(FLAGS_bloom_bits >= 0
                    ? NewBloomFilterPolicy(FLAGS_bloom_bits)
                    : nullptr),
+    prefix_extractor_(NewFixedPrefixTransform(FLAGS_key_size-1)),
     db_(nullptr),
     num_(FLAGS_num),
     value_size_(FLAGS_value_size),
@@ -799,6 +809,7 @@ class Benchmark {
   ~Benchmark() {
     delete db_;
     delete filter_policy_;
+    delete prefix_extractor_;
   }
 
   //this function will construct string format for key. e.g "%016d"
e.g "%016d" @@ -894,6 +905,8 @@ class Benchmark { } else if (name == Slice("readrandomsmall")) { reads_ /= 1000; method = &Benchmark::ReadRandom; + } else if (name == Slice("prefixscanrandom")) { + method = &Benchmark::PrefixScanRandom; } else if (name == Slice("deleteseq")) { method = &Benchmark::DeleteSeq; } else if (name == Slice("deleterandom")) { @@ -1146,6 +1159,8 @@ class Benchmark { FLAGS_compaction_universal_min_merge_width; options.block_size = FLAGS_block_size; options.filter_policy = filter_policy_; + options.prefix_extractor = FLAGS_use_prefix_blooms ? prefix_extractor_ + : nullptr; options.max_open_files = FLAGS_open_files; options.statistics = dbstats; options.env = FLAGS_env; @@ -1467,6 +1482,41 @@ class Benchmark { thread->stats.AddMessage(msg); } + void PrefixScanRandom(ThreadState* thread) { + if (FLAGS_use_prefix_api) { + assert(FLAGS_use_prefix_blooms); + assert(FLAGS_bloom_bits >= 1); + } + + ReadOptions options(FLAGS_verify_checksum, true); + Duration duration(FLAGS_duration, reads_); + + long found = 0; + + while (!duration.Done(1)) { + std::string value; + const int k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + Slice skey(key.get()); + Slice prefix = prefix_extractor_->Transform(skey); + options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr; + + Iterator* iter = db_->NewIterator(options); + for (iter->Seek(skey); + iter->Valid() && iter->key().starts_with(prefix); + iter->Next()) { + found++; + } + delete iter; + + thread->stats.FinishedSingleOp(db_); + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%ld of %ld found)", found, reads_); + thread->stats.AddMessage(msg); + } + void ReadMissing(ThreadState* thread) { FLAGS_warn_missing_keys = false; // Never warn about missing keys @@ -2170,6 +2220,13 @@ int main(int argc, char** argv) { FLAGS_reads = n; } else if (sscanf(argv[i], "--read_range=%d%c", &n, &junk) == 1) { FLAGS_read_range = n; + + } else if (sscanf(argv[i], "--use_prefix_blooms=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_prefix_blooms = n; + } else if (sscanf(argv[i], "--use_prefix_api=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_prefix_api = n; } else if (sscanf(argv[i], "--duration=%d%c", &n, &junk) == 1) { FLAGS_duration = n; } else if (sscanf(argv[i], "--seed=%ld%c", &l, &junk) == 1) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 02408d95c..2d1a4bf39 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -135,6 +135,24 @@ Status TableCache::Get(const ReadOptions& options, return s; } +bool TableCache::PrefixMayMatch(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + const Slice& internal_prefix, + bool* table_io) { + Cache::Handle* handle = nullptr; + Status s = FindTable(storage_options_, file_number, + file_size, &handle, table_io); + bool may_match = true; + if (s.ok()) { + Table* t = + reinterpret_cast(cache_->Value(handle)); + may_match = t->PrefixMayMatch(internal_prefix); + cache_->Release(handle); + } + return may_match; +} + void TableCache::Evict(uint64_t file_number) { char buf[sizeof(file_number)]; EncodeFixed64(buf, file_number); diff --git a/db/table_cache.h b/db/table_cache.h index 2f3787609..6b7b4b53a 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -52,6 +52,12 @@ class TableCache { void (*mark_key_may_exist)(void*) = nullptr, const bool no_io = false); + // Determine whether the table may contain the specified prefix. 
+  // the table index or blooms are not in memory, this may cause an I/O
+  bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
+                      uint64_t file_size, const Slice& internal_prefix,
+                      bool* table_io);
+
   // Evict any entry for the specified file number
   void Evict(uint64_t file_number);
diff --git a/db/version_set.cc b/db/version_set.cc
index 29741ba14..bf9376e84 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -189,7 +189,14 @@ static Iterator* GetFileIterator(void* arg,
     return NewErrorIterator(
         Status::Corruption("FileReader invoked with unexpected value"));
   } else {
-    return cache->NewIterator(options,
+    ReadOptions options_copy;
+    if (options.prefix) {
+      // suppress prefix filtering since we have already checked the
+      // filters once at this point
+      options_copy = options;
+      options_copy.prefix = nullptr;
+    }
+    return cache->NewIterator(options.prefix ? options_copy : options,
                               soptions,
                               DecodeFixed64(file_value.data()),
                               DecodeFixed64(file_value.data() + 8),
@@ -198,12 +205,45 @@ static Iterator* GetFileIterator(void* arg,
   }
 }
 
+bool Version::PrefixMayMatch(const ReadOptions& options,
+                             const EnvOptions& soptions,
+                             const Slice& internal_prefix,
+                             Iterator* level_iter) const {
+  bool may_match = true;
+  level_iter->Seek(internal_prefix);
+  if (!level_iter->Valid()) {
+    // we're past end of level
+    may_match = false;
+  } else if (ExtractUserKey(level_iter->key()).starts_with(
+                 ExtractUserKey(internal_prefix))) {
+    // TODO(tylerharter): do we need this case? Or are we guaranteed
+    // key() will always be the biggest value for this SST?
+    may_match = true;
+  } else {
+    may_match = vset_->table_cache_->PrefixMayMatch(
+        options,
+        DecodeFixed64(level_iter->value().data()),
+        DecodeFixed64(level_iter->value().data() + 8),
+        internal_prefix, nullptr);
+  }
+  return may_match;
+}
+
 Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
                                             const EnvOptions& soptions,
                                             int level) const {
-  return NewTwoLevelIterator(
-      new LevelFileNumIterator(vset_->icmp_, &files_[level]),
-      &GetFileIterator, vset_->table_cache_, options, soptions);
+  Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]);
+  if (options.prefix) {
+    InternalKey internal_prefix(*options.prefix, 0, kTypeValue);
+    if (!PrefixMayMatch(options, soptions,
+                        internal_prefix.Encode(), level_iter)) {
+      delete level_iter;
+      // nothing in this level can match the prefix
+      return NewEmptyIterator();
+    }
+  }
+  return NewTwoLevelIterator(level_iter, &GetFileIterator,
+                             vset_->table_cache_, options, soptions);
 }
 
 void Version::AddIterators(const ReadOptions& options,
diff --git a/db/version_set.h b/db/version_set.h
index ddfd17cdc..9a7aeb20b 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -152,6 +152,8 @@ class Version {
   Iterator* NewConcatenatingIterator(const ReadOptions&,
                                      const EnvOptions& soptions,
                                      int level) const;
+  bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
+                      const Slice& internal_prefix, Iterator* level_iter) const;
 
   VersionSet* vset_;            // VersionSet to which this Version belongs
   Version* next_;               // Next version in linked list
diff --git a/include/leveldb/statistics.h b/include/leveldb/statistics.h
index 9a36ffccf..78643ff67 100644
--- a/include/leveldb/statistics.h
+++ b/include/leveldb/statistics.h
@@ -62,13 +62,21 @@ enum Tickers {
   NUMBER_MERGE_FAILURES = 22,
   SEQUENCE_NUMBER = 23,
 
-  TICKER_ENUM_MAX = 24
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED = 24,
+  BLOOM_FILTER_PREFIX_USEFUL = 25,
+
+  TICKER_ENUM_MAX = 26
 };
 
 const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
   { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
   { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
   { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
+  { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
+  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
   { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
   { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
   { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
diff --git a/include/leveldb/status.h b/include/leveldb/status.h
index a20351f32..e41adb39f 100644
--- a/include/leveldb/status.h
+++ b/include/leveldb/status.h
@@ -91,7 +91,7 @@ class Status {
     kNotSupported = 3,
     kInvalidArgument = 4,
     kIOError = 5,
-    kMergeInProgress = 6
+    kMergeInProgress = 6,
   };
 
   Code code() const {
diff --git a/table/filter_block.cc b/table/filter_block.cc
index 5f687ebc7..337cbacd8 100644
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@@ -45,6 +45,7 @@ bool FilterBlockBuilder::SamePrefix(const Slice &key1,
 }
 
 void FilterBlockBuilder::AddKey(const Slice& key) {
+  // get slice for most recently added entry
   Slice prev;
   if (start_.size() > 0) {
     size_t prev_start = start_[start_.size() - 1];
@@ -53,17 +54,21 @@ void FilterBlockBuilder::AddKey(const Slice& key) {
     prev = Slice(base, length);
   }
 
+  // add key to filter if needed
   if (whole_key_filtering_) {
     start_.push_back(entries_.size());
     entries_.append(key.data(), key.size());
   }
 
-  if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
+  // add prefix to filter if needed
+  Slice user_key = ExtractUserKey(key);
+  if (prefix_extractor_ && prefix_extractor_->InDomain(user_key)) {
     // this assumes prefix(prefix(key)) == prefix(key), as the last
     // entry in entries_ may be either a key or prefix, and we use
     // prefix(last entry) to get the prefix of the last key.
-    if (prev.size() == 0 || ! SamePrefix(key, prev)) {
-      Slice prefix = prefix_extractor_->Transform(key);
+    if (prev.size() == 0 ||
+        !SamePrefix(user_key, ExtractUserKey(prev))) {
+      Slice prefix = prefix_extractor_->Transform(user_key);
       InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
       Slice internal_prefix = internal_prefix_tmp.Encode();
       assert(comparator_->Compare(internal_prefix, key) <= 0);
diff --git a/table/table.cc b/table/table.cc
index 9a5f1b761..31c2d2679 100644
--- a/table/table.cc
+++ b/table/table.cc
@@ -328,6 +328,11 @@ Iterator* Table::BlockReader(void* arg,
 // 1) key.starts_with(prefix(key))
 // 2) Compare(prefix(key), key) <= 0.
 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// TODO(tylerharter): right now, this won't cause I/O since blooms are
+// in memory. When blooms may need to be paged in, we should refactor so that
+// this is only ever called lazily. In particular, this shouldn't be called
+// while the DB lock is held like it is now.
 bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
   FilterBlockReader* filter = rep_->filter;
   bool may_match = true;
@@ -337,12 +342,14 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
     return true;
   }
 
-  Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
+  std::unique_ptr<Iterator> iiter(rep_->index_block->NewIterator(
+      rep_->options.comparator));
   iiter->Seek(internal_prefix);
-  if (! iiter->Valid()) {
+  if (!iiter->Valid()) {
     // we're past end of file
     may_match = false;
-  } else if (iiter->key().starts_with(internal_prefix)) {
+  } else if (ExtractUserKey(iiter->key()).starts_with(
+                 ExtractUserKey(internal_prefix))) {
     // we need to check for this subtle case because our only
     // guarantee is that "the key is a string >= last key in that data
     // block" according to the doc/table_format.txt spec.
@@ -366,7 +373,12 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
     assert(s.ok());
     may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix);
   }
-  delete iiter;
+
+  RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_CHECKED);
+  if (!may_match) {
+    RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_USEFUL);
+  }
+
   return may_match;
 }
diff --git a/table/table.h b/table/table.h
index c2cb5a32b..b1b2dd57c 100644
--- a/table/table.h
+++ b/table/table.h
@@ -47,7 +47,7 @@ class Table {
 
   ~Table();
 
-  bool PrefixMayMatch(const Slice& prefix) const;
+  bool PrefixMayMatch(const Slice& internal_prefix) const;
 
   // Returns a new iterator over the table contents.
   // The result of NewIterator() is initially invalid (caller must