Prefix scan: db_bench and bug fixes

Summary: If use_prefix_filters is set and read_range>1, then the random seeks will set the prefix filter to be the prefix of the key which was randomly selected as the target.  Still need to add statistics (perhaps in a separate diff).

Test Plan: ./db_bench --benchmarks=fillseq,prefixscanrandom --num=10000000 --statistics=1 --use_prefix_blooms=1 --use_prefix_api=1 --bloom_bits=10

Reviewers: dhruba

Reviewed By: dhruba

CC: leveldb, haobo

Differential Revision: https://reviews.facebook.net/D12273
main
Tyler Harter 11 years ago
parent 60bf2b7d4a
commit c2bd8f4824
  1. 61
      db/db_bench.cc
  2. 18
      db/table_cache.cc
  3. 6
      db/table_cache.h
  4. 48
      db/version_set.cc
  5. 2
      db/version_set.h
  6. 10
      include/leveldb/statistics.h
  7. 2
      include/leveldb/status.h
  8. 11
      table/filter_block.cc
  9. 20
      table/table.cc
  10. 2
      table/table.h

@ -41,8 +41,9 @@
// readrandom -- read N times in random order // readrandom -- read N times in random order
// readmissing -- read N missing keys in random order // readmissing -- read N missing keys in random order
// readhot -- read N times in random order from 1% section of DB // readhot -- read N times in random order from 1% section of DB
// readwhilewriting -- 1 writer, N threads doing random reads // readwhilewriting -- 1 writer, N threads doing random reads
// readrandomwriterandom - N threads doing random-read, random-write // readrandomwriterandom -- N threads doing random-read, random-write
// prefixscanrandom -- prefix scan N times in random order
// updaterandom -- N threads doing read-modify-write for random keys // updaterandom -- N threads doing read-modify-write for random keys
// appendrandom -- N threads doing read-modify-write with growing values // appendrandom -- N threads doing read-modify-write with growing values
// mergerandom -- same as updaterandom/appendrandom using merge operator // mergerandom -- same as updaterandom/appendrandom using merge operator
@ -95,6 +96,13 @@ static long FLAGS_reads = -1;
// When ==1 reads use ::Get, when >1 reads use an iterator // When ==1 reads use ::Get, when >1 reads use an iterator
static long FLAGS_read_range = 1; static long FLAGS_read_range = 1;
// Whether to place prefixes in blooms
static bool FLAGS_use_prefix_blooms = false;
// Whether to set ReadOptions.prefix for prefixscanrandom. If this
// true, use_prefix_blooms must also be true.
static bool FLAGS_use_prefix_api = false;
// Seed base for random number generators. When 0 it is deterministic. // Seed base for random number generators. When 0 it is deterministic.
static long FLAGS_seed = 0; static long FLAGS_seed = 0;
@ -631,6 +639,7 @@ class Benchmark {
private: private:
shared_ptr<Cache> cache_; shared_ptr<Cache> cache_;
const FilterPolicy* filter_policy_; const FilterPolicy* filter_policy_;
const SliceTransform* prefix_extractor_;
DB* db_; DB* db_;
long num_; long num_;
int value_size_; int value_size_;
@ -773,6 +782,7 @@ class Benchmark {
filter_policy_(FLAGS_bloom_bits >= 0 filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits) ? NewBloomFilterPolicy(FLAGS_bloom_bits)
: nullptr), : nullptr),
prefix_extractor_(NewFixedPrefixTransform(FLAGS_key_size-1)),
db_(nullptr), db_(nullptr),
num_(FLAGS_num), num_(FLAGS_num),
value_size_(FLAGS_value_size), value_size_(FLAGS_value_size),
@ -799,6 +809,7 @@ class Benchmark {
~Benchmark() { ~Benchmark() {
delete db_; delete db_;
delete filter_policy_; delete filter_policy_;
delete prefix_extractor_;
} }
//this function will construct string format for key. e.g "%016d" //this function will construct string format for key. e.g "%016d"
@ -894,6 +905,8 @@ class Benchmark {
} else if (name == Slice("readrandomsmall")) { } else if (name == Slice("readrandomsmall")) {
reads_ /= 1000; reads_ /= 1000;
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("prefixscanrandom")) {
method = &Benchmark::PrefixScanRandom;
} else if (name == Slice("deleteseq")) { } else if (name == Slice("deleteseq")) {
method = &Benchmark::DeleteSeq; method = &Benchmark::DeleteSeq;
} else if (name == Slice("deleterandom")) { } else if (name == Slice("deleterandom")) {
@ -1146,6 +1159,8 @@ class Benchmark {
FLAGS_compaction_universal_min_merge_width; FLAGS_compaction_universal_min_merge_width;
options.block_size = FLAGS_block_size; options.block_size = FLAGS_block_size;
options.filter_policy = filter_policy_; options.filter_policy = filter_policy_;
options.prefix_extractor = FLAGS_use_prefix_blooms ? prefix_extractor_
: nullptr;
options.max_open_files = FLAGS_open_files; options.max_open_files = FLAGS_open_files;
options.statistics = dbstats; options.statistics = dbstats;
options.env = FLAGS_env; options.env = FLAGS_env;
@ -1467,6 +1482,41 @@ class Benchmark {
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
// Benchmark: performs reads_ random prefix scans. Each iteration picks a
// random key, extracts its prefix, then iterates over every entry whose
// key starts with that prefix, counting the entries found.
// When --use_prefix_api is set, the prefix is also passed through
// ReadOptions.prefix so prefix bloom filters can rule out files.
void PrefixScanRandom(ThreadState* thread) {
if (FLAGS_use_prefix_api) {
// ReadOptions.prefix is only meaningful when prefixes were stored in
// blooms at write time, so both flags must be enabled together.
assert(FLAGS_use_prefix_blooms);
assert(FLAGS_bloom_bits >= 1);
}
ReadOptions options(FLAGS_verify_checksum, true);
Duration duration(FLAGS_duration, reads_);
long found = 0;
while (!duration.Done(1)) {
std::string value;
const int k = thread->rand.Next() % FLAGS_num;
unique_ptr<char []> key = GenerateKeyFromInt(k);
Slice skey(key.get());
Slice prefix = prefix_extractor_->Transform(skey);
// nullptr prefix disables prefix filtering; the iterator then behaves
// like an ordinary full-range iterator.
options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr;
Iterator* iter = db_->NewIterator(options);
// Scan forward from the chosen key while entries still share the prefix.
for (iter->Seek(skey);
iter->Valid() && iter->key().starts_with(prefix);
iter->Next()) {
found++;
}
delete iter;
thread->stats.FinishedSingleOp(db_);
}
char msg[100];
snprintf(msg, sizeof(msg), "(%ld of %ld found)", found, reads_);
thread->stats.AddMessage(msg);
}
void ReadMissing(ThreadState* thread) { void ReadMissing(ThreadState* thread) {
FLAGS_warn_missing_keys = false; // Never warn about missing keys FLAGS_warn_missing_keys = false; // Never warn about missing keys
@ -2170,6 +2220,13 @@ int main(int argc, char** argv) {
FLAGS_reads = n; FLAGS_reads = n;
} else if (sscanf(argv[i], "--read_range=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--read_range=%d%c", &n, &junk) == 1) {
FLAGS_read_range = n; FLAGS_read_range = n;
} else if (sscanf(argv[i], "--use_prefix_blooms=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_prefix_blooms = n;
} else if (sscanf(argv[i], "--use_prefix_api=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_prefix_api = n;
} else if (sscanf(argv[i], "--duration=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--duration=%d%c", &n, &junk) == 1) {
FLAGS_duration = n; FLAGS_duration = n;
} else if (sscanf(argv[i], "--seed=%ld%c", &l, &junk) == 1) { } else if (sscanf(argv[i], "--seed=%ld%c", &l, &junk) == 1) {

@ -135,6 +135,24 @@ Status TableCache::Get(const ReadOptions& options,
return s; return s;
} }
// Returns false only when the table for (file_number, file_size) can be
// opened and its filter proves no key with this internal prefix exists;
// otherwise returns true (conservative: a FindTable failure means "may
// match").
// NOTE(review): the `options` parameter is currently unused here — confirm
// whether read options were intended to influence the table lookup.
bool TableCache::PrefixMayMatch(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& internal_prefix,
bool* table_io) {
Cache::Handle* handle = nullptr;
Status s = FindTable(storage_options_, file_number,
file_size, &handle, table_io);
bool may_match = true;
if (s.ok()) {
Table* t =
reinterpret_cast<Table*>(cache_->Value(handle));
may_match = t->PrefixMayMatch(internal_prefix);
// Release the cache handle so this table entry can be evicted later.
cache_->Release(handle);
}
return may_match;
}
void TableCache::Evict(uint64_t file_number) { void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)]; char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number); EncodeFixed64(buf, file_number);

@ -52,6 +52,12 @@ class TableCache {
void (*mark_key_may_exist)(void*) = nullptr, void (*mark_key_may_exist)(void*) = nullptr,
const bool no_io = false); const bool no_io = false);
// Determine whether the table may contain the specified prefix. If
// the table index or blooms are not in memory, this may cause an I/O
bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
uint64_t file_size, const Slice& internal_prefix,
bool* table_io);
// Evict any entry for the specified file number // Evict any entry for the specified file number
void Evict(uint64_t file_number); void Evict(uint64_t file_number);

@ -189,7 +189,14 @@ static Iterator* GetFileIterator(void* arg,
return NewErrorIterator( return NewErrorIterator(
Status::Corruption("FileReader invoked with unexpected value")); Status::Corruption("FileReader invoked with unexpected value"));
} else { } else {
return cache->NewIterator(options, ReadOptions options_copy;
if (options.prefix) {
// suppress prefix filtering since we have already checked the
// filters once at this point
options_copy = options;
options_copy.prefix = nullptr;
}
return cache->NewIterator(options.prefix ? options_copy : options,
soptions, soptions,
DecodeFixed64(file_value.data()), DecodeFixed64(file_value.data()),
DecodeFixed64(file_value.data() + 8), DecodeFixed64(file_value.data() + 8),
@ -198,12 +205,45 @@ static Iterator* GetFileIterator(void* arg,
} }
} }
// Checks, for one sorted level, whether any file could contain a key with
// the given internal prefix. Seeks the level's file-number iterator to the
// prefix: past the last file means no match; otherwise the candidate
// file's filter is consulted via the table cache. Conservative: returns
// true whenever a match cannot be ruled out.
bool Version::PrefixMayMatch(const ReadOptions& options,
const EnvOptions& soptions,
const Slice& internal_prefix,
Iterator* level_iter) const {
bool may_match = true;
level_iter->Seek(internal_prefix);
if (!level_iter->Valid()) {
// we're past end of level
may_match = false;
} else if (ExtractUserKey(level_iter->key()).starts_with(
ExtractUserKey(internal_prefix))) {
// TODO(tylerharter): do we need this case? Or are we guaranteed
// key() will always be the biggest value for this SST?
may_match = true;
} else {
// level_iter->value() encodes (file_number, file_size) as two fixed64s.
// NOTE(review): passes nullptr for table_io — confirm FindTable
// tolerates a null pointer there.
may_match = vset_->table_cache_->PrefixMayMatch(
options,
DecodeFixed64(level_iter->value().data()),
DecodeFixed64(level_iter->value().data() + 8),
internal_prefix, nullptr);
}
return may_match;
}
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
const EnvOptions& soptions, const EnvOptions& soptions,
int level) const { int level) const {
return NewTwoLevelIterator( Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]);
new LevelFileNumIterator(vset_->icmp_, &files_[level]), if (options.prefix) {
&GetFileIterator, vset_->table_cache_, options, soptions); InternalKey internal_prefix(*options.prefix, 0, kTypeValue);
if (!PrefixMayMatch(options, soptions,
internal_prefix.Encode(), level_iter)) {
delete level_iter;
// nothing in this level can match the prefix
return NewEmptyIterator();
}
}
return NewTwoLevelIterator(level_iter, &GetFileIterator,
vset_->table_cache_, options, soptions);
} }
void Version::AddIterators(const ReadOptions& options, void Version::AddIterators(const ReadOptions& options,

@ -152,6 +152,8 @@ class Version {
Iterator* NewConcatenatingIterator(const ReadOptions&, Iterator* NewConcatenatingIterator(const ReadOptions&,
const EnvOptions& soptions, const EnvOptions& soptions,
int level) const; int level) const;
bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
const Slice& internal_prefix, Iterator* level_iter) const;
VersionSet* vset_; // VersionSet to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list Version* next_; // Next version in linked list

@ -62,13 +62,21 @@ enum Tickers {
NUMBER_MERGE_FAILURES = 22, NUMBER_MERGE_FAILURES = 22,
SEQUENCE_NUMBER = 23, SEQUENCE_NUMBER = 23,
TICKER_ENUM_MAX = 24 // number of times bloom was checked before creating iterator on a
// file, and the number of times the check was useful in avoiding
// iterator creation (and thus likely IOPs).
BLOOM_FILTER_PREFIX_CHECKED = 24,
BLOOM_FILTER_PREFIX_USEFUL = 25,
TICKER_ENUM_MAX = 26
}; };
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
{ BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
{ COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
{ COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
{ COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },

@ -91,7 +91,7 @@ class Status {
kNotSupported = 3, kNotSupported = 3,
kInvalidArgument = 4, kInvalidArgument = 4,
kIOError = 5, kIOError = 5,
kMergeInProgress = 6 kMergeInProgress = 6,
}; };
Code code() const { Code code() const {

@ -45,6 +45,7 @@ bool FilterBlockBuilder::SamePrefix(const Slice &key1,
} }
void FilterBlockBuilder::AddKey(const Slice& key) { void FilterBlockBuilder::AddKey(const Slice& key) {
// get slice for most recently added entry
Slice prev; Slice prev;
if (start_.size() > 0) { if (start_.size() > 0) {
size_t prev_start = start_[start_.size() - 1]; size_t prev_start = start_[start_.size() - 1];
@ -53,17 +54,21 @@ void FilterBlockBuilder::AddKey(const Slice& key) {
prev = Slice(base, length); prev = Slice(base, length);
} }
// add key to filter if needed
if (whole_key_filtering_) { if (whole_key_filtering_) {
start_.push_back(entries_.size()); start_.push_back(entries_.size());
entries_.append(key.data(), key.size()); entries_.append(key.data(), key.size());
} }
if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { // add prefix to filter if needed
Slice user_key = ExtractUserKey(key);
if (prefix_extractor_ && prefix_extractor_->InDomain(user_key)) {
// this assumes prefix(prefix(key)) == prefix(key), as the last // this assumes prefix(prefix(key)) == prefix(key), as the last
// entry in entries_ may be either a key or prefix, and we use // entry in entries_ may be either a key or prefix, and we use
// prefix(last entry) to get the prefix of the last key. // prefix(last entry) to get the prefix of the last key.
if (prev.size() == 0 || ! SamePrefix(key, prev)) { if (prev.size() == 0 ||
Slice prefix = prefix_extractor_->Transform(key); !SamePrefix(user_key, ExtractUserKey(prev))) {
Slice prefix = prefix_extractor_->Transform(user_key);
InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
Slice internal_prefix = internal_prefix_tmp.Encode(); Slice internal_prefix = internal_prefix_tmp.Encode();
assert(comparator_->Compare(internal_prefix, key) <= 0); assert(comparator_->Compare(internal_prefix, key) <= 0);

@ -328,6 +328,11 @@ Iterator* Table::BlockReader(void* arg,
// 1) key.starts_with(prefix(key)) // 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0. // 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// TODO(tylerharter): right now, this won't cause I/O since blooms are
// in memory. When blooms may need to be paged in, we should refactor so that
// this is only ever called lazily. In particular, this shouldn't be called
// while the DB lock is held like it is now.
bool Table::PrefixMayMatch(const Slice& internal_prefix) const { bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
FilterBlockReader* filter = rep_->filter; FilterBlockReader* filter = rep_->filter;
bool may_match = true; bool may_match = true;
@ -337,12 +342,14 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
return true; return true;
} }
Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); std::unique_ptr<Iterator> iiter(rep_->index_block->NewIterator(
rep_->options.comparator));
iiter->Seek(internal_prefix); iiter->Seek(internal_prefix);
if (! iiter->Valid()) { if (!iiter->Valid()) {
// we're past end of file // we're past end of file
may_match = false; may_match = false;
} else if (iiter->key().starts_with(internal_prefix)) { } else if (ExtractUserKey(iiter->key()).starts_with(
ExtractUserKey(internal_prefix))) {
// we need to check for this subtle case because our only // we need to check for this subtle case because our only
// guarantee is that "the key is a string >= last key in that data // guarantee is that "the key is a string >= last key in that data
// block" according to the doc/table_format.txt spec. // block" according to the doc/table_format.txt spec.
@ -366,7 +373,12 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
assert(s.ok()); assert(s.ok());
may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix); may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix);
} }
delete iiter;
RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_CHECKED);
if (!may_match) {
RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_USEFUL);
}
return may_match; return may_match;
} }

@ -47,7 +47,7 @@ class Table {
~Table(); ~Table();
bool PrefixMayMatch(const Slice& prefix) const; bool PrefixMayMatch(const Slice& internal_prefix) const;
// Returns a new iterator over the table contents. // Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must // The result of NewIterator() is initially invalid (caller must

Loading…
Cancel
Save