More improvements to filter_bench (#5968)

Summary:
* Adds support for plain table filter. This is not critical right now, but does add a -impl flag that will be useful for new filter implementations initially targeted at block-based table (and maybe later ported to plain table)
* Better mixing of inside vs. outside queries, for more realism
* A -best_case option, handy for tuning an implementation's inner loop
* Option for whether to include hashing time in dry run / net timings

No modifications to production code, just filter_bench.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5968

Differential Revision: D18139872

Pulled By: pdillinger

fbshipit-source-id: 5b09eba963111b48f9e0525a706e9921070990e8
main
Peter Dillinger 5 years ago committed by Facebook Github Bot
parent b3dc2f3691
commit 3f891c40a0
  1. 250
      util/filter_bench.cc

@ -16,11 +16,13 @@ int main() {
#include <sstream> #include <sstream>
#include <vector> #include <vector>
#include "memory/arena.h"
#include "port/port.h" #include "port/port.h"
#include "port/stack_trace.h" #include "port/stack_trace.h"
#include "rocksdb/filter_policy.h" #include "rocksdb/filter_policy.h"
#include "table/block_based/full_filter_block.h" #include "table/block_based/full_filter_block.h"
#include "table/block_based/mock_block_based_table.h" #include "table/block_based/mock_block_based_table.h"
#include "table/plain/plain_table_bloom.h"
#include "util/gflags_compat.h" #include "util/gflags_compat.h"
#include "util/hash.h" #include "util/hash.h"
#include "util/random.h" #include "util/random.h"
@ -57,8 +59,24 @@ DEFINE_double(m_queries, 200, "Millions of queries for each test mode");
DEFINE_bool(use_full_block_reader, false, DEFINE_bool(use_full_block_reader, false,
"Use FullFilterBlockReader interface rather than FilterBitsReader"); "Use FullFilterBlockReader interface rather than FilterBitsReader");
// Switches the benchmark to build and query PlainTableBloom filters instead
// of going through FilterBitsReader / FullFilterBlockReader.
// FilterBench::Go() rejects combining this with -use_full_block_reader.
DEFINE_bool(use_plain_table_bloom, false,
            "Use PlainTableBloom structure and interface rather than "
            "FilterBitsReader/FullFilterBlockReader");
// Implementation selector whose meaning depends on -use_plain_table_bloom.
// FilterBench::Go() rejects values above 1, and also rejects 1 when
// -use_plain_table_bloom is not set. With -use_plain_table_bloom the value is
// forwarded to PlainTableBloomV1::SetTotalBits.
// Note the trailing space after the colon: adjacent literals are concatenated
// verbatim, so without it the help text reads "...bloom:0 = full filter".
DEFINE_uint32(impl, 0,
              "Select filter implementation. Without -use_plain_table_bloom: "
              "0 = full filter, 1 = block-based filter. With "
              "-use_plain_table_bloom: 0 = no locality, 1 = locality.");
// Chooses which side of the (full run - dry run) subtraction pays for
// hashing: when true the dry run uses the cheap NoHash stand-in, so hashing
// cost stays in the reported net time; when false the dry run computes the
// real hash, so hashing cost is subtracted out.
DEFINE_bool(net_includes_hashing, false,
            "Whether query net ns/op times should include hashing. "
            "(if not, dry run will include hashing) "
            "(build times always include hashing)");
DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries"); DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries");
// Read in FilterBench::Go(): when set, bestCaseTestModes is used in place of
// the quick/all mode lists, and (unless -quick is also set) m_queries is
// divided by 3 and working_mem_size_mb by 10.
DEFINE_bool(best_case, false, "Run limited tests only for best-case");
DEFINE_bool(allow_bad_fp_rate, false, "Continue even if FP rate is bad"); DEFINE_bool(allow_bad_fp_rate, false, "Continue even if FP rate is bad");
DEFINE_bool(legend, false, DEFINE_bool(legend, false,
@ -73,14 +91,18 @@ void _always_assert_fail(int line, const char *file, const char *expr) {
#define ALWAYS_ASSERT(cond) \ #define ALWAYS_ASSERT(cond) \
((cond) ? (void)0 : ::_always_assert_fail(__LINE__, __FILE__, #cond)) ((cond) ? (void)0 : ::_always_assert_fail(__LINE__, __FILE__, #cond))
using rocksdb::Arena;
using rocksdb::BlockContents; using rocksdb::BlockContents;
using rocksdb::BloomHash;
using rocksdb::CachableEntry; using rocksdb::CachableEntry;
using rocksdb::EncodeFixed32; using rocksdb::EncodeFixed32;
using rocksdb::fastrange32; using rocksdb::fastrange32;
using rocksdb::FilterBitsBuilder; using rocksdb::FilterBitsBuilder;
using rocksdb::FilterBitsReader; using rocksdb::FilterBitsReader;
using rocksdb::FullFilterBlockReader; using rocksdb::FullFilterBlockReader;
using rocksdb::GetSliceHash;
using rocksdb::ParsedFullFilterBlock; using rocksdb::ParsedFullFilterBlock;
using rocksdb::PlainTableBloomV1;
using rocksdb::Random32; using rocksdb::Random32;
using rocksdb::Slice; using rocksdb::Slice;
using rocksdb::mock::MockBlockBasedTableTester; using rocksdb::mock::MockBlockBasedTableTester;
@ -142,6 +164,7 @@ struct FilterInfo {
uint32_t keys_added_ = 0; uint32_t keys_added_ = 0;
std::unique_ptr<FilterBitsReader> reader_; std::unique_ptr<FilterBitsReader> reader_;
std::unique_ptr<FullFilterBlockReader> full_block_reader_; std::unique_ptr<FullFilterBlockReader> full_block_reader_;
std::unique_ptr<PlainTableBloomV1> plain_table_bloom_;
uint64_t outside_queries_ = 0; uint64_t outside_queries_ = 0;
uint64_t false_positives_ = 0; uint64_t false_positives_ = 0;
}; };
@ -165,6 +188,10 @@ static const std::vector<TestMode> quickTestModes = {
kRandomFilter, kRandomFilter,
}; };
// Test modes run under -best_case: only the single-filter mode.
static const std::vector<TestMode> bestCaseTestModes{kSingleFilter};
const char *TestModeToString(TestMode tm) { const char *TestModeToString(TestMode tm) {
switch (tm) { switch (tm) {
case kSingleFilter: case kSingleFilter:
@ -183,11 +210,23 @@ const char *TestModeToString(TestMode tm) {
return "Bad TestMode"; return "Bad TestMode";
} }
// Cheap stand-in for a real hash, used by dry runs when hashing time is meant
// to stay in the net figure. It folds the key length and, when the key has at
// least four bytes, its fourth byte into the result, so the value still
// depends on the key data for the compiler / CPU.
static inline uint32_t NoHash(Slice &s) {
  const uint32_t len = static_cast<uint32_t>(s.size());
  // Shorter keys have no fourth byte to read; the length alone is returned.
  return len < 4 ? len : len + s.data()[3];
}
struct FilterBench : public MockBlockBasedTableTester { struct FilterBench : public MockBlockBasedTableTester {
std::vector<KeyMaker> kms_; std::vector<KeyMaker> kms_;
std::vector<FilterInfo> infos_; std::vector<FilterInfo> infos_;
Random32 random_; Random32 random_;
std::ostringstream fp_rate_report_; std::ostringstream fp_rate_report_;
Arena arena_;
FilterBench() FilterBench()
: MockBlockBasedTableTester( : MockBlockBasedTableTester(
@ -200,12 +239,27 @@ struct FilterBench : public MockBlockBasedTableTester {
void Go(); void Go();
double RandomQueryTest(bool inside, bool dry_run, TestMode mode); double RandomQueryTest(uint32_t inside_threshold, bool dry_run,
TestMode mode);
}; };
void FilterBench::Go() { void FilterBench::Go() {
std::unique_ptr<FilterBitsBuilder> builder( if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) {
table_options_.filter_policy->GetFilterBitsBuilder()); throw std::runtime_error(
"Can't combine -use_plain_table_bloom and -use_full_block_reader");
}
if (FLAGS_impl > 1) {
throw std::runtime_error("-impl must currently be >= 0 and <= 1");
}
if (!FLAGS_use_plain_table_bloom && FLAGS_impl == 1) {
throw std::runtime_error(
"Block-based filter not currently supported by filter_bench");
}
std::unique_ptr<FilterBitsBuilder> builder;
if (!FLAGS_use_plain_table_bloom && FLAGS_impl != 1) {
builder.reset(table_options_.filter_policy->GetFilterBitsBuilder());
}
uint32_t variance_mask = 1; uint32_t variance_mask = 1;
while (variance_mask * variance_mask * 4 < FLAGS_average_keys_per_filter) { while (variance_mask * variance_mask * 4 < FLAGS_average_keys_per_filter) {
@ -213,9 +267,13 @@ void FilterBench::Go() {
} }
const std::vector<TestMode> &testModes = const std::vector<TestMode> &testModes =
FLAGS_quick ? quickTestModes : allTestModes; FLAGS_best_case ? bestCaseTestModes
: FLAGS_quick ? quickTestModes : allTestModes;
if (FLAGS_quick) { if (FLAGS_quick) {
FLAGS_m_queries /= 7.0; FLAGS_m_queries /= 7.0;
} else if (FLAGS_best_case) {
FLAGS_m_queries /= 3.0;
FLAGS_working_mem_size_mb /= 10.0;
} }
std::cout << "Building..." << std::endl; std::cout << "Building..." << std::endl;
@ -230,22 +288,35 @@ void FilterBench::Go() {
uint32_t keys_to_add = FLAGS_average_keys_per_filter + uint32_t keys_to_add = FLAGS_average_keys_per_filter +
(random_.Next() & variance_mask) - (random_.Next() & variance_mask) -
(variance_mask / 2); (variance_mask / 2);
for (uint32_t i = 0; i < keys_to_add; ++i) {
builder->AddKey(kms_[0].Get(filter_id, i));
}
infos_.emplace_back(); infos_.emplace_back();
FilterInfo &info = infos_.back(); FilterInfo &info = infos_.back();
info.filter_id_ = filter_id; info.filter_id_ = filter_id;
info.filter_ = builder->Finish(&info.owner_);
info.keys_added_ = keys_to_add; info.keys_added_ = keys_to_add;
info.reader_.reset( if (FLAGS_use_plain_table_bloom) {
table_options_.filter_policy->GetFilterBitsReader(info.filter_)); info.plain_table_bloom_.reset(new PlainTableBloomV1());
CachableEntry<ParsedFullFilterBlock> block( info.plain_table_bloom_->SetTotalBits(
new ParsedFullFilterBlock(table_options_.filter_policy.get(), &arena_, keys_to_add * FLAGS_bits_per_key, FLAGS_impl,
BlockContents(info.filter_)), 0 /*huge_page*/, nullptr /*logger*/);
nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); for (uint32_t i = 0; i < keys_to_add; ++i) {
info.full_block_reader_.reset( uint32_t hash = GetSliceHash(kms_[0].Get(filter_id, i));
new FullFilterBlockReader(table_.get(), std::move(block))); info.plain_table_bloom_->AddHash(hash);
}
info.filter_ = info.plain_table_bloom_->GetRawData();
} else {
for (uint32_t i = 0; i < keys_to_add; ++i) {
builder->AddKey(kms_[0].Get(filter_id, i));
}
info.filter_ = builder->Finish(&info.owner_);
info.reader_.reset(
table_options_.filter_policy->GetFilterBitsReader(info.filter_));
CachableEntry<ParsedFullFilterBlock> block(
new ParsedFullFilterBlock(table_options_.filter_policy.get(),
BlockContents(info.filter_)),
nullptr /* cache */, nullptr /* cache_handle */,
true /* own_value */);
info.full_block_reader_.reset(
new FullFilterBlockReader(table_.get(), std::move(block)));
}
total_memory_used += info.filter_.size(); total_memory_used += info.filter_.size();
total_keys_added += keys_to_add; total_keys_added += keys_to_add;
} }
@ -259,7 +330,7 @@ void FilterBench::Go() {
double bpk = total_memory_used * 8.0 / total_keys_added; double bpk = total_memory_used * 8.0 / total_keys_added;
std::cout << "Bits/key actual: " << bpk << std::endl; std::cout << "Bits/key actual: " << bpk << std::endl;
if (!FLAGS_quick) { if (!FLAGS_quick && !FLAGS_best_case) {
double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0)); double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0));
std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk) std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk)
<< std::endl; << std::endl;
@ -273,11 +344,23 @@ void FilterBench::Go() {
for (uint32_t i = 0; i < infos_.size(); ++i) { for (uint32_t i = 0; i < infos_.size(); ++i) {
FilterInfo &info = infos_[i]; FilterInfo &info = infos_[i];
for (uint32_t j = 0; j < info.keys_added_; ++j) { for (uint32_t j = 0; j < info.keys_added_; ++j) {
ALWAYS_ASSERT(info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j))); if (FLAGS_use_plain_table_bloom) {
uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j));
ALWAYS_ASSERT(info.plain_table_bloom_->MayContainHash(hash));
} else {
ALWAYS_ASSERT(
info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j)));
}
} }
for (uint32_t j = 0; j < outside_q_per_f; ++j) { for (uint32_t j = 0; j < outside_q_per_f; ++j) {
fps += info.reader_->MayMatch( if (FLAGS_use_plain_table_bloom) {
kms_[0].Get(info.filter_id_, j | 0x80000000)); uint32_t hash =
GetSliceHash(kms_[0].Get(info.filter_id_, j | 0x80000000));
fps += info.plain_table_bloom_->MayContainHash(hash);
} else {
fps += info.reader_->MayMatch(
kms_[0].Get(info.filter_id_, j | 0x80000000));
}
} }
} }
std::cout << " No FNs :)" << std::endl; std::cout << " No FNs :)" << std::endl;
@ -290,26 +373,46 @@ void FilterBench::Go() {
} }
std::cout << "----------------------------" << std::endl; std::cout << "----------------------------" << std::endl;
std::cout << "Inside queries..." << std::endl; std::cout << "Mixed inside/outside queries..." << std::endl;
// 50% each inside and outside
uint32_t inside_threshold = UINT32_MAX / 2;
for (TestMode tm : testModes) { for (TestMode tm : testModes) {
random_.Seed(FLAGS_seed + 1); random_.Seed(FLAGS_seed + 1);
double f = RandomQueryTest(/*inside*/ true, /*dry_run*/ false, tm); double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
random_.Seed(FLAGS_seed + 1); random_.Seed(FLAGS_seed + 1);
double d = RandomQueryTest(/*inside*/ true, /*dry_run*/ true, tm); double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d) std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
<< std::endl; << std::endl;
} }
std::cout << fp_rate_report_.str();
std::cout << "----------------------------" << std::endl; if (!FLAGS_quick) {
std::cout << "Outside queries..." << std::endl; std::cout << "----------------------------" << std::endl;
for (TestMode tm : testModes) { std::cout << "Inside queries (mostly)..." << std::endl;
random_.Seed(FLAGS_seed + 2); // Do about 95% inside queries rather than 100% so that branch predictor
double f = RandomQueryTest(/*inside*/ false, /*dry_run*/ false, tm); // can't give itself an artificially crazy advantage.
random_.Seed(FLAGS_seed + 2); inside_threshold = UINT32_MAX / 20 * 19;
double d = RandomQueryTest(/*inside*/ false, /*dry_run*/ true, tm); for (TestMode tm : testModes) {
std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d) random_.Seed(FLAGS_seed + 1);
<< std::endl; double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
random_.Seed(FLAGS_seed + 1);
double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
<< std::endl;
}
std::cout << "----------------------------" << std::endl;
std::cout << "Outside queries (mostly)..." << std::endl;
// Do about 95% outside queries rather than 100% so that branch predictor
// can't give itself an artificially crazy advantage.
inside_threshold = UINT32_MAX / 20;
for (TestMode tm : testModes) {
random_.Seed(FLAGS_seed + 2);
double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
random_.Seed(FLAGS_seed + 2);
double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
<< std::endl;
}
} }
std::cout << fp_rate_report_.str(); std::cout << fp_rate_report_.str();
@ -317,7 +420,8 @@ void FilterBench::Go() {
std::cout << "Done. (For more info, run with -legend or -help.)" << std::endl; std::cout << "Done. (For more info, run with -legend or -help.)" << std::endl;
} }
double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) { double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
TestMode mode) {
for (auto &info : infos_) { for (auto &info : infos_) {
info.outside_queries_ = 0; info.outside_queries_ = 0;
info.false_positives_ = 0; info.false_positives_ = 0;
@ -368,6 +472,8 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
for (uint64_t q = 0; q < max_queries; q += batch_size) { for (uint64_t q = 0; q < max_queries; q += batch_size) {
bool inside_this_time = random_.Next() <= inside_threshold;
uint32_t filter_index; uint32_t filter_index;
if (random_.Next() <= primary_filter_threshold) { if (random_.Next() <= primary_filter_threshold) {
filter_index = random_.Uniformish(num_primary_filters); filter_index = random_.Uniformish(num_primary_filters);
@ -378,7 +484,7 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
} }
FilterInfo &info = infos_[filter_index]; FilterInfo &info = infos_[filter_index];
for (uint32_t i = 0; i < batch_size; ++i) { for (uint32_t i = 0; i < batch_size; ++i) {
if (inside) { if (inside_this_time) {
batch_slices[i] = batch_slices[i] =
kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_)); kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_));
} else { } else {
@ -389,14 +495,27 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
} }
} }
// TODO: implement batched interface to full block reader // TODO: implement batched interface to full block reader
if (mode == kBatchPrepared && !dry_run && !FLAGS_use_full_block_reader) { // TODO: implement batched interface to plain table bloom
if (mode == kBatchPrepared && !FLAGS_use_full_block_reader &&
!FLAGS_use_plain_table_bloom) {
for (uint32_t i = 0; i < batch_size; ++i) { for (uint32_t i = 0; i < batch_size; ++i) {
batch_results[i] = false; batch_results[i] = false;
} }
info.reader_->MayMatch(batch_size, batch_slice_ptrs.get(), if (dry_run) {
batch_results.get()); for (uint32_t i = 0; i < batch_size; ++i) {
batch_results[i] = true;
if (FLAGS_net_includes_hashing) {
dry_run_hash += NoHash(batch_slices[i]);
} else {
dry_run_hash ^= BloomHash(batch_slices[i]);
}
}
} else {
info.reader_->MayMatch(batch_size, batch_slice_ptrs.get(),
batch_results.get());
}
for (uint32_t i = 0; i < batch_size; ++i) { for (uint32_t i = 0; i < batch_size; ++i) {
if (inside) { if (inside_this_time) {
ALWAYS_ASSERT(batch_results[i]); ALWAYS_ASSERT(batch_results[i]);
} else { } else {
info.false_positives_ += batch_results[i]; info.false_positives_ += batch_results[i];
@ -404,11 +523,28 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
} }
} else { } else {
for (uint32_t i = 0; i < batch_size; ++i) { for (uint32_t i = 0; i < batch_size; ++i) {
if (dry_run) { bool may_match;
dry_run_hash ^= rocksdb::BloomHash(batch_slices[i]); if (FLAGS_use_plain_table_bloom) {
} else { if (dry_run) {
bool may_match; if (FLAGS_net_includes_hashing) {
if (FLAGS_use_full_block_reader) { dry_run_hash += NoHash(batch_slices[i]);
} else {
dry_run_hash ^= GetSliceHash(batch_slices[i]);
}
may_match = true;
} else {
uint32_t hash = GetSliceHash(batch_slices[i]);
may_match = info.plain_table_bloom_->MayContainHash(hash);
}
} else if (FLAGS_use_full_block_reader) {
if (dry_run) {
if (FLAGS_net_includes_hashing) {
dry_run_hash += NoHash(batch_slices[i]);
} else {
dry_run_hash ^= BloomHash(batch_slices[i]);
}
may_match = true;
} else {
may_match = info.full_block_reader_->KeyMayMatch( may_match = info.full_block_reader_->KeyMayMatch(
batch_slices[i], batch_slices[i],
/*prefix_extractor=*/nullptr, /*prefix_extractor=*/nullptr,
@ -416,15 +552,24 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
/*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
/*get_context=*/nullptr, /*get_context=*/nullptr,
/*lookup_context=*/nullptr); /*lookup_context=*/nullptr);
} else {
may_match = info.reader_->MayMatch(batch_slices[i]);
} }
if (inside) { } else {
ALWAYS_ASSERT(may_match); if (dry_run) {
if (FLAGS_net_includes_hashing) {
dry_run_hash += NoHash(batch_slices[i]);
} else {
dry_run_hash ^= BloomHash(batch_slices[i]);
}
may_match = true;
} else { } else {
info.false_positives_ += may_match; may_match = info.reader_->MayMatch(batch_slices[i]);
} }
} }
if (inside_this_time) {
ALWAYS_ASSERT(may_match);
} else {
info.false_positives_ += may_match;
}
} }
} }
} }
@ -444,7 +589,8 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
std::cout << "ns/op: " << ns << std::endl; std::cout << "ns/op: " << ns << std::endl;
} }
if (!inside && !dry_run && mode == kRandomFilter) { if (!dry_run) {
fp_rate_report_ = std::ostringstream();
uint64_t q = 0; uint64_t q = 0;
uint64_t fp = 0; uint64_t fp = 0;
double worst_fp_rate = 0.0; double worst_fp_rate = 0.0;
@ -459,7 +605,7 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
} }
} }
fp_rate_report_ << " Average FP rate %: " << 100.0 * fp / q << std::endl; fp_rate_report_ << " Average FP rate %: " << 100.0 * fp / q << std::endl;
if (!FLAGS_quick) { if (!FLAGS_quick && !FLAGS_best_case) {
fp_rate_report_ << " Worst FP rate %: " << 100.0 * worst_fp_rate fp_rate_report_ << " Worst FP rate %: " << 100.0 * worst_fp_rate
<< std::endl; << std::endl;
fp_rate_report_ << " Best FP rate %: " << 100.0 * best_fp_rate fp_rate_report_ << " Best FP rate %: " << 100.0 * best_fp_rate
@ -467,8 +613,6 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
fp_rate_report_ << " Best possible bits/key: " fp_rate_report_ << " Best possible bits/key: "
<< -std::log(double(fp) / q) / std::log(2.0) << std::endl; << -std::log(double(fp) / q) / std::log(2.0) << std::endl;
} }
} else {
fp_rate_report_.clear();
} }
return ns; return ns;
} }

Loading…
Cancel
Save