Prepare filter tests for more implementations (#5967)

Summary:
This change sets up for alternate implementations underlying
BloomFilterPolicy:

* Refactor BloomFilterPolicy and expose in internal .h file so that it's easy to iterate over / select implementations for testing, regardless of what the best public interface will look like. Most notably updated db_bloom_filter_test to use this.
* Hide FullFilterBitsBuilder from unit tests (alternate derived classes planned); expose the part important for testing (CalculateSpace), as abstract class BuiltinFilterBitsBuilder. (Also cleaned up internally exposed interface to CalculateSpace.)
* Rename BloomTest -> BlockBasedBloomTest for clarity (despite ongoing confusion between block-based table and block-based filter)
* Assert that block-based filter construction interface is only used on BloomFilterPolicy appropriately constructed. (A couple of tests updated to add ", true".)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5967

Test Plan: make check

Differential Revision: D18138704

Pulled By: pdillinger

fbshipit-source-id: 55ef9273423b0696309e251f50b8c1b5e9ec7597
main
Peter Dillinger 5 years ago committed by Facebook Github Bot
parent 351e25401b
commit 685e895652
  1. 136
      db/db_bloom_filter_test.cc
  2. 4
      include/rocksdb/filter_policy.h
  3. 2
      table/block_based/block_based_filter_block_test.cc
  4. 340
      table/block_based/filter_policy.cc
  5. 90
      table/block_based/filter_policy_internal.h
  6. 44
      util/bloom_test.cc

@ -10,9 +10,14 @@
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "rocksdb/perf_context.h"
#include "table/block_based/filter_policy_internal.h"
namespace rocksdb {
namespace {
using BFP = BloomFilterPolicy;
} // namespace
// DB tests related to bloom filter.
class DBBloomFilterTest : public DBTestBase {
@ -20,12 +25,12 @@ class DBBloomFilterTest : public DBTestBase {
DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {}
};
class DBBloomFilterTestWithParam
: public DBTestBase,
public testing::WithParamInterface<std::tuple<bool, bool, uint32_t>> {
class DBBloomFilterTestWithParam : public DBTestBase,
public testing::WithParamInterface<
std::tuple<BFP::Impl, bool, uint32_t>> {
// public testing::WithParamInterface<bool> {
protected:
bool use_block_based_filter_;
BFP::Impl bfp_impl_;
bool partition_filters_;
uint32_t format_version_;
@ -35,7 +40,7 @@ class DBBloomFilterTestWithParam
~DBBloomFilterTestWithParam() override {}
void SetUp() override {
use_block_based_filter_ = std::get<0>(GetParam());
bfp_impl_ = std::get<0>(GetParam());
partition_filters_ = std::get<1>(GetParam());
format_version_ = std::get<2>(GetParam());
}
@ -71,8 +76,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
ReadOptions ropts;
std::string value;
anon::OptionsOverride options_override;
options_override.filter_policy.reset(
NewBloomFilterPolicy(20, use_block_based_filter_));
options_override.filter_policy.reset(new BFP(20, bfp_impl_));
options_override.partition_filters = partition_filters_;
options_override.metadata_block_size = 32;
Options options = CurrentOptions(options_override);
@ -432,8 +436,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
// trigger reset of table_factory
BlockBasedTableOptions table_options;
table_options.no_block_cache = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_filter_));
table_options.filter_policy.reset(new BFP(10, bfp_impl_));
table_options.partition_filters = partition_filters_;
if (partition_filters_) {
table_options.index_type =
@ -502,24 +505,24 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
#ifndef ROCKSDB_VALGRIND_RUN
INSTANTIATE_TEST_CASE_P(
FormatDef, DBBloomFilterTestDefFormatVersion,
::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion),
std::make_tuple(false, true, test::kDefaultFormatVersion),
std::make_tuple(false, false,
test::kDefaultFormatVersion)));
::testing::Values(
std::make_tuple(BFP::kBlock, false, test::kDefaultFormatVersion),
std::make_tuple(BFP::kFull, true, test::kDefaultFormatVersion),
std::make_tuple(BFP::kFull, false, test::kDefaultFormatVersion)));
INSTANTIATE_TEST_CASE_P(
FormatDef, DBBloomFilterTestWithParam,
::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion),
std::make_tuple(false, true, test::kDefaultFormatVersion),
std::make_tuple(false, false,
test::kDefaultFormatVersion)));
::testing::Values(
std::make_tuple(BFP::kBlock, false, test::kDefaultFormatVersion),
std::make_tuple(BFP::kFull, true, test::kDefaultFormatVersion),
std::make_tuple(BFP::kFull, false, test::kDefaultFormatVersion)));
INSTANTIATE_TEST_CASE_P(
FormatLatest, DBBloomFilterTestWithParam,
::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion),
std::make_tuple(false, true, test::kLatestFormatVersion),
std::make_tuple(false, false,
test::kLatestFormatVersion)));
::testing::Values(
std::make_tuple(BFP::kBlock, false, test::kLatestFormatVersion),
std::make_tuple(BFP::kFull, true, test::kLatestFormatVersion),
std::make_tuple(BFP::kFull, false, test::kLatestFormatVersion)));
#endif // ROCKSDB_VALGRIND_RUN
TEST_F(DBBloomFilterTest, BloomFilterRate) {
@ -640,7 +643,7 @@ namespace {
class WrappedBloom : public FilterPolicy {
public:
explicit WrappedBloom(int bits_per_key)
: filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {}
: filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {}
~WrappedBloom() override { delete filter_; }
@ -858,11 +861,11 @@ TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
#ifndef ROCKSDB_LITE
class BloomStatsTestWithParam
: public DBBloomFilterTest,
public testing::WithParamInterface<std::tuple<bool, bool, bool>> {
public testing::WithParamInterface<std::tuple<bool, BFP::Impl, bool>> {
public:
BloomStatsTestWithParam() {
use_block_table_ = std::get<0>(GetParam());
use_block_based_builder_ = std::get<1>(GetParam());
bfp_impl_ = std::get<1>(GetParam());
partition_filters_ = std::get<2>(GetParam());
options_.create_if_missing = true;
@ -873,13 +876,12 @@ class BloomStatsTestWithParam
BlockBasedTableOptions table_options;
table_options.hash_index_allow_collision = false;
if (partition_filters_) {
assert(!use_block_based_builder_);
assert(bfp_impl_ != BFP::kBlock);
table_options.partition_filters = partition_filters_;
table_options.index_type =
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
}
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder_));
table_options.filter_policy.reset(new BFP(10, bfp_impl_));
options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
} else {
assert(!partition_filters_); // not supported in plain table
@ -902,7 +904,7 @@ class BloomStatsTestWithParam
static void TearDownTestCase() {}
bool use_block_table_;
bool use_block_based_builder_;
BFP::Impl bfp_impl_;
bool partition_filters_;
Options options_;
};
@ -1006,7 +1008,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
ASSERT_EQ(value3, iter->value().ToString());
// The seek doesn't check block-based bloom filter because last index key
// starts with the same prefix we're seeking to.
uint64_t expected_hits = use_block_based_builder_ ? 1 : 2;
uint64_t expected_hits = bfp_impl_ == BFP::kBlock ? 1 : 2;
ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
iter->Seek(key2);
@ -1016,12 +1018,12 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
}
INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
::testing::Values(std::make_tuple(true, true, false),
std::make_tuple(true, false, false),
std::make_tuple(true, false, true),
std::make_tuple(false, false,
false)));
INSTANTIATE_TEST_CASE_P(
BloomStatsTestWithParam, BloomStatsTestWithParam,
::testing::Values(std::make_tuple(true, BFP::kBlock, false),
std::make_tuple(true, BFP::kFull, false),
std::make_tuple(true, BFP::kFull, true),
std::make_tuple(false, BFP::kFull, false)));
namespace {
void PrefixScanInit(DBBloomFilterTest* dbtest) {
@ -1328,8 +1330,8 @@ int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
// into the same string, or 2) the transformed seek key is of the same length
// as the upper bound and two keys are adjacent according to the comparator.
TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
for (auto bfp_impl : BFP::kAllImpls) {
int using_full_builder = bfp_impl != BFP::kBlock;
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewCappedPrefixTransform(4));
@ -1338,8 +1340,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
table_options.filter_policy.reset(new BFP(10, bfp_impl));
table_options.index_shortening = BlockBasedTableOptions::
IndexShorteningMode::kShortenSeparatorsAndSuccessor;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@ -1382,7 +1383,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
// should check bloom filter since upper bound meets requirement
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration);
2 + using_full_builder);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
{
@ -1396,7 +1397,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
// should skip bloom filter since upper bound is too long
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration);
2 + using_full_builder);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
{
@ -1410,7 +1411,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
// should check bloom filter since upper bound matches transformed seek
// key
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
2 + using_full_builder * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
{
@ -1424,7 +1425,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
// should skip bloom filter since mismatch is found
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
2 + using_full_builder * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
@ -1438,7 +1439,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abc"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
2 + using_full_builder * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}}));
@ -1451,18 +1452,17 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abc"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
3 + iteration * 2);
3 + using_full_builder * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
}
iteration++;
}
}
// Create multiple SST files each with a different prefix_extractor config,
// verify iterators can read all SST files using the latest config.
TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
for (auto bfp_impl : BFP::kAllImpls) {
int using_full_builder = bfp_impl != BFP::kBlock;
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
@ -1470,8 +1470,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
options.statistics = CreateDBStatistics();
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
table_options.filter_policy.reset(new BFP(10, bfp_impl));
table_options.cache_index_and_filter_blocks = true;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
@ -1497,10 +1496,10 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "foo"), 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
1 + iteration);
1 + using_full_builder);
ASSERT_EQ(CountIter(iter, "gpk"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
1 + iteration);
1 + using_full_builder);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
// second SST with capped:3 BF
@ -1514,13 +1513,13 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
2 + using_full_builder * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
// both counters are incremented because BF is "not changed" for 1 of the
// 2 SST files, so filter is checked once and found no match.
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
3 + iteration * 2);
3 + using_full_builder * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
}
@ -1539,24 +1538,24 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
// the first and last BF are checked
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
4 + iteration * 3);
4 + using_full_builder * 3);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
// only last BF is checked and not found
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
5 + iteration * 3);
5 + using_full_builder * 3);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
}
// iter_old can only see the first SST, so checked plus 1
ASSERT_EQ(CountIter(iter_old, "foo"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
6 + iteration * 3);
6 + using_full_builder * 3);
// iter was created after the first setoptions call so only full filter
// will check the filter
ASSERT_EQ(CountIter(iter, "foo"), 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
6 + iteration * 4);
6 + using_full_builder * 4);
{
// keys in all three SSTs are visible to iterator
@ -1565,11 +1564,11 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_all, "foo"), 9);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
7 + iteration * 5);
7 + using_full_builder * 5);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
8 + iteration * 5);
8 + using_full_builder * 5);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
@ -1581,15 +1580,14 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
// all three SST are checked because the current options has the same as
// the remaining SST (capped:3)
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
9 + iteration * 7);
9 + using_full_builder * 7);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
10 + iteration * 7);
10 + using_full_builder * 7);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
}
// TODO(Zhongyi): Maybe also need to add Get calls to test point look up?
iteration++;
}
}
@ -1598,7 +1596,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
// as expected
TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
for (auto bfp_impl : BFP::kAllImpls) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
@ -1607,8 +1605,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
table_options.filter_policy.reset(new BFP(10, bfp_impl));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
ReadOptions read_options;
@ -1657,8 +1654,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
// Verify it's possible to change prefix_extractor at runtime and iterators
// behaves as expected
TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
for (auto bfp_impl : BFP::kAllImpls) {
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
@ -1667,8 +1663,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
table_options.filter_policy.reset(new BFP(10, bfp_impl));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
@ -1719,7 +1714,6 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
ASSERT_EQ(CountIter(iter_old, "abc"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
iteration++;
}
}

@ -138,8 +138,8 @@ class FilterPolicy {
//
// bits_per_key: bits per key in bloom filter. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
// use_block_based_builder: use block based filter rather than full filter.
// If you want to builder full filter, it needs to be set to false.
// use_block_based_builder: use deprecated block based filter (true) rather
// than full or partitioned filter (false).
//
// Callers must delete the result after any database that is using the
// result has been closed.

@ -240,7 +240,7 @@ class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester,
public testing::Test {
public:
BlockBasedFilterBlockTest()
: mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10)) {}
: mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {}
};
TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {

@ -20,9 +20,66 @@
namespace rocksdb {
namespace {
typedef LegacyLocalityBloomImpl</*ExtraRotates*/ false> LegacyFullFilterImpl;
class BlockBasedFilterBlockBuilder;
class FullFilterBlockBuilder;
class FullFilterBitsBuilder : public BuiltinFilterBitsBuilder {
public:
explicit FullFilterBitsBuilder(const int bits_per_key, const int num_probes);
// No Copy allowed
FullFilterBitsBuilder(const FullFilterBitsBuilder&) = delete;
void operator=(const FullFilterBitsBuilder&) = delete;
~FullFilterBitsBuilder() override;
void AddKey(const Slice& key) override;
// Create a filter that for hashes [0, n-1], the filter is allocated here
// When creating filter, it is ensured that
// total_bits = num_lines * CACHE_LINE_SIZE * 8
// dst len is >= 5, 1 for num_probes, 4 for num_lines
// Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
// +----------------------------------------------------------------+
// | filter data with length total_bits/8 |
// +----------------------------------------------------------------+
// | |
// | ... |
// | |
// +----------------------------------------------------------------+
// | ... | num_probes : 1 byte | num_lines : 4 bytes |
// +----------------------------------------------------------------+
Slice Finish(std::unique_ptr<const char[]>* buf) override;
int CalculateNumEntry(const uint32_t bytes) override;
uint32_t CalculateSpace(const int num_entry) override {
uint32_t dont_care1;
uint32_t dont_care2;
return CalculateSpace(num_entry, &dont_care1, &dont_care2);
}
private:
friend class FullFilterBlockTest_DuplicateEntries_Test;
int bits_per_key_;
int num_probes_;
std::vector<uint32_t> hash_entries_;
// Get totalbits that optimized for cpu cache line
uint32_t GetTotalBitsForLocality(uint32_t total_bits);
// Reserve space for new filter
char* ReserveSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
// Implementation-specific variant of public CalculateSpace
uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
// Assuming single threaded access to this function.
void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
};
FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key,
const int num_probes)
@ -30,35 +87,35 @@ FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key,
assert(bits_per_key_);
}
FullFilterBitsBuilder::~FullFilterBitsBuilder() {}
FullFilterBitsBuilder::~FullFilterBitsBuilder() {}
void FullFilterBitsBuilder::AddKey(const Slice& key) {
uint32_t hash = BloomHash(key);
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
hash_entries_.push_back(hash);
}
void FullFilterBitsBuilder::AddKey(const Slice& key) {
uint32_t hash = BloomHash(key);
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
hash_entries_.push_back(hash);
}
}
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
uint32_t total_bits, num_lines;
char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
&total_bits, &num_lines);
assert(data);
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
uint32_t total_bits, num_lines;
char* data = ReserveSpace(static_cast<int>(hash_entries_.size()), &total_bits,
&num_lines);
assert(data);
if (total_bits != 0 && num_lines != 0) {
for (auto h : hash_entries_) {
AddHash(h, data, num_lines, total_bits);
}
if (total_bits != 0 && num_lines != 0) {
for (auto h : hash_entries_) {
AddHash(h, data, num_lines, total_bits);
}
data[total_bits/8] = static_cast<char>(num_probes_);
EncodeFixed32(data + total_bits/8 + 1, static_cast<uint32_t>(num_lines));
}
data[total_bits / 8] = static_cast<char>(num_probes_);
EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines));
const char* const_data = data;
buf->reset(const_data);
hash_entries_.clear();
const char* const_data = data;
buf->reset(const_data);
hash_entries_.clear();
return Slice(data, total_bits / 8 + 5);
}
return Slice(data, total_bits / 8 + 5);
}
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
uint32_t num_lines =
@ -106,13 +163,11 @@ char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t bytes) {
assert(bits_per_key_);
assert(bytes > 0);
uint32_t dont_care1, dont_care2;
int high = static_cast<int>(bytes * 8 / bits_per_key_ + 1);
int low = 1;
int n = high;
for (; n >= low; n--) {
uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
if (sz <= bytes) {
if (CalculateSpace(n) <= bytes) {
break;
}
}
@ -131,7 +186,6 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
folly::constexpr_log2(CACHE_LINE_SIZE));
}
namespace {
class AlwaysTrueFilter : public FilterBitsReader {
public:
bool MayMatch(const Slice&) override { return true; }
@ -196,148 +250,148 @@ class FullFilterBitsReader : public FilterBitsReader {
const uint32_t log2_cache_line_size_;
};
} // namespace
// An implementation of filter policy
class BloomFilterPolicy : public FilterPolicy {
public:
explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder)
: bits_per_key_(bits_per_key),
use_block_based_builder_(use_block_based_builder) {
initialize();
}
~BloomFilterPolicy() override {}
const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; }
const std::vector<BloomFilterPolicy::Impl> BloomFilterPolicy::kAllImpls = {
kFull,
kBlock,
};
void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
// Compute bloom filter size (in both bits and bytes)
uint32_t bits = static_cast<uint32_t>(n * bits_per_key_);
BloomFilterPolicy::BloomFilterPolicy(int bits_per_key, Impl impl)
: bits_per_key_(bits_per_key), impl_(impl) {
// We intentionally round down to reduce probing cost a little bit
num_probes_ = static_cast<int>(bits_per_key_ * 0.69); // 0.69 =~ ln(2)
if (num_probes_ < 1) num_probes_ = 1;
if (num_probes_ > 30) num_probes_ = 30;
}
// For small n, we can see a very high false positive rate. Fix it
// by enforcing a minimum bloom filter length.
if (bits < 64) bits = 64;
BloomFilterPolicy::~BloomFilterPolicy() {}
uint32_t bytes = (bits + 7) / 8;
bits = bytes * 8;
const char* BloomFilterPolicy::Name() const {
return "rocksdb.BuiltinBloomFilter";
}
const size_t init_size = dst->size();
dst->resize(init_size + bytes, 0);
dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes
char* array = &(*dst)[init_size];
for (int i = 0; i < n; i++) {
LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes_,
array);
}
void BloomFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We should ideally only be using this deprecated interface for
// appropriately constructed BloomFilterPolicy
assert(impl_ == kBlock);
// Compute bloom filter size (in both bits and bytes)
uint32_t bits = static_cast<uint32_t>(n * bits_per_key_);
// For small n, we can see a very high false positive rate. Fix it
// by enforcing a minimum bloom filter length.
if (bits < 64) bits = 64;
uint32_t bytes = (bits + 7) / 8;
bits = bytes * 8;
const size_t init_size = dst->size();
dst->resize(init_size + bytes, 0);
dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes
char* array = &(*dst)[init_size];
for (int i = 0; i < n; i++) {
LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes_,
array);
}
}
bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override {
const size_t len = bloom_filter.size();
if (len < 2 || len > 0xffffffffU) {
return false;
}
bool BloomFilterPolicy::KeyMayMatch(const Slice& key,
const Slice& bloom_filter) const {
const size_t len = bloom_filter.size();
if (len < 2 || len > 0xffffffffU) {
return false;
}
const char* array = bloom_filter.data();
const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;
const char* array = bloom_filter.data();
const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;
// Use the encoded k so that we can read filters generated by
// bloom filters created using different parameters.
const int k = static_cast<uint8_t>(array[len - 1]);
if (k > 30) {
// Reserved for potentially new encodings for short bloom filters.
// Consider it a match.
return true;
}
// NB: using k not num_probes_
return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k,
array);
// Use the encoded k so that we can read filters generated by
// bloom filters created using different parameters.
const int k = static_cast<uint8_t>(array[len - 1]);
if (k > 30) {
// Reserved for potentially new encodings for short bloom filters.
// Consider it a match.
return true;
}
// NB: using k not num_probes_
return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k,
array);
}
FilterBitsBuilder* GetFilterBitsBuilder() const override {
if (use_block_based_builder_) {
return nullptr;
}
FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const {
if (impl_ == kBlock) {
return nullptr;
} else {
return new FullFilterBitsBuilder(bits_per_key_, num_probes_);
}
}
// Read metadata to determine what kind of FilterBitsReader is needed
// and return a new one.
FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
if (len_with_meta <= 5) {
// filter is empty or broken. Treat like zero keys added.
return new AlwaysFalseFilter();
}
// Read metadata to determine what kind of FilterBitsReader is needed
// and return a new one.
FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader(
const Slice& contents) const {
uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
if (len_with_meta <= 5) {
// filter is empty or broken. Treat like zero keys added.
return new AlwaysFalseFilter();
}
char raw_num_probes = contents.data()[len_with_meta - 5];
// NB: *num_probes > 30 and < 128 probably have not been used, because of
// BloomFilterPolicy::initialize, unless directly calling
// FullFilterBitsBuilder as an API, but we are leaving those cases in
// limbo with FullFilterBitsReader for now.
char raw_num_probes = contents.data()[len_with_meta - 5];
// NB: *num_probes > 30 and < 128 probably have not been used, because of
// BloomFilterPolicy::initialize, unless directly calling
// FullFilterBitsBuilder as an API, but we are leaving those cases in
// limbo with FullFilterBitsReader for now.
if (raw_num_probes < 1) {
// Treat as zero probes (always FP) for now.
// NB: < 0 (or unsigned > 127) effectively reserved for future use.
return new AlwaysTrueFilter();
}
// else attempt decode for FullFilterBitsReader
if (raw_num_probes < 1) {
// Treat as zero probes (always FP) for now.
// NB: < 0 (or unsigned > 127) effectively reserved for future use.
return new AlwaysTrueFilter();
}
// else attempt decode for FullFilterBitsReader
int num_probes = raw_num_probes;
assert(num_probes >= 1);
assert(num_probes <= 127);
int num_probes = raw_num_probes;
assert(num_probes >= 1);
assert(num_probes <= 127);
uint32_t len = len_with_meta - 5;
assert(len > 0);
uint32_t len = len_with_meta - 5;
assert(len > 0);
uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
uint32_t log2_cache_line_size;
uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
uint32_t log2_cache_line_size;
if (num_lines * CACHE_LINE_SIZE == len) {
// Common case
log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE);
} else if (num_lines == 0 || len % num_lines != 0) {
// Invalid (no solution to num_lines * x == len)
if (num_lines * CACHE_LINE_SIZE == len) {
// Common case
log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE);
} else if (num_lines == 0 || len % num_lines != 0) {
// Invalid (no solution to num_lines * x == len)
// Treat as zero probes (always FP) for now.
return new AlwaysTrueFilter();
} else {
// Determine the non-native cache line size (from another system)
log2_cache_line_size = 0;
while ((num_lines << log2_cache_line_size) < len) {
++log2_cache_line_size;
}
if ((num_lines << log2_cache_line_size) != len) {
// Invalid (block size not a power of two)
// Treat as zero probes (always FP) for now.
return new AlwaysTrueFilter();
} else {
// Determine the non-native cache line size (from another system)
log2_cache_line_size = 0;
while ((num_lines << log2_cache_line_size) < len) {
++log2_cache_line_size;
}
if ((num_lines << log2_cache_line_size) != len) {
// Invalid (block size not a power of two)
// Treat as zero probes (always FP) for now.
return new AlwaysTrueFilter();
}
}
// if not early return
return new FullFilterBitsReader(contents.data(), num_probes, num_lines,
log2_cache_line_size);
}
// If choose to use block based builder
bool UseBlockBasedBuilder() { return use_block_based_builder_; }
private:
int bits_per_key_;
int num_probes_;
const bool use_block_based_builder_;
void initialize() {
// We intentionally round down to reduce probing cost a little bit
num_probes_ = static_cast<int>(bits_per_key_ * 0.69); // 0.69 =~ ln(2)
if (num_probes_ < 1) num_probes_ = 1;
if (num_probes_ > 30) num_probes_ = 30;
}
};
} // namespace
// if not early return
return new FullFilterBitsReader(contents.data(), num_probes, num_lines,
log2_cache_line_size);
}
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
bool use_block_based_builder) {
return new BloomFilterPolicy(bits_per_key, use_block_based_builder);
if (use_block_based_builder) {
return new BloomFilterPolicy(bits_per_key, BloomFilterPolicy::kBlock);
} else {
return new BloomFilterPolicy(bits_per_key, BloomFilterPolicy::kFull);
}
}
FilterPolicy::~FilterPolicy() { }

@ -18,57 +18,61 @@ namespace rocksdb {
class Slice;
class FullFilterBitsBuilder : public FilterBitsBuilder {
// Exposes any extra information needed for testing built-in
// FilterBitsBuilders
class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
public:
explicit FullFilterBitsBuilder(const int bits_per_key, const int num_probes);
// No Copy allowed
FullFilterBitsBuilder(const FullFilterBitsBuilder&) = delete;
void operator=(const FullFilterBitsBuilder&) = delete;
~FullFilterBitsBuilder();
void AddKey(const Slice& key) override;
// Create a filter that for hashes [0, n-1], the filter is allocated here
// When creating filter, it is ensured that
// total_bits = num_lines * CACHE_LINE_SIZE * 8
// dst len is >= 5, 1 for num_probes, 4 for num_lines
// Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
// +----------------------------------------------------------------+
// | filter data with length total_bits/8 |
// +----------------------------------------------------------------+
// | |
// | ... |
// | |
// +----------------------------------------------------------------+
// | ... | num_probes : 1 byte | num_lines : 4 bytes |
// +----------------------------------------------------------------+
Slice Finish(std::unique_ptr<const char[]>* buf) override;
int CalculateNumEntry(const uint32_t bytes) override;
// Calculate number of bytes needed for a new filter, including
// metadata. Passing the result to CalculateNumEntry should
// return >= the num_entry passed in.
uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
virtual uint32_t CalculateSpace(const int num_entry) = 0;
};
private:
friend class FullFilterBlockTest_DuplicateEntries_Test;
int bits_per_key_;
int num_probes_;
std::vector<uint32_t> hash_entries_;
// RocksDB built-in filter policy for Bloom or Bloom-like filters.
// This class is considered internal API and subject to change.
// See NewBloomFilterPolicy.
class BloomFilterPolicy : public FilterPolicy {
public:
// An internal marker for which Bloom filter implementation to use.
// This makes it easier for tests to track or to walk over the built-in
// set of Bloom filter policies. The only variance in BloomFilterPolicy
// by implementation is in GetFilterBitsBuilder(), so an enum is practical
// here vs. subclasses.
enum Impl {
// Implementation of Bloom filter for full and partitioned filters.
// Set to 0 in case of value confusion with bool use_block_based_builder
kFull = 0,
// Deprecated block-based Bloom filter implementation.
// Set to 1 in case of value confusion with bool use_block_based_builder
kBlock = 1,
};
static const std::vector<Impl> kAllImpls;
explicit BloomFilterPolicy(int bits_per_key, Impl impl);
// Get totalbits that optimized for cpu cache line
uint32_t GetTotalBitsForLocality(uint32_t total_bits);
~BloomFilterPolicy() override;
// Reserve space for new filter
char* ReserveSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
const char* Name() const override;
// Assuming single threaded access to this function.
void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
// Deprecated block-based filter only
void CreateFilter(const Slice* keys, int n, std::string* dst) const override;
// Deprecated block-based filter only
bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override;
FilterBitsBuilder* GetFilterBitsBuilder() const override;
// Read metadata to determine what kind of FilterBitsReader is needed
// and return a new one. This must successfully process any filter data
// generated by a built-in FilterBitsBuilder, regardless of the impl
// chosen for this BloomFilterPolicy. Not compatible with CreateFilter.
FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
private:
int bits_per_key_;
int num_probes_;
// Selected implementation for building new SST filters
Impl impl_;
};
} // namespace rocksdb

@ -55,15 +55,15 @@ static int NextLength(int length) {
return length;
}
class BloomTest : public testing::Test {
class BlockBasedBloomTest : public testing::Test {
private:
std::unique_ptr<const FilterPolicy> policy_;
std::string filter_;
std::vector<std::string> keys_;
public:
BloomTest() : policy_(
NewBloomFilterPolicy(FLAGS_bits_per_key)) {}
BlockBasedBloomTest()
: policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, true)) {}
void Reset() {
keys_.clear();
@ -72,7 +72,7 @@ class BloomTest : public testing::Test {
void ResetPolicy(const FilterPolicy* policy = nullptr) {
if (policy == nullptr) {
policy_.reset(NewBloomFilterPolicy(FLAGS_bits_per_key));
policy_.reset(NewBloomFilterPolicy(FLAGS_bits_per_key, true));
} else {
policy_.reset(policy);
}
@ -131,12 +131,12 @@ class BloomTest : public testing::Test {
}
};
TEST_F(BloomTest, EmptyFilter) {
TEST_F(BlockBasedBloomTest, EmptyFilter) {
ASSERT_TRUE(! Matches("hello"));
ASSERT_TRUE(! Matches("world"));
}
TEST_F(BloomTest, Small) {
TEST_F(BlockBasedBloomTest, Small) {
Add("hello");
Add("world");
ASSERT_TRUE(Matches("hello"));
@ -145,7 +145,7 @@ TEST_F(BloomTest, Small) {
ASSERT_TRUE(! Matches("foo"));
}
TEST_F(BloomTest, VaryingLengths) {
TEST_F(BlockBasedBloomTest, VaryingLengths) {
char buffer[sizeof(int)];
// Count number of filters that significantly exceed the false positive rate
@ -186,45 +186,45 @@ TEST_F(BloomTest, VaryingLengths) {
// Ensure the implementation doesn't accidentally change in an
// incompatible way
TEST_F(BloomTest, Schema) {
TEST_F(BlockBasedBloomTest, Schema) {
char buffer[sizeof(int)];
ResetPolicy(NewBloomFilterPolicy(8)); // num_probes = 5
ResetPolicy(NewBloomFilterPolicy(8, true)); // num_probes = 5
for (int key = 0; key < 87; key++) {
Add(Key(key, buffer));
}
Build();
ASSERT_EQ(BloomHash(FilterData()), 3589896109U);
ResetPolicy(NewBloomFilterPolicy(9)); // num_probes = 6
ResetPolicy(NewBloomFilterPolicy(9, true)); // num_probes = 6
for (int key = 0; key < 87; key++) {
Add(Key(key, buffer));
}
Build();
ASSERT_EQ(BloomHash(FilterData()), 969445585);
ResetPolicy(NewBloomFilterPolicy(11)); // num_probes = 7
ResetPolicy(NewBloomFilterPolicy(11, true)); // num_probes = 7
for (int key = 0; key < 87; key++) {
Add(Key(key, buffer));
}
Build();
ASSERT_EQ(BloomHash(FilterData()), 1694458207);
ResetPolicy(NewBloomFilterPolicy(10)); // num_probes = 6
ResetPolicy(NewBloomFilterPolicy(10, true)); // num_probes = 6
for (int key = 0; key < 87; key++) {
Add(Key(key, buffer));
}
Build();
ASSERT_EQ(BloomHash(FilterData()), 2373646410U);
ResetPolicy(NewBloomFilterPolicy(10));
ResetPolicy(NewBloomFilterPolicy(10, true));
for (int key = 1; key < 87; key++) {
Add(Key(key, buffer));
}
Build();
ASSERT_EQ(BloomHash(FilterData()), 1908442116);
ResetPolicy(NewBloomFilterPolicy(10));
ResetPolicy(NewBloomFilterPolicy(10, true));
for (int key = 1; key < 88; key++) {
Add(Key(key, buffer));
}
@ -251,8 +251,9 @@ class FullBloomTest : public testing::Test {
Reset();
}
FullFilterBitsBuilder* GetFullFilterBitsBuilder() {
return dynamic_cast<FullFilterBitsBuilder*>(bits_builder_.get());
BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() {
// Throws on bad cast
return &dynamic_cast<BuiltinFilterBitsBuilder&>(*bits_builder_.get());
}
void Reset() {
@ -322,15 +323,12 @@ class FullBloomTest : public testing::Test {
};
TEST_F(FullBloomTest, FilterSize) {
uint32_t dont_care1, dont_care2;
auto full_bits_builder = GetFullFilterBitsBuilder();
ASSERT_TRUE(full_bits_builder != nullptr);
auto bits_builder = GetBuiltinFilterBitsBuilder();
for (int n = 1; n < 100; n++) {
auto space = full_bits_builder->CalculateSpace(n, &dont_care1, &dont_care2);
auto n2 = full_bits_builder->CalculateNumEntry(space);
auto space = bits_builder->CalculateSpace(n);
auto n2 = bits_builder->CalculateNumEntry(space);
ASSERT_GE(n2, n);
auto space2 =
full_bits_builder->CalculateSpace(n2, &dont_care1, &dont_care2);
auto space2 = bits_builder->CalculateSpace(n2);
ASSERT_EQ(space, space2);
}
}

Loading…
Cancel
Save