// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "rocksdb/filter_policy.h"
|
|
|
|
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "table/block_based/block_based_filter_block.h"
|
|
|
|
#include "table/block_based/full_filter_block.h"
|
|
|
|
#include "table/full_filter_bits_builder.h"
|
|
|
|
#include "third-party/folly/folly/ConstexprMath.h"
|
|
|
|
#include "util/bloom_impl.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/hash.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
typedef LegacyLocalityBloomImpl</*ExtraRotates*/ false> LegacyFullFilterImpl;
|
|
|
|
class BlockBasedFilterBlockBuilder;
|
|
|
|
class FullFilterBlockBuilder;
|
|
|
|
|
|
|
|
FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key,
                                             const int num_probes)
    : bits_per_key_(bits_per_key), num_probes_(num_probes) {
  assert(bits_per_key_);
}

FullFilterBitsBuilder::~FullFilterBitsBuilder() {}

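// Buffers the Bloom hash of the key for Finish(). A hash equal to the
// immediately preceding one is skipped, which cheaply deduplicates runs of
// identical keys or prefixes (keys arrive in sorted order, so duplicates
// are adjacent).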
void FullFilterBitsBuilder::AddKey(const Slice& key) {
  uint32_t hash = BloomHash(key);
  if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
    hash_entries_.push_back(hash);
  }
}

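// Lays out the buffered hashes as a cache-line-local Bloom filter followed
// by five bytes of metadata:
//   [ num_lines * CACHE_LINE_SIZE bytes of filter bits ]
//   [ 1 byte: num_probes ][ 4 bytes: num_lines, fixed32 little-endian ]
// BloomFilterPolicy::GetFilterBitsReader below decodes the same five bytes.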
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
  uint32_t total_bits, num_lines;
  char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
                            &total_bits, &num_lines);
  assert(data);

  if (total_bits != 0 && num_lines != 0) {
    for (auto h : hash_entries_) {
      AddHash(h, data, num_lines, total_bits);
    }
  }
  data[total_bits / 8] = static_cast<char>(num_probes_);
  EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines));

  const char* const_data = data;
  buf->reset(const_data);
  hash_entries_.clear();

  return Slice(data, total_bits / 8 + 5);
}

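// Rounds total_bits up to a whole, odd number of cache lines.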
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
  uint32_t num_lines =
      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);

  // Make num_lines an odd number to make sure more bits are involved
  // when determining which block.
  if (num_lines % 2 == 0) {
    num_lines++;
  }
  return num_lines * (CACHE_LINE_SIZE * 8);
}

uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry,
                                               uint32_t* total_bits,
                                               uint32_t* num_lines) {
  assert(bits_per_key_);
  if (num_entry != 0) {
    uint32_t total_bits_tmp = static_cast<uint32_t>(num_entry * bits_per_key_);

    *total_bits = GetTotalBitsForLocality(total_bits_tmp);
    *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
    assert(*total_bits > 0 && *total_bits % 8 == 0);
  } else {
    // filter is empty, just leave space for metadata
    *total_bits = 0;
    *num_lines = 0;
  }

  // Reserve space for Filter
  uint32_t sz = *total_bits / 8;
  sz += 5;  // 4 bytes for num_lines, 1 byte for num_probes
  return sz;
}

char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
                                          uint32_t* total_bits,
                                          uint32_t* num_lines) {
  uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
  char* data = new char[sz];
  memset(data, 0, sz);
  return data;
}

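// Inverse of CalculateSpace(): returns the largest number of entries whose
// filter fits in `space` bytes, found by searching downward from an
// overestimate until CalculateSpace() fits the budget.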
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
  assert(bits_per_key_);
  assert(space > 0);
  uint32_t dont_care1, dont_care2;
  int high = static_cast<int>(space * 8 / bits_per_key_ + 1);
  int low = 1;
  int n = high;
  for (; n >= low; n--) {
    uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
    if (sz <= space) {
      break;
    }
  }
  assert(n < high);  // High should be an overestimation
  return n;
}

inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
                                           uint32_t num_lines,
                                           uint32_t total_bits) {
#ifdef NDEBUG
  static_cast<void>(total_bits);
#endif
  assert(num_lines > 0 && total_bits > 0);

  LegacyFullFilterImpl::AddHash(h, num_lines, num_probes_, data,
                                folly::constexpr_log2(CACHE_LINE_SIZE));
}

namespace {

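// Trivial readers for degenerate or unrecognized filter metadata. Handling
// "always true" / "always false" in dedicated classes keeps
// FullFilterBitsReader::MayMatch free of metadata checks (see
// GetFilterBitsReader below).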
class AlwaysTrueFilter : public FilterBitsReader {
 public:
  bool MayMatch(const Slice&) override { return true; }
  using FilterBitsReader::MayMatch;  // inherit overload
};

class AlwaysFalseFilter : public FilterBitsReader {
 public:
  bool MayMatch(const Slice&) override { return false; }
  using FilterBitsReader::MayMatch;  // inherit overload
};

class FullFilterBitsReader : public FilterBitsReader {
 public:
  FullFilterBitsReader(const char* data, int num_probes, uint32_t num_lines,
                       uint32_t log2_cache_line_size)
      : data_(data),
        num_probes_(num_probes),
        num_lines_(num_lines),
        log2_cache_line_size_(log2_cache_line_size) {}

  // No Copy allowed
  FullFilterBitsReader(const FullFilterBitsReader&) = delete;
  void operator=(const FullFilterBitsReader&) = delete;

  ~FullFilterBitsReader() override {}

  // Checks the key against filter data built by a preceding call to
  // FilterBitsBuilder::Finish (and passed to the constructor). Must return
  // true if the key was passed to FilterBitsBuilder::AddKey; for other keys
  // it may return true or false, but should return false with high
  // probability.
  bool MayMatch(const Slice& key) override {
    uint32_t hash = BloomHash(key);
    uint32_t byte_offset;
    LegacyFullFilterImpl::PrepareHashMayMatch(
        hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
    return LegacyFullFilterImpl::HashMayMatchPrepared(
        hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
  }

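  // Batched MayMatch: the first pass hashes every key and prepares (and can
  // prefetch) its cache-line offset; the second pass probes. Splitting the
  // passes lets memory latency overlap across the keys in a MultiGet batch.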
  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    uint32_t hashes[MultiGetContext::MAX_BATCH_SIZE];
    uint32_t byte_offsets[MultiGetContext::MAX_BATCH_SIZE];
    for (int i = 0; i < num_keys; ++i) {
      hashes[i] = BloomHash(*keys[i]);
      LegacyFullFilterImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
                                                /*out*/ &byte_offsets[i],
                                                log2_cache_line_size_);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = LegacyFullFilterImpl::HashMayMatchPrepared(
          hashes[i], num_probes_, data_ + byte_offsets[i],
          log2_cache_line_size_);
    }
  }

 private:
  const char* data_;
  const int num_probes_;
  const uint32_t num_lines_;
  const uint32_t log2_cache_line_size_;
};

// An implementation of filter policy
class BloomFilterPolicy : public FilterPolicy {
 public:
  explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder)
      : bits_per_key_(bits_per_key),
        hash_func_(BloomHash),
        use_block_based_builder_(use_block_based_builder) {
    initialize();
  }

  ~BloomFilterPolicy() override {}

  const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; }

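  // Builds a legacy block-based filter appended to *dst:
  //   [ bits / 8 bytes of filter data ][ 1 byte: num_probes_ ]
  // The trailing byte lets KeyMayMatch() recover k from the filter itself,
  // so filters built with different parameters remain readable.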
  void CreateFilter(const Slice* keys, int n,
                    std::string* dst) const override {
    // Compute bloom filter size (in both bits and bytes)
    uint32_t bits = static_cast<uint32_t>(n * bits_per_key_);

    // For small n, we can see a very high false positive rate. Fix it
    // by enforcing a minimum bloom filter length.
    if (bits < 64) bits = 64;

    uint32_t bytes = (bits + 7) / 8;
    bits = bytes * 8;

    const size_t init_size = dst->size();
    dst->resize(init_size + bytes, 0);
    dst->push_back(static_cast<char>(num_probes_));  // Remember # of probes
    char* array = &(*dst)[init_size];
    for (int i = 0; i < n; i++) {
      LegacyNoLocalityBloomImpl::AddHash(hash_func_(keys[i]), bits,
                                         num_probes_, array);
    }
  }

  bool KeyMayMatch(const Slice& key,
                   const Slice& bloom_filter) const override {
    const size_t len = bloom_filter.size();
    if (len < 2 || len > 0xffffffffU) {
      return false;
    }

    const char* array = bloom_filter.data();
    const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;

    // Use the encoded k so that we can read filters generated by
    // BloomFilterPolicy instances created with different parameters.
    const int k = static_cast<uint8_t>(array[len - 1]);
    if (k > 30) {
      // Reserved for potentially new encodings for short bloom filters.
      // Consider it a match.
      return true;
    }
    // NB: using k not num_probes_
    return LegacyNoLocalityBloomImpl::HashMayMatch(hash_func_(key), bits, k,
                                                   array);
  }

  FilterBitsBuilder* GetFilterBitsBuilder() const override {
    if (use_block_based_builder_) {
      return nullptr;
    }

    return new FullFilterBitsBuilder(bits_per_key_, num_probes_);
  }

  // Read metadata to determine what kind of FilterBitsReader is needed
  // and return a new one.
  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
    uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
    if (len_with_meta <= 5) {
      // filter is empty or broken. Treat like zero keys added.
      return new AlwaysFalseFilter();
    }

    char raw_num_probes = contents.data()[len_with_meta - 5];
    // NB: *num_probes > 30 and < 128 probably have not been used, because of
    // BloomFilterPolicy::initialize, unless directly calling
    // FullFilterBitsBuilder as an API, but we are leaving those cases in
    // limbo with FullFilterBitsReader for now.

    if (raw_num_probes < 1) {
      // Treat as zero probes (always FP) for now.
      // NB: < 0 (or unsigned > 127) effectively reserved for future use.
      return new AlwaysTrueFilter();
    }
    // else attempt decode for FullFilterBitsReader

    int num_probes = raw_num_probes;
    assert(num_probes >= 1);
    assert(num_probes <= 127);

    uint32_t len = len_with_meta - 5;
    assert(len > 0);

    uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
    uint32_t log2_cache_line_size;

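    // The filter may have been built with a different CACHE_LINE_SIZE than
    // ours. Check the native size first; otherwise recover the line size as
    // len / num_lines and require it to be a power of two.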
    if (num_lines * CACHE_LINE_SIZE == len) {
      // Common case
      log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE);
    } else if (num_lines == 0 || len % num_lines != 0) {
      // Invalid (no solution to num_lines * x == len)
      // Treat as zero probes (always FP) for now.
      return new AlwaysTrueFilter();
    } else {
      // Determine the non-native cache line size (from another system)
      log2_cache_line_size = 0;
      while ((num_lines << log2_cache_line_size) < len) {
        ++log2_cache_line_size;
      }
      if ((num_lines << log2_cache_line_size) != len) {
        // Invalid (block size not a power of two)
        // Treat as zero probes (always FP) for now.
        return new AlwaysTrueFilter();
      }
    }
    // if not early return
    return new FullFilterBitsReader(contents.data(), num_probes, num_lines,
                                    log2_cache_line_size);
  }

  // Whether the block-based filter builder is in use
  bool UseBlockBasedBuilder() { return use_block_based_builder_; }

 private:
  int bits_per_key_;
  int num_probes_;
  uint32_t (*hash_func_)(const Slice& key);

  const bool use_block_based_builder_;

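  // Derives num_probes_ from bits_per_key_. For a Bloom filter with m bits
  // and n keys, k = (m / n) * ln(2) probes minimizes the false positive
  // rate, roughly (1 - e^(-kn/m))^k.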
  void initialize() {
    // We intentionally round down to reduce probing cost a little bit
    num_probes_ = static_cast<int>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
    if (num_probes_ < 1) num_probes_ = 1;
    if (num_probes_ > 30) num_probes_ = 30;
  }
};

}  // namespace

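// Example usage (a sketch; assumes the usual BlockBasedTableOptions setup):
//
//   BlockBasedTableOptions table_options;
//   table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));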
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
                                         bool use_block_based_builder) {
  return new BloomFilterPolicy(bits_per_key, use_block_based_builder);
}

} // namespace rocksdb