From 462796697cdebd41935bcd1d6028b64d67d14620 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 2 Jun 2014 16:52:29 -0700 Subject: [PATCH] dynamic_bloom: replace some divide (remainder) operations with shifts in locality mode, and other improvements Summary: This patch changes the meaning of options.bloom_locality: 0 means disable cache line optimization and any positive number means use CACHE_LINE_SIZE as block size (previously the block size was CACHE_LINE_SIZE*options.bloom_locality). By doing so, the divide operations inside a block can be replaced by a shift. Performance is improved: https://reviews.facebook.net/P471 Also, improve the basic algorithm in two ways: (1) make sure the number of blocks is an odd number (2) rotate the hash bits after every probe in locality mode. Since the divisor is 2^n, without this rotation we would never be able to use all the bits. Improvement in false positive rate: https://reviews.facebook.net/P459 Test Plan: make all check Reviewers: ljin, haobo Reviewed By: haobo Subscribers: dhruba, yhchiang, igor, leveldb Differential Revision: https://reviews.facebook.net/D18843 --- include/rocksdb/options.h | 9 +++------ util/dynamic_bloom.cc | 29 ++++++++++++++++++----------- util/dynamic_bloom.h | 30 +++++++++++++++++++----------- util/dynamic_bloom_test.cc | 35 ++++++++++++++++++----------------- 4 files changed, 58 insertions(+), 45 deletions(-) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 9ba6a522c..ded76a3ab 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -547,12 +547,9 @@ struct ColumnFamilyOptions { // Control locality of bloom filter probes to improve cache miss rate. // This option only applies to memtable prefix bloom and plaintable - // prefix bloom. It essentially limits the max number of cache lines each - // bloom filter check can touch. - // This optimization is turned off when set to 0. The number should never - // be greater than number of probes. 
This option can boost performance - // for in-memory workload but should use with care since it can cause - // higher false positive rate. + // prefix bloom. It essentially limits every bloom checking to one cache line. + // This optimization is turned off when set to 0, and positive number to turn + // it on. // Default: 0 uint32_t bloom_locality; diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 09ffe71ec..7173bbb93 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -17,34 +17,41 @@ namespace { static uint32_t BloomHash(const Slice& key) { return Hash(key.data(), key.size(), 0xbc9f1d34); } + +uint32_t GetNumBlocks(uint32_t total_bits) { + uint32_t num_blocks = (total_bits + CACHE_LINE_SIZE * 8 - 1) / + (CACHE_LINE_SIZE * 8) * (CACHE_LINE_SIZE * 8); + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. + if (num_blocks % 2 == 0) { + num_blocks++; + } + return num_blocks; +} } -DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block, +DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t locality, uint32_t num_probes, uint32_t (*hash_func)(const Slice& key), size_t huge_page_tlb_size, Logger* logger) - : kBlocked(cl_per_block > 0), - kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8), - kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock * - kBitsPerBlock - : total_bits + 7) / + : kTotalBits(((locality > 0) ? GetNumBlocks(total_bits) : total_bits + 7) / 8 * 8), - kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1), + kNumBlocks((locality > 0) ? kTotalBits / (CACHE_LINE_SIZE * 8) : 0), kNumProbes(num_probes), hash_func_(hash_func == nullptr ? &BloomHash : hash_func) { - assert(kBlocked ? 
kTotalBits > 0 : kTotalBits >= kBitsPerBlock); + assert(kNumBlocks > 0 || kTotalBits > 0); assert(kNumProbes > 0); uint32_t sz = kTotalBits / 8; - if (kBlocked) { + if (kNumBlocks > 0) { sz += CACHE_LINE_SIZE - 1; } raw_ = reinterpret_cast( arena_.AllocateAligned(sz, huge_page_tlb_size, logger)); memset(raw_, 0, sz); - if (kBlocked && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { + if (kNumBlocks > 0 && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { data_ = raw_ + CACHE_LINE_SIZE - - reinterpret_cast(raw_) % CACHE_LINE_SIZE; + reinterpret_cast(raw_) % CACHE_LINE_SIZE; } else { data_ = raw_; } diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 73476eb3b..e59134591 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -8,6 +8,7 @@ #include #include +#include "port/port.h" #include namespace rocksdb { @@ -19,15 +20,14 @@ class DynamicBloom { public: // total_bits: fixed total bits for the bloom // num_probes: number of hash probes for a single key - // cl_per_block: block size in cache lines. When this is non-zero, a - // query/set is done within a block to improve cache locality. + // locality: If positive, optimize for cache line locality, 0 otherwise. // hash_func: customized hash function // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB // withi this page size. 
Need to reserve huge pages for // it to be allocated, like: // sysctl -w vm.nr_hugepages=20 // See linux doc Documentation/vm/hugetlbpage.txt - explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0, + explicit DynamicBloom(uint32_t total_bits, uint32_t locality = 0, uint32_t num_probes = 6, uint32_t (*hash_func)(const Slice& key) = nullptr, size_t huge_page_tlb_size = 0, @@ -48,8 +48,6 @@ class DynamicBloom { bool MayContainHash(uint32_t hash); private: - const bool kBlocked; - const uint32_t kBitsPerBlock; const uint32_t kTotalBits; const uint32_t kNumBlocks; const uint32_t kNumProbes; @@ -69,13 +67,18 @@ inline bool DynamicBloom::MayContain(const Slice& key) { inline bool DynamicBloom::MayContainHash(uint32_t h) { const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - if (kBlocked) { - uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock; + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); for (uint32_t i = 0; i < kNumProbes; ++i) { - const uint32_t bitpos = b + h % kBitsPerBlock; + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { return false; } + // Rotate h so that we don't reuse the same bytes. 
+ h = h / (CACHE_LINE_SIZE * 8) + + (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE); h += delta; } } else { @@ -92,11 +95,16 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) { inline void DynamicBloom::AddHash(uint32_t h) { const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - if (kBlocked) { - uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock; + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); for (uint32_t i = 0; i < kNumProbes; ++i) { - const uint32_t bitpos = b + h % kBitsPerBlock; + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); data_[bitpos / 8] |= (1 << (bitpos % 8)); + // Rotate h so that we don't reuse the same bytes. + h = h / (CACHE_LINE_SIZE * 8) + + (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE); h += delta; } } else { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index b72ab24c5..d345addba 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -91,17 +91,16 @@ TEST(DynamicBloomTest, VaryingLengths) { fprintf(stderr, "bits_per_key: %d num_probes: %d\n", FLAGS_bits_per_key, num_probes); - for (uint32_t cl_per_block = 0; cl_per_block < num_probes; - ++cl_per_block) { + for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) { for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { uint32_t bloom_bits = 0; - if (cl_per_block == 0) { + if (enable_locality == 0) { bloom_bits = std::max(num * FLAGS_bits_per_key, 64U); } else { bloom_bits = std::max(num * FLAGS_bits_per_key, - cl_per_block * CACHE_LINE_SIZE * 8); + enable_locality * CACHE_LINE_SIZE * 8); } - DynamicBloom bloom(bloom_bits, cl_per_block, num_probes); + DynamicBloom bloom(bloom_bits, enable_locality, num_probes); for (uint64_t i = 0; i < num; i++) { bloom.Add(Key(i, buffer)); 
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); @@ -123,8 +122,10 @@ TEST(DynamicBloomTest, VaryingLengths) { } double rate = result / 10000.0; - fprintf(stderr, "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, " - "cl per block = %u\n", rate*100.0, num, bloom_bits, cl_per_block); + fprintf(stderr, + "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, " + "enable locality?%u\n", + rate * 100.0, num, bloom_bits, enable_locality); if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often @@ -173,20 +174,20 @@ TEST(DynamicBloomTest, perf) { elapsed / count); ASSERT_TRUE(count == num_keys); - for (uint32_t cl_per_block = 1; cl_per_block <= num_probes; - ++cl_per_block) { - DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes); + // Locality enabled version + DynamicBloom blocked_bloom(num_keys * 10, 1, num_probes); timer.Start(); for (uint64_t i = 1; i <= num_keys; ++i) { blocked_bloom.Add(Slice(reinterpret_cast(&i), 8)); } - uint64_t elapsed = timer.ElapsedNanos(); - fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n", - cl_per_block, elapsed / num_keys); + elapsed = timer.ElapsedNanos(); + fprintf(stderr, + "blocked bloom(enable locality), avg add latency %" PRIu64 "\n", + elapsed / num_keys); - uint64_t count = 0; + count = 0; timer.Start(); for (uint64_t i = 1; i <= num_keys; ++i) { if (blocked_bloom.MayContain( @@ -196,11 +197,11 @@ TEST(DynamicBloomTest, perf) { } elapsed = timer.ElapsedNanos(); - fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n", - cl_per_block, elapsed / count); + fprintf(stderr, + "blocked bloom(enable locality), avg query latency %" PRIu64 "\n", + elapsed / count); ASSERT_TRUE(count == num_keys); } - } } } // namespace rocksdb