diff --git a/HISTORY.md b/HISTORY.md
index d7a3f0c41..4a3d715a0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,6 +11,9 @@
 ### Public API Change
 * Added max_write_buffer_size_to_maintain option to better control memory usage of immutable memtables.
 * Added a lightweight API GetCurrentWalFile() to get last live WAL filename and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine.
+* The MemTable Bloom filter, when enabled, now always uses cache locality. Options::bloom_locality now only affects the PlainTable SST format.
+### Performance Improvements
+* Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance.
 
 ## 6.4.0 (7/30/2019)
 ### Default Option Change
diff --git a/db/memtable.cc b/db/memtable.cc
index 06cb2222e..21d3e347b 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -116,7 +116,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       moptions_.memtable_prefix_bloom_bits > 0) {
     bloom_filter_.reset(
         new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
-                         ioptions.bloom_locality, 6 /* hard coded 6 probes */,
+                         6 /* hard coded 6 probes */,
                          moptions_.memtable_huge_page_size, ioptions.info_log));
   }
 }
diff --git a/table/plain/plain_table_bloom.h b/table/plain/plain_table_bloom.h
index 08c72b2dc..b9248cdaf 100644
--- a/table/plain/plain_table_bloom.h
+++ b/table/plain/plain_table_bloom.h
@@ -19,6 +19,8 @@ class Slice;
 class Allocator;
 class Logger;
 
+// A legacy Bloom filter implementation used by Plain Table db format, for
+// schema backward compatibility. Not for use in new filter applications.
 class PlainTableBloomV1 {
  public:
   // allocator: pass allocator to bloom filter, hence trace the usage of memory
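For context on what the HISTORY.md entry and the db/memtable.cc change above affect from the user's side, here is a hedged sketch (not part of this diff) of how an application typically enables the memtable Bloom filter; the function name and option values are illustrative only.

```cpp
// Hedged usage sketch, not part of this change; values are illustrative.
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

void OpenWithMemtableBloom(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Key prefixes feed the memtable prefix Bloom filter.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  // A ratio > 0 sizes the filter relative to write_buffer_size and is what
  // leads MemTable (db/memtable.cc above) to construct a DynamicBloom.
  options.memtable_prefix_bloom_size_ratio = 0.02;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  if (s.ok()) {
    delete db;
  }
}
```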
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index e5210d1fb..3c153c719 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -16,44 +16,49 @@ namespace rocksdb {
 
 namespace {
 
-uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
-  uint32_t num_blocks =
-      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
-
-  // Make num_blocks an odd number to make sure more bits are involved
-  // when determining which block.
-  if (num_blocks % 2 == 0) {
-    num_blocks++;
+uint32_t roundUpToPow2(uint32_t x) {
+  uint32_t rv = 1;
+  while (rv < x) {
+    rv <<= 1;
   }
-
-  return num_blocks * (CACHE_LINE_SIZE * 8);
+  return rv;
 }
+
 }
 
 DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
-                           uint32_t locality, uint32_t num_probes,
+                           uint32_t num_probes,
                            size_t huge_page_tlb_size, Logger* logger)
-    : kNumProbes(num_probes) {
-  kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
-                              : (total_bits + 7) / 8 * 8;
-  kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+    // Round down, except round up with 1
+    : kNumDoubleProbes((num_probes + (num_probes == 1)) / 2) {
+  assert(num_probes % 2 == 0);  // limitation of current implementation
+  assert(num_probes <= 10);     // limitation of current implementation
+  assert(kNumDoubleProbes > 0);
 
-  assert(kNumBlocks > 0 || kTotalBits > 0);
-  assert(kNumProbes > 0);
+  // Determine how much to round off + align by so that x ^ i (that's xor) is
+  // a valid u64 index if x is a valid u64 index and 0 <= i < kNumDoubleProbes.
+  uint32_t block_bytes = /*bytes/u64*/ 8 *
+                         /*u64s*/ std::max(1U, roundUpToPow2(kNumDoubleProbes));
+  kLen = (total_bits + (/*bits/byte*/ 8 * block_bytes - 1)) /
+         /*bits/u64*/ 64;
+  assert(kLen > 0);
 
-  uint32_t sz = kTotalBits / 8;
-  if (kNumBlocks > 0) {
-    sz += CACHE_LINE_SIZE - 1;
-  }
+  uint32_t sz = kLen * /*bytes/u64*/ 8;
+  // Padding to correct for allocation not originally aligned on block_bytes
+  // boundary
+  sz += block_bytes - 1;
   assert(allocator);
 
   char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
   memset(raw, 0, sz);
-  auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
-  if (kNumBlocks > 0 && cache_line_offset > 0) {
-    raw += CACHE_LINE_SIZE - cache_line_offset;
+  auto block_offset = reinterpret_cast<uintptr_t>(raw) % block_bytes;
+  if (block_offset > 0) {
+    // Align on block_bytes boundary
+    raw += block_bytes - block_offset;
   }
-  data_ = reinterpret_cast<std::atomic<uint8_t>*>(raw);
+  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+                "Expecting zero-space-overhead atomic");
+  data_ = reinterpret_cast<std::atomic<uint64_t>*>(raw);
 }
 
 }  // rocksdb
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index 8b31f3c48..cf6ac4060 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -24,12 +24,20 @@ class Logger;
 // A Bloom filter intended only to be used in memory, never serialized in a way
 // that could lead to schema incompatibility. Supports opt-in lock-free
 // concurrent access.
+//
+// This implementation is also intended for applications generally preferring
+// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate.
+// For 1% FP rate, that means that the latency of a look-up triggered by an FP
+// should be less than roughly 100x the cost of a Bloom filter op.
+//
+// For simplicity and performance, the current implementation requires
+// num_probes to be a multiple of two and <= 10.
+//
 class DynamicBloom {
  public:
   // allocator: pass allocator to bloom filter, hence trace the usage of memory
   // total_bits: fixed total bits for the bloom
   // num_probes: number of hash probes for a single key
-  // locality: If positive, optimize for cache line locality, 0 otherwise.
   // hash_func: customized hash function
   // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
   //                     within this page size. Need to reserve huge pages for
@@ -37,7 +45,7 @@ class DynamicBloom {
   //                     sysctl -w vm.nr_hugepages=20
   //                     See linux doc Documentation/vm/hugetlbpage.txt
   explicit DynamicBloom(Allocator* allocator,
-                        uint32_t total_bits, uint32_t locality = 0,
+                        uint32_t total_bits,
                         uint32_t num_probes = 6,
                         size_t huge_page_tlb_size = 0,
                         Logger* logger = nullptr);
@@ -64,14 +72,15 @@ class DynamicBloom {
 
   void Prefetch(uint32_t h);
 
-  uint32_t GetNumBlocks() const { return kNumBlocks; }
-
  private:
-  uint32_t kTotalBits;
-  uint32_t kNumBlocks;
-  const uint32_t kNumProbes;
+  // Length of the structure, in 64-bit words. For this structure, "word"
+  // will always refer to 64-bit words.
+  uint32_t kLen;
+  // We make the k probes in pairs, two for each 64-bit read/write. Thus,
+  // this stores k/2, the number of words to double-probe.
+  const uint32_t kNumDoubleProbes;
 
-  std::atomic<uint8_t>* data_;
+  std::atomic<uint64_t>* data_;
 
   // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
   // concurrency safety, working with bytes.
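A worked example of the sizing and alignment math in the constructor above may help. The following standalone sketch (not RocksDB code) assumes the memtable default of `num_probes = 6` and mirrors `roundUpToPow2` and the `kLen` formula to show that all double-probes for one key touch a single aligned block of `block_bytes` contiguous bytes.

```cpp
// Standalone sketch of the DynamicBloom sizing math, assuming num_probes = 6.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t RoundUpToPow2(uint32_t x) {
  uint32_t rv = 1;
  while (rv < x) rv <<= 1;
  return rv;
}

int main() {
  const uint32_t num_probes = 6;
  const uint32_t num_double_probes = num_probes / 2;                           // 3
  const uint32_t block_u64s = std::max(1u, RoundUpToPow2(num_double_probes));  // 4
  const uint32_t block_bytes = 8 * block_u64s;                                 // 32

  const uint32_t total_bits = 1u << 20;  // e.g. a 128 KiB filter
  const uint32_t len = (total_bits + (8 * block_bytes - 1)) / 64;

  // block_u64s is a power of two and i < num_double_probes <= block_u64s, so
  // a ^ i never leaves the aligned block of block_u64s words that contains a:
  // all probes for one key stay within block_bytes (32) contiguous bytes.
  for (uint32_t a : {0u, 5u, len - 1}) {
    for (uint32_t i = 0; i < num_double_probes; ++i) {
      assert(a / block_u64s == (a ^ i) / block_u64s);
    }
  }
  printf("block_bytes=%u kLen=%u (64-bit words)\n", block_bytes, len);
  return 0;
}
```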
@@ -86,14 +95,14 @@ inline void DynamicBloom::AddConcurrently(const Slice& key) {
 }
 
 inline void DynamicBloom::AddHash(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
     ptr->store(ptr->load(std::memory_order_relaxed) | mask,
                std::memory_order_relaxed);
   });
 }
 
 inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
     // Happens-before between AddHash and MaybeContains is handled by
     // access to versions_->LastSequence(), so all we have to do here is
     // avoid races (so we don't give the compiler a license to mess up
@@ -114,67 +123,69 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
 // local variable is initialized but not referenced
 #pragma warning(disable : 4189)
 #endif
-inline void DynamicBloom::Prefetch(uint32_t h) {
-  if (kNumBlocks != 0) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
-    PREFETCH(&(data_[b / 8]), 0, 3);
-  }
+inline void DynamicBloom::Prefetch(uint32_t h32) {
+  size_t a = fastrange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
 }
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
 
-inline bool DynamicBloom::MayContainHash(uint32_t h) const {
-  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-  if (kNumBlocks != 0) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
-      // to a simple and operation by compiler.
-      const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
-      uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
-      if ((byteval & (1 << (bitpos % 8))) == 0) {
-        return false;
-      }
-      // Rotate h so that we don't reuse the same bytes.
-      h = h / (CACHE_LINE_SIZE * 8) +
-          (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
-      h += delta;
-    }
-  } else {
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      const uint32_t bitpos = h % kTotalBits;
-      uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
-      if ((byteval & (1 << (bitpos % 8))) == 0) {
-        return false;
-      }
-      h += delta;
+// Speed hacks in this implementation:
+// * Uses fastrange instead of %
+// * Minimum logic to determine first (and all) probed memory addresses.
+//   (Uses constant bit-xor offsets from the starting probe address.)
+// * (Major) Two probes per 64-bit memory fetch/write.
+//   Code simplification / optimization: only allow even number of probes.
+// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At
+//   least on recent CPUs, integer multiplication is very cheap. Each 64-bit
+//   remix provides five pairs of bit addresses within a uint64_t.)
+//   Code simplification / optimization: only allow up to 10 probes, from a
+//   single 64-bit remix.
+//
+// The FP rate penalty for this implementation, vs. standard Bloom filter, is
+// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom.
+// This implementation does not explicitly use the cache line size, but is
+// effectively cache-local (up to 16 probes) because of the bit-xor offsetting.
+//
+// NB: could easily be upgraded to support a 64-bit hash and
+// total_bits > 2^32 (512MB). (The latter is a bad idea without the former,
+// because of false positives.)
+
+inline bool DynamicBloom::MayContainHash(uint32_t h32) const {
+  size_t a = fastrange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+  // Expand/remix with 64-bit golden ratio
+  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
+  for (unsigned i = 0;; ++i) {
+    // Two bit probes per uint64_t probe
+    uint64_t mask = ((uint64_t)1 << (h & 63))
+                  | ((uint64_t)1 << ((h >> 6) & 63));
+    uint64_t val = data_[a ^ i].load(std::memory_order_relaxed);
+    if (i + 1 >= kNumDoubleProbes) {
+      return (val & mask) == mask;
+    } else if ((val & mask) != mask) {
+      return false;
     }
+    h = (h >> 12) | (h << 52);
   }
-  return true;
 }
 
 template <typename OrFunc>
-inline void DynamicBloom::AddHash(uint32_t h, const OrFunc& or_func) {
-  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-  if (kNumBlocks != 0) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
-      // to a simple and operation by compiler.
-      const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
-      or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
-      // Rotate h so that we don't reuse the same bytes.
-      h = h / (CACHE_LINE_SIZE * 8) +
-          (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
-      h += delta;
-    }
-  } else {
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      const uint32_t bitpos = h % kTotalBits;
-      or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
-      h += delta;
+inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) {
+  size_t a = fastrange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+  // Expand/remix with 64-bit golden ratio
+  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
+  for (unsigned i = 0;; ++i) {
+    // Two bit probes per uint64_t probe
+    uint64_t mask = ((uint64_t)1 << (h & 63))
+                  | ((uint64_t)1 << ((h >> 6) & 63));
+    or_func(&data_[a ^ i], mask);
+    if (i + 1 >= kNumDoubleProbes) {
+      return;
     }
+    h = (h >> 12) | (h << 52);
   }
 }
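To make the probing scheme in the new MayContainHash/AddHash easier to follow outside the class, here is a minimal, non-concurrent sketch (illustrative only, not the RocksDB implementation). The class name and fixed three double-probes (equivalent to num_probes = 6) are assumptions for the example; the hash is taken as already computed.

```cpp
// Illustrative sketch of the double-probe scheme; no atomics, no prefetch.
#include <cstdint>
#include <vector>

class ToyDoubleProbeBloom {
 public:
  // len_u64s must be a multiple of 4 so that (a ^ i) stays in bounds for i < 3.
  explicit ToyDoubleProbeBloom(uint32_t len_u64s) : data_(len_u64s, 0) {}

  void Add(uint32_t h32) { Probe(h32, /*add=*/true); }
  bool MayContain(uint32_t h32) { return Probe(h32, /*add=*/false); }

 private:
  bool Probe(uint32_t h32, bool add) {
    // fastrange: map h32 into [0, len) without %.
    size_t a = static_cast<size_t>(
        (static_cast<uint64_t>(data_.size()) * h32) >> 32);
    // Expand/remix the 32-bit hash with the 64-bit golden ratio constant.
    uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
    for (unsigned i = 0; i < 3; ++i) {  // three double-probes == 6 probes
      // Two bit positions per probed word.
      uint64_t mask =
          (uint64_t{1} << (h & 63)) | (uint64_t{1} << ((h >> 6) & 63));
      uint64_t& word = data_[a ^ i];  // xor stays in one aligned 4-word block
      if (add) {
        word |= mask;
      } else if ((word & mask) != mask) {
        return false;
      }
      h = (h >> 12) | (h << 52);  // rotate for fresh bit positions next round
    }
    return true;
  }

  std::vector<uint64_t> data_;
};
```

The real class layers PREFETCH, std::atomic accesses, and the configurable kNumDoubleProbes on top of this same skeleton.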
diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc
index 3f98ccd01..22741ed87 100644
--- a/util/dynamic_bloom_test.cc
+++ b/util/dynamic_bloom_test.cc
@@ -45,18 +45,18 @@ class DynamicBloomTest : public testing::Test {};
 
 TEST_F(DynamicBloomTest, EmptyFilter) {
   Arena arena;
-  DynamicBloom bloom1(&arena, 100, 0, 2);
+  DynamicBloom bloom1(&arena, 100, 2);
   ASSERT_TRUE(!bloom1.MayContain("hello"));
   ASSERT_TRUE(!bloom1.MayContain("world"));
 
-  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
   ASSERT_TRUE(!bloom2.MayContain("hello"));
   ASSERT_TRUE(!bloom2.MayContain("world"));
 }
 
 TEST_F(DynamicBloomTest, Small) {
   Arena arena;
-  DynamicBloom bloom1(&arena, 100, 0, 2);
+  DynamicBloom bloom1(&arena, 100, 2);
   bloom1.Add("hello");
   bloom1.Add("world");
   ASSERT_TRUE(bloom1.MayContain("hello"));
@@ -64,7 +64,7 @@ TEST_F(DynamicBloomTest, Small) {
   ASSERT_TRUE(!bloom1.MayContain("x"));
   ASSERT_TRUE(!bloom1.MayContain("foo"));
 
-  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
   bloom2.Add("hello");
   bloom2.Add("world");
   ASSERT_TRUE(bloom2.MayContain("hello"));
@@ -75,7 +75,7 @@
 TEST_F(DynamicBloomTest, SmallConcurrentAdd) {
   Arena arena;
-  DynamicBloom bloom1(&arena, 100, 0, 2);
+  DynamicBloom bloom1(&arena, 100, 2);
   bloom1.AddConcurrently("hello");
   bloom1.AddConcurrently("world");
   ASSERT_TRUE(bloom1.MayContain("hello"));
@@ -83,7 +83,7 @@ TEST_F(DynamicBloomTest, SmallConcurrentAdd) {
   ASSERT_TRUE(!bloom1.MayContain("x"));
   ASSERT_TRUE(!bloom1.MayContain("foo"));
 
-  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
   bloom2.AddConcurrently("hello");
   bloom2.AddConcurrently("world");
   ASSERT_TRUE(bloom2.MayContain("hello"));
@@ -116,53 +116,44 @@ TEST_F(DynamicBloomTest, VaryingLengths) {
   fprintf(stderr, "bits_per_key: %d num_probes: %d\n", FLAGS_bits_per_key,
           num_probes);
 
-  for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) {
-    for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
-      uint32_t bloom_bits = 0;
-      Arena arena;
-      if (enable_locality == 0) {
-        bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
-      } else {
-        bloom_bits = std::max(num * FLAGS_bits_per_key,
-                              enable_locality * CACHE_LINE_SIZE * 8);
-      }
-      DynamicBloom bloom(&arena, bloom_bits, enable_locality, num_probes);
-      for (uint64_t i = 0; i < num; i++) {
-        bloom.Add(Key(i, buffer));
-        ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
-      }
-
-      // All added keys must match
-      for (uint64_t i = 0; i < num; i++) {
-        ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) << "Num " << num
-                                                      << "; key " << i;
-      }
+  for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
+    uint32_t bloom_bits = 0;
+    Arena arena;
+    bloom_bits = num * FLAGS_bits_per_key;
+    DynamicBloom bloom(&arena, bloom_bits, num_probes);
+    for (uint64_t i = 0; i < num; i++) {
+      bloom.Add(Key(i, buffer));
+      ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
+    }
 
-      // Check false positive rate
+    // All added keys must match
+    for (uint64_t i = 0; i < num; i++) {
+      ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) << "Num " << num
+                                                    << "; key " << i;
+    }
 
-      int result = 0;
-      for (uint64_t i = 0; i < 10000; i++) {
-        if (bloom.MayContain(Key(i + 1000000000, buffer))) {
-          result++;
-        }
+    // Check false positive rate
+    int result = 0;
+    for (uint64_t i = 0; i < 10000; i++) {
+      if (bloom.MayContain(Key(i + 1000000000, buffer))) {
+        result++;
       }
-      double rate = result / 10000.0;
-
-      fprintf(stderr,
-              "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
-              "enable locality?%u\n",
-              rate * 100.0, num, bloom_bits, enable_locality);
-
-      if (rate > 0.0125)
-        mediocre_filters++;  // Allowed, but not too often
-      else
-        good_filters++;
     }
+    double rate = result / 10000.0;
 
-    fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
-            mediocre_filters);
-    ASSERT_LE(mediocre_filters, good_filters / 5);
+    fprintf(stderr,
+            "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u\n",
+            rate * 100.0, num, bloom_bits);
+
+    if (rate > 0.0125)
+      mediocre_filters++;  // Allowed, but not too often
+    else
+      good_filters++;
   }
+
+  fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
+          mediocre_filters);
+  ASSERT_LE(mediocre_filters, good_filters / 5);
 }
 
 TEST_F(DynamicBloomTest, perf) {
@@ -178,7 +169,7 @@ TEST_F(DynamicBloomTest, perf) {
     const uint32_t num_keys = m * 8 * 1024 * 1024;
     fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
 
-    DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes);
+    DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
 
     timer.Start();
     for (uint64_t i = 1; i <= num_keys; ++i) {
@@ -186,8 +177,8 @@ TEST_F(DynamicBloomTest, perf) {
     }
 
     uint64_t elapsed = timer.ElapsedNanos();
-    fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
-            elapsed / num_keys);
+    fprintf(stderr, "dynamic bloom, avg add latency %3g\n",
+            static_cast<double>(elapsed) / num_keys);
 
     uint32_t count = 0;
     timer.Start();
@@ -199,128 +190,99 @@
     ASSERT_EQ(count, num_keys);
 
     elapsed = timer.ElapsedNanos();
     assert(count > 0);
-    fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
-            elapsed / count);
-
-    // Locality enabled version
-    DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes);
-
-    timer.Start();
-    for (uint64_t i = 1; i <= num_keys; ++i) {
-      blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
-    }
-
-    elapsed = timer.ElapsedNanos();
-    fprintf(stderr,
-            "blocked bloom(enable locality), avg add latency %" PRIu64 "\n",
-            elapsed / num_keys);
-
-    count = 0;
-    timer.Start();
-    for (uint64_t i = 1; i <= num_keys; ++i) {
-      if (blocked_bloom.MayContain(
-              Slice(reinterpret_cast<const char*>(&i), 8))) {
-        ++count;
-      }
-    }
-
-    elapsed = timer.ElapsedNanos();
-    assert(count > 0);
-    fprintf(stderr,
-            "blocked bloom(enable locality), avg query latency %" PRIu64 "\n",
-            elapsed / count);
-    ASSERT_TRUE(count == num_keys);
+    fprintf(stderr, "dynamic bloom, avg query latency %3g\n",
+            static_cast<double>(elapsed) / count);
   }
 }
 
 TEST_F(DynamicBloomTest, concurrent_with_perf) {
-  StopWatchNano timer(Env::Default());
   uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
   uint32_t m_limit = FLAGS_enable_perf ? 8 : 1;
-  uint32_t locality_limit = FLAGS_enable_perf ? 1 : 0;
 
   uint32_t num_threads = 4;
   std::vector<std::thread> threads;
 
   for (uint32_t m = 1; m <= m_limit; ++m) {
-    for (uint32_t locality = 0; locality <= locality_limit; ++locality) {
-      Arena arena;
-      const uint32_t num_keys = m * 8 * 1024 * 1024;
-      fprintf(stderr, "testing %" PRIu32 "M keys with %" PRIu32 " locality\n",
-              m * 8, locality);
+    Arena arena;
+    const uint32_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
 
-      DynamicBloom std_bloom(&arena, num_keys * 10, locality, num_probes);
+    DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
 
-      timer.Start();
+    std::atomic<uint64_t> elapsed(0);
 
-      std::function<void(size_t)> adder([&](size_t t) {
-        for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
-          std_bloom.AddConcurrently(
-              Slice(reinterpret_cast<const char*>(&i), 8));
-        }
-      });
-      for (size_t t = 0; t < num_threads; ++t) {
-        threads.emplace_back(adder, t);
-      }
-      while (threads.size() > 0) {
-        threads.back().join();
-        threads.pop_back();
+    std::function<void(size_t)> adder([&](size_t t) {
+      StopWatchNano timer(Env::Default());
+      timer.Start();
+      for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+        std_bloom.AddConcurrently(
+            Slice(reinterpret_cast<const char*>(&i), 8));
       }
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(adder, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
    }
 
-      uint64_t elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "standard bloom, avg parallel add latency %" PRIu64
-              " nanos/key\n",
-              elapsed / num_keys);
+    fprintf(stderr, "dynamic bloom, avg parallel add latency %3g"
+            " nanos/key\n",
+            static_cast<double>(elapsed) / num_threads / num_keys);
 
+    elapsed = 0;
+    std::function<void(size_t)> hitter([&](size_t t) {
+      StopWatchNano timer(Env::Default());
       timer.Start();
-
-      std::function<void(size_t)> hitter([&](size_t t) {
-        for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
-          bool f =
-              std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
-          ASSERT_TRUE(f);
-        }
-      });
-      for (size_t t = 0; t < num_threads; ++t) {
-        threads.emplace_back(hitter, t);
-      }
-      while (threads.size() > 0) {
-        threads.back().join();
-        threads.pop_back();
+      for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+        bool f =
+            std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
+        ASSERT_TRUE(f);
      }
+      elapsed += timer.ElapsedNanos();
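The rewritten concurrent test above times each thread with its own StopWatchNano and sums the results into an atomic, so the reported figure is average per-operation latency across threads rather than the wall-clock time of the slowest thread. Here is a portable sketch of that pattern (std::chrono stands in for RocksDB's internal timer; the thread count, op count, and dummy workload are illustrative):

```cpp
// Per-thread timing pattern: sum each thread's elapsed time, divide by total ops.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  constexpr uint32_t kThreads = 4;
  constexpr uint64_t kOpsPerThread = 1000000;
  std::atomic<uint64_t> total_nanos(0);

  auto work = [&](uint32_t t) {
    auto start = std::chrono::steady_clock::now();
    volatile uint64_t sink = 0;  // stand-in for AddConcurrently / MayContain
    for (uint64_t i = t; i < kOpsPerThread * kThreads; i += kThreads) {
      sink = sink + i;
    }
    auto end = std::chrono::steady_clock::now();
    total_nanos +=
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
            .count();
  };

  std::vector<std::thread> threads;
  for (uint32_t t = 0; t < kThreads; ++t) {
    threads.emplace_back(work, t);
  }
  for (auto& th : threads) {
    th.join();
  }

  // Average per-op latency across threads, not wall-clock throughput.
  printf("avg parallel latency %g nanos/op\n",
         static_cast<double>(total_nanos.load()) / (kThreads * kOpsPerThread));
  return 0;
}
```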
     });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(hitter, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
+    }
 
-      elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "standard bloom, avg parallel hit latency %" PRIu64
-              " nanos/key\n",
-              elapsed / num_keys);
+    fprintf(stderr, "dynamic bloom, avg parallel hit latency %3g"
+            " nanos/key\n",
+            static_cast<double>(elapsed) / num_threads / num_keys);
 
+    elapsed = 0;
+    std::atomic<uint32_t> false_positives(0);
+    std::function<void(size_t)> misser([&](size_t t) {
+      StopWatchNano timer(Env::Default());
       timer.Start();
-
-      std::atomic<uint32_t> false_positives(0);
-      std::function<void(size_t)> misser([&](size_t t) {
-        for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys;
-             i += num_threads) {
-          bool f =
-              std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
-          if (f) {
-            ++false_positives;
-          }
+      for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys;
+           i += num_threads) {
+        bool f =
+            std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
+        if (f) {
+          ++false_positives;
        }
-      });
-      for (size_t t = 0; t < num_threads; ++t) {
-        threads.emplace_back(misser, t);
-      }
-      while (threads.size() > 0) {
-        threads.back().join();
-        threads.pop_back();
      }
-
-      elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "standard bloom, avg parallel miss latency %" PRIu64
-              " nanos/key, %f%% false positive rate\n",
-              elapsed / num_keys, false_positives.load() * 100.0 / num_keys);
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(misser, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
    }
+
+    fprintf(stderr, "dynamic bloom, avg parallel miss latency %3g"
+            " nanos/key, %f%% false positive rate\n",
+            static_cast<double>(elapsed) / num_threads / num_keys,
+            false_positives.load() * 100.0 / num_keys);
   }
 }
diff --git a/util/hash.h b/util/hash.h
index ed42b0894..836f325ef 100644
--- a/util/hash.h
+++ b/util/hash.h
@@ -49,4 +49,12 @@ struct SliceHasher {
   uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
 };
 
+// An alternative to % for mapping a hash value to an arbitrary range. See
+// https://github.com/lemire/fastrange and
+// https://github.com/pdillinger/wormhashing/blob/2c4035a4462194bf15f3e9fc180c27c513335225/bloom_simulation_tests/foo.cc#L57
+inline uint32_t fastrange32(uint32_t a, uint32_t h) {
+  uint64_t product = static_cast<uint64_t>(a) * h;
+  return static_cast<uint32_t>(product >> 32);
+}
+
 }  // namespace rocksdb
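A quick standalone check of fastrange32 (same function body as above, repeated so the snippet is self-contained): the result is floor(a * h / 2^32), i.e. h is treated as a fixed-point fraction in [0, 1) and scaled into [0, a), replacing the modulo with a single multiply and shift.

```cpp
// Sanity checks for the fastrange32 mapping; len is an arbitrary example range.
#include <cassert>
#include <cstdint>

inline uint32_t fastrange32(uint32_t a, uint32_t h) {
  uint64_t product = static_cast<uint64_t>(a) * h;
  return static_cast<uint32_t>(product >> 32);
}

int main() {
  const uint32_t len = 1000;
  assert(fastrange32(len, 0) == 0);
  assert(fastrange32(len, 0xFFFFFFFFu) == len - 1);  // top of the hash range
  assert(fastrange32(len, 0x80000000u) == len / 2);  // midpoint maps to len/2
  for (uint32_t h = 0; h < 100000; h += 17) {
    assert(fastrange32(len, h) < len);               // always within [0, len)
  }
  return 0;
}
```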