diff --git a/HISTORY.md b/HISTORY.md
index d7a3f0c41..4a3d715a0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,6 +11,9 @@
 ### Public API Change
 * Added max_write_buffer_size_to_maintain option to better control memory usage of immutable memtables.
 * Added a lightweight API GetCurrentWalFile() to get last live WAL filename and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine.
+* The MemTable Bloom filter, when enabled, now always uses cache locality. Options::bloom_locality now only affects the PlainTable SST format.
+### Performance Improvements
+* Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance.
 
 ## 6.4.0 (7/30/2019)
 ### Default Option Change
diff --git a/db/memtable.cc b/db/memtable.cc
index 06cb2222e..21d3e347b 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -116,7 +116,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       moptions_.memtable_prefix_bloom_bits > 0) {
     bloom_filter_.reset(
         new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
-                         ioptions.bloom_locality, 6 /* hard coded 6 probes */,
+                         6 /* hard coded 6 probes */,
                          moptions_.memtable_huge_page_size, ioptions.info_log));
   }
 }
diff --git a/table/plain/plain_table_bloom.h b/table/plain/plain_table_bloom.h
index 08c72b2dc..b9248cdaf 100644
--- a/table/plain/plain_table_bloom.h
+++ b/table/plain/plain_table_bloom.h
@@ -19,6 +19,8 @@ class Slice;
 class Allocator;
 class Logger;
 
+// A legacy Bloom filter implementation used by Plain Table db format, for
+// schema backward compatibility. Not for use in new filter applications.
 class PlainTableBloomV1 {
  public:
   // allocator: pass allocator to bloom filter, hence trace the usage of memory
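For context on what the HISTORY.md entry and the db/memtable.cc change above affect from the user's side, here is a hedged sketch (not part of this diff) of how an application typically enables the memtable Bloom filter; the function name and option values are illustrative only.

```cpp
// Hedged usage sketch, not part of this change; values are illustrative.
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

void OpenWithMemtableBloom(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Key prefixes feed the memtable prefix Bloom filter.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  // A ratio > 0 sizes the filter relative to write_buffer_size and is what
  // leads MemTable (db/memtable.cc above) to construct a DynamicBloom.
  options.memtable_prefix_bloom_size_ratio = 0.02;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  if (s.ok()) {
    delete db;
  }
}
```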
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index e5210d1fb..3c153c719 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -16,44 +16,49 @@ namespace rocksdb {
 
 namespace {
 
-uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
-  uint32_t num_blocks =
-      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
-
-  // Make num_blocks an odd number to make sure more bits are involved
-  // when determining which block.
-  if (num_blocks % 2 == 0) {
-    num_blocks++;
+uint32_t roundUpToPow2(uint32_t x) {
+  uint32_t rv = 1;
+  while (rv < x) {
+    rv <<= 1;
   }
-
-  return num_blocks * (CACHE_LINE_SIZE * 8);
+  return rv;
 }
+
 }
 
 DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
-                           uint32_t locality, uint32_t num_probes,
+                           uint32_t num_probes,
                            size_t huge_page_tlb_size, Logger* logger)
-    : kNumProbes(num_probes) {
-  kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
-                              : (total_bits + 7) / 8 * 8;
-  kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+    // Round down, except round up with 1
+    : kNumDoubleProbes((num_probes + (num_probes == 1)) / 2) {
+  assert(num_probes % 2 == 0);  // limitation of current implementation
+  assert(num_probes <= 10);     // limitation of current implementation
+  assert(kNumDoubleProbes > 0);
 
-  assert(kNumBlocks > 0 || kTotalBits > 0);
-  assert(kNumProbes > 0);
+  // Determine how much to round off + align by so that x ^ i (that's xor) is
+  // a valid u64 index if x is a valid u64 index and 0 <= i < kNumDoubleProbes.
+  uint32_t block_bytes = /*bytes/u64*/ 8 *
+                         /*u64s*/ std::max(1U, roundUpToPow2(kNumDoubleProbes));
+  kLen = (total_bits + (/*bits/byte*/ 8 * block_bytes - 1)) /
+         /*bits/u64*/ 64;
+  assert(kLen > 0);
 
-  uint32_t sz = kTotalBits / 8;
-  if (kNumBlocks > 0) {
-    sz += CACHE_LINE_SIZE - 1;
-  }
+  uint32_t sz = kLen * /*bytes/u64*/ 8;
+  // Padding to correct for allocation not originally aligned on block_bytes
+  // boundary
+  sz += block_bytes - 1;
   assert(allocator);
 
   char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
   memset(raw, 0, sz);
-  auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
-  if (kNumBlocks > 0 && cache_line_offset > 0) {
-    raw += CACHE_LINE_SIZE - cache_line_offset;
+  auto block_offset = reinterpret_cast<uintptr_t>(raw) % block_bytes;
+  if (block_offset > 0) {
+    // Align on block_bytes boundary
+    raw += block_bytes - block_offset;
   }
-  data_ = reinterpret_cast<std::atomic<uint8_t>*>(raw);
+  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+                "Expecting zero-space-overhead atomic");
+  data_ = reinterpret_cast<std::atomic<uint64_t>*>(raw);
 }
 
 }  // rocksdb
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index 8b31f3c48..cf6ac4060 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -24,12 +24,20 @@ class Logger;
 // A Bloom filter intended only to be used in memory, never serialized in a way
 // that could lead to schema incompatibility. Supports opt-in lock-free
 // concurrent access.
+//
+// This implementation is also intended for applications generally preferring
+// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate.
+// For 1% FP rate, that means that the latency of a look-up triggered by an FP
+// should be less than roughly 100x the cost of a Bloom filter op.
+//
+// For simplicity and performance, the current implementation requires
+// num_probes to be a multiple of two and <= 10.
+//
 class DynamicBloom {
  public:
   // allocator: pass allocator to bloom filter, hence trace the usage of memory
   // total_bits: fixed total bits for the bloom
   // num_probes: number of hash probes for a single key
-  // locality: If positive, optimize for cache line locality, 0 otherwise.
   // hash_func: customized hash function
   // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
   //                     within this page size. Need to reserve huge pages for
@@ -37,7 +45,7 @@ class DynamicBloom {
   //                     sysctl -w vm.nr_hugepages=20
   //                     See linux doc Documentation/vm/hugetlbpage.txt
   explicit DynamicBloom(Allocator* allocator,
-                        uint32_t total_bits, uint32_t locality = 0,
+                        uint32_t total_bits,
                         uint32_t num_probes = 6,
                         size_t huge_page_tlb_size = 0,
                         Logger* logger = nullptr);
@@ -64,14 +72,15 @@ class DynamicBloom {
 
   void Prefetch(uint32_t h);
 
-  uint32_t GetNumBlocks() const { return kNumBlocks; }
-
  private:
-  uint32_t kTotalBits;
-  uint32_t kNumBlocks;
-  const uint32_t kNumProbes;
+  // Length of the structure, in 64-bit words. For this structure, "word"
+  // will always refer to 64-bit words.
+  uint32_t kLen;
+  // We make the k probes in pairs, two for each 64-bit read/write. Thus,
+  // this stores k/2, the number of words to double-probe.
+  const uint32_t kNumDoubleProbes;
 
-  std::atomic<uint8_t>* data_;
+  std::atomic<uint64_t>* data_;
 
   // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
   // concurrency safety, working with bytes.
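A worked example of the sizing and alignment math in the constructor above may help. The following standalone sketch (not RocksDB code) assumes the memtable default of `num_probes = 6` and mirrors `roundUpToPow2` and the `kLen` formula to show that all double-probes for one key touch a single aligned block of `block_bytes` contiguous bytes.

```cpp
// Standalone sketch of the DynamicBloom sizing math, assuming num_probes = 6.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t RoundUpToPow2(uint32_t x) {
  uint32_t rv = 1;
  while (rv < x) rv <<= 1;
  return rv;
}

int main() {
  const uint32_t num_probes = 6;
  const uint32_t num_double_probes = num_probes / 2;                           // 3
  const uint32_t block_u64s = std::max(1u, RoundUpToPow2(num_double_probes));  // 4
  const uint32_t block_bytes = 8 * block_u64s;                                 // 32

  const uint32_t total_bits = 1u << 20;  // e.g. a 128 KiB filter
  const uint32_t len = (total_bits + (8 * block_bytes - 1)) / 64;

  // block_u64s is a power of two and i < num_double_probes <= block_u64s, so
  // a ^ i never leaves the aligned block of block_u64s words that contains a:
  // all probes for one key stay within block_bytes (32) contiguous bytes.
  for (uint32_t a : {0u, 5u, len - 1}) {
    for (uint32_t i = 0; i < num_double_probes; ++i) {
      assert(a / block_u64s == (a ^ i) / block_u64s);
    }
  }
  printf("block_bytes=%u kLen=%u (64-bit words)\n", block_bytes, len);
  return 0;
}
```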
@@ -86,14 +95,14 @@ inline void DynamicBloom::AddConcurrently(const Slice& key) {
 }
 
 inline void DynamicBloom::AddHash(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
     ptr->store(ptr->load(std::memory_order_relaxed) | mask,
                std::memory_order_relaxed);
   });
 }
 
 inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
     // Happens-before between AddHash and MaybeContains is handled by
     // access to versions_->LastSequence(), so all we have to do here is
     // avoid races (so we don't give the compiler a license to mess up
@@ -114,67 +123,69 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
 // local variable is initialized but not referenced
 #pragma warning(disable : 4189)
 #endif
-inline void DynamicBloom::Prefetch(uint32_t h) {
-  if (kNumBlocks != 0) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
-    PREFETCH(&(data_[b / 8]), 0, 3);
-  }
+inline void DynamicBloom::Prefetch(uint32_t h32) {
+  size_t a = fastrange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
 }
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
 
-inline bool DynamicBloom::MayContainHash(uint32_t h) const {
-  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-  if (kNumBlocks != 0) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
-      // to a simple and operation by compiler.
-      const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
-      uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
-      if ((byteval & (1 << (bitpos % 8))) == 0) {
-        return false;
-      }
-      // Rotate h so that we don't reuse the same bytes.
-      h = h / (CACHE_LINE_SIZE * 8) +
-          (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
-      h += delta;
-    }
-  } else {
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      const uint32_t bitpos = h % kTotalBits;
-      uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
-      if ((byteval & (1 << (bitpos % 8))) == 0) {
-        return false;
-      }
-      h += delta;
+// Speed hacks in this implementation:
+// * Uses fastrange instead of %
+// * Minimum logic to determine first (and all) probed memory addresses.
+//   (Uses constant bit-xor offsets from the starting probe address.)
+// * (Major) Two probes per 64-bit memory fetch/write.
+//   Code simplification / optimization: only allow even number of probes.
+// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At
+//   least on recent CPUs, integer multiplication is very cheap. Each 64-bit
+//   remix provides five pairs of bit addresses within a uint64_t.)
+//   Code simplification / optimization: only allow up to 10 probes, from a
+//   single 64-bit remix.
+//
+// The FP rate penalty for this implementation, vs. standard Bloom filter, is
+// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom.
+// This implementation does not explicitly use the cache line size, but is
+// effectively cache-local (up to 16 probes) because of the bit-xor offsetting.
+//
+// NB: could easily be upgraded to support a 64-bit hash and
+// total_bits > 2^32 (512MB). (The latter is a bad idea without the former,
+// because of false positives.)
+
+inline bool DynamicBloom::MayContainHash(uint32_t h32) const {
+  size_t a = fastrange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+  // Expand/remix with 64-bit golden ratio
+  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
+  for (unsigned i = 0;; ++i) {
+    // Two bit probes per uint64_t probe
+    uint64_t mask = ((uint64_t)1 << (h & 63))
+                  | ((uint64_t)1 << ((h >> 6) & 63));
+    uint64_t val = data_[a ^ i].load(std::memory_order_relaxed);
+    if (i + 1 >= kNumDoubleProbes) {
+      return (val & mask) == mask;
+    } else if ((val & mask) != mask) {
+      return false;
     }
+    h = (h >> 12) | (h << 52);
   }
-  return true;
 }
 
 template <typename OrFunc>
-inline void DynamicBloom::AddHash(uint32_t h, const OrFunc& or_func) {
-  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-  if (kNumBlocks != 0) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
-      // to a simple and operation by compiler.
-      const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
-      or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
-      // Rotate h so that we don't reuse the same bytes.
-      h = h / (CACHE_LINE_SIZE * 8) +
-          (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
-      h += delta;
-    }
-  } else {
-    for (uint32_t i = 0; i < kNumProbes; ++i) {
-      const uint32_t bitpos = h % kTotalBits;
-      or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
-      h += delta;
+inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) {
+  size_t a = fastrange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+  // Expand/remix with 64-bit golden ratio
+  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
+  for (unsigned i = 0;; ++i) {
+    // Two bit probes per uint64_t probe
+    uint64_t mask = ((uint64_t)1 << (h & 63))
+                  | ((uint64_t)1 << ((h >> 6) & 63));
+    or_func(&data_[a ^ i], mask);
+    if (i + 1 >= kNumDoubleProbes) {
+      return;
     }
+    h = (h >> 12) | (h << 52);
   }
 }
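To make the probing scheme in the new MayContainHash/AddHash easier to follow outside the class, here is a minimal, non-concurrent sketch (illustrative only, not the RocksDB implementation). The class name and fixed three double-probes (equivalent to num_probes = 6) are assumptions for the example; the hash is taken as already computed.

```cpp
// Illustrative sketch of the double-probe scheme; no atomics, no prefetch.
#include <cstdint>
#include <vector>

class ToyDoubleProbeBloom {
 public:
  // len_u64s must be a multiple of 4 so that (a ^ i) stays in bounds for i < 3.
  explicit ToyDoubleProbeBloom(uint32_t len_u64s) : data_(len_u64s, 0) {}

  void Add(uint32_t h32) { Probe(h32, /*add=*/true); }
  bool MayContain(uint32_t h32) { return Probe(h32, /*add=*/false); }

 private:
  bool Probe(uint32_t h32, bool add) {
    // fastrange: map h32 into [0, len) without %.
    size_t a = static_cast<size_t>(
        (static_cast<uint64_t>(data_.size()) * h32) >> 32);
    // Expand/remix the 32-bit hash with the 64-bit golden ratio constant.
    uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
    for (unsigned i = 0; i < 3; ++i) {  // three double-probes == 6 probes
      // Two bit positions per probed word.
      uint64_t mask =
          (uint64_t{1} << (h & 63)) | (uint64_t{1} << ((h >> 6) & 63));
      uint64_t& word = data_[a ^ i];  // xor stays in one aligned 4-word block
      if (add) {
        word |= mask;
      } else if ((word & mask) != mask) {
        return false;
      }
      h = (h >> 12) | (h << 52);  // rotate for fresh bit positions next round
    }
    return true;
  }

  std::vector<uint64_t> data_;
};
```

The real class layers PREFETCH, std::atomic accesses, and the configurable kNumDoubleProbes on top of this same skeleton.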
diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc
index 3f98ccd01..22741ed87 100644
--- a/util/dynamic_bloom_test.cc
+++ b/util/dynamic_bloom_test.cc
@@ -45,18 +45,18 @@ class DynamicBloomTest : public testing::Test {};
 
 TEST_F(DynamicBloomTest, EmptyFilter) {
   Arena arena;
-  DynamicBloom bloom1(&arena, 100, 0, 2);
+  DynamicBloom bloom1(&arena, 100, 2);
   ASSERT_TRUE(!bloom1.MayContain("hello"));
   ASSERT_TRUE(!bloom1.MayContain("world"));
 
-  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
   ASSERT_TRUE(!bloom2.MayContain("hello"));
   ASSERT_TRUE(!bloom2.MayContain("world"));
 }
 
 TEST_F(DynamicBloomTest, Small) {
   Arena arena;
-  DynamicBloom bloom1(&arena, 100, 0, 2);
+  DynamicBloom bloom1(&arena, 100, 2);
   bloom1.Add("hello");
   bloom1.Add("world");
   ASSERT_TRUE(bloom1.MayContain("hello"));
@@ -64,7 +64,7 @@ TEST_F(DynamicBloomTest, Small) {
   ASSERT_TRUE(!bloom1.MayContain("x"));
   ASSERT_TRUE(!bloom1.MayContain("foo"));
 
-  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
   bloom2.Add("hello");
   bloom2.Add("world");
   ASSERT_TRUE(bloom2.MayContain("hello"));
@@ -75,7 +75,7 @@
 TEST_F(DynamicBloomTest, SmallConcurrentAdd) {
   Arena arena;
-  DynamicBloom bloom1(&arena, 100, 0, 2);
+  DynamicBloom bloom1(&arena, 100, 2);
   bloom1.AddConcurrently("hello");
   bloom1.AddConcurrently("world");
   ASSERT_TRUE(bloom1.MayContain("hello"));
@@ -83,7 +83,7 @@ TEST_F(DynamicBloomTest, SmallConcurrentAdd) {
   ASSERT_TRUE(!bloom1.MayContain("x"));
   ASSERT_TRUE(!bloom1.MayContain("foo"));
 
-  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
   bloom2.AddConcurrently("hello");
   bloom2.AddConcurrently("world");
   ASSERT_TRUE(bloom2.MayContain("hello"));
@@ -116,53 +116,44 @@ TEST_F(DynamicBloomTest, VaryingLengths) {
   fprintf(stderr, "bits_per_key: %d num_probes: %d\n", FLAGS_bits_per_key,
           num_probes);
 
-  for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) {
-    for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
-      uint32_t bloom_bits = 0;
-      Arena arena;
-      if (enable_locality == 0) {
-        bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
-      } else {
-        bloom_bits = std::max(num * FLAGS_bits_per_key,
-                              enable_locality * CACHE_LINE_SIZE * 8);
-      }
-      DynamicBloom bloom(&arena, bloom_bits, enable_locality, num_probes);
-      for (uint64_t i = 0; i < num; i++) {
-        bloom.Add(Key(i, buffer));
-        ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
-      }
-
-      // All added keys must match
-      for (uint64_t i = 0; i < num; i++) {
-        ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) << "Num " << num
-                                                      << "; key " << i;
-      }
+  for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
+    uint32_t bloom_bits = 0;
+    Arena arena;
+    bloom_bits = num * FLAGS_bits_per_key;
+    DynamicBloom bloom(&arena, bloom_bits, num_probes);
+    for (uint64_t i = 0; i < num; i++) {
+      bloom.Add(Key(i, buffer));
+      ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
+    }
 
-      // Check false positive rate
+    // All added keys must match
+    for (uint64_t i = 0; i < num; i++) {
+      ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) << "Num " << num
+                                                    << "; key " << i;
+    }
 
-      int result = 0;
-      for (uint64_t i = 0; i < 10000; i++) {
-        if (bloom.MayContain(Key(i + 1000000000, buffer))) {
-          result++;
-        }
+    // Check false positive rate
+    int result = 0;
+    for (uint64_t i = 0; i < 10000; i++) {
+      if (bloom.MayContain(Key(i + 1000000000, buffer))) {
+        result++;
       }
-      double rate = result / 10000.0;
-
-      fprintf(stderr,
-              "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
-              "enable locality?%u\n",
-              rate * 100.0, num, bloom_bits, enable_locality);
-
-      if (rate > 0.0125)
-        mediocre_filters++;  // Allowed, but not too often
-      else
-        good_filters++;
     }
+    double rate = result / 10000.0;
 
-    fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
-            mediocre_filters);
-    ASSERT_LE(mediocre_filters, good_filters / 5);
+    fprintf(stderr,
+            "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u\n",
+            rate * 100.0, num, bloom_bits);
+
+    if (rate > 0.0125)
+      mediocre_filters++;  // Allowed, but not too often
+    else
+      good_filters++;
   }
+
+  fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
+          mediocre_filters);
+  ASSERT_LE(mediocre_filters, good_filters / 5);
 }
 
 TEST_F(DynamicBloomTest, perf) {
@@ -178,7 +169,7 @@ TEST_F(DynamicBloomTest, perf) {
     const uint32_t num_keys = m * 8 * 1024 * 1024;
     fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
 
-    DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes);
+    DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
 
     timer.Start();
     for (uint64_t i = 1; i <= num_keys; ++i) {
@@ -186,8 +177,8 @@ TEST_F(DynamicBloomTest, perf) {
     }
 
     uint64_t elapsed = timer.ElapsedNanos();
-    fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
-            elapsed / num_keys);
+    fprintf(stderr, "dynamic bloom, avg add latency %3g\n",
+            static_cast<double>(elapsed) / num_keys);
 
     uint32_t count = 0;
     timer.Start();
@@ -199,128 +190,99 @@
     ASSERT_EQ(count, num_keys);
 
     elapsed = timer.ElapsedNanos();
     assert(count > 0);
-    fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
-            elapsed / count);
-
-    // Locality enabled version
-    DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes);
-
-    timer.Start();
-    for (uint64_t i = 1; i <= num_keys; ++i) {
-      blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
-    }
-
-    elapsed = timer.ElapsedNanos();
-    fprintf(stderr,
-            "blocked bloom(enable locality), avg add latency %" PRIu64 "\n",
-            elapsed / num_keys);
-
-    count = 0;
-    timer.Start();
-    for (uint64_t i = 1; i <= num_keys; ++i) {
-      if (blocked_bloom.MayContain(
-              Slice(reinterpret_cast<const char*>(&i), 8))) {
-        ++count;
-      }
-    }
-
-    elapsed = timer.ElapsedNanos();
-    assert(count > 0);
-    fprintf(stderr,
-            "blocked bloom(enable locality), avg query latency %" PRIu64 "\n",
-            elapsed / count);
-    ASSERT_TRUE(count == num_keys);
+    fprintf(stderr, "dynamic bloom, avg query latency %3g\n",
+            static_cast<double>(elapsed) / count);
   }
 }
 
 TEST_F(DynamicBloomTest, concurrent_with_perf) {
-  StopWatchNano timer(Env::Default());
   uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
   uint32_t m_limit = FLAGS_enable_perf ? 8 : 1;
-  uint32_t locality_limit = FLAGS_enable_perf ? 1 : 0;
 
   uint32_t num_threads = 4;
   std::vector<std::thread> threads;
 
   for (uint32_t m = 1; m <= m_limit; ++m) {
-    for (uint32_t locality = 0; locality <= locality_limit; ++locality) {
-      Arena arena;
-      const uint32_t num_keys = m * 8 * 1024 * 1024;
-      fprintf(stderr, "testing %" PRIu32 "M keys with %" PRIu32 " locality\n",
-              m * 8, locality);
+    Arena arena;
+    const uint32_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
 
-      DynamicBloom std_bloom(&arena, num_keys * 10, locality, num_probes);
+    DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
 
-      timer.Start();
+    std::atomic<uint64_t> elapsed(0);
 
-      std::function<void(size_t)> adder([&](size_t t) {
-        for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
-          std_bloom.AddConcurrently(
-              Slice(reinterpret_cast<const char*>(&i), 8));
-        }
-      });
-      for (size_t t = 0; t < num_threads; ++t) {
-        threads.emplace_back(adder, t);
-      }
-      while (threads.size() > 0) {
-        threads.back().join();
-        threads.pop_back();
+    std::function<void(size_t)> adder([&](size_t t) {
+      StopWatchNano timer(Env::Default());
+      timer.Start();
+      for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+        std_bloom.AddConcurrently(
+            Slice(reinterpret_cast<const char*>(&i), 8));
       }
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(adder, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
    }
 
-      uint64_t elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "standard bloom, avg parallel add latency %" PRIu64
-              " nanos/key\n",
-              elapsed / num_keys);
+    fprintf(stderr, "dynamic bloom, avg parallel add latency %3g"
+            " nanos/key\n",
+            static_cast<double>(elapsed) / num_threads / num_keys);
 
+    elapsed = 0;
+    std::function<void(size_t)> hitter([&](size_t t) {
+      StopWatchNano timer(Env::Default());
       timer.Start();
-
-      std::function<void(size_t)> hitter([&](size_t t) {
-        for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
-          bool f =
-              std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
-          ASSERT_TRUE(f);
-        }
-      });
-      for (size_t t = 0; t < num_threads; ++t) {
-        threads.emplace_back(hitter, t);
-      }
-      while (threads.size() > 0) {
-        threads.back().join();
-        threads.pop_back();
+      for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+        bool f =
+            std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
+        ASSERT_TRUE(f);
      }
+      elapsed += timer.ElapsedNanos();
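The rewritten concurrent test above times each thread with its own StopWatchNano and sums the results into an atomic, so the reported figure is average per-operation latency across threads rather than the wall-clock time of the slowest thread. Here is a portable sketch of that pattern (std::chrono stands in for RocksDB's internal timer; the thread count, op count, and dummy workload are illustrative):

```cpp
// Per-thread timing pattern: sum each thread's elapsed time, divide by total ops.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  constexpr uint32_t kThreads = 4;
  constexpr uint64_t kOpsPerThread = 1000000;
  std::atomic<uint64_t> total_nanos(0);

  auto work = [&](uint32_t t) {
    auto start = std::chrono::steady_clock::now();
    volatile uint64_t sink = 0;  // stand-in for AddConcurrently / MayContain
    for (uint64_t i = t; i < kOpsPerThread * kThreads; i += kThreads) {
      sink = sink + i;
    }
    auto end = std::chrono::steady_clock::now();
    total_nanos +=
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
            .count();
  };

  std::vector<std::thread> threads;
  for (uint32_t t = 0; t < kThreads; ++t) {
    threads.emplace_back(work, t);
  }
  for (auto& th : threads) {
    th.join();
  }

  // Average per-op latency across threads, not wall-clock throughput.
  printf("avg parallel latency %g nanos/op\n",
         static_cast<double>(total_nanos.load()) / (kThreads * kOpsPerThread));
  return 0;
}
```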
     });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(hitter, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
+    }
 
-      elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "standard bloom, avg parallel hit latency %" PRIu64
-              " nanos/key\n",
-              elapsed / num_keys);
+    fprintf(stderr, "dynamic bloom, avg parallel hit latency %3g"
+            " nanos/key\n",
+            static_cast<double>(elapsed) / num_threads / num_keys);
 
+    elapsed = 0;
+    std::atomic<uint32_t> false_positives(0);
+    std::function<void(size_t)> misser([&](size_t t) {
+      StopWatchNano timer(Env::Default());
       timer.Start();
-
-      std::atomic<uint32_t> false_positives(0);
-      std::function<void(size_t)> misser([&](size_t t) {
-        for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys;
-             i += num_threads) {
-          bool f =
-              std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
-          if (f) {
-            ++false_positives;
-          }
+      for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys;
+           i += num_threads) {
+        bool f =
+            std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8));
+        if (f) {
+          ++false_positives;
        }
-      });
-      for (size_t t = 0; t < num_threads; ++t) {
-        threads.emplace_back(misser, t);
-      }
-      while (threads.size() > 0) {
-        threads.back().join();
-        threads.pop_back();
      }
-
-      elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "standard bloom, avg parallel miss latency %" PRIu64
-              " nanos/key, %f%% false positive rate\n",
-              elapsed / num_keys, false_positives.load() * 100.0 / num_keys);
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(misser, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
    }
+
+    fprintf(stderr, "dynamic bloom, avg parallel miss latency %3g"
+            " nanos/key, %f%% false positive rate\n",
+            static_cast<double>(elapsed) / num_threads / num_keys,
+            false_positives.load() * 100.0 / num_keys);
   }
 }
diff --git a/util/hash.h b/util/hash.h
index ed42b0894..836f325ef 100644
--- a/util/hash.h
+++ b/util/hash.h
@@ -49,4 +49,12 @@ struct SliceHasher {
   uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
 };
 
+// An alternative to % for mapping a hash value to an arbitrary range. See
+// https://github.com/lemire/fastrange and
+// https://github.com/pdillinger/wormhashing/blob/2c4035a4462194bf15f3e9fc180c27c513335225/bloom_simulation_tests/foo.cc#L57
+inline uint32_t fastrange32(uint32_t a, uint32_t h) {
+  uint64_t product = static_cast<uint64_t>(a) * h;
+  return static_cast<uint32_t>(product >> 32);
+}
+
 }  // namespace rocksdb
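A quick standalone check of fastrange32 (same function body as above, repeated so the snippet is self-contained): the result is floor(a * h / 2^32), i.e. h is treated as a fixed-point fraction in [0, 1) and scaled into [0, a), replacing the modulo with a single multiply and shift.

```cpp
// Sanity checks for the fastrange32 mapping; len is an arbitrary example range.
#include <cassert>
#include <cstdint>

inline uint32_t fastrange32(uint32_t a, uint32_t h) {
  uint64_t product = static_cast<uint64_t>(a) * h;
  return static_cast<uint32_t>(product >> 32);
}

int main() {
  const uint32_t len = 1000;
  assert(fastrange32(len, 0) == 0);
  assert(fastrange32(len, 0xFFFFFFFFu) == len - 1);  // top of the hash range
  assert(fastrange32(len, 0x80000000u) == len / 2);  // midpoint maps to len/2
  for (uint32_t h = 0; h < 100000; h += 17) {
    assert(fastrange32(len, h) < len);               // always within [0, len)
  }
  return 0;
}
```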