From 25403c2265cb700462d59fa3cb9dbec85d25d48f Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Thu, 28 Jun 2018 13:02:55 -0700
Subject: [PATCH] Prefetch cache lines for filter lookup (#4068)

Summary:
Since the filter data is unaligned, even though we ensure all probes are
within a span of `cache_line_size` bytes, those bytes can span two cache
lines. In that case I doubt hardware prefetching does a great job considering
we don't necessarily access those two cache lines in order. This guess seems
correct since adding explicit prefetch instructions reduced filter lookup
overhead by 19.4%.
Closes https://github.com/facebook/rocksdb/pull/4068

Differential Revision: D8674189

Pulled By: ajkr

fbshipit-source-id: 747427d9a17900151c17820488e3f7efe06b1871
---
 util/bloom.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/util/bloom.cc b/util/bloom.cc
index 542d4fe7d..51de07953 100644
--- a/util/bloom.cc
+++ b/util/bloom.cc
@@ -228,6 +228,8 @@ bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash,
   uint32_t h = hash;
   const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
   uint32_t b = (h % num_lines) * (cache_line_size * 8);
+  PREFETCH(&data[b / 8], 0 /* rw */, 1 /* locality */);
+  PREFETCH(&data[b / 8 + cache_line_size - 1], 0 /* rw */, 1 /* locality */);
   for (uint32_t i = 0; i < num_probes; ++i) {
     // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
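
Note (not part of the patch): a minimal standalone sketch of the idea, assuming
PREFETCH maps to GCC/Clang's __builtin_prefetch as in RocksDB's port layer.
Because the filter data is not aligned to cache_line_size, a probe span of
cache_line_size bytes may straddle two hardware cache lines; touching the
span's first and last byte prefetches both. PrefetchProbeSpan is a hypothetical
helper used only for illustration.

  #include <cstddef>

  #ifndef PREFETCH
  #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
  #endif

  // Prefetch the span [data + byte_offset, data + byte_offset + cache_line_size)
  // before probing it. Prefetching both ends covers both cache lines the
  // unaligned span may straddle.
  inline void PrefetchProbeSpan(const char* data, size_t byte_offset,
                                size_t cache_line_size) {
    PREFETCH(&data[byte_offset], 0 /* read */, 1 /* low temporal locality */);
    PREFETCH(&data[byte_offset + cache_line_size - 1], 0 /* read */,
             1 /* low temporal locality */);
  }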