From 079e77ff9eb7e62bfde0745800efb74529f3a566 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Fri, 3 Apr 2020 10:24:09 -0700
Subject: [PATCH] Revamp cache_bench to resemble a real workload (#6629)

Summary:
I suspect LRUCache could use some optimization, and to support such an
effort, a good benchmarking tool is needed. The existing cache_bench was
heavily skewed toward insertion and lookup misses, and did not saturate
memory with other work. This change should improve those things to better
resemble a real workload. (All below using clang compiler, for some
consistency, but not necessarily same version and settings.)

The real workload is from production MySQL on RocksDB, filtering stacks
containing "LRU", "ShardedCache" or "CacheShard."
Lookup inclusive: 66%
Insert inclusive: 17%
Release inclusive: 15%

An alternate simulated workload is MySQL running a LinkBench read test:
Lookup inclusive: 54%
Insert inclusive: 24%
Release inclusive: 21%

cache_bench default settings, prior to this change:
Lookup inclusive: 35.8%
Insert inclusive: 63.6%
Release inclusive: 0%

cache_bench after this change (intended as somewhat "tighter" workload
than average production, more like LinkBench):
Lookup inclusive: 52%
Insert inclusive: 20%
Release inclusive: 26%

And top exclusive stacks (portion of stack samples as filtered above):

Production MySQL:
LRUHandleTable::FindPointer: 25.3%
rocksdb::operator==: 15.1% <-- Slice ==
LRUCacheShard::LRU_Remove: 13.8%
ShardedCache::Lookup: 8.9%
__pthread_mutex_lock: 7.1%
LRUCacheShard::LRU_Insert: 6.3%
MurmurHash64A: 4.8% <-- Since upgraded to XXH3p
...

Old cache_bench:
LRUHandleTable::FindPointer: 23.6%
__pthread_mutex_lock: 15.0%
__pthread_mutex_unlock_usercnt: 11.7%
__lll_lock_wait: 8.6%
__lll_unlock_wake: 6.8%
LRUCacheShard::LRU_Insert: 6.0%
ShardedCache::Lookup: 4.4%
LRUCacheShard::LRU_Remove: 2.8%
...
rocksdb::operator==: 0.2% <-- Slice ==
...

New cache_bench:
LRUHandleTable::FindPointer: 22.8%
__pthread_mutex_unlock_usercnt: 14.3%
rocksdb::operator==: 10.5% <-- Slice ==
LRUCacheShard::LRU_Insert: 9.0%
__pthread_mutex_lock: 5.9%
LRUCacheShard::LRU_Remove: 5.0%
...
ShardedCache::Lookup: 2.9%
...

So there's a bit more lock contention in the benchmark than in production,
but otherwise looks similar enough to me. At least it's a big improvement
over the existing code.
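As an aside for readers of the diff below: each benchmark operation is driven
by a single 64-bit random draw compared against cumulative thresholds, and key
selection is skewed by taking the minimum of several draws. The following is a
simplified, self-contained sketch of that scheme; the names here (PickOp,
PickKey, kHundredth) are illustrative and are not part of the patch, which
uses Random64, fastrange64, and per-class threshold members instead.

  // Simplified sketch, not the exact cache_bench code.
  #include <algorithm>
  #include <cstdint>
  #include <random>

  enum class Op { kLookupInsert, kInsert, kLookup, kErase };

  // One hundredth of the uint64_t range; the percentage flags become
  // cumulative thresholds, so one random draw selects the operation.
  constexpr uint64_t kHundredth = UINT64_MAX / 100;

  Op PickOp(uint64_t r, uint64_t lookup_insert_pct, uint64_t insert_pct,
            uint64_t lookup_pct) {
    if (r < kHundredth * lookup_insert_pct) return Op::kLookupInsert;
    if (r < kHundredth * (lookup_insert_pct + insert_pct)) return Op::kInsert;
    if (r < kHundredth * (lookup_insert_pct + insert_pct + lookup_pct)) {
      return Op::kLookup;
    }
    return Op::kErase;  // remainder of the range
  }

  // Skew: the minimum of (skew + 1) uniform draws is biased toward small
  // values, concentrating traffic on a "hot" prefix of the key space.
  uint64_t PickKey(std::mt19937_64& rng, uint64_t max_key, uint32_t skew) {
    uint64_t raw = rng();
    for (uint32_t i = 0; i < skew; ++i) raw = std::min<uint64_t>(raw, rng());
    return raw % max_key;  // the patch uses fastrange64(raw, max_key)
  }

cache_bench itself draws from Random64, maps the skewed draw with fastrange64
rather than a modulus, and encodes each key three times at a key-dependent
offset so that key length and alignment vary (see KeyGen::GetRand below).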
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6629

Test Plan: No production code changes, ran cache_bench with ASAN

Reviewed By: ltamasi

Differential Revision: D20824318

Pulled By: pdillinger

fbshipit-source-id: 6f8dc5891ead0f87edbed3a615ecd5289d9abe12
---
 cache/cache_bench.cc        | 230 ++++++++++++++++++++++++++----------
 db/error_handler_fs_test.cc |   6 +-
 util/work_queue.h           |   3 +-
 3 files changed, 169 insertions(+), 70 deletions(-)

diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc
index 6ff36a32d..163599546 100644
--- a/cache/cache_bench.cc
+++ b/cache/cache_bench.cc
@@ -14,34 +14,47 @@ int main() {
 #include
 #include
 #include
+#include
 
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "util/coding.h"
 #include "util/gflags_compat.h"
+#include "util/hash.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
 
 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
 
-static const uint32_t KB = 1024;
-
-DEFINE_int32(threads, 16, "Number of concurrent threads to run.");
-DEFINE_int64(cache_size, 8 * KB * KB,
-             "Number of bytes to use as a cache of uncompressed data.");
-DEFINE_int32(num_shard_bits, 4, "shard_bits.");
-
-DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache");
-DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
-
-DEFINE_bool(populate_cache, false, "Populate cache before operations");
-DEFINE_int32(insert_percent, 40,
-             "Ratio of insert to total workload (expressed as a percentage)");
-DEFINE_int32(lookup_percent, 50,
-             "Ratio of lookup to total workload (expressed as a percentage)");
-DEFINE_int32(erase_percent, 10,
-             "Ratio of erase to total workload (expressed as a percentage)");
+static constexpr uint32_t KiB = uint32_t{1} << 10;
+static constexpr uint32_t MiB = KiB << 10;
+static constexpr uint64_t GiB = MiB << 10;
+
+DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
+DEFINE_uint64(cache_size, 1 * GiB,
+              "Number of bytes to use as a cache of uncompressed data.");
+DEFINE_uint32(num_shard_bits, 6, "shard_bits.");
+
+DEFINE_double(resident_ratio, 0.25,
+              "Ratio of keys fitting in cache to keyspace.");
+DEFINE_uint64(ops_per_thread, 0,
+              "Number of operations per thread. (Default: 5 * keyspace size)");
+DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
+
+DEFINE_uint32(skew, 5, "Degree of skew in key selection");
+DEFINE_bool(populate_cache, true, "Populate cache before operations");
+
+DEFINE_uint32(lookup_insert_percent, 87,
+              "Ratio of lookup (+ insert on not found) to total workload "
+              "(expressed as a percentage)");
+DEFINE_uint32(insert_percent, 2,
+              "Ratio of insert to total workload (expressed as a percentage)");
+DEFINE_uint32(lookup_percent, 10,
+              "Ratio of lookup to total workload (expressed as a percentage)");
+DEFINE_uint32(erase_percent, 1,
+              "Ratio of erase to total workload (expressed as a percentage)");
 
 DEFINE_bool(use_clock_cache, false, "");
 
@@ -49,21 +62,15 @@ namespace ROCKSDB_NAMESPACE {
 
 class CacheBench;
 namespace {
 
-void deleter(const Slice& /*key*/, void* value) {
-  delete reinterpret_cast<char*>(value);
-}
-
 // State shared by all concurrent executions of the same benchmark.
 class SharedState {
  public:
   explicit SharedState(CacheBench* cache_bench)
       : cv_(&mu_),
-        num_threads_(FLAGS_threads),
         num_initialized_(0),
         start_(false),
         num_done_(0),
-        cache_bench_(cache_bench) {
-  }
+        cache_bench_(cache_bench) {}
 
   ~SharedState() {}
 
@@ -87,13 +94,9 @@ class SharedState {
     num_done_++;
   }
 
-  bool AllInitialized() const {
-    return num_initialized_ >= num_threads_;
-  }
+  bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; }
 
-  bool AllDone() const {
-    return num_done_ >= num_threads_;
-  }
+  bool AllDone() const { return num_done_ >= FLAGS_threads; }
 
   void SetStart() {
     start_ = true;
@@ -107,7 +110,6 @@ class SharedState {
   port::Mutex mu_;
   port::CondVar cv_;
 
-  const uint64_t num_threads_;
   uint64_t num_initialized_;
   bool start_;
   uint64_t num_done_;
@@ -118,17 +120,69 @@ class SharedState {
 // Per-thread state for concurrent executions of the same benchmark.
 struct ThreadState {
   uint32_t tid;
-  Random rnd;
+  Random64 rnd;
   SharedState* shared;
 
   ThreadState(uint32_t index, SharedState* _shared)
       : tid(index), rnd(1000 + index), shared(_shared) {}
 };
+
+struct KeyGen {
+  char key_data[27];
+
+  Slice GetRand(Random64& rnd, uint64_t max_key) {
+    uint64_t raw = rnd.Next();
+    // Skew according to setting
+    for (uint32_t i = 0; i < FLAGS_skew; ++i) {
+      raw = std::min(raw, rnd.Next());
+    }
+    uint64_t key = fastrange64(raw, max_key);
+    // Variable size and alignment
+    size_t off = key % 8;
+    key_data[0] = char{42};
+    EncodeFixed64(key_data + 1, key);
+    key_data[9] = char{11};
+    EncodeFixed64(key_data + 10, key);
+    key_data[18] = char{4};
+    EncodeFixed64(key_data + 19, key);
+    return Slice(&key_data[off], sizeof(key_data) - off);
+  }
+};
+
+char* createValue(Random64& rnd) {
+  char* rv = new char[FLAGS_value_bytes];
+  // Fill with some filler data, and take some CPU time
+  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
+    EncodeFixed64(rv + i, rnd.Next());
+  }
+  return rv;
+}
+
+void deleter(const Slice& /*key*/, void* value) {
+  delete[] static_cast<char*>(value);
+}
 }  // namespace
 
 class CacheBench {
+  static constexpr uint64_t kHundredthUint64 =
+      std::numeric_limits<uint64_t>::max() / 100U;
+
  public:
-  CacheBench() : num_threads_(FLAGS_threads) {
+  CacheBench()
+      : max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio /
+                                       FLAGS_value_bytes)),
+        lookup_insert_threshold_(kHundredthUint64 *
+                                 FLAGS_lookup_insert_percent),
+        insert_threshold_(lookup_insert_threshold_ +
+                          kHundredthUint64 * FLAGS_insert_percent),
+        lookup_threshold_(insert_threshold_ +
+                          kHundredthUint64 * FLAGS_lookup_percent),
+        erase_threshold_(lookup_threshold_ +
+                         kHundredthUint64 * FLAGS_erase_percent) {
+    if (erase_threshold_ != 100U * kHundredthUint64) {
+      fprintf(stderr, "Percentages must add to 100.\n");
+      exit(1);
+    }
     if (FLAGS_use_clock_cache) {
       cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits);
       if (!cache_) {
@@ -138,18 +192,19 @@ class CacheBench {
     } else {
       cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits);
     }
+    if (FLAGS_ops_per_thread == 0) {
+      FLAGS_ops_per_thread = 5 * max_key_;
+    }
   }
 
   ~CacheBench() {}
 
   void PopulateCache() {
-    Random rnd(1);
-    for (int64_t i = 0; i < FLAGS_cache_size; i++) {
-      uint64_t rand_key = rnd.Next() % FLAGS_max_key;
-      // Cast uint64* to be char*, data would be copied to cache
-      Slice key(reinterpret_cast<char*>(&rand_key), 8);
-      // do insert
-      cache_->Insert(key, new char[10], 1, &deleter);
+    Random64 rnd(1);
+    KeyGen keygen;
+    for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
+      cache_->Insert(keygen.GetRand(rnd, max_key_), createValue(rnd),
+                     FLAGS_value_bytes, &deleter);
     }
   }
 
@@ -158,10 +213,10 @@ class CacheBench {
 
     PrintEnv();
     SharedState shared(this);
-    std::vector<ThreadState*> threads(num_threads_);
-    for (uint32_t i = 0; i < num_threads_; i++) {
-      threads[i] = new ThreadState(i, &shared);
-      env->StartThread(ThreadBody, threads[i]);
+    std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
+    for (uint32_t i = 0; i < FLAGS_threads; i++) {
+      threads[i].reset(new ThreadState(i, &shared));
+      env->StartThread(ThreadBody, threads[i].get());
     }
     {
       MutexLock l(shared.GetMutex());
@@ -192,10 +247,15 @@ class CacheBench {
 
  private:
   std::shared_ptr<Cache> cache_;
-  uint32_t num_threads_;
+  const uint64_t max_key_;
+  // Cumulative thresholds in the space of a random uint64_t
+  const uint64_t lookup_insert_threshold_;
+  const uint64_t insert_threshold_;
+  const uint64_t lookup_threshold_;
+  const uint64_t erase_threshold_;
 
   static void ThreadBody(void* v) {
-    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    ThreadState* thread = static_cast<ThreadState*>(v);
     SharedState* shared = thread->shared;
 
     {
@@ -220,40 +280,78 @@ class CacheBench {
   }
 
   void OperateCache(ThreadState* thread) {
+    // To use looked-up values
+    uint64_t result = 0;
+    // To hold handles for a non-trivial amount of time
+    Cache::Handle* handle = nullptr;
+    KeyGen gen;
     for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
-      uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key;
-      // Cast uint64* to be char*, data would be copied to cache
-      Slice key(reinterpret_cast<char*>(&rand_key), 8);
-      int32_t prob_op = thread->rnd.Uniform(100);
-      if (prob_op >= 0 && prob_op < FLAGS_insert_percent) {
-        // do insert
-        cache_->Insert(key, new char[10], 1, &deleter);
-      } else if (prob_op -= FLAGS_insert_percent &&
-                 prob_op < FLAGS_lookup_percent) {
+      Slice key = gen.GetRand(thread->rnd, max_key_);
+      uint64_t random_op = thread->rnd.Next();
+      if (random_op < lookup_insert_threshold_) {
+        if (handle) {
+          cache_->Release(handle);
+          handle = nullptr;
+        }
         // do lookup
-        auto handle = cache_->Lookup(key);
+        handle = cache_->Lookup(key);
+        if (handle) {
+          // do something with the data
+          result += NPHash64(static_cast<char*>(cache_->Value(handle)),
+                             FLAGS_value_bytes);
+        } else {
+          // do insert
+          cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes,
+                         &deleter, &handle);
+        }
+      } else if (random_op < insert_threshold_) {
         if (handle) {
           cache_->Release(handle);
+          handle = nullptr;
         }
-      } else if (prob_op -= FLAGS_lookup_percent &&
-                 prob_op < FLAGS_erase_percent) {
+        // do insert
+        cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes,
+                       &deleter, &handle);
+      } else if (random_op < lookup_threshold_) {
+        if (handle) {
+          cache_->Release(handle);
+          handle = nullptr;
+        }
+        // do lookup
+        handle = cache_->Lookup(key);
+        if (handle) {
+          // do something with the data
+          result += NPHash64(static_cast<char*>(cache_->Value(handle)),
+                             FLAGS_value_bytes);
+        }
+      } else if (random_op < erase_threshold_) {
        // do erase
         cache_->Erase(key);
+      } else {
+        // Should be extremely unlikely (noop)
+        assert(random_op >= kHundredthUint64 * 100U);
       }
     }
+    if (handle) {
+      cache_->Release(handle);
+      handle = nullptr;
+    }
   }
 
   void PrintEnv() const {
     printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
-    printf("Number of threads : %d\n", FLAGS_threads);
+    printf("Number of threads : %u\n", FLAGS_threads);
     printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread);
     printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size);
-    printf("Num shard bits : %d\n", FLAGS_num_shard_bits);
-    printf("Max key : %" PRIu64 "\n", FLAGS_max_key);
: %d\n", FLAGS_populate_cache); - printf("Insert percentage : %d%%\n", FLAGS_insert_percent); - printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); - printf("Erase percentage : %d%%\n", FLAGS_erase_percent); + printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", max_key_); + printf("Resident ratio : %g\n", FLAGS_resident_ratio); + printf("Skew degree : %u\n", FLAGS_skew); + printf("Populate cache : %d\n", int{FLAGS_populate_cache}); + printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); + printf("Insert percentage : %u%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %u%%\n", FLAGS_erase_percent); printf("----------------------------\n"); } }; @@ -270,6 +368,8 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::CacheBench bench; if (FLAGS_populate_cache) { bench.PopulateCache(); + printf("Population complete\n"); + printf("----------------------------\n"); } if (bench.Run()) { return 0; diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 78a795b4f..912baa1ba 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -498,15 +498,15 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false,error_msg); } - }); + fault_fs->SetFilesystemActive(false, error_msg); + } + }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Put(Key(1), "val"); s = Flush(); ASSERT_EQ(s, Status::OK()); - TEST_SYNC_POINT("CompactionManifestWriteError:0"); TEST_SYNC_POINT("CompactionManifestWriteError:1"); diff --git a/util/work_queue.h b/util/work_queue.h index 3d9126364..f120ca77c 100644 --- a/util/work_queue.h +++ b/util/work_queue.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -146,4 +145,4 @@ class WorkQueue { } } }; -} +} // namespace ROCKSDB_NAMESPACE