From f9de217353f2d45f06fe5b9eab50b191f1a2d7a2 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 2 Aug 2023 13:19:20 -0700 Subject: [PATCH] Some cache_bench enhancements (#11661) Summary: ... used in validating some HyperClockCache development in progress. * Revamp the "populate cache" step to avoid redundant insertions (very rare in practice) and more consistently approach the desired resident_ratio while maintaining appropriate skew (still not perfect). * Track and print hit ratio on lookups, to ensure a fair comparison is happening between implementations etc. * Add an option to disable tracking and printing histograms (lots of output) * Add an option to specify a random seed (for more reproducibility) * Remove confusing/redundant "-skewed" option Uses BitwiseAnd from https://github.com/facebook/rocksdb/issues/11660 (tested there) Pull Request resolved: https://github.com/facebook/rocksdb/pull/11661 Test Plan: manual Reviewed By: akankshamahajan15, jowlyzhang Differential Revision: D47937671 Pulled By: pdillinger fbshipit-source-id: 85a2bb881b1bca4f63e015bac684105fd91c9f35 --- cache/cache_bench_tool.cc | 175 +++++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 59 deletions(-) diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 61b12b19e..fd1b44d16 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25, DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); -DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew"); DEFINE_bool(populate_cache, true, "Populate cache before operations"); DEFINE_uint32(lookup_insert_percent, 87, @@ -71,7 +71,6 @@ DEFINE_uint32( DEFINE_uint32(gather_stats_entries_per_lock, 256, "For Cache::ApplyToAllEntries"); -DEFINE_bool(skewed, false, "If true, skew the key access distribution"); DEFINE_bool(lean, false, "If true, no additional computation is performed besides cache " @@ -81,6 +80,11 @@ DEFINE_bool(early_exit, false, "Exit before deallocating most memory. Good for malloc stats, e.g." "MALLOC_CONF=\"stats_print:true\""); +DEFINE_bool(histograms, true, + "Whether to track and print histogram statistics."); + +DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random"); + DEFINE_string(secondary_cache_uri, "", "Full URI for creating a custom secondary cache object"); static class std::shared_ptr secondary_cache; @@ -149,9 +153,6 @@ class SharedState { public: explicit SharedState(CacheBench* cache_bench) : cv_(&mu_), - num_initialized_(0), - start_(false), - num_done_(0), cache_bench_(cache_bench) {} ~SharedState() {} @@ -174,15 +175,27 @@ class SharedState { bool Started() const { return start_; } + void AddLookupStats(uint64_t hits, uint64_t misses) { + MutexLock l(&mu_); + lookup_count_ += hits + misses; + lookup_hits_ += hits; + } + + double GetLookupHitRatio() const { + return 1.0 * lookup_hits_ / lookup_count_; + } + private: port::Mutex mu_; port::CondVar cv_; - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - CacheBench* cache_bench_; + + uint64_t num_initialized_ = 0; + bool start_ = false; + uint64_t num_done_ = 0; + uint64_t lookup_count_ = 0; + uint64_t lookup_hits_ = 0; }; // Per-thread state for concurrent executions of the same benchmark. @@ -194,27 +207,19 @@ struct ThreadState { uint64_t duration_us = 0; ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rnd(1000 + index), shared(_shared) {} + : tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {} }; struct KeyGen { char key_data[27]; - Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { - uint64_t key = 0; - if (!FLAGS_skewed) { - uint64_t raw = rnd.Next(); - // Skew according to setting - for (uint32_t i = 0; i < FLAGS_skew; ++i) { - raw = std::min(raw, rnd.Next()); - } - key = FastRange64(raw, max_key); - } else { - key = rnd.Skewed(max_log); - if (key > max_key) { - key -= max_key; - } + Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < skew; ++i) { + raw = std::min(raw, rnd.Next()); } + uint64_t key = FastRange64(raw, max_key); // Variable size and alignment size_t off = key % 8; key_data[0] = char{42}; @@ -285,31 +290,25 @@ class CacheBench { lookup_threshold_(insert_threshold_ + kHundredthUint64 * FLAGS_lookup_percent), erase_threshold_(lookup_threshold_ + - kHundredthUint64 * FLAGS_erase_percent), - skewed_(FLAGS_skewed) { + kHundredthUint64 * FLAGS_erase_percent) { if (erase_threshold_ != 100U * kHundredthUint64) { fprintf(stderr, "Percentages must add to 100.\n"); exit(1); } - max_log_ = 0; - if (skewed_) { - uint64_t max_key = max_key_; - while (max_key >>= 1) max_log_++; - if (max_key > (static_cast(1) << max_log_)) max_log_++; - } - if (FLAGS_cache_type == "clock_cache") { fprintf(stderr, "Old clock cache implementation has been removed.\n"); exit(1); } else if (FLAGS_cache_type == "hyper_clock_cache") { - cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes, - FLAGS_num_shard_bits) - .MakeSharedCache(); + HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes, + FLAGS_num_shard_bits); + opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); + cache_ = opts.MakeSharedCache(); } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false /* strict_capacity_limit */, 0.5 /* high_pri_pool_ratio */); + opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); if (!FLAGS_secondary_cache_uri.empty()) { Status s = SecondaryCache::CreateFromString( ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); @@ -333,13 +332,50 @@ class CacheBench { ~CacheBench() {} void PopulateCache() { - Random64 rnd(1); + Random64 rnd(FLAGS_seed); KeyGen keygen; - for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { - Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), - createValue(rnd), &helper1, FLAGS_value_bytes); + size_t max_occ = 0; + size_t inserts_since_max_occ_increase = 0; + size_t keys_since_last_not_found = 0; + + // Avoid redundant insertions by checking Lookup before Insert. + // Loop until insertions consistently fail to increase max occupancy or + // it becomes difficult to find keys not already inserted. + while (inserts_since_max_occ_increase < 100 && + keys_since_last_not_found < 100) { + Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle != nullptr) { + cache_->Release(handle); + ++keys_since_last_not_found; + continue; + } + keys_since_last_not_found = 0; + + Status s = + cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes); assert(s.ok()); + + handle = cache_->Lookup(key); + if (!handle) { + fprintf(stderr, "Failed to lookup key just inserted.\n"); + assert(false); + exit(42); + } else { + cache_->Release(handle); + } + + size_t occ = cache_->GetOccupancyCount(); + if (occ > max_occ) { + max_occ = occ; + inserts_since_max_occ_increase = 0; + } else { + ++inserts_since_max_occ_increase; + } } + printf("Population complete (%zu entries, %g average charge)\n", max_occ, + 1.0 * FLAGS_cache_size / max_occ); } bool Run() { @@ -398,18 +434,21 @@ class CacheBench { FLAGS_ops_per_thread / elapsed_secs); printf("Thread ops/sec = %u\n", ops_per_sec); - printf("\nOperation latency (ns):\n"); - HistogramImpl combined; - for (uint32_t i = 0; i < FLAGS_threads; i++) { - combined.Merge(threads[i]->latency_ns_hist); - } - printf("%s", combined.ToString().c_str()); + printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio()); - if (FLAGS_gather_stats) { - printf("\nGather stats latency (us):\n"); - printf("%s", stats_hist.ToString().c_str()); - } + if (FLAGS_histograms) { + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } + } printf("\n%s", stats_report.c_str()); return true; @@ -423,8 +462,6 @@ class CacheBench { const uint64_t insert_threshold_; const uint64_t lookup_threshold_; const uint64_t erase_threshold_; - const bool skewed_; - int max_log_; // A benchmark version of gathering stats on an active block cache by // iterating over it. The primary purpose is to measure the impact of @@ -494,13 +531,17 @@ class CacheBench { // Something slightly more expensive as in stats by category helpers.insert(helper); }; - timer.Start(); + if (FLAGS_histograms) { + timer.Start(); + } Cache::ApplyToAllEntriesOptions opts; opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount(); table_size = shared->GetCacheBench()->cache_->GetTableAddressCount(); - stats_hist->Add(timer.ElapsedNanos() / 1000); + if (FLAGS_histograms) { + stats_hist->Add(timer.ElapsedNanos() / 1000); + } } } @@ -531,6 +572,8 @@ class CacheBench { void OperateCache(ThreadState* thread) { // To use looked-up values uint64_t result = 0; + uint64_t lookup_misses = 0; + uint64_t lookup_hits = 0; // To hold handles for a non-trivial amount of time Cache::Handle* handle = nullptr; KeyGen gen; @@ -539,10 +582,12 @@ class CacheBench { StopWatchNano timer(clock); for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew); uint64_t random_op = thread->rnd.Next(); - timer.Start(); + if (FLAGS_histograms) { + timer.Start(); + } if (random_op < lookup_insert_threshold_) { if (handle) { @@ -553,12 +598,14 @@ class CacheBench { handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, Cache::Priority::LOW); if (handle) { + ++lookup_hits; if (!FLAGS_lean) { // do something with the data result += NPHash64(static_cast(cache_->Value(handle)), FLAGS_value_bytes); } } else { + ++lookup_misses; // do insert Status s = cache_->Insert(key, createValue(thread->rnd), &helper2, FLAGS_value_bytes, &handle); @@ -582,11 +629,14 @@ class CacheBench { handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, Cache::Priority::LOW); if (handle) { + ++lookup_hits; if (!FLAGS_lean) { // do something with the data result += NPHash64(static_cast(cache_->Value(handle)), FLAGS_value_bytes); } + } else { + ++lookup_misses; } } else if (random_op < erase_threshold_) { // do erase @@ -595,7 +645,10 @@ class CacheBench { // Should be extremely unlikely (noop) assert(random_op >= kHundredthUint64 * 100U); } - thread->latency_ns_hist.Add(timer.ElapsedNanos()); + if (FLAGS_histograms) { + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + thread->shared->AddLookupStats(lookup_hits, lookup_misses); } if (FLAGS_early_exit) { MutexLock l(thread->shared->GetMutex()); @@ -621,6 +674,7 @@ class CacheBench { #ifndef NDEBUG printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); #endif + printf("----------------------------\n"); printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); printf("DMutex impl name : %s\n", DMutex::kName()); printf("Number of threads : %u\n", FLAGS_threads); @@ -960,11 +1014,14 @@ int cache_bench_tool(int argc, char** argv) { exit(1); } + if (FLAGS_seed == 0) { + FLAGS_seed = static_cast(port::GetProcessID()); + printf("Using seed = %" PRIu32 "\n", FLAGS_seed); + } + ROCKSDB_NAMESPACE::CacheBench bench; if (FLAGS_populate_cache) { bench.PopulateCache(); - printf("Population complete\n"); - printf("----------------------------\n"); } if (bench.Run()) { return 0;