Some cache_bench enhancements (#11661)

Summary:
... used in validating some HyperClockCache development in progress.

* Revamp the "populate cache" step to avoid redundant insertions (very rare in practice) and to approach the desired resident_ratio more consistently, while maintaining appropriate skew (still not perfect).
* Track and print the hit ratio on lookups, to help ensure a fair comparison between implementations, configurations, etc.
* Add an option to disable tracking and printing histograms (which produce a lot of output).
* Add an option to specify a random seed, for better reproducibility (see the example invocation below).
* Remove the confusing/redundant "-skewed" option.
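
For illustration, a run exercising the new and changed options might look something like the following (a hypothetical invocation, not taken from this change; the flag values are arbitrary, but each flag is defined in the tool below):

  ./cache_bench -cache_type=hyper_clock_cache -populate_cache=true \
      -skew=5 -seed=1234 -histograms=false

With the default -seed=0, the tool now picks a seed from the process ID and prints it, so a specific run can be reproduced later by passing that value back via -seed.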

Uses BitwiseAnd from https://github.com/facebook/rocksdb/issues/11660 (tested there)
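
For context, the effect of BitwiseAnd(FLAGS_seed, INT32_MAX) below is simply to keep the low 31 bits of the seed so a non-negative value is assigned to the caches' hash_seed option. A rough standalone equivalent, assuming hash_seed is a signed 32-bit option (this is a hypothetical helper for illustration, not the actual utility from #11660):

  #include <cstdint>

  // Hypothetical stand-in for the BitwiseAnd usage in this diff: keep the
  // low 31 bits of a 32-bit seed, which is always non-negative when viewed
  // as a signed 32-bit value.
  inline int32_t MaskSeedForHashSeed(uint32_t seed) {
    return static_cast<int32_t>(seed & static_cast<uint32_t>(INT32_MAX));
  }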

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11661

Test Plan: manual

Reviewed By: akankshamahajan15, jowlyzhang

Differential Revision: D47937671

Pulled By: pdillinger

fbshipit-source-id: 85a2bb881b1bca4f63e015bac684105fd91c9f35
Peter Dillinger, committed by Facebook GitHub Bot
parent cf95821fb6
commit f9de217353

cache/cache_bench_tool.cc: 175 changed lines

@@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25,
 DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
 DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
-DEFINE_uint32(skew, 5, "Degree of skew in key selection");
+DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
 DEFINE_bool(populate_cache, true, "Populate cache before operations");
 DEFINE_uint32(lookup_insert_percent, 87,
@@ -71,7 +71,6 @@ DEFINE_uint32(
 DEFINE_uint32(gather_stats_entries_per_lock, 256,
               "For Cache::ApplyToAllEntries");

-DEFINE_bool(skewed, false, "If true, skew the key access distribution");

 DEFINE_bool(lean, false,
             "If true, no additional computation is performed besides cache "
@@ -81,6 +80,11 @@ DEFINE_bool(early_exit, false,
             "Exit before deallocating most memory. Good for malloc stats, e.g."
             "MALLOC_CONF=\"stats_print:true\"");

+DEFINE_bool(histograms, true,
+            "Whether to track and print histogram statistics.");
+
+DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
+
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -149,9 +153,6 @@ class SharedState {
  public:
   explicit SharedState(CacheBench* cache_bench)
       : cv_(&mu_),
-        num_initialized_(0),
-        start_(false),
-        num_done_(0),
         cache_bench_(cache_bench) {}

   ~SharedState() {}
@@ -174,15 +175,27 @@ class SharedState {
   bool Started() const { return start_; }

+  void AddLookupStats(uint64_t hits, uint64_t misses) {
+    MutexLock l(&mu_);
+    lookup_count_ += hits + misses;
+    lookup_hits_ += hits;
+  }
+
+  double GetLookupHitRatio() const {
+    return 1.0 * lookup_hits_ / lookup_count_;
+  }
+
  private:
   port::Mutex mu_;
   port::CondVar cv_;
-  uint64_t num_initialized_;
-  bool start_;
-  uint64_t num_done_;
   CacheBench* cache_bench_;
+  uint64_t num_initialized_ = 0;
+  bool start_ = false;
+  uint64_t num_done_ = 0;
+  uint64_t lookup_count_ = 0;
+  uint64_t lookup_hits_ = 0;
 };

 // Per-thread state for concurrent executions of the same benchmark.
@@ -194,27 +207,19 @@ struct ThreadState {
   uint64_t duration_us = 0;

   ThreadState(uint32_t index, SharedState* _shared)
-      : tid(index), rnd(1000 + index), shared(_shared) {}
+      : tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
 };

 struct KeyGen {
   char key_data[27];

-  Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
-    uint64_t key = 0;
-    if (!FLAGS_skewed) {
-      uint64_t raw = rnd.Next();
-      // Skew according to setting
-      for (uint32_t i = 0; i < FLAGS_skew; ++i) {
-        raw = std::min(raw, rnd.Next());
-      }
-      key = FastRange64(raw, max_key);
-    } else {
-      key = rnd.Skewed(max_log);
-      if (key > max_key) {
-        key -= max_key;
-      }
-    }
+  Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
+    uint64_t raw = rnd.Next();
+    // Skew according to setting
+    for (uint32_t i = 0; i < skew; ++i) {
+      raw = std::min(raw, rnd.Next());
+    }
+    uint64_t key = FastRange64(raw, max_key);
     // Variable size and alignment
     size_t off = key % 8;
     key_data[0] = char{42};
@@ -285,31 +290,25 @@ class CacheBench {
         lookup_threshold_(insert_threshold_ +
                           kHundredthUint64 * FLAGS_lookup_percent),
         erase_threshold_(lookup_threshold_ +
-                         kHundredthUint64 * FLAGS_erase_percent),
-        skewed_(FLAGS_skewed) {
+                         kHundredthUint64 * FLAGS_erase_percent) {
     if (erase_threshold_ != 100U * kHundredthUint64) {
       fprintf(stderr, "Percentages must add to 100.\n");
       exit(1);
     }
-    max_log_ = 0;
-    if (skewed_) {
-      uint64_t max_key = max_key_;
-      while (max_key >>= 1) max_log_++;
-      if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
-    }
     if (FLAGS_cache_type == "clock_cache") {
       fprintf(stderr, "Old clock cache implementation has been removed.\n");
       exit(1);
     } else if (FLAGS_cache_type == "hyper_clock_cache") {
-      cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
-                                      FLAGS_num_shard_bits)
-                   .MakeSharedCache();
+      HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes,
+                                  FLAGS_num_shard_bits);
+      opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
+      cache_ = opts.MakeSharedCache();
     } else if (FLAGS_cache_type == "lru_cache") {
       LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
                            false /* strict_capacity_limit */,
                            0.5 /* high_pri_pool_ratio */);
+      opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
       if (!FLAGS_secondary_cache_uri.empty()) {
         Status s = SecondaryCache::CreateFromString(
             ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@@ -333,13 +332,50 @@ class CacheBench {
   ~CacheBench() {}

   void PopulateCache() {
-    Random64 rnd(1);
+    Random64 rnd(FLAGS_seed);
     KeyGen keygen;
-    for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
-      Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
-                                createValue(rnd), &helper1, FLAGS_value_bytes);
+    size_t max_occ = 0;
+    size_t inserts_since_max_occ_increase = 0;
+    size_t keys_since_last_not_found = 0;
+
+    // Avoid redundant insertions by checking Lookup before Insert.
+    // Loop until insertions consistently fail to increase max occupancy or
+    // it becomes difficult to find keys not already inserted.
+    while (inserts_since_max_occ_increase < 100 &&
+           keys_since_last_not_found < 100) {
+      Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);
+
+      Cache::Handle* handle = cache_->Lookup(key);
+      if (handle != nullptr) {
+        cache_->Release(handle);
+        ++keys_since_last_not_found;
+        continue;
+      }
+      keys_since_last_not_found = 0;
+
+      Status s =
+          cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes);
       assert(s.ok());
+
+      handle = cache_->Lookup(key);
+      if (!handle) {
+        fprintf(stderr, "Failed to lookup key just inserted.\n");
+        assert(false);
+        exit(42);
+      } else {
+        cache_->Release(handle);
+      }
+
+      size_t occ = cache_->GetOccupancyCount();
+      if (occ > max_occ) {
+        max_occ = occ;
+        inserts_since_max_occ_increase = 0;
+      } else {
+        ++inserts_since_max_occ_increase;
+      }
     }
+    printf("Population complete (%zu entries, %g average charge)\n", max_occ,
+           1.0 * FLAGS_cache_size / max_occ);
   }

   bool Run() {
@@ -398,18 +434,21 @@ class CacheBench {
                             FLAGS_ops_per_thread / elapsed_secs);
     printf("Thread ops/sec = %u\n", ops_per_sec);

-    printf("\nOperation latency (ns):\n");
-    HistogramImpl combined;
-    for (uint32_t i = 0; i < FLAGS_threads; i++) {
-      combined.Merge(threads[i]->latency_ns_hist);
-    }
-    printf("%s", combined.ToString().c_str());
-    if (FLAGS_gather_stats) {
-      printf("\nGather stats latency (us):\n");
-      printf("%s", stats_hist.ToString().c_str());
-    }
+    printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());
+
+    if (FLAGS_histograms) {
+      printf("\nOperation latency (ns):\n");
+      HistogramImpl combined;
+      for (uint32_t i = 0; i < FLAGS_threads; i++) {
+        combined.Merge(threads[i]->latency_ns_hist);
+      }
+      printf("%s", combined.ToString().c_str());
+      if (FLAGS_gather_stats) {
+        printf("\nGather stats latency (us):\n");
+        printf("%s", stats_hist.ToString().c_str());
+      }
+    }

     printf("\n%s", stats_report.c_str());

     return true;
@@ -423,8 +462,6 @@ class CacheBench {
   const uint64_t insert_threshold_;
   const uint64_t lookup_threshold_;
   const uint64_t erase_threshold_;
-  const bool skewed_;
-  int max_log_;

   // A benchmark version of gathering stats on an active block cache by
   // iterating over it. The primary purpose is to measure the impact of
@@ -494,13 +531,17 @@ class CacheBench {
         // Something slightly more expensive as in stats by category
         helpers.insert(helper);
       };
-      timer.Start();
+      if (FLAGS_histograms) {
+        timer.Start();
+      }
       Cache::ApplyToAllEntriesOptions opts;
       opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
       shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
       table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
       table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
-      stats_hist->Add(timer.ElapsedNanos() / 1000);
+      if (FLAGS_histograms) {
+        stats_hist->Add(timer.ElapsedNanos() / 1000);
+      }
     }
   }
@@ -531,6 +572,8 @@ class CacheBench {
   void OperateCache(ThreadState* thread) {
     // To use looked-up values
     uint64_t result = 0;
+    uint64_t lookup_misses = 0;
+    uint64_t lookup_hits = 0;
     // To hold handles for a non-trivial amount of time
     Cache::Handle* handle = nullptr;
     KeyGen gen;
@@ -539,10 +582,12 @@ class CacheBench {
     StopWatchNano timer(clock);

     for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
-      Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
+      Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
       uint64_t random_op = thread->rnd.Next();
-      timer.Start();
+      if (FLAGS_histograms) {
+        timer.Start();
+      }

       if (random_op < lookup_insert_threshold_) {
         if (handle) {
@@ -553,12 +598,14 @@ class CacheBench {
         handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
                                 Cache::Priority::LOW);
         if (handle) {
+          ++lookup_hits;
           if (!FLAGS_lean) {
             // do something with the data
             result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                                FLAGS_value_bytes);
           }
         } else {
+          ++lookup_misses;
           // do insert
           Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
                                     FLAGS_value_bytes, &handle);
@@ -582,11 +629,14 @@ class CacheBench {
         handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
                                 Cache::Priority::LOW);
         if (handle) {
+          ++lookup_hits;
           if (!FLAGS_lean) {
             // do something with the data
             result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                                FLAGS_value_bytes);
           }
+        } else {
+          ++lookup_misses;
         }
       } else if (random_op < erase_threshold_) {
         // do erase
@@ -595,7 +645,10 @@ class CacheBench {
         // Should be extremely unlikely (noop)
         assert(random_op >= kHundredthUint64 * 100U);
       }
-      thread->latency_ns_hist.Add(timer.ElapsedNanos());
+      if (FLAGS_histograms) {
+        thread->latency_ns_hist.Add(timer.ElapsedNanos());
+      }
+      thread->shared->AddLookupStats(lookup_hits, lookup_misses);
     }
     if (FLAGS_early_exit) {
       MutexLock l(thread->shared->GetMutex());
@@ -621,6 +674,7 @@ class CacheBench {
 #ifndef NDEBUG
     printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
 #endif
+    printf("----------------------------\n");
     printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
     printf("DMutex impl name : %s\n", DMutex::kName());
     printf("Number of threads : %u\n", FLAGS_threads);
@@ -960,11 +1014,14 @@ int cache_bench_tool(int argc, char** argv) {
     exit(1);
   }

+  if (FLAGS_seed == 0) {
+    FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
+    printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
+  }
+
   ROCKSDB_NAMESPACE::CacheBench bench;
   if (FLAGS_populate_cache) {
     bench.PopulateCache();
-    printf("Population complete\n");
-    printf("----------------------------\n");
   }
   if (bench.Run()) {
     return 0;
