From 0f91c72adc977c3895f8320b13ae4ef2b8633756 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Fri, 16 Sep 2022 12:47:29 -0700 Subject: [PATCH] Call experimental new clock cache HyperClockCache (#10684) Summary: This change establishes a distinctive name for the experimental new lock-free clock cache (originally developed by guidotag and revamped in PR https://github.com/facebook/rocksdb/issues/10626). A few reasons: * We want to make it clear that this is a fundamentally different implementation vs. the old clock cache, to avoid people saying "I already tried clock cache." * We want to highlight the key feature: it's fast (especially under parallel load) * Because it requires an estimated charge per entry, it is not drop-in API compatible with old clock cache. This estimate might always be required for highest performance, and giving it a distinct name should reduce confusion about the distinct API requirements. * We might develop a variant requiring the same estimate parameter but with LRU eviction. In that case, using the name HyperLRUCache should make things more clear. (FastLRUCache is just a prototype that might soon be removed.) Some API detail: * To reduce copy-pasting parameter lists, etc. as in LRUCache construction, I have a `MakeSharedCache()` function on `HyperClockCacheOptions` instead of `NewHyperClockCache()`. * Changes -cache_type=clock_cache to -cache_type=hyper_clock_cache for applicable tools. I think this is more consistent / sustainable for reasons already stated. For performance tests see https://github.com/facebook/rocksdb/pull/10626 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10684 Test Plan: no interesting functional changes; tests updated Reviewed By: anand1976 Differential Revision: D39547800 Pulled By: pdillinger fbshipit-source-id: 5c0fe1b5cf3cb680ab369b928c8569682b9795bf --- HISTORY.md | 1 + cache/cache_bench_tool.cc | 14 ++- cache/cache_test.cc | 65 ++++++------ cache/clock_cache.cc | 46 ++++----- cache/clock_cache.h | 31 +++--- cache/lru_cache_test.cc | 20 ++-- db/db_block_cache_test.cc | 22 ++-- db_stress_tool/db_stress_test_base.cc | 16 ++- include/rocksdb/cache.h | 141 +++++++++++++++++++------- tools/db_bench_tool.cc | 17 ++-- tools/db_crashtest.py | 2 +- 11 files changed, 216 insertions(+), 159 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9615b07cf..eacde6fd8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -32,6 +32,7 @@ * RocksDB does internal auto prefetching if it notices 2 sequential reads if readahead_size is not specified. New option `num_file_reads_for_auto_readahead` is added in BlockBasedTableOptions which indicates after how many sequential reads internal auto prefetching should be start (default is 2). * Added new perf context counters `block_cache_standalone_handle_count`, `block_cache_real_handle_count`,`compressed_sec_cache_insert_real_count`, `compressed_sec_cache_insert_dummy_count`, `compressed_sec_cache_uncompressed_bytes`, and `compressed_sec_cache_compressed_bytes`. * Memory for blobs which are to be inserted into the blob cache is now allocated using the cache's allocator (see #10628 and #10647). +* HyperClockCache is an experimental, lock-free Cache alternative for block cache that offers much improved CPU efficiency under high parallel load or high contention, with some caveats. As much as 4.5x higher ops/sec vs. LRUCache has been seen in db_bench under high parallel load. ### Performance Improvements * Iterator performance is improved for `DeleteRange()` users. 
Internally, iterator will skip to the end of a range tombstone when possible, instead of looping through each key and checking individually if a key is range deleted. diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index dd36a5f06..4accf7ba0 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -13,7 +13,6 @@ #include #include -#include "cache/clock_cache.h" #include "cache/fast_lru_cache.h" #include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" @@ -292,13 +291,12 @@ class CacheBench { } if (FLAGS_cache_type == "clock_cache") { - cache_ = ExperimentalNewClockCache( - FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits, - false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); - if (!cache_) { - fprintf(stderr, "Clock cache not supported.\n"); - exit(1); - } + fprintf(stderr, "Old clock cache implementation has been removed.\n"); + exit(1); + } else if (FLAGS_cache_type == "hyper_clock_cache") { + cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes, + FLAGS_num_shard_bits) + .MakeSharedCache(); } else if (FLAGS_cache_type == "fast_lru_cache") { cache_ = NewFastLRUCache( FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits, diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 14b6e44d9..1a8bae4df 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -15,7 +15,6 @@ #include #include -#include "cache/clock_cache.h" #include "cache/fast_lru_cache.h" #include "cache/lru_cache.h" #include "port/stack_trace.h" @@ -23,7 +22,7 @@ #include "util/coding.h" #include "util/string_util.h" -// FastLRUCache and ClockCache only support 16-byte keys, so some of +// FastLRUCache and HyperClockCache only support 16-byte keys, so some of // the tests originally written for LRUCache do not work on the other caches. // Those tests were adapted to use 16-byte keys. We kept the original ones. // TODO: Remove the original tests if they ever become unused. 
@@ -76,7 +75,7 @@ void EraseDeleter2(const Slice& /*key*/, void* value) { } const std::string kLRU = "lru"; -const std::string kClock = "clock"; +const std::string kHyperClock = "hyper_clock"; const std::string kFast = "fast"; } // anonymous namespace @@ -87,7 +86,7 @@ class CacheTest : public testing::TestWithParam { static std::string type_; static void Deleter(const Slice& key, void* v) { - if (type_ == kFast || type_ == kClock) { + if (type_ == kFast || type_ == kHyperClock) { current_->deleted_keys_.push_back(DecodeKey16Bytes(key)); } else { current_->deleted_keys_.push_back(DecodeKey32Bits(key)); @@ -122,10 +121,10 @@ class CacheTest : public testing::TestWithParam { if (type == kLRU) { return NewLRUCache(capacity); } - if (type == kClock) { - return ExperimentalNewClockCache( - capacity, estimated_value_size_, -1 /*num_shard_bits*/, - false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); + if (type == kHyperClock) { + return HyperClockCacheOptions( + capacity, estimated_value_size_ /*estimated_value_size*/) + .MakeSharedCache(); } if (type == kFast) { return NewFastLRUCache( @@ -148,10 +147,11 @@ class CacheTest : public testing::TestWithParam { co.metadata_charge_policy = charge_policy; return NewLRUCache(co); } - if (type == kClock) { - return ExperimentalNewClockCache(capacity, 1 /*estimated_value_size*/, - num_shard_bits, strict_capacity_limit, - charge_policy); + if (type == kHyperClock) { + return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/, + num_shard_bits, strict_capacity_limit, + nullptr /*allocator*/, charge_policy) + .MakeSharedCache(); } if (type == kFast) { return NewFastLRUCache(capacity, 1 /*estimated_value_size*/, @@ -163,12 +163,11 @@ class CacheTest : public testing::TestWithParam { // These functions encode/decode keys in tests cases that use // int keys. - // Currently, FastLRUCache requires keys to be 16B long, whereas - // LRUCache and ClockCache don't, so the encoding depends on - // the cache type. + // Currently, HyperClockCache requires keys to be 16B long, whereas + // LRUCache doesn't, so the encoding depends on the cache type. 
std::string EncodeKey(int k) { auto type = GetParam(); - if (type == kFast || type == kClock) { + if (type == kFast || type == kHyperClock) { return EncodeKey16Bytes(k); } else { return EncodeKey32Bits(k); @@ -177,7 +176,7 @@ class CacheTest : public testing::TestWithParam { int DecodeKey(const Slice& k) { auto type = GetParam(); - if (type == kFast || type == kClock) { + if (type == kFast || type == kHyperClock) { return DecodeKey16Bytes(k); } else { return DecodeKey32Bits(k); @@ -242,7 +241,7 @@ TEST_P(CacheTest, UsageTest) { auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); ASSERT_EQ(0, cache->GetUsage()); size_t baseline_meta_usage = precise_cache->GetUsage(); - if (type != kClock) { + if (type != kHyperClock) { ASSERT_EQ(0, baseline_meta_usage); } @@ -263,7 +262,7 @@ TEST_P(CacheTest, UsageTest) { kv_size, DumbDeleter)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); - if (type == kClock) { + if (type == kHyperClock) { ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage()); } else { ASSERT_LT(usage, precise_cache->GetUsage()); @@ -293,7 +292,7 @@ TEST_P(CacheTest, UsageTest) { ASSERT_GT(kCapacity, cache->GetUsage()); ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); - if (type != kClock) { + if (type != kHyperClock) { ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); } else { // estimated value size of 1 is weird for clock cache, because @@ -319,7 +318,7 @@ TEST_P(CacheTest, PinnedUsageTest) { auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); size_t baseline_meta_usage = precise_cache->GetUsage(); - if (type != kClock) { + if (type != kHyperClock) { ASSERT_EQ(0, baseline_meta_usage); } @@ -428,7 +427,7 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(300)); Insert(100, 102); - if (GetParam() == kClock) { + if (GetParam() == kHyperClock) { // ClockCache usually doesn't overwrite on Insert ASSERT_EQ(101, Lookup(100)); } else { @@ -439,7 +438,7 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[0]); - if (GetParam() == kClock) { + if (GetParam() == kHyperClock) { ASSERT_EQ(102, deleted_values_[0]); } else { ASSERT_EQ(101, deleted_values_[0]); @@ -447,7 +446,7 @@ TEST_P(CacheTest, HitAndMiss) { } TEST_P(CacheTest, InsertSameKey) { - if (GetParam() == kClock) { + if (GetParam() == kHyperClock) { ROCKSDB_GTEST_BYPASS( "ClockCache doesn't guarantee Insert overwrite same key."); return; @@ -477,7 +476,7 @@ TEST_P(CacheTest, Erase) { } TEST_P(CacheTest, EntriesArePinned) { - if (GetParam() == kClock) { + if (GetParam() == kHyperClock) { ROCKSDB_GTEST_BYPASS( "ClockCache doesn't guarantee Insert overwrite same key."); return; @@ -543,7 +542,7 @@ TEST_P(CacheTest, ExternalRefPinsEntries) { Insert(1000 + j, 2000 + j); } // Clock cache is even more stateful and needs more churn to evict - if (GetParam() == kClock) { + if (GetParam() == kHyperClock) { for (int j = 0; j < kCacheSize; j++) { Insert(11000 + j, 11000 + j); } @@ -742,9 +741,9 @@ TEST_P(CacheTest, ReleaseWithoutErase) { TEST_P(CacheTest, SetCapacity) { auto type = GetParam(); - if (type == kFast || type == kClock) { + if (type == kFast || type == kHyperClock) { ROCKSDB_GTEST_BYPASS( - "FastLRUCache and ClockCache don't support arbitrary capacity " + "FastLRUCache and HyperClockCache don't support arbitrary capacity " "adjustments."); return; } @@ -883,7 +882,7 @@ TEST_P(CacheTest, 
OverCapacity) { cache->Release(handles[i]); } - if (GetParam() == kClock) { + if (GetParam() == kHyperClock) { // Make sure eviction is triggered. ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0])); @@ -1020,7 +1019,8 @@ TEST_P(CacheTest, DefaultShardBits) { // Prevent excessive allocation (to save time & space) estimated_value_size_ = 100000; // Implementations use different minimum shard sizes - size_t min_shard_size = (GetParam() == kClock ? 32U * 1024U : 512U) * 1024U; + size_t min_shard_size = + (GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U; std::shared_ptr cache = NewCache(32U * min_shard_size); ShardedCache* sc = dynamic_cast(cache.get()); @@ -1052,11 +1052,8 @@ TEST_P(CacheTest, GetChargeAndDeleter) { cache_->Release(h1); } -std::shared_ptr (*new_clock_cache_func)(size_t, size_t, int, bool, - CacheMetadataChargePolicy) = - ExperimentalNewClockCache; INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, - testing::Values(kLRU, kClock, kFast)); + testing::Values(kLRU, kHyperClock, kFast)); INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU, kFast)); diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 3bff5feee..0b07542c4 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -21,7 +21,7 @@ namespace ROCKSDB_NAMESPACE { -namespace clock_cache { +namespace hyper_clock_cache { static_assert(sizeof(ClockHandle) == 64U, "Expecting size / alignment with common cache line size"); @@ -1126,9 +1126,10 @@ size_t ClockCacheShard::GetTableAddressCount() const { return table_.GetTableSize(); } -ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, - int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy) +HyperClockCache::HyperClockCache( + size_t capacity, size_t estimated_value_size, int num_shard_bits, + bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), num_shards_(1 << num_shard_bits) { assert(estimated_value_size > 0 || @@ -1145,7 +1146,7 @@ ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, } } -ClockCache::~ClockCache() { +HyperClockCache::~HyperClockCache() { if (shards_ != nullptr) { assert(num_shards_ > 0); for (int i = 0; i < num_shards_; i++) { @@ -1155,32 +1156,32 @@ ClockCache::~ClockCache() { } } -CacheShard* ClockCache::GetShard(uint32_t shard) { +CacheShard* HyperClockCache::GetShard(uint32_t shard) { return reinterpret_cast(&shards_[shard]); } -const CacheShard* ClockCache::GetShard(uint32_t shard) const { +const CacheShard* HyperClockCache::GetShard(uint32_t shard) const { return reinterpret_cast(&shards_[shard]); } -void* ClockCache::Value(Handle* handle) { +void* HyperClockCache::Value(Handle* handle) { return reinterpret_cast(handle)->value; } -size_t ClockCache::GetCharge(Handle* handle) const { +size_t HyperClockCache::GetCharge(Handle* handle) const { return reinterpret_cast(handle)->total_charge; } -Cache::DeleterFn ClockCache::GetDeleter(Handle* handle) const { +Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const { auto h = reinterpret_cast(handle); return h->deleter; } -uint32_t ClockCache::GetHash(Handle* handle) const { +uint32_t HyperClockCache::GetHash(Handle* handle) const { return reinterpret_cast(handle)->hash; } -void ClockCache::DisownData() { +void HyperClockCache::DisownData() { // Leak data only if that won't generate an ASAN/valgrind warning. 
if (!kMustFreeHeapAllocations) { shards_ = nullptr; @@ -1188,8 +1189,9 @@ void ClockCache::DisownData() { } } -} // namespace clock_cache +} // namespace hyper_clock_cache +// DEPRECATED (see public API) std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) { @@ -1199,22 +1201,20 @@ std::shared_ptr NewClockCache( /* low_pri_pool_ratio */ 0.0); } -std::shared_ptr ExperimentalNewClockCache( - size_t capacity, size_t estimated_value_size, int num_shard_bits, - bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy) { - if (num_shard_bits >= 20) { +std::shared_ptr HyperClockCacheOptions::MakeSharedCache() const { + auto my_num_shard_bits = num_shard_bits; + if (my_num_shard_bits >= 20) { return nullptr; // The cache cannot be sharded into too many fine pieces. } - if (num_shard_bits < 0) { + if (my_num_shard_bits < 0) { // Use larger shard size to reduce risk of large entries clustering // or skewing individual shards. constexpr size_t min_shard_size = 32U * 1024U * 1024U; - num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); + my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); } - return std::make_shared( - capacity, estimated_value_size, num_shard_bits, strict_capacity_limit, - metadata_charge_policy); + return std::make_shared( + capacity, estimated_entry_charge, my_num_shard_bits, + strict_capacity_limit, metadata_charge_policy); } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 8ceb46478..a68514e36 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -27,22 +27,22 @@ namespace ROCKSDB_NAMESPACE { -namespace clock_cache { +namespace hyper_clock_cache { // Forward declaration of friend class. class ClockCacheTest; -// ClockCache is an experimental alternative to LRUCache. +// HyperClockCache is an experimental alternative to LRUCache. // // Benefits // -------- // * Fully lock free (no waits or spins) for efficiency under high concurrency // * Optimized for hot path reads. For concurrency control, most Lookup() and // essentially all Release() are a single atomic add operation. +// * Eviction on insertion is fully parallel and lock-free. // * Uses a generalized + aging variant of CLOCK eviction that might outperform // LRU in some cases. (For background, see // https://en.wikipedia.org/wiki/Page_replacement_algorithm) -// * Eviction on insertion is fully parallel and lock-free. 
// // Costs // ----- @@ -582,20 +582,20 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { std::atomic strict_capacity_limit_; }; // class ClockCacheShard -class ClockCache +class HyperClockCache #ifdef NDEBUG final #endif : public ShardedCache { public: - ClockCache(size_t capacity, size_t estimated_value_size, int num_shard_bits, - bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); + HyperClockCache(size_t capacity, size_t estimated_value_size, + int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy = + kDontChargeCacheMetadata); - ~ClockCache() override; + ~HyperClockCache() override; - const char* Name() const override { return "ClockCache"; } + const char* Name() const override { return "HyperClockCache"; } CacheShard* GetShard(uint32_t shard) override; @@ -615,15 +615,8 @@ class ClockCache ClockCacheShard* shards_ = nullptr; int num_shards_; -}; // class ClockCache - -} // namespace clock_cache +}; // class HyperClockCache -// Only for internal testing, temporarily replacing NewClockCache. -// TODO(Guido) Remove once NewClockCache constructs a ClockCache again. -extern std::shared_ptr ExperimentalNewClockCache( - size_t capacity, size_t estimated_value_size, int num_shard_bits, - bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy); +} // namespace hyper_clock_cache } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 1b70bde2d..f42404cce 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -506,7 +506,7 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) { } // namespace fast_lru_cache -namespace clock_cache { +namespace hyper_clock_cache { class ClockCacheTest : public testing::Test { public: @@ -975,9 +975,11 @@ TEST_F(ClockCacheTest, TableSizesTest) { SCOPED_TRACE("est_count = " + std::to_string(est_count)); size_t capacity = static_cast(est_val_size * est_count); // kDontChargeCacheMetadata - auto cache = ExperimentalNewClockCache( - capacity, est_val_size, /*num shard_bits*/ -1, - /*strict_capacity_limit*/ false, kDontChargeCacheMetadata); + auto cache = HyperClockCacheOptions( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, + /*memory_allocator*/ nullptr, kDontChargeCacheMetadata) + .MakeSharedCache(); // Table sizes are currently only powers of two EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor); EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0); @@ -989,9 +991,11 @@ TEST_F(ClockCacheTest, TableSizesTest) { // doubling the table size could cut by 90% the space available to // values. Therefore, we omit those weird cases for now. 
if (est_val_size >= 512) { - cache = ExperimentalNewClockCache( - capacity, est_val_size, /*num shard_bits*/ -1, - /*strict_capacity_limit*/ false, kFullChargeCacheMetadata); + cache = HyperClockCacheOptions( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, + /*memory_allocator*/ nullptr, kFullChargeCacheMetadata) + .MakeSharedCache(); double est_count_after_meta = (capacity - cache->GetUsage()) * 1.0 / est_val_size; EXPECT_GE(cache->GetTableAddressCount(), @@ -1003,7 +1007,7 @@ TEST_F(ClockCacheTest, TableSizesTest) { } } -} // namespace clock_cache +} // namespace hyper_clock_cache class TestSecondaryCache : public SecondaryCache { public: diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index d550c5225..6c335febc 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -13,7 +13,6 @@ #include "cache/cache_entry_roles.h" #include "cache/cache_key.h" -#include "cache/clock_cache.h" #include "cache/fast_lru_cache.h" #include "cache/lru_cache.h" #include "db/column_family.h" @@ -938,16 +937,14 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) { int iterations_tested = 0; for (std::shared_ptr base_cache : {NewLRUCache(capacity, num_shard_bits), - ExperimentalNewClockCache( + HyperClockCacheOptions( capacity, BlockBasedTableOptions().block_size /*estimated_value_size*/, - num_shard_bits, false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy), - NewFastLRUCache( - capacity, - BlockBasedTableOptions().block_size /*estimated_value_size*/, - num_shard_bits, false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy)}) { + num_shard_bits) + .MakeSharedCache(), + NewFastLRUCache(capacity, 1 /*estimated_value_size*/, num_shard_bits, + false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy)}) { if (!base_cache) { // Skip clock cache when not supported continue; @@ -1302,11 +1299,10 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { for (bool partition : {false, true}) { for (std::shared_ptr cache : {NewLRUCache(capacity), - ExperimentalNewClockCache( + HyperClockCacheOptions( capacity, - BlockBasedTableOptions().block_size /*estimated_value_size*/, - -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy)}) { + BlockBasedTableOptions().block_size /*estimated_value_size*/) + .MakeSharedCache()}) { if (!cache) { // Skip clock cache when not supported continue; diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 03f7e1a13..1b850d425 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -10,7 +10,6 @@ #include "util/compression.h" #ifdef GFLAGS -#include "cache/clock_cache.h" #include "cache/fast_lru_cache.h" #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_compaction_filter.h" @@ -115,14 +114,13 @@ std::shared_ptr StressTest::NewCache(size_t capacity, } if (FLAGS_cache_type == "clock_cache") { - auto cache = ExperimentalNewClockCache( - static_cast(capacity), FLAGS_block_size, num_shard_bits, - false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); - if (!cache) { - fprintf(stderr, "Clock cache not supported."); - exit(1); - } - return cache; + fprintf(stderr, "Old clock cache implementation has been removed.\n"); + exit(1); + } else if (FLAGS_cache_type == "hyper_clock_cache") { + return HyperClockCacheOptions(static_cast(capacity), + FLAGS_block_size /*estimated_entry_charge*/, + num_shard_bits) + .MakeSharedCache(); } else if 
(FLAGS_cache_type == "fast_lru_cache") { return NewFastLRUCache(static_cast(capacity), FLAGS_block_size, num_shard_bits, false /*strict_capacity_limit*/, diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 119cf959c..004c30376 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -42,24 +42,64 @@ class SecondaryCache; extern const bool kDefaultToAdaptiveMutex; enum CacheMetadataChargePolicy { + // Only the `charge` of each entry inserted into a Cache counts against + // the `capacity` kDontChargeCacheMetadata, + // In addition to the `charge`, the approximate space overheads in the + // Cache (in bytes) also count against `capacity`. These space overheads + // are for supporting fast Lookup and managing the lifetime of entries. kFullChargeCacheMetadata }; const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = kFullChargeCacheMetadata; -struct LRUCacheOptions { - // Capacity of the cache. +// Options shared between various cache implementations that +// divide the key space into shards using hashing. +struct ShardedCacheOptions { + // Capacity of the cache, in the same units as the `charge` of each entry. + // This is typically measured in bytes, but can be a different unit if using + // kDontChargeCacheMetadata. size_t capacity = 0; // Cache is sharded into 2^num_shard_bits shards, by hash of key. - // Refer to NewLRUCache for further information. + // If < 0, a good default is chosen based on the capacity and the + // implementation. (Mutex-based implementations are much more reliant + // on many shards for parallel scalability.) int num_shard_bits = -1; - // If strict_capacity_limit is set, - // insert to the cache will fail when cache is full. + // If strict_capacity_limit is set, Insert() will fail if there is not + // enough capacity for the new entry along with all the existing referenced + // (pinned) cache entries. (Unreferenced cache entries are evicted as + // needed, sometimes immediately.) If strict_capacity_limit == false + // (default), Insert() never fails. bool strict_capacity_limit = false; + // If non-nullptr, RocksDB will use this allocator instead of system + // allocator when allocating memory for cache blocks. + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). + std::shared_ptr memory_allocator; + + // See CacheMetadataChargePolicy + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + + ShardedCacheOptions() {} + ShardedCacheOptions( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + std::shared_ptr _memory_allocator = nullptr, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : capacity(_capacity), + num_shard_bits(_num_shard_bits), + strict_capacity_limit(_strict_capacity_limit), + memory_allocator(std::move(_memory_allocator)), + metadata_charge_policy(_metadata_charge_policy) {} +}; + +struct LRUCacheOptions : public ShardedCacheOptions { // Percentage of cache reserved for high priority entries. // If greater than zero, the LRU list will be split into a high-pri // list and a low-pri list. High-pri entries will be inserted to the @@ -83,24 +123,12 @@ struct LRUCacheOptions { // See also high_pri_pool_ratio. double low_pri_pool_ratio = 0.0; - // If non-nullptr will use this allocator instead of system allocator when - // allocating memory for cache blocks. 
Call this method before you start using - the cache! - // - // Caveat: when the cache is used as block cache, the memory allocator is - // ignored when dealing with compression libraries that allocate memory - // internally (currently only XPRESS). - std::shared_ptr memory_allocator; - // Whether to use adaptive mutexes for cache shards. Note that adaptive // mutexes need to be supported by the platform in order for this to have any // effect. The default value is true if RocksDB is compiled with // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. bool use_adaptive_mutex = kDefaultToAdaptiveMutex; - CacheMetadataChargePolicy metadata_charge_policy = - kDefaultCacheMetadataChargePolicy; - // A SecondaryCache instance to use as the non-volatile tier. std::shared_ptr secondary_cache; @@ -112,14 +140,12 @@ struct LRUCacheOptions { CacheMetadataChargePolicy _metadata_charge_policy = kDefaultCacheMetadataChargePolicy, double _low_pri_pool_ratio = 0.0) - : capacity(_capacity), - num_shard_bits(_num_shard_bits), - strict_capacity_limit(_strict_capacity_limit), + : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + std::move(_memory_allocator), + _metadata_charge_policy), high_pri_pool_ratio(_high_pri_pool_ratio), low_pri_pool_ratio(_low_pri_pool_ratio), - memory_allocator(std::move(_memory_allocator)), - use_adaptive_mutex(_use_adaptive_mutex), - metadata_charge_policy(_metadata_charge_policy) {} + use_adaptive_mutex(_use_adaptive_mutex) {} }; // Create a new cache with a fixed size capacity. The cache is sharded @@ -190,18 +216,65 @@ extern std::shared_ptr NewCompressedSecondaryCache( extern std::shared_ptr NewCompressedSecondaryCache( const CompressedSecondaryCacheOptions& opts); -// EXPERIMENTAL Currently ClockCache is under development, although it's -// already exposed in the public API. To avoid unreliable performance and -// correctness issues, NewClockCache will temporarily return an LRUCache -// constructed with the corresponding arguments. +// HyperClockCache - EXPERIMENTAL // -// TODO(Guido) When ClockCache is complete, roll back to the old text: -// `` -// Similar to NewLRUCache, but create a cache based on clock algorithm with -// better concurrent performance in some cases. See util/clock_cache.cc for -// more detail. -// Return nullptr if it is not supported. -// `` +// A lock-free Cache alternative for RocksDB block cache that offers much +// improved CPU efficiency under high parallel load or high contention, with +// some caveats. +// +// See internal cache/clock_cache.h for full description. +struct HyperClockCacheOptions : public ShardedCacheOptions { + // The estimated average `charge` associated with cache entries. This is a + // critical configuration parameter for good performance from the hyper + // cache, because having a table size that is fixed at creation time greatly + // reduces the required synchronization between threads. + // * If the estimate is substantially too low (e.g. less than half the true + // average) then metadata space overhead will be substantially higher (e.g. + // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this + // can slightly reduce cache hit rates, and slightly reduce access times due + // to the larger working memory size. + // * If the estimate is substantially too high (e.g. 25% higher than the true + // average) then there might not be sufficient slots in the hash table for + // both efficient operation and capacity utilization (hit rate). 
The hyper + // cache will evict entries to prevent load factors that could dramatically + // affect lookup times, instead letting the hit rate suffer by not utilizing + // the full capacity. + // + // A reasonable choice is the larger of block_size and metadata_block_size. + // When WriteBufferManager (and similar) charge memory usage to the block + // cache, this can lead to the same effect as estimate being too low, which + // is better than the opposite. Therefore, the general recommendation is to + // assume that other memory charged to block cache could be negligible, and + // ignore it in making the estimate. + // + // The best parameter choice based on a cache in use is given by + // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as + // with kDontChargeCacheMetadata. More precisely with + // kFullChargeCacheMetadata is (GetUsage() - 64 * GetTableAddressCount()) / + // GetOccupancyCount(). However, when the average value size might vary + // (e.g. balance between metadata and data blocks in cache), it is better + // to estimate toward the lower side than the higher side. + size_t estimated_entry_charge; + + HyperClockCacheOptions( + size_t _capacity, size_t _estimated_entry_charge, + int _num_shard_bits = -1, bool _strict_capacity_limit = false, + std::shared_ptr _memory_allocator = nullptr, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + std::move(_memory_allocator), + _metadata_charge_policy), + estimated_entry_charge(_estimated_entry_charge) {} + + // Construct an instance of HyperClockCache using these options + std::shared_ptr MakeSharedCache() const; +}; + +// DEPRECATED - The old Clock Cache implementation had an unresolved bug and +// has been removed. The new HyperClockCache requires an additional +// configuration parameter that is not provided by this API. This function +// simply returns a new LRUCache for functional compatibility. 
extern std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 50c143f5f..6c5457a54 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -37,7 +37,6 @@ #include #include -#include "cache/clock_cache.h" #include "cache/fast_lru_cache.h" #include "db/db_impl/db_impl.h" #include "db/malloc_stats.h" @@ -3057,15 +3056,13 @@ class Benchmark { return nullptr; } if (FLAGS_cache_type == "clock_cache") { - auto cache = ExperimentalNewClockCache( - static_cast(capacity), FLAGS_block_size, - FLAGS_cache_numshardbits, false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy); - if (!cache) { - fprintf(stderr, "Clock cache not supported."); - exit(1); - } - return cache; + fprintf(stderr, "Old clock cache implementation has been removed.\n"); + exit(1); + } else if (FLAGS_cache_type == "hyper_clock_cache") { + return HyperClockCacheOptions(static_cast(capacity), + FLAGS_block_size /*estimated_entry_charge*/, + FLAGS_cache_numshardbits) + .MakeSharedCache(); } else if (FLAGS_cache_type == "fast_lru_cache") { return NewFastLRUCache(static_cast(capacity), FLAGS_block_size, FLAGS_cache_numshardbits, diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 1cdc62b94..2f7af92e3 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -116,7 +116,7 @@ default_params = { "use_direct_reads": lambda: random.randint(0, 1), "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1), "mock_direct_io": False, - "cache_type": lambda: random.choice(["lru_cache", "clock_cache"]), + "cache_type": lambda: random.choice(["lru_cache", "hyper_clock_cache"]), # fast_lru_cache is incompatible with stress tests, because it doesn't support strict_capacity_limit == false. "use_full_merge_v1": lambda: random.randint(0, 1), "use_merge": lambda: random.randint(0, 1),
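
A minimal usage sketch of the new API (not part of the diff): it uses only HyperClockCacheOptions and MakeSharedCache() from the include/rocksdb/cache.h changes above; BlockBasedTableOptions, NewBlockBasedTableFactory, and the default `rocksdb` namespace are pre-existing RocksDB public API, and the capacity/block size values are illustrative assumptions only.

  // Sketch: construct a HyperClockCache via HyperClockCacheOptions and use it
  // as the block cache of a block-based table factory.
  #include <memory>

  #include "rocksdb/cache.h"
  #include "rocksdb/options.h"
  #include "rocksdb/table.h"

  int main() {
    rocksdb::BlockBasedTableOptions table_opts;
    table_opts.block_size = 32 * 1024;  // illustrative value

    // Per the new cache.h comment, a reasonable estimated_entry_charge is the
    // larger of block_size and metadata_block_size; block_size is used here.
    rocksdb::HyperClockCacheOptions cache_opts(
        /*_capacity=*/size_t{1} << 30,  // 1 GiB, illustrative value
        /*_estimated_entry_charge=*/table_opts.block_size);
    // num_shard_bits (-1), strict_capacity_limit (false), memory_allocator
    // (nullptr), and metadata_charge_policy keep their defaults.
    std::shared_ptr<rocksdb::Cache> cache = cache_opts.MakeSharedCache();

    table_opts.block_cache = cache;
    rocksdb::Options options;
    options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));
    // `options` can now be used to open a DB whose block cache is the new
    // HyperClockCache.
    return 0;
  }

For the tools touched by this patch, the equivalent selection is -cache_type=hyper_clock_cache (cache_bench, db_bench, db_stress), which pass their existing block/value size flag as the estimated entry charge.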