diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 87245c0b3..f2b32daba 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -284,7 +284,9 @@ class CacheBench { } if (FLAGS_cache_type == "clock_cache") { - cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); + cache_ = NewClockCache( + FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits, + false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); if (!cache_) { fprintf(stderr, "Clock cache not supported.\n"); exit(1); diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 7ef540e1b..da357637b 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -77,7 +77,7 @@ class CacheTest : public testing::TestWithParam { static std::string type_; static void Deleter(const Slice& key, void* v) { - if (type_ == kFast) { + if (type_ == kFast || type_ == kClock) { current_->deleted_keys_.push_back(DecodeKey16Bytes(key)); } else { current_->deleted_keys_.push_back(DecodeKey32Bits(key)); @@ -111,7 +111,9 @@ class CacheTest : public testing::TestWithParam { return NewLRUCache(capacity); } if (type == kClock) { - return NewClockCache(capacity); + return NewClockCache( + capacity, 1 /*estimated_value_size*/, -1 /*num_shard_bits*/, + false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); } if (type == kFast) { return NewFastLRUCache( @@ -135,8 +137,8 @@ class CacheTest : public testing::TestWithParam { return NewLRUCache(co); } if (type == kClock) { - return NewClockCache(capacity, num_shard_bits, strict_capacity_limit, - charge_policy); + return NewClockCache(capacity, 1 /*estimated_value_size*/, num_shard_bits, + strict_capacity_limit, charge_policy); } if (type == kFast) { return NewFastLRUCache(capacity, 1 /*estimated_value_size*/, @@ -152,7 +154,8 @@ class CacheTest : public testing::TestWithParam { // LRUCache and ClockCache don't, so the encoding depends on // the cache type. std::string EncodeKey(int k) { - if (GetParam() == kFast) { + auto type = GetParam(); + if (type == kFast || type == kClock) { return EncodeKey16Bytes(k); } else { return EncodeKey32Bits(k); @@ -160,7 +163,8 @@ class CacheTest : public testing::TestWithParam { } int DecodeKey(const Slice& k) { - if (GetParam() == kFast) { + auto type = GetParam(); + if (type == kFast || type == kClock) { return DecodeKey16Bytes(k); } else { return DecodeKey32Bits(k); @@ -217,8 +221,9 @@ std::string CacheTest::type_; class LRUCacheTest : public CacheTest {}; TEST_P(CacheTest, UsageTest) { - if (GetParam() == kFast) { - ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys."); + auto type = GetParam(); + if (type == kFast || type == kClock) { + ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys."); return; } @@ -266,8 +271,9 @@ TEST_P(CacheTest, UsageTest) { } TEST_P(CacheTest, PinnedUsageTest) { - if (GetParam() == kFast) { - ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys."); + auto type = GetParam(); + if (type == kFast || type == kClock) { + ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys."); return; } @@ -492,8 +498,10 @@ TEST_P(CacheTest, EvictionPolicyRef) { Insert(302, 103); Insert(303, 104); - // Insert entries much more than Cache capacity - for (int i = 0; i < kCacheSize * 2; i++) { + // Insert entries much more than cache capacity. 
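+  // Scaling by the load factor below makes the number of inserts comfortably
+  // exceed the occupancy limits of the open-addressed hash tables used by
+  // FastLRUCache and ClockCache, not just the nominal cache capacity.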
+ double load_factor = + std::min(fast_lru_cache::kLoadFactor, clock_cache::kLoadFactor); + for (int i = 0; i < 2 * static_cast(kCacheSize / load_factor); i++) { Insert(1000 + i, 2000 + i); } @@ -523,8 +531,9 @@ TEST_P(CacheTest, EvictionPolicyRef) { } TEST_P(CacheTest, EvictEmptyCache) { - if (GetParam() == kFast) { - ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys."); + auto type = GetParam(); + if (type == kFast || type == kClock) { + ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys."); return; } @@ -534,8 +543,9 @@ TEST_P(CacheTest, EvictEmptyCache) { } TEST_P(CacheTest, EraseFromDeleter) { - if (GetParam() == kFast) { - ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys."); + auto type = GetParam(); + if (type == kFast || type == kClock) { + ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys."); return; } @@ -650,8 +660,10 @@ TEST_P(CacheTest, ReleaseWithoutErase) { } TEST_P(CacheTest, SetCapacity) { - if (GetParam() == kFast) { - ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments."); + auto type = GetParam(); + if (type == kFast || type == kClock) { + ROCKSDB_GTEST_BYPASS( + "FastLRUCache and ClockCache don't support capacity adjustments."); return; } // test1: increase capacity @@ -702,9 +714,11 @@ TEST_P(CacheTest, SetCapacity) { } TEST_P(LRUCacheTest, SetStrictCapacityLimit) { - if (GetParam() == kFast) { + auto type = GetParam(); + if (type == kFast || type == kClock) { ROCKSDB_GTEST_BYPASS( - "FastLRUCache doesn't support an unbounded number of inserts beyond " + "FastLRUCache and ClockCache don't support an unbounded number of " + "inserts beyond " "capacity."); return; } @@ -759,8 +773,10 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) { } TEST_P(CacheTest, OverCapacity) { - if (GetParam() == kFast) { - ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments."); + auto type = GetParam(); + if (type == kFast || type == kClock) { + ROCKSDB_GTEST_BYPASS( + "FastLRUCache and ClockCache don't support capacity adjustments."); return; } size_t n = 10; @@ -938,15 +954,10 @@ TEST_P(CacheTest, GetChargeAndDeleter) { cache_->Release(h1); } -#ifdef SUPPORT_CLOCK_CACHE std::shared_ptr (*new_clock_cache_func)( - size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache; + size_t, size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache; INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU, kClock, kFast)); -#else -INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, - testing::Values(kLRU, kFast)); -#endif // SUPPORT_CLOCK_CACHE INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU, kFast)); diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 20b9f0a63..8fd0b012f 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -9,831 +9,587 @@ #include "cache/clock_cache.h" -#ifndef SUPPORT_CLOCK_CACHE +#include +#include +#include +#include -namespace ROCKSDB_NAMESPACE { - -std::shared_ptr NewClockCache( - size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/, - CacheMetadataChargePolicy /*metadata_charge_policy*/) { - // Clock cache not supported. 
- return nullptr; -} +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" +#include "util/distributed_mutex.h" +#include "util/hash.h" +#include "util/math.h" +#include "util/random.h" -} // namespace ROCKSDB_NAMESPACE +namespace ROCKSDB_NAMESPACE { -#else +namespace clock_cache { -#include -#include -#include +ClockHandleTable::ClockHandleTable(int hash_bits) + : length_bits_(hash_bits), + length_bits_mask_((uint32_t{1} << length_bits_) - 1), + occupancy_(0), + occupancy_limit_(static_cast((uint32_t{1} << length_bits_) * + kStrictLoadFactor)), + array_(new ClockHandle[size_t{1} << length_bits_]) { + assert(hash_bits <= 32); +} -// "tbb/concurrent_hash_map.h" requires RTTI if exception is enabled. -// Disable it so users can chooose to disable RTTI. -#ifndef ROCKSDB_USE_RTTI -#define TBB_USE_EXCEPTIONS 0 -#endif -#include "cache/sharded_cache.h" -#include "port/lang.h" -#include "port/malloc.h" -#include "port/port.h" -#include "tbb/concurrent_hash_map.h" -#include "util/autovector.h" -#include "util/distributed_mutex.h" +ClockHandleTable::~ClockHandleTable() { + ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize()); +} -namespace ROCKSDB_NAMESPACE { +ClockHandle* ClockHandleTable::Lookup(const Slice& key) { + int probe = 0; + int slot = FindVisibleElement(key, probe, 0); + return (slot == -1) ? nullptr : &array_[slot]; +} -namespace { - -// An implementation of the Cache interface based on CLOCK algorithm, with -// better concurrent performance than LRUCache. The idea of CLOCK algorithm -// is to maintain all cache entries in a circular list, and an iterator -// (the "head") pointing to the last examined entry. Eviction starts from the -// current head. Each entry is given a second chance before eviction, if it -// has been access since last examine. In contrast to LRU, no modification -// to the internal data-structure (except for flipping the usage bit) needs -// to be done upon lookup. This gives us oppertunity to implement a cache -// with better concurrency. -// -// Each cache entry is represented by a cache handle, and all the handles -// are arranged in a circular list, as describe above. Upon erase of an entry, -// we never remove the handle. Instead, the handle is put into a recycle bin -// to be re-use. This is to avoid memory dealocation, which is hard to deal -// with in concurrent environment. -// -// The cache also maintains a concurrent hash map for lookup. Any concurrent -// hash map implementation should do the work. We currently use -// tbb::concurrent_hash_map because it supports concurrent erase. -// -// Each cache handle has the following flags and counters, which are squeeze -// in an atomic interger, to make sure the handle always be in a consistent -// state: -// -// * In-cache bit: whether the entry is reference by the cache itself. If -// an entry is in cache, its key would also be available in the hash map. -// * Usage bit: whether the entry has been access by user since last -// examine for eviction. Can be reset by eviction. -// * Reference count: reference count by user. -// -// An entry can be reference only when it's in cache. An entry can be evicted -// only when it is in cache, has no usage since last examine, and reference -// count is zero. -// -// The follow figure shows a possible layout of the cache. Boxes represents -// cache handles and numbers in each box being in-cache bit, usage bit and -// reference count respectively. 
-// -// hash map: -// +-------+--------+ -// | key | handle | -// +-------+--------+ -// | "foo" | 5 |-------------------------------------+ -// +-------+--------+ | -// | "bar" | 2 |--+ | -// +-------+--------+ | | -// | | -// head | | -// | | | -// circular list: | | | -// +-------+ +-------+ +-------+ +-------+ +-------+ +------- -// |(0,0,0)|---|(1,1,0)|---|(0,0,0)|---|(0,1,3)|---|(1,0,0)|---| ... -// +-------+ +-------+ +-------+ +-------+ +-------+ +------- -// | | -// +-------+ +-----------+ -// | | -// +---+---+ -// recycle bin: | 1 | 3 | -// +---+---+ -// -// Suppose we try to insert "baz" into the cache at this point and the cache is -// full. The cache will first look for entries to evict, starting from where -// head points to (the second entry). It resets usage bit of the second entry, -// skips the third and fourth entry since they are not in cache, and finally -// evict the fifth entry ("foo"). It looks at recycle bin for available handle, -// grabs handle 3, and insert the key into the handle. The following figure -// shows the resulting layout. -// -// hash map: -// +-------+--------+ -// | key | handle | -// +-------+--------+ -// | "baz" | 3 |-------------+ -// +-------+--------+ | -// | "bar" | 2 |--+ | -// +-------+--------+ | | -// | | -// | | head -// | | | -// circular list: | | | -// +-------+ +-------+ +-------+ +-------+ +-------+ +------- -// |(0,0,0)|---|(1,0,0)|---|(1,0,0)|---|(0,1,3)|---|(0,0,0)|---| ... -// +-------+ +-------+ +-------+ +-------+ +-------+ +------- -// | | -// +-------+ +-----------------------------------+ -// | | -// +---+---+ -// recycle bin: | 1 | 5 | -// +---+---+ -// -// A global mutex guards the circular list, the head, and the recycle bin. -// We additionally require that modifying the hash map needs to hold the mutex. -// As such, Modifying the cache (such as Insert() and Erase()) require to -// hold the mutex. Lookup() only access the hash map and the flags associated -// with each handle, and don't require explicit locking. Release() has to -// acquire the mutex only when it releases the last reference to the entry and -// the entry has been erased from cache explicitly. A future improvement could -// be to remove the mutex completely. -// -// Benchmark: -// We run readrandom db_bench on a test DB of size 13GB, with size of each -// level: -// -// Level Files Size(MB) -// ------------------------- -// L0 1 0.01 -// L1 18 17.32 -// L2 230 182.94 -// L3 1186 1833.63 -// L4 4602 8140.30 -// -// We test with both 32 and 16 read threads, with 2GB cache size (the whole DB -// doesn't fits in) and 64GB cache size (the whole DB can fit in cache), and -// whether to put index and filter blocks in block cache. The benchmark runs -// with -// with RocksDB 4.10. We got the following result: -// -// Threads Cache Cache ClockCache LRUCache -// Size Index/Filter Throughput(MB/s) Hit Throughput(MB/s) Hit -// 32 2GB yes 466.7 85.9% 433.7 86.5% -// 32 2GB no 529.9 72.7% 532.7 73.9% -// 32 64GB yes 649.9 99.9% 507.9 99.9% -// 32 64GB no 740.4 99.9% 662.8 99.9% -// 16 2GB yes 278.4 85.9% 283.4 86.5% -// 16 2GB no 318.6 72.7% 335.8 73.9% -// 16 64GB yes 391.9 99.9% 353.3 99.9% -// 16 64GB no 433.8 99.8% 419.4 99.8% - -// Cache entry meta data. -struct CacheHandle { - Slice key; - void* value; - size_t charge; - Cache::DeleterFn deleter; - uint32_t hash; - - // Addition to "charge" to get "total charge" under metadata policy. 
- uint32_t meta_charge; - - // Flags and counters associated with the cache handle: - // lowest bit: in-cache bit - // second lowest bit: usage bit - // the rest bits: reference count - // The handle is unused when flags equals to 0. The thread decreases the count - // to 0 is responsible to put the handle back to recycle_ and cleanup memory. - std::atomic flags; - - CacheHandle() = default; - - CacheHandle(const CacheHandle& a) { *this = a; } - - CacheHandle(const Slice& k, void* v, - void (*del)(const Slice& key, void* value)) - : key(k), value(v), deleter(del) {} - - CacheHandle& operator=(const CacheHandle& a) { - // Only copy members needed for deletion. - key = a.key; - value = a.value; - deleter = a.deleter; - return *this; +ClockHandle* ClockHandleTable::Insert(ClockHandle* h, ClockHandle** old) { + int probe = 0; + int slot = + FindVisibleElementOrAvailableSlot(h->key(), probe, 1 /*displacement*/); + *old = nullptr; + if (slot == -1) { + return nullptr; } - inline static uint32_t CalcMetadataCharge( - Slice key, CacheMetadataChargePolicy metadata_charge_policy) { - size_t meta_charge = 0; - if (metadata_charge_policy == kFullChargeCacheMetadata) { - meta_charge += sizeof(CacheHandle); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - meta_charge += - malloc_usable_size(static_cast(const_cast(key.data()))); -#else - meta_charge += key.size(); -#endif + if (array_[slot].IsEmpty() || array_[slot].IsTombstone()) { + bool empty = array_[slot].IsEmpty(); + Assign(slot, h); + ClockHandle* new_entry = &array_[slot]; + if (empty) { + // This used to be an empty slot. + return new_entry; + } + // It used to be a tombstone, so there may already be a copy of the + // key in the table. + slot = FindVisibleElement(h->key(), probe, 0 /*displacement*/); + if (slot == -1) { + // No existing copy of the key. + return new_entry; } - assert(meta_charge <= UINT32_MAX); - return static_cast(meta_charge); + *old = &array_[slot]; + return new_entry; + } else { + // There is an existing copy of the key. + *old = &array_[slot]; + // Find an available slot for the new element. + array_[slot].displacements++; + slot = FindAvailableSlot(h->key(), probe, 1 /*displacement*/); + if (slot == -1) { + // No available slots. Roll back displacements. + probe = 0; + slot = FindVisibleElement(h->key(), probe, -1); + array_[slot].displacements--; + FindAvailableSlot(h->key(), probe, -1); + return nullptr; + } + Assign(slot, h); + return &array_[slot]; } +} - inline size_t GetTotalCharge() { return charge + meta_charge; } -}; +void ClockHandleTable::Remove(ClockHandle* h) { + assert(!h->IsInClockList()); // Already off the clock list. + int probe = 0; + FindSlot( + h->key(), [&h](ClockHandle* e) { return e == h; }, probe, + -1 /*displacement*/); + h->SetIsVisible(false); + h->SetIsElement(false); + occupancy_--; +} -// Key of hash map. We store hash value with the key for convenience. 
-struct ClockCacheKey { - Slice key; - uint32_t hash_value; +void ClockHandleTable::Assign(int slot, ClockHandle* h) { + ClockHandle* dst = &array_[slot]; + uint32_t disp = dst->displacements; + *dst = *h; + dst->displacements = disp; + dst->SetIsVisible(true); + dst->SetIsElement(true); + dst->SetPriority(ClockHandle::ClockPriority::NONE); + occupancy_++; +} - ClockCacheKey() = default; +void ClockHandleTable::Exclude(ClockHandle* h) { h->SetIsVisible(false); } - ClockCacheKey(const Slice& k, uint32_t h) { - key = k; - hash_value = h; - } +int ClockHandleTable::FindVisibleElement(const Slice& key, int& probe, + int displacement) { + return FindSlot( + key, [&](ClockHandle* h) { return h->Matches(key) && h->IsVisible(); }, + probe, displacement); +} - static bool equal(const ClockCacheKey& a, const ClockCacheKey& b) { - return a.hash_value == b.hash_value && a.key == b.key; - } +int ClockHandleTable::FindAvailableSlot(const Slice& key, int& probe, + int displacement) { + return FindSlot( + key, [](ClockHandle* h) { return h->IsEmpty() || h->IsTombstone(); }, + probe, displacement); +} - static size_t hash(const ClockCacheKey& a) { - return static_cast(a.hash_value); - } -}; - -struct CleanupContext { - // List of values to be deleted, along with the key and deleter. - autovector to_delete_value; - - // List of keys to be deleted. - autovector to_delete_key; -}; - -// A cache shard which maintains its own CLOCK cache. -class ClockCacheShard final : public CacheShard { - public: - // Hash map type. - using HashTable = - tbb::concurrent_hash_map; - - ClockCacheShard(); - ~ClockCacheShard() override; - - // Interfaces - void SetCapacity(size_t capacity) override; - void SetStrictCapacityLimit(bool strict_capacity_limit) override; - Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Cache::Handle** handle, Cache::Priority priority) override; - Status Insert(const Slice& key, uint32_t hash, void* value, - const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, Cache::Priority priority) override { - return Insert(key, hash, value, charge, helper->del_cb, handle, priority); - } - Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; - Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* /*helper*/, - const Cache::CreateCallback& /*create_cb*/, - Cache::Priority /*priority*/, bool /*wait*/, - Statistics* /*stats*/) override { - return Lookup(key, hash); - } - bool Release(Cache::Handle* handle, bool /*useful*/, - bool erase_if_last_ref) override { - return Release(handle, erase_if_last_ref); - } - bool IsReady(Cache::Handle* /*handle*/) override { return true; } - void Wait(Cache::Handle* /*handle*/) override {} - - // If the entry in in cache, increase reference count and return true. - // Return false otherwise. - // - // Not necessary to hold mutex_ before being called. 
- bool Ref(Cache::Handle* handle) override; - bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; - void Erase(const Slice& key, uint32_t hash) override; - bool EraseAndConfirm(const Slice& key, uint32_t hash, - CleanupContext* context); - size_t GetUsage() const override; - size_t GetPinnedUsage() const override; - void EraseUnRefEntries() override; - void ApplyToSomeEntries( - const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) override; - - private: - static const uint32_t kInCacheBit = 1; - static const uint32_t kUsageBit = 2; - static const uint32_t kRefsOffset = 2; - static const uint32_t kOneRef = 1 << kRefsOffset; - - // Helper functions to extract cache handle flags and counters. - static bool InCache(uint32_t flags) { return flags & kInCacheBit; } - static bool HasUsage(uint32_t flags) { return flags & kUsageBit; } - static uint32_t CountRefs(uint32_t flags) { return flags >> kRefsOffset; } - - // Decrease reference count of the entry. If this decreases the count to 0, - // recycle the entry. If set_usage is true, also set the usage bit. - // - // returns true if a value is erased. - // - // Not necessary to hold mutex_ before being called. - bool Unref(CacheHandle* handle, bool set_usage, CleanupContext* context); - - // Unset in-cache bit of the entry. Recycle the handle if necessary. - // - // returns true if a value is erased. - // - // Has to hold mutex_ before being called. - bool UnsetInCache(CacheHandle* handle, CleanupContext* context); - - // Put the handle back to recycle_ list, and put the value associated with - // it into to-be-deleted list. It doesn't cleanup the key as it might be - // reused by another handle. - // - // Has to hold mutex_ before being called. - void RecycleHandle(CacheHandle* handle, CleanupContext* context); - - // Delete keys and values in to-be-deleted list. Call the method without - // holding mutex, as destructors can be expensive. - void Cleanup(const CleanupContext& context); - - // Examine the handle for eviction. If the handle is in cache, usage bit is - // not set, and referece count is 0, evict it from cache. Otherwise unset - // the usage bit. - // - // Has to hold mutex_ before being called. - bool TryEvict(CacheHandle* value, CleanupContext* context); - - // Scan through the circular list, evict entries until we get enough capacity - // for new cache entry of specific size. Return true if success, false - // otherwise. - // - // Has to hold mutex_ before being called. - bool EvictFromCache(size_t charge, CleanupContext* context); - - CacheHandle* Insert(const Slice& key, uint32_t hash, void* value, - size_t change, - void (*deleter)(const Slice& key, void* value), - bool hold_reference, CleanupContext* context, - bool* overwritten); - - // Guards list_, head_, and recycle_. In addition, updating table_ also has - // to hold the mutex, to avoid the cache being in inconsistent state. - mutable DMutex mutex_; - - // The circular list of cache handles. Initially the list is empty. Once a - // handle is needed by insertion, and no more handles are available in - // recycle bin, one more handle is appended to the end. - // - // We use std::deque for the circular list because we want to make sure - // pointers to handles are valid through out the life-cycle of the cache - // (in contrast to std::vector), and be able to grow the list (in contrast - // to statically allocated arrays). 
- std::deque list_; - - // Pointer to the next handle in the circular list to be examine for - // eviction. - size_t head_; - - // Recycle bin of cache handles. - autovector recycle_; - - // Maximum cache size. - std::atomic capacity_; - - // Current total size of the cache. - std::atomic usage_; - - // Total un-released cache size. - std::atomic pinned_usage_; - - // Whether allow insert into cache if cache is full. - std::atomic strict_capacity_limit_; - - // Hash table (tbb::concurrent_hash_map) for lookup. - HashTable table_; -}; - -ClockCacheShard::ClockCacheShard() - : head_(0), usage_(0), pinned_usage_(0), strict_capacity_limit_(false) {} - -ClockCacheShard::~ClockCacheShard() { - for (auto& handle : list_) { - uint32_t flags = handle.flags.load(std::memory_order_relaxed); - if (InCache(flags) || CountRefs(flags) > 0) { - if (handle.deleter != nullptr) { - (*handle.deleter)(handle.key, handle.value); - } - delete[] handle.key.data(); +int ClockHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key, + int& probe, + int displacement) { + return FindSlot( + key, + [&](ClockHandle* h) { + return h->IsEmpty() || h->IsTombstone() || + (h->Matches(key) && h->IsVisible()); + }, + probe, displacement); +} + +inline int ClockHandleTable::FindSlot(const Slice& key, + std::function cond, + int& probe, int displacement) { + uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); + uint32_t increment = + ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); + uint32_t current = ModTableSize(base + probe * increment); + while (true) { + ClockHandle* h = &array_[current]; + probe++; + if (current == base && probe > 1) { + // We looped back. + return -1; + } + if (cond(h)) { + return current; + } + if (h->IsEmpty()) { + // We check emptyness after the condition, because + // the condition may be emptyness. + return -1; } + h->displacements += displacement; + current = ModTableSize(current + increment); } } -size_t ClockCacheShard::GetUsage() const { - return usage_.load(std::memory_order_relaxed); +ClockCacheShard::ClockCacheShard( + size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) + : capacity_(capacity), + strict_capacity_limit_(strict_capacity_limit), + clock_pointer_(0), + table_( + CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)), + usage_(0), + clock_usage_(0) { + set_metadata_charge_policy(metadata_charge_policy); } -size_t ClockCacheShard::GetPinnedUsage() const { - return pinned_usage_.load(std::memory_order_relaxed); +void ClockCacheShard::EraseUnRefEntries() { + autovector last_reference_list; + { + DMutexLock l(mutex_); + uint32_t slot = 0; + do { + ClockHandle* old = &(table_.array_[slot]); + if (!old->IsInClockList()) { + continue; + } + ClockRemove(old); + table_.Remove(old); + assert(usage_ >= old->total_charge); + usage_ -= old->total_charge; + last_reference_list.push_back(*old); + slot = table_.ModTableSize(slot + 1); + } while (slot != 0); + } + + // Free the entries here outside of mutex for performance reasons. + for (auto& h : last_reference_list) { + h.FreeData(); + } } void ClockCacheShard::ApplyToSomeEntries( const std::function& callback, uint32_t average_entries_per_lock, uint32_t* state) { - assert(average_entries_per_lock > 0); + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. 
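+  // For example (hypothetical numbers): with length_bits == 8 and
+  // average_entries_per_lock == 16, a starting *state of 0x30000000 maps to
+  // index_begin == 0x30 and index_end == 0x40, and the *state handed back to
+  // the caller is 0x40000000 (or UINT32_MAX once index_end reaches the table
+  // size).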
DMutexLock l(mutex_); + uint32_t length_bits = table_.GetLengthBits(); + uint32_t length = table_.GetTableSize(); - // Figure out the range to iterate, update `state` - size_t list_size = list_.size(); - size_t start_idx = *state; - size_t end_idx = start_idx + average_entries_per_lock; - if (start_idx > list_size) { - // Shouldn't reach here, but recoverable - assert(false); - // Mark finished with all - *state = UINT32_MAX; - return; - } - if (end_idx >= list_size || end_idx >= UINT32_MAX) { - // This also includes the hypothetical case of >4 billion - // cache handles. - end_idx = list_size; - // Mark finished with all + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow). + assert(average_entries_per_lock < length || *state == 0); + + uint32_t index_begin = *state >> (32 - length_bits); + uint32_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end + index_end = length; *state = UINT32_MAX; } else { - *state = static_cast(end_idx); + *state = index_end << (32 - length_bits); } - // Do the iteration - auto cur = list_.begin() + start_idx; - auto end = list_.begin() + end_idx; - for (; cur != end; ++cur) { - const CacheHandle& handle = *cur; - // Use relaxed semantics instead of acquire semantics since we are - // holding mutex - uint32_t flags = handle.flags.load(std::memory_order_relaxed); - if (InCache(flags)) { - callback(handle.key, handle.value, handle.charge, handle.deleter); - } - } + table_.ApplyToEntriesRange( + [callback, + metadata_charge_policy = metadata_charge_policy_](ClockHandle* h) { + callback(h->key(), h->value, h->GetCharge(metadata_charge_policy), + h->deleter); + }, + index_begin, index_end); } -void ClockCacheShard::RecycleHandle(CacheHandle* handle, - CleanupContext* context) { - mutex_.AssertHeld(); - assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); - context->to_delete_key.push_back(handle->key.data()); - context->to_delete_value.emplace_back(*handle); - size_t total_charge = handle->GetTotalCharge(); - // clearing `handle` fields would go here but not strictly required - recycle_.push_back(handle); - usage_.fetch_sub(total_charge, std::memory_order_relaxed); +void ClockCacheShard::ClockRemove(ClockHandle* h) { + assert(h->IsInClockList()); + h->SetPriority(ClockHandle::ClockPriority::NONE); + assert(clock_usage_ >= h->total_charge); + clock_usage_ -= h->total_charge; } -void ClockCacheShard::Cleanup(const CleanupContext& context) { - for (const CacheHandle& handle : context.to_delete_value) { - if (handle.deleter) { - (*handle.deleter)(handle.key, handle.value); - } - } - for (const char* key : context.to_delete_key) { - delete[] key; - } +void ClockCacheShard::ClockInsert(ClockHandle* h) { + assert(!h->IsInClockList()); + h->SetPriority(ClockHandle::ClockPriority::HIGH); + clock_usage_ += h->total_charge; } -bool ClockCacheShard::Ref(Cache::Handle* h) { - auto handle = reinterpret_cast(h); - // CAS loop to increase reference count. - uint32_t flags = handle->flags.load(std::memory_order_relaxed); - while (InCache(flags)) { - // Use acquire semantics on success, as further operations on the cache - // entry has to be order after reference count is increased. - if (handle->flags.compare_exchange_weak(flags, flags + kOneRef, - std::memory_order_acquire, - std::memory_order_relaxed)) { - if (CountRefs(flags) == 0) { - // No reference count before the operation. 
- size_t total_charge = handle->GetTotalCharge(); - pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); - } - return true; +void ClockCacheShard::EvictFromClock(size_t charge, + autovector* deleted) { + assert(charge <= capacity_); + while (clock_usage_ > 0 && (usage_ + charge) > capacity_) { + ClockHandle* old = &table_.array_[clock_pointer_]; + clock_pointer_ = table_.ModTableSize(clock_pointer_ + 1); + // Clock list contains only elements which can be evicted. + if (!old->IsInClockList()) { + continue; } + if (old->GetPriority() == ClockHandle::ClockPriority::LOW) { + ClockRemove(old); + table_.Remove(old); + assert(usage_ >= old->total_charge); + usage_ -= old->total_charge; + deleted->push_back(*old); + return; + } + old->DecreasePriority(); } - return false; } -bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, - CleanupContext* context) { - if (set_usage) { - handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed); - } - // If the handle reaches state refs=0 and InCache=true after this - // atomic operation then we cannot access `handle` afterward, because - // it could be evicted before we access the `handle`. - size_t total_charge = handle->GetTotalCharge(); - - // Use acquire-release semantics as previous operations on the cache entry - // has to be order before reference count is decreased, and potential cleanup - // of the entry has to be order after. - uint32_t flags = handle->flags.fetch_sub(kOneRef, std::memory_order_acq_rel); - assert(CountRefs(flags) > 0); - if (CountRefs(flags) == 1) { - // this is the last reference. - pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); - // Cleanup if it is the last reference. - if (!InCache(flags)) { - DMutexLock l(mutex_); - RecycleHandle(handle, context); - } - } - return context->to_delete_value.size(); -} - -bool ClockCacheShard::UnsetInCache(CacheHandle* handle, - CleanupContext* context) { - mutex_.AssertHeld(); - // Use acquire-release semantics as previous operations on the cache entry - // has to be order before reference count is decreased, and potential cleanup - // of the entry has to be order after. - uint32_t flags = - handle->flags.fetch_and(~kInCacheBit, std::memory_order_acq_rel); - // Cleanup if it is the last reference. 
- if (InCache(flags) && CountRefs(flags) == 0) { - RecycleHandle(handle, context); - } - return context->to_delete_value.size(); -} - -bool ClockCacheShard::TryEvict(CacheHandle* handle, CleanupContext* context) { - mutex_.AssertHeld(); - uint32_t flags = kInCacheBit; - if (handle->flags.compare_exchange_strong(flags, 0, std::memory_order_acquire, - std::memory_order_relaxed)) { - bool erased __attribute__((__unused__)) = - table_.erase(ClockCacheKey(handle->key, handle->hash)); - assert(erased); - RecycleHandle(handle, context); - return true; - } - handle->flags.fetch_and(~kUsageBit, std::memory_order_relaxed); - return false; +size_t ClockCacheShard::CalcEstimatedHandleCharge( + size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy) { + ClockHandle h; + h.CalcTotalCharge(estimated_value_size, metadata_charge_policy); + return h.total_charge; } -bool ClockCacheShard::EvictFromCache(size_t charge, CleanupContext* context) { - size_t usage = usage_.load(std::memory_order_relaxed); - size_t capacity = capacity_.load(std::memory_order_relaxed); - if (usage == 0) { - return charge <= capacity; - } - size_t new_head = head_; - bool second_iteration = false; - while (usage + charge > capacity) { - assert(new_head < list_.size()); - if (TryEvict(&list_[new_head], context)) { - usage = usage_.load(std::memory_order_relaxed); - } - new_head = (new_head + 1 >= list_.size()) ? 0 : new_head + 1; - if (new_head == head_) { - if (second_iteration) { - return false; - } else { - second_iteration = true; - } - } +int ClockCacheShard::CalcHashBits( + size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy) { + size_t handle_charge = + CalcEstimatedHandleCharge(estimated_value_size, metadata_charge_policy); + uint32_t num_entries = + static_cast(capacity / (kLoadFactor * handle_charge)); + + if (num_entries == 0) { + return 0; } - head_ = new_head; - return true; + int hash_bits = FloorLog2(num_entries); + return hash_bits + (size_t{1} << hash_bits < num_entries ? 1 : 0); } void ClockCacheShard::SetCapacity(size_t capacity) { - CleanupContext context; + assert(false); // Not supported. TODO(Guido) Support it? + autovector last_reference_list; { DMutexLock l(mutex_); - capacity_.store(capacity, std::memory_order_relaxed); - EvictFromCache(0, &context); + capacity_ = capacity; + EvictFromClock(0, &last_reference_list); + } + + // Free the entries here outside of mutex for performance reasons. + for (auto& h : last_reference_list) { + h.FreeData(); } - Cleanup(context); } void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { - strict_capacity_limit_.store(strict_capacity_limit, - std::memory_order_relaxed); -} - -CacheHandle* ClockCacheShard::Insert( - const Slice& key, uint32_t hash, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), bool hold_reference, - CleanupContext* context, bool* overwritten) { - assert(overwritten != nullptr && *overwritten == false); - uint32_t meta_charge = - CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_); - size_t total_charge = charge + meta_charge; DMutexLock l(mutex_); - bool success = EvictFromCache(total_charge, context); - bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); - if (!success && (strict || !hold_reference)) { - context->to_delete_key.push_back(key.data()); - if (!hold_reference) { - context->to_delete_value.emplace_back(key, value, deleter); - } - return nullptr; - } - // Grab available handle from recycle bin. 
If recycle bin is empty, create - // and append new handle to end of circular list. - CacheHandle* handle = nullptr; - if (!recycle_.empty()) { - handle = recycle_.back(); - recycle_.pop_back(); - } else { - list_.emplace_back(); - handle = &list_.back(); - } - // Fill handle. - handle->key = key; - handle->hash = hash; - handle->value = value; - handle->charge = charge; - handle->meta_charge = meta_charge; - handle->deleter = deleter; - uint32_t flags = hold_reference ? kInCacheBit + kOneRef : kInCacheBit; - - // TODO investigate+fix suspected race condition: - // [thread 1] Lookup starts, up to Ref() - // [thread 2] Erase/evict the entry just looked up - // [thread 1] Ref() the handle, even though it's in the recycle bin - // [thread 2] Insert with recycling that handle - // Here we obliterate the other thread's Ref - // Possible fix: never blindly overwrite the flags, but only make - // relative updates (fetch_add, etc). - handle->flags.store(flags, std::memory_order_relaxed); - HashTable::accessor accessor; - if (table_.find(accessor, ClockCacheKey(key, hash))) { - *overwritten = true; - CacheHandle* existing_handle = accessor->second; - table_.erase(accessor); - UnsetInCache(existing_handle, context); - } - table_.insert(HashTable::value_type(ClockCacheKey(key, hash), handle)); - if (hold_reference) { - pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); - } - usage_.fetch_add(total_charge, std::memory_order_relaxed); - return handle; + strict_capacity_limit_ = strict_capacity_limit; } Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), - Cache::Handle** out_handle, + size_t charge, Cache::DeleterFn deleter, + Cache::Handle** handle, Cache::Priority /*priority*/) { - CleanupContext context; - HashTable::accessor accessor; - char* key_data = new char[key.size()]; - memcpy(key_data, key.data(), key.size()); - Slice key_copy(key_data, key.size()); - bool overwritten = false; - CacheHandle* handle = Insert(key_copy, hash, value, charge, deleter, - out_handle != nullptr, &context, &overwritten); - Status s; - if (out_handle != nullptr) { - if (handle == nullptr) { - s = Status::Incomplete("Insert failed due to CLOCK cache being full."); + if (key.size() != kCacheKeySize) { + return Status::NotSupported("ClockCache only supports key size " + + std::to_string(kCacheKeySize) + "B"); + } + + ClockHandle tmp; + tmp.value = value; + tmp.deleter = deleter; + tmp.hash = hash; + tmp.CalcTotalCharge(charge, metadata_charge_policy_); + for (int i = 0; i < kCacheKeySize; i++) { + tmp.key_data[i] = key.data()[i]; + } + + Status s = Status::OK(); + autovector last_reference_list; + { + DMutexLock l(mutex_); + assert(table_.GetOccupancy() <= table_.GetOccupancyLimit()); + // Free the space following strict clock policy until enough space + // is freed or the clock list is empty. + EvictFromClock(tmp.total_charge, &last_reference_list); + if ((usage_ + tmp.total_charge > capacity_ && + (strict_capacity_limit_ || handle == nullptr)) || + table_.GetOccupancy() == table_.GetOccupancyLimit()) { + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + last_reference_list.push_back(tmp); + } else { + if (table_.GetOccupancy() == table_.GetOccupancyLimit()) { + s = Status::Incomplete( + "Insert failed because all slots in the hash table are full."); + // TODO(Guido) Use the correct statuses. 
+ } else { + s = Status::Incomplete( + "Insert failed because the total charge has exceeded the " + "capacity."); + } + } } else { - *out_handle = reinterpret_cast(handle); + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. + ClockHandle* old; + ClockHandle* h = table_.Insert(&tmp, &old); + assert(h != nullptr); // We're below occupancy, so this insertion should + // never fail. + usage_ += h->total_charge; + if (old != nullptr) { + s = Status::OkOverwritten(); + assert(old->IsVisible()); + table_.Exclude(old); + if (!old->HasRefs()) { + // old is in clock because it's in cache and its reference count is 0. + ClockRemove(old); + table_.Remove(old); + assert(usage_ >= old->total_charge); + usage_ -= old->total_charge; + last_reference_list.push_back(*old); + } + } + if (handle == nullptr) { + ClockInsert(h); + } else { + // If caller already holds a ref, no need to take one here. + if (!h->HasRefs()) { + h->Ref(); + } + *handle = reinterpret_cast(h); + } } } - if (overwritten) { - assert(s.ok()); - s = Status::OkOverwritten(); + + // Free the entries here outside of mutex for performance reasons. + for (auto& h : last_reference_list) { + h.FreeData(); } - Cleanup(context); + return s; } -Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { - HashTable::const_accessor accessor; - if (!table_.find(accessor, ClockCacheKey(key, hash))) { - return nullptr; - } - CacheHandle* handle = accessor->second; - accessor.release(); - // Ref() could fail if another thread sneak in and evict/erase the cache - // entry before we are able to hold reference. - if (!Ref(reinterpret_cast(handle))) { - return nullptr; - } - // Double check the key since the handle may now representing another key - // if other threads sneak in, evict/erase the entry and re-used the handle - // for another cache entry. - if (hash != handle->hash || key != handle->key) { - CleanupContext context; - Unref(handle, false, &context); - // It is possible Unref() delete the entry, so we need to cleanup. - Cleanup(context); - return nullptr; +Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t /* hash */) { + ClockHandle* h = nullptr; + { + DMutexLock l(mutex_); + h = table_.Lookup(key); + if (h != nullptr) { + assert(h->IsVisible()); + if (!h->HasRefs()) { + // The entry is in clock since it's in the hash table and has no + // external references. + ClockRemove(h); + } + h->Ref(); + } } - return reinterpret_cast(handle); + return reinterpret_cast(h); } -bool ClockCacheShard::Release(Cache::Handle* h, bool erase_if_last_ref) { - CleanupContext context; - CacheHandle* handle = reinterpret_cast(h); - bool erased = Unref(handle, true, &context); - if (erase_if_last_ref && !erased) { - erased = EraseAndConfirm(handle->key, handle->hash, &context); - } - Cleanup(context); - return erased; +bool ClockCacheShard::Ref(Cache::Handle* h) { + ClockHandle* e = reinterpret_cast(h); + DMutexLock l(mutex_); + // To create another reference - entry must be already externally referenced. 
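+  // (Handles are only handed out by Lookup() and Insert(), both of which take
+  // a reference, so a zero refcount here would indicate a misuse of the API.)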
+ assert(e->HasRefs()); + e->Ref(); + return true; } -void ClockCacheShard::Erase(const Slice& key, uint32_t hash) { - CleanupContext context; - EraseAndConfirm(key, hash, &context); - Cleanup(context); -} +bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { + if (handle == nullptr) { + return false; + } + ClockHandle* h = reinterpret_cast(handle); + ClockHandle copy; + bool last_reference = false; + assert(!h->IsInClockList()); + { + DMutexLock l(mutex_); + last_reference = h->Unref(); + if (last_reference && h->IsVisible()) { + // The item is still in cache, and nobody else holds a reference to it. + if (usage_ > capacity_ || erase_if_last_ref) { + // The clock list must be empty since the cache is full. + assert(clock_usage_ == 0 || erase_if_last_ref); + // Take this opportunity and remove the item. + table_.Remove(h); + } else { + // Put the item back on the clock list, and don't free it. + ClockInsert(h); + last_reference = false; + } + } + // If it was the last reference, then decrement the cache usage. + if (last_reference) { + assert(usage_ >= h->total_charge); + usage_ -= h->total_charge; + copy = *h; + } + } -bool ClockCacheShard::EraseAndConfirm(const Slice& key, uint32_t hash, - CleanupContext* context) { - DMutexLock l(mutex_); - HashTable::accessor accessor; - bool erased = false; - if (table_.find(accessor, ClockCacheKey(key, hash))) { - CacheHandle* handle = accessor->second; - table_.erase(accessor); - erased = UnsetInCache(handle, context); + // Free the entry here outside of mutex for performance reasons. + if (last_reference) { + copy.FreeData(); } - return erased; + return last_reference; } -void ClockCacheShard::EraseUnRefEntries() { - CleanupContext context; +void ClockCacheShard::Erase(const Slice& key, uint32_t /* hash */) { + ClockHandle copy; + bool last_reference = false; { DMutexLock l(mutex_); - table_.clear(); - for (auto& handle : list_) { - UnsetInCache(&handle, &context); + ClockHandle* h = table_.Lookup(key); + if (h != nullptr) { + table_.Exclude(h); + if (!h->HasRefs()) { + // The entry is in Clock since it's in cache and has no external + // references. + ClockRemove(h); + table_.Remove(h); + assert(usage_ >= h->total_charge); + usage_ -= h->total_charge; + last_reference = true; + copy = *h; + } } } - Cleanup(context); -} - -class ClockCache final : public ShardedCache { - public: - ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { - int num_shards = 1 << num_shard_bits; - shards_ = new ClockCacheShard[num_shards]; - for (int i = 0; i < num_shards; i++) { - shards_[i].set_metadata_charge_policy(metadata_charge_policy); - } - SetCapacity(capacity); - SetStrictCapacityLimit(strict_capacity_limit); + // Free the entry here outside of mutex for performance reasons. + // last_reference will only be true if e != nullptr. 
+  if (last_reference) {
+    copy.FreeData();
+  }
+}
+
+size_t ClockCacheShard::GetUsage() const {
+  DMutexLock l(mutex_);
+  return usage_;
+}
+
+size_t ClockCacheShard::GetPinnedUsage() const {
+  DMutexLock l(mutex_);
+  assert(usage_ >= clock_usage_);
+  return usage_ - clock_usage_;
+}
+
+std::string ClockCacheShard::GetPrintableOptions() const {
+  return std::string{};
+}
+
+ClockCache::ClockCache(size_t capacity, size_t estimated_value_size,
+                       int num_shard_bits, bool strict_capacity_limit,
+                       CacheMetadataChargePolicy metadata_charge_policy)
+    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
+  num_shards_ = 1 << num_shard_bits;
+  shards_ = reinterpret_cast<ClockCacheShard*>(
+      port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_));
+  size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
+  for (int i = 0; i < num_shards_; i++) {
+    new (&shards_[i])
+        ClockCacheShard(per_shard, estimated_value_size, strict_capacity_limit,
+                        metadata_charge_policy);
+  }
+}
+
+ClockCache::~ClockCache() {
+  if (shards_ != nullptr) {
+    assert(num_shards_ > 0);
+    for (int i = 0; i < num_shards_; i++) {
+      shards_[i].~ClockCacheShard();
+    }
+    port::cacheline_aligned_free(shards_);
+  }
+}
+
+CacheShard* ClockCache::GetShard(uint32_t shard) {
+  return reinterpret_cast<CacheShard*>(&shards_[shard]);
+}
+
+const CacheShard* ClockCache::GetShard(uint32_t shard) const {
+  return reinterpret_cast<const CacheShard*>(&shards_[shard]);
+}
+
+void* ClockCache::Value(Handle* handle) {
+  return reinterpret_cast<const ClockHandle*>(handle)->value;
+}
+
+size_t ClockCache::GetCharge(Handle* handle) const {
+  CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata;
+  if (num_shards_ > 0) {
+    metadata_charge_policy = shards_[0].metadata_charge_policy_;
+  }
+  return reinterpret_cast<const ClockHandle*>(handle)->GetCharge(
+      metadata_charge_policy);
+}
+
+Cache::DeleterFn ClockCache::GetDeleter(Handle* handle) const {
+  auto h = reinterpret_cast<const ClockHandle*>(handle);
+  return h->deleter;
+}
+
+uint32_t ClockCache::GetHash(Handle* handle) const {
+  return reinterpret_cast<const ClockHandle*>(handle)->hash;
+}
+
+void ClockCache::DisownData() {
+  // Leak data only if that won't generate an ASAN/valgrind warning.
+ if (!kMustFreeHeapAllocations) { + shards_ = nullptr; + num_shards_ = 0; + } +} + +} // namespace clock_cache std::shared_ptr NewClockCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, + size_t capacity, size_t estimated_value_size, int num_shard_bits, + bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) { + if (num_shard_bits >= 20) { + return nullptr; // The cache cannot be sharded into too many fine pieces. + } if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared( - capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy); + return std::make_shared( + capacity, estimated_value_size, num_shard_bits, strict_capacity_limit, + metadata_charge_policy); } } // namespace ROCKSDB_NAMESPACE - -#endif // SUPPORT_CLOCK_CACHE diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 1614c0ed4..9d2d4473e 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -9,8 +9,445 @@ #pragma once +#include +#include +#include + +#include "cache/cache_key.h" +#include "cache/sharded_cache.h" +#include "port/lang.h" +#include "port/malloc.h" +#include "port/port.h" #include "rocksdb/cache.h" +#include "rocksdb/secondary_cache.h" +#include "util/autovector.h" +#include "util/distributed_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +namespace clock_cache { + +// Clock cache implementation. This is based on FastLRUCache's open-addressed +// hash table. Importantly, it stores elements in an array, and resolves +// collision using a probing strategy. Visibility and referenceability of +// elements works as usual. See fast_lru_cache.h for a detailed description. +// +// The main difference with FastLRUCache is, not surprisingly, the eviction +// algorithm +// ---instead of an LRU list, we maintain a circular list with the elements +// available for eviction, which the clock algorithm traverses to pick the next +// victim. The clock list is represented using the array of handles, and we +// simply mark those elements that are present in the list. This is done using +// different clock flags, namely NONE, LOW, MEDIUM, HIGH, that represent +// priorities: NONE means that the element is not part of the clock list, and +// LOW to HIGH represent how close an element is from being evictable (LOW being +// immediately evictable). When the clock pointer steps on an element that is +// not immediately evictable, it decreases its priority. + +constexpr double kLoadFactor = 0.35; // See fast_lru_cache.h. + +constexpr double kStrictLoadFactor = 0.7; // See fast_lru_cache.h. + +// Arbitrary seeds. +constexpr uint32_t kProbingSeed1 = 0xbc9f1d34; +constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5; + +// An experimental (under development!) alternative to LRUCache + +struct ClockHandle { + void* value; + Cache::DeleterFn deleter; + uint32_t hash; + size_t total_charge; // TODO(opt): Only allow uint32_t? + // The number of external refs to this entry. + uint32_t refs; + + static constexpr int kIsVisibleOffset = 0; + static constexpr int kIsElementOffset = 1; + static constexpr int kClockPriorityOffset = 2; + + enum Flags : uint8_t { + // Whether the handle is visible to Lookups. + IS_VISIBLE = (1 << kIsVisibleOffset), + // Whether the slot is in use by an element. + IS_ELEMENT = (1 << kIsElementOffset), + // Clock priorities. Represents how close a handle is from + // being evictable. 
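+    // The two CLOCK_PRIORITY bits hold one of the four ClockPriority values
+    // defined below; a handle steps HIGH -> MEDIUM -> LOW as the clock pointer
+    // passes over it, and is evicted once the pointer finds it at LOW.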
+ CLOCK_PRIORITY = (3 << kClockPriorityOffset), + }; + uint8_t flags; + + enum ClockPriority : uint8_t { + NONE = (0 << kClockPriorityOffset), // Not an element in the eyes of clock. + LOW = (1 << kClockPriorityOffset), // Immediately evictable. + MEDIUM = (2 << kClockPriorityOffset), + HIGH = (3 << kClockPriorityOffset) + // Priority is CLOCK_NONE if and only if + // (i) the handle is not an element, or + // (ii) the handle is an element but it is being referenced. + }; + + // The number of elements that hash to this slot or a lower one, + // but wind up in a higher slot. + uint32_t displacements; + + std::array key_data; + + ClockHandle() { + value = nullptr; + deleter = nullptr; + hash = 0; + total_charge = 0; + refs = 0; + flags = 0; + SetIsVisible(false); + SetIsElement(false); + SetPriority(ClockPriority::NONE); + displacements = 0; + key_data.fill(0); + } + + Slice key() const { return Slice(key_data.data(), kCacheKeySize); } + + // Increase the reference count by 1. + void Ref() { refs++; } + + // Just reduce the reference count by 1. Return true if it was last reference. + bool Unref() { + assert(refs > 0); + refs--; + return refs == 0; + } + + // Return true if there are external refs, false otherwise. + bool HasRefs() const { return refs > 0; } + + bool IsVisible() const { return flags & IS_VISIBLE; } + + void SetIsVisible(bool is_visible) { + if (is_visible) { + flags |= IS_VISIBLE; + } else { + flags &= ~IS_VISIBLE; + } + } + + bool IsElement() const { return flags & IS_ELEMENT; } + + void SetIsElement(bool is_element) { + if (is_element) { + flags |= IS_ELEMENT; + } else { + flags &= ~IS_ELEMENT; + } + } + + ClockPriority GetPriority() const { + return static_cast(flags & Flags::CLOCK_PRIORITY); + } + + bool IsInClockList() const { + return GetPriority() != ClockHandle::ClockPriority::NONE; + } + + void SetPriority(ClockPriority priority) { + flags &= ~Flags::CLOCK_PRIORITY; + flags |= priority; + } + + void DecreasePriority() { + uint8_t p = static_cast(flags & Flags::CLOCK_PRIORITY) >> + kClockPriorityOffset; + assert(p > 0); + p--; + flags &= ~Flags::CLOCK_PRIORITY; + ClockPriority new_priority = + static_cast(p << kClockPriorityOffset); + flags |= new_priority; + } + + void FreeData() { + assert(refs == 0); + if (deleter) { + (*deleter)(key(), value); + } + } + + // Calculate the memory usage by metadata. + inline size_t CalcMetaCharge( + CacheMetadataChargePolicy metadata_charge_policy) const { + if (metadata_charge_policy != kFullChargeCacheMetadata) { + return 0; + } else { + // #ifdef ROCKSDB_MALLOC_USABLE_SIZE + // return malloc_usable_size( + // const_cast(static_cast(this))); + // #else + // TODO(Guido) malloc_usable_size only works when we call it on + // a pointer allocated with malloc. Because our handles are all + // allocated in a single shot as an array, the user can't call + // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle + // pointer returned by the cache. Moreover, malloc_usable_size + // expects a heap-allocated handle, but sometimes in our code we + // wish to pass a stack-allocated handle (this is only a performance + // concern). + // What is the right way to compute metadata charges with pre-allocated + // handles? 
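+      // For now, only the handle struct itself is charged. The key is stored
+      // inline in the handle (key_data), so there is no separate key
+      // allocation to account for.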
+ return sizeof(ClockHandle); + // #endif + } + } + + inline void CalcTotalCharge( + size_t charge, CacheMetadataChargePolicy metadata_charge_policy) { + total_charge = charge + CalcMetaCharge(metadata_charge_policy); + } + + inline size_t GetCharge( + CacheMetadataChargePolicy metadata_charge_policy) const { + size_t meta_charge = CalcMetaCharge(metadata_charge_policy); + assert(total_charge >= meta_charge); + return total_charge - meta_charge; + } + + inline bool IsEmpty() { + return !this->IsElement() && this->displacements == 0; + } + + inline bool IsTombstone() { + return !this->IsElement() && this->displacements > 0; + } + + inline bool Matches(const Slice& some_key) { + return this->IsElement() && this->key() == some_key; + } +}; // struct ClockHandle + +class ClockHandleTable { + public: + explicit ClockHandleTable(int hash_bits); + ~ClockHandleTable(); + + // Returns a pointer to a visible element matching the key/hash, or + // nullptr if not present. + ClockHandle* Lookup(const Slice& key); + + // Inserts a copy of h into the hash table. + // Returns a pointer to the inserted handle, or nullptr if no slot + // available was found. If an existing visible element matching the + // key/hash is already present in the hash table, the argument old + // is set to pointe to it; otherwise, it's set to nullptr. + ClockHandle* Insert(ClockHandle* h, ClockHandle** old); + + // Removes h from the hash table. The handle must already be off + // the clock list. + void Remove(ClockHandle* h); + + // Turns a visible element h into a ghost (i.e., not visible). + void Exclude(ClockHandle* h); + + // Assigns a copy of h to the given slot. + void Assign(int slot, ClockHandle* h); + + template + void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { + for (uint32_t i = index_begin; i < index_end; i++) { + ClockHandle* h = &array_[i]; + if (h->IsVisible()) { + func(h); + } + } + } -#if defined(TBB) && !defined(ROCKSDB_LITE) -#define SUPPORT_CLOCK_CACHE + uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; } + + int GetLengthBits() const { return length_bits_; } + + uint32_t GetOccupancyLimit() const { return occupancy_limit_; } + + uint32_t GetOccupancy() const { return occupancy_; } + + // Returns x mod 2^{length_bits_}. + uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; } + + private: + friend class ClockCacheShard; + + int FindVisibleElement(const Slice& key, int& probe, int displacement); + + int FindAvailableSlot(const Slice& key, int& probe, int displacement); + + int FindVisibleElementOrAvailableSlot(const Slice& key, int& probe, + int displacement); + + // Returns the index of the first slot probed (hashing with + // the given key) with a handle e such that cond(e) is true. + // Otherwise, if no match is found, returns -1. + // For every handle e probed except the final slot, updates + // e->displacements += displacement. + // The argument probe is modified such that consecutive calls + // to FindSlot continue probing right after where the previous + // call left. + int FindSlot(const Slice& key, std::function cond, + int& probe, int displacement); + + // Number of hash bits used for table index. + // The size of the table is 1 << length_bits_. + int length_bits_; + + const uint32_t length_bits_mask_; + + // Number of elements in the table. + uint32_t occupancy_; + + // Maximum number of elements the user can store in the table. 
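+  // Computed in the constructor as kStrictLoadFactor times the table size,
+  // so some slots always remain free to keep probe sequences short.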
+ uint32_t occupancy_limit_; + + std::unique_ptr array_; +}; // class ClockHandleTable + +// A single shard of sharded cache. +class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { + public: + ClockCacheShard(size_t capacity, size_t estimated_value_size, + bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy); + ~ClockCacheShard() override = default; + + // Separate from constructor so caller can easily make an array of ClockCache + // if current usage is more than new capacity, the function will attempt to + // free the needed space. + void SetCapacity(size_t capacity) override; + + // Set the flag to reject insertion if cache if full. + void SetStrictCapacityLimit(bool strict_capacity_limit) override; + + // Like Cache methods, but with an extra "hash" parameter. + // Insert an item into the hash table and, if handle is null, insert into + // the clock list. Older items are evicted as necessary. If the cache is full + // and free_handle_on_fail is true, the item is deleted and handle is set to + // nullptr. + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + Cache::DeleterFn deleter, Cache::Handle** handle, + Cache::Priority priority) override; + + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } + + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) override { + return Lookup(key, hash); + } + Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + + bool Release(Cache::Handle* handle, bool /*useful*/, + bool erase_if_last_ref) override { + return Release(handle, erase_if_last_ref); + } + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + void Wait(Cache::Handle* /*handle*/) override {} + + bool Ref(Cache::Handle* handle) override; + bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; + void Erase(const Slice& key, uint32_t hash) override; + + size_t GetUsage() const override; + size_t GetPinnedUsage() const override; + + void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; + + void EraseUnRefEntries() override; + + std::string GetPrintableOptions() const override; + + private: + friend class ClockCache; + void ClockRemove(ClockHandle* e); + void ClockInsert(ClockHandle* e); + + // Free some space following strict clock policy until enough space + // to hold (usage_ + charge) is freed or the clock list is empty + // This function is not thread safe - it needs to be executed while + // holding the mutex_. + void EvictFromClock(size_t charge, autovector* deleted); + + // Returns the charge of a single handle. + static size_t CalcEstimatedHandleCharge( + size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy); + + // Returns the number of bits used to hash an element in the hash + // table. + static int CalcHashBits(size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy); + + // Initialized before use. + size_t capacity_; + + // Whether to reject insertion if cache reaches its full capacity. 
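EvictFromClock is only declared here, so as a reading aid the sketch below shows how the classic CLOCK (second chance) sweep maps onto the ClockPriority levels defined on ClockHandle. It is an illustration of the general algorithm under simplified assumptions (a bare handle array, no mutex, actual removal elided), not the implementation from this patch.

#include <cstddef>
#include <cstdint>

void ClockSweepSketch(ClockHandle* table, size_t table_size,
                      uint32_t& clock_pointer, size_t charge_to_free) {
  size_t freed = 0;
  size_t scanned = 0;
  // Stop after two full passes in case nothing is evictable.
  while (freed < charge_to_free && scanned < 2 * table_size) {
    ClockHandle& h = table[clock_pointer];
    clock_pointer = static_cast<uint32_t>((clock_pointer + 1) % table_size);
    scanned++;
    if (!h.IsInClockList()) {
      continue;  // Priority NONE: not an element, or externally referenced.
    }
    if (h.GetPriority() == ClockHandle::ClockPriority::LOW) {
      freed += h.total_charge;  // Victim found; removal and FreeData() elided.
    } else {
      h.DecreasePriority();  // Second chance: age the entry and keep sweeping.
    }
  }
}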
+ bool strict_capacity_limit_; + + uint32_t clock_pointer_; + + // ------------^^^^^^^^^^^^^----------- + // Not frequently modified data members + // ------------------------------------ + // + // We separate data members that are updated frequently from the ones that + // are not frequently updated so that they don't share the same cache line + // which will lead into false cache sharing + // + // ------------------------------------ + // Frequently modified data members + // ------------vvvvvvvvvvvvv----------- + ClockHandleTable table_; + + // Memory size for entries residing in the cache. + size_t usage_; + + // Memory size for unpinned entries in the clock list. + size_t clock_usage_; + + // mutex_ protects the following state. + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable DMutex mutex_; +}; // class ClockCacheShard + +class ClockCache +#ifdef NDEBUG + final #endif + : public ShardedCache { + public: + ClockCache(size_t capacity, size_t estimated_value_size, int num_shard_bits, + bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy = + kDontChargeCacheMetadata); + ~ClockCache() override; + const char* Name() const override { return "ClockCache"; } + CacheShard* GetShard(uint32_t shard) override; + const CacheShard* GetShard(uint32_t shard) const override; + void* Value(Handle* handle) override; + size_t GetCharge(Handle* handle) const override; + uint32_t GetHash(Handle* handle) const override; + DeleterFn GetDeleter(Handle* handle) const override; + void DisownData() override; + + private: + ClockCacheShard* shards_ = nullptr; + int num_shards_ = 0; +}; // class ClockCache + +} // namespace clock_cache + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/fast_lru_cache.cc b/cache/fast_lru_cache.cc index 62821a875..f6ffefba2 100644 --- a/cache/fast_lru_cache.cc +++ b/cache/fast_lru_cache.cc @@ -9,8 +9,6 @@ #include "cache/fast_lru_cache.h" -#include - #include #include #include @@ -21,39 +19,25 @@ #include "port/lang.h" #include "util/distributed_mutex.h" #include "util/hash.h" +#include "util/math.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { namespace fast_lru_cache { -namespace { -// Returns x % 2^{bits}. -inline uint32_t BinaryMod(uint32_t x, uint8_t bits) { - assert(bits <= 32); - return (x << (32 - bits)) >> (32 - bits); -} -} // anonymous namespace - -LRUHandleTable::LRUHandleTable(uint8_t hash_bits) +LRUHandleTable::LRUHandleTable(int hash_bits) : length_bits_(hash_bits), + length_bits_mask_((uint32_t{1} << length_bits_) - 1), occupancy_(0), + occupancy_limit_(static_cast((uint32_t{1} << length_bits_) * + kStrictLoadFactor)), array_(new LRUHandle[size_t{1} << length_bits_]) { assert(hash_bits <= 32); } LRUHandleTable::~LRUHandleTable() { - // TODO(Guido) If users still hold references to handles, - // those will become invalidated. And if we choose not to - // delete the data, it will become leaked. - ApplyToEntriesRange( - [](LRUHandle* h) { - // TODO(Guido) Remove the HasRefs() check? 
- if (!h->HasRefs()) { - h->FreeData(); - } - }, - 0, uint32_t{1} << length_bits_); + ApplyToEntriesRange([](LRUHandle* h) { h->FreeData(); }, 0, GetTableSize()); } LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { @@ -161,11 +145,10 @@ int LRUHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key, inline int LRUHandleTable::FindSlot(const Slice& key, std::function cond, int& probe, int displacement) { - uint32_t base = - BinaryMod(Hash(key.data(), key.size(), kProbingSeed1), length_bits_); - uint32_t increment = BinaryMod( - (Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1, length_bits_); - uint32_t current = BinaryMod(base + probe * increment, length_bits_); + uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); + uint32_t increment = + ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); + uint32_t current = ModTableSize(base + probe * increment); while (true) { LRUHandle* h = &array_[current]; probe++; @@ -182,7 +165,7 @@ inline int LRUHandleTable::FindSlot(const Slice& key, return -1; } h->displacements += displacement; - current = BinaryMod(current + increment, length_bits_); + current = ModTableSize(current + increment); } } @@ -233,7 +216,7 @@ void LRUCacheShard::ApplyToSomeEntries( // hash bits for table indexes. DMutexLock l(mutex_); uint32_t length_bits = table_.GetLengthBits(); - uint32_t length = uint32_t{1} << length_bits; + uint32_t length = table_.GetTableSize(); assert(average_entries_per_lock > 0); // Assuming we are called with same average_entries_per_lock repeatedly, @@ -302,22 +285,19 @@ size_t LRUCacheShard::CalcEstimatedHandleCharge( return h.total_charge; } -uint8_t LRUCacheShard::CalcHashBits( +int LRUCacheShard::CalcHashBits( size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { size_t handle_charge = CalcEstimatedHandleCharge(estimated_value_size, metadata_charge_policy); - size_t num_entries = - static_cast(capacity / (kLoadFactor * handle_charge)); - - // Compute the ceiling of log2(num_entries). If num_entries == 0, return 0. - uint8_t num_hash_bits = 0; - size_t num_entries_copy = num_entries; - while (num_entries_copy >>= 1) { - ++num_hash_bits; + uint32_t num_entries = + static_cast(capacity / (kLoadFactor * handle_charge)); + + if (num_entries == 0) { + return 0; } - num_hash_bits += size_t{1} << num_hash_bits < num_entries ? 1 : 0; - return num_hash_bits; + int hash_bits = FloorLog2(num_entries); + return hash_bits + (size_t{1} << hash_bits < num_entries ? 1 : 0); } void LRUCacheShard::SetCapacity(size_t capacity) { @@ -362,33 +342,51 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, autovector last_reference_list; { DMutexLock l(mutex_); + assert(table_.GetOccupancy() <= table_.GetOccupancyLimit()); // Free the space following strict LRU policy until enough space // is freed or the lru list is empty. EvictFromLRU(tmp.total_charge, &last_reference_list); if ((usage_ + tmp.total_charge > capacity_ && (strict_capacity_limit_ || handle == nullptr)) || - table_.GetOccupancy() == size_t{1} << table_.GetLengthBits()) { - // Originally, when strict_capacity_limit_ == false and handle != nullptr - // (i.e., the user wants to immediately get a reference to the new - // handle), the insertion would proceed even if the total charge already - // exceeds capacity. We can't do this now, because we can't physically - // insert a new handle when the table is at maximum occupancy. 
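The rewritten CalcHashBits above computes a ceiling log2: FloorLog2(num_entries), plus one when a table of that size would still be too small. A stand-alone version of the same logic with a worked example follows; FloorLog2Local is a local stand-in for the helper in util/math.h and is not part of the patch.

#include <cstdint>

// Local stand-in for FloorLog2 from util/math.h.
int FloorLog2Local(uint32_t n) {
  int b = 0;
  while (n >>= 1) {
    ++b;
  }
  return b;
}

int HashBitsFor(uint32_t num_entries) {
  if (num_entries == 0) {
    return 0;
  }
  int bits = FloorLog2Local(num_entries);
  // Bump by one if a table of 2^bits slots cannot hold num_entries.
  return bits + ((uint32_t{1} << bits) < num_entries ? 1 : 0);
}

// Example: 100 entries -> FloorLog2 = 6 and 64 < 100, so 7 bits (128 slots);
// exactly 64 entries -> 6 bits (64 slots).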
+ table_.GetOccupancy() == table_.GetOccupancyLimit()) { + // There are two measures of capacity: + // - Space (or charge) capacity: The maximum possible sum of the charges + // of the elements. + // - Table capacity: The number of slots in the hash table. + // These are incomparable, in the sense that one doesn't imply the other. + // Typically we will reach space capacity before table capacity--- + // if the user always inserts values with size equal to + // estimated_value_size, then at most a kLoadFactor fraction of slots + // will ever be occupied. But in some cases we may reach table capacity + // before space capacity---if the user initially claims a very large + // estimated_value_size but then inserts tiny values, more elements than + // initially estimated will be inserted. + // TODO(Guido) Some tests (at least two from cache_test, as well as the - // stress tests) currently assume the old behavior. + // stress tests) currently assume the table capacity is unbounded. if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry inserted // into cache and get evicted immediately. last_reference_list.push_back(tmp); } else { - s = Status::Incomplete("Insert failed due to LRU cache being full."); + if (table_.GetOccupancy() == table_.GetOccupancyLimit()) { + s = Status::Incomplete( + "Insert failed because all slots in the hash table are full."); + // TODO(Guido) Use the correct statuses. + } else { + s = Status::Incomplete( + "Insert failed because the total charge has exceeded the " + "capacity."); + } } } else { // Insert into the cache. Note that the cache might get larger than its // capacity if not enough space was freed up. LRUHandle* old; LRUHandle* h = table_.Insert(&tmp, &old); - assert(h != nullptr); // Insertions should never fail. + assert(h != nullptr); // We're below occupancy, so this insertion should + // never fail. usage_ += h->total_charge; if (old != nullptr) { s = Status::OkOverwritten(); @@ -431,7 +429,8 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { if (h != nullptr) { assert(h->IsVisible()); if (!h->HasRefs()) { - // The entry is in LRU since it's in hash and has no external references + // The entry is in LRU since it's in hash and has no external + // references. LRU_Remove(h); } h->Ref(); @@ -497,7 +496,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { table_.Exclude(h); if (!h->HasRefs()) { // The entry is in LRU since it's in cache and has no external - // references + // references. LRU_Remove(h); table_.Remove(h); assert(usage_ >= h->total_charge); diff --git a/cache/fast_lru_cache.h b/cache/fast_lru_cache.h index 7b0917a55..a02422beb 100644 --- a/cache/fast_lru_cache.h +++ b/cache/fast_lru_cache.h @@ -78,8 +78,9 @@ class FastLRUCacheTest; // times at most a fraction p of all slots, without counting tombstones, // are occupied by elements. This means that the probability that a // random probe hits an empty slot is at most p, and thus at most 1/p probes -// are required on average. We use p = 70%, so between 1 and 2 probes are -// needed on average. +// are required on average. For example, p = 70% implies that between 1 and 2 +// probes are needed on average (bear in mind that this reasoning doesn't +// consider the effects of clustering over time). // Because the size of the hash table is always rounded up to the next // power of 2, p is really an upper bound on the actual load factor---the // actual load factor is anywhere between p/2 and p. 
This is a bit wasteful, @@ -87,7 +88,12 @@ class FastLRUCacheTest; // Since space cost is dominated by the values (the LSM blocks), // overprovisioning the table with metadata only increases the total cache space // usage by a tiny fraction. -constexpr double kLoadFactor = 0.7; +constexpr double kLoadFactor = 0.35; + +// The user can exceed kLoadFactor if the sizes of the inserted values don't +// match estimated_value_size, or if strict_capacity_limit == false. To +// avoid performance to plunge, we set a strict upper bound on the load factor. +constexpr double kStrictLoadFactor = 0.7; // Arbitrary seeds. constexpr uint32_t kProbingSeed1 = 0xbc9f1d34; @@ -103,7 +109,7 @@ struct LRUHandle { size_t total_charge; // TODO(opt): Only allow uint32_t? // The hash of key(). Used for fast sharding and comparisons. uint32_t hash; - // The number of external refs to this entry. The cache itself is not counted. + // The number of external refs to this entry. uint32_t refs; enum Flags : uint8_t { @@ -226,16 +232,10 @@ struct LRUHandle { } }; -// TODO(Guido) Update the following comment. -// We provide our own simple hash table since it removes a whole bunch -// of porting hacks and is also faster than some of the built-in hash -// table implementations in some of the compiler/runtime combinations -// we have tested. E.g., readrandom speeds up by ~5% over the g++ -// 4.4.3's builtin hashtable. class LRUHandleTable { public: - explicit LRUHandleTable(uint8_t hash_bits); + explicit LRUHandleTable(int hash_bits); ~LRUHandleTable(); // Returns a pointer to a visible element matching the key/hash, or @@ -269,10 +269,17 @@ class LRUHandleTable { } } - uint8_t GetLengthBits() const { return length_bits_; } + uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; } + + int GetLengthBits() const { return length_bits_; } + + uint32_t GetOccupancyLimit() const { return occupancy_limit_; } uint32_t GetOccupancy() const { return occupancy_; } + // Returns x mod 2^{length_bits_}. + uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; } + private: int FindVisibleElement(const Slice& key, uint32_t hash, int& probe, int displacement); @@ -295,11 +302,16 @@ class LRUHandleTable { // Number of hash bits used for table index. // The size of the table is 1 << length_bits_. - uint8_t length_bits_; + int length_bits_; + + const uint32_t length_bits_mask_; // Number of elements in the table. uint32_t occupancy_; + // Maximum number of elements the user can store in the table. + uint32_t occupancy_limit_; + std::unique_ptr array_; }; @@ -374,7 +386,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { void LRU_Insert(LRUHandle* e); // Free some space following strict LRU policy until enough space - // to hold (usage_ + charge) is freed or the lru list is empty + // to hold (usage_ + charge) is freed or the LRU list is empty // This function is not thread safe - it needs to be executed while // holding the mutex_. void EvictFromLRU(size_t charge, autovector* deleted); @@ -386,8 +398,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // Returns the number of bits used to hash an element in the hash // table. - static uint8_t CalcHashBits(size_t capacity, size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy); + static int CalcHashBits(size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy); // Initialized before use. 
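To make the kLoadFactor / kStrictLoadFactor split concrete: the table is sized so that occupancy stays near 35% when estimated_value_size is accurate, while occupancy_limit_ hard-caps occupancy at 70% of the slots. Under the usual idealized open-addressing estimate of roughly 1/(1-p) probes per unsuccessful lookup, that is about 1.5 probes at p = 0.35 and about 3.3 at the 0.7 cap, before accounting for clustering. A small sketch with illustrative numbers:

#include <cstdint>
#include <cstdio>

int main() {
  const double kTargetLoadFactor = 0.35;   // kLoadFactor: sizing target.
  const double kHardCapLoadFactor = 0.7;   // kStrictLoadFactor: occupancy cap.
  const uint32_t table_size = uint32_t{1} << 7;  // 7 hash bits -> 128 slots.
  const uint32_t occupancy_limit =
      static_cast<uint32_t>(table_size * kHardCapLoadFactor);  // 89 slots.
  std::printf("occupancy_limit=%u, probes@0.35=%.1f, probes@0.70=%.1f\n",
              occupancy_limit, 1.0 / (1.0 - kTargetLoadFactor),
              1.0 / (1.0 - kHardCapLoadFactor));
  return 0;
}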
size_t capacity_; diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 5767d9186..0833e63d6 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -9,6 +9,7 @@ #include #include "cache/cache_key.h" +#include "cache/clock_cache.h" #include "cache/fast_lru_cache.h" #include "db/db_test_util.h" #include "file/sst_file_manager_impl.h" @@ -207,8 +208,8 @@ TEST_F(LRUCacheTest, EntriesWithPriority) { } namespace fast_lru_cache { -// TODO(guido) Consolidate the following FastLRUCache tests with -// that of LRUCache. + +// TODO(guido) Replicate LRU policy tests from LRUCache here. class FastLRUCacheTest : public testing::Test { public: FastLRUCacheTest() {} @@ -246,9 +247,8 @@ class FastLRUCacheTest : public testing::Test { estimated_value_size, metadata_charge_policy); } - uint8_t CalcHashBitsWrapper( - size_t capacity, size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { + int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy) { return fast_lru_cache::LRUCacheShard::CalcHashBits( capacity, estimated_value_size, metadata_charge_policy); } @@ -262,7 +262,7 @@ class FastLRUCacheTest : public testing::Test { return capacity / (fast_lru_cache::kLoadFactor * handle_charge); } - bool TableSizeIsAppropriate(uint8_t hash_bits, double max_occupancy) { + bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) { if (hash_bits == 0) { return max_occupancy <= 1; } else { @@ -292,8 +292,8 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) { CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata; double max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, metadata_charge_policy); - uint8_t hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); + int hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, + metadata_charge_policy); EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); capacity = 1024; @@ -342,6 +342,127 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) { } // namespace fast_lru_cache +namespace clock_cache { + +class ClockCacheTest : public testing::Test { + public: + ClockCacheTest() {} + ~ClockCacheTest() override { DeleteShard(); } + + void DeleteShard() { + if (shard_ != nullptr) { + shard_->~ClockCacheShard(); + port::cacheline_aligned_free(shard_); + shard_ = nullptr; + } + } + + void NewShard(size_t capacity) { + DeleteShard(); + shard_ = reinterpret_cast( + port::cacheline_aligned_alloc(sizeof(ClockCacheShard))); + new (shard_) ClockCacheShard(capacity, 1, true /*strict_capacity_limit*/, + kDontChargeCacheMetadata); + } + + Status Insert(const std::string& key, + Cache::Priority priority = Cache::Priority::LOW) { + return shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, + nullptr /*deleter*/, nullptr /*handle*/, priority); + } + + Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { + return Insert(std::string(kCacheKeySize, key), priority); + } + + Status Insert(char key, size_t len) { return Insert(std::string(len, key)); } + + bool Lookup(const std::string& key) { + auto handle = shard_->Lookup(key, 0 /*hash*/); + if (handle) { + shard_->Release(handle); + return true; + } + return false; + } + + bool Lookup(char key) { return Lookup(std::string(kCacheKeySize, key)); } + + void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); } + + // void ValidateLRUList(std::vector keys, + // size_t num_high_pri_pool_keys = 0) { + // 
LRUHandle* lru; + // LRUHandle* lru_low_pri; + // cache_->TEST_GetLRUList(&lru, &lru_low_pri); + // LRUHandle* iter = lru; + // bool in_high_pri_pool = false; + // size_t high_pri_pool_keys = 0; + // if (iter == lru_low_pri) { + // in_high_pri_pool = true; + // } + // for (const auto& key : keys) { + // iter = iter->next; + // ASSERT_NE(lru, iter); + // ASSERT_EQ(key, iter->key().ToString()); + // ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool()); + // if (in_high_pri_pool) { + // high_pri_pool_keys++; + // } + // if (iter == lru_low_pri) { + // ASSERT_FALSE(in_high_pri_pool); + // in_high_pri_pool = true; + // } + // } + // ASSERT_EQ(lru, iter->next); + // ASSERT_TRUE(in_high_pri_pool); + // ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys); + // } + + private: + clock_cache::ClockCacheShard* shard_ = nullptr; +}; + +TEST_F(ClockCacheTest, Validate) { + NewShard(3); + EXPECT_OK(Insert('a', 16)); + EXPECT_NOK(Insert('b', 15)); + EXPECT_OK(Insert('b', 16)); + EXPECT_NOK(Insert('c', 17)); + EXPECT_NOK(Insert('d', 1000)); + EXPECT_NOK(Insert('e', 11)); + EXPECT_NOK(Insert('f', 0)); +} + +TEST_F(ClockCacheTest, ClockPriorityTest) { + clock_cache::ClockHandle handle; + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::NONE); + handle.SetPriority(clock_cache::ClockHandle::ClockPriority::HIGH); + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::HIGH); + handle.DecreasePriority(); + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::MEDIUM); + handle.DecreasePriority(); + EXPECT_EQ(handle.GetPriority(), clock_cache::ClockHandle::ClockPriority::LOW); + handle.SetPriority(clock_cache::ClockHandle::ClockPriority::MEDIUM); + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::MEDIUM); + handle.SetPriority(clock_cache::ClockHandle::ClockPriority::NONE); + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::NONE); + handle.SetPriority(clock_cache::ClockHandle::ClockPriority::MEDIUM); + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::MEDIUM); + handle.DecreasePriority(); + handle.DecreasePriority(); + EXPECT_EQ(handle.GetPriority(), + clock_cache::ClockHandle::ClockPriority::NONE); +} + +} // namespace clock_cache + class TestSecondaryCache : public SecondaryCache { public: // Specifies what action to take on a lookup for a particular key diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 353db60dd..0594c7950 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -932,7 +932,9 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) { int iterations_tested = 0; for (std::shared_ptr base_cache : {NewLRUCache(capacity, num_shard_bits), - NewClockCache(capacity, num_shard_bits), + NewClockCache(capacity, 1 /*estimated_value_size*/, num_shard_bits, + false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy), NewFastLRUCache(capacity, 1 /*estimated_value_size*/, num_shard_bits, false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy)}) { @@ -1288,11 +1290,10 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { const size_t capacity = size_t{1} << 25; int iterations_tested = 0; for (bool partition : {false, true}) { - for (std::shared_ptr cache : - {NewLRUCache(capacity), NewClockCache(capacity)}) { - // This test doesn't support FastLRUCache because the + for (std::shared_ptr cache : {NewLRUCache(capacity)}) { + // This test doesn't support FastLRUCache nor ClockCache because the // keys used are not 16B 
long. - // TODO(guido) Add support for FastLRUCache. + // TODO(guido) Add support for FastLRUCache and ClockCache. if (!cache) { // Skip clock cache when not supported continue; diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 42d41cab7..e31d4dc35 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -114,7 +114,9 @@ std::shared_ptr StressTest::NewCache(size_t capacity, } if (FLAGS_cache_type == "clock_cache") { - auto cache = NewClockCache((size_t)capacity); + auto cache = NewClockCache(static_cast(capacity), FLAGS_block_size, + num_shard_bits, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy); if (!cache) { fprintf(stderr, "Clock cache not supported."); exit(1); diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 48093811e..03f8ac51f 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -174,19 +174,15 @@ extern std::shared_ptr NewCompressedSecondaryCache( extern std::shared_ptr NewCompressedSecondaryCache( const CompressedSecondaryCacheOptions& opts); -// Similar to NewLRUCache, but create a cache based on CLOCK algorithm with +// Similar to NewLRUCache, but create a cache based on clock algorithm with // better concurrent performance in some cases. See util/clock_cache.cc for // more detail. // // Return nullptr if it is not supported. -// -// BROKEN: ClockCache is known to have bugs that could lead to crash or -// corruption, so should not be used until fixed. Use NewLRUCache instead. extern std::shared_ptr NewClockCache( - size_t capacity, int num_shard_bits = -1, - bool strict_capacity_limit = false, - CacheMetadataChargePolicy metadata_charge_policy = - kDefaultCacheMetadataChargePolicy); + size_t capacity, size_t estimated_value_size, int num_shard_bits, + bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy); class Cache { public: diff --git a/java/rocksjni/clock_cache.cc b/java/rocksjni/clock_cache.cc index e04991aa9..911eed6b3 100644 --- a/java/rocksjni/clock_cache.cc +++ b/java/rocksjni/clock_cache.cc @@ -23,8 +23,10 @@ jlong Java_org_rocksdb_ClockCache_newClockCache( jboolean jstrict_capacity_limit) { auto* sptr_clock_cache = new std::shared_ptr( ROCKSDB_NAMESPACE::NewClockCache( - static_cast(jcapacity), static_cast(jnum_shard_bits), - static_cast(jstrict_capacity_limit))); + static_cast(jcapacity), 1 /* estimated_value_size */, + static_cast(jnum_shard_bits), + static_cast(jstrict_capacity_limit), + rocksdb::CacheMetadataChargePolicy::kFullChargeCacheMetadata)); return GET_CPLUSPLUS_POINTER(sptr_clock_cache); } diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 1a41138da..a8511fb0c 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2971,7 +2971,9 @@ class Benchmark { } if (FLAGS_cache_type == "clock_cache") { auto cache = NewClockCache(static_cast(capacity), - FLAGS_cache_numshardbits); + FLAGS_block_size, FLAGS_cache_numshardbits, + false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy); if (!cache) { fprintf(stderr, "Clock cache not supported."); exit(1); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index ad8b19641..766ed2bbf 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -115,8 +115,8 @@ default_params = { "use_direct_reads": lambda: random.randint(0, 1), "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1), "mock_direct_io": False, - "cache_type": "lru_cache", # clock_cache is broken - # fast_lru_cache 
is currently incompatible with stress tests, because they use strict_capacity_limit = false + "cache_type": "lru_cache", # fast_lru_cache and clock_cache are currently incompatible + # with stress tests, because they use strict_capacity_limit = false "use_full_merge_v1": lambda: random.randint(0, 1), "use_merge": lambda: random.randint(0, 1), # 999 -> use Bloom API
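Since every call site in this patch now spells out all five NewClockCache arguments, a minimal usage sketch of the new signature from include/rocksdb/cache.h follows. The capacity and estimated value size below are arbitrary placeholders, and the call can still return nullptr on builds where the clock cache is unsupported.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeOptionsWithClockCache() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewClockCache(
      64 << 20 /*capacity*/, 4 << 10 /*estimated_value_size*/,
      -1 /*num_shard_bits*/, false /*strict_capacity_limit*/,
      rocksdb::kDefaultCacheMetadataChargePolicy);
  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = cache;  // May be nullptr if unsupported.
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}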