diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 663bff953..dd36a5f06 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -441,6 +441,8 @@ class CacheBench { uint64_t total_key_size = 0; uint64_t total_charge = 0; uint64_t total_entry_count = 0; + uint64_t table_occupancy = 0; + uint64_t table_size = 0; std::set deleters; StopWatchNano timer(clock); @@ -456,6 +458,9 @@ class CacheBench { std::ostringstream ostr; ostr << "Most recent cache entry stats:\n" << "Number of entries: " << total_entry_count << "\n" + << "Table occupancy: " << table_occupancy << " / " + << table_size << " = " + << (100.0 * table_occupancy / table_size) << "%\n" << "Total charge: " << BytesToHumanString(total_charge) << "\n" << "Average key size: " << (1.0 * total_key_size / total_entry_count) << "\n" @@ -492,6 +497,8 @@ class CacheBench { Cache::ApplyToAllEntriesOptions opts; opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount(); + table_size = shared->GetCacheBench()->cache_->GetTableAddressCount(); stats_hist->Add(timer.ElapsedNanos() / 1000); } } diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 81a9d412c..14b6e44d9 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -106,6 +106,8 @@ class CacheTest : public testing::TestWithParam { std::shared_ptr cache_; std::shared_ptr cache2_; + size_t estimated_value_size_ = 1; + CacheTest() : cache_(NewCache(kCacheSize, kNumShardBits, false)), cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) { @@ -122,12 +124,12 @@ class CacheTest : public testing::TestWithParam { } if (type == kClock) { return ExperimentalNewClockCache( - capacity, 1 /*estimated_value_size*/, -1 /*num_shard_bits*/, + capacity, estimated_value_size_, -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); } if (type == kFast) { return NewFastLRUCache( - capacity, 1 /*estimated_value_size*/, -1 /*num_shard_bits*/, + capacity, estimated_value_size_, -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); } return nullptr; @@ -239,7 +241,10 @@ TEST_P(CacheTest, UsageTest) { auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(0, precise_cache->GetUsage()); + size_t baseline_meta_usage = precise_cache->GetUsage(); + if (type != kClock) { + ASSERT_EQ(0, baseline_meta_usage); + } size_t usage = 0; char value[10] = "abcdef"; @@ -258,13 +263,17 @@ TEST_P(CacheTest, UsageTest) { kv_size, DumbDeleter)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); - ASSERT_LT(usage, precise_cache->GetUsage()); + if (type == kClock) { + ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage()); + } else { + ASSERT_LT(usage, precise_cache->GetUsage()); + } } cache->EraseUnRefEntries(); precise_cache->EraseUnRefEntries(); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(0, precise_cache->GetUsage()); + ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); // make sure the cache will be overloaded for (size_t i = 1; i < kCapacity; ++i) { @@ -284,7 +293,15 @@ TEST_P(CacheTest, UsageTest) { ASSERT_GT(kCapacity, cache->GetUsage()); ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); - ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); + if (type 
!= kClock) { + ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); + } else { + // estimated value size of 1 is weird for clock cache, because + // almost all of the capacity will be used for metadata, and due to only + // using power of 2 table sizes, we might hit strict occupancy limit + // before hitting capacity limit. + ASSERT_LT(kCapacity * 0.80, precise_cache->GetUsage()); + } } // TODO: This test takes longer than expected on ClockCache. This is @@ -301,6 +318,10 @@ TEST_P(CacheTest, PinnedUsageTest) { const size_t kCapacity = 200000; auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); + size_t baseline_meta_usage = precise_cache->GetUsage(); + if (type != kClock) { + ASSERT_EQ(0, baseline_meta_usage); + } size_t pinned_usage = 0; char value[10] = "abcdef"; @@ -390,7 +411,7 @@ TEST_P(CacheTest, PinnedUsageTest) { cache->EraseUnRefEntries(); precise_cache->EraseUnRefEntries(); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(0, precise_cache->GetUsage()); + ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); } TEST_P(CacheTest, HitAndMiss) { @@ -407,16 +428,30 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(300)); Insert(100, 102); - ASSERT_EQ(102, Lookup(100)); + if (GetParam() == kClock) { + // ClockCache usually doesn't overwrite on Insert + ASSERT_EQ(101, Lookup(100)); + } else { + ASSERT_EQ(102, Lookup(100)); + } ASSERT_EQ(201, Lookup(200)); ASSERT_EQ(-1, Lookup(300)); ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); + if (GetParam() == kClock) { + ASSERT_EQ(102, deleted_values_[0]); + } else { + ASSERT_EQ(101, deleted_values_[0]); + } } TEST_P(CacheTest, InsertSameKey) { + if (GetParam() == kClock) { + ROCKSDB_GTEST_BYPASS( + "ClockCache doesn't guarantee Insert overwrite same key."); + return; + } Insert(1, 1); Insert(1, 2); ASSERT_EQ(2, Lookup(1)); @@ -442,6 +477,11 @@ TEST_P(CacheTest, Erase) { } TEST_P(CacheTest, EntriesArePinned) { + if (GetParam() == kClock) { + ROCKSDB_GTEST_BYPASS( + "ClockCache doesn't guarantee Insert overwrite same key."); + return; + } Insert(100, 101); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); @@ -474,7 +514,6 @@ TEST_P(CacheTest, EntriesArePinned) { TEST_P(CacheTest, EvictionPolicy) { Insert(100, 101); Insert(200, 201); - // Frequently used entry must be kept around for (int i = 0; i < 2 * kCacheSize; i++) { Insert(1000+i, 2000+i); @@ -503,6 +542,12 @@ TEST_P(CacheTest, ExternalRefPinsEntries) { for (int j = 0; j < 2 * kCacheSize + 100; j++) { Insert(1000 + j, 2000 + j); } + // Clock cache is even more stateful and needs more churn to evict + if (GetParam() == kClock) { + for (int j = 0; j < kCacheSize; j++) { + Insert(11000 + j, 11000 + j); + } + } if (i < 2) { ASSERT_EQ(101, Lookup(100)); } @@ -810,11 +855,6 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) { } TEST_P(CacheTest, OverCapacity) { - auto type = GetParam(); - if (type == kClock) { - ROCKSDB_GTEST_BYPASS("Requires LRU eviction policy."); - return; - } size_t n = 10; // a LRUCache with n entries and one shard only @@ -842,23 +882,34 @@ TEST_P(CacheTest, OverCapacity) { for (int i = 0; i < static_cast(n + 1); i++) { cache->Release(handles[i]); } - // Make sure eviction is triggered. 
- cache->SetCapacity(n); - // cache is under capacity now since elements were released - ASSERT_EQ(n, cache->GetUsage()); + if (GetParam() == kClock) { + // Make sure eviction is triggered. + ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0])); - // element 0 is evicted and the rest is there - // This is consistent with the LRU policy since the element 0 - // was released first - for (int i = 0; i < static_cast(n + 1); i++) { - std::string key = EncodeKey(i + 1); - auto h = cache->Lookup(key); - if (h) { - ASSERT_NE(static_cast(i), 0U); - cache->Release(h); - } else { - ASSERT_EQ(static_cast(i), 0U); + // cache is under capacity now since elements were released + ASSERT_GE(n, cache->GetUsage()); + + // clean up + cache->Release(handles[0]); + } else { + // LRUCache checks for over-capacity in Release. + + // cache is exactly at capacity now with minimal eviction + ASSERT_EQ(n, cache->GetUsage()); + + // element 0 is evicted and the rest is there + // This is consistent with the LRU policy since the element 0 + // was released first + for (int i = 0; i < static_cast(n + 1); i++) { + std::string key = EncodeKey(i + 1); + auto h = cache->Lookup(key); + if (h) { + ASSERT_NE(static_cast(i), 0U); + cache->Release(h); + } else { + ASSERT_EQ(static_cast(i), 0U); + } } } } @@ -966,19 +1017,30 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { } TEST_P(CacheTest, DefaultShardBits) { - // test1: set the flag to false. Insert more keys than capacity. See if they - // all go through. - std::shared_ptr cache = NewCache(16 * 1024L * 1024L); + // Prevent excessive allocation (to save time & space) + estimated_value_size_ = 100000; + // Implementations use different minimum shard sizes + size_t min_shard_size = (GetParam() == kClock ? 32U * 1024U : 512U) * 1024U; + + std::shared_ptr cache = NewCache(32U * min_shard_size); ShardedCache* sc = dynamic_cast(cache.get()); ASSERT_EQ(5, sc->GetNumShardBits()); - cache = NewLRUCache(511 * 1024L, -1, true); + cache = NewCache(min_shard_size / 1000U * 999U); sc = dynamic_cast(cache.get()); ASSERT_EQ(0, sc->GetNumShardBits()); - cache = NewLRUCache(1024L * 1024L * 1024L, -1, true); + cache = NewCache(3U * 1024U * 1024U * 1024U); sc = dynamic_cast(cache.get()); + // current maximum of 6 ASSERT_EQ(6, sc->GetNumShardBits()); + + if constexpr (sizeof(size_t) > 4) { + cache = NewCache(128U * min_shard_size); + sc = dynamic_cast(cache.get()); + // current maximum of 6 + ASSERT_EQ(6, sc->GetNumShardBits()); + } } TEST_P(CacheTest, GetChargeAndDeleter) { diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index edc63ae4e..3bff5feee 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -10,8 +10,6 @@ #include "cache/clock_cache.h" #include -#include -#include #include #include "monitoring/perf_context_imp.h" @@ -25,359 +23,937 @@ namespace ROCKSDB_NAMESPACE { namespace clock_cache { -ClockHandleTable::ClockHandleTable(size_t capacity, int hash_bits) +static_assert(sizeof(ClockHandle) == 64U, + "Expecting size / alignment with common cache line size"); + +ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata) : length_bits_(hash_bits), - length_bits_mask_((uint32_t{1} << length_bits_) - 1), - occupancy_limit_(static_cast((uint32_t{1} << length_bits_) * + length_bits_mask_(Lower32of64((uint64_t{1} << length_bits_) - 1)), + occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * kStrictLoadFactor)), - capacity_(capacity), - array_(new ClockHandle[size_t{1} << length_bits_]), - clock_pointer_(0), - 
occupancy_(0), - usage_(0) { - assert(hash_bits <= 32); + array_(new ClockHandle[size_t{1} << length_bits_]) { + assert(hash_bits <= 32); // FIXME: ensure no overlap with sharding bits + if (initial_charge_metadata) { + usage_ += size_t{GetTableSize()} * sizeof(ClockHandle); + } } ClockHandleTable::~ClockHandleTable() { - // Assumes there are no references (of any type) to any slot in the table. + // Assumes there are no references or active operations on any slot/element + // in the table. for (uint32_t i = 0; i < GetTableSize(); i++) { - ClockHandle* h = &array_[i]; - if (h->IsElement()) { - h->FreeData(); + ClockHandle& h = array_[i]; + switch (h.meta >> ClockHandle::kStateShift) { + case ClockHandle::kStateEmpty: + // noop + break; + case ClockHandle::kStateInvisible: // rare but possible + case ClockHandle::kStateVisible: + h.FreeData(); +#ifndef NDEBUG + Rollback(h.hash, &h); + usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); + occupancy_.fetch_sub(1U, std::memory_order_relaxed); +#endif + break; + // otherwise + default: + assert(false); + break; } } + +#ifndef NDEBUG + for (uint32_t i = 0; i < GetTableSize(); i++) { + assert(array_[i].displacements.load() == 0); + } +#endif + + assert(usage_.load() == 0 || + usage_.load() == size_t{GetTableSize()} * sizeof(ClockHandle)); + assert(occupancy_ == 0); } -ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) { +// If an entry doesn't receive clock updates but is repeatedly referenced & +// released, the acquire and release counters could overflow without some +// intervention. This is that intervention, which should be inexpensive +// because it only incurs a simple, very predictable check. (Applying a bit +// mask in addition to an increment to every Release likely would be +// relatively expensive, because it's an extra atomic update.) +// +// We do have to assume that we never have many millions of simultaneous +// references to a cache handle, because we cannot represent so many +// references with the difference in counters, masked to the number of +// counter bits. Similarly, we assume there aren't millions of threads +// holding transient references (which might be "undone" rather than +// released by the way). +// +// Consider these possible states for each counter: +// low: less than kMaxCountdown +// medium: kMaxCountdown to half way to overflow + kMaxCountdown +// high: half way to overflow + kMaxCountdown, or greater +// +// And these possible states for the combination of counters: +// acquire / release +// ------- ------- +// low low - Normal / common, with caveats (see below) +// medium low - Can happen while holding some refs +// high low - Violates assumptions (too many refs) +// low medium - Violates assumptions (refs underflow, etc.) +// medium medium - Normal (very read heavy cache) +// high medium - Can happen while holding some refs +// low high - This function is supposed to prevent +// medium high - Violates assumptions (refs underflow, etc.) +// high high - Needs CorrectNearOverflow +// +// Basically, this function detects (high, high) state (inferred from +// release alone being high) and bumps it back down to (medium, medium) +// state with the same refcount and the same logical countdown counter +// (everything > kMaxCountdown is logically the same). Note that bumping +// down to (low, low) would modify the countdown counter, so is "reserved" +// in a sense. 
+// +// If near-overflow correction is triggered here, there's no guarantee +// that another thread hasn't freed the entry and replaced it with another. +// Therefore, it must be the case that the correction does not affect +// entries unless they are very old (many millions of acquire-release cycles). +// (Our bit manipulation is indeed idempotent and only affects entries in +// exceptional cases.) We assume a pre-empted thread will not stall that long. +// If it did, the state could be corrupted in the (unlikely) case that the top +// bit of the acquire counter is set but not the release counter, and thus +// we only clear the top bit of the acquire counter on resumption. It would +// then appear that there are too many refs and the entry would be permanently +// pinned (which is not terrible for an exceptionally rare occurrence), unless +// it is referenced enough (at least kMaxCountdown more times) for the release +// counter to reach "high" state again and bumped back to "medium." (This +// motivates only checking for release counter in high state, not both in high +// state.) +inline void CorrectNearOverflow(uint64_t old_meta, + std::atomic& meta) { + // We clear both top-most counter bits at the same time. + constexpr uint64_t kCounterTopBit = uint64_t{1} + << (ClockHandle::kCounterNumBits - 1); + constexpr uint64_t kClearBits = + (kCounterTopBit << ClockHandle::kAcquireCounterShift) | + (kCounterTopBit << ClockHandle::kReleaseCounterShift); + // A simple check that allows us to initiate clearing the top bits for + // a large portion of the "high" state space on release counter. + constexpr uint64_t kCheckBits = + (kCounterTopBit | (ClockHandle::kMaxCountdown + 1)) + << ClockHandle::kReleaseCounterShift; + + if (UNLIKELY(old_meta & kCheckBits)) { + meta.fetch_and(~kClearBits, std::memory_order_relaxed); + } +} + +Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, + ClockHandle** handle, Cache::Priority priority, + size_t capacity, bool strict_capacity_limit) { + // Do we have the available occupancy? Optimistically assume we do + // and deal with it if we don't. + uint32_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); + auto revert_occupancy_fn = [&]() { + occupancy_.fetch_sub(1, std::memory_order_relaxed); + }; + // Whether we over-committed and need an eviction to make up for it + bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_; + + // Usage/capacity handling is somewhat different depending on + // strict_capacity_limit, but mostly pessimistic. + bool use_detached_insert = false; + const size_t total_charge = proto.total_charge; + if (strict_capacity_limit) { + if (total_charge > capacity) { + assert(!use_detached_insert); + revert_occupancy_fn(); + return Status::MemoryLimit( + "Cache entry too large for a single cache shard: " + + std::to_string(total_charge) + " > " + std::to_string(capacity)); + } + // Grab any available capacity, and free up any more required. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t new_usage; + if (LIKELY(old_usage != capacity)) { + do { + new_usage = std::min(capacity, old_usage + total_charge); + } while (!usage_.compare_exchange_weak(old_usage, new_usage, + std::memory_order_relaxed)); + } else { + new_usage = old_usage; + } + // How much do we need to evict then? 
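      // A small worked illustration of the accounting above, using
      // hypothetical numbers (not taken from this change): with
      // capacity == 100, old_usage == 95 and total_charge == 10, the CAS loop
      // settles on new_usage == std::min<size_t>(100, 95 + 10) == 100, so the
      // next line computes need_evict_charge == 95 + 10 - 100 == 5, i.e. only
      // the overshoot past capacity has to be reclaimed by Evict().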
+ size_t need_evict_charge = old_usage + total_charge - new_usage; + size_t request_evict_charge = need_evict_charge; + if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { + // Require at least 1 eviction. + request_evict_charge = 1; + } + if (request_evict_charge > 0) { + size_t evicted_charge = 0; + uint32_t evicted_count = 0; + Evict(request_evict_charge, &evicted_charge, &evicted_count); + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + if (LIKELY(evicted_charge > need_evict_charge)) { + assert(evicted_count > 0); + // Evicted more than enough + usage_.fetch_sub(evicted_charge - need_evict_charge, + std::memory_order_relaxed); + } else if (evicted_charge < need_evict_charge || + (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { + // Roll back to old usage minus evicted + usage_.fetch_sub(evicted_charge + (new_usage - old_usage), + std::memory_order_relaxed); + assert(!use_detached_insert); + revert_occupancy_fn(); + if (evicted_charge < need_evict_charge) { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "capacity limit."); + } else { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "table occupancy limit."); + } + } + // If we needed to evict something and we are proceeding, we must have + // evicted something. + assert(evicted_count > 0); + } + } else { + // Case strict_capacity_limit == false + + // For simplicity, we consider that either the cache can accept the insert + // with no evictions, or we must evict enough to make (at least) enough + // space. It could lead to unnecessary failures or excessive evictions in + // some extreme cases, but allows a fast, simple protocol. If we allow a + // race to get us over capacity, then we might never get back to capacity + // limit if the sizes of entries allow each insertion to evict the minimum + // charge. Thus, we should evict some extra if it's not a signifcant + // portion of the shard capacity. This can have the side benefit of + // involving fewer threads in eviction. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t need_evict_charge; + // NOTE: if total_charge > old_usage, there isn't yet enough to evict + // `total_charge` amount. Even if we only try to evict `old_usage` amount, + // there's likely something referenced and we would eat CPU looking for + // enough to evict. + if (old_usage + total_charge <= capacity || total_charge > old_usage) { + // Good enough for me (might run over with a race) + need_evict_charge = 0; + } else { + // Try to evict enough space, and maybe some extra + need_evict_charge = total_charge; + if (old_usage > capacity) { + // Not too much to avoid thundering herd while avoiding strict + // synchronization + need_evict_charge += std::min(capacity / 1024, total_charge) + 1; + } + } + if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) { + // Special case: require at least 1 eviction if we only have to + // deal with occupancy + need_evict_charge = 1; + } + size_t evicted_charge = 0; + uint32_t evicted_count = 0; + if (need_evict_charge > 0) { + Evict(need_evict_charge, &evicted_charge, &evicted_count); + // Deal with potential occupancy deficit + if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) { + assert(evicted_charge == 0); + revert_occupancy_fn(); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry + // inserted into cache and evicted immediately. 
+ proto.FreeData(); + return Status::OK(); + } else { + use_detached_insert = true; + } + } else { + // Update occupancy for evictions + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + } + } + // Track new usage even if we weren't able to evict enough + usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + } + auto revert_usage_fn = [&]() { + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + }; + + if (!use_detached_insert) { + // Attempt a table insert, but abort if we find an existing entry for the + // key. If we were to overwrite old entries, we would either + // * Have to gain ownership over an existing entry to overwrite it, which + // would only work if there are no outstanding (read) references and would + // create a small gap in availability of the entry (old or new) to lookups. + // * Have to insert into a suboptimal location (more probes) so that the + // old entry can be kept around as well. + + // Set initial clock data from priority + // TODO: configuration parameters for priority handling and clock cycle + // count? + uint64_t initial_countdown; + switch (priority) { + case Cache::Priority::HIGH: + initial_countdown = ClockHandle::kHighCountdown; + break; + default: + assert(false); + FALLTHROUGH_INTENDED; + case Cache::Priority::LOW: + initial_countdown = ClockHandle::kLowCountdown; + break; + case Cache::Priority::BOTTOM: + initial_countdown = ClockHandle::kBottomCountdown; + break; + } + assert(initial_countdown > 0); + + uint32_t probe = 0; + ClockHandle* e = FindSlot( + proto.hash, + [&](ClockHandle* h) { + // Optimistically transition the slot from "empty" to + // "under construction" (no effect on other states) + uint64_t old_meta = + h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit} + << ClockHandle::kStateShift, + std::memory_order_acq_rel); + uint64_t old_state = old_meta >> ClockHandle::kStateShift; + + if (old_state == ClockHandle::kStateEmpty) { + // We've started inserting into an available slot, and taken + // ownership Save data fields + ClockHandleMoreData* h_alias = h; + *h_alias = proto; + + // Transition from "under construction" state to "visible" state + uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} + << ClockHandle::kStateShift; + + // Maybe with an outstanding reference + new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; + new_meta |= (initial_countdown - (handle != nullptr)) + << ClockHandle::kReleaseCounterShift; + +#ifndef NDEBUG + // Save the state transition, with assertion + old_meta = h->meta.exchange(new_meta, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Save the state transition + h->meta.store(new_meta, std::memory_order_release); +#endif + return true; + } else if (old_state != ClockHandle::kStateVisible) { + // Slot not usable / touchable now + return false; + } + // Existing, visible entry, which might be a match. + // But first, we need to acquire a ref to read it. In fact, number of + // refs for initial countdown, so that we boost the clock state if + // this is a match. 
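            // Hedged illustration with a hypothetical initial_countdown of 3:
            // the fetch_add below raises the acquire counter by 3 in a single
            // atomic step; if the key then matches, the matching release
            // (also scaled by 3) leaves the reference count unchanged while
            // pushing both counters up, giving the existing entry roughly the
            // same clock boost a fresh insert at this priority would receive.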
+ old_meta = h->meta.fetch_add( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + // Like Lookup + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->key == proto.key) { + // Match. Release in a way that boosts the clock state + old_meta = h->meta.fetch_add( + ClockHandle::kReleaseIncrement * initial_countdown, + std::memory_order_acq_rel); + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + // Insert detached instead (only if return handle needed) + use_detached_insert = true; + return true; + } else { + // Mismatch. Pretend we never took the reference + old_meta = h->meta.fetch_sub( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + old_meta = h->meta.fetch_sub( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. + // Slot not usable / touchable now. + } + (void)old_meta; + return false; + }, + [&](ClockHandle* /*h*/) { return false; }, + [&](ClockHandle* h) { + h->displacements.fetch_add(1, std::memory_order_relaxed); + }, + probe); + if (e == nullptr) { + // Occupancy check and never abort FindSlot above should generally + // prevent this, except it's theoretically possible for other threads + // to evict and replace entries in the right order to hit every slot + // when it is populated. Assuming random hashing, the chance of that + // should be no higher than pow(kStrictLoadFactor, n) for n slots. + // That should be infeasible for roughly n >= 256, so if this assertion + // fails, that suggests something is going wrong. + assert(GetTableSize() < 256); + use_detached_insert = true; + } + if (!use_detached_insert) { + // Successfully inserted + if (handle) { + *handle = e; + } + return Status::OK(); + } + // Roll back table insertion + Rollback(proto.hash, e); + revert_occupancy_fn(); + // Maybe fall back on detached insert + if (handle == nullptr) { + revert_usage_fn(); + // As if unrefed entry immdiately evicted + proto.FreeData(); + return Status::OK(); + } + } + + // Run detached insert + assert(use_detached_insert); + + ClockHandle* h = new ClockHandle(); + ClockHandleMoreData* h_alias = h; + *h_alias = proto; + h->detached = true; + // Single reference (detached entries only created if returning a refed + // Handle back to user) + uint64_t meta = uint64_t{ClockHandle::kStateInvisible} + << ClockHandle::kStateShift; + meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; + h->meta.store(meta, std::memory_order_release); + // Keep track of usage + detached_usage_.fetch_add(total_charge, std::memory_order_relaxed); + + *handle = h; + // The OkOverwritten status is used to count "redundant" insertions into + // block cache. This implementation doesn't strictly check for redundant + // insertions, but we instead are probably interested in how many insertions + // didn't go into the table (instead "detached"), which could be redundant + // Insert or some other reason (use_detached_insert reasons above). 
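  // A hedged note for callers (not part of this change): OkOverwritten still
  // satisfies Status::ok(), so existing success checks keep working; a caller
  // that wants to count these events can test the subcode explicitly, e.g.
  // something like `if (s.IsOkOverwritten()) { ++not_inserted_into_table; }`.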
+ return Status::OkOverwritten(); +} + +ClockHandle* ClockHandleTable::Lookup(const CacheKeyBytes& key, uint32_t hash) { uint32_t probe = 0; ClockHandle* e = FindSlot( - key, + hash, [&](ClockHandle* h) { - if (h->TryInternalRef()) { - if (h->IsElement() && h->Matches(key, hash)) { + // Mostly branch-free version (similar performance) + /* + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; + bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; + bool match = (h->key == key) & visible; + h->meta.fetch_sub(static_cast(Shareable & !match) << + ClockHandle::kAcquireCounterShift, std::memory_order_release); return + match; + */ + // Optimistic lookup should pay off when the table is relatively + // sparse. + constexpr bool kOptimisticLookup = true; + uint64_t old_meta; + if (!kOptimisticLookup) { + old_meta = h->meta.load(std::memory_order_acquire); + if ((old_meta >> ClockHandle::kStateShift) != + ClockHandle::kStateVisible) { + return false; + } + } + // (Optimistically) increment acquire counter + old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->key == key) { + // Match return true; + } else { + // Mismatch. Pretend we never took the reference + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); } - h->ReleaseInternalRef(); + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. } + (void)old_meta; return false; }, - [&](ClockHandle* h) { return h->displacements == 0; }, + [&](ClockHandle* h) { + return h->displacements.load(std::memory_order_relaxed) == 0; + }, [&](ClockHandle* /*h*/) {}, probe); - if (e != nullptr) { - // TODO(Guido) Comment from #10347: Here it looks like we have three atomic - // updates where it would be possible to combine into one CAS (more metadata - // under one atomic field) or maybe two atomic updates (one arithmetic, one - // bitwise). Something to think about optimizing. - e->SetHit(); - // The handle is now referenced, so we take it out of clock. - ClockOff(e); - e->InternalToExternalRef(); - } - return e; } -ClockHandle* ClockHandleTable::Insert(ClockHandle* h, - autovector* deleted, - bool take_reference) { - uint32_t probe = 0; - ClockHandle* e = FindAvailableSlot(h->key(), h->hash, probe, deleted); - if (e == nullptr) { - // No available slot to place the handle. - return nullptr; - } - - // The slot is empty or is a tombstone. And we have an exclusive ref. - Assign(e, h); - // TODO(Guido) The following RemoveAll can probably be run outside of - // the exclusive ref. I had a bad case in mind: multiple inserts could - // annihilate each. Although I think this is impossible, I'm not sure - // my mental proof covers every case. 
- if (e->displacements != 0) { - // It used to be a tombstone, so there may already be copies of the - // key in the table. - RemoveAll(h->key(), h->hash, probe, deleted); - } +bool ClockHandleTable::Release(ClockHandle* h, bool useful, + bool erase_if_last_ref) { + // In contrast with LRUCache's Release, this function won't delete the handle + // when the cache is above capacity and the reference is the last one. Space + // is only freed up by EvictFromClock (called by Insert when space is needed) + // and Erase. We do this to avoid an extra atomic read of the variable usage_. - if (take_reference) { - // The user wants to take a reference. - e->ExclusiveToExternalRef(); + uint64_t old_meta; + if (useful) { + // Increment release counter to indicate was used + old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement, + std::memory_order_release); } else { - // The user doesn't want to immediately take a reference, so we make - // it evictable. - ClockOn(e); - e->ReleaseExclusiveRef(); + // Decrement acquire counter to pretend it never happened + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); } - return e; -} -void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) { - // DON'T touch displacements and refs. - dst->value = src->value; - dst->deleter = src->deleter; - dst->hash = src->hash; - dst->total_charge = src->total_charge; - dst->key_data = src->key_data; - dst->flags.store(0); - dst->SetIsElement(true); - dst->SetCachePriority(src->GetCachePriority()); - usage_ += dst->total_charge; - occupancy_++; -} - -bool ClockHandleTable::TryRemove(ClockHandle* h, - autovector* deleted) { - if (h->TryExclusiveRef()) { - if (h->WillBeDeleted()) { - Remove(h, deleted); - return true; + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // No underflow + assert(((old_meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask) != + ((old_meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask)); + + if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateInvisible)) { + // Update for last fetch_add op + if (useful) { + old_meta += ClockHandle::kReleaseIncrement; + } else { + old_meta -= ClockHandle::kAcquireIncrement; + } + // Take ownership if no refs + do { + uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) - + (old_meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + if (refcount != 0) { + // Not last ref at some point in time during this Release call + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; + } + if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift)) == 0) { + // Someone else took ownership + return false; + } + // Note that there's a small chance that we release, another thread + // replaces this entry with another, reaches zero refs, and then we end + // up erasing that other entry. That's an acceptable risk / imprecision. + } while (!h->meta.compare_exchange_weak( + old_meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)); + // Took ownership + // TODO? Delay freeing? 
+ h->FreeData(); + size_t total_charge = h->total_charge; + if (UNLIKELY(h->detached)) { + // Delete detached handle + delete h; + detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); + } else { + uint32_t hash = h->hash; +#ifndef NDEBUG + // Mark slot as empty, with assertion + old_meta = h->meta.exchange(0, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h->meta.store(0, std::memory_order_release); +#endif + occupancy_.fetch_sub(1U, std::memory_order_release); + Rollback(hash, h); } - h->ReleaseExclusiveRef(); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + return true; + } else { + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; } - return false; } -bool ClockHandleTable::SpinTryRemove(ClockHandle* h, - autovector* deleted) { - if (h->SpinTryExclusiveRef()) { - if (h->WillBeDeleted()) { - Remove(h, deleted); - return true; - } - h->ReleaseExclusiveRef(); - } - return false; +void ClockHandleTable::Ref(ClockHandle& h) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; } -void ClockHandleTable::ClockOff(ClockHandle* h) { - h->SetClockPriority(ClockHandle::ClockPriority::NONE); +void ClockHandleTable::TEST_RefN(ClockHandle& h, size_t n) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; } -void ClockHandleTable::ClockOn(ClockHandle* h) { - assert(!h->IsInClock()); - bool is_high_priority = - h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH; - h->SetClockPriority(static_cast( - is_high_priority ? ClockHandle::ClockPriority::HIGH - : ClockHandle::ClockPriority::MEDIUM)); +void ClockHandleTable::TEST_ReleaseN(ClockHandle* h, size_t n) { + if (n > 0) { + // Split into n - 1 and 1 steps. + uint64_t old_meta = h->meta.fetch_add( + (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire); + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; + + Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); + } } -void ClockHandleTable::Remove(ClockHandle* h, - autovector* deleted) { - deleted->push_back(*h); - ClockOff(h); +void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { uint32_t probe = 0; - FindSlot( - h->key(), [&](ClockHandle* e) { return e == h; }, - [&](ClockHandle* /*e*/) { return false; }, - [&](ClockHandle* e) { e->displacements--; }, probe); - h->SetWillBeDeleted(false); - h->SetIsElement(false); -} - -void ClockHandleTable::RemoveAll(const Slice& key, uint32_t hash, - uint32_t& probe, - autovector* deleted) { - FindSlot( - key, + (void)FindSlot( + hash, [&](ClockHandle* h) { - if (h->TryInternalRef()) { - if (h->IsElement() && h->Matches(key, hash)) { - h->SetWillBeDeleted(true); - h->ReleaseInternalRef(); - if (TryRemove(h, deleted)) { - h->ReleaseExclusiveRef(); + // Could be multiple entries in rare cases. Erase them all. 
+ // Optimistically increment acquire counter + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->key == key) { + // Match. Set invisible. + old_meta = + h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift), + std::memory_order_acq_rel); + // Apply update to local copy + old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift); + for (;;) { + uint64_t refcount = + ((old_meta >> ClockHandle::kAcquireCounterShift) - + (old_meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + assert(refcount > 0); + if (refcount > 1) { + // Not last ref at some point in time during this Erase call + // Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + break; + } else if (h->meta.compare_exchange_weak( + old_meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)) { + // Took ownership + assert(hash == h->hash); + // TODO? Delay freeing? + h->FreeData(); + usage_.fetch_sub(h->total_charge, std::memory_order_relaxed); + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); +#ifndef NDEBUG + // Mark slot as empty, with assertion + old_meta = h->meta.exchange(0, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h->meta.store(0, std::memory_order_release); +#endif + occupancy_.fetch_sub(1U, std::memory_order_release); + Rollback(hash, h); + break; + } } - return false; + } else { + // Mismatch. Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); } - h->ReleaseInternalRef(); + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. } return false; }, - [&](ClockHandle* h) { return h->displacements == 0; }, + [&](ClockHandle* h) { + return h->displacements.load(std::memory_order_relaxed) == 0; + }, [&](ClockHandle* /*h*/) {}, probe); } -void ClockHandleTable::Free(autovector* deleted) { - if (deleted->size() == 0) { - // Avoid unnecessarily reading usage_ and occupancy_. 
- return; +void ClockHandleTable::ConstApplyToEntriesRange( + std::function func, uint32_t index_begin, + uint32_t index_end, bool apply_if_will_be_deleted) const { + uint64_t check_state_mask = ClockHandle::kStateShareableBit; + if (!apply_if_will_be_deleted) { + check_state_mask |= ClockHandle::kStateVisibleBit; } - size_t deleted_charge = 0; - for (auto& h : *deleted) { - deleted_charge += h.total_charge; - h.FreeData(); + for (uint32_t i = index_begin; i < index_end; i++) { + ClockHandle& h = array_[i]; + + uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + // Increment acquire counter + old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Double-check + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + func(h); + } + // Pretend we never took the reference + h.meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + // No net change, so don't need to check for overflow + } } - assert(usage_ >= deleted_charge); - usage_ -= deleted_charge; - occupancy_ -= static_cast(deleted->size()); } -ClockHandle* ClockHandleTable::FindAvailableSlot( - const Slice& key, uint32_t hash, uint32_t& probe, - autovector* deleted) { - ClockHandle* e = FindSlot( - key, - [&](ClockHandle* h) { - // To read the handle, first acquire a shared ref. - if (h->TryInternalRef()) { - if (h->IsElement()) { - // The slot is not available. - // TODO(Guido) Is it worth testing h->WillBeDeleted()? - if (h->WillBeDeleted() || h->Matches(key, hash)) { - // The slot can be freed up, or the key we're inserting is already - // in the table, so we try to delete it. When the attempt is - // successful, the slot becomes available, so we stop probing. - // Notice that in that case TryRemove returns an exclusive ref. - h->SetWillBeDeleted(true); - h->ReleaseInternalRef(); - if (TryRemove(h, deleted)) { - return true; - } - return false; - } - h->ReleaseInternalRef(); - return false; - } - - // Available slot. - h->ReleaseInternalRef(); - // Try to acquire an exclusive ref. If we fail, continue probing. - if (h->SpinTryExclusiveRef()) { - // Check that the slot is still available. 
- if (!h->IsElement()) { - return true; - } - h->ReleaseExclusiveRef(); - } - } - return false; - }, - [&](ClockHandle* /*h*/) { return false; }, - [&](ClockHandle* h) { h->displacements++; }, probe); - if (e == nullptr) { - Rollback(key, probe); +void ClockHandleTable::EraseUnRefEntries() { + for (uint32_t i = 0; i <= this->length_bits_mask_; i++) { + ClockHandle& h = array_[i]; + + uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) - + (old_meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift) && + refcount == 0 && + h.meta.compare_exchange_strong(old_meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership + uint32_t hash = h.hash; + h.FreeData(); + usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); +#ifndef NDEBUG + // Mark slot as empty, with assertion + old_meta = h.meta.exchange(0, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif + occupancy_.fetch_sub(1U, std::memory_order_release); + Rollback(hash, &h); + } } - return e; } +namespace { +inline uint32_t Remix1(uint32_t hash) { + return Lower32of64((uint64_t{hash} * 0xbc9f1d35) >> 29); +} + +inline uint32_t Remix2(uint32_t hash) { + return Lower32of64((uint64_t{hash} * 0x7a2bb9d5) >> 29); +} +} // namespace + ClockHandle* ClockHandleTable::FindSlot( - const Slice& key, std::function match, - std::function abort, - std::function update, uint32_t& probe) { + uint32_t hash, std::function match_fn, + std::function abort_fn, + std::function update_fn, uint32_t& probe) { // We use double-hashing probing. Every probe in the sequence is a // pseudorandom integer, computed as a linear function of two random hashes, // which we call base and increment. Specifically, the i-th probe is base + i // * increment modulo the table size. - uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); + uint32_t base = ModTableSize(Remix1(hash)); // We use an odd increment, which is relatively prime with the power-of-two // table size. This implies that we cycle back to the first probe only // after probing every slot exactly once. - uint32_t increment = - ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); + // TODO: we could also reconsider linear probing, though locality benefits + // are limited because each slot is a full cache line + uint32_t increment = Remix2(hash) | 1U; uint32_t current = ModTableSize(base + probe * increment); - while (true) { + while (probe <= length_bits_mask_) { ClockHandle* h = &array_[current]; - if (current == base && probe > 0) { - // We looped back. - return nullptr; - } - if (match(h)) { + if (match_fn(h)) { probe++; return h; } - if (abort(h)) { + if (abort_fn(h)) { return nullptr; } probe++; - update(h); + update_fn(h); current = ModTableSize(current + increment); } + // We looped back. 
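  // Why "looped back" means every slot was already tried (illustrative
  // numbers): with a table of 8 slots (length_bits_ == 3), base == 2 and the
  // odd increment 5, the probe sequence visits 2, 7, 4, 1, 6, 3, 0, 5, i.e.
  // each of the 8 slots exactly once, before the probe <= length_bits_mask_
  // condition stops the loop, so there is nothing left to return here.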
+ return nullptr; } -void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) { - uint32_t current = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); - uint32_t increment = - ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); - for (uint32_t i = 0; i < probe; i++) { - array_[current].displacements--; +void ClockHandleTable::Rollback(uint32_t hash, const ClockHandle* h) { + uint32_t current = ModTableSize(Remix1(hash)); + uint32_t increment = Remix2(hash) | 1U; + for (uint32_t i = 0; &array_[current] != h; i++) { + array_[current].displacements.fetch_sub(1, std::memory_order_relaxed); current = ModTableSize(current + increment); } } -void ClockHandleTable::ClockRun(size_t charge) { - // TODO(Guido) When an element is in the probe sequence of a - // hot element, it will be hard to get an exclusive ref. - // Do we need a mechanism to prevent an element from sitting - // for a long time in cache waiting to be evicted? - autovector deleted; - uint32_t max_iterations = - ClockHandle::ClockPriority::HIGH * - (1 + - static_cast( - GetTableSize() * - kLoadFactor)); // It may take up to HIGH passes to evict an element. - size_t usage_local = usage_; - size_t capacity_local = capacity_; - while (usage_local + charge > capacity_local && max_iterations--) { - uint32_t steps = 1 + static_cast(1 / kLoadFactor); - uint32_t clock_pointer_local = (clock_pointer_ += steps) - steps; - for (uint32_t i = 0; i < steps; i++) { - ClockHandle* h = &array_[ModTableSize(clock_pointer_local + i)]; - if (h->TryExclusiveRef()) { - if (h->WillBeDeleted()) { - Remove(h, &deleted); - usage_local -= h->total_charge; - } else { - if (!h->IsInClock() && h->IsElement()) { - // We adjust the clock priority to make the element evictable again. - // Why? Elements that are not in clock are either currently - // externally referenced or used to be. Because we are holding an - // exclusive ref, we know we are in the latter case. This can only - // happen when the last external reference to an element was - // released, and the element was not immediately removed. - ClockOn(h); - } - ClockHandle::ClockPriority priority = h->GetClockPriority(); - if (priority == ClockHandle::ClockPriority::LOW) { - Remove(h, &deleted); - usage_local -= h->total_charge; - } else if (priority > ClockHandle::ClockPriority::LOW) { - h->DecreaseClockPriority(); - } - } - h->ReleaseExclusiveRef(); +void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, + uint32_t* freed_count) { + // precondition + assert(requested_charge > 0); + + // TODO: make a tuning parameter? + constexpr uint32_t step_size = 4; + + // First (concurrent) increment clock pointer + uint64_t old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + + // Cap the eviction effort at this thread (along with those operating in + // parallel) circling through the whole structure kMaxCountdown times. + // In other words, this eviction run must find something/anything that is + // unreferenced at start of and during the eviction run that isn't reclaimed + // by a concurrent eviction run. 
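  // Rough scale of that cap, with hypothetical values (kMaxCountdown == 3 is
  // an assumption, not taken from this diff): for a 1024-slot table
  // (length_bits_ == 10), max_clock_pointer ends up 3 << 10 == 3072 steps
  // ahead of old_clock_pointer, i.e. this run gives up once the shared clock
  // pointer has swept the table about three times without freeing enough.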
+ uint64_t max_clock_pointer = + old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_); + + for (;;) { + for (uint32_t i = 0; i < step_size; i++) { + ClockHandle& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; + uint64_t meta = h.meta.load(std::memory_order_relaxed); + + uint64_t acquire_count = (meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask; + uint64_t release_count = (meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask; + if (acquire_count != release_count) { + // Only clock update entries with no outstanding refs + continue; + } + if (!(meta >> ClockHandle::kStateShift & + ClockHandle::kStateShareableBit)) { + // Only clock update Shareable entries + continue; + } + // ModTableSize(old_clock_pointer + i)); + if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible && + acquire_count > 0) { + // Decrement clock + uint64_t new_count = std::min(acquire_count - 1, + uint64_t{ClockHandle::kMaxCountdown} - 1); + // Compare-exchange in the decremented clock info, but + // not aggressively + uint64_t new_meta = + (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | + (new_count << ClockHandle::kReleaseCounterShift) | + (new_count << ClockHandle::kAcquireCounterShift); + h.meta.compare_exchange_strong(meta, new_meta, + std::memory_order_relaxed); + continue; + } + // Otherwise, remove entry (either unreferenced invisible or + // unreferenced and expired visible). Compare-exchange failing probably + // indicates the entry was used, so skip it in that case. + if (h.meta.compare_exchange_strong( + meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership + uint32_t hash = h.hash; + // TODO? Delay freeing? + h.FreeData(); + *freed_charge += h.total_charge; +#ifndef NDEBUG + // Mark slot as empty, with assertion + meta = h.meta.exchange(0, std::memory_order_release); + assert(meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif + *freed_count += 1; + Rollback(hash, &h); } } - } - Free(&deleted); + // Loop exit condition + if (*freed_charge >= requested_charge) { + return; + } + if (old_clock_pointer >= max_clock_pointer) { + return; + } + + // Advance clock pointer (concurrently) + old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + } } ClockCacheShard::ClockCacheShard( size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : strict_capacity_limit_(strict_capacity_limit), - detached_usage_(0), - table_(capacity, CalcHashBits(capacity, estimated_value_size, - metadata_charge_policy)) { - set_metadata_charge_policy(metadata_charge_policy); + : CacheShard(metadata_charge_policy), + table_( + CalcHashBits(capacity, estimated_value_size, metadata_charge_policy), + /*initial_charge_metadata*/ metadata_charge_policy == + kFullChargeCacheMetadata), + capacity_(capacity), + strict_capacity_limit_(strict_capacity_limit) { + // Initial charge metadata should not exceed capacity + assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(ClockHandle)); } -void ClockCacheShard::EraseUnRefEntries() { - autovector deleted; - - table_.ApplyToEntriesRange( - [this, &deleted](ClockHandle* h) { - // Externally unreferenced element. 
- table_.Remove(h, &deleted); - }, - 0, table_.GetTableSize(), true); - - table_.Free(&deleted); -} +void ClockCacheShard::EraseUnRefEntries() { table_.EraseUnRefEntries(); } void ClockCacheShard::ApplyToSomeEntries( const std::functionkey(), h->value, h->GetCharge(metadata_charge_policy), - h->deleter); + [callback](const ClockHandle& h) { + callback(h.KeySlice(), h.value, h.total_charge, h.deleter); }, index_begin, index_end, false); } -ClockHandle* ClockCacheShard::DetachedInsert(ClockHandle* h) { - ClockHandle* e = new ClockHandle(); - *e = *h; - e->SetDetached(); - e->TryExternalRef(); - detached_usage_ += h->total_charge; - return e; -} - -size_t ClockCacheShard::CalcEstimatedHandleCharge( - size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { - ClockHandle h; - h.CalcTotalCharge(estimated_value_size, metadata_charge_policy); - return h.total_charge; -} - int ClockCacheShard::CalcHashBits( size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { - size_t handle_charge = - CalcEstimatedHandleCharge(estimated_value_size, metadata_charge_policy); - assert(handle_charge > 0); - uint32_t num_entries = - static_cast(capacity / (kLoadFactor * handle_charge)) + 1; - assert(num_entries <= uint32_t{1} << 31); - return FloorLog2((num_entries << 1) - 1); + double average_slot_charge = estimated_value_size * kLoadFactor; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + average_slot_charge += sizeof(ClockHandle); + } + assert(average_slot_charge > 0.0); + uint64_t num_slots = + static_cast(capacity / average_slot_charge + 0.999999); + + int hash_bits = std::min(FloorLog2((num_slots << 1) - 1), 32); + if (metadata_charge_policy == kFullChargeCacheMetadata) { + // For very small estimated value sizes, it's possible to overshoot + while (hash_bits > 0 && + uint64_t{sizeof(ClockHandle)} << hash_bits > capacity) { + hash_bits--; + } + } + return hash_bits; } void ClockCacheShard::SetCapacity(size_t capacity) { - if (capacity > table_.GetCapacity()) { - assert(false); // Not supported. - } - table_.SetCapacity(capacity); - table_.ClockRun(detached_usage_); + capacity_.store(capacity, std::memory_order_relaxed); + // next Insert will take care of any necessary evictions } void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { - strict_capacity_limit_ = strict_capacity_limit; + strict_capacity_limit_.store(strict_capacity_limit, + std::memory_order_relaxed); + // next Insert will take care of any necessary evictions } Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, Cache::Priority priority) { - if (key.size() != kCacheKeySize) { + if (UNLIKELY(key.size() != kCacheKeySize)) { return Status::NotSupported("ClockCache only supports key size " + std::to_string(kCacheKeySize) + "B"); } - - ClockHandle tmp; - tmp.value = value; - tmp.deleter = deleter; - tmp.hash = hash; - tmp.CalcTotalCharge(charge, metadata_charge_policy_); - tmp.SetCachePriority(priority); - for (int i = 0; i < kCacheKeySize; i++) { - tmp.key_data[i] = key.data()[i]; - } - - Status s = Status::OK(); - - // Use a local copy to minimize cache synchronization. - size_t detached_usage = detached_usage_; - - // Free space with the clock policy until enough space is freed or there are - // no evictable elements. 
- table_.ClockRun(tmp.total_charge + detached_usage); - - // Use local copies to minimize cache synchronization - // (occupancy_ and usage_ are read and written by all insertions). - uint32_t occupancy_local = table_.GetOccupancy(); - size_t total_usage = table_.GetUsage() + detached_usage; - - // TODO: Currently we support strict_capacity_limit == false as long as the - // number of pinned elements is below table_.GetOccupancyLimit(). We can - // always support it as follows: whenever we exceed this limit, we dynamically - // allocate a handle and return it (when the user provides a handle pointer, - // of course). Then, Release checks whether the handle was dynamically - // allocated, or is stored in the table. - if (total_usage + tmp.total_charge > table_.GetCapacity() && - (strict_capacity_limit_ || handle == nullptr)) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - tmp.FreeData(); - } else { - if (occupancy_local + 1 > table_.GetOccupancyLimit()) { - // TODO: Consider using a distinct status for this case, but usually - // it will be handled the same way as reaching charge capacity limit - s = Status::MemoryLimit( - "Insert failed because all slots in the hash table are full."); - } else { - s = Status::MemoryLimit( - "Insert failed because the total charge has exceeded the " - "capacity."); - } - } - } else { - ClockHandle* h = nullptr; - if (handle != nullptr && occupancy_local + 1 > table_.GetOccupancyLimit()) { - // Even if the user wishes to overload the cache, we can't insert into - // the hash table. Instead, we dynamically allocate a new handle. - h = DetachedInsert(&tmp); - // TODO: Return special status? - } else { - // Insert into the cache. Note that the cache might get larger than its - // capacity if not enough space was freed up. - autovector deleted; - h = table_.Insert(&tmp, &deleted, handle != nullptr); - if (h == nullptr && handle != nullptr) { - // The table is full. This can happen when many threads simultaneously - // attempt an insert, and the table is operating close to full capacity. - h = DetachedInsert(&tmp); - } - // Notice that if handle == nullptr, we don't insert the entry but still - // return ok. 
- if (deleted.size() > 0) { - s = Status::OkOverwritten(); - } - table_.Free(&deleted); - } - if (handle != nullptr) { - *handle = reinterpret_cast(h); - } - } - + ClockHandleMoreData proto; + proto.key = *reinterpret_cast(key.data()); + proto.hash = hash; + proto.value = value; + proto.deleter = deleter; + proto.total_charge = charge; + Status s = + table_.Insert(proto, reinterpret_cast(handle), priority, + capacity_.load(std::memory_order_relaxed), + strict_capacity_limit_.load(std::memory_order_relaxed)); return s; } Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { - return reinterpret_cast(table_.Lookup(key, hash)); + if (UNLIKELY(key.size() != kCacheKeySize)) { + return nullptr; + } + auto key_bytes = reinterpret_cast(key.data()); + return reinterpret_cast(table_.Lookup(*key_bytes, hash)); } bool ClockCacheShard::Ref(Cache::Handle* h) { - ClockHandle* e = reinterpret_cast(h); - assert(e->ExternalRefs() > 0); - return e->TryExternalRef(); + if (h == nullptr) { + return false; + } + table_.Ref(*reinterpret_cast(h)); + return true; } -bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { - // In contrast with LRUCache's Release, this function won't delete the handle - // when the cache is above capacity and the reference is the last one. Space - // is only freed up by EvictFromClock (called by Insert when space is needed) - // and Erase. We do this to avoid an extra atomic read of the variable usage_. +bool ClockCacheShard::Release(Cache::Handle* handle, bool useful, + bool erase_if_last_ref) { if (handle == nullptr) { return false; } + return table_.Release(reinterpret_cast(handle), useful, + erase_if_last_ref); +} - ClockHandle* h = reinterpret_cast(handle); - - if (UNLIKELY(h->IsDetached())) { - h->ReleaseExternalRef(); - if (h->TryExclusiveRef()) { - // Only the last reference will succeed. - // Don't bother releasing the exclusive ref. - h->FreeData(); - detached_usage_ -= h->total_charge; - delete h; - return true; - } - return false; - } +void ClockCacheShard::TEST_RefN(Cache::Handle* h, size_t n) { + table_.TEST_RefN(*reinterpret_cast(h), n); +} - uint32_t refs = h->refs; - bool last_reference = ((refs & ClockHandle::EXTERNAL_REFS) == 1); - bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED; - - if (last_reference && (will_be_deleted || erase_if_last_ref)) { - autovector deleted; - h->SetWillBeDeleted(true); - h->ReleaseExternalRef(); - if (table_.SpinTryRemove(h, &deleted)) { - h->ReleaseExclusiveRef(); - table_.Free(&deleted); - return true; - } - } else { - h->ReleaseExternalRef(); - } +void ClockCacheShard::TEST_ReleaseN(Cache::Handle* h, size_t n) { + table_.TEST_ReleaseN(reinterpret_cast(h), n); +} - return false; +bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { + return Release(handle, /*useful=*/true, erase_if_last_ref); } void ClockCacheShard::Erase(const Slice& key, uint32_t hash) { - autovector deleted; - uint32_t probe = 0; - table_.RemoveAll(key, hash, probe, &deleted); - table_.Free(&deleted); + if (UNLIKELY(key.size() != kCacheKeySize)) { + return; + } + auto key_bytes = reinterpret_cast(key.data()); + table_.Erase(*key_bytes, hash); } size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); } @@ -613,18 +1095,35 @@ size_t ClockCacheShard::GetPinnedUsage() const { // Why avoid this counter? Because Lookup removes elements from the clock // list, so it would need to update the pinned usage every time, // which creates additional synchronization costs. 
- size_t clock_usage = 0; - + size_t table_pinned_usage = 0; + const bool charge_metadata = + metadata_charge_policy_ == kFullChargeCacheMetadata; table_.ConstApplyToEntriesRange( - [&clock_usage](const ClockHandle* h) { - if (h->ExternalRefs() > 1) { - // We check > 1 because we are holding an external ref. - clock_usage += h->total_charge; + [&table_pinned_usage, charge_metadata](const ClockHandle& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) - + (meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + // Holding one ref for ConstApplyToEntriesRange + assert(refcount > 0); + if (refcount > 1) { + table_pinned_usage += h.total_charge; + if (charge_metadata) { + table_pinned_usage += sizeof(ClockHandle); + } } }, 0, table_.GetTableSize(), true); - return clock_usage + detached_usage_; + return table_pinned_usage + table_.GetDetachedUsage(); +} + +size_t ClockCacheShard::GetOccupancyCount() const { + return table_.GetOccupancy(); +} + +size_t ClockCacheShard::GetTableAddressCount() const { + return table_.GetTableSize(); } ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, @@ -634,6 +1133,8 @@ ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, num_shards_(1 << num_shard_bits) { assert(estimated_value_size > 0 || metadata_charge_policy != kDontChargeCacheMetadata); + // TODO: should not need to go through two levels of pointer indirection to + // get to table entries shards_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; @@ -667,12 +1168,7 @@ void* ClockCache::Value(Handle* handle) { } size_t ClockCache::GetCharge(Handle* handle) const { - CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata; - if (num_shards_ > 0) { - metadata_charge_policy = shards_[0].metadata_charge_policy_; - } - return reinterpret_cast(handle)->GetCharge( - metadata_charge_policy); + return reinterpret_cast(handle)->total_charge; } Cache::DeleterFn ClockCache::GetDeleter(Handle* handle) const { @@ -711,7 +1207,10 @@ std::shared_ptr ExperimentalNewClockCache( return nullptr; // The cache cannot be sharded into too many fine pieces. } if (num_shard_bits < 0) { - num_shard_bits = GetDefaultCacheShardBits(capacity); + // Use larger shard size to reduce risk of large entries clustering + // or skewing individual shards. + constexpr size_t min_shard_size = 32U * 1024U * 1024U; + num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); } return std::make_shared( capacity, estimated_value_size, num_shard_bits, strict_capacity_limit, diff --git a/cache/clock_cache.h b/cache/clock_cache.h index e495f1c04..8ceb46478 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -9,10 +9,9 @@ #pragma once -#include - #include #include +#include #include #include #include @@ -33,140 +32,262 @@ namespace clock_cache { // Forward declaration of friend class. class ClockCacheTest; -// An experimental alternative to LRUCache, using a lock-free, open-addressed -// hash table and clock eviction. - -// ---------------------------------------------------------------------------- -// 1. INTRODUCTION +// ClockCache is an experimental alternative to LRUCache. +// +// Benefits +// -------- +// * Fully lock free (no waits or spins) for efficiency under high concurrency +// * Optimized for hot path reads. 
For concurrency control, most Lookup() and +// essentially all Release() are a single atomic add operation. +// * Uses a generalized + aging variant of CLOCK eviction that might outperform +// LRU in some cases. (For background, see +// https://en.wikipedia.org/wiki/Page_replacement_algorithm) +// * Eviction on insertion is fully parallel and lock-free. +// +// Costs +// ----- +// * Hash table is not resizable (for lock-free efficiency) so capacity is not +// dynamically changeable. Rely on an estimated average value (block) size for +// space+time efficiency. (See estimated_entry_charge option details.) +// * Insert usually does not (but might) overwrite a previous entry associated +// with a cache key. This is OK for RocksDB uses of Cache. +// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for +// block cache (not row cache or table cache). +// * SecondaryCache is not supported. +// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough +// transient LOW or BOTTOM priority items can evict HIGH priority entries that +// are not referenced recently (or often) enough. +// * If pinned entries leave little or nothing eligible for eviction, +// performance can degrade substantially, because of clock eviction eating +// CPU looking for evictable entries and because Release does not +// pro-actively delete unreferenced entries when the cache is over-full. +// Specifically, this makes this implementation more susceptible to the +// following combination: +// * num_shard_bits is high (e.g. 6) +// * capacity small (e.g. some MBs) +// * some large individual entries (e.g. non-partitioned filters) +// where individual entries occupy a large portion of their shard capacity. +// This should be mostly mitigated by the implementation picking a lower +// number of cache shards than LRUCache for a given capacity (when +// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()). +// * With strict_capacity_limit=false, respecting the capacity limit is not as +// aggressive as LRUCache. The limit might be transiently exceeded by a very +// small number of entries even when not strictly necessary, and slower to +// recover after pinning forces limit to be substantially exceeded. (Even with +// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate +// memory before discovering it is over the block cache capacity, so this +// should not be a detectable regression in respecting memory limits, except +// on exceptionally small caches.) +// * In some cases, erased or duplicated entries might not be freed +// immediately. They will eventually be freed by eviction from further Inserts. +// * Internal metadata can overflow if the number of simultaneous references +// to a cache handle reaches many millions. +// +// High-level eviction algorithm +// ----------------------------- +// A score (or "countdown") is maintained for each entry, initially determined +// by priority. The score is incremented on each Lookup, up to a max of 3, +// though is easily returned to previous state if useful=false with Release. +// During CLOCK-style eviction iteration, entries with score > 0 are +// decremented if currently unreferenced and entries with score == 0 are +// evicted if currently unreferenced. Note that scoring might not be perfect +// because entries can be referenced transiently within the cache even when +// there are no outside references to the entry. 
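// To make the countdown scheme above concrete, the following standalone,
// non-atomic sketch shows one CLOCK visit. SimpleEntry and ClockStepSketch
// are hypothetical names for illustration only; the actual implementation
// packs this state into a single atomic word per handle (ClockHandle::meta,
// further down) rather than plain fields.

#include <cstddef>
#include <cstdint>
#include <vector>

struct SimpleEntry {
  uint32_t refs = 0;       // outstanding references (pinned if > 0)
  uint32_t countdown = 0;  // 0 means: evict on next unreferenced visit
  bool occupied = false;
};

// One CLOCK visit of the entry at `pos`; returns true if it was evicted.
inline bool ClockStepSketch(std::vector<SimpleEntry>& table, size_t pos) {
  SimpleEntry& e = table[pos];
  if (!e.occupied || e.refs > 0) {
    return false;  // empty or currently referenced: skip, do not age
  }
  if (e.countdown > 0) {
    --e.countdown;  // age the unreferenced entry
    return false;
  }
  e.occupied = false;  // countdown exhausted while unreferenced: evict
  return true;
}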
+// +// Cache sharding like LRUCache is used to reduce contention on usage+eviction +// state, though here the performance improvement from more shards is small, +// and (as noted above) potentially detrimental if shard capacity is too close +// to largest entry size. Here cache sharding mostly only affects cache update +// (Insert / Erase) performance, not read performance. +// +// Read efficiency (hot path) +// -------------------------- +// Mostly to minimize the cost of accessing metadata blocks with +// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and +// Release. In terms of concurrency, at a minimum, these operations have +// to do reference counting (and Lookup has to compare full keys in a safe +// way). Can we fold in all the other metadata tracking *for free* with +// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume +// for the moment that Lookup succeeds on the first probe.) +// +// We have a clever way of encoding an entry's reference count and countdown +// clock so that Lookup and Release are each usually a single atomic addition. +// In a single metadata word we have both an "acquire" count, incremented by +// Lookup, and a "release" count, incremented by Release. If useful=false, +// Release can instead decrement the acquire count. Thus the current ref +// count is (acquires - releases), and the countdown clock is min(3, acquires). +// Note that only unreferenced entries (acquires == releases) are eligible +// for CLOCK manipulation and eviction. We tolerate use of more expensive +// compare_exchange operations for cache writes (insertions and erasures). +// +// In a cache receiving many reads and little or no writes, it is possible +// for the acquire and release counters to overflow. Assuming the *current* +// refcount never reaches to many millions, we only have to correct for +// overflow in both counters in Release, not in Lookup. The overflow check +// should be only 1-2 CPU cycles per Release because it is a predictable +// branch on a simple condition on data already in registers. +// +// Slot states +// ----------- +// We encode a state indicator into the same metadata word with the +// acquire and release counters. This allows bigger state transitions to +// be atomic. States: // -// In RocksDB, a Cache is a concurrent unordered dictionary that supports -// external references (a.k.a. user references). A ClockCache is a type of Cache -// that uses the clock algorithm as its eviction policy. Internally, a -// ClockCache is an open-addressed hash table that stores all KV pairs in a -// large array. Every slot in the hash table is a ClockHandle, which holds a KV -// pair plus some additional metadata that controls the different aspects of the -// cache: external references, the hashing mechanism, concurrent access and the -// clock algorithm. +// * Empty - slot is not in use and unowned. All other metadata and data is +// in an undefined state. +// * Construction - slot is exclusively owned by one thread, the thread +// successfully entering this state, for populating or freeing data. +// * Shareable (group) - slot holds an entry with counted references for +// pinning and reading, including +// * Visible - slot holds an entry that can be returned by Lookup +// * Invisible - slot holds an entry that is not visible to Lookup +// (erased by user) but can be read by existing references, and ref count +// changed by Ref and Release. 
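// As a concrete illustration of the counter encoding described above, the
// following standalone sketch mirrors the 30-bit acquire / 30-bit release
// layout of ClockHandle::meta (defined later in this header). RefCountSketch
// and AcquireRefSketch are hypothetical helpers; the real Lookup folds a
// visibility check into the same single atomic add.

#include <atomic>
#include <cstdint>

constexpr int kSketchCounterNumBits = 30;
constexpr uint64_t kSketchCounterMask =
    (uint64_t{1} << kSketchCounterNumBits) - 1;

// Current ref count is (acquires - releases), both counters taken mod 2^30.
inline uint64_t RefCountSketch(uint64_t meta) {
  uint64_t acquires = meta & kSketchCounterMask;
  uint64_t releases = (meta >> kSketchCounterNumBits) & kSketchCounterMask;
  return (acquires - releases) & kSketchCounterMask;
}

// Taking a reference on the hot path is then a single atomic add of one
// acquire unit, which also advances the countdown clock of an unreferenced
// entry toward its maximum.
inline void AcquireRefSketch(std::atomic<uint64_t>& meta) {
  meta.fetch_add(uint64_t{1}, std::memory_order_acquire);
}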
// +// A special case is "detached" entries, which are heap-allocated handles +// not in the table. They are always Invisible and freed on zero refs. // -// 2. EXTERNAL REFERENCES +// State transitions: +// Empty -> Construction (in Insert): The encoding of state enables Insert to +// perform an optimistic atomic bitwise-or to take ownership if a slot is +// empty, or otherwise make no state change. // -// An externally referenced handle can't be deleted (either evicted by the clock -// algorithm, or explicitly deleted) or replaced by a new version (via an insert -// of the same key) until all external references to it have been released by -// the users. ClockHandles have two members to support external references: -// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0, -// the handle is externally referenced. Updates that intend to modify the -// handle will refrain from doing so. Eventually, when all references are -// released, we have EXTERNAL_REFS == 0, and updates can operate normally on -// the handle. -// - WILL_BE_DELETED flag: An handle is marked for deletion when an operation -// decides the handle should be deleted. This happens either when the last -// reference to a handle is released (and the release operation is instructed -// to delete on last reference) or on when a delete operation is called on -// the item. This flag is needed because an externally referenced handle -// can't be immediately deleted. In these cases, the flag will be later read -// and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is -// used not only to defer deletions, but also as a barrier for external -// references: once WILL_BE_DELETED is set, lookups (which are the most -// common way to acquire new external references) will ignore the handle. -// For this reason, when WILL_BE_DELETED is set, we say the handle is -// invisible (and, otherwise, that it's visible). +// Construction -> Visible (in Insert): This can be a simple assignment to the +// metadata word because the current thread has exclusive ownership and other +// metadata is meaningless. // +// Visible -> Invisible (in Erase): This can be a bitwise-and while holding +// a shared reference, which is safe because the change is idempotent (in case +// of parallel Erase). By the way, we never go Invisible->Visible. // -// 3. HASHING AND COLLISION RESOLUTION +// Shareable -> Construction (in Evict part of Insert, in Erase, and in +// Release if Invisible): This is for starting to freeing/deleting an +// unreferenced entry. We have to use compare_exchange to ensure we only make +// this transition when there are zero refs. // -// ClockCache uses an open-addressed hash table to store the handles. -// We use a variant of tombstones to manage collisions: every slot keeps a -// count of how many KV pairs that are currently in the cache have probed the -// slot in an attempt to insert. Probes are generated with double-hashing -// (although the code can be easily modified to use other probing schemes, like -// linear probing). +// Construction -> Empty (in same places): This is for completing free/delete +// of an entry. A "release" atomic store suffices, as we have exclusive +// ownership of the slot but have to ensure none of the data member reads are +// re-ordered after committing the state transition. // -// A slot in the hash table can be in a few different states: -// - Element: The slot contains an element. This is indicated with the -// IS_ELEMENT flag. 
Element can be sub-classified depending on the -// value of WILL_BE_DELETED: -// * Visible element. -// * Invisible element. -// - Tombstone: The slot doesn't contain an element, but there is some other -// element that probed this slot during its insertion. -// - Empty: The slot is unused---it's neither an element nor a tombstone. +// Insert +// ------ +// If Insert were to guarantee replacing an existing entry for a key, there +// would be complications for concurrency and efficiency. First, consider how +// many probes to get to an entry. To ensure Lookup never waits and +// availability of a key is uninterrupted, we would need to use a different +// slot for a new entry for the same key. This means it is most likely in a +// later probing position than the old version, which should soon be removed. +// (Also, an entry is too big to replace atomically, even if no current refs.) // -// A slot cycles through the following sequence of states: -// empty or tombstone --> visible element --> invisible element --> -// empty or tombstone. Initially a slot is available---it's either -// empty or a tombstone. As soon as a KV pair is written into the slot, it -// becomes a visible element. At some point, the handle will be deleted -// by an explicit delete operation, the eviction algorithm, or an overwriting -// insert. In either case, the handle is marked for deletion. When the an -// attempt to delete the element finally succeeds, the slot is freed up -// and becomes available again. +// However, overwrite capability is not really needed by RocksDB. Also, we +// know from our "redundant" stats that overwrites are very rare for the block +// cache, so we should not spend much to make them effective. // +// So instead we Insert as soon as we find an empty slot in the probing +// sequence without seeing an existing (visible) entry for the same key. This +// way we only insert if we can improve the probing performance, and we don't +// need to probe beyond our insert position, assuming we are willing to let +// the previous entry for the same key die of old age (eventual eviction from +// not being used). We can reach a similar state with concurrent insertions, +// where one will pass over the other while it is "under construction." +// This temporary duplication is acceptable for RocksDB block cache because +// we know redundant insertion is rare. // -// 4. CONCURRENCY +// Another problem to solve is what to return to the caller when we find an +// existing entry whose probing position we cannot improve on, or when the +// table occupancy limit has been reached. If strict_capacity_limit=false, +// we must never fail Insert, and if a Handle* is provided, we have to return +// a usable Cache handle on success. The solution to this (typically rare) +// problem is "detached" handles, which are usable by the caller but not +// actually available for Lookup in the Cache. Detached handles are allocated +// independently on the heap and specially marked so that they are freed on +// the heap when their last reference is released. // -// ClockCache is lock-free. At a high level, we synchronize the operations -// using a read-prioritized, non-blocking variant of RW locks on every slot of -// the hash table. To do this we generalize the concept of reference: -// - Internal reference: Taken by a thread that is attempting to read a slot -// or do a very precise type of update. -// - Exclusive reference: Taken by a thread that is attempting to write a -// a slot extensively. 
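// The "detached" handle fallback described above can be pictured with this
// standalone sketch. DetachedEntrySketch, MakeDetachedSketch, and
// FreeDetachedSketch are hypothetical names; the real DetachedInsert
// (declared on ClockCacheShard below) presumably does the analogous
// bookkeeping using ClockHandleMoreData, the `detached` flag, and the
// shard's detached-usage counter.

#include <cstddef>

struct DetachedEntrySketch {
  void* value = nullptr;
  size_t charge = 0;
  bool detached = false;  // mirrors ClockHandle::detached
};

inline DetachedEntrySketch* MakeDetachedSketch(void* value, size_t charge,
                                               size_t& detached_usage) {
  auto* h = new DetachedEntrySketch{value, charge, /*detached=*/true};
  detached_usage += charge;  // tracked separately from table usage
  return h;
}

inline void FreeDetachedSketch(DetachedEntrySketch* h,
                               size_t& detached_usage) {
  detached_usage -= h->charge;
  delete h;  // in the real cache this happens when the last ref is released
}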
+// Usage on capacity +// ----------------- +// Insert takes different approaches to usage tracking depending on +// strict_capacity_limit setting. If true, we enforce a kind of strong +// consistency where compare-exchange is used to ensure the usage number never +// exceeds its limit, and provide threads with an authoritative signal on how +// much "usage" they have taken ownership of. With strict_capacity_limit=false, +// we use a kind of "eventual consistency" where all threads Inserting to the +// same cache shard might race on reserving the same space, but the +// over-commitment will be worked out in later insertions. It is kind of a +// dance because we don't want threads racing each other too much on paying +// down the over-commitment (with eviction) either. // -// We defer the precise definitions to the comments in the code below. -// A crucial feature of our references is that attempting to take one never -// blocks the thread. Another important feature is that readers are -// prioritized, as they use extremely fast synchronization primitives---they -// use atomic arithmetic/bit operations, but no compare-and-swaps (which are -// much slower). +// Eviction +// -------- +// A key part of Insert is evicting some entries currently unreferenced to +// make room for new entries. The high-level eviction algorithm is described +// above, but the details are also interesting. A key part is parallelizing +// eviction with a single CLOCK pointer. This works by each thread working on +// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK- +// updating or evicting the incremented-over slot(s). To reduce contention at +// the cost of possibly evicting too much, each thread increments the clock +// pointer by 4, so commits to updating at least 4 slots per batch. As +// described above, a CLOCK update will decrement the "countdown" of +// unreferenced entries, or evict unreferenced entries with zero countdown. +// Referenced entries are not updated, because we (presumably) don't want +// long-referenced entries to age while referenced. Note however that we +// cannot distinguish transiently referenced entries from cache user +// references, so some CLOCK updates might be somewhat arbitrarily skipped. +// This is OK as long as it is rare enough that eviction order is still +// pretty good. // -// Internal references are used by threads to read slots during a probing -// sequence, making them the most common references (probing is performed -// in almost every operation, not just lookups). During a lookup, once -// the target element is found, and just before the handle is handed over -// to the user, an internal reference is converted into an external reference. -// During an update operation, once the target slot is found, an internal -// reference is converted into an exclusive reference. Interestingly, we -// can't atomically upgrade from internal to exclusive, or we may run into a -// deadlock. Releasing the internal reference and then taking an exclusive -// reference avoids the deadlock, but then the handle may change inbetween. -// One of the key observations we use in our implementation is that we can -// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED. +// There is no synchronization on the completion of the CLOCK updates, so it +// is theoretically possible for another thread to cycle back around and have +// two threads racing on CLOCK updates to the same slot. 
Thus, we cannot rely +// on any implied exclusivity to make the updates or eviction more efficient. +// These updates use an opportunistic compare-exchange (no loop), where a +// racing thread might cause the update to be skipped without retry, but in +// such case the update is likely not needed because the most likely update +// to an entry is that it has become referenced. (TODO: test efficiency of +// avoiding compare-exchange loop) // -// Distinguishing internal from external references is useful for two reasons: -// - Internal references are short lived, but external references are typically -// not. This is helpful when acquiring an exclusive ref: if there are any -// external references to the item, it's probably not worth waiting until -// they go away. -// - We can precisely determine when there are no more external references to a -// handle, and proceed to mark it for deletion. This is useful when users -// release external references. +// Release +// ------- +// In the common case, Release is a simple atomic increment of the release +// counter. There is a simple overflow check that only does another atomic +// update in extremely rare cases, so costs almost nothing. // +// If the Release specifies "not useful", we can instead decrement the +// acquire counter, which returns to the same CLOCK state as before Lookup +// or Ref. // -// 5. CLOCK ALGORITHM +// Adding a check for over-full cache on every release to zero-refs would +// likely be somewhat expensive, increasing read contention on cache shard +// metadata. Instead we are less aggressive about deleting entries right +// away in those cases. // -// The clock algorithm circularly sweeps through the hash table to find the next -// victim. Recall that handles that are referenced are not evictable; the clock -// algorithm never picks those. We use different clock priorities: NONE, LOW, -// MEDIUM and HIGH. Priorities LOW, MEDIUM and HIGH represent how close an -// element is from being evicted, LOW being the closest to evicted. NONE means -// the slot is not evictable. NONE priority is used in one of the following -// cases: -// (a) the slot doesn't contain an element, or -// (b) the slot contains an externally referenced element, or -// (c) the slot contains an element that used to be externally referenced, -// and the clock pointer has not swept through the slot since the element -// stopped being externally referenced. -// ---------------------------------------------------------------------------- +// However Release tries to immediately delete entries reaching zero refs +// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already +// marked invisible. Both of these are checks on values already in CPU +// registers so do not increase cross-CPU contention when not applicable. +// When applicable, they use a compare-exchange loop to take exclusive +// ownership of the slot for freeing the entry. These are rare cases +// that should not usually affect performance. +// +// Erase +// ----- +// Searches for an entry like Lookup but moves it to Invisible state if found. +// This state transition is with bit operations so is idempotent and safely +// done while only holding a shared "read" reference. Like Release, it makes +// a best effort to immediately release an Invisible entry that reaches zero +// refs, but there are some corner cases where it will only be freed by the +// clock eviction process. 
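// To illustrate the Release fast paths just described, here is a standalone
// sketch using the same 30-bit counter layout (ReleaseSketch is a
// hypothetical helper; the real code additionally handles counter overflow
// correction and the erase_if_last_ref / Invisible cases):

#include <atomic>
#include <cstdint>

inline void ReleaseSketch(std::atomic<uint64_t>& meta, bool useful) {
  constexpr int kCounterNumBits = 30;
  constexpr uint64_t kAcquireIncrement = uint64_t{1};
  constexpr uint64_t kReleaseIncrement = uint64_t{1} << kCounterNumBits;
  if (useful) {
    // Common case: bump the release counter. The ref count
    // (acquires - releases) drops by one and the countdown credit earned by
    // the preceding Lookup is kept.
    meta.fetch_add(kReleaseIncrement, std::memory_order_release);
  } else {
    // useful=false: take back the acquire increment instead, returning the
    // countdown clock to its pre-Lookup state.
    meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
  }
}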
+ +// ----------------------------------------------------------------------- // // The load factor p is a real number in (0, 1) such that at all // times at most a fraction p of all slots, without counting tombstones, -// are occupied by elements. This means that the probability that a -// random probe hits an empty slot is at most p, and thus at most 1/p probes +// are occupied by elements. This means that the probability that a random +// probe hits an occupied slot is at most p, and thus at most 1/p probes // are required on average. For example, p = 70% implies that between 1 and 2 // probes are needed on average (bear in mind that this reasoning doesn't -// consider the effects of clustering over time). +// consider the effects of clustering over time, which should be negligible +// with double hashing). // Because the size of the hash table is always rounded up to the next // power of 2, p is really an upper bound on the actual load factor---the // actual load factor is anywhere between p/2 and p. This is a bit wasteful, @@ -174,440 +295,119 @@ class ClockCacheTest; // Since space cost is dominated by the values (the LSM blocks), // overprovisioning the table with metadata only increases the total cache space // usage by a tiny fraction. -constexpr double kLoadFactor = 0.35; +constexpr double kLoadFactor = 0.7; // The user can exceed kLoadFactor if the sizes of the inserted values don't -// match estimated_value_size, or if strict_capacity_limit == false. To -// avoid a performance drop, we set a strict upper bound on the load factor. -constexpr double kStrictLoadFactor = 0.7; - -// Maximum number of spins when trying to acquire a ref. -// TODO(Guido) This value was set arbitrarily. Is it appropriate? -// What's the best way to bound the spinning? -constexpr uint32_t kSpinsPerTry = 100000; - -// Arbitrary seeds. -constexpr uint32_t kProbingSeed1 = 0xbc9f1d34; -constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5; - -struct ClockHandle { - void* value; - Cache::DeleterFn deleter; - uint32_t hash; - size_t total_charge; - std::array key_data; - - static constexpr uint8_t kIsElementOffset = 0; - static constexpr uint8_t kClockPriorityOffset = 1; - static constexpr uint8_t kIsHitOffset = 3; - static constexpr uint8_t kCachePriorityOffset = 4; - - enum Flags : uint8_t { - // Whether the slot is in use by an element. - IS_ELEMENT = 1 << kIsElementOffset, - // Clock priorities. Represents how close a handle is from being evictable. - CLOCK_PRIORITY = 3 << kClockPriorityOffset, - // Whether the handle has been looked up after its insertion. - HAS_HIT = 1 << kIsHitOffset, - // The value of Cache::Priority of the handle. - CACHE_PRIORITY = 1 << kCachePriorityOffset, - }; - - std::atomic flags; - - enum ClockPriority : uint8_t { - NONE = (0 << kClockPriorityOffset), - LOW = (1 << kClockPriorityOffset), - MEDIUM = (2 << kClockPriorityOffset), - HIGH = (3 << kClockPriorityOffset) - }; - - // The number of elements that hash to this slot or a lower one, but wind - // up in this slot or a higher one. - std::atomic displacements; - - static constexpr uint8_t kExternalRefsOffset = 0; - static constexpr uint8_t kSharedRefsOffset = 15; - static constexpr uint8_t kExclusiveRefOffset = 30; - static constexpr uint8_t kWillBeDeletedOffset = 31; - - enum Refs : uint32_t { - // Synchronization model: - // - An external reference guarantees that hash, value, key_data - // and the IS_ELEMENT flag are not modified. Doesn't allow - // any writes. 
- // - An internal reference has the same guarantees as an - // external reference, and additionally allows the following - // idempotent updates on the handle: - // * set CLOCK_PRIORITY to NONE; - // * set the HAS_HIT bit; - // * set the WILL_BE_DELETED bit. - // - A shared reference is either an external reference or an - // internal reference. - // - An exclusive reference guarantees that no other thread has a shared - // or exclusive reference to the handle, and allows writes - // on the handle. - - // Number of external references to the slot. - EXTERNAL_REFS = ((uint32_t{1} << 15) - 1) - << kExternalRefsOffset, // Bits 0, ..., 14 - // Number of internal references plus external references to the slot. - SHARED_REFS = ((uint32_t{1} << 15) - 1) - << kSharedRefsOffset, // Bits 15, ..., 29 - // Whether a thread has an exclusive reference to the slot. - EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30 - // Whether the handle will be deleted soon. When this bit is set, new - // internal references to this handle stop being accepted. - // External references may still be granted---they can be created from - // existing external references, or converting from existing internal - // references. - WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31 - - // Having these 4 fields in a single variable allows us to support the - // following operations efficiently: - // - Convert an internal reference into an external reference in a single - // atomic arithmetic operation. - // - Attempt to take a shared reference using a single atomic arithmetic - // operation. This is because we can increment the internal ref count - // as well as checking whether the entry is marked for deletion using a - // single atomic arithmetic operation (and one non-atomic comparison). - }; - - static constexpr uint32_t kOneInternalRef = 0x8000; - static constexpr uint32_t kOneExternalRef = 0x8001; - - std::atomic refs; +// match estimated_value_size, or in some rare cases with +// strict_capacity_limit == false. To avoid degenerate performance, we set a +// strict upper bound on the load factor. +constexpr double kStrictLoadFactor = 0.84; - // True iff the handle is allocated separately from hash table. - bool detached; - - ClockHandle() - : value(nullptr), - deleter(nullptr), - hash(0), - total_charge(0), - flags(0), - displacements(0), - refs(0), - detached(false) { - SetWillBeDeleted(false); - SetIsElement(false); - SetClockPriority(ClockPriority::NONE); - SetCachePriority(Cache::Priority::LOW); - key_data.fill(0); - } +using CacheKeyBytes = std::array; - // The copy ctor and assignment operator are only used to copy a handle - // for immediate deletion. (We need to copy because the slot may become - // re-used before the deletion is completed.) We only copy the necessary - // members to carry out the deletion. In particular, we don't need - // the atomic members. 
- ClockHandle(const ClockHandle& other) { *this = other; } - - void operator=(const ClockHandle& other) { - value = other.value; - deleter = other.deleter; - key_data = other.key_data; - hash = other.hash; - total_charge = other.total_charge; - } +struct ClockHandleBasicData { + void* value = nullptr; + Cache::DeleterFn deleter = nullptr; + CacheKeyBytes key = {}; + size_t total_charge = 0; - Slice key() const { return Slice(key_data.data(), kCacheKeySize); } + Slice KeySlice() const { return Slice(key.data(), kCacheKeySize); } - void FreeData() { + void FreeData() const { if (deleter) { - (*deleter)(key(), value); + (*deleter)(KeySlice(), value); } } +}; + +struct ClockHandleMoreData : public ClockHandleBasicData { + uint32_t hash = 0; +}; + +// Target size to be exactly a common cache line size (see static_assert in +// clock_cache.cc) +struct ALIGN_AS(64U) ClockHandle : public ClockHandleMoreData { + // Constants for handling the atomic `meta` word, which tracks most of the + // state of the handle. The meta word looks like this: + // low bits high bits + // ----------------------------------------------------------------------- + // | acquire counter | release counter | state marker | + // ----------------------------------------------------------------------- + + // For reading or updating counters in meta word. + static constexpr uint8_t kCounterNumBits = 30; + static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1; + + static constexpr uint8_t kAcquireCounterShift = 0; + static constexpr uint64_t kAcquireIncrement = uint64_t{1} + << kAcquireCounterShift; + static constexpr uint8_t kReleaseCounterShift = kCounterNumBits; + static constexpr uint64_t kReleaseIncrement = uint64_t{1} + << kReleaseCounterShift; + + // For reading or updating the state marker in meta word + static constexpr uint8_t kStateShift = 2U * kCounterNumBits; + + // Bits contribution to state marker. + // Occupied means any state other than empty + static constexpr uint8_t kStateOccupiedBit = 0b100; + // Shareable means the entry is reference counted (visible or invisible) + // (only set if also occupied) + static constexpr uint8_t kStateShareableBit = 0b010; + // Visible is only set if also shareable + static constexpr uint8_t kStateVisibleBit = 0b001; + + // Complete state markers (not shifted into full word) + static constexpr uint8_t kStateEmpty = 0b000; + static constexpr uint8_t kStateConstruction = kStateOccupiedBit; + static constexpr uint8_t kStateInvisible = + kStateOccupiedBit | kStateShareableBit; + static constexpr uint8_t kStateVisible = + kStateOccupiedBit | kStateShareableBit | kStateVisibleBit; + + // Constants for initializing the countdown clock. (Countdown clock is only + // in effect with zero refs, acquire counter == release counter, and in that + // case the countdown clock == both of those counters.) + static constexpr uint8_t kHighCountdown = 3; + static constexpr uint8_t kLowCountdown = 2; + static constexpr uint8_t kBottomCountdown = 1; + // During clock update, treat any countdown clock value greater than this + // value the same as this value. + static constexpr uint8_t kMaxCountdown = kHighCountdown; + // TODO: make these coundown values tuning parameters for eviction? + + // See above + std::atomic meta{}; + // The number of elements that hash to this slot or a lower one, but wind + // up in this slot or a higher one. + std::atomic displacements{}; - // Calculate the memory usage by metadata. 
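// Given the bit layout above, the Empty -> Construction transition described
// in the "Slot states" overview can be attempted with a single fetch_or, as
// in this standalone sketch (TryBeginConstructionSketch is a hypothetical
// helper, not a member of ClockHandle):

#include <atomic>
#include <cstdint>

inline bool TryBeginConstructionSketch(std::atomic<uint64_t>& meta) {
  constexpr int kStateShift = 60;  // 2 * 30 counter bits, as above
  constexpr uint64_t kOccupiedBit = uint64_t{0b100} << kStateShift;
  uint64_t old_meta = meta.fetch_or(kOccupiedBit, std::memory_order_acq_rel);
  // If the occupied bit was already set, the or changed nothing and the slot
  // belongs to someone else; ownership is taken only from a prior Empty state.
  return (old_meta & kOccupiedBit) == 0;
}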
- inline size_t CalcMetaCharge( - CacheMetadataChargePolicy metadata_charge_policy) const { - if (metadata_charge_policy != kFullChargeCacheMetadata) { - return 0; - } else { - // #ifdef ROCKSDB_MALLOC_USABLE_SIZE - // return malloc_usable_size( - // const_cast(static_cast(this))); - // #else - // TODO(Guido) malloc_usable_size only works when we call it on - // a pointer allocated with malloc. Because our handles are all - // allocated in a single shot as an array, the user can't call - // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle - // pointer returned by the cache. Moreover, malloc_usable_size - // expects a heap-allocated handle, but sometimes in our code we - // wish to pass a stack-allocated handle (this is only a performance - // concern). - // What is the right way to compute metadata charges with pre-allocated - // handles? - return sizeof(ClockHandle); - // #endif - } - } - - inline void CalcTotalCharge( - size_t charge, CacheMetadataChargePolicy metadata_charge_policy) { - total_charge = charge + CalcMetaCharge(metadata_charge_policy); - } - - inline size_t GetCharge( - CacheMetadataChargePolicy metadata_charge_policy) const { - size_t meta_charge = CalcMetaCharge(metadata_charge_policy); - assert(total_charge >= meta_charge); - return total_charge - meta_charge; - } - - // flags functions. - - bool IsElement() const { return flags & Flags::IS_ELEMENT; } - - void SetIsElement(bool is_element) { - if (is_element) { - flags |= Flags::IS_ELEMENT; - } else { - flags &= static_cast(~Flags::IS_ELEMENT); - } - } - - bool HasHit() const { return flags & HAS_HIT; } - - void SetHit() { flags |= HAS_HIT; } - - Cache::Priority GetCachePriority() const { - return static_cast(flags & CACHE_PRIORITY); - } - - void SetCachePriority(Cache::Priority priority) { - if (priority == Cache::Priority::HIGH) { - flags |= Flags::CACHE_PRIORITY; - } else { - flags &= static_cast(~Flags::CACHE_PRIORITY); - } - } - - bool IsInClock() const { - return GetClockPriority() != ClockHandle::ClockPriority::NONE; - } - - ClockPriority GetClockPriority() const { - return static_cast(flags & Flags::CLOCK_PRIORITY); - } - - void SetClockPriority(ClockPriority priority) { - flags &= static_cast(~Flags::CLOCK_PRIORITY); - flags |= priority; - } - - void DecreaseClockPriority() { - uint8_t p = static_cast(flags & Flags::CLOCK_PRIORITY) >> - kClockPriorityOffset; - assert(p > 0); - p--; - flags &= static_cast(~Flags::CLOCK_PRIORITY); - ClockPriority new_priority = - static_cast(p << kClockPriorityOffset); - flags |= new_priority; - } - - bool IsDetached() { return detached; } - - void SetDetached() { detached = true; } - - inline bool IsEmpty() const { - return !this->IsElement() && this->displacements == 0; - } - - inline bool IsTombstone() const { - return !this->IsElement() && this->displacements > 0; - } - - inline bool Matches(const Slice& some_key, uint32_t some_hash) const { - return this->hash == some_hash && this->key() == some_key; - } - - // refs functions. - - inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; } - - void SetWillBeDeleted(bool will_be_deleted) { - if (will_be_deleted) { - refs |= WILL_BE_DELETED; - } else { - refs &= ~WILL_BE_DELETED; - } - } - - uint32_t ExternalRefs() const { - return (refs & EXTERNAL_REFS) >> kExternalRefsOffset; - } - - // Tries to take an internal ref. Returns true iff it succeeds. 
- inline bool TryInternalRef() { - if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) { - return true; - } - refs -= kOneInternalRef; - return false; - } - - // Tries to take an external ref. Returns true iff it succeeds. - inline bool TryExternalRef() { - if (!((refs += kOneExternalRef) & EXCLUSIVE_REF)) { - return true; - } - refs -= kOneExternalRef; - return false; - } - - // Tries to take an exclusive ref. Returns true iff it succeeds. - // TODO(Guido) After every TryExclusiveRef call, we always call - // WillBeDeleted(). We could save an atomic read by having an output parameter - // with the last value of refs. - inline bool TryExclusiveRef() { - uint32_t will_be_deleted = refs & WILL_BE_DELETED; - uint32_t expected = will_be_deleted; - return refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted); - } - - // Repeatedly tries to take an exclusive reference, but aborts as soon - // as an external or exclusive reference is detected (since the wait - // would presumably be too long). - inline bool SpinTryExclusiveRef() { - uint32_t expected = 0; - uint32_t will_be_deleted = 0; - uint32_t spins = kSpinsPerTry; - while (!refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted) && - spins--) { - std::this_thread::yield(); - if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) { - return false; - } - will_be_deleted = expected & WILL_BE_DELETED; - expected = will_be_deleted; - } - return true; - } - - // Take an external ref, assuming there is already one external ref - // to the handle. - void Ref() { - // TODO(Guido) Is it okay to assume that the existing external reference - // survives until this function returns? - refs += kOneExternalRef; - } - - inline void ReleaseExternalRef() { refs -= kOneExternalRef; } - - inline void ReleaseInternalRef() { refs -= kOneInternalRef; } - - inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); } - - // Downgrade an exclusive ref to external. - inline void ExclusiveToExternalRef() { - refs += kOneExternalRef; - ReleaseExclusiveRef(); - } - - // Convert an internal ref into external. - inline void InternalToExternalRef() { - refs += kOneExternalRef - kOneInternalRef; - } - + // True iff the handle is allocated separately from hash table. + bool detached = false; }; // struct ClockHandle class ClockHandleTable { public: - explicit ClockHandleTable(size_t capacity, int hash_bits); + explicit ClockHandleTable(int hash_bits, bool initial_charge_metadata); ~ClockHandleTable(); - // Returns a pointer to a visible handle matching the key/hash, or - // nullptr if not present. When an actual handle is produced, an - // internal reference is handed over. - ClockHandle* Lookup(const Slice& key, uint32_t hash); - - // Inserts a copy of h into the hash table. Returns a pointer to the - // inserted handle, or nullptr if no available slot was found. Every - // existing visible handle matching the key is already present in the - // hash table is marked as WILL_BE_DELETED. The deletion is also attempted, - // and, if the attempt is successful, the handle is inserted into the - // autovector deleted. When take_reference is true, the function hands - // over an external reference on the handle, and otherwise no reference is - // produced. - ClockHandle* Insert(ClockHandle* h, autovector* deleted, - bool take_reference); - - // Assigns h the appropriate clock priority, making it evictable. - void ClockOn(ClockHandle* h); - - // Makes h non-evictable. 
- void ClockOff(ClockHandle* h); - - // Runs the clock eviction algorithm until usage_ + charge is at most - // capacity_. - void ClockRun(size_t charge); - - // Remove h from the hash table. Requires an exclusive ref to h. - void Remove(ClockHandle* h, autovector* deleted); - - // Remove from the hash table all handles with matching key/hash along a - // probe sequence, starting from the given probe number. Doesn't - // require any references. - void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe, - autovector* deleted); - - void RemoveAll(const Slice& key, uint32_t hash, - autovector* deleted) { - uint32_t probe = 0; - RemoveAll(key, hash, probe, deleted); - } + Status Insert(const ClockHandleMoreData& proto, ClockHandle** handle, + Cache::Priority priority, size_t capacity, + bool strict_capacity_limit); - // Tries to remove h from the hash table. If the attempt is successful, - // the function hands over an exclusive ref to h. - bool TryRemove(ClockHandle* h, autovector* deleted); - - // Similar to TryRemove, except that it spins, increasing the chances of - // success. Requires that the caller thread has no shared ref to h. - bool SpinTryRemove(ClockHandle* h, autovector* deleted); - - // Call this function after an Insert, Remove, RemoveAll, TryRemove - // or SpinTryRemove. It frees the deleted values and updates the hash table - // metadata. - void Free(autovector* deleted); - - void ApplyToEntriesRange(std::function func, - uint32_t index_begin, uint32_t index_end, - bool apply_if_will_be_deleted) { - for (uint32_t i = index_begin; i < index_end; i++) { - ClockHandle* h = &array_[i]; - if (h->TryExclusiveRef()) { - if (h->IsElement() && - (apply_if_will_be_deleted || !h->WillBeDeleted())) { - func(h); - } - h->ReleaseExclusiveRef(); - } - } - } + ClockHandle* Lookup(const CacheKeyBytes& key, uint32_t hash); + + bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref); - void ConstApplyToEntriesRange(std::function func, + void Ref(ClockHandle& handle); + + void Erase(const CacheKeyBytes& key, uint32_t hash); + + void ConstApplyToEntriesRange(std::function func, uint32_t index_begin, uint32_t index_end, - bool apply_if_will_be_deleted) const { - for (uint32_t i = index_begin; i < index_end; i++) { - ClockHandle* h = &array_[i]; - // We take an external ref because we are handing over control - // to a user-defined function, and because the handle will not be - // modified. - if (h->TryExternalRef()) { - if (h->IsElement() && - (apply_if_will_be_deleted || !h->WillBeDeleted())) { - func(h); - } - h->ReleaseExternalRef(); - } - } - } + bool apply_if_will_be_deleted) const; + + void EraseUnRefEntries(); uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; } @@ -615,22 +415,29 @@ class ClockHandleTable { uint32_t GetOccupancyLimit() const { return occupancy_limit_; } - uint32_t GetOccupancy() const { return occupancy_; } + uint32_t GetOccupancy() const { + return occupancy_.load(std::memory_order_relaxed); + } - size_t GetUsage() const { return usage_; } + size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); } - size_t GetCapacity() const { return capacity_; } + size_t GetDetachedUsage() const { + return detached_usage_.load(std::memory_order_relaxed); + } - void SetCapacity(size_t capacity) { capacity_ = capacity; } + // Acquire/release N references + void TEST_RefN(ClockHandle& handle, size_t n); + void TEST_ReleaseN(ClockHandle* handle, size_t n); + private: // functions // Returns x mod 2^{length_bits_}. 
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; } - private: - // Extracts the element information from a handle (src), and assigns it - // to a hash table slot (dst). Doesn't touch displacements and refs, - // which are maintained by the hash table algorithm. - void Assign(ClockHandle* dst, ClockHandle* src); + // Runs the clock eviction algorithm trying to reclaim at least + // requested_charge. Returns how much is evicted, which could be less + // if it appears impossible to evict the requested amount without blocking. + void Evict(size_t requested_charge, size_t* freed_charge, + uint32_t* freed_count); // Returns the first slot in the probe sequence, starting from the given // probe number, with a handle e such that match(e) is true. At every @@ -643,26 +450,17 @@ class ClockHandleTable { // value of probe is one more than the last non-aborting probe during the // call. This is so that that the variable can be used to keep track of // progress across consecutive calls to FindSlot. - inline ClockHandle* FindSlot(const Slice& key, + inline ClockHandle* FindSlot(uint32_t hash, std::function match, std::function stop, std::function update, uint32_t& probe); - // Returns an available slot for the given key. All copies of the - // key found along the probing sequence until an available slot is - // found are marked for deletion. On each of them, a deletion is - // attempted, and when the attempt succeeds the slot is assigned to - // the new copy of the element. - ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash, - uint32_t& probe, - autovector* deleted); - - // After a failed FindSlot call (i.e., with answer -1) in - // FindAvailableSlot, this function fixes all displacements's - // starting from the 0-th probe, until the given probe. - void Rollback(const Slice& key, uint32_t probe); + // Re-decrement all displacements in probe path starting from beginning + // until (not including) the given handle + void Rollback(uint32_t hash, const ClockHandle* h); + private: // data // Number of hash bits used for table index. // The size of the table is 1 << length_bits_. const int length_bits_; @@ -673,27 +471,26 @@ class ClockHandleTable { // Maximum number of elements the user can store in the table. const uint32_t occupancy_limit_; - // Maximum total charge of all elements stored in the table. - size_t capacity_; + // Array of slots comprising the hash table. + const std::unique_ptr array_; // We partition the following members into different cache lines // to avoid false sharing among Lookup, Release, Erase and Insert // operations in ClockCacheShard. - ALIGN_AS(CACHE_LINE_SIZE) - // Array of slots comprising the hash table. - std::unique_ptr array_; - ALIGN_AS(CACHE_LINE_SIZE) // Clock algorithm sweep pointer. - std::atomic clock_pointer_; + std::atomic clock_pointer_{}; ALIGN_AS(CACHE_LINE_SIZE) // Number of elements in the table. - std::atomic occupancy_; + std::atomic occupancy_{}; - // Memory size for entries residing in the cache. - std::atomic usage_; + // Memory usage by entries tracked by the cache (including detached) + std::atomic usage_{}; + + // Part of usage by detached entries (not in table) + std::atomic detached_usage_{}; }; // class ClockHandleTable // A single shard of sharded cache. 
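// The probing arithmetic behind FindSlot and Rollback can be sketched
// standalone: with a power-of-two table, "mod table size" is a bit mask, and
// double hashing advances by an odd increment so the sequence visits every
// slot before repeating. ModTableSizeSketch and ProbeSlotSketch are
// hypothetical helpers, and the increment derivation here is illustrative
// only; the real FindSlot also threads match/stop/update callbacks and a
// shared probe counter through the loop.

#include <cstdint>

inline uint32_t ModTableSizeSketch(uint32_t x, int length_bits) {
  return x & ((uint32_t{1} << length_bits) - 1);
}

inline uint32_t ProbeSlotSketch(uint32_t hash, uint32_t probe,
                                int length_bits) {
  uint32_t base = ModTableSizeSketch(hash, length_bits);
  // Any odd increment is coprime with the power-of-two table size, so all
  // slots are reachable.
  uint32_t increment = (hash >> 16) | 1u;
  return ModTableSizeSketch(base + probe * increment, length_bits);
}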
@@ -704,58 +501,34 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { CacheMetadataChargePolicy metadata_charge_policy); ~ClockCacheShard() override = default; - // Separate from constructor so caller can easily make an array of ClockCache - // if current usage is more than new capacity, the function will attempt to - // free the needed space. + // TODO: document limitations void SetCapacity(size_t capacity) override; - // Set the flag to reject insertion if cache if full. void SetStrictCapacityLimit(bool strict_capacity_limit) override; - // Like Cache methods, but with an extra "hash" parameter. - // Insert an item into the hash table and, if handle is null, make it - // evictable by the clock algorithm. Older items are evicted as necessary. - // If the cache is full and free_handle_on_fail is true, the item is deleted - // and handle is set to nullptr. Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, Cache::Priority priority) override; - Status Insert(const Slice& key, uint32_t hash, void* value, - const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, Cache::Priority priority) override { - return Insert(key, hash, value, charge, helper->del_cb, handle, priority); - } - - Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* /*helper*/, - const Cache::CreateCallback& /*create_cb*/, - Cache::Priority /*priority*/, bool /*wait*/, - Statistics* /*stats*/) override { - return Lookup(key, hash); - } - Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; - bool Release(Cache::Handle* handle, bool /*useful*/, - bool erase_if_last_ref) override { - return Release(handle, erase_if_last_ref); - } + bool Release(Cache::Handle* handle, bool useful, + bool erase_if_last_ref) override; - bool IsReady(Cache::Handle* /*handle*/) override { return true; } - - void Wait(Cache::Handle* /*handle*/) override {} + bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; bool Ref(Cache::Handle* handle) override; - bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; - void Erase(const Slice& key, uint32_t hash) override; size_t GetUsage() const override; size_t GetPinnedUsage() const override; + size_t GetOccupancyCount() const override; + + size_t GetTableAddressCount() const override; + void ApplyToSomeEntries( const std::function& callback, @@ -765,29 +538,48 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { std::string GetPrintableOptions() const override { return std::string{}; } - private: + // SecondaryCache not yet supported + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } + + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) override { + return Lookup(key, hash); + } + + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + + void Wait(Cache::Handle* /*handle*/) override {} + + // Acquire/release N references + void TEST_RefN(Cache::Handle* handle, size_t n); + void TEST_ReleaseN(Cache::Handle* handle, size_t n); + + private: // functions friend class ClockCache; friend 
class ClockCacheTest; - ClockHandle* DetachedInsert(ClockHandle* h); - - // Returns the charge of a single handle. - static size_t CalcEstimatedHandleCharge( - size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy); + ClockHandle* DetachedInsert(const ClockHandleMoreData& h); // Returns the number of bits used to hash an element in the hash // table. static int CalcHashBits(size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy); - // Whether to reject insertion if cache reaches its full capacity. - std::atomic strict_capacity_limit_; + private: // data + ClockHandleTable table_; - // Handles allocated separately from the table. - std::atomic detached_usage_; + // Maximum total charge of all elements stored in the table. + std::atomic capacity_; - ClockHandleTable table_; + // Whether to reject insertion if cache reaches its full capacity. + std::atomic strict_capacity_limit_; }; // class ClockCacheShard class ClockCache diff --git a/cache/fast_lru_cache.cc b/cache/fast_lru_cache.cc index 817f3be18..f5f93800d 100644 --- a/cache/fast_lru_cache.cc +++ b/cache/fast_lru_cache.cc @@ -173,13 +173,13 @@ inline int LRUHandleTable::FindSlot(const Slice& key, LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : capacity_(capacity), + : CacheShard(metadata_charge_policy), + capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), table_( CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)), usage_(0), lru_usage_(0) { - set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list. lru_.next = &lru_; lru_.prev = &lru_; @@ -525,6 +525,16 @@ size_t LRUCacheShard::GetPinnedUsage() const { return usage_ - lru_usage_; } +size_t LRUCacheShard::GetOccupancyCount() const { + DMutexLock l(mutex_); + return table_.GetOccupancy(); +} + +size_t LRUCacheShard::GetTableAddressCount() const { + DMutexLock l(mutex_); + return table_.GetTableSize(); +} + std::string LRUCacheShard::GetPrintableOptions() const { return std::string{}; } LRUCache::LRUCache(size_t capacity, size_t estimated_value_size, diff --git a/cache/fast_lru_cache.h b/cache/fast_lru_cache.h index a02422beb..77aff8bab 100644 --- a/cache/fast_lru_cache.h +++ b/cache/fast_lru_cache.h @@ -368,6 +368,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { size_t GetUsage() const override; size_t GetPinnedUsage() const override; + size_t GetOccupancyCount() const override; + size_t GetTableAddressCount() const override; void ApplyToSomeEntries( const std::function& secondary_cache) - : capacity_(0), + : CacheShard(metadata_charge_policy), + capacity_(0), high_pri_pool_usage_(0), low_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), @@ -128,7 +129,6 @@ LRUCacheShard::LRUCacheShard( lru_usage_(0), mutex_(use_adaptive_mutex), secondary_cache_(secondary_cache) { - set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list. 
lru_.next = &lru_; lru_.prev = &lru_; @@ -759,6 +759,16 @@ size_t LRUCacheShard::GetPinnedUsage() const { return usage_ - lru_usage_; } +size_t LRUCacheShard::GetOccupancyCount() const { + DMutexLock l(mutex_); + return table_.GetOccupancyCount(); +} + +size_t LRUCacheShard::GetTableAddressCount() const { + DMutexLock l(mutex_); + return size_t{1} << table_.GetLengthBits(); +} + std::string LRUCacheShard::GetPrintableOptions() const { const int kBufferSize = 200; char buffer[kBufferSize]; diff --git a/cache/lru_cache.h b/cache/lru_cache.h index b60d5ac7b..6e642d04d 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -305,6 +305,8 @@ class LRUHandleTable { int GetLengthBits() const { return length_bits_; } + size_t GetOccupancyCount() const { return elems_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -394,6 +396,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; + virtual size_t GetOccupancyCount() const override; + virtual size_t GetTableAddressCount() const override; virtual void ApplyToSomeEntries( const std::function( port::cacheline_aligned_alloc(sizeof(ClockCacheShard))); - new (shard_) ClockCacheShard(capacity, 1, true /*strict_capacity_limit*/, + new (shard_) ClockCacheShard(capacity, 1, strict_capacity_limit, kDontChargeCacheMetadata); } @@ -539,21 +539,26 @@ class ClockCacheTest : public testing::Test { return Insert(std::string(kCacheKeySize, key), priority); } - Status Insert(char key, size_t len) { return Insert(std::string(len, key)); } + Status InsertWithLen(char key, size_t len) { + return Insert(std::string(len, key)); + } - bool Lookup(const std::string& key) { + bool Lookup(const std::string& key, bool useful = true) { auto handle = shard_->Lookup(key, 0 /*hash*/); if (handle) { - shard_->Release(handle); + shard_->Release(handle, useful, /*erase_if_last_ref=*/false); return true; } return false; } - bool Lookup(char key) { return Lookup(std::string(kCacheKeySize, key)); } + bool Lookup(char key, bool useful = true) { + return Lookup(std::string(kCacheKeySize, key), useful); + } void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); } +#if 0 // FIXME size_t CalcEstimatedHandleChargeWrapper( size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { @@ -583,106 +588,419 @@ class ClockCacheTest : public testing::Test { (1 << (hash_bits - 1) <= max_occupancy); } } +#endif - private: ClockCacheShard* shard_ = nullptr; }; -TEST_F(ClockCacheTest, Validate) { +TEST_F(ClockCacheTest, Misc) { NewShard(3); - EXPECT_OK(Insert('a', 16)); - EXPECT_NOK(Insert('b', 15)); - EXPECT_OK(Insert('b', 16)); - EXPECT_NOK(Insert('c', 17)); - EXPECT_NOK(Insert('d', 1000)); - EXPECT_NOK(Insert('e', 11)); - EXPECT_NOK(Insert('f', 0)); -} -TEST_F(ClockCacheTest, ClockPriorityTest) { - ClockHandle handle; - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE); - handle.SetClockPriority(ClockHandle::ClockPriority::HIGH); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::HIGH); - handle.DecreaseClockPriority(); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM); - handle.DecreaseClockPriority(); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::LOW); - handle.SetClockPriority(ClockHandle::ClockPriority::MEDIUM); - EXPECT_EQ(handle.GetClockPriority(), 
ClockHandle::ClockPriority::MEDIUM); - handle.SetClockPriority(ClockHandle::ClockPriority::NONE); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE); - handle.SetClockPriority(ClockHandle::ClockPriority::MEDIUM); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM); - handle.DecreaseClockPriority(); - handle.DecreaseClockPriority(); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE); + // Key size stuff + EXPECT_OK(InsertWithLen('a', 16)); + EXPECT_NOK(InsertWithLen('b', 15)); + EXPECT_OK(InsertWithLen('b', 16)); + EXPECT_NOK(InsertWithLen('c', 17)); + EXPECT_NOK(InsertWithLen('d', 1000)); + EXPECT_NOK(InsertWithLen('e', 11)); + EXPECT_NOK(InsertWithLen('f', 0)); + + // Some of this is motivated by code coverage + std::string wrong_size_key(15, 'x'); + EXPECT_FALSE(Lookup(wrong_size_key)); + EXPECT_FALSE(shard_->Ref(nullptr)); + EXPECT_FALSE(shard_->Release(nullptr)); + shard_->Erase(wrong_size_key, /*hash*/ 42); // no-op } -TEST_F(ClockCacheTest, CalcHashBitsTest) { - size_t capacity; - size_t estimated_value_size; - double max_occupancy; - int hash_bits; - CacheMetadataChargePolicy metadata_charge_policy; +TEST_F(ClockCacheTest, Limits) { + NewShard(3, false /*strict_capacity_limit*/); + for (bool strict_capacity_limit : {false, true, false}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); - // Vary the cache capacity, fix the element charge. - for (int i = 0; i < 2048; i++) { - capacity = i; - estimated_value_size = 0; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, - metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); + // Also tests switching between strict limit and not + shard_->SetStrictCapacityLimit(strict_capacity_limit); + + std::string key(16, 'x'); + + // Single entry charge beyond capacity + { + Status s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, + 5 /*charge*/, nullptr /*deleter*/, + nullptr /*handle*/, Cache::Priority::LOW); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + } + + // Single entry fills capacity + { + Cache::Handle* h; + ASSERT_OK(shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 3 /*charge*/, + nullptr /*deleter*/, &h, Cache::Priority::LOW)); + // Try to insert more + Status s = Insert('a'); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + // Release entry filling capacity. + // Cover useful = false case. + shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/); + } + + // Insert more than table size can handle (cleverly using zero-charge + // entries) to exceed occupancy limit. 
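The zero-charge insertions in the block below target the occupancy limit rather than the charge limit: an open-addressed table can only track on the order of load_factor * GetTableAddressCount() entries no matter how cheap they are, so Insert must evict, refuse (Status::MemoryLimit under the strict limit, which the test checks via IsMemoryLimit()), or succeed without the table ever becoming completely full. A rough sketch of such a gate, assuming the limit is derived from the address count and a fixed load factor; the function name and constant are illustrative:

#include <cstddef>
#include "rocksdb/status.h"

// Illustrative only: the real shard derives its occupancy limit internally.
constexpr double kSketchLoadFactor = 0.7;

ROCKSDB_NAMESPACE::Status SketchOccupancyGate(size_t occupancy,
                                              size_t table_address_count,
                                              bool strict_capacity_limit) {
  const size_t occupancy_limit =
      static_cast<size_t>(table_address_count * kSketchLoadFactor);
  if (occupancy + 1 > occupancy_limit) {
    if (strict_capacity_limit) {
      // Matches what the test expects via IsMemoryLimit().
      return ROCKSDB_NAMESPACE::Status::MemoryLimit();
    }
    // Non-strict: the real Insert would try to evict here, and may proceed
    // without handing back a pinned handle.
  }
  return ROCKSDB_NAMESPACE::Status::OK();
}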
+ { + size_t n = shard_->GetTableAddressCount() + 1; + std::unique_ptr ha { new Cache::Handle* [n] {} }; + Status s; + for (size_t i = 0; i < n && s.ok(); ++i) { + EncodeFixed64(&key[0], i); + s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 0 /*charge*/, + nullptr /*deleter*/, &ha[i], Cache::Priority::LOW); + if (i == 0) { + EXPECT_OK(s); + } + } + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + // Same result if not keeping a reference + s = Insert('a'); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + + // Regardless, we didn't allow table to actually get full + EXPECT_LT(shard_->GetOccupancyCount(), shard_->GetTableAddressCount()); + + // Release handles + for (size_t i = 0; i < n; ++i) { + if (ha[i]) { + shard_->Release(ha[i]); + } + } + } } +} - // Fix the cache capacity, vary the element charge. - for (int i = 0; i < 1024; i++) { - capacity = 1024; - estimated_value_size = i; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, - metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); +TEST_F(ClockCacheTest, ClockEvictionTest) { + for (bool strict_capacity_limit : {false, true}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); + + NewShard(6, strict_capacity_limit); + EXPECT_OK(Insert('a', Cache::Priority::BOTTOM)); + EXPECT_OK(Insert('b', Cache::Priority::LOW)); + EXPECT_OK(Insert('c', Cache::Priority::HIGH)); + EXPECT_OK(Insert('d', Cache::Priority::BOTTOM)); + EXPECT_OK(Insert('e', Cache::Priority::LOW)); + EXPECT_OK(Insert('f', Cache::Priority::HIGH)); + + EXPECT_TRUE(Lookup('a', /*use*/ false)); + EXPECT_TRUE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_TRUE(Lookup('d', /*use*/ false)); + EXPECT_TRUE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + + // Ensure bottom are evicted first, even if new entries are low + EXPECT_OK(Insert('g', Cache::Priority::LOW)); + EXPECT_OK(Insert('h', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('a', /*use*/ false)); + EXPECT_TRUE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('d', /*use*/ false)); + EXPECT_TRUE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + // Mark g & h useful + EXPECT_TRUE(Lookup('g', /*use*/ true)); + EXPECT_TRUE(Lookup('h', /*use*/ true)); + + // Then old LOW entries + EXPECT_OK(Insert('i', Cache::Priority::LOW)); + EXPECT_OK(Insert('j', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + // Mark g & h useful once again + EXPECT_TRUE(Lookup('g', /*use*/ true)); + EXPECT_TRUE(Lookup('h', /*use*/ true)); + EXPECT_TRUE(Lookup('i', /*use*/ false)); + EXPECT_TRUE(Lookup('j', /*use*/ false)); + + // Then old HIGH entries + EXPECT_OK(Insert('k', Cache::Priority::LOW)); + EXPECT_OK(Insert('l', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('f', /*use*/ false)); + EXPECT_TRUE(Lookup('g', /*use*/ false)); + EXPECT_TRUE(Lookup('h', /*use*/ false)); + EXPECT_TRUE(Lookup('i', /*use*/ false)); + EXPECT_TRUE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(Lookup('k', /*use*/ false)); + 
EXPECT_TRUE(Lookup('l', /*use*/ false)); + + // Then the (roughly) least recently useful + EXPECT_OK(Insert('m', Cache::Priority::HIGH)); + EXPECT_OK(Insert('n', Cache::Priority::HIGH)); + + EXPECT_TRUE(Lookup('g', /*use*/ false)); + EXPECT_TRUE(Lookup('h', /*use*/ false)); + EXPECT_FALSE(Lookup('i', /*use*/ false)); + EXPECT_FALSE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(Lookup('k', /*use*/ false)); + EXPECT_TRUE(Lookup('l', /*use*/ false)); + + // Now try changing capacity down + shard_->SetCapacity(4); + // Insert to ensure evictions happen + EXPECT_OK(Insert('o', Cache::Priority::LOW)); + EXPECT_OK(Insert('p', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('g', /*use*/ false)); + EXPECT_FALSE(Lookup('h', /*use*/ false)); + EXPECT_FALSE(Lookup('k', /*use*/ false)); + EXPECT_FALSE(Lookup('l', /*use*/ false)); + EXPECT_TRUE(Lookup('m', /*use*/ false)); + EXPECT_TRUE(Lookup('n', /*use*/ false)); + EXPECT_TRUE(Lookup('o', /*use*/ false)); + EXPECT_TRUE(Lookup('p', /*use*/ false)); + + // Now try changing capacity up + EXPECT_TRUE(Lookup('m', /*use*/ true)); + EXPECT_TRUE(Lookup('n', /*use*/ true)); + shard_->SetCapacity(6); + EXPECT_OK(Insert('q', Cache::Priority::HIGH)); + EXPECT_OK(Insert('r', Cache::Priority::HIGH)); + EXPECT_OK(Insert('s', Cache::Priority::HIGH)); + EXPECT_OK(Insert('t', Cache::Priority::HIGH)); + + EXPECT_FALSE(Lookup('o', /*use*/ false)); + EXPECT_FALSE(Lookup('p', /*use*/ false)); + EXPECT_TRUE(Lookup('m', /*use*/ false)); + EXPECT_TRUE(Lookup('n', /*use*/ false)); + EXPECT_TRUE(Lookup('q', /*use*/ false)); + EXPECT_TRUE(Lookup('r', /*use*/ false)); + EXPECT_TRUE(Lookup('s', /*use*/ false)); + EXPECT_TRUE(Lookup('t', /*use*/ false)); } +} - // Zero-capacity cache, and only values have charge. - capacity = 0; - estimated_value_size = 1; - metadata_charge_policy = kDontChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); +void IncrementIntDeleter(const Slice& /*key*/, void* value) { + *reinterpret_cast(value) += 1; +} - // Zero-capacity cache, and only metadata has charge. 
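The eviction order pinned down above (BOTTOM before LOW before HIGH, and untouched entries before recently useful ones) is what a clock sweep produces when Lookup(..., useful=true) re-arms an entry for another pass. A textbook second-chance sketch of that sweep, not the shard's actual state machine, which also folds in priorities and reference counts:

#include <cstddef>
#include <cstdint>
#include <vector>

struct SketchSlot {
  bool occupied = false;
  bool useful = false;  // re-armed by a successful Lookup marked "useful"
};

// Advance the clock hand until a victim is found; a useful entry gets its bit
// cleared and survives one more sweep. Returns SIZE_MAX if nothing is
// evictable.
size_t SketchClockEvictOne(std::vector<SketchSlot>& slots, size_t& hand) {
  // Two full sweeps suffice: the first clears useful bits, the second must
  // find a victim if any occupied slot exists.
  for (size_t step = 0; step < 2 * slots.size(); ++step) {
    SketchSlot& s = slots[hand];
    const size_t current = hand;
    hand = (hand + 1) % slots.size();
    if (!s.occupied) {
      continue;
    }
    if (s.useful) {
      s.useful = false;  // second chance
      continue;
    }
    s.occupied = false;
    return current;
  }
  return SIZE_MAX;
}

Priorities can then be modeled as different initial counter values, which would explain why BOTTOM entries fall out before LOW and HIGH entries of the same age.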
- capacity = 0; - estimated_value_size = 0; - metadata_charge_policy = kFullChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); +// Testing calls to CorrectNearOverflow in Release +TEST_F(ClockCacheTest, ClockCounterOverflowTest) { + NewShard(6, /*strict_capacity_limit*/ false); + Cache::Handle* h; + int deleted = 0; + std::string my_key(kCacheKeySize, 'x'); + uint32_t my_hash = 42; + ASSERT_OK(shard_->Insert(my_key, my_hash, &deleted, 1, IncrementIntDeleter, + &h, Cache::Priority::HIGH)); + + // Some large number outstanding + shard_->TEST_RefN(h, 123456789); + // Simulate many lookup/ref + release, plenty to overflow counters + for (int i = 0; i < 10000; ++i) { + shard_->TEST_RefN(h, 1234567); + shard_->TEST_ReleaseN(h, 1234567); + } + // Mark it invisible (to reach a different CorrectNearOverflow() in Release) + shard_->Erase(my_key, my_hash); + // Simulate many more lookup/ref + release (one-by-one would be too + // expensive for unit test) + for (int i = 0; i < 10000; ++i) { + shard_->TEST_RefN(h, 1234567); + shard_->TEST_ReleaseN(h, 1234567); + } + // Free all but last 1 + shard_->TEST_ReleaseN(h, 123456789); + // Still alive + ASSERT_EQ(deleted, 0); + // Free last ref, which will finalize erasure + shard_->Release(h); + // Deleted + ASSERT_EQ(deleted, 1); +} - // Small cache, large elements. - capacity = 1024; - estimated_value_size = 8192; - metadata_charge_policy = kFullChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); +// This test is mostly to exercise some corner case logic, by forcing two +// keys to have the same hash, and more +TEST_F(ClockCacheTest, CollidingInsertEraseTest) { + NewShard(6, /*strict_capacity_limit*/ false); + int deleted = 0; + std::string key1(kCacheKeySize, 'x'); + std::string key2(kCacheKeySize, 'y'); + std::string key3(kCacheKeySize, 'z'); + uint32_t my_hash = 42; + Cache::Handle* h1; + ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, &h1, + Cache::Priority::HIGH)); + Cache::Handle* h2; + ASSERT_OK(shard_->Insert(key2, my_hash, &deleted, 1, IncrementIntDeleter, &h2, + Cache::Priority::HIGH)); + Cache::Handle* h3; + ASSERT_OK(shard_->Insert(key3, my_hash, &deleted, 1, IncrementIntDeleter, &h3, + Cache::Priority::HIGH)); + + // Can repeatedly lookup+release despite the hash collision + Cache::Handle* tmp_h; + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key1, my_hash); + ASSERT_EQ(h1, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Make h1 invisible + shard_->Erase(key1, my_hash); + // Redundant erase + shard_->Erase(key1, my_hash); + + // All still alive + ASSERT_EQ(deleted, 0); + + // Invisible to Lookup + tmp_h = shard_->Lookup(key1, my_hash); + ASSERT_EQ(nullptr, tmp_h); + + // Can still find h2, h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, my_hash); + 
ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Also Insert with invisible entry there + ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, + nullptr, Cache::Priority::HIGH)); + tmp_h = shard_->Lookup(key1, my_hash); + // Found but distinct handle + ASSERT_NE(nullptr, tmp_h); + ASSERT_NE(h1, tmp_h); + ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true)); + + // tmp_h deleted + ASSERT_EQ(deleted--, 1); + + // Release last ref on h1 (already invisible) + ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false)); + + // h1 deleted + ASSERT_EQ(deleted--, 1); + h1 = nullptr; + + // Can still find h2, h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Release last ref on h2 + ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false)); + + // h2 still not deleted (unreferenced in cache) + ASSERT_EQ(deleted, 0); + + // Can still find it + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + + // Release last ref on h2, with erase + ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true)); + + // h2 deleted + ASSERT_EQ(deleted--, 1); + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(nullptr, tmp_h); + + // Can still find h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } - // Large capacity. - capacity = 31924172; - estimated_value_size = 8192; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = - CalcMaxOccupancy(capacity, estimated_value_size, metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); + // Release last ref on h3, without erase + ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false)); + + // h3 still not deleted (unreferenced in cache) + ASSERT_EQ(deleted, 0); + + // Explicit erase + shard_->Erase(key3, my_hash); + + // h3 deleted + ASSERT_EQ(deleted--, 1); + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(nullptr, tmp_h); +} + +// This uses the public API to effectively test CalcHashBits etc. +TEST_F(ClockCacheTest, TableSizesTest) { + for (size_t est_val_size : {1U, 5U, 123U, 2345U, 345678U}) { + SCOPED_TRACE("est_val_size = " + std::to_string(est_val_size)); + for (double est_count : {1.1, 2.2, 511.9, 512.1, 2345.0}) { + SCOPED_TRACE("est_count = " + std::to_string(est_count)); + size_t capacity = static_cast(est_val_size * est_count); + // kDontChargeCacheMetadata + auto cache = ExperimentalNewClockCache( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, kDontChargeCacheMetadata); + // Table sizes are currently only powers of two + EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0); + EXPECT_EQ(cache->GetUsage(), 0); + + // kFullChargeMetaData + // Because table sizes are currently only powers of two, sizes get + // really weird when metadata is a huge portion of capacity. For example, + // doubling the table size could cut by 90% the space available to + // values. 
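The collision test also pins down the deferred-deletion contract: Erase only hides an entry from Lookup, and the deleter (IncrementIntDeleter here) runs once the last reference is released, or immediately on a Release with erase_if_last_ref when no other references remain. A simplified single-threaded sketch of that life cycle; the field names are illustrative and the real ClockHandle packs this state into atomics:

#include <cstdint>

// Illustrative handle state for the erase-while-referenced life cycle.
struct SketchHandle {
  uint64_t refs = 0;
  bool visible = true;  // cleared by Erase even while references remain
  void (*deleter)(void* value) = nullptr;
  void* value = nullptr;
};

// Mirrors the Release(handle, erase_if_last_ref) contract exercised above:
// returns true only when the entry is actually destroyed.
bool SketchRelease(SketchHandle& h, bool erase_if_last_ref) {
  --h.refs;
  if (h.refs == 0 && (!h.visible || erase_if_last_ref)) {
    if (h.deleter != nullptr) {
      h.deleter(h.value);
    }
    return true;  // slot is free for reuse
  }
  // Either still referenced, or left in the cache for future Lookups.
  return false;
}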
Therefore, we omit those weird cases for now. + if (est_val_size >= 512) { + cache = ExperimentalNewClockCache( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, kFullChargeCacheMetadata); + double est_count_after_meta = + (capacity - cache->GetUsage()) * 1.0 / est_val_size; + EXPECT_GE(cache->GetTableAddressCount(), + est_count_after_meta / kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), + est_count_after_meta / kLoadFactor * 2.0); + } + } + } } } // namespace clock_cache diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 6a5fbebdc..3e6d6a4f7 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -213,9 +213,9 @@ std::string ShardedCache::GetPrintableOptions() const { ret.append(GetShard(0)->GetPrintableOptions()); return ret; } -int GetDefaultCacheShardBits(size_t capacity) { + +int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) { int num_shard_bits = 0; - size_t min_shard_size = 512L * 1024L; // Every shard is at least 512KB. size_t num_shards = capacity / min_shard_size; while (num_shards >>= 1) { if (++num_shard_bits >= 6) { @@ -230,4 +230,21 @@ int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } +size_t ShardedCache::GetOccupancyCount() const { + size_t oc = 0; + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { + oc += GetShard(s)->GetOccupancyCount(); + } + return oc; +} +size_t ShardedCache::GetTableAddressCount() const { + size_t tac = 0; + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { + tac += GetShard(s)->GetTableAddressCount(); + } + return tac; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index c0bb60a21..8713d1dce 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -20,7 +20,8 @@ namespace ROCKSDB_NAMESPACE { // Single cache shard interface. class CacheShard { public: - CacheShard() = default; + explicit CacheShard(CacheMetadataChargePolicy metadata_charge_policy) + : metadata_charge_policy_(metadata_charge_policy) {} virtual ~CacheShard() = default; using DeleterFn = Cache::DeleterFn; @@ -47,6 +48,8 @@ class CacheShard { virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; virtual size_t GetUsage() const = 0; virtual size_t GetPinnedUsage() const = 0; + virtual size_t GetOccupancyCount() const = 0; + virtual size_t GetTableAddressCount() const = 0; // Handles iterating over roughly `average_entries_per_lock` entries, using // `state` to somehow record where it last ended up. Caller initially uses // *state == 0 and implementation sets *state = UINT32_MAX to indicate @@ -57,13 +60,9 @@ class CacheShard { uint32_t average_entries_per_lock, uint32_t* state) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } - void set_metadata_charge_policy( - CacheMetadataChargePolicy metadata_charge_policy) { - metadata_charge_policy_ = metadata_charge_policy; - } protected: - CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata; + const CacheMetadataChargePolicy metadata_charge_policy_; }; // Generic cache interface which shards cache by hash of keys. 
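GetDefaultCacheShardBits now takes min_shard_size instead of hard-coding the 512KB floor, and the arithmetic in the loop above is easy to spot-check by hand. A small usage sketch; the include path follows the RocksDB tree and the expected values are worked out informally:

#include <cstddef>
#include "cache/sharded_cache.h"

// Spot checks of the shard sizing heuristic (capped at 6 bits = 64 shards).
void SketchShardBits() {
  using ROCKSDB_NAMESPACE::GetDefaultCacheShardBits;
  // 8MB / 512KB = 16 shards -> 4 shard bits.
  int bits_8mb = GetDefaultCacheShardBits(8 << 20, 512 * 1024);
  // 32MB / 512KB = 64 shards -> hits the cap of 6 shard bits.
  int bits_32mb = GetDefaultCacheShardBits(32 << 20, 512 * 1024);
  // Raising the minimum shard size to 1MB halves the shard count: 3 bits.
  int bits_big_shards = GetDefaultCacheShardBits(8 << 20, 1024 * 1024);
  (void)bits_8mb;
  (void)bits_32mb;
  (void)bits_big_shards;
}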
2^num_shard_bits @@ -106,6 +105,8 @@ class ShardedCache : public Cache { virtual size_t GetUsage() const override; virtual size_t GetUsage(Handle* handle) const override; virtual size_t GetPinnedUsage() const override; + virtual size_t GetOccupancyCount() const override; + virtual size_t GetTableAddressCount() const override; virtual void ApplyToAllEntries( const std::function& callback, @@ -127,6 +128,8 @@ class ShardedCache : public Cache { std::atomic last_id_; }; -extern int GetDefaultCacheShardBits(size_t capacity); +// 512KB is traditional minimum shard size. +int GetDefaultCacheShardBits(size_t capacity, + size_t min_shard_size = 512U * 1024U); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 699873e9f..d550c5225 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -939,11 +939,15 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) { for (std::shared_ptr base_cache : {NewLRUCache(capacity, num_shard_bits), ExperimentalNewClockCache( - capacity, 1 /*estimated_value_size*/, num_shard_bits, - false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy), - NewFastLRUCache(capacity, 1 /*estimated_value_size*/, num_shard_bits, - false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy)}) { + capacity, + BlockBasedTableOptions().block_size /*estimated_value_size*/, + num_shard_bits, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy), + NewFastLRUCache( + capacity, + BlockBasedTableOptions().block_size /*estimated_value_size*/, + num_shard_bits, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy)}) { if (!base_cache) { // Skip clock cache when not supported continue; @@ -1298,10 +1302,11 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { for (bool partition : {false, true}) { for (std::shared_ptr cache : {NewLRUCache(capacity), - ExperimentalNewClockCache(capacity, 1 /*estimated_value_size*/, - -1 /*num_shard_bits*/, - false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy)}) { + ExperimentalNewClockCache( + capacity, + BlockBasedTableOptions().block_size /*estimated_value_size*/, + -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy)}) { if (!cache) { // Skip clock cache when not supported continue; diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 3b2aba22f..5a8a24324 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -671,6 +671,9 @@ void InternalStats::CacheEntryRoleStats::BeginCollection( << port::GetProcessID(); cache_id = str.str(); cache_capacity = cache->GetCapacity(); + cache_usage = cache->GetUsage(); + table_size = cache->GetTableAddressCount(); + occupancy = cache->GetOccupancyCount(); } void InternalStats::CacheEntryRoleStats::EndCollection( @@ -695,6 +698,8 @@ std::string InternalStats::CacheEntryRoleStats::ToString( std::ostringstream str; str << "Block cache " << cache_id << " capacity: " << BytesToHumanString(cache_capacity) + << " usage: " << BytesToHumanString(cache_usage) + << " table_size: " << table_size << " occupancy: " << occupancy << " collections: " << collection_count << " last_copies: " << copies_of_last_collection << " last_secs: " << (GetLastDurationMicros() / 1000000.0) diff --git a/db/internal_stats.h b/db/internal_stats.h index 73c1f29e7..7091877bb 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -453,6 +453,9 @@ class InternalStats { // For use with CacheEntryStatsCollector struct CacheEntryRoleStats { uint64_t cache_capacity = 0; + 
uint64_t cache_usage = 0; + size_t table_size = 0; + size_t occupancy = 0; std::string cache_id; std::array total_charges; std::array entry_counts; diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 1e87f5f72..119cf959c 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -404,6 +404,16 @@ class Cache { // Returns the memory size for the entries residing in the cache. virtual size_t GetUsage() const = 0; + // Returns the number of entries currently tracked in the table. SIZE_MAX + // means "not supported." This is used for inspecting the load factor, along + // with GetTableAddressCount(). + virtual size_t GetOccupancyCount() const { return SIZE_MAX; } + + // Returns the number of ways the hash function is divided for addressing + // entries. Zero means "not supported." This is used for inspecting the load + // factor, along with GetOccupancyCount(). + virtual size_t GetTableAddressCount() const { return 0; } + // Returns the memory size for a specific entry in the cache. virtual size_t GetUsage(Handle* handle) const = 0; diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 67671a960..50c143f5f 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -560,7 +560,7 @@ DEFINE_bool(universal_incremental, false, DEFINE_int64(cache_size, 8 << 20, // 8MB "Number of bytes to use as a cache of uncompressed data"); -DEFINE_int32(cache_numshardbits, 6, +DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache" " is 2 ** cache_numshardbits. Negative means use default settings." " This is applied only if FLAGS_cache_size is non-negative."); @@ -3618,6 +3618,9 @@ class Benchmark { } fresh_db = true; method = &Benchmark::TimeSeries; + } else if (name == "block_cache_entry_stats") { + // DB::Properties::kBlockCacheEntryStats + PrintStats("rocksdb.block-cache-entry-stats"); } else if (name == "stats") { PrintStats("rocksdb.stats"); } else if (name == "resetstats") {
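Because the base-class defaults above are sentinels (SIZE_MAX for occupancy, 0 for the address count), a generic consumer such as a stats printer should check for "not supported" before dividing. A hedged helper in that spirit, using only the public Cache API declared above:

#include <cstddef>
#include <cstdint>
#include "rocksdb/cache.h"

// Returns the cache's table load factor in [0, 1], or a negative value when
// the implementation does not report occupancy / table address counts.
double LoadFactorOrNegative(const ROCKSDB_NAMESPACE::Cache& cache) {
  const size_t occupancy = cache.GetOccupancyCount();
  const size_t table_size = cache.GetTableAddressCount();
  if (occupancy == SIZE_MAX || table_size == 0) {
    return -1.0;  // not supported by this Cache implementation
  }
  return static_cast<double>(occupancy) / static_cast<double>(table_size);
}

With -cache_numshardbits defaulting to -1, db_bench falls back to the default shard sizing, and the block_cache_entry_stats entry prints the usage, table_size, and occupancy fields carried by CacheEntryRoleStats through rocksdb.block-cache-entry-stats.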