diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 1d93c1d96..61b12b19e 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -77,6 +77,10 @@ DEFINE_bool(lean, false,
             "If true, no additional computation is performed besides cache "
             "operations.");
 
+DEFINE_bool(early_exit, false,
+            "Exit before deallocating most memory. Good for malloc stats, e.g. "
+            "MALLOC_CONF=\"stats_print:true\"");
+
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<SecondaryCache> secondary_cache;
@@ -593,6 +597,10 @@ class CacheBench {
       }
       thread->latency_ns_hist.Add(timer.ElapsedNanos());
     }
+    if (FLAGS_early_exit) {
+      MutexLock l(thread->shared->GetMutex());
+      exit(0);
+    }
     if (handle) {
       cache_->Release(handle);
       handle = nullptr;
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 6b1caabfa..4d4faeb30 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -195,7 +195,6 @@ inline void CorrectNearOverflow(uint64_t old_meta,
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                             uint64_t initial_countdown, bool* already_matches) {
   assert(*already_matches == false);
-
   // Optimistically transition the slot from "empty" to
   // "under construction" (no effect on other states)
   uint64_t old_meta = h.meta.fetch_or(
@@ -486,9 +485,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   // Do we have the available occupancy? Optimistically assume we do
   // and deal with it if we don't.
   size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
-  auto revert_occupancy_fn = [&]() {
-    occupancy_.fetch_sub(1, std::memory_order_relaxed);
-  };
   // Whether we over-committed and need an eviction to make up for it
   bool need_evict_for_occupancy =
       !derived.GrowIfNeeded(old_occupancy + 1, state);
@@ -501,7 +497,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     Status s = ChargeUsageMaybeEvictStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!s.ok()) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       return s;
     }
   } else {
@@ -509,7 +506,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!success) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       if (handle == nullptr) {
         // Don't insert the entry but still return ok, as if the entry
         // inserted into cache and evicted immediately.
@@ -522,11 +520,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       }
     }
   }
-  auto revert_usage_fn = [&]() {
-    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
-    // No underflow
-    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
-  };
 
   if (!use_standalone_insert) {
     // Attempt a table insert, but abort if we find an existing entry for the
@@ -551,10 +544,14 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       return Status::OK();
     }
     // Not inserted
-    revert_occupancy_fn();
+    // Revert occupancy
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
     // Maybe fall back on standalone insert
     if (handle == nullptr) {
-      revert_usage_fn();
+      // Revert usage
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+      // No underflow
+      assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
       // As if unrefed entry immdiately evicted
       proto.FreeData(allocator_);
       return Status::OK();
@@ -680,47 +677,52 @@ bool HyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 HyperClockTable::HandleImpl* HyperClockTable::DoInsert(
     const ClockHandleBasicData& proto, uint64_t initial_countdown,
     bool keep_ref, InsertState&) {
-  size_t probe = 0;
   bool already_matches = false;
   HandleImpl* e = FindSlot(
       proto.hashed_key,
       [&](HandleImpl* h) {
-        // FIXME: simplify and handle in abort_fn below?
-        bool inserted =
-            TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches);
-        return inserted || already_matches;
+        return TryInsert(proto, *h, initial_countdown, keep_ref,
+                         &already_matches);
       },
-      [&](HandleImpl* /*h*/) { return false; },
       [&](HandleImpl* h) {
-        h->displacements.fetch_add(1, std::memory_order_relaxed);
+        if (already_matches) {
+          // Stop searching & roll back displacements
+          Rollback(proto.hashed_key, h);
+          return true;
+        } else {
+          // Keep going
+          return false;
+        }
       },
-      probe);
-  if (e == nullptr) {
-    // Occupancy check and never abort FindSlot above should generally
-    // prevent this, except it's theoretically possible for other threads
-    // to evict and replace entries in the right order to hit every slot
-    // when it is populated. Assuming random hashing, the chance of that
-    // should be no higher than pow(kStrictLoadFactor, n) for n slots.
-    // That should be infeasible for roughly n >= 256, so if this assertion
-    // fails, that suggests something is going wrong.
-    assert(GetTableSize() < 256);
-    // WART/FIXME: need to roll back every slot
-    already_matches = true;
+      [&](HandleImpl* h, bool is_last) {
+        if (is_last) {
+          // Search is ending. Roll back displacements
+          Rollback(proto.hashed_key, h);
+        } else {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        }
+      });
+  if (already_matches) {
+    // Insertion skipped
+    return nullptr;
   }
-  if (!already_matches) {
+  if (e != nullptr) {
     // Successfully inserted
-    assert(e);
     return e;
   }
-  // Roll back displacements from failed table insertion
-  Rollback(proto.hashed_key, e);
-  // Insertion skipped
+  // Else, no available slot found. Occupancy check should generally prevent
+  // this, except it's theoretically possible for other threads to evict and
+  // replace entries in the right order to hit every slot when it is populated.
+  // Assuming random hashing, the chance of that should be no higher than
+  // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
+  // roughly n >= 256, so if this assertion fails, that suggests something is
+  // going wrong.
+  assert(GetTableSize() < 256);
   return nullptr;
 }
 
 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
     const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   HandleImpl* e = FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -780,7 +782,7 @@ HyperClockTable::HandleImpl* HyperClockTable::Lookup(
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 
   return e;
 }
@@ -873,7 +875,6 @@ void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
 #endif
 
 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   (void)FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -940,7 +941,7 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 }
 
 void HyperClockTable::ConstApplyToEntriesRange(
@@ -1005,10 +1006,10 @@ void HyperClockTable::EraseUnRefEntries() {
   }
 }
 
+template <class MatchFn, class AbortFn, class UpdateFn>
 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
-    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
-    std::function<bool(HandleImpl*)> abort_fn,
-    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+    const UniqueId64x2& hashed_key, MatchFn match_fn, AbortFn abort_fn,
+    UpdateFn update_fn) {
   // NOTE: upper 32 bits of hashed_key[0] is used for sharding
   //
   // We use double-hashing probing. Every probe in the sequence is a
@@ -1022,20 +1023,21 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
   // TODO: we could also reconsider linear probing, though locality benefits
   // are limited because each slot is a full cache line
   size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
-  size_t current = ModTableSize(base + probe * increment);
-  while (probe <= length_bits_mask_) {
+  size_t first = ModTableSize(base);
+  size_t current = first;
+  bool is_last;
+  do {
     HandleImpl* h = &array_[current];
     if (match_fn(h)) {
-      probe++;
       return h;
     }
     if (abort_fn(h)) {
       return nullptr;
     }
-    probe++;
-    update_fn(h);
     current = ModTableSize(current + increment);
-  }
+    is_last = current == first;
+    update_fn(h, is_last);
+  } while (!is_last);  // We looped back.
   return nullptr;
 }
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index fff3ef43d..992083e4f 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -549,7 +549,12 @@ class HyperClockTable : public BaseClockTable {
   size_t GetOccupancyLimit() const { return occupancy_limit_; }
 
 #ifndef NDEBUG
-  void TEST_ReleaseN(HandleImpl* h, size_t n);
+  size_t& TEST_MutableOccupancyLimit() const {
+    return const_cast<size_t&>(occupancy_limit_);
+  }
+
+  // Release N references
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif
 
  private:  // functions
@@ -558,22 +563,18 @@ class HyperClockTable : public BaseClockTable {
     return static_cast<size_t>(x) & length_bits_mask_;
   }
 
-  // Returns the first slot in the probe sequence, starting from the given
-  // probe number, with a handle e such that match(e) is true. At every
-  // step, the function first tests whether match(e) holds. If this is false,
-  // it evaluates abort(e) to decide whether the search should be aborted,
-  // and in the affirmative returns -1. For every handle e probed except
-  // the last one, the function runs update(e).
-  // The probe parameter is modified as follows. We say a probe to a handle
-  // e is aborting if match(e) is false and abort(e) is true. Then the final
-  // value of probe is one more than the last non-aborting probe during the
-  // call. This is so that that the variable can be used to keep track of
-  // progress across consecutive calls to FindSlot.
-  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
-                              std::function<bool(HandleImpl*)> match,
-                              std::function<bool(HandleImpl*)> stop,
-                              std::function<void(HandleImpl*)> update,
-                              size_t& probe);
+  // Returns the first slot in the probe sequence with a handle e such that
+  // match_fn(e) is true. At every step, the function first tests whether
+  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
+  // whether the search should be aborted, and if so, FindSlot immediately
+  // returns nullptr. For every handle e that is not a match and not aborted,
+  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
+  // slot will be the last probed because the next would cycle back to the first
+  // slot probed. This function uses templates instead of std::function to
+  // minimize the risk of heap-allocated closures being created.
+  template <class MatchFn, class AbortFn, class UpdateFn>
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, MatchFn match_fn,
+                              AbortFn abort_fn, UpdateFn update_fn);
 
   // Re-decrement all displacements in probe path starting from beginning
   // until (not including) the given handle
@@ -704,9 +705,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
     return Lookup(key, hashed_key);
   }
 
+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return table_.TEST_MutableOccupancyLimit();
+  }
   // Acquire/release N references
   void TEST_RefN(HandleImpl* handle, size_t n);
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif
 
  private:  // data
   Table table_;
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 6e5118fb0..cb7beb7b1 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -715,6 +715,62 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
   ASSERT_EQ(val.deleted, 1);
 }
 
+TEST_F(ClockCacheTest, ClockTableFull) {
+  // Force clock cache table to fill up (not usually allowed) in order
+  // to test full probe sequence that is theoretically possible due to
+  // parallel operations
+  NewShard(6, /*strict_capacity_limit*/ false);
+  size_t size = shard_->GetTableAddressCount();
+  ASSERT_LE(size + 3, 256);  // for using char keys
+  // Modify occupancy and capacity limits to attempt insert on full
+  shard_->TEST_MutableOccupancyLimit() = size + 100;
+  shard_->SetCapacity(size + 100);
+
+  DeleteCounter val;
+  std::vector<HandleImpl*> handles;
+  // NOTE: the three extra insertions should create standalone entries
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper,
+                             1, &handles.emplace_back(),
+                             Cache::Priority::HIGH));
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    HandleImpl* h = shard_->Lookup(TestKey(hkey), hkey);
+    if (i < size) {
+      ASSERT_NE(h, nullptr);
+      shard_->Release(h);
+    } else {
+      // Standalone entries not visible by lookup
+      ASSERT_EQ(h, nullptr);
+    }
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    ASSERT_NE(handles[i], nullptr);
+    shard_->Release(handles[i]);
+    if (i < size) {
+      // Everything still in cache
+      ASSERT_EQ(val.deleted, 0);
+    } else {
+      // Standalone entries freed on release
+      ASSERT_EQ(val.deleted, i + 1 - size);
+    }
+  }
+
+  for (size_t i = size + 3; i > 0; --i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i - 1));
+    shard_->Erase(TestKey(hkey), hkey);
+    if (i - 1 > size) {
+      ASSERT_EQ(val.deleted, 3);
+    } else {
+      ASSERT_EQ(val.deleted, 3 + size - (i - 1));
+    }
+  }
+}
+
 // This test is mostly to exercise some corner case logic, by forcing two
 // keys to have the same hash, and more
 TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
diff --git a/unreleased_history/performance_improvements/hcc_perf b/unreleased_history/performance_improvements/hcc_perf
new file mode 100644
index 000000000..c129393dc
--- /dev/null
+++ b/unreleased_history/performance_improvements/hcc_perf
@@ -0,0 +1 @@
+Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
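
Not part of the patch above: a minimal standalone sketch of the design choice called out in the new FindSlot comment (template callables instead of std::function). The `Slot` struct and the `probe_std_function`/`probe_templated` names are hypothetical, for illustration only. The idea is that a capturing lambda passed as `std::function` is type-erased and may be heap-allocated when the closure does not fit the small-object buffer, while a template parameter uses the closure type directly and is typically inlined.

```cpp
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

struct Slot {
  int key = -1;
};

// Callbacks passed as std::function: capturing lambdas are type-erased and
// may be stored on the heap if the closure exceeds the small-object buffer.
Slot* probe_std_function(std::vector<Slot>& table, size_t start,
                         const std::function<bool(Slot&)>& match_fn,
                         const std::function<void(Slot&)>& update_fn) {
  for (size_t i = 0; i < table.size(); ++i) {
    Slot& s = table[(start + i) % table.size()];
    if (match_fn(s)) {
      return &s;
    }
    update_fn(s);
  }
  return nullptr;
}

// Callbacks passed as template parameters: the closure types are used
// directly, so there is no type erasure and no chance of a hidden allocation.
template <class MatchFn, class UpdateFn>
Slot* probe_templated(std::vector<Slot>& table, size_t start, MatchFn match_fn,
                      UpdateFn update_fn) {
  for (size_t i = 0; i < table.size(); ++i) {
    Slot& s = table[(start + i) % table.size()];
    if (match_fn(s)) {
      return &s;
    }
    update_fn(s);
  }
  return nullptr;
}

int main() {
  std::vector<Slot> table(8);
  table[5].key = 42;
  int probes = 0;
  // The same capturing lambdas work with both versions; only the
  // parameter-passing mechanism differs.
  Slot* found = probe_templated(
      table, /*start=*/2, [](Slot& s) { return s.key == 42; },
      [&](Slot&) { ++probes; });
  std::printf("templated: found=%d after %d non-matching probes\n",
              found ? found->key : -1, probes);
  probes = 0;
  found = probe_std_function(
      table, /*start=*/2, [](Slot& s) { return s.key == 42; },
      [&](Slot&) { ++probes; });
  std::printf("std::function: found=%d after %d non-matching probes\n",
              found ? found->key : -1, probes);
  return 0;
}
```

The usual trade-off is code size: each distinct closure type instantiates its own copy of the templated function, which is acceptable here because the probing helper is small and header-local.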