Some small improvements to HyperClockCache (#11601)

Summary:
Stacked on https://github.com/facebook/rocksdb/issues/11572
* Minimize use of std::function and lambdas to reduce the chance of the
compiler heap-allocating closures (unnecessary stress on the allocator). It
appears that converting FindSlot to a template enables inlining of the
lambda parameters, avoiding heap allocations (see the sketch after this list).
* Clean up some logic with FindSlot (FIXMEs from https://github.com/facebook/rocksdb/issues/11572)
* Fix handling of rare case of probing all slots, with new unit test.
(Previously Insert would not roll back displacements in that case, which
would kill performance if it were to happen.)
* Add an -early_exit option to cache_bench for gathering memory stats
before deallocation.
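
To illustrate the first bullet, here is a minimal standalone sketch (hypothetical names like `Slot`, `FindSlotErased`, and `FindSlotTemplated`; not the actual clock_cache.cc code) of why taking callbacks as template parameters instead of `std::function` avoids these allocations. A `std::function` type-erases its callable, and a capturing lambda whose closure does not fit the implementation's small internal buffer is copied to the heap; a template parameter keeps the lambda's concrete type, so the call can be inlined and the captures stay on the stack.

```cpp
#include <cstddef>
#include <functional>

// Sketch only: Slot and both FindSlot* functions are illustrative,
// not taken from clock_cache.cc.
struct Slot {
  int value = 0;
};

// Accepting std::function erases the callable's type. Constructing it from a
// lambda whose captures exceed the small internal buffer (often just a couple
// of pointers) typically heap-allocates the closure.
inline Slot* FindSlotErased(Slot* slots, std::size_t n,
                            const std::function<bool(Slot&)>& match_fn) {
  for (std::size_t i = 0; i < n; ++i) {
    if (match_fn(slots[i])) {
      return &slots[i];
    }
  }
  return nullptr;
}

// Accepting the callable as a template parameter preserves its concrete type,
// so the compiler can inline the call and keep the captures on the caller's
// stack; no std::function, no closure allocation.
template <typename MatchFn>
inline Slot* FindSlotTemplated(Slot* slots, std::size_t n, MatchFn match_fn) {
  for (std::size_t i = 0; i < n; ++i) {
    if (match_fn(slots[i])) {
      return &slots[i];
    }
  }
  return nullptr;
}

int main() {
  Slot slots[8];
  slots[5].value = 42;
  // Enough captures to outgrow std::function's small buffer on common
  // standard libraries, which plausibly shows up as the 32-byte requests in
  // the jemalloc stats below.
  long a = 1, b = 2, c = 3, d = 4;
  auto match = [&](Slot& s) { return s.value == 42 && a + b + c + d > 0; };
  Slot* e1 = FindSlotErased(slots, 8, match);     // may heap-allocate a closure
  Slot* e2 = FindSlotTemplated(slots, 8, match);  // no closure allocation
  return (e1 == e2 && e1 != nullptr) ? 0 : 1;
}
```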

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11601

Test Plan:
unit test added for probing all slots

## Seeing heap allocations
Run `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache`
on builds from before https://github.com/facebook/rocksdb/issues/11572 and after this change. Before, we see this in the
interesting jemalloc bin statistics:

```
size  nrequests
----  ---------
  32     578460
  64      24340
8192     578460
```
And after, where the 32-byte allocations (presumably the closures) have all but disappeared:
```
size  nrequests
----  ---------
  32  (insignificant)
  64      24370
8192     579130
```

## Performance test
Build with `make USE_CLANG=1 PORTABLE=0 DEBUG_LEVEL=0 -j32 cache_bench`

Run `./cache_bench -cache_type=hyper_clock_cache -ops_per_thread=5000000`
in before and after configurations, simultaneously:

```
Before: Complete in 33.244 s; Rough parallel ops/sec = 2406442
After:  Complete in 32.773 s; Rough parallel ops/sec = 2441019
```

Reviewed By: jowlyzhang

Differential Revision: D47375092

Pulled By: pdillinger

fbshipit-source-id: 46f0f57257ddb374290a0a38c651764ea60ba410
Commit b1b6f87fbe (parent bc0db33483), authored by Peter Dillinger and committed by Facebook GitHub Bot.
Changed files (lines changed):
* cache/cache_bench_tool.cc (8)
* cache/clock_cache.cc (102)
* cache/clock_cache.h (40)
* cache/lru_cache_test.cc (56)
* unreleased_history/performance_improvements/hcc_perf (1)

cache/cache_bench_tool.cc:

@@ -77,6 +77,10 @@ DEFINE_bool(lean, false,
"If true, no additional computation is performed besides cache "
"operations.");
DEFINE_bool(early_exit, false,
"Exit before deallocating most memory. Good for malloc stats, e.g."
"MALLOC_CONF=\"stats_print:true\"");
DEFINE_string(secondary_cache_uri, "",
"Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -593,6 +597,10 @@ class CacheBench {
}
thread->latency_ns_hist.Add(timer.ElapsedNanos());
}
if (FLAGS_early_exit) {
MutexLock l(thread->shared->GetMutex());
exit(0);
}
if (handle) {
cache_->Release(handle);
handle = nullptr;

cache/clock_cache.cc:

@@ -195,7 +195,6 @@ inline void CorrectNearOverflow(uint64_t old_meta,
inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
uint64_t initial_countdown, bool* already_matches) {
assert(*already_matches == false);
// Optimistically transition the slot from "empty" to
// "under construction" (no effect on other states)
uint64_t old_meta = h.meta.fetch_or(
@@ -486,9 +485,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
// Do we have the available occupancy? Optimistically assume we do
// and deal with it if we don't.
size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
auto revert_occupancy_fn = [&]() {
occupancy_.fetch_sub(1, std::memory_order_relaxed);
};
// Whether we over-committed and need an eviction to make up for it
bool need_evict_for_occupancy =
!derived.GrowIfNeeded(old_occupancy + 1, state);
@@ -501,7 +497,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
Status s = ChargeUsageMaybeEvictStrict<Table>(
total_charge, capacity, need_evict_for_occupancy, state);
if (!s.ok()) {
revert_occupancy_fn();
// Revert occupancy
occupancy_.fetch_sub(1, std::memory_order_relaxed);
return s;
}
} else {
@@ -509,7 +506,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
bool success = ChargeUsageMaybeEvictNonStrict<Table>(
total_charge, capacity, need_evict_for_occupancy, state);
if (!success) {
revert_occupancy_fn();
// Revert occupancy
occupancy_.fetch_sub(1, std::memory_order_relaxed);
if (handle == nullptr) {
// Don't insert the entry but still return ok, as if the entry
// inserted into cache and evicted immediately.
@@ -522,11 +520,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
}
}
}
auto revert_usage_fn = [&]() {
usage_.fetch_sub(total_charge, std::memory_order_relaxed);
// No underflow
assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
};
if (!use_standalone_insert) {
// Attempt a table insert, but abort if we find an existing entry for the
@@ -551,10 +544,14 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
return Status::OK();
}
// Not inserted
revert_occupancy_fn();
// Revert occupancy
occupancy_.fetch_sub(1, std::memory_order_relaxed);
// Maybe fall back on standalone insert
if (handle == nullptr) {
revert_usage_fn();
// Revert usage
usage_.fetch_sub(total_charge, std::memory_order_relaxed);
// No underflow
assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
// As if unrefed entry immediately evicted
proto.FreeData(allocator_);
return Status::OK();
@@ -680,47 +677,52 @@ bool HyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
HyperClockTable::HandleImpl* HyperClockTable::DoInsert(
const ClockHandleBasicData& proto, uint64_t initial_countdown,
bool keep_ref, InsertState&) {
size_t probe = 0;
bool already_matches = false;
HandleImpl* e = FindSlot(
proto.hashed_key,
[&](HandleImpl* h) {
// FIXME: simplify and handle in abort_fn below?
bool inserted =
TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches);
return inserted || already_matches;
return TryInsert(proto, *h, initial_countdown, keep_ref,
&already_matches);
},
[&](HandleImpl* /*h*/) { return false; },
[&](HandleImpl* h) {
h->displacements.fetch_add(1, std::memory_order_relaxed);
if (already_matches) {
// Stop searching & roll back displacements
Rollback(proto.hashed_key, h);
return true;
} else {
// Keep going
return false;
}
},
probe);
if (e == nullptr) {
// Occupancy check and never abort FindSlot above should generally
// prevent this, except it's theoretically possible for other threads
// to evict and replace entries in the right order to hit every slot
// when it is populated. Assuming random hashing, the chance of that
// should be no higher than pow(kStrictLoadFactor, n) for n slots.
// That should be infeasible for roughly n >= 256, so if this assertion
// fails, that suggests something is going wrong.
assert(GetTableSize() < 256);
// WART/FIXME: need to roll back every slot
already_matches = true;
[&](HandleImpl* h, bool is_last) {
if (is_last) {
// Search is ending. Roll back displacements
Rollback(proto.hashed_key, h);
} else {
h->displacements.fetch_add(1, std::memory_order_relaxed);
}
});
if (already_matches) {
// Insertion skipped
return nullptr;
}
if (!already_matches) {
if (e != nullptr) {
// Successfully inserted
assert(e);
return e;
}
// Roll back displacements from failed table insertion
Rollback(proto.hashed_key, e);
// Insertion skipped
// Else, no available slot found. Occupancy check should generally prevent
// this, except it's theoretically possible for other threads to evict and
// replace entries in the right order to hit every slot when it is populated.
// Assuming random hashing, the chance of that should be no higher than
// pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
// roughly n >= 256, so if this assertion fails, that suggests something is
// going wrong.
assert(GetTableSize() < 256);
return nullptr;
}
HyperClockTable::HandleImpl* HyperClockTable::Lookup(
const UniqueId64x2& hashed_key) {
size_t probe = 0;
HandleImpl* e = FindSlot(
hashed_key,
[&](HandleImpl* h) {
@@ -780,7 +782,7 @@ HyperClockTable::HandleImpl* HyperClockTable::Lookup(
[&](HandleImpl* h) {
return h->displacements.load(std::memory_order_relaxed) == 0;
},
[&](HandleImpl* /*h*/) {}, probe);
[&](HandleImpl* /*h*/, bool /*is_last*/) {});
return e;
}
@@ -873,7 +875,6 @@ void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
#endif
void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
size_t probe = 0;
(void)FindSlot(
hashed_key,
[&](HandleImpl* h) {
@@ -940,7 +941,7 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
[&](HandleImpl* h) {
return h->displacements.load(std::memory_order_relaxed) == 0;
},
[&](HandleImpl* /*h*/) {}, probe);
[&](HandleImpl* /*h*/, bool /*is_last*/) {});
}
void HyperClockTable::ConstApplyToEntriesRange(
@@ -1005,10 +1006,10 @@ void HyperClockTable::EraseUnRefEntries() {
}
}
template <typename MatchFn, typename AbortFn, typename UpdateFn>
inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
std::function<bool(HandleImpl*)> abort_fn,
std::function<void(HandleImpl*)> update_fn, size_t& probe) {
const UniqueId64x2& hashed_key, MatchFn match_fn, AbortFn abort_fn,
UpdateFn update_fn) {
// NOTE: upper 32 bits of hashed_key[0] is used for sharding
//
// We use double-hashing probing. Every probe in the sequence is a
@@ -1022,20 +1023,21 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
// TODO: we could also reconsider linear probing, though locality benefits
// are limited because each slot is a full cache line
size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
size_t current = ModTableSize(base + probe * increment);
while (probe <= length_bits_mask_) {
size_t first = ModTableSize(base);
size_t current = first;
bool is_last;
do {
HandleImpl* h = &array_[current];
if (match_fn(h)) {
probe++;
return h;
}
if (abort_fn(h)) {
return nullptr;
}
probe++;
update_fn(h);
current = ModTableSize(current + increment);
}
is_last = current == first;
update_fn(h, is_last);
} while (!is_last);
// We looped back.
return nullptr;
}
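
To clarify why the new loop can use `current == first` as its end-of-search signal, here is a standalone sketch (hypothetical `ProbeOrder` function; not the RocksDB implementation) of the double-hashing probe order over a power-of-two table. Because the increment is forced odd, it is coprime with the table size, so the probe sequence visits every slot exactly once before wrapping back to the starting slot.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch only: returns the order in which slots of a power-of-two table are
// probed by start + i * increment (mod table_size) with an odd increment.
// Since gcd(increment, table_size) == 1, the sequence hits each slot exactly
// once before returning to the first slot, so "next == first" reliably means
// the whole table has been examined.
std::vector<std::size_t> ProbeOrder(uint64_t hash_for_increment,
                                    uint64_t hash_for_start,
                                    std::size_t table_size) {
  assert(table_size > 0 && (table_size & (table_size - 1)) == 0);
  const std::size_t mask = table_size - 1;
  std::size_t first = static_cast<std::size_t>(hash_for_start) & mask;
  std::size_t increment = static_cast<std::size_t>(hash_for_increment) | 1U;
  std::vector<std::size_t> order;
  std::size_t current = first;
  do {
    order.push_back(current);
    current = (current + increment) & mask;  // ModTableSize equivalent
  } while (current != first);  // looped back: every slot visited once
  assert(order.size() == table_size);
  return order;
}
```

This full-cycle property is what the `is_last` argument to `update_fn` conveys: DoInsert increments displacements along the probe path, and if the search is about to wrap around without finding a slot, `Rollback` re-decrements them, fixing the previous behavior of leaving stale displacements behind when every slot is probed.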

cache/clock_cache.h:

@@ -549,7 +549,12 @@ class HyperClockTable : public BaseClockTable {
size_t GetOccupancyLimit() const { return occupancy_limit_; }
#ifndef NDEBUG
void TEST_ReleaseN(HandleImpl* h, size_t n);
size_t& TEST_MutableOccupancyLimit() const {
return const_cast<size_t&>(occupancy_limit_);
}
// Release N references
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // functions
@@ -558,22 +563,18 @@ class HyperClockTable : public BaseClockTable {
return static_cast<size_t>(x) & length_bits_mask_;
}
// Returns the first slot in the probe sequence, starting from the given
// probe number, with a handle e such that match(e) is true. At every
// step, the function first tests whether match(e) holds. If this is false,
// it evaluates abort(e) to decide whether the search should be aborted,
// and in the affirmative returns -1. For every handle e probed except
// the last one, the function runs update(e).
// The probe parameter is modified as follows. We say a probe to a handle
// e is aborting if match(e) is false and abort(e) is true. Then the final
// value of probe is one more than the last non-aborting probe during the
// call. This is so that that the variable can be used to keep track of
// progress across consecutive calls to FindSlot.
inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
std::function<bool(HandleImpl*)> match,
std::function<bool(HandleImpl*)> stop,
std::function<void(HandleImpl*)> update,
size_t& probe);
// Returns the first slot in the probe sequence with a handle e such that
// match_fn(e) is true. At every step, the function first tests whether
// match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
// whether the search should be aborted, and if so, FindSlot immediately
// returns nullptr. For every handle e that is not a match and not aborted,
// FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
// slot will be the last probed because the next would cycle back to the first
// slot probed. This function uses templates instead of std::function to
// minimize the risk of heap-allocated closures being created.
template <typename MatchFn, typename AbortFn, typename UpdateFn>
inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, MatchFn match_fn,
AbortFn abort_fn, UpdateFn update_fn);
// Re-decrement all displacements in probe path starting from beginning
// until (not including) the given handle
@@ -704,9 +705,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
return Lookup(key, hashed_key);
}
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return table_.TEST_MutableOccupancyLimit();
}
// Acquire/release N references
void TEST_RefN(HandleImpl* handle, size_t n);
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // data
Table table_;

cache/lru_cache_test.cc:

@@ -715,6 +715,62 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
ASSERT_EQ(val.deleted, 1);
}
TEST_F(ClockCacheTest, ClockTableFull) {
// Force clock cache table to fill up (not usually allowed) in order
// to test full probe sequence that is theoretically possible due to
// parallel operations
NewShard(6, /*strict_capacity_limit*/ false);
size_t size = shard_->GetTableAddressCount();
ASSERT_LE(size + 3, 256); // for using char keys
// Modify occupancy and capacity limits to attempt insert on full
shard_->TEST_MutableOccupancyLimit() = size + 100;
shard_->SetCapacity(size + 100);
DeleteCounter val;
std::vector<HandleImpl*> handles;
// NOTE: the three extra insertions should create standalone entries
for (size_t i = 0; i < size + 3; ++i) {
UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper,
1, &handles.emplace_back(),
Cache::Priority::HIGH));
}
for (size_t i = 0; i < size + 3; ++i) {
UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
HandleImpl* h = shard_->Lookup(TestKey(hkey), hkey);
if (i < size) {
ASSERT_NE(h, nullptr);
shard_->Release(h);
} else {
// Standalone entries not visible by lookup
ASSERT_EQ(h, nullptr);
}
}
for (size_t i = 0; i < size + 3; ++i) {
ASSERT_NE(handles[i], nullptr);
shard_->Release(handles[i]);
if (i < size) {
// Everything still in cache
ASSERT_EQ(val.deleted, 0);
} else {
// Standalone entries freed on release
ASSERT_EQ(val.deleted, i + 1 - size);
}
}
for (size_t i = size + 3; i > 0; --i) {
UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i - 1));
shard_->Erase(TestKey(hkey), hkey);
if (i - 1 > size) {
ASSERT_EQ(val.deleted, 3);
} else {
ASSERT_EQ(val.deleted, 3 + size - (i - 1));
}
}
}
// This test is mostly to exercise some corner case logic, by forcing two
// keys to have the same hash, and more
TEST_F(ClockCacheTest, CollidingInsertEraseTest) {

unreleased_history/performance_improvements/hcc_perf:

@@ -0,0 +1 @@
Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations