Some small improvements to HyperClockCache (#11601)

Summary:
Stacked on https://github.com/facebook/rocksdb/issues/11572
* Minimize use of std::function and lambdas to reduce the chance of the
compiler heap-allocating closures (unnecessary stress on the allocator). It
appears that converting FindSlot to a template enables inlining of the
lambda parameters, avoiding heap allocations (see the sketch after this list).
* Clean up some logic with FindSlot (FIXMEs from https://github.com/facebook/rocksdb/issues/11572)
* Fix handling of the rare case of probing all slots, with a new unit test.
(Previously, Insert would not roll back displacements in that case, which
would kill performance if it ever happened.)
* Add an -early_exit option to cache_bench for gathering memory stats
before deallocation.
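
For readers unfamiliar with the allocation issue, here is a minimal standalone sketch (illustrative only, not RocksDB's actual FindSlot; the function names `ProbeWithStdFunction` and `ProbeWithTemplate` are made up) of why passing callables as template parameters instead of `std::function` avoids heap-allocated closures:

```
// Illustrative sketch only (not RocksDB code).
#include <cstddef>
#include <functional>

// A probe loop taking its predicate via std::function: the lambda's type is
// erased, and if the captured state does not fit std::function's small-object
// buffer, constructing the std::function may heap-allocate a copy of the
// closure at every call site.
inline int ProbeWithStdFunction(const int* slots, size_t n,
                                const std::function<bool(int)>& match_fn) {
  for (size_t i = 0; i < n; ++i) {
    if (match_fn(slots[i])) {
      return static_cast<int>(i);
    }
  }
  return -1;
}

// The same loop with the callable as a template parameter: the lambda's
// concrete type is visible to the compiler, so the call can be inlined and
// the closure stays on the stack, with no type erasure and no heap allocation.
template <typename MatchFn>
inline int ProbeWithTemplate(const int* slots, size_t n, MatchFn match_fn) {
  for (size_t i = 0; i < n; ++i) {
    if (match_fn(slots[i])) {
      return static_cast<int>(i);
    }
  }
  return -1;
}
```

The new `-early_exit` flag complements this kind of measurement: presumably the intended invocation is along the lines of `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache -early_exit`, so that jemalloc prints its statistics while the cache's memory is still allocated.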

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11601

Test Plan:
Unit test added for probing all slots (ClockCacheTest.ClockTableFull in cache/lru_cache_test.cc).

## Seeing heap allocations
Run `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache`
on a build from before https://github.com/facebook/rocksdb/issues/11572 and on one with this change. The
32-byte size class is where small heap-allocated closures would typically land. Before, the interesting
bin statistics are:

```
size  nrequests
----  ---------
  32     578460
  64      24340
8192     578460
```
And after:
```
size  nrequests
----  ---------
  32  (insignificant)
  64      24370
8192     579130
```

## Performance test
Build with `make USE_CLANG=1 PORTABLE=0 DEBUG_LEVEL=0 -j32 cache_bench`

Run `./cache_bench -cache_type=hyper_clock_cache -ops_per_thread=5000000`
in the before and after configurations, simultaneously:

```
Before: Complete in 33.244 s; Rough parallel ops/sec = 2406442
After:  Complete in 32.773 s; Rough parallel ops/sec = 2441019
```

Reviewed By: jowlyzhang

Differential Revision: D47375092

Pulled By: pdillinger

fbshipit-source-id: 46f0f57257ddb374290a0a38c651764ea60ba410
Author: Peter Dillinger (committed by Facebook GitHub Bot)
Commit: b1b6f87fbe
Parent: bc0db33483
Files changed:
* cache/cache_bench_tool.cc (8 changed lines)
* cache/clock_cache.cc (102 changed lines)
* cache/clock_cache.h (40 changed lines)
* cache/lru_cache_test.cc (56 changed lines)
* unreleased_history/performance_improvements/hcc_perf (1 changed line)

## cache/cache_bench_tool.cc

```
@@ -77,6 +77,10 @@ DEFINE_bool(lean, false,
             "If true, no additional computation is performed besides cache "
             "operations.");
 
+DEFINE_bool(early_exit, false,
+            "Exit before deallocating most memory. Good for malloc stats, e.g."
+            "MALLOC_CONF=\"stats_print:true\"");
+
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -593,6 +597,10 @@ class CacheBench {
       }
       thread->latency_ns_hist.Add(timer.ElapsedNanos());
     }
+    if (FLAGS_early_exit) {
+      MutexLock l(thread->shared->GetMutex());
+      exit(0);
+    }
     if (handle) {
       cache_->Release(handle);
       handle = nullptr;
```

## cache/clock_cache.cc

```
@@ -195,7 +195,6 @@ inline void CorrectNearOverflow(uint64_t old_meta,
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                             uint64_t initial_countdown, bool* already_matches) {
   assert(*already_matches == false);
-
   // Optimistically transition the slot from "empty" to
   // "under construction" (no effect on other states)
   uint64_t old_meta = h.meta.fetch_or(
@@ -486,9 +485,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   // Do we have the available occupancy? Optimistically assume we do
   // and deal with it if we don't.
   size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
-  auto revert_occupancy_fn = [&]() {
-    occupancy_.fetch_sub(1, std::memory_order_relaxed);
-  };
   // Whether we over-committed and need an eviction to make up for it
   bool need_evict_for_occupancy =
       !derived.GrowIfNeeded(old_occupancy + 1, state);
@@ -501,7 +497,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     Status s = ChargeUsageMaybeEvictStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!s.ok()) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       return s;
     }
   } else {
@@ -509,7 +506,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!success) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       if (handle == nullptr) {
         // Don't insert the entry but still return ok, as if the entry
         // inserted into cache and evicted immediately.
@@ -522,11 +520,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       }
     }
   }
-  auto revert_usage_fn = [&]() {
-    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
-    // No underflow
-    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
-  };
 
   if (!use_standalone_insert) {
     // Attempt a table insert, but abort if we find an existing entry for the
@@ -551,10 +544,14 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       return Status::OK();
     }
     // Not inserted
-    revert_occupancy_fn();
+    // Revert occupancy
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
     // Maybe fall back on standalone insert
     if (handle == nullptr) {
-      revert_usage_fn();
+      // Revert usage
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+      // No underflow
+      assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
       // As if unrefed entry immdiately evicted
       proto.FreeData(allocator_);
       return Status::OK();
@@ -680,47 +677,52 @@ bool HyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 HyperClockTable::HandleImpl* HyperClockTable::DoInsert(
     const ClockHandleBasicData& proto, uint64_t initial_countdown,
     bool keep_ref, InsertState&) {
-  size_t probe = 0;
   bool already_matches = false;
   HandleImpl* e = FindSlot(
       proto.hashed_key,
       [&](HandleImpl* h) {
-        // FIXME: simplify and handle in abort_fn below?
-        bool inserted =
-            TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches);
-        return inserted || already_matches;
+        return TryInsert(proto, *h, initial_countdown, keep_ref,
+                         &already_matches);
       },
-      [&](HandleImpl* /*h*/) { return false; },
       [&](HandleImpl* h) {
-        h->displacements.fetch_add(1, std::memory_order_relaxed);
+        if (already_matches) {
+          // Stop searching & roll back displacements
+          Rollback(proto.hashed_key, h);
+          return true;
+        } else {
+          // Keep going
+          return false;
+        }
       },
-      probe);
-  if (e == nullptr) {
-    // Occupancy check and never abort FindSlot above should generally
-    // prevent this, except it's theoretically possible for other threads
-    // to evict and replace entries in the right order to hit every slot
-    // when it is populated. Assuming random hashing, the chance of that
-    // should be no higher than pow(kStrictLoadFactor, n) for n slots.
-    // That should be infeasible for roughly n >= 256, so if this assertion
-    // fails, that suggests something is going wrong.
-    assert(GetTableSize() < 256);
-    // WART/FIXME: need to roll back every slot
-    already_matches = true;
+      [&](HandleImpl* h, bool is_last) {
+        if (is_last) {
+          // Search is ending. Roll back displacements
+          Rollback(proto.hashed_key, h);
+        } else {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        }
+      });
+  if (already_matches) {
+    // Insertion skipped
+    return nullptr;
   }
-  if (!already_matches) {
+  if (e != nullptr) {
     // Successfully inserted
-    assert(e);
     return e;
   }
-  // Roll back displacements from failed table insertion
-  Rollback(proto.hashed_key, e);
-  // Insertion skipped
+  // Else, no available slot found. Occupancy check should generally prevent
+  // this, except it's theoretically possible for other threads to evict and
+  // replace entries in the right order to hit every slot when it is populated.
+  // Assuming random hashing, the chance of that should be no higher than
+  // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
+  // roughly n >= 256, so if this assertion fails, that suggests something is
+  // going wrong.
+  assert(GetTableSize() < 256);
   return nullptr;
 }
 
 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
     const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   HandleImpl* e = FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -780,7 +782,7 @@ HyperClockTable::HandleImpl* HyperClockTable::Lookup(
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 
   return e;
 }
@@ -873,7 +875,6 @@ void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
 #endif
 
 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   (void)FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -940,7 +941,7 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 }
 
 void HyperClockTable::ConstApplyToEntriesRange(
@@ -1005,10 +1006,10 @@ void HyperClockTable::EraseUnRefEntries() {
   }
 }
 
+template <typename MatchFn, typename AbortFn, typename UpdateFn>
 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
-    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
-    std::function<bool(HandleImpl*)> abort_fn,
-    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+    const UniqueId64x2& hashed_key, MatchFn match_fn, AbortFn abort_fn,
+    UpdateFn update_fn) {
   // NOTE: upper 32 bits of hashed_key[0] is used for sharding
   //
   // We use double-hashing probing. Every probe in the sequence is a
@@ -1022,20 +1023,21 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
   // TODO: we could also reconsider linear probing, though locality benefits
   // are limited because each slot is a full cache line
   size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
-  size_t current = ModTableSize(base + probe * increment);
-  while (probe <= length_bits_mask_) {
+  size_t first = ModTableSize(base);
+  size_t current = first;
+  bool is_last;
+  do {
     HandleImpl* h = &array_[current];
     if (match_fn(h)) {
-      probe++;
       return h;
     }
     if (abort_fn(h)) {
       return nullptr;
     }
-    probe++;
-    update_fn(h);
     current = ModTableSize(current + increment);
-  }
+    is_last = current == first;
+    update_fn(h, is_last);
+  } while (!is_last);
   // We looped back.
   return nullptr;
 }
```

## cache/clock_cache.h

```
@@ -549,7 +549,12 @@ class HyperClockTable : public BaseClockTable {
   size_t GetOccupancyLimit() const { return occupancy_limit_; }
 
 #ifndef NDEBUG
-  void TEST_ReleaseN(HandleImpl* h, size_t n);
+  size_t& TEST_MutableOccupancyLimit() const {
+    return const_cast<size_t&>(occupancy_limit_);
+  }
+
+  // Release N references
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif
 
  private:  // functions
@@ -558,22 +563,18 @@ class HyperClockTable : public BaseClockTable {
     return static_cast<size_t>(x) & length_bits_mask_;
   }
 
-  // Returns the first slot in the probe sequence, starting from the given
-  // probe number, with a handle e such that match(e) is true. At every
-  // step, the function first tests whether match(e) holds. If this is false,
-  // it evaluates abort(e) to decide whether the search should be aborted,
-  // and in the affirmative returns -1. For every handle e probed except
-  // the last one, the function runs update(e).
-  // The probe parameter is modified as follows. We say a probe to a handle
-  // e is aborting if match(e) is false and abort(e) is true. Then the final
-  // value of probe is one more than the last non-aborting probe during the
-  // call. This is so that that the variable can be used to keep track of
-  // progress across consecutive calls to FindSlot.
-  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
-                              std::function<bool(HandleImpl*)> match,
-                              std::function<bool(HandleImpl*)> stop,
-                              std::function<void(HandleImpl*)> update,
-                              size_t& probe);
+  // Returns the first slot in the probe sequence with a handle e such that
+  // match_fn(e) is true. At every step, the function first tests whether
+  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
+  // whether the search should be aborted, and if so, FindSlot immediately
+  // returns nullptr. For every handle e that is not a match and not aborted,
+  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
+  // slot will be the last probed because the next would cycle back to the first
+  // slot probed. This function uses templates instead of std::function to
+  // minimize the risk of heap-allocated closures being created.
+  template <typename MatchFn, typename AbortFn, typename UpdateFn>
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, MatchFn match_fn,
+                              AbortFn abort_fn, UpdateFn update_fn);
 
   // Re-decrement all displacements in probe path starting from beginning
   // until (not including) the given handle
@@ -704,9 +705,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
     return Lookup(key, hashed_key);
   }
 
+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return table_.TEST_MutableOccupancyLimit();
+  }
   // Acquire/release N references
   void TEST_RefN(HandleImpl* handle, size_t n);
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif
 
  private:  // data
   Table table_;
```

## cache/lru_cache_test.cc

```
@@ -715,6 +715,62 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
   ASSERT_EQ(val.deleted, 1);
 }
 
+TEST_F(ClockCacheTest, ClockTableFull) {
+  // Force clock cache table to fill up (not usually allowed) in order
+  // to test full probe sequence that is theoretically possible due to
+  // parallel operations
+  NewShard(6, /*strict_capacity_limit*/ false);
+  size_t size = shard_->GetTableAddressCount();
+  ASSERT_LE(size + 3, 256);  // for using char keys
+  // Modify occupancy and capacity limits to attempt insert on full
+  shard_->TEST_MutableOccupancyLimit() = size + 100;
+  shard_->SetCapacity(size + 100);
+
+  DeleteCounter val;
+  std::vector<HandleImpl*> handles;
+  // NOTE: the three extra insertions should create standalone entries
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper,
+                             1, &handles.emplace_back(),
+                             Cache::Priority::HIGH));
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    HandleImpl* h = shard_->Lookup(TestKey(hkey), hkey);
+    if (i < size) {
+      ASSERT_NE(h, nullptr);
+      shard_->Release(h);
+    } else {
+      // Standalone entries not visible by lookup
+      ASSERT_EQ(h, nullptr);
+    }
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    ASSERT_NE(handles[i], nullptr);
+    shard_->Release(handles[i]);
+    if (i < size) {
+      // Everything still in cache
+      ASSERT_EQ(val.deleted, 0);
+    } else {
+      // Standalone entries freed on release
+      ASSERT_EQ(val.deleted, i + 1 - size);
+    }
+  }
+
+  for (size_t i = size + 3; i > 0; --i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i - 1));
+    shard_->Erase(TestKey(hkey), hkey);
+    if (i - 1 > size) {
+      ASSERT_EQ(val.deleted, 3);
+    } else {
+      ASSERT_EQ(val.deleted, 3 + size - (i - 1));
+    }
+  }
+}
+
 // This test is mostly to exercise some corner case logic, by forcing two
 // keys to have the same hash, and more
 TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
```

## unreleased_history/performance_improvements/hcc_perf

```
@@ -0,0 +1 @@
+Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
```