Lock-free Lookup and Release in ClockCache (#10347)

Summary:
This is a prototype of a partially lock-free version of ClockCache. Roughly speaking, reads are lock-free and writes are lock-based:
- Lookup is lock-free.
- Release is lock-free, unless (i) no references to the element are left and (ii) it was marked for deletion or ``erase_if_last_ref`` is set.
- Insert and Erase still use a per-shard lock.
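For intuition, the lock-free fast paths hinge on packing the reference counts and a "will be deleted" mark into a single atomic word per handle, so taking or dropping a reference is plain atomic arithmetic on the happy path. The sketch below is a simplified, standalone illustration of that idea; the field layout and names are hypothetical, and the real handle additionally distinguishes external, internal, and exclusive references (see cache/clock_cache.h below).

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

struct PackedRefs {
  // Bits 0..29: number of outstanding references.
  // Bit 30: a writer holds the slot exclusively.
  // Bit 31: the entry is marked for deletion.
  static constexpr uint32_t kRefUnit = 1u;
  static constexpr uint32_t kRefMask = (1u << 30) - 1;
  static constexpr uint32_t kExclusive = 1u << 30;
  static constexpr uint32_t kWillBeDeleted = 1u << 31;

  std::atomic<uint32_t> word{0};

  // Lock-free acquire: optimistically add a reference, then back out if the
  // entry is exclusively held or marked for deletion.
  bool TryRef() {
    uint32_t after = word.fetch_add(kRefUnit) + kRefUnit;
    if (after & (kExclusive | kWillBeDeleted)) {
      word.fetch_sub(kRefUnit);
      return false;
    }
    return true;
  }

  // Lock-free release: a single atomic subtraction tells the caller whether
  // this was the last reference and whether deletion is pending, so the slow
  // (locked) cleanup path is taken only in that case.
  bool ReleaseNeedsCleanup() {
    uint32_t after = word.fetch_sub(kRefUnit) - kRefUnit;
    bool last_ref = (after & kRefMask) == 0;
    bool marked = (after & kWillBeDeleted) != 0;
    return last_ref && marked;
  }
};

int main() {
  PackedRefs r;
  assert(r.TryRef());
  r.word.fetch_or(PackedRefs::kWillBeDeleted);  // someone erased the key
  assert(!r.TryRef());                          // new readers are refused
  assert(r.ReleaseNeedsCleanup());              // last ref + marked => cleanup
  return 0;
}
```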

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10347

Test Plan:
- ``make -j24 check``
- ``make -j24 CRASH_TEST_EXT_ARGS="--duration=960 --cache_type=clock_cache --cache_size=1073741824 --block_size=16384" blackbox_crash_test_with_atomic_flush``

Reviewed By: pdillinger

Differential Revision: D37898776

Pulled By: guidotag

fbshipit-source-id: 6418fd980f786d69b871bf2fe959398e44cd3d80
Branch: main
Author: Guido Tagliavini Ponce (committed by Facebook GitHub Bot)
Parent: faa0f9723c
Commit: efdb428edc

Changed files:
  1. cache/clock_cache.cc (434 lines changed)
  2. cache/clock_cache.h (534 lines changed)
  3. cache/fast_lru_cache.cc (1 line changed)

cache/clock_cache.cc

@@ -37,133 +37,191 @@ ClockHandleTable::ClockHandleTable(int hash_bits)
}

ClockHandleTable::~ClockHandleTable() {
  ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize(),
                      true);
}

ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) {
  uint32_t probe = 0;
  int slot = FindElement(key, hash, probe);
  return (slot == -1) ? nullptr : &array_[slot];
}

ClockHandle* ClockHandleTable::Insert(ClockHandle* h, ClockHandle** old) {
  uint32_t probe = 0;
  int slot = FindElementOrAvailableSlot(h->key(), h->hash, probe);
  *old = nullptr;
  if (slot == -1) {
    // The key is not already present, and there's no available slot to place
    // the new copy.
    return nullptr;
  }

  if (!array_[slot].IsElement()) {
    // The slot is empty or is a tombstone.
    ClockHandle* new_entry = &array_[slot];
    new_entry->InternalToExclusiveRef();
    Assign(new_entry, h);
    if (new_entry->displacements == 0) {
      // The slot was empty.
      return new_entry;
    }
    // It used to be a tombstone, so there may already be a copy of the
    // key in the table.
    slot = FindElement(h->key(), h->hash, probe);
    if (slot == -1) {
      // Nope, no existing copy of the key.
      return new_entry;
    }
    ClockHandle* old_entry = &array_[slot];
    old_entry->ReleaseInternalRef();
    *old = old_entry;
    return new_entry;
  } else {
    // There is an existing copy of the key.
    ClockHandle* old_entry = &array_[slot];
    old_entry->ReleaseInternalRef();
    *old = old_entry;
    // Find an available slot for the new element.
    old_entry->displacements++;
    slot = FindAvailableSlot(h->key(), probe);
    if (slot == -1) {
      // No available slots.
      return nullptr;
    }
    ClockHandle* new_entry = &array_[slot];
    new_entry->InternalToExclusiveRef();
    Assign(new_entry, h);
    return new_entry;
  }
}

void ClockHandleTable::Remove(ClockHandle* h) {
  assert(!h->IsInClock());  // Already off clock.
  uint32_t probe = 0;
  FindSlot(
      h->key(), [&](ClockHandle* e) { return e == h; },
      [&](ClockHandle* /*e*/) { return false; },
      [&](ClockHandle* e) { e->displacements--; }, probe);
  h->SetWillBeDeleted(false);
  h->SetIsElement(false);
  occupancy_--;
}

void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) {
  // DON'T touch displacements and refs.
  dst->value = src->value;
  dst->deleter = src->deleter;
  dst->hash = src->hash;
  dst->total_charge = src->total_charge;
  dst->key_data = src->key_data;
  dst->flags.store(0);
  dst->SetIsElement(true);
  dst->SetClockPriority(ClockHandle::ClockPriority::NONE);
  dst->SetCachePriority(src->GetCachePriority());
  occupancy_++;
}

int ClockHandleTable::FindElement(const Slice& key, uint32_t hash,
                                  uint32_t& probe) {
  return FindSlot(
      key,
      [&](ClockHandle* h) {
        if (h->TryInternalRef()) {
          if (h->Matches(key, hash)) {
            return true;
          }
          h->ReleaseInternalRef();
        }
        return false;
      },
      [&](ClockHandle* h) { return h->displacements == 0; },
      [&](ClockHandle* /*h*/) {}, probe);
}

int ClockHandleTable::FindAvailableSlot(const Slice& key, uint32_t& probe) {
  int slot = FindSlot(
      key,
      [&](ClockHandle* h) {
        if (h->TryInternalRef()) {
          if (!h->IsElement()) {
            return true;
          }
          h->ReleaseInternalRef();
        }
        return false;
      },
      [&](ClockHandle* /*h*/) { return false; },
      [&](ClockHandle* h) { h->displacements++; }, probe);
  if (slot == -1) {
    Rollback(key, probe);
  }
  return slot;
}

int ClockHandleTable::FindElementOrAvailableSlot(const Slice& key,
                                                 uint32_t hash,
                                                 uint32_t& probe) {
  int slot = FindSlot(
      key,
      [&](ClockHandle* h) {
        if (h->TryInternalRef()) {
          if (!h->IsElement() || h->Matches(key, hash)) {
            return true;
          }
          h->ReleaseInternalRef();
        }
        return false;
      },
      [&](ClockHandle* /*h*/) { return false; },
      [&](ClockHandle* h) { h->displacements++; }, probe);
  if (slot == -1) {
    Rollback(key, probe);
  }
  return slot;
}

int ClockHandleTable::FindSlot(const Slice& key,
                               std::function<bool(ClockHandle*)> match,
                               std::function<bool(ClockHandle*)> abort,
                               std::function<void(ClockHandle*)> update,
                               uint32_t& probe) {
  // We use double-hashing probing. Every probe in the sequence is a
  // pseudorandom integer, computed as a linear function of two random hashes,
  // which we call base and increment. Specifically, the i-th probe is base + i
  // * increment modulo the table size.
  uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1));
  // We use an odd increment, which is relatively prime with the power-of-two
  // table size. This implies that we cycle back to the first probe only
  // after probing every slot exactly once.
  uint32_t increment =
      ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1);
  uint32_t current = ModTableSize(base + probe * increment);
  while (true) {
    ClockHandle* h = &array_[current];
    if (current == base && probe > 0) {
      // We looped back.
      return -1;
    }
    if (match(h)) {
      probe++;
      return current;
    }
    if (abort(h)) {
      return -1;
    }
    probe++;
    update(h);
    current = ModTableSize(current + increment);
  }
}

void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) {
  uint32_t current = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1));
  uint32_t increment =
      ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1);
  for (uint32_t i = 0; i < probe; i++) {
    array_[current].displacements--;
    current = ModTableSize(current + increment);
  }
}
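As the FindSlot comment above notes, the probe sequence is double hashing with an odd increment, so the increment is relatively prime to the power-of-two table size and the sequence visits every slot exactly once before looping back to base. A tiny standalone check of that property, with illustrative values only:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t kTableSize = 1u << 4;  // any power of two
  const uint32_t kMask = kTableSize - 1;
  const uint32_t base = 5;
  const uint32_t increment = 7;  // any odd increment works
  std::vector<bool> seen(kTableSize, false);
  uint32_t current = base & kMask;
  for (uint32_t i = 0; i < kTableSize; i++) {
    assert(!seen[current]);  // no slot is visited twice within one cycle
    seen[current] = true;
    current = (current + increment) & kMask;
  }
  assert(current == (base & kMask));  // we loop back only after a full pass
  return 0;
}
```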
@@ -176,8 +234,7 @@ ClockCacheShard::ClockCacheShard(
      clock_pointer_(0),
      table_(
          CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)),
      usage_(0) {
  set_metadata_charge_policy(metadata_charge_policy);
}

@@ -185,22 +242,16 @@ void ClockCacheShard::EraseUnRefEntries() {
  autovector<ClockHandle> last_reference_list;
  {
    DMutexLock l(mutex_);
    table_.ApplyToEntriesRange(
        [this, &last_reference_list](ClockHandle* h) {
          // Externally unreferenced element.
          last_reference_list.push_back(*h);
          Evict(h);
        },
        0, table_.GetTableSize(), true);
  }

  // Free the entry outside of the mutex for performance reasons.
  for (auto& h : last_reference_list) {
    h.FreeData();
  }

@@ -238,45 +289,60 @@ void ClockCacheShard::ApplyToSomeEntries(
        callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
                 h->deleter);
      },
      index_begin, index_end, false);
}

void ClockCacheShard::ClockOff(ClockHandle* h) {
  h->SetClockPriority(ClockHandle::ClockPriority::NONE);
}

void ClockCacheShard::ClockOn(ClockHandle* h) {
  assert(!h->IsInClock());
  bool is_high_priority =
      h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH;
  h->SetClockPriority(static_cast<ClockHandle::ClockPriority>(
      is_high_priority * ClockHandle::ClockPriority::HIGH +
      (1 - is_high_priority) * ClockHandle::ClockPriority::MEDIUM));
}

void ClockCacheShard::Evict(ClockHandle* h) {
  ClockOff(h);
  table_.Remove(h);
  assert(usage_ >= h->total_charge);
  usage_ -= h->total_charge;
}

void ClockCacheShard::EvictFromClock(size_t charge,
                                     autovector<ClockHandle>* deleted) {
  // TODO(Guido) When an element is in the probe sequence of a
  // hot element, it will be hard to get an exclusive ref.
  // We may need a mechanism to avoid that an element sits forever
  // in cache waiting to be evicted.
  assert(charge <= capacity_);
  uint32_t max_iterations = table_.GetTableSize();
  while (usage_ + charge > capacity_ && max_iterations--) {
    ClockHandle* h = &table_.array_[clock_pointer_];
    clock_pointer_ = table_.ModTableSize(clock_pointer_ + 1);

    if (h->TryExclusiveRef()) {
      if (!h->IsInClock() && h->IsElement()) {
        // We adjust the clock priority to make the element evictable again.
        // Why? Elements that are not in clock are either currently
        // externally referenced or used to be---because we are holding an
        // exclusive ref, we know we are in the latter case. This can only
        // happen when the last external reference to an element was released,
        // and the element was not immediately removed.
        ClockOn(h);
      }

      if (h->GetClockPriority() == ClockHandle::ClockPriority::LOW) {
        deleted->push_back(*h);
        Evict(h);
      } else if (h->GetClockPriority() > ClockHandle::ClockPriority::LOW) {
        h->DecreaseClockPriority();
      }

      h->ReleaseExclusiveRef();
    }
  }
}
@ -309,13 +375,14 @@ void ClockCacheShard::SetCapacity(size_t capacity) {
EvictFromClock(0, &last_reference_list); EvictFromClock(0, &last_reference_list);
} }
// Free the entries here outside of mutex for performance reasons. // Free the entry outside of the mutex for performance reasons.
for (auto& h : last_reference_list) { for (auto& h : last_reference_list) {
h.FreeData(); h.FreeData();
} }
} }
void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
assert(false); // Not supported. TODO(Guido) Support it?
DMutexLock l(mutex_); DMutexLock l(mutex_);
strict_capacity_limit_ = strict_capacity_limit; strict_capacity_limit_ = strict_capacity_limit;
} }
@ -343,9 +410,10 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
autovector<ClockHandle> last_reference_list; autovector<ClockHandle> last_reference_list;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
assert(table_.GetOccupancy() <= table_.GetOccupancyLimit()); assert(table_.GetOccupancy() <= table_.GetOccupancyLimit());
// Free the space following strict clock policy until enough space // Free the space following strict clock policy until enough space
// is freed or the clock list is empty. // is freed or there are no evictable elements.
EvictFromClock(tmp.total_charge, &last_reference_list); EvictFromClock(tmp.total_charge, &last_reference_list);
if ((usage_ + tmp.total_charge > capacity_ && if ((usage_ + tmp.total_charge > capacity_ &&
(strict_capacity_limit_ || handle == nullptr)) || (strict_capacity_limit_ || handle == nullptr)) ||
@@ -376,30 +444,29 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
      usage_ += h->total_charge;
      if (old != nullptr) {
        s = Status::OkOverwritten();
        assert(!old->WillBeDeleted());
        old->SetWillBeDeleted(true);
        // Try to evict the old copy of the element.
        if (old->TryExclusiveRef()) {
          last_reference_list.push_back(*old);
          Evict(old);
          old->ReleaseExclusiveRef();
        }
      }
      if (handle == nullptr) {
        // If the user didn't provide a handle, no reference is taken,
        // so we make the element evictable.
        ClockOn(h);
        h->ReleaseExclusiveRef();
      } else {
        // The caller already holds a ref.
        h->ExclusiveToExternalRef();
        *handle = reinterpret_cast<Cache::Handle*>(h);
      }
    }
  }

  // Free the entry outside of the mutex for performance reasons.
  for (auto& h : last_reference_list) {
    h.FreeData();
  }
@@ -407,95 +474,102 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
  return s;
}

Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
  ClockHandle* h = nullptr;
  h = table_.Lookup(key, hash);
  if (h != nullptr) {
    // TODO(Guido) Comment from #10347: Here it looks like we have three atomic
    // updates where it would be possible to combine into one CAS (more
    // metadata under one atomic field) or maybe two atomic updates (one
    // arithmetic, one bitwise). Something to think about optimizing.
    h->InternalToExternalRef();
    h->SetHit();
    // The handle is now referenced, so we take it out of clock.
    ClockOff(h);
  }
  return reinterpret_cast<Cache::Handle*>(h);
}

bool ClockCacheShard::Ref(Cache::Handle* h) {
  ClockHandle* e = reinterpret_cast<ClockHandle*>(h);
  assert(e->HasExternalRefs());
  return e->TryExternalRef();
}

bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
  // In contrast with LRUCache's Release, this function won't delete the handle
  // when the reference is the last one and the cache is above capacity. Space
  // is only freed up by EvictFromClock (called by Insert when space is needed)
  // and Erase.
  if (handle == nullptr) {
    return false;
  }

  ClockHandle* h = reinterpret_cast<ClockHandle*>(handle);
  uint32_t hash = h->hash;
  uint32_t refs = h->ReleaseExternalRef();
  bool last_reference = !(refs & ClockHandle::EXTERNAL_REFS);
  bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED;

  if (last_reference && (will_be_deleted || erase_if_last_ref)) {
    // At this point we want to evict the element, so we need to take
    // a lock and an exclusive reference. But there's a problem:
    // as soon as we released the last reference, an Insert or Erase could've
    // replaced this element, and by the time we take the lock and ref
    // we could potentially be referencing a different element.
    // Thus, before evicting the (potentially different) element, we need to
    // re-check that it's unreferenced and marked as WILL_BE_DELETED, so the
    // eviction is safe. Additionally, we check that the hash doesn't change,
    // which will detect, most of the time, whether the element is a different
    // one. The bottom line is that we only guarantee that the input handle
    // will be deleted, and occasionally also another handle, but in any case
    // all deleted handles are safe to delete.
    // TODO(Guido) With lock-free inserts and deletes we may be able to
    // "atomically" transition to an exclusive ref, without creating a
    // deadlock.
    ClockHandle copy;
    {
      DMutexLock l(mutex_);
      if (h->TrySpinExclusiveRef()) {
        will_be_deleted = h->refs & ClockHandle::WILL_BE_DELETED;
        // Check that it's still safe to delete.
        if (h->IsElement() && (will_be_deleted || erase_if_last_ref) &&
            h->hash == hash) {
          copy = *h;
          Evict(h);
        }
        h->ReleaseExclusiveRef();
      } else {
        // An external ref was detected.
        return false;
      }
    }

    // Free the entry outside of the mutex for performance reasons.
    copy.FreeData();
    return true;
  }

  return false;
}

void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
  ClockHandle copy;
  bool last_reference = false;
  {
    DMutexLock l(mutex_);
    ClockHandle* h = table_.Lookup(key, hash);
    if (h != nullptr) {
      h->SetWillBeDeleted(true);
      h->ReleaseInternalRef();
      if (h->TryExclusiveRef()) {
        copy = *h;
        Evict(h);
        last_reference = true;
        h->ReleaseExclusiveRef();
      }
    }
  }
  // Free the entry outside of the mutex for performance reasons.
  if (last_reference) {
    copy.FreeData();
  }
@@ -507,9 +581,25 @@ size_t ClockCacheShard::GetUsage() const {
}

size_t ClockCacheShard::GetPinnedUsage() const {
  // Computes the pinned usage by scanning the whole hash table. This
  // is slow, but avoids keeping an exact counter on the clock usage,
  // i.e., the number of not externally referenced elements.
  // Why avoid this? Because Lookup removes elements from the clock
  // list, so it would need to update the pinned usage every time,
  // which creates additional synchronization costs.
  DMutexLock l(mutex_);

  size_t clock_usage = 0;

  table_.ConstApplyToEntriesRange(
      [&clock_usage](ClockHandle* h) {
        if (h->HasExternalRefs()) {
          clock_usage += h->total_charge;
        }
      },
      0, table_.GetTableSize(), true);

  return clock_usage;
}

std::string ClockCacheShard::GetPrintableOptions() const {

cache/clock_cache.h

@@ -10,6 +10,8 @@
#pragma once

#include <array>
#include <atomic>
#include <cstdint>
#include <memory>
#include <string>
@@ -27,116 +29,254 @@ namespace ROCKSDB_NAMESPACE {
namespace clock_cache {

// Block cache implementation using a lock-free open-address hash table
// and clock eviction.

///////////////////////////////////////////////////////////////////////////////
//                             Part 1: Handles
//
// Every slot in the hash table is a ClockHandle. A handle can be in a few
// different states, which stem from the fact that handles can be externally
// referenced and, thus, can't always be immediately evicted when a delete
// operation is executed or when they are replaced by a new version (via an
// insert of the same key). Concretely, the state of a handle is defined by the
// following two properties:
// (R) Externally referenced: A handle can be referenced externally, or not.
//     Importantly, a handle can be evicted if and only if it's not
//     referenced. In particular, when a handle becomes referenced, it's
//     temporarily taken out of clock until all references to it are released.
// (M) Marked for deletion (or invisible): A handle is marked for deletion
//     when an operation attempts to delete it, but the handle is externally
//     referenced, so it can't be immediately deleted. When this mark is
//     placed, lookups will no longer be able to find it. Consequently, no
//     more external references will be taken to the handle. When a handle is
//     marked for deletion, we also say it's invisible.
// These properties induce 4 different states, with transitions defined as
// follows:
// - Not M --> M: When a handle is deleted or replaced by a new version, but
//     not immediately evicted.
// - M --> not M: This cannot happen. Once a handle is marked for deletion,
//     there is no way back.
// - R --> not R: When all references to a handle are released.
// - Not R --> R: When an unreferenced handle becomes referenced. This can only
//     happen if the handle is visible, since references to a handle can only
//     be created when it's visible.
//
///////////////////////////////////////////////////////////////////////////////
//                      Part 2: Hash table structure
//
// Internally, the cache uses an open-addressed hash table to index the
// handles. We use tombstone counters to keep track of displacements. Probes
// are generated with double hashing (but the code can be easily modified to
// use other probing schemes, like linear probing). Because of the tombstones
// and the two possible visibility states of a handle, the table slots (we use
// the word "slot" to refer to handles that are not necessarily valid
// key-value elements) can be in 4 different states:
// 1. Visible element: The slot contains an element in not M state.
// 2. To-be-deleted element: The slot contains an element in M state.
// 3. Tombstone: The slot doesn't contain an element, but there is some other
//     element that probed this slot during its insertion.
// 4. Empty: The slot is unused.
// When a to-be-deleted element is finally removed from the table, it can
// either transition to a tombstone or to an empty slot, depending on the
// number of displacements of the slot. In any case, the slot becomes
// available. When a handle is inserted into that slot, it becomes a visible
// element again.
//
///////////////////////////////////////////////////////////////////////////////
//                        Part 3: The clock algorithm
//
// We maintain a circular buffer with the handles available for eviction,
// which the clock algorithm traverses (using a "clock pointer") to pick the
// next victim. We use the hash table array as the circular buffer, and mark
// the handles that are evictable. For this we use different clock flags,
// namely NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and
// HIGH represent how close an element is to being evicted, LOW being
// immediately evictable. NONE means the slot is not evictable. This is due to
// one of the following reasons:
// (i) the slot doesn't contain an element, or
// (ii) the slot contains an element that is in R state, or
// (iii) the slot contains an element that was in R state but it's
//     not any more, and the clock pointer has not swept through the
//     slot since the element stopped being referenced.
//
// The priority NONE is really only important for case (iii), as in the other
// two cases there are other metadata fields that already capture the state.
// When an element stops being referenced (and is not deleted), the clock
// algorithm must acknowledge this, and assign a non-NONE priority to make
// the element evictable again.
//
///////////////////////////////////////////////////////////////////////////////
//                         Part 4: Synchronization
//
// We provide the following synchronization guarantees:
// - Lookup is lock-free.
// - Release is lock-free, unless (i) no references to the element are left,
//   and (ii) it was marked for deletion or the user wishes to delete if
//   releasing the last reference.
// - Insert and Erase still use a per-shard lock.
//
// Our hash table is lock-free, in the sense that system-wide progress is
// guaranteed, i.e., some thread is always able to make progress.
//
///////////////////////////////////////////////////////////////////////////////
// The load factor p is a real number in (0, 1) such that at all
// times at most a fraction p of all slots, without counting tombstones,
// are occupied by elements. This means that the probability that a
// random probe hits an empty slot is at least 1 - p, and thus at most
// 1 / (1 - p) probes are required on average. For example, p = 35% implies
// that between 1 and 2 probes are needed on average (bear in mind that this
// reasoning doesn't consider the effects of clustering over time).
// Because the size of the hash table is always rounded up to the next
// power of 2, p is really an upper bound on the actual load factor---the
// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
// but bear in mind that slots only hold metadata, not actual values.
// Since space cost is dominated by the values (the LSM blocks),
// overprovisioning the table with metadata only increases the total cache
// space usage by a tiny fraction.
constexpr double kLoadFactor = 0.35;

// The user can exceed kLoadFactor if the sizes of the inserted values don't
// match estimated_value_size, or if strict_capacity_limit == false. To keep
// performance from plunging, we set a strict upper bound on the load factor.
constexpr double kStrictLoadFactor = 0.7;

// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
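A quick sanity check of the probe-count estimate above, under the simplifying assumption of independent, uniformly random probes, evaluated at the two load factor constants defined here:

```cpp
#include <cstdio>

int main() {
  // Expected probes to hit an empty slot when at most a fraction p of the
  // slots are occupied, assuming independent uniformly random probes.
  for (double p : {0.35, 0.7}) {
    std::printf("p = %.2f -> at most %.2f probes on average\n", p,
                1.0 / (1.0 - p));
  }
  return 0;  // prints ~1.54 probes for p = 0.35 and ~3.33 for p = 0.7
}
```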
// An experimental (under development!) alternative to LRUCache.
struct ClockHandle {
  void* value;
  Cache::DeleterFn deleter;
  uint32_t hash;
  size_t total_charge;
  std::array<char, kCacheKeySize> key_data;

  static constexpr uint8_t kExternalRefsOffset = 0;
  static constexpr uint8_t kSharedRefsOffset = 15;
  static constexpr uint8_t kExclusiveRefOffset = 30;
  static constexpr uint8_t kWillBeDeletedOffset = 31;

  enum Refs : uint32_t {
    // Number of external references to the slot.
    EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
                    << kExternalRefsOffset,  // Bits 0, ..., 14
    // Number of internal references plus external references to the slot.
    SHARED_REFS = ((uint32_t{1} << 15) - 1)
                  << kSharedRefsOffset,  // Bits 15, ..., 29
    // Whether a thread has an exclusive reference to the slot.
    EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset,  // Bit 30
    // Whether the handle will be deleted soon. When this bit is set, new
    // internal or external references to this handle stop being accepted.
    // There is an exception: external references can be created from
    // existing external references, or converting from existing internal
    // references.
    WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset  // Bit 31

    // Shared references (i.e., external and internal references) and
    // exclusive references are our custom implementation of RW locks---
    // external and internal references are read locks, and exclusive
    // references are write locks. We prioritize readers, which never block;
    // in fact, they don't even use compare-and-swap operations. Using our own
    // implementation of RW locks allows us to save many atomic operations by
    // packing data more carefully. In particular:
    // - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an
    //   internal reference into an external reference in a single atomic
    //   arithmetic operation.
    // - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to take
    //   a shared reference and check whether the entry is marked for deletion
    //   in a single atomic arithmetic operation.
  };

  static constexpr uint32_t kOneInternalRef = 0x8000;
  static constexpr uint32_t kOneExternalRef = 0x8001;

  std::atomic<uint32_t> refs;

  static constexpr uint8_t kIsElementOffset = 1;
  static constexpr uint8_t kClockPriorityOffset = 2;
  static constexpr uint8_t kIsHitOffset = 4;
  static constexpr uint8_t kCachePriorityOffset = 5;

  enum Flags : uint8_t {
    // Whether the slot is in use by an element.
    IS_ELEMENT = 1 << kIsElementOffset,
    // Clock priorities. Represents how close a handle is from being evictable.
    CLOCK_PRIORITY = 3 << kClockPriorityOffset,
    // Whether the handle has been looked up after its insertion.
    HAS_HIT = 1 << kIsHitOffset,
    // The value of Cache::Priority for the handle.
    CACHE_PRIORITY = 1 << kCachePriorityOffset,
  };

  std::atomic<uint8_t> flags;

  enum ClockPriority : uint8_t {
    NONE = (0 << kClockPriorityOffset),
    LOW = (1 << kClockPriorityOffset),
    MEDIUM = (2 << kClockPriorityOffset),
    HIGH = (3 << kClockPriorityOffset)
    // Priority is NONE if and only if
    // (i) the handle is not an element, or
    // (ii) the handle is an element but it is being referenced.
  };

  // The number of elements that hash to this slot or a lower one, but wind
  // up in this slot or a higher one.
  std::atomic<uint32_t> displacements;

  // Synchronization rules:
  // - Use a shared reference when we want the handle's identity
  //   members (key_data, hash, value and IS_ELEMENT flag) to
  //   remain untouched, but not modify them. The only updates
  //   that a shared reference allows are:
  //   * set CLOCK_PRIORITY to NONE;
  //   * set the HAS_HIT bit.
  //   Notice that these two types of updates are idempotent, so
  //   they don't require synchronization across shared references.
  // - Use an exclusive reference when we want identity members
  //   to remain untouched, as well as modify any identity member
  //   or flag.
  // - displacements can be modified without holding a reference.
  // - refs is only modified through appropriate functions to
  //   take or release references.

  ClockHandle()
      : value(nullptr),
        deleter(nullptr),
        hash(0),
        total_charge(0),
        refs(0),
        flags(0),
        displacements(0) {
    SetWillBeDeleted(false);
    SetIsElement(false);
    SetClockPriority(ClockPriority::NONE);
    SetCachePriority(Cache::Priority::LOW);
    key_data.fill(0);
  }

  ClockHandle(const ClockHandle& other) { *this = other; }

  void operator=(const ClockHandle& other) {
    value = other.value;
    deleter = other.deleter;
    hash = other.hash;
    total_charge = other.total_charge;
    refs.store(other.refs);
    key_data = other.key_data;
    flags.store(other.flags);
    SetWillBeDeleted(other.WillBeDeleted());
    SetIsElement(other.IsElement());
    SetClockPriority(other.GetClockPriority());
    SetCachePriority(other.GetCachePriority());
    displacements.store(other.displacements);
  }

  Slice key() const { return Slice(key_data.data(), kCacheKeySize); }

  bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }

  bool IsElement() const { return flags & IS_ELEMENT; }

@@ -144,7 +284,7 @@ struct ClockHandle {
    if (is_element) {
      flags |= IS_ELEMENT;
    } else {
      flags &= static_cast<uint8_t>(~IS_ELEMENT);
    }
  }

@@ -152,7 +292,7 @@ struct ClockHandle {
  void SetHit() { flags |= HAS_HIT; }

  bool IsInClock() const {
    return GetClockPriority() != ClockHandle::ClockPriority::NONE;
  }

@@ -164,7 +304,7 @@ struct ClockHandle {
    if (priority == Cache::Priority::HIGH) {
      flags |= Flags::CACHE_PRIORITY;
    } else {
      flags &= static_cast<uint8_t>(~Flags::CACHE_PRIORITY);
    }
  }

@@ -173,7 +313,7 @@ struct ClockHandle {
  }

  void SetClockPriority(ClockPriority priority) {
    flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
    flags |= priority;
  }

@@ -182,14 +322,13 @@ struct ClockHandle {
                 kClockPriorityOffset;
    assert(p > 0);
    p--;
    flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
    ClockPriority new_priority =
        static_cast<ClockPriority>(p << kClockPriorityOffset);
    flags |= new_priority;
  }

  void FreeData() {
    if (deleter) {
      (*deleter)(key(), value);
    }

@@ -232,17 +371,131 @@ struct ClockHandle {
    return total_charge - meta_charge;
  }

  inline bool IsEmpty() const {
    return !this->IsElement() && this->displacements == 0;
  }

  inline bool IsTombstone() const {
    return !this->IsElement() && this->displacements > 0;
  }

  inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
    return this->IsElement() && this->hash == some_hash &&
           this->key() == some_key;
  }

  bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }

  void SetWillBeDeleted(bool will_be_deleted) {
    if (will_be_deleted) {
      refs |= WILL_BE_DELETED;
    } else {
      refs &= ~WILL_BE_DELETED;
    }
  }

  // The following functions are for taking and releasing refs.

  // Tries to take an external ref. Returns true iff it succeeds.
  inline bool TryExternalRef() {
    if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
      return true;
    }
    refs -= kOneExternalRef;
    return false;
  }

  // Releases an external ref. Returns the new value (this is useful to
  // avoid an extra atomic read).
  inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; }

  // Take an external ref, assuming there is already one external ref
  // to the handle.
  void Ref() {
    // TODO(Guido) Is it okay to assume that the existing external reference
    // survives until this function returns?
    refs += kOneExternalRef;
  }

  // Tries to take an internal ref. Returns true iff it succeeds.
  inline bool TryInternalRef() {
    if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
      return true;
    }
    refs -= kOneInternalRef;
    return false;
  }

  inline void ReleaseInternalRef() { refs -= kOneInternalRef; }

  // Tries to take an exclusive ref. Returns true iff it succeeds.
  inline bool TryExclusiveRef() {
    uint32_t will_be_deleted = refs & WILL_BE_DELETED;
    uint32_t expected = will_be_deleted;
    return refs.compare_exchange_strong(expected,
                                        EXCLUSIVE_REF | will_be_deleted);
  }

  // Repeatedly tries to take an exclusive reference, but stops as soon
  // as an external reference is detected (in this case the wait would
  // presumably be too long).
  inline bool TrySpinExclusiveRef() {
    uint32_t expected = 0;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      if (expected & EXTERNAL_REFS) {
        return false;
      }
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = will_be_deleted;
    }
    return true;
  }

  inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }

  // The following functions are for upgrading and downgrading refs.
  // They guarantee atomicity, i.e., no exclusive refs to the handle
  // can be taken by a different thread during the conversion.

  inline void ExclusiveToInternalRef() {
    refs += kOneInternalRef;
    ReleaseExclusiveRef();
  }

  inline void ExclusiveToExternalRef() {
    refs += kOneExternalRef;
    ReleaseExclusiveRef();
  }

  // TODO(Guido) Do we want to bound the loop and prepare the
  // algorithms to react to a failure?
  inline void InternalToExclusiveRef() {
    uint32_t expected = kOneInternalRef;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = kOneInternalRef | will_be_deleted;
    }
  }

  inline void InternalToExternalRef() {
    refs += kOneExternalRef - kOneInternalRef;
  }

  // TODO(Guido) Same concern.
  inline void ExternalToExclusiveRef() {
    uint32_t expected = kOneExternalRef;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = kOneExternalRef | will_be_deleted;
    }
  }
};  // struct ClockHandle
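To see how these reference operations are meant to compose on a lookup-style path, here is a toy, standalone illustration of the internal-to-external conversion trick. It reuses the 0x8000/0x8001 constants from above but is not the RocksDB code:

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

// Toy layout: bits 0..14 count external refs, bits 15..29 count shared
// (internal + external) refs, mirroring the idea in ClockHandle::Refs.
constexpr uint32_t kOneInternalRef = 0x8000;  // +1 shared
constexpr uint32_t kOneExternalRef = 0x8001;  // +1 shared, +1 external
constexpr uint32_t kExternalMask = (1u << 15) - 1;

int main() {
  std::atomic<uint32_t> refs{0};

  // A probing thread takes an internal ref while inspecting the slot.
  refs += kOneInternalRef;
  assert((refs & kExternalMask) == 0);  // not visible as an external ref yet

  // The key matched, so hand the caller an external ref: one atomic add
  // upgrades internal -> external without a compare-and-swap.
  refs += kOneExternalRef - kOneInternalRef;
  assert((refs & kExternalMask) == 1);

  // Release the external ref when the caller is done with the handle.
  refs -= kOneExternalRef;
  assert(refs == 0);
  return 0;
}
```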
class ClockHandleTable {

@@ -252,32 +505,55 @@ class ClockHandleTable {
  // Returns a pointer to a visible element matching the key/hash, or
  // nullptr if not present.
  ClockHandle* Lookup(const Slice& key, uint32_t hash);

  // Inserts a copy of h into the hash table.
  // Returns a pointer to the inserted handle, or nullptr if no available
  // slot was found. If an existing visible element matching the
  // key/hash is already present in the hash table, the argument old
  // is set to point to it; otherwise, it's set to nullptr.
  // Returns an exclusive reference to h, and no references to old.
  ClockHandle* Insert(ClockHandle* h, ClockHandle** old);

  // Removes h from the hash table. The handle must already be off clock.
  void Remove(ClockHandle* h);

  // Extracts the element information from a handle (src), and assigns it
  // to a hash table slot (dst). Doesn't touch displacements and refs,
  // which are maintained by the hash table algorithm.
  void Assign(ClockHandle* dst, ClockHandle* src);

  template <typename T>
  void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
                           bool apply_if_will_be_deleted) {
    for (uint32_t i = index_begin; i < index_end; i++) {
      ClockHandle* h = &array_[i];
      if (h->TryExclusiveRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
          // Hand the exclusive ref over to func, which is now responsible
          // for releasing it.
          func(h);
        } else {
          h->ReleaseExclusiveRef();
        }
      }
    }
  }

  template <typename T>
  void ConstApplyToEntriesRange(T func, uint32_t index_begin,
                                uint32_t index_end,
                                bool apply_if_will_be_deleted) const {
    for (uint32_t i = index_begin; i < index_end; i++) {
      ClockHandle* h = &array_[i];
      if (h->TryExclusiveRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
          func(h);
        }
        h->ReleaseExclusiveRef();
      }
    }
  }
@@ -295,28 +571,38 @@ class ClockHandleTable {
 private:
  friend class ClockCacheShard;

  int FindElement(const Slice& key, uint32_t hash, uint32_t& probe);

  int FindAvailableSlot(const Slice& key, uint32_t& probe);

  int FindElementOrAvailableSlot(const Slice& key, uint32_t hash,
                                 uint32_t& probe);

  // Returns the index of the first slot probed (hashing with
  // the given key) with a handle e such that match(e) is true.
  // At every step, the function first tests whether match(e) holds.
  // If it's false, it evaluates abort(e) to decide whether the
  // search should be aborted, and in the affirmative returns -1.
  // For every handle e probed except the last one, the function runs
  // update(e). We say a probe to a handle e is aborting if match(e) is
  // false and abort(e) is true. The argument probe is one more than the
  // last non-aborting probe during the call. This is so that the
  // variable can be used to keep track of progress across consecutive
  // calls to FindSlot.
  inline int FindSlot(const Slice& key, std::function<bool(ClockHandle*)> match,
                      std::function<bool(ClockHandle*)> abort,
                      std::function<void(ClockHandle*)> update,
                      uint32_t& probe);

  // After a failed FindSlot call (i.e., with answer -1), this function
  // decrements all displacements, starting from the 0-th probe.
  void Rollback(const Slice& key, uint32_t probe);

  // Number of hash bits used for table index.
  // The size of the table is 1 << length_bits_.
  int length_bits_;

  // For faster computation of ModTableSize.
  const uint32_t length_bits_mask_;

  // Number of elements in the table.
@@ -345,10 +631,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Like Cache methods, but with an extra "hash" parameter.
  // Insert an item into the hash table and, if handle is null, make it
  // evictable by the clock algorithm. Older items are evicted as necessary.
  // If the cache is full and free_handle_on_fail is true, the item is deleted
  // and handle is set to nullptr.
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                Cache::DeleterFn deleter, Cache::Handle** handle,
                Cache::Priority priority) override;
@@ -393,13 +679,18 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
 private:
  friend class ClockCache;

  // Makes an element evictable by clock.
  void ClockOn(ClockHandle* h);

  // Makes an element non-evictable.
  void ClockOff(ClockHandle* h);

  // Requires an exclusive ref on h.
  void Evict(ClockHandle* h);

  // Free some space following strict clock policy until enough space
  // to hold (usage_ + charge) is freed or there are no evictable elements.
  void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);

  // Returns the charge of a single handle.
@ -436,9 +727,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
// Memory size for entries residing in the cache. // Memory size for entries residing in the cache.
size_t usage_; size_t usage_;
// Memory size for unpinned entries in the clock list.
size_t clock_usage_;
// mutex_ protects the following state. // mutex_ protects the following state.
// We don't count mutex_ as the cache's internal state so semantically we // We don't count mutex_ as the cache's internal state so semantically we
// don't mind mutex_ invoking the non-const actions. // don't mind mutex_ invoking the non-const actions.

cache/fast_lru_cache.cc

@@ -52,6 +52,7 @@ LRUHandle* LRUHandleTable::Insert(LRUHandle* h, LRUHandle** old) {
                         1 /*displacement*/);
  *old = nullptr;
  if (slot == -1) {
    // TODO(Guido) Don't we need to roll back displacements here?
    return nullptr;
  }
