Lock-free ClockCache (#10390)

Summary:
This PR makes ClockCache completely free of locks. As part of this PR we have also pushed the clock algorithm functionality out of ClockCacheShard into ClockHandleTable, so that ClockCacheShard acts more as an interface and less as an actual data structure.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10390

Test Plan:
- ``make -j24 check``
- ``make -j24 CRASH_TEST_EXT_ARGS="--duration=960 --cache_type=clock_cache --cache_size=1073741824 --block_size=16384" blackbox_crash_test_with_atomic_flush``

Reviewed By: pdillinger

Differential Revision: D38106945

Pulled By: guidotag

fbshipit-source-id: 6cbf6bd2397dc9f582809ccff5118a8a33ea6cb1
cache/clock_cache.cc

@ -17,7 +17,6 @@
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/lang.h"
#include "util/distributed_mutex.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/random.h"
@ -26,86 +25,91 @@ namespace ROCKSDB_NAMESPACE {
namespace clock_cache {
ClockHandleTable::ClockHandleTable(int hash_bits)
ClockHandleTable::ClockHandleTable(size_t capacity, int hash_bits)
: length_bits_(hash_bits),
length_bits_mask_((uint32_t{1} << length_bits_) - 1),
occupancy_(0),
occupancy_limit_(static_cast<uint32_t>((uint32_t{1} << length_bits_) *
kStrictLoadFactor)),
array_(new ClockHandle[size_t{1} << length_bits_]) {
capacity_(capacity),
array_(new ClockHandle[size_t{1} << length_bits_]),
clock_pointer_(0),
occupancy_(0),
usage_(0) {
assert(hash_bits <= 32);
}
ClockHandleTable::~ClockHandleTable() {
ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize(),
true);
// Assumes there are no references (of any type) to any slot in the table.
for (uint32_t i = 0; i < GetTableSize(); i++) {
ClockHandle* h = &array_[i];
if (h->IsElement()) {
h->FreeData();
}
}
}
ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) {
uint32_t probe = 0;
int slot = FindElement(key, hash, probe);
return (slot == -1) ? nullptr : &array_[slot];
ClockHandle* e = FindSlot(
key,
[&](ClockHandle* h) {
if (h->TryInternalRef()) {
if (h->IsElement() && h->Matches(key, hash)) {
return true;
}
h->ReleaseInternalRef();
}
return false;
},
[&](ClockHandle* h) { return h->displacements == 0; },
[&](ClockHandle* /*h*/) {}, probe);
ClockHandle* ClockHandleTable::Insert(ClockHandle* h, ClockHandle** old) {
uint32_t probe = 0;
int slot = FindElementOrAvailableSlot(h->key(), h->hash, probe);
*old = nullptr;
if (slot == -1) {
// The key is not already present, and there's no available slot to place
// the new copy.
return nullptr;
if (e != nullptr) {
// TODO(Guido) Comment from #10347: Here it looks like we have three atomic
// updates where it would be possible to combine into one CAS (more metadata
// under one atomic field) or maybe two atomic updates (one arithmetic, one
// bitwise). Something to think about optimizing.
e->InternalToExternalRef();
e->SetHit();
// The handle is now referenced, so we take it out of clock.
ClockOff(e);
}
if (!array_[slot].IsElement()) {
// The slot is empty or is a tombstone.
ClockHandle* new_entry = &array_[slot];
new_entry->InternalToExclusiveRef();
Assign(new_entry, h);
if (new_entry->displacements == 0) {
// The slot was empty.
return new_entry;
return e;
}
// It used to be a tombstone, so there may already be a copy of the
// key in the table.
slot = FindElement(h->key(), h->hash, probe);
if (slot == -1) {
// Nope, no existing copy of the key.
return new_entry;
}
ClockHandle* old_entry = &array_[slot];
old_entry->ReleaseInternalRef();
*old = old_entry;
return new_entry;
} else {
// There is an existing copy of the key.
ClockHandle* old_entry = &array_[slot];
old_entry->ReleaseInternalRef();
*old = old_entry;
// Find an available slot for the new element.
old_entry->displacements++;
slot = FindAvailableSlot(h->key(), probe);
if (slot == -1) {
// No available slots.
ClockHandle* ClockHandleTable::Insert(ClockHandle* h,
autovector<ClockHandle>* deleted,
bool take_reference) {
uint32_t probe = 0;
ClockHandle* e = FindAvailableSlot(h->key(), h->hash, probe, deleted);
if (e == nullptr) {
// No available slot to place the handle.
return nullptr;
}
ClockHandle* new_entry = &array_[slot];
new_entry->InternalToExclusiveRef();
Assign(new_entry, h);
return new_entry;
}
// The slot is empty or is a tombstone. And we have an exclusive ref.
Assign(e, h);
// TODO(Guido) The following RemoveAll can probably be run outside of
// the exclusive ref. I had a bad case in mind: multiple inserts could
// annihilate each other. Although I think this is impossible, I'm not sure
// my mental proof covers every case.
if (e->displacements != 0) {
// It used to be a tombstone, so there may already be copies of the
// key in the table.
RemoveAll(h->key(), h->hash, probe, deleted);
}
void ClockHandleTable::Remove(ClockHandle* h) {
assert(!h->IsInClock()); // Already off clock.
uint32_t probe = 0;
FindSlot(
h->key(), [&](ClockHandle* e) { return e == h; },
[&](ClockHandle* /*e*/) { return false; },
[&](ClockHandle* e) { e->displacements--; }, probe);
h->SetWillBeDeleted(false);
h->SetIsElement(false);
occupancy_--;
if (take_reference) {
// The user wants to take a reference.
e->ExclusiveToExternalRef();
} else {
// The user doesn't want to immediately take a reference, so we make
// it evictable.
ClockOn(e);
e->ReleaseExclusiveRef();
}
return e;
}
void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) {
@ -117,75 +121,152 @@ void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) {
dst->key_data = src->key_data;
dst->flags.store(0);
dst->SetIsElement(true);
dst->SetClockPriority(ClockHandle::ClockPriority::NONE);
dst->SetCachePriority(src->GetCachePriority());
usage_ += dst->total_charge;
occupancy_++;
}
int ClockHandleTable::FindElement(const Slice& key, uint32_t hash,
uint32_t& probe) {
return FindSlot(
key,
[&](ClockHandle* h) {
if (h->TryInternalRef()) {
if (h->Matches(key, hash)) {
bool ClockHandleTable::TryRemove(ClockHandle* h,
autovector<ClockHandle>* deleted) {
if (h->TryExclusiveRef()) {
if (h->WillBeDeleted()) {
Remove(h, deleted);
return true;
}
h->ReleaseInternalRef();
h->ReleaseExclusiveRef();
}
return false;
},
[&](ClockHandle* h) { return h->displacements == 0; },
[&](ClockHandle* /*h*/) {}, probe);
}
int ClockHandleTable::FindAvailableSlot(const Slice& key, uint32_t& probe) {
int slot = FindSlot(
bool ClockHandleTable::SpinTryRemove(ClockHandle* h,
autovector<ClockHandle>* deleted) {
if (h->SpinTryExclusiveRef()) {
if (h->WillBeDeleted()) {
Remove(h, deleted);
return true;
}
h->ReleaseExclusiveRef();
}
return false;
}
void ClockHandleTable::ClockOff(ClockHandle* h) {
h->SetClockPriority(ClockHandle::ClockPriority::NONE);
}
void ClockHandleTable::ClockOn(ClockHandle* h) {
assert(!h->IsInClock());
bool is_high_priority =
h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH;
h->SetClockPriority(static_cast<ClockHandle::ClockPriority>(
is_high_priority ? ClockHandle::ClockPriority::HIGH
: ClockHandle::ClockPriority::MEDIUM));
}
void ClockHandleTable::Remove(ClockHandle* h,
autovector<ClockHandle>* deleted) {
deleted->push_back(*h);
ClockOff(h);
uint32_t probe = 0;
FindSlot(
h->key(), [&](ClockHandle* e) { return e == h; },
[&](ClockHandle* /*e*/) { return false; },
[&](ClockHandle* e) { e->displacements--; }, probe);
h->SetWillBeDeleted(false);
h->SetIsElement(false);
}
void ClockHandleTable::RemoveAll(const Slice& key, uint32_t hash,
uint32_t& probe,
autovector<ClockHandle>* deleted) {
FindSlot(
key,
[&](ClockHandle* h) {
if (h->TryInternalRef()) {
if (!h->IsElement()) {
return true;
if (h->IsElement() && h->Matches(key, hash)) {
h->SetWillBeDeleted(true);
h->ReleaseInternalRef();
if (TryRemove(h, deleted)) {
h->ReleaseExclusiveRef();
}
return false;
}
h->ReleaseInternalRef();
}
return false;
},
[&](ClockHandle* /*h*/) { return false; },
[&](ClockHandle* h) { h->displacements++; }, probe);
if (slot == -1) {
Rollback(key, probe);
[&](ClockHandle* h) { return h->displacements == 0; },
[&](ClockHandle* /*h*/) {}, probe);
}
return slot;
void ClockHandleTable::Free(autovector<ClockHandle>* deleted) {
if (deleted->size() == 0) {
// Avoid unnecessarily reading usage_ and occupancy_.
return;
}
int ClockHandleTable::FindElementOrAvailableSlot(const Slice& key,
uint32_t hash,
uint32_t& probe) {
int slot = FindSlot(
size_t deleted_charge = 0;
for (auto& h : *deleted) {
deleted_charge += h.total_charge;
h.FreeData();
}
assert(usage_ >= deleted_charge);
usage_ -= deleted_charge;
occupancy_ -= static_cast<uint32_t>(deleted->size());
}
ClockHandle* ClockHandleTable::FindAvailableSlot(
const Slice& key, uint32_t hash, uint32_t& probe,
autovector<ClockHandle>* deleted) {
ClockHandle* e = FindSlot(
key,
[&](ClockHandle* h) {
// To read the handle, first acquire a shared ref.
if (h->TryInternalRef()) {
if (!h->IsElement() || h->Matches(key, hash)) {
if (h->IsElement()) {
// The slot is not available.
// TODO(Guido) Is it worth testing h->WillBeDeleted()?
if (h->WillBeDeleted() || h->Matches(key, hash)) {
// The slot can be freed up, or the key we're inserting is already
// in the table, so we try to delete it. When the attempt is
// successful, the slot becomes available, so we stop probing.
// Notice that in that case TryRemove returns an exclusive ref.
h->SetWillBeDeleted(true);
h->ReleaseInternalRef();
if (TryRemove(h, deleted)) {
return true;
}
return false;
}
h->ReleaseInternalRef();
return false;
}
// Available slot.
h->ReleaseInternalRef();
// Try to acquire an exclusive ref. If we fail, continue probing.
if (h->SpinTryExclusiveRef()) {
// Check that the slot is still available.
if (!h->IsElement()) {
return true;
}
h->ReleaseExclusiveRef();
}
}
return false;
},
[&](ClockHandle* /*h*/) { return false; },
[&](ClockHandle* h) { h->displacements++; }, probe);
if (slot == -1) {
if (e == nullptr) {
Rollback(key, probe);
}
return slot;
return e;
}
int ClockHandleTable::FindSlot(const Slice& key,
std::function<bool(ClockHandle*)> match,
ClockHandle* ClockHandleTable::FindSlot(
const Slice& key, std::function<bool(ClockHandle*)> match,
std::function<bool(ClockHandle*)> abort,
std::function<void(ClockHandle*)> update,
uint32_t& probe) {
std::function<void(ClockHandle*)> update, uint32_t& probe) {
// We use double-hashing probing. Every probe in the sequence is a
// pseudorandom integer, computed as a linear function of two random hashes,
// which we call base and increment. Specifically, the i-th probe is base + i
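For illustration, here is a minimal self-contained sketch of the double-hashing probe sequence described above; the hash function and the way base/increment are derived are placeholders for this example, not the seeded hashes ClockHandleTable actually uses.

```cpp
#include <cstdint>
#include <functional>
#include <string>

// Probe i over a power-of-two table of size (1 << length_bits):
// (base + i * increment) mod table size. Forcing the increment to be odd
// keeps it coprime with the table size, so the sequence eventually visits
// every slot before repeating.
uint32_t NthProbe(const std::string& key, uint32_t i, int length_bits) {
  const uint32_t mask = (uint32_t{1} << length_bits) - 1;
  const uint32_t h = static_cast<uint32_t>(std::hash<std::string>{}(key));
  const uint32_t base = h & mask;
  const uint32_t increment = ((h >> 16) << 1) | 1;  // Always odd.
  return (base + i * increment) & mask;
}
```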
@ -201,14 +282,14 @@ int ClockHandleTable::FindSlot(const Slice& key,
ClockHandle* h = &array_[current];
if (current == base && probe > 0) {
// We looped back.
return -1;
return nullptr;
}
if (match(h)) {
probe++;
return current;
return h;
}
if (abort(h)) {
return -1;
return nullptr;
}
probe++;
update(h);
@ -226,35 +307,73 @@ void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) {
}
}
void ClockHandleTable::ClockRun(size_t charge) {
// TODO(Guido) When an element is in the probe sequence of a
// hot element, it will be hard to get an exclusive ref.
// Do we need a mechanism to prevent an element from sitting
// for a long time in cache waiting to be evicted?
assert(charge <= capacity_);
autovector<ClockHandle> deleted;
uint32_t max_iterations =
1 + static_cast<uint32_t>(GetTableSize() * kLoadFactor);
size_t usage_local = usage_;
while (usage_local + charge > capacity_ && max_iterations--) {
uint32_t steps = 1 + static_cast<uint32_t>(1 / kLoadFactor);
uint32_t clock_pointer_local = (clock_pointer_ += steps) - steps;
for (uint32_t i = 0; i < steps; i++) {
ClockHandle* h = &array_[ModTableSize(clock_pointer_local + i)];
if (h->TryExclusiveRef()) {
if (h->WillBeDeleted()) {
Remove(h, &deleted);
usage_local -= h->total_charge;
} else {
if (!h->IsInClock() && h->IsElement()) {
// We adjust the clock priority to make the element evictable again.
// Why? Elements that are not in clock are either currently
// externally referenced or used to be. Because we are holding an
// exclusive ref, we know we are in the latter case. This can only
// happen when the last external reference to an element was
// released, and the element was not immediately removed.
ClockOn(h);
}
ClockHandle::ClockPriority priority = h->GetClockPriority();
if (priority == ClockHandle::ClockPriority::LOW) {
Remove(h, &deleted);
usage_local -= h->total_charge;
} else if (priority > ClockHandle::ClockPriority::LOW) {
h->DecreaseClockPriority();
}
}
h->ReleaseExclusiveRef();
}
}
}
Free(&deleted);
}
ClockCacheShard::ClockCacheShard(
size_t capacity, size_t estimated_value_size, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: capacity_(capacity),
strict_capacity_limit_(strict_capacity_limit),
clock_pointer_(0),
table_(
CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)),
usage_(0) {
: strict_capacity_limit_(strict_capacity_limit),
table_(capacity, CalcHashBits(capacity, estimated_value_size,
metadata_charge_policy)) {
set_metadata_charge_policy(metadata_charge_policy);
}
void ClockCacheShard::EraseUnRefEntries() {
autovector<ClockHandle> last_reference_list;
{
DMutexLock l(mutex_);
autovector<ClockHandle> deleted;
table_.ApplyToEntriesRange(
[this, &last_reference_list](ClockHandle* h) {
[this, &deleted](ClockHandle* h) {
// Externally unreferenced element.
last_reference_list.push_back(*h);
Evict(h);
table_.Remove(h, &deleted);
},
0, table_.GetTableSize(), true);
}
// Free the entry outside of the mutex for performance reasons.
for (auto& h : last_reference_list) {
h.FreeData();
}
table_.Free(&deleted);
}
void ClockCacheShard::ApplyToSomeEntries(
@ -264,7 +383,6 @@ void ClockCacheShard::ApplyToSomeEntries(
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
// hash bits for table indexes.
DMutexLock l(mutex_);
uint32_t length_bits = table_.GetLengthBits();
uint32_t length = table_.GetTableSize();
@ -276,7 +394,7 @@ void ClockCacheShard::ApplyToSomeEntries(
uint32_t index_begin = *state >> (32 - length_bits);
uint32_t index_end = index_begin + average_entries_per_lock;
if (index_end >= length) {
// Going to end
// Going to end.
index_end = length;
*state = UINT32_MAX;
} else {
@ -292,60 +410,6 @@ void ClockCacheShard::ApplyToSomeEntries(
index_begin, index_end, false);
}
void ClockCacheShard::ClockOff(ClockHandle* h) {
h->SetClockPriority(ClockHandle::ClockPriority::NONE);
}
void ClockCacheShard::ClockOn(ClockHandle* h) {
assert(!h->IsInClock());
bool is_high_priority =
h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH;
h->SetClockPriority(static_cast<ClockHandle::ClockPriority>(
is_high_priority * ClockHandle::ClockPriority::HIGH +
(1 - is_high_priority) * ClockHandle::ClockPriority::MEDIUM));
}
void ClockCacheShard::Evict(ClockHandle* h) {
ClockOff(h);
table_.Remove(h);
assert(usage_ >= h->total_charge);
usage_ -= h->total_charge;
}
void ClockCacheShard::EvictFromClock(size_t charge,
autovector<ClockHandle>* deleted) {
// TODO(Guido) When an element is in the probe sequence of a
// hot element, it will be hard to get an exclusive ref.
// We may need a mechanism to avoid that an element sits forever
// in cache waiting to be evicted.
assert(charge <= capacity_);
uint32_t max_iterations = table_.GetTableSize();
while (usage_ + charge > capacity_ && max_iterations--) {
ClockHandle* h = &table_.array_[clock_pointer_];
clock_pointer_ = table_.ModTableSize(clock_pointer_ + 1);
if (h->TryExclusiveRef()) {
if (!h->IsInClock() && h->IsElement()) {
// We adjust the clock priority to make the element evictable again.
// Why? Elements that are not in clock are either currently
// externally referenced or used to be---because we are holding an
// exclusive ref, we know we are in the latter case. This can only
// happen when the last external reference to an element was released,
// and the element was not immediately removed.
ClockOn(h);
}
if (h->GetClockPriority() == ClockHandle::ClockPriority::LOW) {
deleted->push_back(*h);
Evict(h);
} else if (h->GetClockPriority() > ClockHandle::ClockPriority::LOW) {
h->DecreaseClockPriority();
}
h->ReleaseExclusiveRef();
}
}
}
size_t ClockCacheShard::CalcEstimatedHandleCharge(
size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
@ -366,25 +430,12 @@ int ClockCacheShard::CalcHashBits(
return FloorLog2((num_entries << 1) - 1);
}
void ClockCacheShard::SetCapacity(size_t capacity) {
assert(false); // Not supported. TODO(Guido) Support it?
autovector<ClockHandle> last_reference_list;
{
DMutexLock l(mutex_);
capacity_ = capacity;
EvictFromClock(0, &last_reference_list);
void ClockCacheShard::SetCapacity(size_t /*capacity*/) {
assert(false); // Not supported.
}
// Free the entry outside of the mutex for performance reasons.
for (auto& h : last_reference_list) {
h.FreeData();
}
}
void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
assert(false); // Not supported. TODO(Guido) Support it?
DMutexLock l(mutex_);
strict_capacity_limit_ = strict_capacity_limit;
void ClockCacheShard::SetStrictCapacityLimit(bool /*strict_capacity_limit*/) {
assert(false); // Not supported.
}
Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
@ -407,23 +458,28 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
}
Status s = Status::OK();
autovector<ClockHandle> last_reference_list;
{
DMutexLock l(mutex_);
assert(table_.GetOccupancy() <= table_.GetOccupancyLimit());
// Free the space following strict clock policy until enough space
// is freed or there are no evictable elements.
EvictFromClock(tmp.total_charge, &last_reference_list);
if ((usage_ + tmp.total_charge > capacity_ &&
// Free space with the clock policy until enough space is freed or there are
// no evictable elements.
table_.ClockRun(tmp.total_charge);
// occupancy_ and usage_ are contended members across concurrent updates
// on the same shard, so we use a single copy to reduce cache synchronization.
uint32_t occupancy_local = table_.GetOccupancy();
size_t usage_local = table_.GetUsage();
assert(occupancy_local <= table_.GetOccupancyLimit());
autovector<ClockHandle> deleted;
if ((usage_local + tmp.total_charge > table_.GetCapacity() &&
(strict_capacity_limit_ || handle == nullptr)) ||
table_.GetOccupancy() == table_.GetOccupancyLimit()) {
occupancy_local > table_.GetOccupancyLimit()) {
if (handle == nullptr) {
// Don't insert the entry but still return ok, as if the entry were
// inserted into the cache and evicted immediately.
last_reference_list.push_back(tmp);
deleted.push_back(tmp);
} else {
if (table_.GetOccupancy() == table_.GetOccupancyLimit()) {
if (occupancy_local > table_.GetOccupancyLimit()) {
// TODO: Consider using a distinct status for this case, but usually
// it will be handled the same way as reaching charge capacity limit
s = Status::MemoryLimit(
@ -437,57 +493,25 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
} else {
// Insert into the cache. Note that the cache might get larger than its
// capacity if not enough space was freed up.
ClockHandle* old;
ClockHandle* h = table_.Insert(&tmp, &old);
assert(h != nullptr); // We're below occupancy, so this insertion should
// never fail.
usage_ += h->total_charge;
if (old != nullptr) {
s = Status::OkOverwritten();
assert(!old->WillBeDeleted());
old->SetWillBeDeleted(true);
// Try to evict the old copy of the element.
if (old->TryExclusiveRef()) {
last_reference_list.push_back(*old);
Evict(old);
old->ReleaseExclusiveRef();
}
}
if (handle == nullptr) {
// If the user didn't provide a handle, no reference is taken,
// so we make the element evictable.
ClockOn(h);
h->ReleaseExclusiveRef();
} else {
// The caller already holds a ref.
h->ExclusiveToExternalRef();
ClockHandle* h = table_.Insert(&tmp, &deleted, handle != nullptr);
assert(h != nullptr); // The occupancy is way below the table size, so this
// insertion should never fail.
if (handle != nullptr) {
*handle = reinterpret_cast<Cache::Handle*>(h);
}
if (deleted.size() > 0) {
s = Status::OkOverwritten();
}
}
// Free the entry outside of the mutex for performance reasons.
for (auto& h : last_reference_list) {
h.FreeData();
}
table_.Free(&deleted);
return s;
}
Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
ClockHandle* h = nullptr;
h = table_.Lookup(key, hash);
if (h != nullptr) {
// TODO(Guido) Comment from #10347: Here it looks like we have three atomic
// updates where it would be possible to combine into one CAS (more metadata
// under one atomic field) or maybe two atomic updates (one arithmetic, one
// bitwise). Something to think about optimizing.
h->InternalToExternalRef();
h->SetHit();
// The handle is now referenced, so we take it out of clock.
ClockOff(h);
}
return reinterpret_cast<Cache::Handle*>(h);
return reinterpret_cast<Cache::Handle*>(table_.Lookup(key, hash));
}
bool ClockCacheShard::Ref(Cache::Handle* h) {
@ -498,97 +522,50 @@ bool ClockCacheShard::Ref(Cache::Handle* h) {
bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
// In contrast with LRUCache's Release, this function won't delete the handle
// when the reference is the last one and the cache is above capacity. Space
// when the cache is above capacity and the reference is the last one. Space
// is only freed up by EvictFromClock (called by Insert when space is needed)
// and Erase.
// and Erase. We do this to avoid an extra atomic read of the variable usage_.
if (handle == nullptr) {
return false;
}
ClockHandle* h = reinterpret_cast<ClockHandle*>(handle);
uint32_t hash = h->hash;
uint32_t refs = h->ReleaseExternalRef();
bool last_reference = !(refs & ClockHandle::EXTERNAL_REFS);
uint32_t refs = h->refs;
bool last_reference = ((refs & ClockHandle::EXTERNAL_REFS) == 1);
bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED;
if (last_reference && (will_be_deleted || erase_if_last_ref)) {
// At this point we want to evict the element, so we need to take
// a lock and an exclusive reference. But there's a problem:
// as soon as we released the last reference, an Insert or Erase could've
// replaced this element, and by the time we take the lock and ref
// we could potentially be referencing a different element.
// Thus, before evicting the (potentially different) element, we need to
// re-check that it's unreferenced and marked as WILL_BE_DELETED, so the
// eviction is safe. Additionally, we check that the hash doesn't change,
// which will detect, most of the time, whether the element is a different
// one. The bottomline is that we only guarantee that the input handle will
// be deleted, and occasionally also another handle, but in any case all
// deleted handles are safe to delete.
// TODO(Guido) With lock-free inserts and deletes we may be able to
// "atomically" transition to an exclusive ref, without creating a deadlock.
ClockHandle copy;
{
DMutexLock l(mutex_);
if (h->TrySpinExclusiveRef()) {
will_be_deleted = h->refs & ClockHandle::WILL_BE_DELETED;
// Check that it's still safe to delete.
if (h->IsElement() && (will_be_deleted || erase_if_last_ref) &&
h->hash == hash) {
copy = *h;
Evict(h);
}
autovector<ClockHandle> deleted;
h->SetWillBeDeleted(true);
h->ReleaseExternalRef();
if (table_.SpinTryRemove(h, &deleted)) {
h->ReleaseExclusiveRef();
} else {
// An external ref was detected.
return false;
}
}
// Free the entry outside of the mutex for performance reasons.
copy.FreeData();
table_.Free(&deleted);
return true;
}
} else {
h->ReleaseExternalRef();
}
return false;
}
void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
ClockHandle copy;
bool last_reference = false;
{
DMutexLock l(mutex_);
ClockHandle* h = table_.Lookup(key, hash);
if (h != nullptr) {
h->SetWillBeDeleted(true);
h->ReleaseInternalRef();
if (h->TryExclusiveRef()) {
copy = *h;
Evict(h);
last_reference = true;
h->ReleaseExclusiveRef();
}
}
}
// Free the entry outside of the mutex for performance reasons.
if (last_reference) {
copy.FreeData();
}
autovector<ClockHandle> deleted;
uint32_t probe = 0;
table_.RemoveAll(key, hash, probe, &deleted);
table_.Free(&deleted);
}
size_t ClockCacheShard::GetUsage() const {
DMutexLock l(mutex_);
return usage_;
}
size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); }
size_t ClockCacheShard::GetPinnedUsage() const {
// Computes the pinned usage scanning the whole hash table. This
// is slow, but avoid keeping an exact counter on the clock usage,
// Computes the pinned usage by scanning the whole hash table. This
// is slow, but avoids keeping an exact counter on the clock usage,
// i.e., the number of not externally referenced elements.
// Why avoid this? Because Lookup removes elements from the clock
// Why avoid this counter? Because Lookup removes elements from the clock
// list, so it would need to update the pinned usage every time,
// which creates additional synchronization costs.
DMutexLock l(mutex_);
size_t clock_usage = 0;
table_.ConstApplyToEntriesRange(
@ -602,17 +579,13 @@ size_t ClockCacheShard::GetPinnedUsage() const {
return clock_usage;
}
std::string ClockCacheShard::GetPrintableOptions() const {
return std::string{};
}
ClockCache::ClockCache(size_t capacity, size_t estimated_value_size,
int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit),
num_shards_(1 << num_shard_bits) {
assert(estimated_value_size > 0 ||
metadata_charge_policy != kDontChargeCacheMetadata);
num_shards_ = 1 << num_shard_bits;
shards_ = reinterpret_cast<ClockCacheShard*>(
port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_));
size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;

cache/clock_cache.h

@ -23,102 +23,137 @@
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace clock_cache {
// Block cache implementation using a lock-free open-address hash table
// and clock eviction.
// An experimental alternative to LRUCache, using a lock-free, open-addressed
// hash table and clock eviction.
///////////////////////////////////////////////////////////////////////////////
// Part 1: Handles
// ----------------------------------------------------------------------------
// 1. INTRODUCTION
//
// Every slot in the hash table is a ClockHandle. A handle can be in a few
// different states, that stem from the fact that handles can be externally
// referenced and, thus, can't always be immediately evicted when a delete
// operation is executed or when they are replaced by a new version (via an
// insert of the same key). Concretely, the state of a handle is defined by the
// following two properties:
// (R) Externally referenced: A handle can be referenced externally, or not.
// Importantly, a handle can be evicted if and only if it's not
// referenced. In particular, when a handle becomes referenced, it's
// temporarily taken out of clock until all references to it are released.
// (M) Marked for deletion (or invisible): A handle is marked for deletion
// when an operation attempts to delete it, but the handle is externally
// referenced, so it can't be immediately deleted. When this mark is placed,
// lookups will no longer be able to find it. Consequently, no more external
// references will be taken to the handle. When a handle is marked for
// deletion, we also say it's invisible.
// These properties induce 4 different states, with transitions defined as
// follows:
// - Not M --> M: When a handle is deleted or replaced by a new version, but
// not immediately evicted.
// - M --> not M: This cannot happen. Once a handle is marked for deletion,
// there is no going back.
// - R --> not R: When all references to a handle are released.
// - Not R --> R: When an unreferenced handle becomes referenced. This can only
// happen if the handle is visible, since references to a handle can only be
// created when it's visible.
// In RocksDB, a Cache is a concurrent unordered dictionary that supports
// external references (a.k.a. user references). A ClockCache is a type of Cache
// that uses the clock algorithm as its eviction policy. Internally, a
// ClockCache is an open-addressed hash table that stores all KV pairs in a
// large array. Every slot in the hash table is a ClockHandle, which holds a KV
// pair plus some additional metadata that controls the different aspects of the
// cache: external references, the hashing mechanism, concurrent access and the
// clock algorithm.
//
///////////////////////////////////////////////////////////////////////////////
// Part 2: Hash table structure
//
// Internally, the cache uses an open-addressed hash table to index the handles.
// We use tombstone counters to keep track of displacements. Probes are
// generated with double-hashing (but the code can be easily modified to use
// other probing schemes, like linear hashing). Because of the tombstones and
// the two possible visibility states of a handle, the table slots (we use the
// word "slot" to refer to handles that are not necessary valid key-value
// elements) can be in 4 different states:
// 1. Visible element: The slot contains an element in not M state.
// 2. To-be-deleted element: The slot contains an element in M state.
// 3. Tombstone: The slot doesn't contain an element, but there is some other
// 2. EXTERNAL REFERENCES
//
// An externally referenced handle can't be deleted (either evicted by the clock
// algorithm, or explicitly deleted) or replaced by a new version (via an insert
// of the same key) until all external references to it have been released by
// the users. ClockHandles have two members to support external references:
// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0,
// the handle is externally referenced. Updates that intend to modify the
// handle will refrain from doing so. Eventually, when all references are
// released, we have EXTERNAL_REFS == 0, and updates can operate normally on
// the handle.
// - WILL_BE_DELETED flag: A handle is marked for deletion when an operation
// decides the handle should be deleted. This happens either when the last
// reference to a handle is released (and the release operation is instructed
// to delete on last reference) or when a delete operation is called on
// the item. This flag is needed because an externally referenced handle
// can't be immediately deleted. In these cases, the flag will be later read
// and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is
// used not only to defer deletions, but also as a barrier for external
// references: once WILL_BE_DELETED is set, lookups (which are the means to
// acquire new external references) will ignore the handle. For this reason,
// when WILL_BE_DELETED is set, we say the handle is invisible (and
// otherwise, that it's visible).
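As a rough illustration of how the reference count and the deletion mark cooperate, here is a self-contained sketch of a packed atomic word with an optimistic try-ref. The bit layout is assumed for this example and is not ClockHandle's exact layout (which also packs internal and exclusive refs into the same word).

```cpp
#include <atomic>
#include <cstdint>

// Sketch only: a refcount in the low bits plus a "will be deleted" flag in
// the top bit, packed into one atomic word so a reference attempt can bump
// the count and observe the flag with a single atomic add.
struct PackedRefs {
  static constexpr uint32_t kWillBeDeleted = uint32_t{1} << 31;
  std::atomic<uint32_t> word{0};

  bool TryRef() {
    // Optimistically take the reference, then back out if the handle has
    // been marked for deletion (i.e., it is invisible to lookups).
    uint32_t after = word.fetch_add(1) + 1;
    if (after & kWillBeDeleted) {
      word.fetch_sub(1);
      return false;
    }
    return true;
  }
  void Unref() { word.fetch_sub(1); }
  void MarkWillBeDeleted() { word.fetch_or(kWillBeDeleted); }
  bool Unreferenced() const { return (word.load() & ~kWillBeDeleted) == 0; }
};
```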
//
//
// 3. HASHING AND COLLISION RESOLUTION
//
// ClockCache uses an open-addressed hash table to store the handles.
// We use a variant of tombstones to manage collisions: every slot keeps a
// count of how many KV pairs that are currently in the cache have probed the
// slot in an attempt to insert. Probes are generated with double-hashing
// (although the code can be easily modified to use other probing schemes, like
// linear probing).
//
// A slot in the hash table can be in a few different states:
// - Element: The slot contains an element. This is indicated with the
// IS_ELEMENT flag. Element can be sub-classified depending on the
// value of WILL_BE_DELETED:
// * Visible element.
// * Invisible element.
// - Tombstone: The slot doesn't contain an element, but there is some other
// element that probed this slot during its insertion.
// 4. Empty: The slot is unused.
// When a ghost is removed from the table, it can either transition to being a
// tombstone or an empty slot, depending on the number of displacements of the
// slot. In any case, the slot becomes available. When a handle is inserted
// into that slot, it becomes a visible element again.
// - Empty: The slot is unused---it's neither an element nor a tombstone.
//
///////////////////////////////////////////////////////////////////////////////
// Part 3: The clock algorithm
// A slot cycles through the following sequence of states:
// empty or tombstone --> visible element --> invisible element -->
// empty or tombstone. Initially a slot is available---it's either
// empty or a tombstone. As soon as a KV pair is written into the slot, it
// becomes a visible element. At some point, the handle will be deleted
// by an explicit delete operation, the eviction algorithm, or an overwriting
// insert. In any case, the handle is marked for deletion. When an
// attempt to delete the element finally succeeds, the slot is freed up
// and becomes available again.
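To make the displacement-counter ("tombstone count") idea concrete, here is a toy, single-threaded sketch. It assumes a power-of-two table that never fills up and that removed keys are present, and it uses a simple stand-in probe function rather than the table's real hashing.

```cpp
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Each slot counts how many live keys probed past it on insertion. A lookup
// may stop early only at a non-element slot with displacements == 0, because
// no key could have been displaced beyond such a slot.
struct Slot {
  bool is_element = false;
  std::string key;
  uint32_t displacements = 0;
};

// Stand-in probe sequence (see the double-hashing sketch above).
uint32_t Probe(const std::string& key, uint32_t i, size_t size) {
  uint32_t h = static_cast<uint32_t>(std::hash<std::string>{}(key));
  uint32_t increment = ((h >> 16) << 1) | 1;  // Odd increment.
  return (h + i * increment) & (static_cast<uint32_t>(size) - 1);
}

void InsertKey(std::vector<Slot>& table, const std::string& key) {
  for (uint32_t i = 0;; i++) {
    Slot& s = table[Probe(key, i, table.size())];
    if (!s.is_element) {  // Empty slot or tombstone: take it.
      s.is_element = true;
      s.key = key;
      return;
    }
    s.displacements++;  // We probed past this occupied slot.
  }
}

void RemoveKey(std::vector<Slot>& table, const std::string& key) {
  for (uint32_t i = 0;; i++) {
    Slot& s = table[Probe(key, i, table.size())];
    if (s.is_element && s.key == key) {
      s.is_element = false;  // Tombstone if displacements > 0, else empty.
      return;
    }
    s.displacements--;  // Undo the count this key contributed on insert.
  }
}

Slot* FindKey(std::vector<Slot>& table, const std::string& key) {
  for (uint32_t i = 0; i < table.size(); i++) {
    Slot& s = table[Probe(key, i, table.size())];
    if (s.is_element && s.key == key) return &s;
    if (!s.is_element && s.displacements == 0) return nullptr;  // Truly empty.
  }
  return nullptr;  // Swept the whole table without finding the key.
}
```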
//
// We maintain a circular buffer with the handles available for eviction,
// which the clock algorithm traverses (using a "clock pointer") to pick the
// next victim. We use the hash table array as the circular buffer, and mark
// the handles that are evictable. For this we use different clock flags, namely
// NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and HIGH
// represent how close an element is from being evictable, LOW being immediately
// evictable. NONE means the slot is not evictable. This is due to one of the
// following reasons:
// (i) the slot doesn't contain an element, or
// (ii) the slot contains an element that is in R state, or
// (iii) the slot contains an element that was in R state but it's
// not any more, and the clock pointer has not swept through the
// slot since the element stopped being referenced.
//
// The priority NONE is really only important for case (iii), as in the other
// two cases there are other metadata fields that already capture the state.
// When an element stops being referenced (and is not deleted), the clock
// algorithm must acknowledge this, and assign a non-NONE priority to make
// the element evictable again.
// 4. CONCURRENCY
//
///////////////////////////////////////////////////////////////////////////////
// Part 4: Synchronization
// ClockCache is lock-free. At a high level, we synchronize the operations
// using a read-prioritized, non-blocking variant of RW locks on every slot of
// the hash table. To do this we generalize the concept of reference:
// - Internal reference: Taken by a thread that is attempting to read a slot
// or do a very precise type of update.
// - Exclusive reference: Taken by a thread that is attempting to write
// a slot extensively.
//
// We provide the following synchronization guarantees:
// - Lookup is lock-free.
// - Release is lock-free, unless (i) no references to the element are left,
// and (ii) it was marked for deletion or the user wishes to delete if
// releasing the last reference.
// - Insert and Erase still use a per-shard lock.
// We defer the precise definitions to the comments in the code below.
// A crucial feature of our references is that attempting to take one never
// blocks the thread. Another important feature is that readers are
// prioritized, as they use extremely fast synchronization primitives---they
// use atomic arithmetic/bit operations, but no compare-and-swaps (which are
// much slower).
//
// Our hash table is lock-free, in the sense that system-wide progress is
// guaranteed, i.e., some thread is always able to make progress.
// Internal references are used by threads to read slots during a probing
// sequence, making them the most common references (probing is performed
// in almost every operation, not just lookups). During a lookup, once
// the target element is found, and just before the handle is handed over
// to the user, an internal reference is converted into an external reference.
// During an update operation, once the target slot is found, an internal
// reference is converted into an exclusive reference. Interestingly, we
// can't atomically upgrade from internal to exclusive, or we may run into a
// deadlock. Releasing the internal reference and then taking an exclusive
// reference avoids the deadlock, but then the handle may change in between.
// One of the key observations we use in our implementation is that we can
// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED.
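The release-and-revalidate pattern described above can be illustrated with a hypothetical helper composed from the handle operations introduced in this PR; it is not part of the PR itself, it only mirrors what RemoveAll and TryRemove do.

```cpp
// Hypothetical usage sketch, assuming cache/clock_cache.h from this PR is
// included and we are inside namespace ROCKSDB_NAMESPACE::clock_cache.
// The caller holds an internal ref to h and has decided it should be deleted.
bool DeleteIfStillSameKey(ClockHandle* h, const Slice& key, uint32_t hash) {
  h->SetWillBeDeleted(true);   // Barrier: Lookup stops handing out refs.
  h->ReleaseInternalRef();     // Can't upgrade atomically, so release first.
  if (!h->TryExclusiveRef()) {
    return false;              // Someone else still holds a ref; give up.
  }
  // Between the release and the exclusive ref the slot may have been reused,
  // so re-validate its identity before acting on it.
  bool ok = h->IsElement() && h->WillBeDeleted() && h->Matches(key, hash);
  if (ok) {
    // ... remove the handle from the table and free its data ...
  }
  h->ReleaseExclusiveRef();
  return ok;
}
```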
//
///////////////////////////////////////////////////////////////////////////////
// Distinguishing internal from external references is useful for two reasons:
// - Internal references are short lived, but external references are typically
// not. This is helpful when acquiring an exclusive ref: if there are any
// external references to the item, it's probably not worth waiting until
// they go away.
// - We can precisely determine when there are no more external references to a
// handle, and proceed to mark it for deletion. This is useful when users
// release external references.
//
//
// 5. CLOCK ALGORITHM
//
// The clock algorithm circularly sweeps through the hash table to find the next
// victim. Recall that handles that are referenced are not evictable; the clock
// algorithm never picks those. We use different clock priorities: NONE, LOW,
// MEDIUM and HIGH. Priorities LOW, MEDIUM and HIGH represent how close an
// element is to being evicted, LOW being the closest. NONE means
// the slot is not evictable. NONE priority is used in one of the following
// cases:
// (a) the slot doesn't contain an element, or
// (b) the slot contains an externally referenced element, or
// (c) the slot contains an element that used to be externally referenced,
// and the clock pointer has not swept through the slot since the element
// stopped being externally referenced.
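A simplified, single-threaded sketch of this sweep is shown below; the real ClockRun additionally deals with references, charges, WILL_BE_DELETED handles and batched frees, and the names here are illustrative.

```cpp
#include <cstdint>
#include <vector>

enum class ClockPriority : uint8_t { NONE, LOW, MEDIUM, HIGH };

struct ToySlot {
  bool is_element = false;
  ClockPriority priority = ClockPriority::NONE;  // NONE => not in clock.
};

// Advances the clock pointer until one LOW-priority element is evicted,
// aging MEDIUM/HIGH elements and re-entering NONE elements along the way.
// Returns the evicted slot's index, or -1 if a full lap found no victim.
int ClockEvictOne(std::vector<ToySlot>& table, size_t& clock_pointer) {
  for (size_t step = 0; step < table.size(); step++) {
    size_t index = clock_pointer;
    clock_pointer = (clock_pointer + 1) % table.size();
    ToySlot& s = table[index];
    if (!s.is_element) {
      continue;  // Empty slot or tombstone.
    }
    if (s.priority == ClockPriority::NONE) {
      // Case (c) above: the element is no longer externally referenced, so
      // the sweep makes it evictable again instead of skipping it forever.
      s.priority = ClockPriority::MEDIUM;
      continue;
    }
    if (s.priority == ClockPriority::LOW) {
      s.is_element = false;  // Evict.
      s.priority = ClockPriority::NONE;
      return static_cast<int>(index);
    }
    // Age the element one step: HIGH -> MEDIUM -> LOW.
    s.priority =
        static_cast<ClockPriority>(static_cast<uint8_t>(s.priority) - 1);
  }
  return -1;
}
```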
// ----------------------------------------------------------------------------
// The load factor p is a real number in (0, 1) such that at all
// times at most a fraction p of all slots, without counting tombstones,
@ -138,15 +173,18 @@ constexpr double kLoadFactor = 0.35;
// The user can exceed kLoadFactor if the sizes of the inserted values don't
// match estimated_value_size, or if strict_capacity_limit == false. To
// avoid performance to plunge, we set a strict upper bound on the load factor.
// avoid a performance drop, we set a strict upper bound on the load factor.
constexpr double kStrictLoadFactor = 0.7;
// Maximum number of spins when trying to acquire a ref.
// TODO(Guido) This value was set arbitrarily. Is it appropriate?
// What's the best way to bound the spinning?
constexpr uint32_t kSpinsPerTry = 100000;
// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
// An experimental (under development!) alternative to LRUCache.
struct ClockHandle {
void* value;
Cache::DeleterFn deleter;
@ -154,49 +192,6 @@ struct ClockHandle {
size_t total_charge;
std::array<char, kCacheKeySize> key_data;
static constexpr uint8_t kExternalRefsOffset = 0;
static constexpr uint8_t kSharedRefsOffset = 15;
static constexpr uint8_t kExclusiveRefOffset = 30;
static constexpr uint8_t kWillBeDeletedOffset = 31;
enum Refs : uint32_t {
// Number of external references to the slot.
EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
<< kExternalRefsOffset, // Bits 0, ..., 14
// Number of internal references plus external references to the slot.
SHARED_REFS = ((uint32_t{1} << 15) - 1)
<< kSharedRefsOffset, // Bits 15, ..., 29
// Whether a thread has an exclusive reference to the slot.
EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30
// Whether the handle will be deleted soon. When this bit is set, new
// internal
// or external references to this handle stop being accepted.
// There is an exception: external references can be created from
// existing external references, or converting from existing internal
// references.
WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31
// Shared references (i.e., external and internal references) and exclusive
// references are our custom implementation of RW locks---external and
// internal references are read locks, and exclusive references are write
// locks. We prioritize readers, which never block; in fact, they don't even
// use compare-and-swap operations. Using our own implementation of RW locks
// allows us to save many atomic operations by packing data more carefully.
// In particular:
// - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an
// internal
// reference into an external reference in a single atomic arithmetic
// operation.
// - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to take
// a shared reference and check whether the entry is marked for deletion
// in a single atomic arithmetic operation.
};
static constexpr uint32_t kOneInternalRef = 0x8000;
static constexpr uint32_t kOneExternalRef = 0x8001;
std::atomic<uint32_t> refs;
static constexpr uint8_t kIsElementOffset = 1;
static constexpr uint8_t kClockPriorityOffset = 2;
static constexpr uint8_t kIsHitOffset = 4;
@ -209,7 +204,7 @@ struct ClockHandle {
CLOCK_PRIORITY = 3 << kClockPriorityOffset,
// Whether the handle has been looked up after its insertion.
HAS_HIT = 1 << kIsHitOffset,
// The value of Cache::Priority for the handle.
// The value of Cache::Priority of the handle.
CACHE_PRIORITY = 1 << kCachePriorityOffset,
};
@ -226,30 +221,67 @@ struct ClockHandle {
// up in this slot or a higher one.
std::atomic<uint32_t> displacements;
// Synchronization rules:
// - Use a shared reference when we want the handle's identity
// members (key_data, hash, value and IS_ELEMENT flag) to
// remain untouched, but not modify them. The only updates
// that a shared reference allows are:
static constexpr uint8_t kExternalRefsOffset = 0;
static constexpr uint8_t kSharedRefsOffset = 15;
static constexpr uint8_t kExclusiveRefOffset = 30;
static constexpr uint8_t kWillBeDeletedOffset = 31;
enum Refs : uint32_t {
// Synchronization model:
// - An external reference guarantees that hash, value, key_data
// and the IS_ELEMENT flag are not modified. Doesn't allow
// any writes.
// - An internal reference has the same guarantees as an
// external reference, and additionally allows the following
// idempotent updates on the handle:
// * set CLOCK_PRIORITY to NONE;
// * set the HAS_HIT bit.
// Notice that these two types of updates are idempotent, so
// they don't require synchronization across shared references.
// - Use an exclusive reference when we want identity members
// to remain untouched, as well as modify any identity member
// or flag.
// - displacements can be modified without holding a reference.
// - refs is only modified through appropriate functions to
// take or release references.
// * set the HAS_HIT bit;
// * set the WILL_BE_DELETED bit.
// - A shared reference is either an external reference or an
// internal reference.
// - An exclusive reference guarantees that no other thread has a shared
// or exclusive reference to the handle, and allows writes
// on the handle.
// Number of external references to the slot.
EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
<< kExternalRefsOffset, // Bits 0, ..., 14
// Number of internal references plus external references to the slot.
SHARED_REFS = ((uint32_t{1} << 15) - 1)
<< kSharedRefsOffset, // Bits 15, ..., 29
// Whether a thread has an exclusive reference to the slot.
EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30
// Whether the handle will be deleted soon. When this bit is set, new
// internal
// or external references to this handle stop being accepted.
// There is an exception: external references can be created from
// existing external references, or converting from existing internal
// references.
WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31
// Having these 4 fields in a single variable allows us to support the
// following operations efficiently:
// - Convert an internal reference into an external reference in a single
// atomic arithmetic operation.
// - Attempt to take a shared reference using a single atomic arithmetic
// operation. This is because we can increment the internal ref count
// as well as checking whether the entry is marked for deletion using a
// single atomic arithmetic operation (and one non-atomic comparison).
};
static constexpr uint32_t kOneInternalRef = 0x8000;
static constexpr uint32_t kOneExternalRef = 0x8001;
std::atomic<uint32_t> refs;
ClockHandle()
: value(nullptr),
deleter(nullptr),
hash(0),
total_charge(0),
refs(0),
flags(0),
displacements(0) {
displacements(0),
refs(0) {
SetWillBeDeleted(false);
SetIsElement(false);
SetClockPriority(ClockPriority::NONE);
@ -257,26 +289,66 @@ struct ClockHandle {
key_data.fill(0);
}
// The copy ctor and assignment operator are only used to copy a handle
// for immediate deletion. (We need to copy because the slot may become
// re-used before the deletion is completed.) We only copy the necessary
// members to carry out the deletion. In particular, we don't need
// the atomic members.
ClockHandle(const ClockHandle& other) { *this = other; }
void operator=(const ClockHandle& other) {
value = other.value;
deleter = other.deleter;
hash = other.hash;
total_charge = other.total_charge;
refs.store(other.refs);
key_data = other.key_data;
flags.store(other.flags);
SetWillBeDeleted(other.WillBeDeleted());
SetIsElement(other.IsElement());
SetClockPriority(other.GetClockPriority());
SetCachePriority(other.GetCachePriority());
displacements.store(other.displacements);
total_charge = other.total_charge;
}
Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }
void FreeData() {
if (deleter) {
(*deleter)(key(), value);
}
}
// Calculate the memory usage by metadata.
inline size_t CalcMetaCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
if (metadata_charge_policy != kFullChargeCacheMetadata) {
return 0;
} else {
// #ifdef ROCKSDB_MALLOC_USABLE_SIZE
// return malloc_usable_size(
// const_cast<void*>(static_cast<const void*>(this)));
// #else
// TODO(Guido) malloc_usable_size only works when we call it on
// a pointer allocated with malloc. Because our handles are all
// allocated in a single shot as an array, the user can't call
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
// pointer returned by the cache. Moreover, malloc_usable_size
// expects a heap-allocated handle, but sometimes in our code we
// wish to pass a stack-allocated handle (this is only a performance
// concern).
// What is the right way to compute metadata charges with pre-allocated
// handles?
return sizeof(ClockHandle);
// #endif
}
}
inline void CalcTotalCharge(
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
total_charge = charge + CalcMetaCharge(metadata_charge_policy);
}
inline size_t GetCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
assert(total_charge >= meta_charge);
return total_charge - meta_charge;
}
// flags functions.
bool IsElement() const { return flags & IS_ELEMENT; }
@ -292,10 +364,6 @@ struct ClockHandle {
void SetHit() { flags |= HAS_HIT; }
bool IsInClock() const {
return GetClockPriority() != ClockHandle::ClockPriority::NONE;
}
Cache::Priority GetCachePriority() const {
return static_cast<Cache::Priority>(flags & CACHE_PRIORITY);
}
@ -308,6 +376,10 @@ struct ClockHandle {
}
}
bool IsInClock() const {
return GetClockPriority() != ClockHandle::ClockPriority::NONE;
}
ClockPriority GetClockPriority() const {
return static_cast<ClockPriority>(flags & Flags::CLOCK_PRIORITY);
}
@ -328,49 +400,6 @@ struct ClockHandle {
flags |= new_priority;
}
void FreeData() {
if (deleter) {
(*deleter)(key(), value);
}
}
// Calculate the memory usage by metadata.
inline size_t CalcMetaCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
if (metadata_charge_policy != kFullChargeCacheMetadata) {
return 0;
} else {
// #ifdef ROCKSDB_MALLOC_USABLE_SIZE
// return malloc_usable_size(
// const_cast<void*>(static_cast<const void*>(this)));
// #else
// TODO(Guido) malloc_usable_size only works when we call it on
// a pointer allocated with malloc. Because our handles are all
// allocated in a single shot as an array, the user can't call
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
// pointer returned by the cache. Moreover, malloc_usable_size
// expects a heap-allocated handle, but sometimes in our code we
// wish to pass a stack-allocated handle (this is only a performance
// concern).
// What is the right way to compute metadata charges with pre-allocated
// handles?
return sizeof(ClockHandle);
// #endif
}
}
inline void CalcTotalCharge(
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
total_charge = charge + CalcMetaCharge(metadata_charge_policy);
}
inline size_t GetCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
assert(total_charge >= meta_charge);
return total_charge - meta_charge;
}
inline bool IsEmpty() const {
return !this->IsElement() && this->displacements == 0;
}
@ -380,11 +409,12 @@ struct ClockHandle {
}
inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
return this->IsElement() && this->hash == some_hash &&
this->key() == some_key;
return this->hash == some_hash && this->key() == some_key;
}
bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }
// refs functions.
inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }
void SetWillBeDeleted(bool will_be_deleted) {
if (will_be_deleted) {
@ -394,28 +424,7 @@ struct ClockHandle {
}
}
// The following functions are for taking and releasing refs.
// Tries to take an external ref. Returns true iff it succeeds.
inline bool TryExternalRef() {
if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
return true;
}
refs -= kOneExternalRef;
return false;
}
// Releases an external ref. Returns the new value (this is useful to
// avoid an extra atomic read).
inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; }
// Take an external ref, assuming there is already one external ref
// to the handle.
void Ref() {
// TODO(Guido) Is it okay to assume that the existing external reference
// survives until this function returns?
refs += kOneExternalRef;
}
bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }
// Tries to take an internal ref. Returns true iff it succeeds.
inline bool TryInternalRef() {
@ -426,9 +435,19 @@ struct ClockHandle {
return false;
}
inline void ReleaseInternalRef() { refs -= kOneInternalRef; }
// Tries to take an external ref. Returns true iff it succeeds.
inline bool TryExternalRef() {
if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
return true;
}
refs -= kOneExternalRef;
return false;
}
// Tries to take an exclusive ref. Returns true iff it succeeds.
// TODO(Guido) After every TryExclusiveRef call, we always call
// WillBeDeleted(). We could save an atomic read by having an output parameter
// with the last value of refs.
inline bool TryExclusiveRef() {
uint32_t will_be_deleted = refs & WILL_BE_DELETED;
uint32_t expected = will_be_deleted;
@ -436,15 +455,18 @@ struct ClockHandle {
EXCLUSIVE_REF | will_be_deleted);
}
// Repeatedly tries to take an exclusive reference, but stops as soon
// as an external reference is detected (in this case the wait would
// presumably be too long).
inline bool TrySpinExclusiveRef() {
// Repeatedly tries to take an exclusive reference, but aborts as soon
// as an external or exclusive reference is detected (since the wait
// would presumably be too long).
inline bool SpinTryExclusiveRef() {
uint32_t expected = 0;
uint32_t will_be_deleted = 0;
uint32_t spins = kSpinsPerTry;
while (!refs.compare_exchange_strong(expected,
EXCLUSIVE_REF | will_be_deleted)) {
if (expected & EXTERNAL_REFS) {
EXCLUSIVE_REF | will_be_deleted) &&
spins--) {
std::this_thread::yield();
if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) {
return false;
}
will_be_deleted = expected & WILL_BE_DELETED;
@ -453,75 +475,88 @@ struct ClockHandle {
return true;
}
inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }
// Take an external ref, assuming there is already one external ref
// to the handle.
void Ref() {
// TODO(Guido) Is it okay to assume that the existing external reference
// survives until this function returns?
refs += kOneExternalRef;
}
// The following functions are for upgrading and downgrading refs.
// They guarantee atomicity, i.e., no exclusive refs to the handle
// can be taken by a different thread during the conversion.
inline void ReleaseExternalRef() { refs -= kOneExternalRef; }
inline void ExclusiveToInternalRef() {
refs += kOneInternalRef;
ReleaseExclusiveRef();
}
inline void ReleaseInternalRef() { refs -= kOneInternalRef; }
inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }
// Downgrade an exclusive ref to external.
inline void ExclusiveToExternalRef() {
refs += kOneExternalRef;
ReleaseExclusiveRef();
}
// TODO(Guido) Do we want to bound the loop and prepare the
// algorithms to react to a failure?
inline void InternalToExclusiveRef() {
uint32_t expected = kOneInternalRef;
uint32_t will_be_deleted = 0;
while (!refs.compare_exchange_strong(expected,
EXCLUSIVE_REF | will_be_deleted)) {
will_be_deleted = expected & WILL_BE_DELETED;
expected = kOneInternalRef | will_be_deleted;
}
}
// Convert an internal ref into external.
inline void InternalToExternalRef() {
refs += kOneExternalRef - kOneInternalRef;
}
// TODO(Guido) Same concern.
inline void ExternalToExclusiveRef() {
uint32_t expected = kOneExternalRef;
uint32_t will_be_deleted = 0;
while (!refs.compare_exchange_strong(expected,
EXCLUSIVE_REF | will_be_deleted)) {
will_be_deleted = expected & WILL_BE_DELETED;
expected = kOneExternalRef | will_be_deleted;
}
}
}; // struct ClockHandle
class ClockHandleTable {
public:
explicit ClockHandleTable(int hash_bits);
explicit ClockHandleTable(size_t capacity, int hash_bits);
~ClockHandleTable();
// Returns a pointer to a visible element matching the key/hash, or
// nullptr if not present.
// Returns a pointer to a visible handle matching the key/hash, or
// nullptr if not present. When an actual handle is produced, an
// internal reference is handed over.
ClockHandle* Lookup(const Slice& key, uint32_t hash);
// Inserts a copy of h into the hash table.
// Returns a pointer to the inserted handle, or nullptr if no slot
// available was found. If an existing visible element matching the
// key/hash is already present in the hash table, the argument old
// is set to point to it; otherwise, it's set to nullptr.
// Returns an exclusive reference to h, and no references to old.
ClockHandle* Insert(ClockHandle* h, ClockHandle** old);
// Inserts a copy of h into the hash table. Returns a pointer to the
// inserted handle, or nullptr if no available slot was found. Every
// existing visible handle matching the key that is already present in the
// hash table is marked as WILL_BE_DELETED. The deletion is also attempted,
// and, if the attempt is successful, the handle is inserted into the
// autovector deleted. When take_reference is true, the function hands
// over an external reference on the handle, and otherwise no reference is
// produced.
ClockHandle* Insert(ClockHandle* h, autovector<ClockHandle>* deleted,
bool take_reference);
// Assigns h the appropriate clock priority, making it evictable.
void ClockOn(ClockHandle* h);
// Makes h non-evictable.
void ClockOff(ClockHandle* h);
// Removes h from the hash table. The handle must already be off clock.
void Remove(ClockHandle* h);
// Runs the clock eviction algorithm until there is enough space to
// insert an element with the given charge.
void ClockRun(size_t charge);
// Extracts the element information from a handle (src), and assigns it
// to a hash table slot (dst). Doesn't touch displacements and refs,
// which are maintained by the hash table algorithm.
void Assign(ClockHandle* dst, ClockHandle* src);
// Remove h from the hash table. Requires an exclusive ref to h.
void Remove(ClockHandle* h, autovector<ClockHandle>* deleted);
// Remove from the hash table all handles with matching key/hash along a
// probe sequence, starting from the given probe number. Doesn't
// require any references.
void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe,
autovector<ClockHandle>* deleted);
void RemoveAll(const Slice& key, uint32_t hash,
autovector<ClockHandle>* deleted) {
uint32_t probe = 0;
RemoveAll(key, hash, probe, deleted);
}
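// Presumably frees the data owned by the handles collected in deleted and
// subtracts their charge from the table's usage.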
void Free(autovector<ClockHandle>* deleted);
// Tries to remove h from the hash table. If the attempt is successful,
// the function hands over an exclusive ref to h.
bool TryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
// Similar to TryRemove, except that it spins, increasing the chances of
// success. Requires that the caller thread has no shared ref to h.
bool SpinTryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
template <typename T>
void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
@ -531,12 +566,9 @@ class ClockHandleTable {
if (h->TryExclusiveRef()) {
if (h->IsElement() &&
(apply_if_will_be_deleted || !h->WillBeDeleted())) {
// Hand the internal ref over to func, which is now responsible
// for releasing it.
func(h);
} else {
h->ReleaseExclusiveRef();
}
h->ReleaseExclusiveRef();
}
}
}
@ -565,53 +597,81 @@ class ClockHandleTable {
uint32_t GetOccupancy() const { return occupancy_; }
size_t GetUsage() const { return usage_; }
size_t GetCapacity() const { return capacity_; }
// Returns x mod 2^{length_bits_}.
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
private:
friend class ClockCacheShard;
int FindElement(const Slice& key, uint32_t hash, uint32_t& probe);
int FindAvailableSlot(const Slice& key, uint32_t& probe);
int FindElementOrAvailableSlot(const Slice& key, uint32_t hash,
uint32_t& probe);
// Extracts the element information from a handle (src), and assigns it
// to a hash table slot (dst). Doesn't touch displacements and refs,
// which are maintained by the hash table algorithm.
void Assign(ClockHandle* dst, ClockHandle* src);
// Returns the index of the first slot probed (hashing with
// the given key) with a handle e such that match(e) is true.
// At every step, the function first tests whether match(e) holds.
// If it's false, it evaluates abort(e) to decide whether the
// search should be aborted, and in the affirmative returns -1.
// For every handle e probed except the last one, the function runs
// update(e). We say a probe to a handle e is aborting if match(e) is
// false and abort(e) is true. The argument probe is one more than the
// last non-aborting probe during the call. This is so that the
// variable can be used to keep track of progress across consecutive
// calls to FindSlot.
inline int FindSlot(const Slice& key, std::function<bool(ClockHandle*)> match,
// Returns the first slot in the probe sequence, starting from the given
// probe number, with a handle e such that match(e) is true. At every
// step, the function first tests whether match(e) holds. If this is false,
// it evaluates abort(e) to decide whether the search should be aborted,
// and in the affirmative returns nullptr. For every handle e probed except
// the last one, the function runs update(e).
// The probe parameter is modified as follows. We say a probe to a handle
// e is aborting if match(e) is false and abort(e) is true. Then the final
// value of probe is one more than the last non-aborting probe during the
// call. This is so that the variable can be used to keep track of
// progress across consecutive calls to FindSlot.
inline ClockHandle* FindSlot(const Slice& key,
std::function<bool(ClockHandle*)> match,
std::function<bool(ClockHandle*)> stop,
std::function<void(ClockHandle*)> update,
uint32_t& probe);
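// A minimal sketch of this contract (hypothetical pseudo-implementation;
// the real probing scheme may differ, and Probe(key, i) stands for an
// assumed mapping from the key and probe number to a slot index):
//
//   for (;; probe++) {
//     ClockHandle* h = &array_[ModTableSize(Probe(key, probe))];
//     if (match(h)) { probe++; return h; }  // non-aborting probe
//     if (stop(h)) { return nullptr; }      // aborting: probe unchanged
//     update(h);
//   }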
// After a failed FindSlot call (i.e., with answer -1), this function
// decrements all displacements, starting from the 0-th probe.
// Returns an available slot for the given key. All copies of the
// key found along the probing sequence until an available slot is
// found are marked for deletion. On each of them, a deletion is
// attempted, and when the attempt succeeds the slot is assigned to
// the new copy of the element.
ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash,
uint32_t& probe,
autovector<ClockHandle>* deleted);
// After a failed FindSlot call (i.e., with answer nullptr) in
// FindAvailableSlot, this function fixes all displacements,
// starting from the 0-th probe, until the given probe.
void Rollback(const Slice& key, uint32_t probe);
// Number of hash bits used for table index.
// The size of the table is 1 << length_bits_.
int length_bits_;
const int length_bits_;
// For faster computation of ModTableSize.
const uint32_t length_bits_mask_;
// Number of elements in the table.
uint32_t occupancy_;
// Maximum number of elements the user can store in the table.
uint32_t occupancy_limit_;
const uint32_t occupancy_limit_;
// Maximum total charge of all elements stored in the table.
const size_t capacity_;
// We partition the following members into different cache lines
// to avoid false sharing among Lookup, Release, Erase and Insert
// operations in ClockCacheShard.
ALIGN_AS(CACHE_LINE_SIZE)
// Array of slots comprising the hash table.
std::unique_ptr<ClockHandle[]> array_;
ALIGN_AS(CACHE_LINE_SIZE)
// Clock algorithm sweep pointer.
std::atomic<uint32_t> clock_pointer_;
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<uint32_t> occupancy_;
// Memory size for entries residing in the cache.
std::atomic<size_t> usage_;
}; // class ClockHandleTable
// A single shard of sharded cache.
@ -652,20 +712,26 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
Statistics* /*stats*/) override {
return Lookup(key, hash);
}
Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
bool Release(Cache::Handle* handle, bool /*useful*/,
bool erase_if_last_ref) override {
return Release(handle, erase_if_last_ref);
}
bool IsReady(Cache::Handle* /*handle*/) override { return true; }
void Wait(Cache::Handle* /*handle*/) override {}
bool Ref(Cache::Handle* handle) override;
bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
void Erase(const Slice& key, uint32_t hash) override;
size_t GetUsage() const override;
size_t GetPinnedUsage() const override;
void ApplyToSomeEntries(
@ -675,20 +741,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
void EraseUnRefEntries() override;
std::string GetPrintableOptions() const override;
std::string GetPrintableOptions() const override { return std::string{}; }
private:
friend class ClockCache;
// Makes an element evictable by clock.
void ClockOn(ClockHandle* h);
// Makes an element non-evictable.
void ClockOff(ClockHandle* h);
// Requires an exclusive ref on h.
void Evict(ClockHandle* h);
// Free some space following strict clock policy until enough space
// to hold (usage_ + charge) is freed or there are no evictable elements.
void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);
@ -703,34 +760,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
static int CalcHashBits(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy);
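// Worked example (illustrative numbers only): with capacity = 1 GiB and
// estimated_value_size = 16 KiB, the table needs room for roughly
// 2^30 / 2^14 = 65536 entries; sizing the array for a hypothetical ~50%
// occupancy doubles that to 131072 slots, i.e. hash_bits = 17.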
// Initialized before use.
size_t capacity_;
// Whether to reject insertion if cache reaches its full capacity.
bool strict_capacity_limit_;
uint32_t clock_pointer_;
std::atomic<bool> strict_capacity_limit_;
// ------------^^^^^^^^^^^^^-----------
// Not frequently modified data members
// ------------------------------------
//
// We separate data members that are updated frequently from the ones that
// are not frequently updated so that they don't share the same cache line,
// which would lead to false sharing
//
// ------------------------------------
// Frequently modified data members
// ------------vvvvvvvvvvvvv-----------
ClockHandleTable table_;
// Memory size for entries residing in the cache.
size_t usage_;
// mutex_ protects the following state.
// We don't count mutex_ as the cache's internal state so semantically we
// don't mind mutex_ invoking the non-const actions.
mutable DMutex mutex_;
}; // class ClockCacheShard
class ClockCache
@ -743,19 +776,28 @@ class ClockCache
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata);
~ClockCache() override;
const char* Name() const override { return "ClockCache"; }
CacheShard* GetShard(uint32_t shard) override;
const CacheShard* GetShard(uint32_t shard) const override;
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
uint32_t GetHash(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void DisownData() override;
private:
ClockCacheShard* shards_ = nullptr;
int num_shards_ = 0;
int num_shards_;
}; // class ClockCache
} // namespace clock_cache
