diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 3f55acf6b..ad2c5f4af 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -37,133 +37,191 @@ ClockHandleTable::ClockHandleTable(int hash_bits) } ClockHandleTable::~ClockHandleTable() { - ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize()); + ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize(), + true); } -ClockHandle* ClockHandleTable::Lookup(const Slice& key) { - int probe = 0; - int slot = FindVisibleElement(key, probe, 0); +ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) { + uint32_t probe = 0; + int slot = FindElement(key, hash, probe); return (slot == -1) ? nullptr : &array_[slot]; } ClockHandle* ClockHandleTable::Insert(ClockHandle* h, ClockHandle** old) { - int probe = 0; - int slot = - FindVisibleElementOrAvailableSlot(h->key(), probe, 1 /*displacement*/); + uint32_t probe = 0; + int slot = FindElementOrAvailableSlot(h->key(), h->hash, probe); *old = nullptr; if (slot == -1) { + // The key is not already present, and there's no available slot to place + // the new copy. return nullptr; } - if (array_[slot].IsEmpty() || array_[slot].IsTombstone()) { - bool empty = array_[slot].IsEmpty(); - Assign(slot, h); + if (!array_[slot].IsElement()) { + // The slot is empty or is a tombstone. ClockHandle* new_entry = &array_[slot]; - if (empty) { - // This used to be an empty slot. + new_entry->InternalToExclusiveRef(); + Assign(new_entry, h); + if (new_entry->displacements == 0) { + // The slot was empty. return new_entry; } // It used to be a tombstone, so there may already be a copy of the // key in the table. - slot = FindVisibleElement(h->key(), probe, 0 /*displacement*/); + slot = FindElement(h->key(), h->hash, probe); if (slot == -1) { - // No existing copy of the key. + // Nope, no existing copy of the key. return new_entry; } - *old = &array_[slot]; + ClockHandle* old_entry = &array_[slot]; + old_entry->ReleaseInternalRef(); + *old = old_entry; return new_entry; } else { // There is an existing copy of the key. - *old = &array_[slot]; + ClockHandle* old_entry = &array_[slot]; + old_entry->ReleaseInternalRef(); + *old = old_entry; // Find an available slot for the new element. - array_[slot].displacements++; - slot = FindAvailableSlot(h->key(), probe, 1 /*displacement*/); + old_entry->displacements++; + slot = FindAvailableSlot(h->key(), probe); if (slot == -1) { - // No available slots. Roll back displacements. - probe = 0; - slot = FindVisibleElement(h->key(), probe, -1); - array_[slot].displacements--; - FindAvailableSlot(h->key(), probe, -1); + // No available slots. return nullptr; } - Assign(slot, h); - return &array_[slot]; + ClockHandle* new_entry = &array_[slot]; + new_entry->InternalToExclusiveRef(); + Assign(new_entry, h); + return new_entry; } } void ClockHandleTable::Remove(ClockHandle* h) { - assert(!h->IsInClockList()); // Already off the clock list. - int probe = 0; + assert(!h->IsInClock()); // Already off clock. 
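
The Insert/Lookup/Remove logic above leans on per-slot displacement counters to tell tombstones apart from truly empty slots. The toy table below is a single-threaded sketch of the same bookkeeping under simplifying assumptions: int keys, plain linear probing instead of the double hashing used here, and no handling of duplicate keys (which the real Insert deals with by re-probing after landing on a tombstone).

#include <cstdint>
#include <vector>

// Toy slot: the real ClockHandle packs this state into flags and atomics;
// plain fields are enough for the sketch.
struct ToySlot {
  bool is_element = false;
  uint32_t displacements = 0;
  int key = 0;
};

class ToyTable {
 public:
  explicit ToyTable(uint32_t size) : slots_(size) {}

  // Returns false if no free slot was found. A real implementation would
  // roll the displacement increments back in that case.
  bool Insert(int key) {
    uint32_t i = Hash(key);
    for (uint32_t n = 0; n < slots_.size(); n++, i = Next(i)) {
      if (!slots_[i].is_element) {  // Empty slot or tombstone: take it.
        slots_[i].is_element = true;
        slots_[i].key = key;
        return true;
      }
      // key will land somewhere past this slot, so record the displacement.
      slots_[i].displacements++;
    }
    return false;
  }

  bool Lookup(int key) const {
    uint32_t i = Hash(key);
    for (uint32_t n = 0; n < slots_.size(); n++, i = Next(i)) {
      if (slots_[i].is_element && slots_[i].key == key) {
        return true;
      }
      if (!slots_[i].is_element && slots_[i].displacements == 0) {
        return false;  // Truly empty slot: the probe sequence ends here.
      }
      // Tombstone or unrelated element: keep probing.
    }
    return false;
  }

  // Precondition: key is present in the table.
  void Erase(int key) {
    uint32_t i = Hash(key);
    while (!(slots_[i].is_element && slots_[i].key == key)) {
      slots_[i].displacements--;  // Undo the increment made by Insert.
      i = Next(i);
    }
    // Leaving displacements untouched makes this slot a tombstone when
    // displacements > 0, and a plain empty slot otherwise.
    slots_[i].is_element = false;
  }

 private:
  uint32_t Hash(int key) const {
    return static_cast<uint32_t>(key) % static_cast<uint32_t>(slots_.size());
  }
  uint32_t Next(uint32_t i) const {
    return (i + 1) % static_cast<uint32_t>(slots_.size());
  }

  std::vector<ToySlot> slots_;
};
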
+ uint32_t probe = 0; FindSlot( - h->key(), [&h](ClockHandle* e) { return e == h; }, probe, - -1 /*displacement*/); - h->SetIsVisible(false); + h->key(), [&](ClockHandle* e) { return e == h; }, + [&](ClockHandle* /*e*/) { return false; }, + [&](ClockHandle* e) { e->displacements--; }, probe); + h->SetWillBeDeleted(false); h->SetIsElement(false); occupancy_--; } -void ClockHandleTable::Assign(int slot, ClockHandle* h) { - ClockHandle* dst = &array_[slot]; - uint32_t disp = dst->displacements; - *dst = *h; - dst->displacements = disp; - dst->SetIsVisible(true); +void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) { + // DON'T touch displacements and refs. + dst->value = src->value; + dst->deleter = src->deleter; + dst->hash = src->hash; + dst->total_charge = src->total_charge; + dst->key_data = src->key_data; + dst->flags.store(0); dst->SetIsElement(true); dst->SetClockPriority(ClockHandle::ClockPriority::NONE); + dst->SetCachePriority(src->GetCachePriority()); occupancy_++; } -void ClockHandleTable::Exclude(ClockHandle* h) { h->SetIsVisible(false); } - -int ClockHandleTable::FindVisibleElement(const Slice& key, int& probe, - int displacement) { - return FindSlot( - key, [&](ClockHandle* h) { return h->Matches(key) && h->IsVisible(); }, - probe, displacement); -} - -int ClockHandleTable::FindAvailableSlot(const Slice& key, int& probe, - int displacement) { +int ClockHandleTable::FindElement(const Slice& key, uint32_t hash, + uint32_t& probe) { return FindSlot( - key, [](ClockHandle* h) { return h->IsEmpty() || h->IsTombstone(); }, - probe, displacement); + key, + [&](ClockHandle* h) { + if (h->TryInternalRef()) { + if (h->Matches(key, hash)) { + return true; + } + h->ReleaseInternalRef(); + } + return false; + }, + [&](ClockHandle* h) { return h->displacements == 0; }, + [&](ClockHandle* /*h*/) {}, probe); } -int ClockHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key, - int& probe, - int displacement) { - return FindSlot( +int ClockHandleTable::FindAvailableSlot(const Slice& key, uint32_t& probe) { + int slot = FindSlot( key, [&](ClockHandle* h) { - return h->IsEmpty() || h->IsTombstone() || - (h->Matches(key) && h->IsVisible()); + if (h->TryInternalRef()) { + if (!h->IsElement()) { + return true; + } + h->ReleaseInternalRef(); + } + return false; }, - probe, displacement); + [&](ClockHandle* /*h*/) { return false; }, + [&](ClockHandle* h) { h->displacements++; }, probe); + if (slot == -1) { + Rollback(key, probe); + } + return slot; } -inline int ClockHandleTable::FindSlot(const Slice& key, - std::function cond, - int& probe, int displacement) { +int ClockHandleTable::FindElementOrAvailableSlot(const Slice& key, + uint32_t hash, + uint32_t& probe) { + int slot = FindSlot( + key, + [&](ClockHandle* h) { + if (h->TryInternalRef()) { + if (!h->IsElement() || h->Matches(key, hash)) { + return true; + } + h->ReleaseInternalRef(); + } + return false; + }, + [&](ClockHandle* /*h*/) { return false; }, + [&](ClockHandle* h) { h->displacements++; }, probe); + if (slot == -1) { + Rollback(key, probe); + } + return slot; +} + +int ClockHandleTable::FindSlot(const Slice& key, + std::function match, + std::function abort, + std::function update, + uint32_t& probe) { + // We use double-hashing probing. Every probe in the sequence is a + // pseudorandom integer, computed as a linear function of two random hashes, + // which we call base and increment. Specifically, the i-th probe is base + i + // * increment modulo the table size. 
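
As a quick aside, the "full cycle" property this probing scheme relies on (an odd increment is coprime with a power-of-two table size, so the sequence visits every slot exactly once before repeating) can be checked with a tiny standalone program; the table size, base and increment below are arbitrary.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t size = 1u << 4;  // Any power of two.
  const uint32_t base = 5;        // Any starting slot.
  const uint32_t inc = 7;         // Any odd increment; odd => gcd(inc, size) == 1.
  std::vector<bool> visited(size, false);
  uint32_t current = base % size;
  for (uint32_t i = 0; i < size; i++) {
    assert(!visited[current]);  // No slot is visited twice within one cycle.
    visited[current] = true;
    current = (current + inc) % size;
  }
  for (bool v : visited) assert(v);  // Every slot was visited.
  return 0;
}
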
uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); + // We use an odd increment, which is relatively prime with the power-of-two + // table size. This implies that we cycle back to the first probe only + // after probing every slot exactly once. uint32_t increment = ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); uint32_t current = ModTableSize(base + probe * increment); while (true) { ClockHandle* h = &array_[current]; - probe++; - if (current == base && probe > 1) { + if (current == base && probe > 0) { // We looped back. return -1; } - if (cond(h)) { + if (match(h)) { + probe++; return current; } - if (h->IsEmpty()) { - // We check emptyness after the condition, because - // the condition may be emptyness. + if (abort(h)) { return -1; } - h->displacements += displacement; + probe++; + update(h); + current = ModTableSize(current + increment); + } +} + +void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) { + uint32_t current = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); + uint32_t increment = + ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); + for (uint32_t i = 0; i < probe; i++) { + array_[current].displacements--; current = ModTableSize(current + increment); } } @@ -176,8 +234,7 @@ ClockCacheShard::ClockCacheShard( clock_pointer_(0), table_( CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)), - usage_(0), - clock_usage_(0) { + usage_(0) { set_metadata_charge_policy(metadata_charge_policy); } @@ -185,22 +242,16 @@ void ClockCacheShard::EraseUnRefEntries() { autovector last_reference_list; { DMutexLock l(mutex_); - uint32_t slot = 0; - do { - ClockHandle* old = &(table_.array_[slot]); - if (!old->IsInClockList()) { - continue; - } - ClockRemove(old); - table_.Remove(old); - assert(usage_ >= old->total_charge); - usage_ -= old->total_charge; - last_reference_list.push_back(*old); - slot = table_.ModTableSize(slot + 1); - } while (slot != 0); + table_.ApplyToEntriesRange( + [this, &last_reference_list](ClockHandle* h) { + // Externally unreferenced element. + last_reference_list.push_back(*h); + Evict(h); + }, + 0, table_.GetTableSize(), true); } - // Free the entries here outside of mutex for performance reasons. + // Free the entry outside of the mutex for performance reasons. 
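
A note on the constructor above: CalcHashBits, which sizes the table from the capacity and the estimated value size, is not part of this diff. The sketch below is only a guess at the general shape of such a computation, using the load-factor reasoning from clock_cache.h; the real function also takes the metadata charge policy into account and may round differently.

#include <cstddef>
#include <cstdint>

// Assumes estimated_value_size > 0 and a load_factor such as kLoadFactor.
int SketchCalcHashBits(size_t capacity, size_t estimated_value_size,
                       double load_factor) {
  // Expected number of entries if every entry carries roughly
  // estimated_value_size bytes of charge, divided by the target load factor.
  double num_entries =
      static_cast<double>(capacity) / estimated_value_size / load_factor;
  int hash_bits = 0;
  while (static_cast<double>(size_t{1} << hash_bits) < num_entries) {
    hash_bits++;  // Round the table size up to the next power of two.
  }
  return hash_bits;
}
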
for (auto& h : last_reference_list) { h.FreeData(); } @@ -238,45 +289,60 @@ void ClockCacheShard::ApplyToSomeEntries( callback(h->key(), h->value, h->GetCharge(metadata_charge_policy), h->deleter); }, - index_begin, index_end); + index_begin, index_end, false); } -void ClockCacheShard::ClockRemove(ClockHandle* h) { - assert(h->IsInClockList()); +void ClockCacheShard::ClockOff(ClockHandle* h) { h->SetClockPriority(ClockHandle::ClockPriority::NONE); - assert(clock_usage_ >= h->total_charge); - clock_usage_ -= h->total_charge; } -void ClockCacheShard::ClockInsert(ClockHandle* h) { - assert(!h->IsInClockList()); +void ClockCacheShard::ClockOn(ClockHandle* h) { + assert(!h->IsInClock()); bool is_high_priority = h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH; h->SetClockPriority(static_cast( is_high_priority * ClockHandle::ClockPriority::HIGH + (1 - is_high_priority) * ClockHandle::ClockPriority::MEDIUM)); - clock_usage_ += h->total_charge; +} + +void ClockCacheShard::Evict(ClockHandle* h) { + ClockOff(h); + table_.Remove(h); + assert(usage_ >= h->total_charge); + usage_ -= h->total_charge; } void ClockCacheShard::EvictFromClock(size_t charge, autovector* deleted) { + // TODO(Guido) When an element is in the probe sequence of a + // hot element, it will be hard to get an exclusive ref. + // We may need a mechanism to avoid that an element sits forever + // in cache waiting to be evicted. assert(charge <= capacity_); - while (clock_usage_ > 0 && (usage_ + charge) > capacity_) { - ClockHandle* old = &table_.array_[clock_pointer_]; + uint32_t max_iterations = table_.GetTableSize(); + while (usage_ + charge > capacity_ && max_iterations--) { + ClockHandle* h = &table_.array_[clock_pointer_]; clock_pointer_ = table_.ModTableSize(clock_pointer_ + 1); - // Clock list contains only elements which can be evicted. - if (!old->IsInClockList()) { - continue; - } - if (old->GetClockPriority() == ClockHandle::ClockPriority::LOW) { - ClockRemove(old); - table_.Remove(old); - assert(usage_ >= old->total_charge); - usage_ -= old->total_charge; - deleted->push_back(*old); - return; + + if (h->TryExclusiveRef()) { + if (!h->IsInClock() && h->IsElement()) { + // We adjust the clock priority to make the element evictable again. + // Why? Elements that are not in clock are either currently + // externally referenced or used to be---because we are holding an + // exclusive ref, we know we are in the latter case. This can only + // happen when the last external reference to an element was released, + // and the element was not immediately removed. + ClockOn(h); + } + + if (h->GetClockPriority() == ClockHandle::ClockPriority::LOW) { + deleted->push_back(*h); + Evict(h); + } else if (h->GetClockPriority() > ClockHandle::ClockPriority::LOW) { + h->DecreaseClockPriority(); + } + h->ReleaseExclusiveRef(); } - old->DecreaseClockPriority(); } } @@ -309,13 +375,14 @@ void ClockCacheShard::SetCapacity(size_t capacity) { EvictFromClock(0, &last_reference_list); } - // Free the entries here outside of mutex for performance reasons. + // Free the entry outside of the mutex for performance reasons. for (auto& h : last_reference_list) { h.FreeData(); } } void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { + assert(false); // Not supported. TODO(Guido) Support it? 
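
Going back to EvictFromClock above: the sweep combines priority aging with eviction and is bounded to one pass over the table. The sketch below models just that control flow, single-threaded and with a toy Slot type; the exclusive-ref handling and the re-enrollment of unreferenced elements are omitted.

#include <cstddef>
#include <cstdint>
#include <vector>

enum class Priority { NONE, LOW, MEDIUM, HIGH };

struct Slot {
  Priority prio = Priority::NONE;
  size_t charge = 0;
};

void SweepClock(std::vector<Slot>& slots, uint32_t& clock_pointer,
                size_t& usage, size_t needed, size_t capacity) {
  uint32_t max_iterations = static_cast<uint32_t>(slots.size());
  while (usage + needed > capacity && max_iterations-- > 0) {
    Slot& s = slots[clock_pointer];
    clock_pointer = static_cast<uint32_t>((clock_pointer + 1) % slots.size());
    if (s.prio == Priority::LOW) {
      usage -= s.charge;  // Evict.
      s.charge = 0;
      s.prio = Priority::NONE;
    } else if (s.prio == Priority::MEDIUM) {
      s.prio = Priority::LOW;  // One step closer to eviction.
    } else if (s.prio == Priority::HIGH) {
      s.prio = Priority::MEDIUM;
    }
    // Priority::NONE slots are skipped: empty or externally referenced.
  }
}
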
DMutexLock l(mutex_); strict_capacity_limit_ = strict_capacity_limit; } @@ -343,9 +410,10 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, autovector last_reference_list; { DMutexLock l(mutex_); + assert(table_.GetOccupancy() <= table_.GetOccupancyLimit()); // Free the space following strict clock policy until enough space - // is freed or the clock list is empty. + // is freed or there are no evictable elements. EvictFromClock(tmp.total_charge, &last_reference_list); if ((usage_ + tmp.total_charge > capacity_ && (strict_capacity_limit_ || handle == nullptr)) || @@ -376,30 +444,29 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, usage_ += h->total_charge; if (old != nullptr) { s = Status::OkOverwritten(); - assert(old->IsVisible()); - table_.Exclude(old); - if (!old->HasRefs()) { - // old is in clock because it's in cache and its reference count is 0. - ClockRemove(old); - table_.Remove(old); - assert(usage_ >= old->total_charge); - usage_ -= old->total_charge; + assert(!old->WillBeDeleted()); + old->SetWillBeDeleted(true); + // Try to evict the old copy of the element. + if (old->TryExclusiveRef()) { last_reference_list.push_back(*old); + Evict(old); + old->ReleaseExclusiveRef(); } } if (handle == nullptr) { - ClockInsert(h); + // If the user didn't provide a handle, no reference is taken, + // so we make the element evictable. + ClockOn(h); + h->ReleaseExclusiveRef(); } else { - // If caller already holds a ref, no need to take one here. - if (!h->HasRefs()) { - h->Ref(); - } + // The caller already holds a ref. + h->ExclusiveToExternalRef(); *handle = reinterpret_cast(h); } } } - // Free the entries here outside of mutex for performance reasons. + // Free the entry outside of the mutex for performance reasons. for (auto& h : last_reference_list) { h.FreeData(); } @@ -407,95 +474,102 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, return s; } -Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t /* hash */) { +Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { ClockHandle* h = nullptr; - { - DMutexLock l(mutex_); - h = table_.Lookup(key); - if (h != nullptr) { - assert(h->IsVisible()); - if (!h->HasRefs()) { - // The entry is in clock since it's in the hash table and has no - // external references. - ClockRemove(h); - } - h->Ref(); - h->SetHit(); - } + h = table_.Lookup(key, hash); + if (h != nullptr) { + // TODO(Guido) Comment from #10347: Here it looks like we have three atomic + // updates where it would be possible to combine into one CAS (more metadata + // under one atomic field) or maybe two atomic updates (one arithmetic, one + // bitwise). Something to think about optimizing. + h->InternalToExternalRef(); + h->SetHit(); + // The handle is now referenced, so we take it out of clock. + ClockOff(h); } return reinterpret_cast(h); } bool ClockCacheShard::Ref(Cache::Handle* h) { ClockHandle* e = reinterpret_cast(h); - DMutexLock l(mutex_); - // To create another reference - entry must be already externally referenced. - assert(e->HasRefs()); - e->Ref(); - return true; + assert(e->HasExternalRefs()); + return e->TryExternalRef(); } bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { + // In contrast with LRUCache's Release, this function won't delete the handle + // when the reference is the last one and the cache is above capacity. 
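
For context, this is roughly how a caller drives the Insert, Lookup and Release paths above through the generic Cache interface. The cache object is assumed to be constructed elsewhere, and the 16-byte key length is an assumption matching kCacheKeySize; everything else uses the standard rocksdb::Cache API.

#include <string>

#include "rocksdb/cache.h"

void CacheUsageSketch(rocksdb::Cache* cache) {
  const std::string key(16, 'k');  // Must be exactly kCacheKeySize bytes.
  auto deleter = [](const rocksdb::Slice& /*key*/, void* value) {
    delete static_cast<std::string*>(value);
  };

  // Insert without requesting a handle: no reference is taken, so the entry
  // starts out evictable (ClockOn in the code above).
  rocksdb::Status s =
      cache->Insert(key, new std::string("value"), /*charge=*/1024, deleter);
  if (!s.ok()) {
    return;
  }

  // Lookup pins the entry (external ref plus ClockOff), so it must be paired
  // with Release.
  rocksdb::Cache::Handle* h = cache->Lookup(key);
  if (h != nullptr) {
    auto* value = static_cast<std::string*>(cache->Value(h));
    (void)value;
    cache->Release(h);  // A later clock sweep makes the entry evictable again.
  }
}
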
Space + // is only freed up by EvictFromClock (called by Insert when space is needed) + // and Erase. if (handle == nullptr) { return false; } + ClockHandle* h = reinterpret_cast(handle); - ClockHandle copy; - bool last_reference = false; - assert(!h->IsInClockList()); - { - DMutexLock l(mutex_); - last_reference = h->Unref(); - if (last_reference && h->IsVisible()) { - // The item is still in cache, and nobody else holds a reference to it. - if (usage_ > capacity_ || erase_if_last_ref) { - // The clock list must be empty since the cache is full. - assert(clock_usage_ == 0 || erase_if_last_ref); - // Take this opportunity and remove the item. - table_.Remove(h); + uint32_t hash = h->hash; + uint32_t refs = h->ReleaseExternalRef(); + bool last_reference = !(refs & ClockHandle::EXTERNAL_REFS); + bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED; + + if (last_reference && (will_be_deleted || erase_if_last_ref)) { + // At this point we want to evict the element, so we need to take + // a lock and an exclusive reference. But there's a problem: + // as soon as we released the last reference, an Insert or Erase could've + // replaced this element, and by the time we take the lock and ref + // we could potentially be referencing a different element. + // Thus, before evicting the (potentially different) element, we need to + // re-check that it's unreferenced and marked as WILL_BE_DELETED, so the + // eviction is safe. Additionally, we check that the hash doesn't change, + // which will detect, most of the time, whether the element is a different + // one. The bottomline is that we only guarantee that the input handle will + // be deleted, and occasionally also another handle, but in any case all + // deleted handles are safe to delete. + // TODO(Guido) With lock-free inserts and deletes we may be able to + // "atomically" transition to an exclusive ref, without creating a deadlock. + ClockHandle copy; + { + DMutexLock l(mutex_); + if (h->TrySpinExclusiveRef()) { + will_be_deleted = h->refs & ClockHandle::WILL_BE_DELETED; + // Check that it's still safe to delete. + if (h->IsElement() && (will_be_deleted || erase_if_last_ref) && + h->hash == hash) { + copy = *h; + Evict(h); + } + h->ReleaseExclusiveRef(); } else { - // Put the item back on the clock list, and don't free it. - ClockInsert(h); - last_reference = false; + // An external ref was detected. + return false; } } - // If it was the last reference, then decrement the cache usage. - if (last_reference) { - assert(usage_ >= h->total_charge); - usage_ -= h->total_charge; - copy = *h; - } - } - // Free the entry here outside of mutex for performance reasons. - if (last_reference) { + // Free the entry outside of the mutex for performance reasons. copy.FreeData(); + return true; } - return last_reference; + + return false; } -void ClockCacheShard::Erase(const Slice& key, uint32_t /* hash */) { +void ClockCacheShard::Erase(const Slice& key, uint32_t hash) { ClockHandle copy; bool last_reference = false; { DMutexLock l(mutex_); - ClockHandle* h = table_.Lookup(key); + ClockHandle* h = table_.Lookup(key, hash); if (h != nullptr) { - table_.Exclude(h); - if (!h->HasRefs()) { - // The entry is in Clock since it's in cache and has no external - // references. 
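
The re-validation step that Release performs before evicting can be modeled in isolation. The sketch below uses a hypothetical Slot type with a much simpler ref word than ClockHandle; it only illustrates the "drop the last ref, re-acquire exclusivity, re-check identity" sequence described in the comments above.

#include <atomic>
#include <cstdint>

struct Slot {
  static constexpr uint32_t kExclusive = 1u << 31;
  std::atomic<uint32_t> refs{0};  // External ref count plus the exclusive bit.
  bool is_element = false;        // Only touched under an exclusive ref here.
  bool will_be_deleted = false;
  uint32_t hash = 0;

  bool TryExclusiveRef() {
    uint32_t expected = 0;
    return refs.compare_exchange_strong(expected, kExclusive);
  }
  void ReleaseExclusiveRef() { refs.fetch_and(~kExclusive); }
};

// Called after the caller dropped what it believes was the last external ref.
// By then the slot may have been reused for a different key, so before
// destroying anything we re-acquire exclusivity and re-check identity (here,
// just the hash), mirroring the re-check in Release above.
bool TryEvictAfterLastRelease(Slot& s, uint32_t expected_hash,
                              bool erase_if_last_ref) {
  if (!s.TryExclusiveRef()) {
    return false;  // Someone took a new reference; nothing to do.
  }
  bool evict = s.is_element && (s.will_be_deleted || erase_if_last_ref) &&
               s.hash == expected_hash;
  if (evict) {
    s.is_element = false;  // Stand-in for Evict(h); data is freed by the caller.
  }
  s.ReleaseExclusiveRef();
  return evict;
}
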
-      ClockRemove(h);
-      table_.Remove(h);
-      assert(usage_ >= h->total_charge);
-      usage_ -= h->total_charge;
-      last_reference = true;
+      h->SetWillBeDeleted(true);
+      h->ReleaseInternalRef();
+      if (h->TryExclusiveRef()) { copy = *h;
+        Evict(h);
+        last_reference = true;
+        h->ReleaseExclusiveRef(); } } }
-  // Free the entry here outside of mutex for performance reasons.
-  // last_reference will only be true if e != nullptr.
+  // Free the entry outside of the mutex for performance reasons.
  if (last_reference) { copy.FreeData(); }
@@ -507,9 +581,25 @@ size_t ClockCacheShard::GetUsage() const { }
size_t ClockCacheShard::GetPinnedUsage() const {
+  // Computes the pinned usage by scanning the whole hash table. This
+  // is slow, but avoids keeping an exact counter on the clock usage,
+  // i.e., the total charge of the elements that are not externally referenced.
+  // Why avoid this? Because Lookup removes elements from the clock
+  // list, an exact counter would need to be updated on every lookup,
+  // which creates additional synchronization costs.
  DMutexLock l(mutex_);
-  assert(usage_ >= clock_usage_);
-  return usage_ - clock_usage_;
+
+  size_t clock_usage = 0;
+
+  table_.ConstApplyToEntriesRange(
+      [&clock_usage](ClockHandle* h) {
+        if (h->HasExternalRefs()) {
+          clock_usage += h->total_charge;
+        }
+      },
+      0, table_.GetTableSize(), true);
+
+  return clock_usage;
}
std::string ClockCacheShard::GetPrintableOptions() const {
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index ca6205b83..8091bab27 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -10,6 +10,8 @@
 #pragma once
 #include
+#include
+#include
 #include
 #include
@@ -27,116 +29,254 @@ namespace ROCKSDB_NAMESPACE { namespace clock_cache {
-// Clock cache implementation. This is based on FastLRUCache's open-addressed
-// hash table. Importantly, it stores elements in an array, and resolves
-// collision using a probing strategy. Visibility and referenceability of
-// elements works as usual. See fast_lru_cache.h for a detailed description.
+// Block cache implementation using a lock-free open-address hash table
+// and clock eviction.
+
+///////////////////////////////////////////////////////////////////////////////
+// Part 1: Handles
+//
+// Every slot in the hash table is a ClockHandle. A handle can be in a few
+// different states that stem from the fact that handles can be externally
+// referenced and, thus, can't always be immediately evicted when a delete
+// operation is executed or when they are replaced by a new version (via an
+// insert of the same key). Concretely, the state of a handle is defined by the
+// following two properties:
+// (R) Externally referenced: A handle can be referenced externally, or not.
+//    Importantly, a handle can be evicted if and only if it's not
+//    referenced. In particular, when a handle becomes referenced, it's
+//    temporarily taken out of clock until all references to it are released.
+// (M) Marked for deletion (or invisible): A handle is marked for deletion
+//    when an operation attempts to delete it, but the handle is externally
+//    referenced, so it can't be immediately deleted. When this mark is placed,
+//    lookups will no longer be able to find it. Consequently, no more external
+//    references will be taken to the handle. When a handle is marked for
+//    deletion, we also say it's invisible.
+// These properties induce 4 different states, with transitions defined as
+// follows:
+// - Not M --> M: When a handle is deleted or replaced by a new version, but
+//   not immediately evicted.
+// - M --> not M: This cannot happen. Once a handle is marked for deletion,
+//   there is no going back.
+// - R --> not R: When all references to a handle are released.
+// - Not R --> R: When an unreferenced handle becomes referenced. This can only
+//   happen if the handle is visible, since references to a handle can only be
+//   created when it's visible.
+//
+///////////////////////////////////////////////////////////////////////////////
+// Part 2: Hash table structure
+//
+// Internally, the cache uses an open-addressed hash table to index the
+// handles. We use tombstone counters to keep track of displacements. Probes
+// are generated with double-hashing (but the code can be easily modified to
+// use other probing schemes, like linear hashing). Because of the tombstones
+// and the two possible visibility states of a handle, the table slots (we use
+// the word "slot" to refer to handles that are not necessarily valid key-value
+// elements) can be in 4 different states:
+// 1. Visible element: The slot contains an element in not M state.
+// 2. To-be-deleted element: The slot contains an element in M state.
+// 3. Tombstone: The slot doesn't contain an element, but there is some other
+//    element that probed this slot during its insertion.
+// 4. Empty: The slot is unused.
+// When an element is removed from the table, its slot either transitions to a
+// tombstone or to an empty slot, depending on the number of displacements of
+// the slot. In any case, the slot becomes available. When a handle is inserted
+// into that slot, it becomes a visible element again.
+//
+///////////////////////////////////////////////////////////////////////////////
+// Part 3: The clock algorithm
+//
+// We maintain a circular buffer with the handles available for eviction,
+// which the clock algorithm traverses (using a "clock pointer") to pick the
+// next victim. We use the hash table array as the circular buffer, and mark
+// the handles that are evictable. For this we use different clock flags,
+// namely NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and
+// HIGH represent how close an element is to being evictable, LOW being
+// immediately evictable. NONE means the slot is not evictable. This is due to
+// one of the following reasons:
+// (i) the slot doesn't contain an element, or
+// (ii) the slot contains an element that is in R state, or
+// (iii) the slot contains an element that was in R state but no longer is,
+//    and the clock pointer has not swept through the slot since the element
+//    stopped being referenced.
+//
+// The priority NONE is really only important for case (iii), as in the other
+// two cases there are other metadata fields that already capture the state.
+// When an element stops being referenced (and is not deleted), the clock
+// algorithm must acknowledge this, and assign a non-NONE priority to make
+// the element evictable again.
+//
+///////////////////////////////////////////////////////////////////////////////
+// Part 4: Synchronization
+//
+// We provide the following synchronization guarantees:
+// - Lookup is lock-free.
+// - Release is lock-free, unless (i) no references to the element are left,
+//   and (ii) it was marked for deletion or the user wishes to delete it upon
+//   releasing the last reference.
+// - Insert and Erase still use a per-shard lock.
+//
+// Our hash table is lock-free, in the sense that system-wide progress is
+// guaranteed, i.e., some thread is always able to make progress.
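
The reference scheme described in Part 4 behaves like a reader-writer lock packed into a single atomic word. The model below is a simplification with one reader count instead of the separate external and internal counts that ClockHandle defines further down; the constants are illustrative only.

#include <atomic>
#include <cstdint>

struct RefWord {
  static constexpr uint32_t kOneRef = 1;                // Bits 0..29: readers.
  static constexpr uint32_t kExclusive = 1u << 30;      // Writer bit.
  static constexpr uint32_t kWillBeDeleted = 1u << 31;  // Deletion mark.
  std::atomic<uint32_t> bits{0};

  // Optimistically add a reader; back off if a writer holds the slot or the
  // entry is marked for deletion.
  bool TryRef() {
    if (!(bits.fetch_add(kOneRef) & (kExclusive | kWillBeDeleted))) {
      return true;
    }
    bits.fetch_sub(kOneRef);
    return false;
  }
  void Unref() { bits.fetch_sub(kOneRef); }

  // Writers only succeed when there are no readers; the deletion mark is
  // preserved across the exchange.
  bool TryExclusive() {
    uint32_t will_be_deleted = bits.load() & kWillBeDeleted;
    uint32_t expected = will_be_deleted;
    return bits.compare_exchange_strong(expected, kExclusive | will_be_deleted);
  }
  void ReleaseExclusive() { bits.fetch_and(~kExclusive); }
};
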
//
-// The main difference with FastLRUCache is, not surprisingly, the eviction
-// algorithm
-// ---instead of an LRU list, we maintain a circular list with the elements
-// available for eviction, which the clock algorithm traverses to pick the next
-// victim. The clock list is represented using the array of handles, and we
-// simply mark those elements that are present in the list. This is done using
-// different clock flags, namely NONE, LOW, MEDIUM, HIGH, that represent
-// priorities: NONE means that the element is not part of the clock list, and
-// LOW to HIGH represent how close an element is from being evictable (LOW being
-// immediately evictable). When the clock pointer steps on an element that is
-// not immediately evictable, it decreases its priority.
-
-constexpr double kLoadFactor = 0.35;  // See fast_lru_cache.h.
-
-constexpr double kStrictLoadFactor = 0.7;  // See fast_lru_cache.h.
+///////////////////////////////////////////////////////////////////////////////
+
+// The load factor p is a real number in (0, 1) such that at all
+// times at most a fraction p of all slots, without counting tombstones,
+// are occupied by elements. This means that the probability that a
+// random probe hits an empty slot is at least 1 - p, and thus at most
+// 1 / (1 - p) probes are required on average. For example, p = 35% implies
+// that between 1 and 2 probes are needed on average (bear in mind that this
+// reasoning doesn't consider the effects of clustering over time).
+// Because the size of the hash table is always rounded up to the next
+// power of 2, p is really an upper bound on the actual load factor---the
+// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
+// but bear in mind that slots only hold metadata, not actual values.
+// Since space cost is dominated by the values (the LSM blocks),
+// overprovisioning the table with metadata only increases the total cache
+// space usage by a tiny fraction.
+constexpr double kLoadFactor = 0.35;
+
+// The user can exceed kLoadFactor if the sizes of the inserted values don't
+// match estimated_value_size, or if strict_capacity_limit == false. To
+// keep performance from plunging, we set a strict upper bound on the load
+// factor.
+constexpr double kStrictLoadFactor = 0.7;
 // Arbitrary seeds.
 constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
 constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
-// An experimental (under development!) alternative to LRUCache
+// An experimental (under development!) alternative to LRUCache.
 struct ClockHandle {
  void* value;
  Cache::DeleterFn deleter;
  uint32_t hash;
-  size_t total_charge;  // TODO(opt): Only allow uint32_t?
-  // The number of external refs to this entry.
-  uint32_t refs;
+  size_t total_charge;
+  std::array<char, kCacheKeySize> key_data;
+
+  static constexpr uint8_t kExternalRefsOffset = 0;
+  static constexpr uint8_t kSharedRefsOffset = 15;
+  static constexpr uint8_t kExclusiveRefOffset = 30;
+  static constexpr uint8_t kWillBeDeletedOffset = 31;
+
+  enum Refs : uint32_t {
+    // Number of external references to the slot.
+    EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
+                        << kExternalRefsOffset,  // Bits 0, ..., 14
+    // Number of internal references plus external references to the slot.
+    SHARED_REFS = ((uint32_t{1} << 15) - 1)
+                      << kSharedRefsOffset,  // Bits 15, ..., 29
+    // Whether a thread has an exclusive reference to the slot.
+    EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset,  // Bit 30
+    // Whether the handle will be deleted soon.
When this bit is set, new + // internal + // or external references to this handle stop being accepted. + // There is an exception: external references can be created from + // existing external references, or converting from existing internal + // references. + WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31 + + // Shared references (i.e., external and internal references) and exclusive + // references are our custom implementation of RW locks---external and + // internal references are read locks, and exclusive references are write + // locks. We prioritize readers, which never block; in fact, they don't even + // use compare-and-swap operations. Using our own implementation of RW locks + // allows us to save many atomic operations by packing data more carefully. + // In particular: + // - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an + // internal + // reference into an external reference in a single atomic arithmetic + // operation. + // - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to take + // a shared reference and check whether the entry is marked for deletion + // in a single atomic arithmetic operation. + }; + + static constexpr uint32_t kOneInternalRef = 0x8000; + static constexpr uint32_t kOneExternalRef = 0x8001; + + std::atomic refs; - static constexpr int kIsVisibleOffset = 0; - static constexpr int kIsElementOffset = 1; - static constexpr int kClockPriorityOffset = 2; - static constexpr int kIsHitOffset = 4; - static constexpr int kCachePriorityOffset = 5; + static constexpr uint8_t kIsElementOffset = 1; + static constexpr uint8_t kClockPriorityOffset = 2; + static constexpr uint8_t kIsHitOffset = 4; + static constexpr uint8_t kCachePriorityOffset = 5; enum Flags : uint8_t { - // Whether the handle is visible to Lookups. - IS_VISIBLE = (1 << kIsVisibleOffset), // Whether the slot is in use by an element. - IS_ELEMENT = (1 << kIsElementOffset), - // Clock priorities. Represents how close a handle is from - // being evictable. - CLOCK_PRIORITY = (3 << kClockPriorityOffset), + IS_ELEMENT = 1 << kIsElementOffset, + // Clock priorities. Represents how close a handle is from being evictable. + CLOCK_PRIORITY = 3 << kClockPriorityOffset, // Whether the handle has been looked up after its insertion. - HAS_HIT = (1 << kIsHitOffset), - CACHE_PRIORITY = (1 << kCachePriorityOffset), + HAS_HIT = 1 << kIsHitOffset, + // The value of Cache::Priority for the handle. + CACHE_PRIORITY = 1 << kCachePriorityOffset, }; - uint8_t flags; + + std::atomic flags; enum ClockPriority : uint8_t { - NONE = (0 << kClockPriorityOffset), // Not an element in the eyes of clock. - LOW = (1 << kClockPriorityOffset), // Immediately evictable. + NONE = (0 << kClockPriorityOffset), + LOW = (1 << kClockPriorityOffset), MEDIUM = (2 << kClockPriorityOffset), HIGH = (3 << kClockPriorityOffset) - // Priority is NONE if and only if - // (i) the handle is not an element, or - // (ii) the handle is an element but it is being referenced. }; - // The number of elements that hash to this slot or a lower one, - // but wind up in a higher slot. - uint32_t displacements; - - std::array key_data; - - ClockHandle() { - value = nullptr; - deleter = nullptr; - hash = 0; - total_charge = 0; - refs = 0; - flags = 0; - SetIsVisible(false); + // The number of elements that hash to this slot or a lower one, but wind + // up in this slot or a higher one. 
+ std::atomic displacements; + + // Synchronization rules: + // - Use a shared reference when we want the handle's identity + // members (key_data, hash, value and IS_ELEMENT flag) to + // remain untouched, but not modify them. The only updates + // that a shared reference allows are: + // * set CLOCK_PRIORITY to NONE; + // * set the HAS_HIT bit. + // Notice that these two types of updates are idempotent, so + // they don't require synchronization across shared references. + // - Use an exclusive reference when we want identity members + // to remain untouched, as well as modify any identity member + // or flag. + // - displacements can be modified without holding a reference. + // - refs is only modified through appropriate functions to + // take or release references. + + ClockHandle() + : value(nullptr), + deleter(nullptr), + hash(0), + total_charge(0), + refs(0), + flags(0), + displacements(0) { + SetWillBeDeleted(false); SetIsElement(false); SetClockPriority(ClockPriority::NONE); SetCachePriority(Cache::Priority::LOW); - displacements = 0; key_data.fill(0); } - Slice key() const { return Slice(key_data.data(), kCacheKeySize); } - - // Increase the reference count by 1. - void Ref() { refs++; } - - // Just reduce the reference count by 1. Return true if it was last reference. - bool Unref() { - assert(refs > 0); - refs--; - return refs == 0; + ClockHandle(const ClockHandle& other) { *this = other; } + + void operator=(const ClockHandle& other) { + value = other.value; + deleter = other.deleter; + hash = other.hash; + total_charge = other.total_charge; + refs.store(other.refs); + key_data = other.key_data; + flags.store(other.flags); + SetWillBeDeleted(other.WillBeDeleted()); + SetIsElement(other.IsElement()); + SetClockPriority(other.GetClockPriority()); + SetCachePriority(other.GetCachePriority()); + displacements.store(other.displacements); } - // Return true if there are external refs, false otherwise. 
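
The user-defined copy constructor and operator= above are needed because the std::atomic members delete the implicitly generated ones, while the cache still copies handles by value (for example into last_reference_list, so that data can be freed outside the mutex). A minimal illustration of the same pattern:

#include <atomic>
#include <cstdint>

struct Counter {
  std::atomic<uint32_t> refs{0};
  Counter() = default;
  Counter(const Counter& other) { *this = other; }
  Counter& operator=(const Counter& other) {
    refs.store(other.refs.load());  // Copy the value, not the atomic object.
    return *this;
  }
};

// Without the user-defined copy operations, "Counter b = a;" would not compile.
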
- bool HasRefs() const { return refs > 0; } + Slice key() const { return Slice(key_data.data(), kCacheKeySize); } - bool IsVisible() const { return flags & IS_VISIBLE; } - - void SetIsVisible(bool is_visible) { - if (is_visible) { - flags |= IS_VISIBLE; - } else { - flags &= ~IS_VISIBLE; - } - } + bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; } bool IsElement() const { return flags & IS_ELEMENT; } @@ -144,7 +284,7 @@ struct ClockHandle { if (is_element) { flags |= IS_ELEMENT; } else { - flags &= ~IS_ELEMENT; + flags &= static_cast(~IS_ELEMENT); } } @@ -152,7 +292,7 @@ struct ClockHandle { void SetHit() { flags |= HAS_HIT; } - bool IsInClockList() const { + bool IsInClock() const { return GetClockPriority() != ClockHandle::ClockPriority::NONE; } @@ -164,7 +304,7 @@ struct ClockHandle { if (priority == Cache::Priority::HIGH) { flags |= Flags::CACHE_PRIORITY; } else { - flags &= ~Flags::CACHE_PRIORITY; + flags &= static_cast(~Flags::CACHE_PRIORITY); } } @@ -173,7 +313,7 @@ struct ClockHandle { } void SetClockPriority(ClockPriority priority) { - flags &= ~Flags::CLOCK_PRIORITY; + flags &= static_cast(~Flags::CLOCK_PRIORITY); flags |= priority; } @@ -182,14 +322,13 @@ struct ClockHandle { kClockPriorityOffset; assert(p > 0); p--; - flags &= ~Flags::CLOCK_PRIORITY; + flags &= static_cast(~Flags::CLOCK_PRIORITY); ClockPriority new_priority = static_cast(p << kClockPriorityOffset); flags |= new_priority; } void FreeData() { - assert(refs == 0); if (deleter) { (*deleter)(key(), value); } @@ -232,17 +371,131 @@ struct ClockHandle { return total_charge - meta_charge; } - inline bool IsEmpty() { + inline bool IsEmpty() const { return !this->IsElement() && this->displacements == 0; } - inline bool IsTombstone() { + inline bool IsTombstone() const { return !this->IsElement() && this->displacements > 0; } - inline bool Matches(const Slice& some_key) { - return this->IsElement() && this->key() == some_key; + inline bool Matches(const Slice& some_key, uint32_t some_hash) const { + return this->IsElement() && this->hash == some_hash && + this->key() == some_key; + } + + bool WillBeDeleted() const { return refs & WILL_BE_DELETED; } + + void SetWillBeDeleted(bool will_be_deleted) { + if (will_be_deleted) { + refs |= WILL_BE_DELETED; + } else { + refs &= ~WILL_BE_DELETED; + } + } + + // The following functions are for taking and releasing refs. + + // Tries to take an external ref. Returns true iff it succeeds. + inline bool TryExternalRef() { + if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) { + return true; + } + refs -= kOneExternalRef; + return false; + } + + // Releases an external ref. Returns the new value (this is useful to + // avoid an extra atomic read). + inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; } + + // Take an external ref, assuming there is already one external ref + // to the handle. + void Ref() { + // TODO(Guido) Is it okay to assume that the existing external reference + // survives until this function returns? + refs += kOneExternalRef; + } + + // Tries to take an internal ref. Returns true iff it succeeds. + inline bool TryInternalRef() { + if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) { + return true; + } + refs -= kOneInternalRef; + return false; + } + + inline void ReleaseInternalRef() { refs -= kOneInternalRef; } + + // Tries to take an exclusive ref. Returns true iff it succeeds. 
+ inline bool TryExclusiveRef() { + uint32_t will_be_deleted = refs & WILL_BE_DELETED; + uint32_t expected = will_be_deleted; + return refs.compare_exchange_strong(expected, + EXCLUSIVE_REF | will_be_deleted); } + + // Repeatedly tries to take an exclusive reference, but stops as soon + // as an external reference is detected (in this case the wait would + // presumably be too long). + inline bool TrySpinExclusiveRef() { + uint32_t expected = 0; + uint32_t will_be_deleted = 0; + while (!refs.compare_exchange_strong(expected, + EXCLUSIVE_REF | will_be_deleted)) { + if (expected & EXTERNAL_REFS) { + return false; + } + will_be_deleted = expected & WILL_BE_DELETED; + expected = will_be_deleted; + } + return true; + } + + inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); } + + // The following functions are for upgrading and downgrading refs. + // They guarantee atomicity, i.e., no exclusive refs to the handle + // can be taken by a different thread during the conversion. + + inline void ExclusiveToInternalRef() { + refs += kOneInternalRef; + ReleaseExclusiveRef(); + } + + inline void ExclusiveToExternalRef() { + refs += kOneExternalRef; + ReleaseExclusiveRef(); + } + + // TODO(Guido) Do we want to bound the loop and prepare the + // algorithms to react to a failure? + inline void InternalToExclusiveRef() { + uint32_t expected = kOneInternalRef; + uint32_t will_be_deleted = 0; + while (!refs.compare_exchange_strong(expected, + EXCLUSIVE_REF | will_be_deleted)) { + will_be_deleted = expected & WILL_BE_DELETED; + expected = kOneInternalRef | will_be_deleted; + } + } + + inline void InternalToExternalRef() { + refs += kOneExternalRef - kOneInternalRef; + } + + // TODO(Guido) Same concern. + inline void ExternalToExclusiveRef() { + uint32_t expected = kOneExternalRef; + uint32_t will_be_deleted = 0; + while (!refs.compare_exchange_strong(expected, + EXCLUSIVE_REF | will_be_deleted)) { + will_be_deleted = expected & WILL_BE_DELETED; + expected = kOneExternalRef | will_be_deleted; + } + } + }; // struct ClockHandle class ClockHandleTable { @@ -252,31 +505,54 @@ class ClockHandleTable { // Returns a pointer to a visible element matching the key/hash, or // nullptr if not present. - ClockHandle* Lookup(const Slice& key); + ClockHandle* Lookup(const Slice& key, uint32_t hash); // Inserts a copy of h into the hash table. // Returns a pointer to the inserted handle, or nullptr if no slot // available was found. If an existing visible element matching the // key/hash is already present in the hash table, the argument old - // is set to pointe to it; otherwise, it's set to nullptr. + // is set to point to it; otherwise, it's set to nullptr. + // Returns an exclusive reference to h, and no references to old. ClockHandle* Insert(ClockHandle* h, ClockHandle** old); - // Removes h from the hash table. The handle must already be off - // the clock list. + // Removes h from the hash table. The handle must already be off clock. void Remove(ClockHandle* h); - // Turns a visible element h into a ghost (i.e., not visible). - void Exclude(ClockHandle* h); + // Extracts the element information from a handle (src), and assigns it + // to a hash table slot (dst). Doesn't touch displacements and refs, + // which are maintained by the hash table algorithm. + void Assign(ClockHandle* dst, ClockHandle* src); - // Assigns a copy of h to the given slot. 
- void Assign(int slot, ClockHandle* h); + template + void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end, + bool apply_if_will_be_deleted) { + for (uint32_t i = index_begin; i < index_end; i++) { + ClockHandle* h = &array_[i]; + if (h->TryExclusiveRef()) { + if (h->IsElement() && + (apply_if_will_be_deleted || !h->WillBeDeleted())) { + // Hand the internal ref over to func, which is now responsible + // to release it. + func(h); + } else { + h->ReleaseExclusiveRef(); + } + } + } + } template - void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { + void ConstApplyToEntriesRange(T func, uint32_t index_begin, + uint32_t index_end, + bool apply_if_will_be_deleted) const { for (uint32_t i = index_begin; i < index_end; i++) { ClockHandle* h = &array_[i]; - if (h->IsVisible()) { - func(h); + if (h->TryExclusiveRef()) { + if (h->IsElement() && + (apply_if_will_be_deleted || !h->WillBeDeleted())) { + func(h); + } + h->ReleaseExclusiveRef(); } } } @@ -295,28 +571,38 @@ class ClockHandleTable { private: friend class ClockCacheShard; - int FindVisibleElement(const Slice& key, int& probe, int displacement); + int FindElement(const Slice& key, uint32_t hash, uint32_t& probe); - int FindAvailableSlot(const Slice& key, int& probe, int displacement); + int FindAvailableSlot(const Slice& key, uint32_t& probe); - int FindVisibleElementOrAvailableSlot(const Slice& key, int& probe, - int displacement); + int FindElementOrAvailableSlot(const Slice& key, uint32_t hash, + uint32_t& probe); // Returns the index of the first slot probed (hashing with - // the given key) with a handle e such that cond(e) is true. - // Otherwise, if no match is found, returns -1. - // For every handle e probed except the final slot, updates - // e->displacements += displacement. - // The argument probe is modified such that consecutive calls - // to FindSlot continue probing right after where the previous - // call left. - int FindSlot(const Slice& key, std::function cond, - int& probe, int displacement); + // the given key) with a handle e such that match(e) is true. + // At every step, the function first tests whether match(e) holds. + // If it's false, it evaluates abort(e) to decide whether the + // search should be aborted, and in the affirmative returns -1. + // For every handle e probed except the last one, the function runs + // update(e). We say a probe to a handle e is aborting if match(e) is + // false and abort(e) is true. The argument probe is one more than the + // last non-aborting probe during the call. This is so that that the + // variable can be used to keep track of progress across consecutive + // calls to FindSlot. + inline int FindSlot(const Slice& key, std::function match, + std::function stop, + std::function update, + uint32_t& probe); + + // After a failed FindSlot call (i.e., with answer -1), this function + // decrements all displacements, starting from the 0-th probe. + void Rollback(const Slice& key, uint32_t probe); // Number of hash bits used for table index. // The size of the table is 1 << length_bits_. int length_bits_; + // For faster computation of ModTableSize. const uint32_t length_bits_mask_; // Number of elements in the table. @@ -345,10 +631,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { void SetStrictCapacityLimit(bool strict_capacity_limit) override; // Like Cache methods, but with an extra "hash" parameter. - // Insert an item into the hash table and, if handle is null, insert into - // the clock list. 
Older items are evicted as necessary. If the cache is full - // and free_handle_on_fail is true, the item is deleted and handle is set to - // nullptr. + // Insert an item into the hash table and, if handle is null, make it + // evictable by the clock algorithm. Older items are evicted as necessary. + // If the cache is full and free_handle_on_fail is true, the item is deleted + // and handle is set to nullptr. Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, Cache::Priority priority) override; @@ -393,13 +679,18 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { private: friend class ClockCache; - void ClockRemove(ClockHandle* e); - void ClockInsert(ClockHandle* e); + + // Makes an element evictable by clock. + void ClockOn(ClockHandle* h); + + // Makes an element non-evictable. + void ClockOff(ClockHandle* h); + + // Requires an exclusive ref on h. + void Evict(ClockHandle* h); // Free some space following strict clock policy until enough space - // to hold (usage_ + charge) is freed or the clock list is empty - // This function is not thread safe - it needs to be executed while - // holding the mutex_. + // to hold (usage_ + charge) is freed or there are no evictable elements. void EvictFromClock(size_t charge, autovector* deleted); // Returns the charge of a single handle. @@ -436,9 +727,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { // Memory size for entries residing in the cache. size_t usage_; - // Memory size for unpinned entries in the clock list. - size_t clock_usage_; - // mutex_ protects the following state. // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. diff --git a/cache/fast_lru_cache.cc b/cache/fast_lru_cache.cc index 0152b6fbe..a425204b9 100644 --- a/cache/fast_lru_cache.cc +++ b/cache/fast_lru_cache.cc @@ -52,6 +52,7 @@ LRUHandle* LRUHandleTable::Insert(LRUHandle* h, LRUHandle** old) { 1 /*displacement*/); *old = nullptr; if (slot == -1) { + // TODO(Guido) Don't we need to roll back displacements here? return nullptr; }
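
On the TODO above: it does look like the displacements incremented along the probe path would be leaked when Insert gives up, so an undo pass similar to ClockHandleTable::Rollback seems needed. A self-contained sketch of that idea over a bare array of displacement counters (base and increment stand in for the two hashes of the key):

#include <cstdint>
#include <vector>

void RollbackDisplacements(std::vector<uint32_t>& displacements, uint32_t base,
                           uint32_t increment, uint32_t probes_taken) {
  const uint32_t size = static_cast<uint32_t>(displacements.size());
  uint32_t current = base % size;
  for (uint32_t i = 0; i < probes_taken; i++) {
    displacements[current]--;  // Undo the increment made while probing.
    current = (current + increment) % size;
  }
}
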