diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index ad2c5f4af..aa8e10cd6 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -17,7 +17,6 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/lang.h" -#include "util/distributed_mutex.h" #include "util/hash.h" #include "util/math.h" #include "util/random.h" @@ -26,86 +25,91 @@ namespace ROCKSDB_NAMESPACE { namespace clock_cache { -ClockHandleTable::ClockHandleTable(int hash_bits) +ClockHandleTable::ClockHandleTable(size_t capacity, int hash_bits) : length_bits_(hash_bits), length_bits_mask_((uint32_t{1} << length_bits_) - 1), - occupancy_(0), occupancy_limit_(static_cast((uint32_t{1} << length_bits_) * kStrictLoadFactor)), - array_(new ClockHandle[size_t{1} << length_bits_]) { + capacity_(capacity), + array_(new ClockHandle[size_t{1} << length_bits_]), + clock_pointer_(0), + occupancy_(0), + usage_(0) { assert(hash_bits <= 32); } ClockHandleTable::~ClockHandleTable() { - ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize(), - true); + // Assumes there are no references (of any type) to any slot in the table. + for (uint32_t i = 0; i < GetTableSize(); i++) { + ClockHandle* h = &array_[i]; + if (h->IsElement()) { + h->FreeData(); + } + } } ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) { uint32_t probe = 0; - int slot = FindElement(key, hash, probe); - return (slot == -1) ? nullptr : &array_[slot]; + ClockHandle* e = FindSlot( + key, + [&](ClockHandle* h) { + if (h->TryInternalRef()) { + if (h->IsElement() && h->Matches(key, hash)) { + return true; + } + h->ReleaseInternalRef(); + } + return false; + }, + [&](ClockHandle* h) { return h->displacements == 0; }, + [&](ClockHandle* /*h*/) {}, probe); + + if (e != nullptr) { + // TODO(Guido) Comment from #10347: Here it looks like we have three atomic + // updates where it would be possible to combine into one CAS (more metadata + // under one atomic field) or maybe two atomic updates (one arithmetic, one + // bitwise). Something to think about optimizing. + e->InternalToExternalRef(); + e->SetHit(); + // The handle is now referenced, so we take it out of clock. + ClockOff(e); + } + + return e; } -ClockHandle* ClockHandleTable::Insert(ClockHandle* h, ClockHandle** old) { +ClockHandle* ClockHandleTable::Insert(ClockHandle* h, + autovector* deleted, + bool take_reference) { uint32_t probe = 0; - int slot = FindElementOrAvailableSlot(h->key(), h->hash, probe); - *old = nullptr; - if (slot == -1) { - // The key is not already present, and there's no available slot to place - // the new copy. + ClockHandle* e = FindAvailableSlot(h->key(), h->hash, probe, deleted); + if (e == nullptr) { + // No available slot to place the handle. return nullptr; } - if (!array_[slot].IsElement()) { - // The slot is empty or is a tombstone. - ClockHandle* new_entry = &array_[slot]; - new_entry->InternalToExclusiveRef(); - Assign(new_entry, h); - if (new_entry->displacements == 0) { - // The slot was empty. - return new_entry; - } - // It used to be a tombstone, so there may already be a copy of the + // The slot is empty or is a tombstone. And we have an exclusive ref. + Assign(e, h); + // TODO(Guido) The following RemoveAll can probably be run outside of + // the exclusive ref. I had a bad case in mind: multiple inserts could + // annihilate each. Although I think this is impossible, I'm not sure + // my mental proof covers every case. 
+ if (e->displacements != 0) { + // It used to be a tombstone, so there may already be copies of the // key in the table. - slot = FindElement(h->key(), h->hash, probe); - if (slot == -1) { - // Nope, no existing copy of the key. - return new_entry; - } - ClockHandle* old_entry = &array_[slot]; - old_entry->ReleaseInternalRef(); - *old = old_entry; - return new_entry; - } else { - // There is an existing copy of the key. - ClockHandle* old_entry = &array_[slot]; - old_entry->ReleaseInternalRef(); - *old = old_entry; - // Find an available slot for the new element. - old_entry->displacements++; - slot = FindAvailableSlot(h->key(), probe); - if (slot == -1) { - // No available slots. - return nullptr; - } - ClockHandle* new_entry = &array_[slot]; - new_entry->InternalToExclusiveRef(); - Assign(new_entry, h); - return new_entry; + RemoveAll(h->key(), h->hash, probe, deleted); } -} -void ClockHandleTable::Remove(ClockHandle* h) { - assert(!h->IsInClock()); // Already off clock. - uint32_t probe = 0; - FindSlot( - h->key(), [&](ClockHandle* e) { return e == h; }, - [&](ClockHandle* /*e*/) { return false; }, - [&](ClockHandle* e) { e->displacements--; }, probe); - h->SetWillBeDeleted(false); - h->SetIsElement(false); - occupancy_--; + if (take_reference) { + // The user wants to take a reference. + e->ExclusiveToExternalRef(); + } else { + // The user doesn't want to immediately take a reference, so we make + // it evictable. + ClockOn(e); + e->ReleaseExclusiveRef(); + } + return e; } void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) { @@ -117,19 +121,75 @@ void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) { dst->key_data = src->key_data; dst->flags.store(0); dst->SetIsElement(true); - dst->SetClockPriority(ClockHandle::ClockPriority::NONE); dst->SetCachePriority(src->GetCachePriority()); + usage_ += dst->total_charge; occupancy_++; } -int ClockHandleTable::FindElement(const Slice& key, uint32_t hash, - uint32_t& probe) { - return FindSlot( +bool ClockHandleTable::TryRemove(ClockHandle* h, + autovector* deleted) { + if (h->TryExclusiveRef()) { + if (h->WillBeDeleted()) { + Remove(h, deleted); + return true; + } + h->ReleaseExclusiveRef(); + } + return false; +} + +bool ClockHandleTable::SpinTryRemove(ClockHandle* h, + autovector* deleted) { + if (h->SpinTryExclusiveRef()) { + if (h->WillBeDeleted()) { + Remove(h, deleted); + return true; + } + h->ReleaseExclusiveRef(); + } + return false; +} + +void ClockHandleTable::ClockOff(ClockHandle* h) { + h->SetClockPriority(ClockHandle::ClockPriority::NONE); +} + +void ClockHandleTable::ClockOn(ClockHandle* h) { + assert(!h->IsInClock()); + bool is_high_priority = + h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH; + h->SetClockPriority(static_cast( + is_high_priority ? 
ClockHandle::ClockPriority::HIGH + : ClockHandle::ClockPriority::MEDIUM)); +} + +void ClockHandleTable::Remove(ClockHandle* h, + autovector* deleted) { + deleted->push_back(*h); + ClockOff(h); + uint32_t probe = 0; + FindSlot( + h->key(), [&](ClockHandle* e) { return e == h; }, + [&](ClockHandle* /*e*/) { return false; }, + [&](ClockHandle* e) { e->displacements--; }, probe); + h->SetWillBeDeleted(false); + h->SetIsElement(false); +} + +void ClockHandleTable::RemoveAll(const Slice& key, uint32_t hash, + uint32_t& probe, + autovector* deleted) { + FindSlot( key, [&](ClockHandle* h) { if (h->TryInternalRef()) { - if (h->Matches(key, hash)) { - return true; + if (h->IsElement() && h->Matches(key, hash)) { + h->SetWillBeDeleted(true); + h->ReleaseInternalRef(); + if (TryRemove(h, deleted)) { + h->ReleaseExclusiveRef(); + } + return false; } h->ReleaseInternalRef(); } @@ -139,53 +199,74 @@ int ClockHandleTable::FindElement(const Slice& key, uint32_t hash, [&](ClockHandle* /*h*/) {}, probe); } -int ClockHandleTable::FindAvailableSlot(const Slice& key, uint32_t& probe) { - int slot = FindSlot( - key, - [&](ClockHandle* h) { - if (h->TryInternalRef()) { - if (!h->IsElement()) { - return true; - } - h->ReleaseInternalRef(); - } - return false; - }, - [&](ClockHandle* /*h*/) { return false; }, - [&](ClockHandle* h) { h->displacements++; }, probe); - if (slot == -1) { - Rollback(key, probe); +void ClockHandleTable::Free(autovector* deleted) { + if (deleted->size() == 0) { + // Avoid unnecessarily reading usage_ and occupancy_. + return; + } + + size_t deleted_charge = 0; + for (auto& h : *deleted) { + deleted_charge += h.total_charge; + h.FreeData(); } - return slot; + assert(usage_ >= deleted_charge); + usage_ -= deleted_charge; + occupancy_ -= static_cast(deleted->size()); } -int ClockHandleTable::FindElementOrAvailableSlot(const Slice& key, - uint32_t hash, - uint32_t& probe) { - int slot = FindSlot( +ClockHandle* ClockHandleTable::FindAvailableSlot( + const Slice& key, uint32_t hash, uint32_t& probe, + autovector* deleted) { + ClockHandle* e = FindSlot( key, [&](ClockHandle* h) { + // To read the handle, first acquire a shared ref. if (h->TryInternalRef()) { - if (!h->IsElement() || h->Matches(key, hash)) { - return true; + if (h->IsElement()) { + // The slot is not available. + // TODO(Guido) Is it worth testing h->WillBeDeleted()? + if (h->WillBeDeleted() || h->Matches(key, hash)) { + // The slot can be freed up, or the key we're inserting is already + // in the table, so we try to delete it. When the attempt is + // successful, the slot becomes available, so we stop probing. + // Notice that in that case TryRemove returns an exclusive ref. + h->SetWillBeDeleted(true); + h->ReleaseInternalRef(); + if (TryRemove(h, deleted)) { + return true; + } + return false; + } + h->ReleaseInternalRef(); + return false; } + + // Available slot. h->ReleaseInternalRef(); + // Try to acquire an exclusive ref. If we fail, continue probing. + if (h->SpinTryExclusiveRef()) { + // Check that the slot is still available. 
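+ // Another insert may have claimed this slot between our ReleaseInternalRef()
+ // above and the exclusive ref we just acquired, so IS_ELEMENT has to be
+ // re-read while holding the exclusive ref.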
+ if (!h->IsElement()) { + return true; + } + h->ReleaseExclusiveRef(); + } } return false; }, [&](ClockHandle* /*h*/) { return false; }, [&](ClockHandle* h) { h->displacements++; }, probe); - if (slot == -1) { + if (e == nullptr) { Rollback(key, probe); } - return slot; + return e; } -int ClockHandleTable::FindSlot(const Slice& key, - std::function match, - std::function abort, - std::function update, - uint32_t& probe) { +ClockHandle* ClockHandleTable::FindSlot( + const Slice& key, std::function match, + std::function abort, + std::function update, uint32_t& probe) { // We use double-hashing probing. Every probe in the sequence is a // pseudorandom integer, computed as a linear function of two random hashes, // which we call base and increment. Specifically, the i-th probe is base + i @@ -201,14 +282,14 @@ int ClockHandleTable::FindSlot(const Slice& key, ClockHandle* h = &array_[current]; if (current == base && probe > 0) { // We looped back. - return -1; + return nullptr; } if (match(h)) { probe++; - return current; + return h; } if (abort(h)) { - return -1; + return nullptr; } probe++; update(h); @@ -226,35 +307,73 @@ void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) { } } +void ClockHandleTable::ClockRun(size_t charge) { + // TODO(Guido) When an element is in the probe sequence of a + // hot element, it will be hard to get an exclusive ref. + // Do we need a mechanism to prevent an element from sitting + // for a long time in cache waiting to be evicted? + assert(charge <= capacity_); + autovector deleted; + uint32_t max_iterations = + 1 + static_cast(GetTableSize() * kLoadFactor); + size_t usage_local = usage_; + while (usage_local + charge > capacity_ && max_iterations--) { + uint32_t steps = 1 + static_cast(1 / kLoadFactor); + uint32_t clock_pointer_local = (clock_pointer_ += steps) - steps; + for (uint32_t i = 0; i < steps; i++) { + ClockHandle* h = &array_[ModTableSize(clock_pointer_local + i)]; + + if (h->TryExclusiveRef()) { + if (h->WillBeDeleted()) { + Remove(h, &deleted); + usage_local -= h->total_charge; + } else { + if (!h->IsInClock() && h->IsElement()) { + // We adjust the clock priority to make the element evictable again. + // Why? Elements that are not in clock are either currently + // externally referenced or used to be. Because we are holding an + // exclusive ref, we know we are in the latter case. This can only + // happen when the last external reference to an element was + // released, and the element was not immediately removed. 
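+ // ClockOn() below puts it back in clock with HIGH or MEDIUM priority
+ // (depending on HAS_HIT and the cache priority), so the element survives
+ // a few more sweeps before it can reach LOW and be evicted.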
+ + ClockOn(h); + } + ClockHandle::ClockPriority priority = h->GetClockPriority(); + if (priority == ClockHandle::ClockPriority::LOW) { + Remove(h, &deleted); + usage_local -= h->total_charge; + } else if (priority > ClockHandle::ClockPriority::LOW) { + h->DecreaseClockPriority(); + } + } + h->ReleaseExclusiveRef(); + } + } + } + + Free(&deleted); +} + ClockCacheShard::ClockCacheShard( size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : capacity_(capacity), - strict_capacity_limit_(strict_capacity_limit), - clock_pointer_(0), - table_( - CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)), - usage_(0) { + : strict_capacity_limit_(strict_capacity_limit), + table_(capacity, CalcHashBits(capacity, estimated_value_size, + metadata_charge_policy)) { set_metadata_charge_policy(metadata_charge_policy); } void ClockCacheShard::EraseUnRefEntries() { - autovector last_reference_list; - { - DMutexLock l(mutex_); - table_.ApplyToEntriesRange( - [this, &last_reference_list](ClockHandle* h) { - // Externally unreferenced element. - last_reference_list.push_back(*h); - Evict(h); - }, - 0, table_.GetTableSize(), true); - } + autovector deleted; - // Free the entry outside of the mutex for performance reasons. - for (auto& h : last_reference_list) { - h.FreeData(); - } + table_.ApplyToEntriesRange( + [this, &deleted](ClockHandle* h) { + // Externally unreferenced element. + table_.Remove(h, &deleted); + }, + 0, table_.GetTableSize(), true); + + table_.Free(&deleted); } void ClockCacheShard::ApplyToSomeEntries( @@ -264,7 +383,6 @@ void ClockCacheShard::ApplyToSomeEntries( // The state is essentially going to be the starting hash, which works // nicely even if we resize between calls because we use upper-most // hash bits for table indexes. - DMutexLock l(mutex_); uint32_t length_bits = table_.GetLengthBits(); uint32_t length = table_.GetTableSize(); @@ -276,7 +394,7 @@ void ClockCacheShard::ApplyToSomeEntries( uint32_t index_begin = *state >> (32 - length_bits); uint32_t index_end = index_begin + average_entries_per_lock; if (index_end >= length) { - // Going to end + // Going to end. index_end = length; *state = UINT32_MAX; } else { @@ -292,60 +410,6 @@ void ClockCacheShard::ApplyToSomeEntries( index_begin, index_end, false); } -void ClockCacheShard::ClockOff(ClockHandle* h) { - h->SetClockPriority(ClockHandle::ClockPriority::NONE); -} - -void ClockCacheShard::ClockOn(ClockHandle* h) { - assert(!h->IsInClock()); - bool is_high_priority = - h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH; - h->SetClockPriority(static_cast( - is_high_priority * ClockHandle::ClockPriority::HIGH + - (1 - is_high_priority) * ClockHandle::ClockPriority::MEDIUM)); -} - -void ClockCacheShard::Evict(ClockHandle* h) { - ClockOff(h); - table_.Remove(h); - assert(usage_ >= h->total_charge); - usage_ -= h->total_charge; -} - -void ClockCacheShard::EvictFromClock(size_t charge, - autovector* deleted) { - // TODO(Guido) When an element is in the probe sequence of a - // hot element, it will be hard to get an exclusive ref. - // We may need a mechanism to avoid that an element sits forever - // in cache waiting to be evicted. 
- assert(charge <= capacity_); - uint32_t max_iterations = table_.GetTableSize(); - while (usage_ + charge > capacity_ && max_iterations--) { - ClockHandle* h = &table_.array_[clock_pointer_]; - clock_pointer_ = table_.ModTableSize(clock_pointer_ + 1); - - if (h->TryExclusiveRef()) { - if (!h->IsInClock() && h->IsElement()) { - // We adjust the clock priority to make the element evictable again. - // Why? Elements that are not in clock are either currently - // externally referenced or used to be---because we are holding an - // exclusive ref, we know we are in the latter case. This can only - // happen when the last external reference to an element was released, - // and the element was not immediately removed. - ClockOn(h); - } - - if (h->GetClockPriority() == ClockHandle::ClockPriority::LOW) { - deleted->push_back(*h); - Evict(h); - } else if (h->GetClockPriority() > ClockHandle::ClockPriority::LOW) { - h->DecreaseClockPriority(); - } - h->ReleaseExclusiveRef(); - } - } -} - size_t ClockCacheShard::CalcEstimatedHandleCharge( size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { @@ -366,25 +430,12 @@ int ClockCacheShard::CalcHashBits( return FloorLog2((num_entries << 1) - 1); } -void ClockCacheShard::SetCapacity(size_t capacity) { - assert(false); // Not supported. TODO(Guido) Support it? - autovector last_reference_list; - { - DMutexLock l(mutex_); - capacity_ = capacity; - EvictFromClock(0, &last_reference_list); - } - - // Free the entry outside of the mutex for performance reasons. - for (auto& h : last_reference_list) { - h.FreeData(); - } +void ClockCacheShard::SetCapacity(size_t /*capacity*/) { + assert(false); // Not supported. } -void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { - assert(false); // Not supported. TODO(Guido) Support it? - DMutexLock l(mutex_); - strict_capacity_limit_ = strict_capacity_limit; +void ClockCacheShard::SetStrictCapacityLimit(bool /*strict_capacity_limit*/) { + assert(false); // Not supported. } Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, @@ -407,87 +458,60 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, } Status s = Status::OK(); - autovector last_reference_list; - { - DMutexLock l(mutex_); - - assert(table_.GetOccupancy() <= table_.GetOccupancyLimit()); - // Free the space following strict clock policy until enough space - // is freed or there are no evictable elements. - EvictFromClock(tmp.total_charge, &last_reference_list); - if ((usage_ + tmp.total_charge > capacity_ && - (strict_capacity_limit_ || handle == nullptr)) || - table_.GetOccupancy() == table_.GetOccupancyLimit()) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - last_reference_list.push_back(tmp); - } else { - if (table_.GetOccupancy() == table_.GetOccupancyLimit()) { - // TODO: Consider using a distinct status for this case, but usually - // it will be handled the same way as reaching charge capacity limit - s = Status::MemoryLimit( - "Insert failed because all slots in the hash table are full."); - } else { - s = Status::MemoryLimit( - "Insert failed because the total charge has exceeded the " - "capacity."); - } - } + + // Free space with the clock policy until enough space is freed or there are + // no evictable elements. 
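+ // ClockRun is lock-free: it only takes per-handle exclusive refs, and its
+ // sweep is bounded by max_iterations, so this call returns even when
+ // nothing is evictable.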
+ table_.ClockRun(tmp.total_charge); + + // occupancy_ and usage_ are contended members across concurrent updates + // on the same shard, so we use a single copy to reduce cache synchronization. + uint32_t occupancy_local = table_.GetOccupancy(); + size_t usage_local = table_.GetUsage(); + assert(occupancy_local <= table_.GetOccupancyLimit()); + + autovector deleted; + + if ((usage_local + tmp.total_charge > table_.GetCapacity() && + (strict_capacity_limit_ || handle == nullptr)) || + occupancy_local > table_.GetOccupancyLimit()) { + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + deleted.push_back(tmp); } else { - // Insert into the cache. Note that the cache might get larger than its - // capacity if not enough space was freed up. - ClockHandle* old; - ClockHandle* h = table_.Insert(&tmp, &old); - assert(h != nullptr); // We're below occupancy, so this insertion should - // never fail. - usage_ += h->total_charge; - if (old != nullptr) { - s = Status::OkOverwritten(); - assert(!old->WillBeDeleted()); - old->SetWillBeDeleted(true); - // Try to evict the old copy of the element. - if (old->TryExclusiveRef()) { - last_reference_list.push_back(*old); - Evict(old); - old->ReleaseExclusiveRef(); - } - } - if (handle == nullptr) { - // If the user didn't provide a handle, no reference is taken, - // so we make the element evictable. - ClockOn(h); - h->ReleaseExclusiveRef(); + if (occupancy_local > table_.GetOccupancyLimit()) { + // TODO: Consider using a distinct status for this case, but usually + // it will be handled the same way as reaching charge capacity limit + s = Status::MemoryLimit( + "Insert failed because all slots in the hash table are full."); } else { - // The caller already holds a ref. - h->ExclusiveToExternalRef(); - *handle = reinterpret_cast(h); + s = Status::MemoryLimit( + "Insert failed because the total charge has exceeded the " + "capacity."); } } - } + } else { + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. + ClockHandle* h = table_.Insert(&tmp, &deleted, handle != nullptr); + assert(h != nullptr); // The occupancy is way below the table size, so this + // insertion should never fail. + if (handle != nullptr) { + *handle = reinterpret_cast(h); + } - // Free the entry outside of the mutex for performance reasons. - for (auto& h : last_reference_list) { - h.FreeData(); + if (deleted.size() > 0) { + s = Status::OkOverwritten(); + } } + table_.Free(&deleted); + return s; } Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { - ClockHandle* h = nullptr; - h = table_.Lookup(key, hash); - if (h != nullptr) { - // TODO(Guido) Comment from #10347: Here it looks like we have three atomic - // updates where it would be possible to combine into one CAS (more metadata - // under one atomic field) or maybe two atomic updates (one arithmetic, one - // bitwise). Something to think about optimizing. - h->InternalToExternalRef(); - h->SetHit(); - // The handle is now referenced, so we take it out of clock. 
- ClockOff(h); - } - return reinterpret_cast(h); + return reinterpret_cast(table_.Lookup(key, hash)); } bool ClockCacheShard::Ref(Cache::Handle* h) { @@ -498,97 +522,50 @@ bool ClockCacheShard::Ref(Cache::Handle* h) { bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { // In contrast with LRUCache's Release, this function won't delete the handle - // when the reference is the last one and the cache is above capacity. Space + // when the cache is above capacity and the reference is the last one. Space // is only freed up by EvictFromClock (called by Insert when space is needed) - // and Erase. + // and Erase. We do this to avoid an extra atomic read of the variable usage_. if (handle == nullptr) { return false; } ClockHandle* h = reinterpret_cast(handle); - uint32_t hash = h->hash; - uint32_t refs = h->ReleaseExternalRef(); - bool last_reference = !(refs & ClockHandle::EXTERNAL_REFS); + uint32_t refs = h->refs; + bool last_reference = ((refs & ClockHandle::EXTERNAL_REFS) == 1); bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED; if (last_reference && (will_be_deleted || erase_if_last_ref)) { - // At this point we want to evict the element, so we need to take - // a lock and an exclusive reference. But there's a problem: - // as soon as we released the last reference, an Insert or Erase could've - // replaced this element, and by the time we take the lock and ref - // we could potentially be referencing a different element. - // Thus, before evicting the (potentially different) element, we need to - // re-check that it's unreferenced and marked as WILL_BE_DELETED, so the - // eviction is safe. Additionally, we check that the hash doesn't change, - // which will detect, most of the time, whether the element is a different - // one. The bottomline is that we only guarantee that the input handle will - // be deleted, and occasionally also another handle, but in any case all - // deleted handles are safe to delete. - // TODO(Guido) With lock-free inserts and deletes we may be able to - // "atomically" transition to an exclusive ref, without creating a deadlock. - ClockHandle copy; - { - DMutexLock l(mutex_); - if (h->TrySpinExclusiveRef()) { - will_be_deleted = h->refs & ClockHandle::WILL_BE_DELETED; - // Check that it's still safe to delete. - if (h->IsElement() && (will_be_deleted || erase_if_last_ref) && - h->hash == hash) { - copy = *h; - Evict(h); - } - h->ReleaseExclusiveRef(); - } else { - // An external ref was detected. - return false; - } + autovector deleted; + h->SetWillBeDeleted(true); + h->ReleaseExternalRef(); + if (table_.SpinTryRemove(h, &deleted)) { + h->ReleaseExclusiveRef(); + table_.Free(&deleted); + return true; } - - // Free the entry outside of the mutex for performance reasons. - copy.FreeData(); - return true; + } else { + h->ReleaseExternalRef(); } return false; } void ClockCacheShard::Erase(const Slice& key, uint32_t hash) { - ClockHandle copy; - bool last_reference = false; - { - DMutexLock l(mutex_); - ClockHandle* h = table_.Lookup(key, hash); - if (h != nullptr) { - h->SetWillBeDeleted(true); - h->ReleaseInternalRef(); - if (h->TryExclusiveRef()) { - copy = *h; - Evict(h); - last_reference = true; - h->ReleaseExclusiveRef(); - } - } - } - // Free the entry outside of the mutex for performance reasons. 
- if (last_reference) { - copy.FreeData(); - } + autovector deleted; + uint32_t probe = 0; + table_.RemoveAll(key, hash, probe, &deleted); + table_.Free(&deleted); } -size_t ClockCacheShard::GetUsage() const { - DMutexLock l(mutex_); - return usage_; -} +size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); } size_t ClockCacheShard::GetPinnedUsage() const { - // Computes the pinned usage scanning the whole hash table. This - // is slow, but avoid keeping an exact counter on the clock usage, + // Computes the pinned usage by scanning the whole hash table. This + // is slow, but avoids keeping an exact counter on the clock usage, // i.e., the number of not externally referenced elements. - // Why avoid this? Because Lookup removes elements from the clock + // Why avoid this counter? Because Lookup removes elements from the clock // list, so it would need to update the pinned usage every time, // which creates additional synchronization costs. - DMutexLock l(mutex_); - size_t clock_usage = 0; table_.ConstApplyToEntriesRange( @@ -602,17 +579,13 @@ size_t ClockCacheShard::GetPinnedUsage() const { return clock_usage; } -std::string ClockCacheShard::GetPrintableOptions() const { - return std::string{}; -} - ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, int num_shard_bits, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), + num_shards_(1 << num_shard_bits) { assert(estimated_value_size > 0 || metadata_charge_policy != kDontChargeCacheMetadata); - num_shards_ = 1 << num_shard_bits; shards_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 8091bab27..4bded9cad 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -23,102 +23,137 @@ #include "rocksdb/cache.h" #include "rocksdb/secondary_cache.h" #include "util/autovector.h" -#include "util/distributed_mutex.h" namespace ROCKSDB_NAMESPACE { namespace clock_cache { -// Block cache implementation using a lock-free open-address hash table -// and clock eviction. +// An experimental alternative to LRUCache, using a lock-free, open-addressed +// hash table and clock eviction. -/////////////////////////////////////////////////////////////////////////////// -// Part 1: Handles +// ---------------------------------------------------------------------------- +// 1. INTRODUCTION // -// Every slot in the hash table is a ClockHandle. A handle can be in a few -// different states, that stem from the fact that handles can be externally -// referenced and, thus, can't always be immediately evicted when a delete -// operation is executed or when they are replaced by a new version (via an -// insert of the same key). Concretely, the state of a handle is defined by the -// following two properties: -// (R) Externally referenced: A handle can be referenced externally, or not. -// Importantly, a handle can be evicted if and only if it's not -// referenced. In particular, when an handle becomes referenced, it's -// temporarily taken out of clock until all references to it are released. -// (M) Marked for deletion (or invisible): An handle is marked for deletion -// when an operation attempts to delete it, but the handle is externally -// referenced, so it can't be immediately deleted. 
When this mark is placed, -// lookups will no longer be able to find it. Consequently, no more external -// references will be taken to the handle. When a handle is marked for -// deletion, we also say it's invisible. -// These properties induce 4 different states, with transitions defined as -// follows: -// - Not M --> M: When a handle is deleted or replaced by a new version, but -// not immediately evicted. -// - M --> not M: This cannot happen. Once a handle is marked for deletion, -// there is no can't go back. -// - R --> not R: When all references to an handle are released. -// - Not R --> R: When an unreferenced handle becomes referenced. This can only -// happen if the handle is visible, since references to an handle can only be -// created when it's visible. +// In RocksDB, a Cache is a concurrent unordered dictionary that supports +// external references (a.k.a. user references). A ClockCache is a type of Cache +// that uses the clock algorithm as its eviction policy. Internally, a +// ClockCache is an open-addressed hash table that stores all KV pairs in a +// large array. Every slot in the hash table is a ClockHandle, which holds a KV +// pair plus some additional metadata that controls the different aspects of the +// cache: external references, the hashing mechanism, concurrent access and the +// clock algorithm. // -/////////////////////////////////////////////////////////////////////////////// -// Part 2: Hash table structure // -// Internally, the cache uses an open-addressed hash table to index the handles. -// We use tombstone counters to keep track of displacements. Probes are -// generated with double-hashing (but the code can be easily modified to use -// other probing schemes, like linear hashing). Because of the tombstones and -// the two possible visibility states of a handle, the table slots (we use the -// word "slot" to refer to handles that are not necessary valid key-value -// elements) can be in 4 different states: -// 1. Visible element: The slot contains an element in not M state. -// 2. To-be-deleted element: The slot contains an element in M state. -// 3. Tombstone: The slot doesn't contain an element, but there is some other +// 2. EXTERNAL REFERENCES +// +// An externally referenced handle can't be deleted (either evicted by the clock +// algorithm, or explicitly deleted) or replaced by a new version (via an insert +// of the same key) until all external references to it have been released by +// the users. ClockHandles have two members to support external references: +// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0, +// the handle is externally referenced. Updates that intend to modify the +// handle will refrain from doing so. Eventually, when all references are +// released, we have EXTERNAL_REFS == 0, and updates can operate normally on +// the handle. +// - WILL_BE_DELETED flag: An handle is marked for deletion when an operation +// decides the handle should be deleted. This happens either when the last +// reference to a handle is released (and the release operation is instructed +// to delete on last reference) or on when a delete operation is called on +// the item. This flag is needed because an externally referenced handle +// can't be immediately deleted. In these cases, the flag will be later read +// and acted upon by the eviction algorithm. 
Importantly, WILL_BE_DELETED is
+// used not only to defer deletions, but also as a barrier for external
+// references: once WILL_BE_DELETED is set, lookups (which are the means to
+// acquire new external references) will ignore the handle. For this reason,
+// when WILL_BE_DELETED is set, we say the handle is invisible (and
+// otherwise, that it's visible).
+//
+//
+// 3. HASHING AND COLLISION RESOLUTION
+//
+// ClockCache uses an open-addressed hash table to store the handles.
+// We use a variant of tombstones to manage collisions: every slot keeps a
+// count of how many KV pairs currently in the cache have probed the
+// slot in an attempt to insert. Probes are generated with double-hashing
+// (although the code can be easily modified to use other probing schemes, like
+// linear probing).
+//
+// A slot in the hash table can be in a few different states:
+// - Element: The slot contains an element. This is indicated with the
+// IS_ELEMENT flag. Elements can be sub-classified depending on the
+// value of WILL_BE_DELETED:
+// * Visible element.
+// * Invisible element.
+// - Tombstone: The slot doesn't contain an element, but there is some other
// element that probed this slot during its insertion.
-// 4. Empty: The slot is unused.
-// When a ghost is removed from the table, it can either transition to being a
-// tombstone or an empty slot, depending on the number of displacements of the
-// slot. In any case, the slot becomes available. When a handle is inserted
-// into that slot, it becomes a visible element again.
+// - Empty: The slot is unused---it's neither an element nor a tombstone.
//
-///////////////////////////////////////////////////////////////////////////////
-// Part 3: The clock algorithm
+// A slot cycles through the following sequence of states:
+// empty or tombstone --> visible element --> invisible element -->
+// empty or tombstone. Initially a slot is available---it's either
+// empty or a tombstone. As soon as a KV pair is written into the slot, it
+// becomes a visible element. At some point, the handle will be deleted
+// by an explicit delete operation, the eviction algorithm, or an overwriting
+// insert. In any case, the handle is marked for deletion. When an
+// attempt to delete the element finally succeeds, the slot is freed up
+// and becomes available again.
//
-// We maintain a circular buffer with the handles available for eviction,
-// which the clock algorithm traverses (using a "clock pointer") to pick the
-// next victim. We use the hash table array as the circular buffer, and mark
-// the handles that are evictable. For this we use different clock flags, namely
-// NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and HIGH
-// represent how close an element is from being evictable, LOW being immediately
-// evictable. NONE means the slot is not evictable. This is due to one of the
-// following reasons:
-// (i) the slot doesn't contain an element, or
-// (ii) the slot contains an element that is in R state, or
-// (iii) the slot contains an element that was in R state but it's
-// not any more, and the clock pointer has not swept through the
-// slot since the element stopped being referenced.
//
-// The priority NONE is really only important for case (iii), as in the other
-// two cases there are other metadata fields that already capture the state.
-// When an element stops being referenced (and is not deleted), the clock -// algorithm must acknowledge this, and assign a non-NONE priority to make -// the element evictable again. +// 4. CONCURRENCY // -/////////////////////////////////////////////////////////////////////////////// -// Part 4: Synchronization +// ClockCache is lock-free. At a high level, we synchronize the operations +// using a read-prioritized, non-blocking variant of RW locks on every slot of +// the hash table. To do this we generalize the concept of reference: +// - Internal reference: Taken by a thread that is attempting to read a slot +// or do a very precise type of update. +// - Exclusive reference: Taken by a thread that is attempting to write a +// a slot extensively. // -// We provide the following synchronization guarantees: -// - Lookup is lock-free. -// - Release is lock-free, unless (i) no references to the element are left, -// and (ii) it was marked for deletion or the user wishes to delete if -// releasing the last reference. -// - Insert and Erase still use a per-shard lock. +// We defer the precise definitions to the comments in the code below. +// A crucial feature of our references is that attempting to take one never +// blocks the thread. Another important feature is that readers are +// prioritized, as they use extremely fast synchronization primitives---they +// use atomic arithmetic/bit operations, but no compare-and-swaps (which are +// much slower). // -// Our hash table is lock-free, in the sense that system-wide progress is -// guaranteed, i.e., some thread is always able to make progress. +// Internal references are used by threads to read slots during a probing +// sequence, making them the most common references (probing is performed +// in almost every operation, not just lookups). During a lookup, once +// the target element is found, and just before the handle is handed over +// to the user, an internal reference is converted into an external reference. +// During an update operation, once the target slot is found, an internal +// reference is converted into an exclusive reference. Interestingly, we +// can't atomically upgrade from internal to exclusive, or we may run into a +// deadlock. Releasing the internal reference and then taking an exclusive +// reference avoids the deadlock, but then the handle may change inbetween. +// One of the key observations we use in our implementation is that we can +// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED. // -/////////////////////////////////////////////////////////////////////////////// +// Distinguishing internal from external references is useful for two reasons: +// - Internal references are short lived, but external references are typically +// not. This is helpful when acquiring an exclusive ref: if there are any +// external references to the item, it's probably not worth waiting until +// they go away. +// - We can precisely determine when there are no more external references to a +// handle, and proceed to mark it for deletion. This is useful when users +// release external references. +// +// +// 5. CLOCK ALGORITHM +// +// The clock algorithm circularly sweeps through the hash table to find the next +// victim. Recall that handles that are referenced are not evictable; the clock +// algorithm never picks those. We use different clock priorities: NONE, LOW, +// MEDIUM and HIGH. Priorities LOW, MEDIUM and HIGH represent how close an +// element is from being evicted, LOW being the closest to evicted. 
NONE means +// the slot is not evictable. NONE priority is used in one of the following +// cases: +// (a) the slot doesn't contain an element, or +// (b) the slot contains an externally referenced element, or +// (c) the slot contains an element that used to be externally referenced, +// and the clock pointer has not swept through the slot since the element +// stopped being externally referenced. +// ---------------------------------------------------------------------------- // The load factor p is a real number in (0, 1) such that at all // times at most a fraction p of all slots, without counting tombstones, @@ -138,15 +173,18 @@ constexpr double kLoadFactor = 0.35; // The user can exceed kLoadFactor if the sizes of the inserted values don't // match estimated_value_size, or if strict_capacity_limit == false. To -// avoid performance to plunge, we set a strict upper bound on the load factor. +// avoid a performance drop, we set a strict upper bound on the load factor. constexpr double kStrictLoadFactor = 0.7; +// Maximum number of spins when trying to acquire a ref. +// TODO(Guido) This value was set arbitrarily. Is it appropriate? +// What's the best way to bound the spinning? +constexpr uint32_t kSpinsPerTry = 100000; + // Arbitrary seeds. constexpr uint32_t kProbingSeed1 = 0xbc9f1d34; constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5; -// An experimental (under development!) alternative to LRUCache. - struct ClockHandle { void* value; Cache::DeleterFn deleter; @@ -154,49 +192,6 @@ struct ClockHandle { size_t total_charge; std::array key_data; - static constexpr uint8_t kExternalRefsOffset = 0; - static constexpr uint8_t kSharedRefsOffset = 15; - static constexpr uint8_t kExclusiveRefOffset = 30; - static constexpr uint8_t kWillBeDeletedOffset = 31; - - enum Refs : uint32_t { - // Number of external references to the slot. - EXTERNAL_REFS = ((uint32_t{1} << 15) - 1) - << kExternalRefsOffset, // Bits 0, ..., 14 - // Number of internal references plus external references to the slot. - SHARED_REFS = ((uint32_t{1} << 15) - 1) - << kSharedRefsOffset, // Bits 15, ..., 29 - // Whether a thread has an exclusive reference to the slot. - EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30 - // Whether the handle will be deleted soon. When this bit is set, new - // internal - // or external references to this handle stop being accepted. - // There is an exception: external references can be created from - // existing external references, or converting from existing internal - // references. - WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31 - - // Shared references (i.e., external and internal references) and exclusive - // references are our custom implementation of RW locks---external and - // internal references are read locks, and exclusive references are write - // locks. We prioritize readers, which never block; in fact, they don't even - // use compare-and-swap operations. Using our own implementation of RW locks - // allows us to save many atomic operations by packing data more carefully. - // In particular: - // - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an - // internal - // reference into an external reference in a single atomic arithmetic - // operation. - // - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to take - // a shared reference and check whether the entry is marked for deletion - // in a single atomic arithmetic operation. 
- }; - - static constexpr uint32_t kOneInternalRef = 0x8000; - static constexpr uint32_t kOneExternalRef = 0x8001; - - std::atomic refs; - static constexpr uint8_t kIsElementOffset = 1; static constexpr uint8_t kClockPriorityOffset = 2; static constexpr uint8_t kIsHitOffset = 4; @@ -209,7 +204,7 @@ struct ClockHandle { CLOCK_PRIORITY = 3 << kClockPriorityOffset, // Whether the handle has been looked up after its insertion. HAS_HIT = 1 << kIsHitOffset, - // The value of Cache::Priority for the handle. + // The value of Cache::Priority of the handle. CACHE_PRIORITY = 1 << kCachePriorityOffset, }; @@ -226,30 +221,67 @@ struct ClockHandle { // up in this slot or a higher one. std::atomic displacements; - // Synchronization rules: - // - Use a shared reference when we want the handle's identity - // members (key_data, hash, value and IS_ELEMENT flag) to - // remain untouched, but not modify them. The only updates - // that a shared reference allows are: - // * set CLOCK_PRIORITY to NONE; - // * set the HAS_HIT bit. - // Notice that these two types of updates are idempotent, so - // they don't require synchronization across shared references. - // - Use an exclusive reference when we want identity members - // to remain untouched, as well as modify any identity member - // or flag. - // - displacements can be modified without holding a reference. - // - refs is only modified through appropriate functions to - // take or release references. + static constexpr uint8_t kExternalRefsOffset = 0; + static constexpr uint8_t kSharedRefsOffset = 15; + static constexpr uint8_t kExclusiveRefOffset = 30; + static constexpr uint8_t kWillBeDeletedOffset = 31; + + enum Refs : uint32_t { + // Synchronization model: + // - An external reference guarantees that hash, value, key_data + // and the IS_ELEMENT flag are not modified. Doesn't allow + // any writes. + // - An internal reference has the same guarantees as an + // external reference, and additionally allows the following + // idempotent updates on the handle: + // * set CLOCK_PRIORITY to NONE; + // * set the HAS_HIT bit; + // * set the WILL_BE_DELETED bit. + // - A shared reference is either an external reference or an + // internal reference. + // - An exclusive reference guarantees that no other thread has a shared + // or exclusive reference to the handle, and allows writes + // on the handle. + + // Number of external references to the slot. + EXTERNAL_REFS = ((uint32_t{1} << 15) - 1) + << kExternalRefsOffset, // Bits 0, ..., 14 + // Number of internal references plus external references to the slot. + SHARED_REFS = ((uint32_t{1} << 15) - 1) + << kSharedRefsOffset, // Bits 15, ..., 29 + // Whether a thread has an exclusive reference to the slot. + EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30 + // Whether the handle will be deleted soon. When this bit is set, new + // internal + // or external references to this handle stop being accepted. + // There is an exception: external references can be created from + // existing external references, or converting from existing internal + // references. + WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31 + + // Having these 4 fields in a single variable allows us to support the + // following operations efficiently: + // - Convert an internal reference into an external reference in a single + // atomic arithmetic operation. + // - Attempt to take a shared reference using a single atomic arithmetic + // operation. 
This is because we can increment the internal ref count + // as well as checking whether the entry is marked for deletion using a + // single atomic arithmetic operation (and one non-atomic comparison). + }; + + static constexpr uint32_t kOneInternalRef = 0x8000; + static constexpr uint32_t kOneExternalRef = 0x8001; + + std::atomic refs; ClockHandle() : value(nullptr), deleter(nullptr), hash(0), total_charge(0), - refs(0), flags(0), - displacements(0) { + displacements(0), + refs(0) { SetWillBeDeleted(false); SetIsElement(false); SetClockPriority(ClockPriority::NONE); @@ -257,26 +289,66 @@ struct ClockHandle { key_data.fill(0); } + // The copy ctor and assignment operator are only used to copy a handle + // for immediate deletion. (We need to copy because the slot may become + // re-used before the deletion is completed.) We only copy the necessary + // members to carry out the deletion. In particular, we don't need + // the atomic members. ClockHandle(const ClockHandle& other) { *this = other; } void operator=(const ClockHandle& other) { value = other.value; deleter = other.deleter; - hash = other.hash; - total_charge = other.total_charge; - refs.store(other.refs); key_data = other.key_data; - flags.store(other.flags); - SetWillBeDeleted(other.WillBeDeleted()); - SetIsElement(other.IsElement()); - SetClockPriority(other.GetClockPriority()); - SetCachePriority(other.GetCachePriority()); - displacements.store(other.displacements); + total_charge = other.total_charge; } Slice key() const { return Slice(key_data.data(), kCacheKeySize); } - bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; } + void FreeData() { + if (deleter) { + (*deleter)(key(), value); + } + } + + // Calculate the memory usage by metadata. + inline size_t CalcMetaCharge( + CacheMetadataChargePolicy metadata_charge_policy) const { + if (metadata_charge_policy != kFullChargeCacheMetadata) { + return 0; + } else { + // #ifdef ROCKSDB_MALLOC_USABLE_SIZE + // return malloc_usable_size( + // const_cast(static_cast(this))); + // #else + // TODO(Guido) malloc_usable_size only works when we call it on + // a pointer allocated with malloc. Because our handles are all + // allocated in a single shot as an array, the user can't call + // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle + // pointer returned by the cache. Moreover, malloc_usable_size + // expects a heap-allocated handle, but sometimes in our code we + // wish to pass a stack-allocated handle (this is only a performance + // concern). + // What is the right way to compute metadata charges with pre-allocated + // handles? + return sizeof(ClockHandle); + // #endif + } + } + + inline void CalcTotalCharge( + size_t charge, CacheMetadataChargePolicy metadata_charge_policy) { + total_charge = charge + CalcMetaCharge(metadata_charge_policy); + } + + inline size_t GetCharge( + CacheMetadataChargePolicy metadata_charge_policy) const { + size_t meta_charge = CalcMetaCharge(metadata_charge_policy); + assert(total_charge >= meta_charge); + return total_charge - meta_charge; + } + + // flags functions. 
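+ // Bit layout of flags (see the k*Offset constants above): IS_ELEMENT lives
+ // in bit 1, the two CLOCK_PRIORITY bits (NONE/LOW/MEDIUM/HIGH) in bits 2-3,
+ // HAS_HIT in bit 4, and CACHE_PRIORITY in bit 5.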
bool IsElement() const { return flags & IS_ELEMENT; } @@ -292,10 +364,6 @@ struct ClockHandle { void SetHit() { flags |= HAS_HIT; } - bool IsInClock() const { - return GetClockPriority() != ClockHandle::ClockPriority::NONE; - } - Cache::Priority GetCachePriority() const { return static_cast(flags & CACHE_PRIORITY); } @@ -308,6 +376,10 @@ struct ClockHandle { } } + bool IsInClock() const { + return GetClockPriority() != ClockHandle::ClockPriority::NONE; + } + ClockPriority GetClockPriority() const { return static_cast(flags & Flags::CLOCK_PRIORITY); } @@ -328,49 +400,6 @@ struct ClockHandle { flags |= new_priority; } - void FreeData() { - if (deleter) { - (*deleter)(key(), value); - } - } - - // Calculate the memory usage by metadata. - inline size_t CalcMetaCharge( - CacheMetadataChargePolicy metadata_charge_policy) const { - if (metadata_charge_policy != kFullChargeCacheMetadata) { - return 0; - } else { - // #ifdef ROCKSDB_MALLOC_USABLE_SIZE - // return malloc_usable_size( - // const_cast(static_cast(this))); - // #else - // TODO(Guido) malloc_usable_size only works when we call it on - // a pointer allocated with malloc. Because our handles are all - // allocated in a single shot as an array, the user can't call - // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle - // pointer returned by the cache. Moreover, malloc_usable_size - // expects a heap-allocated handle, but sometimes in our code we - // wish to pass a stack-allocated handle (this is only a performance - // concern). - // What is the right way to compute metadata charges with pre-allocated - // handles? - return sizeof(ClockHandle); - // #endif - } - } - - inline void CalcTotalCharge( - size_t charge, CacheMetadataChargePolicy metadata_charge_policy) { - total_charge = charge + CalcMetaCharge(metadata_charge_policy); - } - - inline size_t GetCharge( - CacheMetadataChargePolicy metadata_charge_policy) const { - size_t meta_charge = CalcMetaCharge(metadata_charge_policy); - assert(total_charge >= meta_charge); - return total_charge - meta_charge; - } - inline bool IsEmpty() const { return !this->IsElement() && this->displacements == 0; } @@ -380,11 +409,12 @@ struct ClockHandle { } inline bool Matches(const Slice& some_key, uint32_t some_hash) const { - return this->IsElement() && this->hash == some_hash && - this->key() == some_key; + return this->hash == some_hash && this->key() == some_key; } - bool WillBeDeleted() const { return refs & WILL_BE_DELETED; } + // refs functions. + + inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; } void SetWillBeDeleted(bool will_be_deleted) { if (will_be_deleted) { @@ -394,28 +424,7 @@ struct ClockHandle { } } - // The following functions are for taking and releasing refs. - - // Tries to take an external ref. Returns true iff it succeeds. - inline bool TryExternalRef() { - if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) { - return true; - } - refs -= kOneExternalRef; - return false; - } - - // Releases an external ref. Returns the new value (this is useful to - // avoid an extra atomic read). - inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; } - - // Take an external ref, assuming there is already one external ref - // to the handle. - void Ref() { - // TODO(Guido) Is it okay to assume that the existing external reference - // survives until this function returns? - refs += kOneExternalRef; - } + bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; } // Tries to take an internal ref. 
Returns true iff it succeeds. inline bool TryInternalRef() { @@ -426,9 +435,19 @@ struct ClockHandle { return false; } - inline void ReleaseInternalRef() { refs -= kOneInternalRef; } + // Tries to take an external ref. Returns true iff it succeeds. + inline bool TryExternalRef() { + if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) { + return true; + } + refs -= kOneExternalRef; + return false; + } // Tries to take an exclusive ref. Returns true iff it succeeds. + // TODO(Guido) After every TryExclusiveRef call, we always call + // WillBeDeleted(). We could save an atomic read by having an output parameter + // with the last value of refs. inline bool TryExclusiveRef() { uint32_t will_be_deleted = refs & WILL_BE_DELETED; uint32_t expected = will_be_deleted; @@ -436,15 +455,18 @@ struct ClockHandle { EXCLUSIVE_REF | will_be_deleted); } - // Repeatedly tries to take an exclusive reference, but stops as soon - // as an external reference is detected (in this case the wait would - // presumably be too long). - inline bool TrySpinExclusiveRef() { + // Repeatedly tries to take an exclusive reference, but aborts as soon + // as an external or exclusive reference is detected (since the wait + // would presumably be too long). + inline bool SpinTryExclusiveRef() { uint32_t expected = 0; uint32_t will_be_deleted = 0; + uint32_t spins = kSpinsPerTry; while (!refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted)) { - if (expected & EXTERNAL_REFS) { + EXCLUSIVE_REF | will_be_deleted) && + spins--) { + std::this_thread::yield(); + if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) { return false; } will_be_deleted = expected & WILL_BE_DELETED; @@ -453,75 +475,88 @@ struct ClockHandle { return true; } - inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); } + // Take an external ref, assuming there is already one external ref + // to the handle. + void Ref() { + // TODO(Guido) Is it okay to assume that the existing external reference + // survives until this function returns? + refs += kOneExternalRef; + } - // The following functions are for upgrading and downgrading refs. - // They guarantee atomicity, i.e., no exclusive refs to the handle - // can be taken by a different thread during the conversion. + inline void ReleaseExternalRef() { refs -= kOneExternalRef; } - inline void ExclusiveToInternalRef() { - refs += kOneInternalRef; - ReleaseExclusiveRef(); - } + inline void ReleaseInternalRef() { refs -= kOneInternalRef; } + inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); } + + // Downgrade an exclusive ref to external. inline void ExclusiveToExternalRef() { refs += kOneExternalRef; ReleaseExclusiveRef(); } - // TODO(Guido) Do we want to bound the loop and prepare the - // algorithms to react to a failure? - inline void InternalToExclusiveRef() { - uint32_t expected = kOneInternalRef; - uint32_t will_be_deleted = 0; - while (!refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted)) { - will_be_deleted = expected & WILL_BE_DELETED; - expected = kOneInternalRef | will_be_deleted; - } - } - + // Convert an internal ref into external. inline void InternalToExternalRef() { refs += kOneExternalRef - kOneInternalRef; } - // TODO(Guido) Same concern. 
- inline void ExternalToExclusiveRef() { - uint32_t expected = kOneExternalRef; - uint32_t will_be_deleted = 0; - while (!refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted)) { - will_be_deleted = expected & WILL_BE_DELETED; - expected = kOneExternalRef | will_be_deleted; - } - } - }; // struct ClockHandle class ClockHandleTable { public: - explicit ClockHandleTable(int hash_bits); + explicit ClockHandleTable(size_t capacity, int hash_bits); ~ClockHandleTable(); - // Returns a pointer to a visible element matching the key/hash, or - // nullptr if not present. + // Returns a pointer to a visible handle matching the key/hash, or + // nullptr if not present. When an actual handle is produced, an + // internal reference is handed over. ClockHandle* Lookup(const Slice& key, uint32_t hash); - // Inserts a copy of h into the hash table. - // Returns a pointer to the inserted handle, or nullptr if no slot - // available was found. If an existing visible element matching the - // key/hash is already present in the hash table, the argument old - // is set to point to it; otherwise, it's set to nullptr. - // Returns an exclusive reference to h, and no references to old. - ClockHandle* Insert(ClockHandle* h, ClockHandle** old); + // Inserts a copy of h into the hash table. Returns a pointer to the + // inserted handle, or nullptr if no available slot was found. Every + // existing visible handle matching the key is already present in the + // hash table is marked as WILL_BE_DELETED. The deletion is also attempted, + // and, if the attempt is successful, the handle is inserted into the + // autovector deleted. When take_reference is true, the function hands + // over an external reference on the handle, and otherwise no reference is + // produced. + ClockHandle* Insert(ClockHandle* h, autovector* deleted, + bool take_reference); + + // Assigns h the appropriate clock priority, making it evictable. + void ClockOn(ClockHandle* h); - // Removes h from the hash table. The handle must already be off clock. - void Remove(ClockHandle* h); + // Makes h non-evictable. + void ClockOff(ClockHandle* h); - // Extracts the element information from a handle (src), and assigns it - // to a hash table slot (dst). Doesn't touch displacements and refs, - // which are maintained by the hash table algorithm. - void Assign(ClockHandle* dst, ClockHandle* src); + // Runs the clock eviction algorithm until there is enough space to + // insert an element with the given charge. + void ClockRun(size_t charge); + + // Remove h from the hash table. Requires an exclusive ref to h. + void Remove(ClockHandle* h, autovector* deleted); + + // Remove from the hash table all handles with matching key/hash along a + // probe sequence, starting from the given probe number. Doesn't + // require any references. + void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe, + autovector* deleted); + + void RemoveAll(const Slice& key, uint32_t hash, + autovector* deleted) { + uint32_t probe = 0; + RemoveAll(key, hash, probe, deleted); + } + + void Free(autovector* deleted); + + // Tries to remove h from the hash table. If the attempt is successful, + // the function hands over an exclusive ref to h. + bool TryRemove(ClockHandle* h, autovector* deleted); + + // Similar to TryRemove, except that it spins, increasing the chances of + // success. Requires that the caller thread has no shared ref to h. 
+  bool SpinTryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);

  template <typename T>
  void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
@@ -531,12 +566,9 @@ class ClockHandleTable {
      if (h->TryExclusiveRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
-          // Hand the internal ref over to func, which is now responsible
-          // to release it.
          func(h);
-        } else {
-          h->ReleaseExclusiveRef();
        }
+        h->ReleaseExclusiveRef();
      }
    }
  }
@@ -565,53 +597,81 @@ class ClockHandleTable {
  uint32_t GetOccupancy() const { return occupancy_; }

+  size_t GetUsage() const { return usage_; }
+
+  size_t GetCapacity() const { return capacity_; }
+
  // Returns x mod 2^{length_bits_}.
  uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }

 private:
-  friend class ClockCacheShard;
-
-  int FindElement(const Slice& key, uint32_t hash, uint32_t& probe);
-
-  int FindAvailableSlot(const Slice& key, uint32_t& probe);
-
-  int FindElementOrAvailableSlot(const Slice& key, uint32_t hash,
-                                 uint32_t& probe);
-
-  // Returns the index of the first slot probed (hashing with
-  // the given key) with a handle e such that match(e) is true.
-  // At every step, the function first tests whether match(e) holds.
-  // If it's false, it evaluates abort(e) to decide whether the
-  // search should be aborted, and in the affirmative returns -1.
-  // For every handle e probed except the last one, the function runs
-  // update(e). We say a probe to a handle e is aborting if match(e) is
-  // false and abort(e) is true. The argument probe is one more than the
-  // last non-aborting probe during the call. This is so that that the
-  // variable can be used to keep track of progress across consecutive
-  // calls to FindSlot.
-  inline int FindSlot(const Slice& key,
-                      std::function<bool(ClockHandle*)> match,
-                      std::function<bool(ClockHandle*)> stop,
-                      std::function<void(ClockHandle*)> update,
-                      uint32_t& probe);
-
-  // After a failed FindSlot call (i.e., with answer -1), this function
-  // decrements all displacements, starting from the 0-th probe.
+  // Extracts the element information from a handle (src), and assigns it
+  // to a hash table slot (dst). Doesn't touch displacements and refs,
+  // which are maintained by the hash table algorithm.
+  void Assign(ClockHandle* dst, ClockHandle* src);
+
+  // Returns the first slot in the probe sequence, starting from the given
+  // probe number, with a handle e such that match(e) is true. At every
+  // step, the function first tests whether match(e) holds. If this is false,
+  // it evaluates abort(e) to decide whether the search should be aborted,
+  // and in the affirmative returns nullptr. For every handle e probed except
+  // the last one, the function runs update(e).
+  // The probe parameter is modified as follows. We say a probe to a handle
+  // e is aborting if match(e) is false and abort(e) is true. Then the final
+  // value of probe is one more than the last non-aborting probe during the
+  // call. This is so that the variable can be used to keep track of
+  // progress across consecutive calls to FindSlot.
+  inline ClockHandle* FindSlot(const Slice& key,
+                               std::function<bool(ClockHandle*)> match,
+                               std::function<bool(ClockHandle*)> stop,
+                               std::function<void(ClockHandle*)> update,
+                               uint32_t& probe);
+
+  // Returns an available slot for the given key. All copies of the
+  // key found along the probing sequence until an available slot is
+  // found are marked for deletion. On each of them, a deletion is
+  // attempted, and when the attempt succeeds the slot is assigned to
+  // the new copy of the element.
+  ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash,
+                                 uint32_t& probe,
+                                 autovector<ClockHandle>* deleted);
+
+  // After a failed FindSlot call (i.e., one that returns nullptr) in
+  // FindAvailableSlot, this function fixes all displacements,
+  // starting from the 0-th probe up to the given probe.
  void Rollback(const Slice& key, uint32_t probe);

  // Number of hash bits used for table index.
  // The size of the table is 1 << length_bits_.
-  int length_bits_;
+  const int length_bits_;

  // For faster computation of ModTableSize.
  const uint32_t length_bits_mask_;

-  // Number of elements in the table.
-  uint32_t occupancy_;
-
  // Maximum number of elements the user can store in the table.
-  uint32_t occupancy_limit_;
+  const uint32_t occupancy_limit_;
+
+  // Maximum total charge of all elements stored in the table.
+  const size_t capacity_;
+  // We partition the following members into different cache lines
+  // to avoid false sharing among Lookup, Release, Erase and Insert
+  // operations in ClockCacheShard.
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  // Array of slots comprising the hash table.
  std::unique_ptr<ClockHandle[]> array_;
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  // Clock algorithm sweep pointer.
+  std::atomic<uint32_t> clock_pointer_;
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  // Number of elements in the table.
+  std::atomic<uint32_t> occupancy_;
+
+  // Memory size for entries residing in the cache.
+  std::atomic<size_t> usage_;
};  // class ClockHandleTable

// A single shard of sharded cache.
@@ -652,20 +712,26 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
                        Statistics* /*stats*/) override {
    return Lookup(key, hash);
  }
+
  Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;

  bool Release(Cache::Handle* handle, bool /*useful*/,
               bool erase_if_last_ref) override {
    return Release(handle, erase_if_last_ref);
  }
+
  bool IsReady(Cache::Handle* /*handle*/) override { return true; }
+
  void Wait(Cache::Handle* /*handle*/) override {}

  bool Ref(Cache::Handle* handle) override;
+
  bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
+
  void Erase(const Slice& key, uint32_t hash) override;

  size_t GetUsage() const override;
+
  size_t GetPinnedUsage() const override;

  void ApplyToSomeEntries(
@@ -675,20 +741,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {

  void EraseUnRefEntries() override;

-  std::string GetPrintableOptions() const override;
+  std::string GetPrintableOptions() const override { return std::string{}; }

 private:
  friend class ClockCache;

-  // Makes an element evictable by clock.
-  void ClockOn(ClockHandle* h);
-
-  // Makes an element non-evictable.
-  void ClockOff(ClockHandle* h);
-
-  // Requires an exclusive ref on h.
-  void Evict(ClockHandle* h);
-
  // Free some space following strict clock policy until enough space
  // to hold (usage_ + charge) is freed or there are no evictable elements.
  void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);
@@ -703,34 +760,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
                          CacheMetadataChargePolicy metadata_charge_policy);

-  // Initialized before use.
-  size_t capacity_;
-  // Whether to reject insertion if cache reaches its full capacity.
-  bool strict_capacity_limit_;
-
-  uint32_t clock_pointer_;
-
-  // ------------^^^^^^^^^^^^^-----------
-  // Not frequently modified data members
-  // ------------------------------------
-  //
-  // We separate data members that are updated frequently from the ones that
-  // are not frequently updated so that they don't share the same cache line
-  // which will lead into false cache sharing
-  //
-  // ------------------------------------
-  // Frequently modified data members
-  // ------------vvvvvvvvvvvvv-----------
-  ClockHandleTable table_;
+  std::atomic<bool> strict_capacity_limit_;

-  // Memory size for entries residing in the cache.
-  size_t usage_;
-
-  // mutex_ protects the following state.
-  // We don't count mutex_ as the cache's internal state so semantically we
-  // don't mind mutex_ invoking the non-const actions.
-  mutable DMutex mutex_;
+  ClockHandleTable table_;
};  // class ClockCacheShard

class ClockCache
@@ -743,19 +776,28 @@ class ClockCache
             bool strict_capacity_limit,
             CacheMetadataChargePolicy metadata_charge_policy =
                 kDontChargeCacheMetadata);
+
  ~ClockCache() override;
+
  const char* Name() const override { return "ClockCache"; }
+
  CacheShard* GetShard(uint32_t shard) override;
+
  const CacheShard* GetShard(uint32_t shard) const override;
+
  void* Value(Handle* handle) override;
+
  size_t GetCharge(Handle* handle) const override;
+
  uint32_t GetHash(Handle* handle) const override;
+
  DeleterFn GetDeleter(Handle* handle) const override;
+
  void DisownData() override;

 private:
  ClockCacheShard* shards_ = nullptr;
-  int num_shards_ = 0;
+  int num_shards_;
};  // class ClockCache

}  // namespace clock_cache
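
The ClockHandle hunks above pack internal refs, external refs, an exclusive-ref bit, and a will-be-deleted bit into a single atomic word. The standalone sketch below mirrors the TryExternalRef and TryExclusiveRef logic shown in the diff; RefsSketch and its bit-layout constants are illustrative placeholders, not the actual values defined in clock_cache.h.

// refs_sketch.cc -- illustration only, not part of clock_cache.h.
#include <atomic>
#include <cstdint>

struct RefsSketch {
  // Hypothetical layout: internal refs in the low bits, external refs above
  // them, then one exclusive bit and one will-be-deleted bit.
  static constexpr uint32_t kOneInternalRef = 1u;  // TryInternalRef omitted.
  static constexpr uint32_t kOneExternalRef = 1u << 15;
  static constexpr uint32_t kExclusiveRef = 1u << 30;
  static constexpr uint32_t kWillBeDeleted = 1u << 31;

  std::atomic<uint32_t> refs{0};

  // Optimistically bump the external count, then roll back if the handle is
  // exclusively referenced or marked for deletion (mirrors TryExternalRef).
  bool TryExternalRef() {
    if (!((refs += kOneExternalRef) & (kExclusiveRef | kWillBeDeleted))) {
      return true;
    }
    refs -= kOneExternalRef;
    return false;
  }

  // Succeeds only when no internal or external refs are held; the
  // will-be-deleted bit is carried through the CAS (mirrors TryExclusiveRef).
  bool TryExclusiveRef() {
    uint32_t will_be_deleted = refs & kWillBeDeleted;
    uint32_t expected = will_be_deleted;
    return refs.compare_exchange_strong(expected,
                                        kExclusiveRef | will_be_deleted);
  }

  void ReleaseExternalRef() { refs -= kOneExternalRef; }
  void ReleaseExclusiveRef() { refs.fetch_and(~kExclusiveRef); }
};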
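
The probe contract of FindSlot, and the way FindAvailableSlot and RemoveAll can share a probe number, is easier to see in code. The sketch below models only that contract; NextSlotSketch and FindSlotSketch are hypothetical stand-ins for the table's real probing machinery, and the update callback is omitted.

// probe_sketch.cc -- illustration only.
#include <cstdint>
#include <functional>

constexpr uint32_t kSketchTableSize = 256;  // Assume a power of two.

uint32_t NextSlotSketch(uint32_t hash, uint32_t probe) {
  // Purely illustrative probing; the real table derives its sequence from
  // the key and reduces modulo the table size.
  return (hash + probe) & (kSketchTableSize - 1);
}

// On return, probe is one past the last non-aborting slot, so a later call
// with the same probe variable resumes exactly where this one stopped.
int FindSlotSketch(uint32_t hash, const std::function<bool(uint32_t)>& match,
                   const std::function<bool(uint32_t)>& abort_probe,
                   uint32_t& probe) {
  while (probe < kSketchTableSize) {
    uint32_t slot = NextSlotSketch(hash, probe);
    if (match(slot)) {
      probe++;  // A matching probe is non-aborting, so count it.
      return static_cast<int>(slot);
    }
    if (abort_probe(slot)) {
      return -1;  // Aborting probe: do not advance past it.
    }
    probe++;
  }
  return -1;
}

In the Insert path documented above, FindAvailableSlot and RemoveAll share one probe counter in exactly this way: the second scan continues from the slot where the first one stopped, so copies of the key that were already marked before the chosen slot are not visited twice.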
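
ClockRun is only declared in this header. As a rough mental model it behaves like a standard CLOCK sweep: advance a wrap-around pointer, demote each evictable slot by one priority step, and evict slots that reach the bottom priority until the requested charge fits. The sketch below is a simplified, single-threaded model with hypothetical SlotSketch/ClockSketch types; it ignores references, pinned handles, and the deleted autovector, and is not the actual implementation.

// clock_run_sketch.cc -- illustration only.
#include <cstddef>
#include <cstdint>
#include <vector>

struct SlotSketch {
  bool is_element = false;
  uint32_t clock_priority = 0;  // 0 models "not in clock", i.e. not evictable.
  size_t charge = 0;
};

class ClockSketch {
 public:
  ClockSketch(size_t capacity, size_t num_slots)
      : capacity_(capacity), slots_(num_slots) {}

  // Sweep the slots in a circle, demoting each evictable entry by one
  // priority step and evicting entries that reach zero, until usage_ + charge
  // fits under capacity_ or a full pass finds nothing evictable.
  void ClockRun(size_t charge) {
    size_t consecutive_non_evictable = 0;
    while (usage_ + charge > capacity_ &&
           consecutive_non_evictable < slots_.size()) {
      SlotSketch& s = slots_[clock_pointer_];
      clock_pointer_ = (clock_pointer_ + 1) % slots_.size();
      if (s.is_element && s.clock_priority > 0) {
        consecutive_non_evictable = 0;
        if (--s.clock_priority == 0) {
          // Evict. The real table would also unlink the slot and defer
          // FreeData to the caller through the deleted autovector.
          usage_ -= s.charge;
          s.is_element = false;
        }
      } else {
        consecutive_non_evictable++;
      }
    }
  }

  size_t usage_ = 0;

 private:
  const size_t capacity_;
  std::vector<SlotSketch> slots_;
  size_t clock_pointer_ = 0;
};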