@@ -10,6 +10,8 @@
#pragma once

#include <array>
#include <atomic>
#include <cstdint>
#include <memory>
#include <string>
@@ -27,116 +29,254 @@ namespace ROCKSDB_NAMESPACE {
namespace clock_cache {

// Block cache implementation using a lock-free open-address hash table
// and clock eviction.
///////////////////////////////////////////////////////////////////////////////
// Part 1: Handles
//
// Every slot in the hash table is a ClockHandle. A handle can be in a few
// different states, which stem from the fact that handles can be externally
// referenced and, thus, can't always be immediately evicted when a delete
// operation is executed or when they are replaced by a new version (via an
// insert of the same key). Concretely, the state of a handle is defined by the
// following two properties:
// (R) Externally referenced: A handle can be referenced externally, or not.
//    Importantly, a handle can be evicted if and only if it's not
//    referenced. In particular, when a handle becomes referenced, it's
//    temporarily taken out of clock until all references to it are released.
// (M) Marked for deletion (or invisible): A handle is marked for deletion
//    when an operation attempts to delete it, but the handle is externally
//    referenced, so it can't be immediately deleted. When this mark is placed,
//    lookups will no longer be able to find it. Consequently, no more external
//    references will be taken to the handle. When a handle is marked for
//    deletion, we also say it's invisible.
// These properties induce 4 different states, with transitions defined as
// follows:
// - Not M --> M: When a handle is deleted or replaced by a new version, but
//    not immediately evicted.
// - M --> not M: This cannot happen. Once a handle is marked for deletion,
//    there is no going back.
// - R --> not R: When all references to a handle are released.
// - Not R --> R: When an unreferenced handle becomes referenced. This can only
//    happen if the handle is visible, since references to a handle can only be
//    created when it's visible.
//
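// As a quick illustration of this state space, the classification could be
// written as follows (a sketch only; these names are illustrative, and the
// actual code reads the two properties from packed bit fields):
//
//   enum class HandleState {
//     kVisibleUnreferenced,   // not R, not M: evictable by clock.
//     kVisibleReferenced,     // R, not M: in use; temporarily out of clock.
//     kInvisibleReferenced,   // R, M: hidden from lookups, awaiting release.
//     kInvisibleUnreferenced  // not R, M: deletable immediately.
//   };
//
//   HandleState Classify(bool referenced, bool marked_for_deletion) {
//     if (referenced) {
//       return marked_for_deletion ? HandleState::kInvisibleReferenced
//                                  : HandleState::kVisibleReferenced;
//     }
//     return marked_for_deletion ? HandleState::kInvisibleUnreferenced
//                                : HandleState::kVisibleUnreferenced;
//   }
//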
///////////////////////////////////////////////////////////////////////////////
// Part 2: Hash table structure
//
// Internally, the cache uses an open-addressed hash table to index the
// handles. We use tombstone counters to keep track of displacements. Probes
// are generated with double hashing (but the code can be easily modified to
// use other probing schemes, like linear probing). Because of the tombstones
// and the two possible visibility states of a handle, the table slots (we use
// the word "slot" to refer to handles that are not necessarily valid key-value
// elements) can be in 4 different states:
// 1. Visible element: The slot contains an element in not M state.
// 2. To-be-deleted element: The slot contains an element in M state.
// 3. Tombstone: The slot doesn't contain an element, but there is some other
//    element that probed this slot during its insertion.
// 4. Empty: The slot is unused.
// When an element is removed from the table, its slot becomes either a
// tombstone or an empty slot, depending on the number of displacements of the
// slot. In any case, the slot becomes available. When a handle is inserted
// into that slot, it becomes a visible element again.
//
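// For reference, a double-hashing probe sequence has the following shape
// (a sketch, assuming a power-of-two table of table_size slots; Hash1 and
// Hash2 are illustrative names, derived in the actual code from the probing
// seeds declared below):
//
//   uint32_t base = Hash1(key) & (table_size - 1);
//   uint32_t increment = Hash2(key) | 1;  // Odd, so every slot is reachable.
//   for (uint32_t i = 0; i < table_size; i++) {
//     uint32_t slot = (base + i * increment) & (table_size - 1);
//     // ... inspect the slot; stop on a match or an aborting condition ...
//   }
//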
///////////////////////////////////////////////////////////////////////////////
// Part 3: The clock algorithm
//
// We maintain a circular buffer with the handles available for eviction,
// which the clock algorithm traverses (using a "clock pointer") to pick the
// next victim. We use the hash table array as the circular buffer, and mark
// the handles that are evictable. For this we use different clock flags, namely
// NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and HIGH
// represent how close an element is to being evictable, LOW being immediately
// evictable. NONE means the slot is not evictable. This is due to one of the
// following reasons:
//  (i) the slot doesn't contain an element, or
// (ii) the slot contains an element that is in R state, or
// (iii) the slot contains an element that was in R state but is
//      not any more, and the clock pointer has not swept through the
//      slot since the element stopped being referenced.
//
// The priority NONE is really only important for case (iii), as in the other
// two cases there are other metadata fields that already capture the state.
// When an element stops being referenced (and is not deleted), the clock
// algorithm must acknowledge this, and assign a non-NONE priority to make
// the element evictable again.
//
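// Concretely, one step of the sweep looks roughly like this (a sketch, not
// the actual implementation; clock_pointer_ and table_size are illustrative
// names):
//
//   ClockHandle* h = &array_[clock_pointer_ & (table_size - 1)];
//   clock_pointer_++;
//   switch (h->GetClockPriority()) {
//     case ClockHandle::ClockPriority::LOW:
//       // Immediately evictable: h is the victim.
//       break;
//     case ClockHandle::ClockPriority::MEDIUM:
//     case ClockHandle::ClockPriority::HIGH:
//       h->DecreaseClockPriority();  // Another chance, one level lower.
//       break;
//     case ClockHandle::ClockPriority::NONE:
//       break;  // Not evictable right now; skip.
//   }
//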
///////////////////////////////////////////////////////////////////////////////
// Part 4: Synchronization
//
// We provide the following synchronization guarantees:
// - Lookup is lock-free.
// - Release is lock-free, unless (i) no references to the element are left,
//   and (ii) it was marked for deletion or the user wishes to delete it when
//   releasing the last reference.
// - Insert and Erase still use a per-shard lock.
//
// Our hash table is lock-free, in the sense that system-wide progress is
// guaranteed, i.e., some thread is always able to make progress.
//
///////////////////////////////////////////////////////////////////////////////

// The load factor p is a real number in (0, 1) such that at all
// times at most a fraction p of all slots, without counting tombstones,
// are occupied by elements. This means that the probability that a
// random probe hits an empty slot is at least 1 - p, and thus at most
// 1 / (1 - p) probes are required on average. For example, p = 35% implies
// that between 1 and 2 probes are needed on average (bear in mind that
// this reasoning doesn't consider the effects of clustering over time).
// Because the size of the hash table is always rounded up to the next
// power of 2, p is really an upper bound on the actual load factor---the
// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
// but bear in mind that slots only hold metadata, not actual values.
// Since space cost is dominated by the values (the LSM blocks),
// overprovisioning the table with metadata only increases the total cache
// space usage by a tiny fraction.
constexpr double kLoadFactor = 0.35;

// The user can exceed kLoadFactor if the sizes of the inserted values don't
// match estimated_value_size, or if strict_capacity_limit == false. To
// prevent performance from plunging, we set a strict upper bound on the load
// factor.
constexpr double kStrictLoadFactor = 0.7;
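
// To make these bounds concrete: at the strict limit p = 0.7, a random probe
// hits an empty slot with probability at least 1 - 0.7 = 0.3, so an
// unsuccessful search needs at most 1 / 0.3, i.e., roughly 3.3 probes on
// average; at the default p = 0.35, it needs at most 1 / 0.65, i.e., roughly
// 1.5 probes. (Illustrative arithmetic only; clustering is ignored.)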
// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
// An experimental (under development!) alternative to LRUCache.
struct ClockHandle {
  void* value;
  Cache::DeleterFn deleter;
  uint32_t hash;
  size_t total_charge;
  std::array<char, kCacheKeySize> key_data;
  static constexpr uint8_t kExternalRefsOffset = 0;
  static constexpr uint8_t kSharedRefsOffset = 15;
  static constexpr uint8_t kExclusiveRefOffset = 30;
  static constexpr uint8_t kWillBeDeletedOffset = 31;

  enum Refs : uint32_t {
    // Number of external references to the slot.
    EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
                    << kExternalRefsOffset,  // Bits 0, ..., 14
    // Number of internal references plus external references to the slot.
    SHARED_REFS = ((uint32_t{1} << 15) - 1)
                  << kSharedRefsOffset,  // Bits 15, ..., 29
    // Whether a thread has an exclusive reference to the slot.
    EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset,  // Bit 30
    // Whether the handle will be deleted soon. When this bit is set, new
    // internal or external references to this handle stop being accepted.
    // There is an exception: external references can be created from
    // existing external references, or by converting from existing internal
    // references.
    WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset  // Bit 31

    // Shared references (i.e., external and internal references) and
    // exclusive references are our custom implementation of RW locks---
    // external and internal references are read locks, and exclusive
    // references are write locks. We prioritize readers, which never block;
    // in fact, they don't even use compare-and-swap operations. Using our
    // own implementation of RW locks allows us to save many atomic
    // operations by packing data more carefully. In particular:
    // - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an
    //   internal reference into an external reference in a single atomic
    //   arithmetic operation.
    // - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to
    //   take a shared reference and check whether the entry is marked for
    //   deletion in a single atomic arithmetic operation.
  };

  static constexpr uint32_t kOneInternalRef = 0x8000;
  static constexpr uint32_t kOneExternalRef = 0x8001;
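
  // To illustrate the packing: kOneExternalRef = 0x8001 adds 1 to both the
  // EXTERNAL_REFS field (bit 0) and the SHARED_REFS field (bit 15), while
  // kOneInternalRef = 0x8000 bumps only SHARED_REFS. For example
  // (illustrative arithmetic only):
  //
  //   refs += kOneExternalRef;                    // take an external ref
  //   refs += kOneExternalRef - kOneInternalRef;  // adds 1: internal->external
  //
  // A handle holding 2 external refs and 1 internal ref thus stores
  // refs == 2 * 0x8001 + 0x8000 == 0x18002, i.e., EXTERNAL_REFS == 2 and
  // SHARED_REFS == 3.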
  std::atomic<uint32_t> refs;
  static constexpr uint8_t kIsElementOffset = 1;
  static constexpr uint8_t kClockPriorityOffset = 2;
  static constexpr uint8_t kIsHitOffset = 4;
  static constexpr uint8_t kCachePriorityOffset = 5;
  enum Flags : uint8_t {
    // Whether the slot is in use by an element.
    IS_ELEMENT = 1 << kIsElementOffset,
    // Clock priorities. Represents how close a handle is to being evictable.
    CLOCK_PRIORITY = 3 << kClockPriorityOffset,
    // Whether the handle has been looked up after its insertion.
    HAS_HIT = 1 << kIsHitOffset,
    // The value of Cache::Priority for the handle.
    CACHE_PRIORITY = 1 << kCachePriorityOffset,
  };

  std::atomic<uint8_t> flags;
  enum ClockPriority : uint8_t {
    NONE = (0 << kClockPriorityOffset),
    LOW = (1 << kClockPriorityOffset),
    MEDIUM = (2 << kClockPriorityOffset),
    HIGH = (3 << kClockPriorityOffset)
    // Priority is NONE if and only if
    // (i) the handle is not an element, or
    // (ii) the handle is an element but it is being referenced.
  };
  // The number of elements that hash to this slot or a lower one, but wind
  // up in this slot or a higher one.
  std::atomic<uint32_t> displacements;
  // Synchronization rules:
  //  - Use a shared reference when we want the handle's identity
  //    members (key_data, hash, value and IS_ELEMENT flag) to
  //    remain untouched, but don't need to modify them ourselves. The
  //    only updates that a shared reference allows are:
  //      * set CLOCK_PRIORITY to NONE;
  //      * set the HAS_HIT bit.
  //    Notice that these two types of updates are idempotent, so
  //    they don't require synchronization across shared references.
  //  - Use an exclusive reference when we want to modify any identity
  //    member or flag.
  //  - displacements can be modified without holding a reference.
  //  - refs is only modified through appropriate functions to
  //    take or release references.

  ClockHandle()
      : value(nullptr),
        deleter(nullptr),
        hash(0),
        total_charge(0),
        refs(0),
        flags(0),
        displacements(0) {
    SetWillBeDeleted(false);
    SetIsElement(false);
    SetClockPriority(ClockPriority::NONE);
    SetCachePriority(Cache::Priority::LOW);
    key_data.fill(0);
  }
  ClockHandle(const ClockHandle& other) { *this = other; }

  void operator=(const ClockHandle& other) {
    value = other.value;
    deleter = other.deleter;
    hash = other.hash;
    total_charge = other.total_charge;
    refs.store(other.refs);
    key_data = other.key_data;
    flags.store(other.flags);
    SetWillBeDeleted(other.WillBeDeleted());
    SetIsElement(other.IsElement());
    SetClockPriority(other.GetClockPriority());
    SetCachePriority(other.GetCachePriority());
    displacements.store(other.displacements);
  }
  Slice key() const { return Slice(key_data.data(), kCacheKeySize); }

  bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }

  bool IsElement() const { return flags & IS_ELEMENT; }
@@ -144,7 +284,7 @@ struct ClockHandle {
    if (is_element) {
      flags |= IS_ELEMENT;
    } else {
      flags &= static_cast<uint8_t>(~IS_ELEMENT);
    }
  }
@@ -152,7 +292,7 @@ struct ClockHandle {
  void SetHit() { flags |= HAS_HIT; }

  bool IsInClock() const {
    return GetClockPriority() != ClockHandle::ClockPriority::NONE;
  }
@@ -164,7 +304,7 @@ struct ClockHandle {
    if (priority == Cache::Priority::HIGH) {
      flags |= Flags::CACHE_PRIORITY;
    } else {
      flags &= static_cast<uint8_t>(~Flags::CACHE_PRIORITY);
    }
  }
@@ -173,7 +313,7 @@ struct ClockHandle {
  }

  void SetClockPriority(ClockPriority priority) {
    flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
    flags |= priority;
  }
@@ -182,14 +322,13 @@ struct ClockHandle {
                kClockPriorityOffset;
    assert(p > 0);
    p--;
    flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
    ClockPriority new_priority =
        static_cast<ClockPriority>(p << kClockPriorityOffset);
    flags |= new_priority;
  }

  void FreeData() {
    if (deleter) {
      (*deleter)(key(), value);
    }
@@ -232,17 +371,131 @@ struct ClockHandle {
    return total_charge - meta_charge;
  }

  inline bool IsEmpty() const {
    return !this->IsElement() && this->displacements == 0;
  }

  inline bool IsTombstone() const {
    return !this->IsElement() && this->displacements > 0;
  }

  inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
    return this->IsElement() && this->hash == some_hash &&
           this->key() == some_key;
  }

  bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }

  void SetWillBeDeleted(bool will_be_deleted) {
    if (will_be_deleted) {
      refs |= WILL_BE_DELETED;
    } else {
      refs &= ~WILL_BE_DELETED;
    }
  }
  // The following functions are for taking and releasing refs.

  // Tries to take an external ref. Returns true iff it succeeds.
  inline bool TryExternalRef() {
    // Optimistically increment, then back off if the handle turns out to be
    // write-locked or marked for deletion.
    if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
      return true;
    }
    refs -= kOneExternalRef;
    return false;
  }

  // Releases an external ref. Returns the new value (this is useful to
  // avoid an extra atomic read).
  inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; }

  // Take an external ref, assuming there is already one external ref
  // to the handle.
  void Ref() {
    // TODO(Guido) Is it okay to assume that the existing external reference
    // survives until this function returns?
    refs += kOneExternalRef;
  }

  // Tries to take an internal ref. Returns true iff it succeeds.
  inline bool TryInternalRef() {
    if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
      return true;
    }
    refs -= kOneInternalRef;
    return false;
  }

  inline void ReleaseInternalRef() { refs -= kOneInternalRef; }

  // Tries to take an exclusive ref. Returns true iff it succeeds.
  inline bool TryExclusiveRef() {
    uint32_t will_be_deleted = refs & WILL_BE_DELETED;
    uint32_t expected = will_be_deleted;
    return refs.compare_exchange_strong(expected,
                                        EXCLUSIVE_REF | will_be_deleted);
  }

  // Repeatedly tries to take an exclusive reference, but stops as soon
  // as an external reference is detected (in this case the wait would
  // presumably be too long).
  inline bool TrySpinExclusiveRef() {
    uint32_t expected = 0;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      if (expected & EXTERNAL_REFS) {
        return false;
      }
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = will_be_deleted;
    }
    return true;
  }

  inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }
  // The following functions are for upgrading and downgrading refs.
  // They guarantee atomicity, i.e., no exclusive refs to the handle
  // can be taken by a different thread during the conversion.

  inline void ExclusiveToInternalRef() {
    refs += kOneInternalRef;
    ReleaseExclusiveRef();
  }

  inline void ExclusiveToExternalRef() {
    refs += kOneExternalRef;
    ReleaseExclusiveRef();
  }

  // TODO(Guido) Do we want to bound the loop and prepare the
  // algorithms to react to a failure?
  inline void InternalToExclusiveRef() {
    uint32_t expected = kOneInternalRef;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = kOneInternalRef | will_be_deleted;
    }
  }

  inline void InternalToExternalRef() {
    refs += kOneExternalRef - kOneInternalRef;
  }

  // TODO(Guido) Same concern.
  inline void ExternalToExclusiveRef() {
    uint32_t expected = kOneExternalRef;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = kOneExternalRef | will_be_deleted;
    }
  }
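
  // For illustration, a typical reader and writer interaction with this API
  // might look as follows (a sketch, not code from this file):
  //
  //   // Reader path: pin a matching element on behalf of a caller.
  //   if (h->TryInternalRef()) {
  //     if (h->Matches(key, hash)) {
  //       h->InternalToExternalRef();  // Hand an external ref to the caller.
  //     } else {
  //       h->ReleaseInternalRef();
  //     }
  //   }
  //
  //   // Writer path: take exclusive ownership before mutating identity
  //   // members.
  //   if (h->TryExclusiveRef()) {
  //     // ... modify key_data, value, flags ...
  //     h->ReleaseExclusiveRef();
  //   }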
};  // struct ClockHandle

class ClockHandleTable {
@@ -252,32 +505,55 @@ class ClockHandleTable {
  // Returns a pointer to a visible element matching the key/hash, or
  // nullptr if not present.
  ClockHandle* Lookup(const Slice& key, uint32_t hash);

  // Inserts a copy of h into the hash table.
  // Returns a pointer to the inserted handle, or nullptr if no available
  // slot was found. If an existing visible element matching the
  // key/hash is already present in the hash table, the argument old
  // is set to point to it; otherwise, it's set to nullptr.
  // Returns an exclusive reference to h, and no references to old.
  ClockHandle* Insert(ClockHandle* h, ClockHandle** old);

  // Removes h from the hash table. The handle must already be off clock.
  void Remove(ClockHandle* h);
  // Extracts the element information from a handle (src), and assigns it
  // to a hash table slot (dst). Doesn't touch displacements and refs,
  // which are maintained by the hash table algorithm.
  void Assign(ClockHandle* dst, ClockHandle* src);

  template <typename T>
  void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
                           bool apply_if_will_be_deleted) {
    for (uint32_t i = index_begin; i < index_end; i++) {
      ClockHandle* h = &array_[i];
      if (h->TryExclusiveRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
          // Hand the exclusive ref over to func, which is now responsible
          // for releasing it.
          func(h);
        } else {
          h->ReleaseExclusiveRef();
        }
      }
    }
  }
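
  // For illustration, a caller could visit all live elements like this
  // (a sketch; GetTableSize() is an assumed accessor for 1 << length_bits_,
  // and note that func receives an exclusive ref that it must release):
  //
  //   table.ApplyToEntriesRange(
  //       [](ClockHandle* h) {
  //         // ... inspect h->key(), h->value ...
  //         h->ReleaseExclusiveRef();
  //       },
  //       0, table.GetTableSize(), /*apply_if_will_be_deleted=*/false);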
  template <typename T>
  void ConstApplyToEntriesRange(T func, uint32_t index_begin,
                                uint32_t index_end,
                                bool apply_if_will_be_deleted) const {
    for (uint32_t i = index_begin; i < index_end; i++) {
      ClockHandle* h = &array_[i];
      if (h->TryExclusiveRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
          func(h);
        }
        h->ReleaseExclusiveRef();
      }
    }
  }
@@ -295,28 +571,38 @@ class ClockHandleTable {
 private:
  friend class ClockCacheShard;

  int FindElement(const Slice& key, uint32_t hash, uint32_t& probe);

  int FindAvailableSlot(const Slice& key, uint32_t& probe);

  int FindElementOrAvailableSlot(const Slice& key, uint32_t hash,
                                 uint32_t& probe);

  // Returns the index of the first slot probed (hashing with
  // the given key) with a handle e such that match(e) is true.
  // Otherwise, if no match is found, returns -1.
  // At every step, the function first tests whether match(e) holds.
  // If it's false, it evaluates stop(e) to decide whether the
  // search should be aborted, and in the affirmative returns -1.
  // For every handle e probed except the last one, the function runs
  // update(e). We say a probe to a handle e is aborting if match(e) is
  // false and stop(e) is true. The argument probe is one more than the
  // last non-aborting probe during the call. This is so that the
  // variable can be used to keep track of progress across consecutive
  // calls to FindSlot.
  inline int FindSlot(const Slice& key, std::function<bool(ClockHandle*)> match,
                      std::function<bool(ClockHandle*)> stop,
                      std::function<void(ClockHandle*)> update,
                      uint32_t& probe);
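
  // As an illustration of how the callbacks compose, FindElement could be
  // expressed in terms of FindSlot roughly as follows (a sketch, not the
  // actual implementation):
  //
  //   uint32_t probe = 0;
  //   int slot = FindSlot(
  //       key,
  //       /*match=*/[&](ClockHandle* h) { return h->Matches(key, hash); },
  //       /*stop=*/[&](ClockHandle* h) { return h->IsEmpty(); },
  //       /*update=*/[](ClockHandle* h) {},
  //       probe);
  //
  // Stopping at an empty slot is safe: while an element is in the table,
  // every slot it probed before its own keeps a nonzero displacement count,
  // so no slot preceding it in its probe sequence can be empty.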
  // After a failed FindSlot call (i.e., with answer -1), this function
  // decrements all displacements, starting from the 0-th probe.
  void Rollback(const Slice& key, uint32_t probe);
  // Number of hash bits used for table index.
  // The size of the table is 1 << length_bits_.
  int length_bits_;

  // For faster computation of ModTableSize.
  const uint32_t length_bits_mask_;

  // Number of elements in the table.
@@ -345,10 +631,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Like Cache methods, but with an extra "hash" parameter.
  // Insert an item into the hash table and, if handle is null, make it
  // evictable by the clock algorithm. Older items are evicted as necessary.
  // If the cache is full and free_handle_on_fail is true, the item is deleted
  // and handle is set to nullptr.
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                Cache::DeleterFn deleter, Cache::Handle** handle,
                Cache::Priority priority) override;
@@ -393,13 +679,18 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
 private:
  friend class ClockCache;

  // Makes an element evictable by clock.
  void ClockOn(ClockHandle* h);

  // Makes an element non-evictable.
  void ClockOff(ClockHandle* h);

  // Requires an exclusive ref on h.
  void Evict(ClockHandle* h);

  // Free some space following strict clock policy until enough space
  // to hold (usage_ + charge) is freed or there are no evictable elements.
  void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);

  // Returns the charge of a single handle.
@@ -436,9 +727,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  // Memory size for entries residing in the cache.
  size_t usage_;

  // mutex_ protects the following state.
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.