Replace per-shard chained hash tables with open-addressing scheme (#10194)

Summary: In FastLRUCache, we replace the current chained per-shard hash table by an open-addressing hash table. In particular, this allows us to preallocate all handles. Because all handles are preallocated, this implementation doesn't support strict_capacity_limit = false (i.e., allowing insertions beyond the predefined capacity). This clashes with current assumptions of some tests, namely two tests in cache_test and the crash tests. We have disabled these for now. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10194 Test Plan: ``make -j24 check`` Reviewed By: pdillinger Differential Revision: D37296770 Pulled By: guidotag fbshipit-source-id: 232ff1b8260331d868ebf4e3e5d8ad709390b0ad
3 years ago · 3afed7408c
parent deff48bcef
commit 3afed7408c
4 changed files with 475 additions and 219 deletions
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@ -650,6 +650,10 @@ TEST_P(CacheTest, ReleaseWithoutErase) {
 }

 TEST_P(CacheTest, SetCapacity) {
+  if (GetParam() == kFast) {
+    ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments.");
+    return;
+  }
  // test1: increase capacity
  // lets create a cache with capacity 5,
  // then, insert 5 elements, then increase capacity
@ -698,6 +702,12 @@ TEST_P(CacheTest, SetCapacity) {
 }

 TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
+  if (GetParam() == kFast) {
+    ROCKSDB_GTEST_BYPASS(
+        "FastLRUCache doesn't support an unbounded number of inserts beyond "
+        "capacity.");
+    return;
+  }
  // test1: set the flag to false. Insert more keys than capacity. See if they
  // all go through.
  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
@ -749,6 +759,10 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
 }

 TEST_P(CacheTest, OverCapacity) {
+  if (GetParam() == kFast) {
+    ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments.");
+    return;
+  }
  size_t n = 10;

  // a LRUCache with n entries and one shard only
--- a/cache/fast_lru_cache.cc
+++ b/cache/fast_lru_cache.cc
@ -9,77 +9,191 @@

 #include "cache/fast_lru_cache.h"

+#include <math.h>
+
 #include <cassert>
 #include <cstdint>
 #include <cstdio>
+#include <functional>

 #include "monitoring/perf_context_imp.h"
 #include "monitoring/statistics.h"
 #include "port/lang.h"
 #include "util/distributed_mutex.h"
-
-#define KEY_LENGTH \
-  16  // TODO(guido) Make use of this symbol in other parts of the source code
-      // (e.g., cache_key.h, cache_test.cc, etc.)
+#include "util/hash.h"
+#include "util/random.h"

 namespace ROCKSDB_NAMESPACE {

 namespace fast_lru_cache {

-LRUHandleTable::LRUHandleTable(int hash_bits)
+namespace {
+// Returns x % 2^{bits}.
+inline uint32_t BinaryMod(uint32_t x, uint8_t bits) {
+  assert(bits <= 32);
+  return (x << (32 - bits)) >> (32 - bits);
+}
+}  // anonymous namespace
+
+LRUHandleTable::LRUHandleTable(uint8_t hash_bits)
    : length_bits_(hash_bits),
-      list_(new LRUHandle* [size_t{1} << length_bits_] {}) {}
+      occupancy_(0),
+      array_(new LRUHandle[size_t{1} << length_bits_]) {
+  assert(hash_bits <= 32);
+}

 LRUHandleTable::~LRUHandleTable() {
+  // TODO(Guido) If users still hold references to handles,
+  // those will become invalidated. And if we choose not to
+  // delete the data, it will become leaked.
  ApplyToEntriesRange(
      [](LRUHandle* h) {
+        // TODO(Guido) Remove the HasRefs() check?
        if (!h->HasRefs()) {
-          h->Free();
+          h->FreeData();
        }
      },
      0, uint32_t{1} << length_bits_);
 }

 LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
-  return *FindPointer(key, hash);
-}
-
-inline LRUHandle** LRUHandleTable::Head(uint32_t hash) {
-  return &list_[hash >> (32 - length_bits_)];
-}
-
-LRUHandle* LRUHandleTable::Insert(LRUHandle* h) {
-  LRUHandle** ptr = FindPointer(h->key(), h->hash);
-  LRUHandle* old = *ptr;
-  h->next_hash = (old == nullptr ? nullptr : old->next_hash);
-  *ptr = h;
-  return old;
+  int probe = 0;
+  int slot = FindVisibleElement(key, hash, probe, 0);
+  return (slot == -1) ? nullptr : &array_[slot];
+}
+
+LRUHandle* LRUHandleTable::Insert(LRUHandle* h, LRUHandle** old) {
+  int probe = 0;
+  int slot = FindVisibleElementOrAvailableSlot(h->key(), h->hash, probe,
+                                               1 /*displacement*/);
+  *old = nullptr;
+  if (slot == -1) {
+    return nullptr;
+  }
+
+  if (array_[slot].IsEmpty() || array_[slot].IsTombstone()) {
+    bool empty = array_[slot].IsEmpty();
+    Assign(slot, h);
+    LRUHandle* new_entry = &array_[slot];
+    if (empty) {
+      // This used to be an empty slot.
+      return new_entry;
+    }
+    // It used to be a tombstone, so there may already be a copy of the
+    // key in the table.
+    slot = FindVisibleElement(h->key(), h->hash, probe, 0 /*displacement*/);
+    if (slot == -1) {
+      // No existing copy of the key.
+      return new_entry;
+    }
+    *old = &array_[slot];
+    return new_entry;
+  } else {
+    // There is an existing copy of the key.
+    *old = &array_[slot];
+    // Find an available slot for the new element.
+    array_[slot].displacements++;
+    slot = FindAvailableSlot(h->key(), probe, 1 /*displacement*/);
+    if (slot == -1) {
+      // No available slots. Roll back displacements.
+      probe = 0;
+      slot = FindVisibleElement(h->key(), h->hash, probe, -1);
+      array_[slot].displacements--;
+      FindAvailableSlot(h->key(), probe, -1);
+      return nullptr;
+    }
+    Assign(slot, h);
+    return &array_[slot];
+  }
+}
+
+void LRUHandleTable::Remove(LRUHandle* h) {
+  assert(h->next == nullptr &&
+         h->prev == nullptr);  // Already off the LRU list.
+  int probe = 0;
+  FindSlot(
+      h->key(), [&h](LRUHandle* e) { return e == h; }, probe,
+      -1 /*displacement*/);
+  h->SetIsVisible(false);
+  h->SetIsElement(false);
+  occupancy_--;
+}
+
+void LRUHandleTable::Assign(int slot, LRUHandle* h) {
+  LRUHandle* dst = &array_[slot];
+  uint32_t disp = dst->displacements;
+  *dst = *h;
+  dst->displacements = disp;
+  dst->SetIsVisible(true);
+  dst->SetIsElement(true);
+  occupancy_++;
+}
+
+void LRUHandleTable::Exclude(LRUHandle* h) { h->SetIsVisible(false); }
+
+int LRUHandleTable::FindVisibleElement(const Slice& key, uint32_t hash,
+                                       int& probe, int displacement) {
+  return FindSlot(
+      key,
+      [&](LRUHandle* h) { return h->Matches(key, hash) && h->IsVisible(); },
+      probe, displacement);
+}
+
+int LRUHandleTable::FindAvailableSlot(const Slice& key, int& probe,
+                                      int displacement) {
+  return FindSlot(
+      key, [](LRUHandle* h) { return h->IsEmpty() || h->IsTombstone(); }, probe,
+      displacement);
+}
+
+int LRUHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key,
+                                                      uint32_t hash, int& probe,
+                                                      int displacement) {
+  return FindSlot(
+      key,
+      [&](LRUHandle* h) {
+        return h->IsEmpty() || h->IsTombstone() ||
+               (h->Matches(key, hash) && h->IsVisible());
+      },
+      probe, displacement);
 }

-LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) {
-  LRUHandle** ptr = FindPointer(key, hash);
-  LRUHandle* result = *ptr;
-  if (result != nullptr) {
-    *ptr = result->next_hash;
+inline int LRUHandleTable::FindSlot(const Slice& key,
+                                    std::function<bool(LRUHandle*)> cond,
+                                    int& probe, int displacement) {
+  uint32_t base =
+      BinaryMod(Hash(key.data(), key.size(), kProbingSeed1), length_bits_);
+  uint32_t increment = BinaryMod(
+      (Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1, length_bits_);
+  uint32_t current = BinaryMod(base + probe * increment, length_bits_);
+  while (true) {
+    LRUHandle* h = &array_[current];
+    probe++;
+    if (current == base && probe > 1) {
+      // We looped back.
+      return -1;
    }
-  return result;
-}
-
-LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) {
-  LRUHandle** ptr = &list_[hash >> (32 - length_bits_)];
-  while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
-    ptr = &(*ptr)->next_hash;
+    if (cond(h)) {
+      return current;
+    }
+    if (h->IsEmpty()) {
+      // We check emptiness after the condition, because
+      // the condition may be emptiness.
+      return -1;
+    }
+    h->displacements += displacement;
+    current = BinaryMod(current + increment, length_bits_);
  }
-  return ptr;
 }

 LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size,
                             bool strict_capacity_limit,
                             CacheMetadataChargePolicy metadata_charge_policy)
-    : capacity_(0),
+    : capacity_(capacity),
      strict_capacity_limit_(strict_capacity_limit),
      table_(
-          GetHashBits(capacity, estimated_value_size, metadata_charge_policy)),
+          CalcHashBits(capacity, estimated_value_size, metadata_charge_policy) +
+          static_cast<uint8_t>(ceil(log2(1.0 / kLoadFactor)))),
      usage_(0),
      lru_usage_(0) {
  set_metadata_charge_policy(metadata_charge_policy);
@ -87,29 +201,27 @@ LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size,
  lru_.next = &lru_;
  lru_.prev = &lru_;
  lru_low_pri_ = &lru_;
-  SetCapacity(capacity);
 }

 void LRUCacheShard::EraseUnRefEntries() {
-  autovector<LRUHandle*> last_reference_list;
+  autovector<LRUHandle> last_reference_list;
  {
    DMutexLock l(mutex_);
    while (lru_.next != &lru_) {
      LRUHandle* old = lru_.next;
      // LRU list contains only elements which can be evicted.
-      assert(old->InCache() && !old->HasRefs());
+      assert(old->IsVisible() && !old->HasRefs());
      LRU_Remove(old);
-      table_.Remove(old->key(), old->hash);
-      old->SetInCache(false);
+      table_.Remove(old);
      assert(usage_ >= old->total_charge);
      usage_ -= old->total_charge;
-      last_reference_list.push_back(old);
+      last_reference_list.push_back(*old);
    }
  }

  // Free the entries here outside of mutex for performance reasons.
-  for (auto entry : last_reference_list) {
-    entry->Free();
+  for (auto& h : last_reference_list) {
+    h.FreeData();
  }
 }

@ -148,57 +260,48 @@ void LRUCacheShard::ApplyToSomeEntries(
      index_begin, index_end);
 }

-void LRUCacheShard::LRU_Remove(LRUHandle* e) {
-  assert(e->next != nullptr);
-  assert(e->prev != nullptr);
-  e->next->prev = e->prev;
-  e->prev->next = e->next;
-  e->prev = e->next = nullptr;
-  assert(lru_usage_ >= e->total_charge);
-  lru_usage_ -= e->total_charge;
+void LRUCacheShard::LRU_Remove(LRUHandle* h) {
+  assert(h->next != nullptr);
+  assert(h->prev != nullptr);
+  h->next->prev = h->prev;
+  h->prev->next = h->next;
+  h->prev = h->next = nullptr;
+  assert(lru_usage_ >= h->total_charge);
+  lru_usage_ -= h->total_charge;
 }

-void LRUCacheShard::LRU_Insert(LRUHandle* e) {
-  assert(e->next == nullptr);
-  assert(e->prev == nullptr);
-  // Inset "e" to head of LRU list.
-  e->next = &lru_;
-  e->prev = lru_.prev;
-  e->prev->next = e;
-  e->next->prev = e;
-  lru_usage_ += e->total_charge;
+void LRUCacheShard::LRU_Insert(LRUHandle* h) {
+  assert(h->next == nullptr);
+  assert(h->prev == nullptr);
+  // Insert h to head of LRU list.
+  h->next = &lru_;
+  h->prev = lru_.prev;
+  h->prev->next = h;
+  h->next->prev = h;
+  lru_usage_ += h->total_charge;
 }

 void LRUCacheShard::EvictFromLRU(size_t charge,
-                                 autovector<LRUHandle*>* deleted) {
+                                 autovector<LRUHandle>* deleted) {
  while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
    LRUHandle* old = lru_.next;
    // LRU list contains only elements which can be evicted.
-    assert(old->InCache() && !old->HasRefs());
+    assert(old->IsVisible() && !old->HasRefs());
    LRU_Remove(old);
-    table_.Remove(old->key(), old->hash);
-    old->SetInCache(false);
+    table_.Remove(old);
    assert(usage_ >= old->total_charge);
    usage_ -= old->total_charge;
-    deleted->push_back(old);
+    deleted->push_back(*old);
  }
 }

-int LRUCacheShard::GetHashBits(
+uint8_t LRUCacheShard::CalcHashBits(
    size_t capacity, size_t estimated_value_size,
    CacheMetadataChargePolicy metadata_charge_policy) {
-  LRUHandle* e = reinterpret_cast<LRUHandle*>(
-      new char[sizeof(LRUHandle) - 1 + KEY_LENGTH]);
-  e->key_length = KEY_LENGTH;
-  e->deleter = nullptr;
-  e->refs = 0;
-  e->flags = 0;
-  e->refs = 0;
-
-  e->CalcTotalCharge(estimated_value_size, metadata_charge_policy);
-  size_t num_entries = capacity / e->total_charge;
-  e->Free();
-  int num_hash_bits = 0;
+  LRUHandle h;
+  h.CalcTotalCharge(estimated_value_size, metadata_charge_policy);
+  size_t num_entries = capacity / h.total_charge;
+  uint8_t num_hash_bits = 0;
  while (num_entries >>= 1) {
    ++num_hash_bits;
  }
@ -206,7 +309,8 @@ int LRUCacheShard::GetHashBits(
 }

 void LRUCacheShard::SetCapacity(size_t capacity) {
-  autovector<LRUHandle*> last_reference_list;
+  assert(false);  // Not supported. TODO(Guido) Support it?
+  autovector<LRUHandle> last_reference_list;
  {
    DMutexLock l(mutex_);
    capacity_ = capacity;
@ -214,8 +318,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
  }

  // Free the entries here outside of mutex for performance reasons.
-  for (auto entry : last_reference_list) {
-    entry->Free();
+  for (auto& h : last_reference_list) {
+    h.FreeData();
  }
 }

@ -224,83 +328,104 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
  strict_capacity_limit_ = strict_capacity_limit;
 }

-Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle,
-                                 bool free_handle_on_fail) {
+Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
+                             size_t charge, Cache::DeleterFn deleter,
+                             Cache::Handle** handle,
+                             Cache::Priority /*priority*/) {
+  if (key.size() != kCacheKeySize) {
+    return Status::NotSupported("FastLRUCache only supports key size " +
+                                std::to_string(kCacheKeySize) + "B");
+  }
+
+  LRUHandle tmp;
+  tmp.value = value;
+  tmp.deleter = deleter;
+  tmp.hash = hash;
+  tmp.CalcTotalCharge(charge, metadata_charge_policy_);
+  for (int i = 0; i < kCacheKeySize; i++) {
+    tmp.key_data[i] = key.data()[i];
+  }
+
  Status s = Status::OK();
-  autovector<LRUHandle*> last_reference_list;
+  autovector<LRUHandle> last_reference_list;
  {
    DMutexLock l(mutex_);

    // Free the space following strict LRU policy until enough space
    // is freed or the lru list is empty.
-    EvictFromLRU(e->total_charge, &last_reference_list);
-
-    if ((usage_ + e->total_charge) > capacity_ &&
-        (strict_capacity_limit_ || handle == nullptr)) {
-      e->SetInCache(false);
+    EvictFromLRU(tmp.total_charge, &last_reference_list);
+    if ((usage_ + tmp.total_charge > capacity_ &&
+         (strict_capacity_limit_ || handle == nullptr)) ||
+        table_.GetOccupancy() == size_t{1} << table_.GetLengthBits()) {
+      // Originally, when strict_capacity_limit_ == false and handle != nullptr
+      // (i.e., the user wants to immediately get a reference to the new
+      // handle), the insertion would proceed even if the total charge already
+      // exceeds capacity. We can't do this now, because we can't physically
+      // insert a new handle when the table is at maximum occupancy.
+      // TODO(Guido) Some tests (at least two from cache_test, as well as the
+      // stress tests) currently assume the old behavior.
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry inserted
        // into cache and get evicted immediately.
-        last_reference_list.push_back(e);
+        last_reference_list.push_back(tmp);
      } else {
-        if (free_handle_on_fail) {
-          delete[] reinterpret_cast<char*>(e);
-          *handle = nullptr;
-        }
        s = Status::Incomplete("Insert failed due to LRU cache being full.");
      }
    } else {
      // Insert into the cache. Note that the cache might get larger than its
      // capacity if not enough space was freed up.
-      LRUHandle* old = table_.Insert(e);
-      usage_ += e->total_charge;
+      LRUHandle* old;
+      LRUHandle* h = table_.Insert(&tmp, &old);
+      assert(h != nullptr);  // Insertions should never fail.
+      usage_ += h->total_charge;
      if (old != nullptr) {
        s = Status::OkOverwritten();
-        assert(old->InCache());
-        old->SetInCache(false);
+        assert(old->IsVisible());
+        table_.Exclude(old);
        if (!old->HasRefs()) {
          // old is on LRU because it's in cache and its reference count is 0.
          LRU_Remove(old);
+          table_.Remove(old);
          assert(usage_ >= old->total_charge);
          usage_ -= old->total_charge;
-          last_reference_list.push_back(old);
+          last_reference_list.push_back(*old);
        }
      }
      if (handle == nullptr) {
-        LRU_Insert(e);
+        LRU_Insert(h);
      } else {
        // If caller already holds a ref, no need to take one here.
-        if (!e->HasRefs()) {
-          e->Ref();
+        if (!h->HasRefs()) {
+          h->Ref();
        }
-        *handle = reinterpret_cast<Cache::Handle*>(e);
+        *handle = reinterpret_cast<Cache::Handle*>(h);
      }
    }
  }

  // Free the entries here outside of mutex for performance reasons.
-  for (auto entry : last_reference_list) {
-    entry->Free();
+  for (auto& h : last_reference_list) {
+    h.FreeData();
  }

  return s;
 }

 Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
-  LRUHandle* e = nullptr;
+  LRUHandle* h = nullptr;
  {
    DMutexLock l(mutex_);
-    e = table_.Lookup(key, hash);
-    if (e != nullptr) {
-      assert(e->InCache());
-      if (!e->HasRefs()) {
+    h = table_.Lookup(key, hash);
+    if (h != nullptr) {
+      assert(h->IsVisible());
+      if (!h->HasRefs()) {
        // The entry is in LRU since it's in hash and has no external references
-        LRU_Remove(e);
+        LRU_Remove(h);
      }
-      e->Ref();
+      h->Ref();
    }
  }
-  return reinterpret_cast<Cache::Handle*>(e);
+  return reinterpret_cast<Cache::Handle*>(h);
 }

 bool LRUCacheShard::Ref(Cache::Handle* h) {
@ -316,91 +441,64 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
  if (handle == nullptr) {
    return false;
  }
-  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
+  LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
+  LRUHandle copy;
  bool last_reference = false;
  {
    DMutexLock l(mutex_);
-    last_reference = e->Unref();
-    if (last_reference && e->InCache()) {
+    last_reference = h->Unref();
+    if (last_reference && h->IsVisible()) {
      // The item is still in cache, and nobody else holds a reference to it.
      if (usage_ > capacity_ || erase_if_last_ref) {
        // The LRU list must be empty since the cache is full.
        assert(lru_.next == &lru_ || erase_if_last_ref);
        // Take this opportunity and remove the item.
-        table_.Remove(e->key(), e->hash);
-        e->SetInCache(false);
+        table_.Remove(h);
      } else {
        // Put the item back on the LRU list, and don't free it.
-        LRU_Insert(e);
+        LRU_Insert(h);
        last_reference = false;
      }
    }
    // If it was the last reference, then decrement the cache usage.
    if (last_reference) {
-      assert(usage_ >= e->total_charge);
-      usage_ -= e->total_charge;
+      assert(usage_ >= h->total_charge);
+      usage_ -= h->total_charge;
+      copy = *h;
    }
  }

  // Free the entry here outside of mutex for performance reasons.
  if (last_reference) {
-    e->Free();
+    copy.FreeData();
  }
  return last_reference;
 }

-Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
-                             size_t charge, Cache::DeleterFn deleter,
-                             Cache::Handle** handle,
-                             Cache::Priority /*priority*/) {
-  if (key.size() != KEY_LENGTH) {
-    return Status::NotSupported("FastLRUCache only supports key size " +
-                                std::to_string(KEY_LENGTH) + "B");
-  }
-
-  // Allocate the memory here outside of the mutex.
-  // If the cache is full, we'll have to release it.
-  // It shouldn't happen very often though.
-  LRUHandle* e = reinterpret_cast<LRUHandle*>(
-      new char[sizeof(LRUHandle) - 1 + key.size()]);
-
-  e->value = value;
-  e->flags = 0;
-  e->deleter = deleter;
-  e->key_length = key.size();
-  e->hash = hash;
-  e->refs = 0;
-  e->next = e->prev = nullptr;
-  e->SetInCache(true);
-  e->CalcTotalCharge(charge, metadata_charge_policy_);
-  memcpy(e->key_data, key.data(), key.size());
-
-  return InsertItem(e, handle, /* free_handle_on_fail */ true);
-}
-
 void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
-  LRUHandle* e;
+  LRUHandle copy;
  bool last_reference = false;
  {
    DMutexLock l(mutex_);
-    e = table_.Remove(key, hash);
-    if (e != nullptr) {
-      assert(e->InCache());
-      e->SetInCache(false);
-      if (!e->HasRefs()) {
-        // The entry is in LRU since it's in hash and has no external references
-        LRU_Remove(e);
-        assert(usage_ >= e->total_charge);
-        usage_ -= e->total_charge;
+    LRUHandle* h = table_.Lookup(key, hash);
+    if (h != nullptr) {
+      table_.Exclude(h);
+      if (!h->HasRefs()) {
+        // The entry is in LRU since it's in cache and has no external
+        // references
+        LRU_Remove(h);
+        table_.Remove(h);
+        assert(usage_ >= h->total_charge);
+        usage_ -= h->total_charge;
        last_reference = true;
+        copy = *h;
      }
    }
  }
-
  // Free the entry here outside of mutex for performance reasons.
  // last_reference will only be true if e != nullptr.
  if (last_reference) {
-    e->Free();
+    copy.FreeData();
  }
 }

--- a/cache/fast_lru_cache.h
+++ b/cache/fast_lru_cache.h
@ -8,9 +8,11 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once

+#include <array>
 #include <memory>
 #include <string>

+#include "cache/cache_key.h"
 #include "cache/sharded_cache.h"
 #include "port/lang.h"
 #include "port/malloc.h"
@ -22,31 +24,115 @@
 namespace ROCKSDB_NAMESPACE {
 namespace fast_lru_cache {

+// LRU cache implementation using an open-address hash table.
+
+// Every slot in the hash table is an LRUHandle. Because handles can be
+// referenced externally, we can't discard them immediately once they are
+// deleted (via a delete or an LRU eviction) or replaced by a new version
+// (via an insert of the same key). The state of an element is defined by
+// the following two properties:
+// (R) Referenced: An element can be referenced externally (refs > 0), or not.
+//    Importantly, an element can be evicted if and only if it's not
+//    referenced. In particular, when an element becomes referenced, it's
+//    temporarily taken out of the LRU list until all references to it
+//    are dropped.
+// (V) Visible: An element can be visible for lookups (IS_VISIBLE set), or not.
+//    Initially, every element is visible. An element that is not visible is
+//    called a ghost.
+// These properties induce 4 different states, with transitions defined as
+// follows:
+// - V --> not V: When a visible element is deleted or replaced by a new
+//    version.
+// - Not V --> V: This cannot happen. A ghost remains in that state until it's
+//    not referenced any more, at which point it's ready to be removed from the
+//    hash table. (A ghost simply waits to transition to the afterlife---it will
+//    never be visible again.)
+// - R --> not R: When all references to an element are dropped.
+// - Not R --> R: When an unreferenced element becomes referenced. This can only
+//    happen if the element is V, since references to an element can only be
+//    created when it's visible.
+
+// Internally, the cache uses an open-addressed hash table to index the handles.
+// We use tombstone counters to keep track of displacements.
+// Because of the tombstones and the two possible visibility states of an
+// element, the table slots can be in 4 different states:
+// 1. Visible element (IS_ELEMENT set and IS_VISIBLE set): The slot contains a
+//    key-value element.
+// 2. Ghost element (IS_ELEMENT set and IS_VISIBLE unset): The slot contains an
+//    element that has been removed, but it's still referenced. It's invisible
+//    to lookups.
+// 3. Tombstone (IS_ELEMENT unset and displacements > 0): The slot contains a
+//    tombstone.
+// 4. Empty (IS_ELEMENT unset and displacements == 0): The slot is unused.
+//    A slot that is an element can further have IS_VISIBLE set or not.
+// When a ghost is removed from the table, it can either transition to being a
+// tombstone or an empty slot, depending on the number of displacements of the
+// slot. In any case, the slot becomes available. When a handle is inserted
+// into that slot, it becomes a visible element again.
+
+constexpr uint8_t kCacheKeySize =
+    static_cast<uint8_t>(sizeof(ROCKSDB_NAMESPACE::CacheKey));
+
+// The load factor p is a real number in (0, 1) such that at all
+// times at most a fraction p of all slots, without counting tombstones,
+// are occupied by elements. This means that the probability that a
+// random probe hits an occupied slot is at most p, and thus (ignoring
+// tombstones and clustering) at most 1/(1-p) probes are required on
+// average to reach an empty slot. We use p = 70%, so at most about 3.3
+// probes are needed on average.
+// Because the size of the hash table is always rounded up to the next
+// power of 2, p is really an upper bound on the actual load factor---the
+// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
+// but bear in mind that slots only hold metadata, not actual values.
+// Since space cost is dominated by the values (the LSM blocks),
+// overprovisioning the table with metadata only increases the total cache space
+// usage by a tiny fraction.
+constexpr double kLoadFactor = 0.7;
+
+// Arbitrary seeds.
+constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
+constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
+
 // An experimental (under development!) alternative to LRUCache

 struct LRUHandle {
  void* value;
  Cache::DeleterFn deleter;
-  LRUHandle* next_hash;
  LRUHandle* next;
  LRUHandle* prev;
  size_t total_charge;  // TODO(opt): Only allow uint32_t?
-  size_t key_length;
  // The hash of key(). Used for fast sharding and comparisons.
  uint32_t hash;
  // The number of external refs to this entry. The cache itself is not counted.
  uint32_t refs;

  enum Flags : uint8_t {
-    // Whether this entry is referenced by the hash table.
-    IN_CACHE = (1 << 0),
+    // Whether the handle is visible to Lookups.
+    IS_VISIBLE = (1 << 0),
+    // Whether the slot is in use by an element.
+    IS_ELEMENT = (1 << 1),
  };
  uint8_t flags;

-  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
-  char key_data[1];
+  // The number of elements that hash to this slot or a lower one,
+  // but wind up in a higher slot.
+  uint32_t displacements;
+
+  std::array<char, kCacheKeySize> key_data;
+
+  LRUHandle() {
+    value = nullptr;
+    deleter = nullptr;
+    next = nullptr;
+    prev = nullptr;
+    total_charge = 0;
+    hash = 0;
+    refs = 0;
+    flags = 0;
+    displacements = 0;
+    key_data.fill(0);
+  }

-  Slice key() const { return Slice(key_data, key_length); }
+  Slice key() const { return Slice(key_data.data(), kCacheKeySize); }

  // Increase the reference count by 1.
  void Ref() { refs++; }
@ -61,22 +147,31 @@ struct LRUHandle {
  // Return true if there are external refs, false otherwise.
  bool HasRefs() const { return refs > 0; }

-  bool InCache() const { return flags & IN_CACHE; }
+  bool IsVisible() const { return flags & IS_VISIBLE; }

-  void SetInCache(bool in_cache) {
-    if (in_cache) {
-      flags |= IN_CACHE;
+  void SetIsVisible(bool is_visible) {
+    if (is_visible) {
+      flags |= IS_VISIBLE;
    } else {
-      flags &= ~IN_CACHE;
+      flags &= ~IS_VISIBLE;
    }
  }

-  void Free() {
+  bool IsElement() const { return flags & IS_ELEMENT; }
+
+  void SetIsElement(bool is_element) {
+    if (is_element) {
+      flags |= IS_ELEMENT;
+    } else {
+      flags &= ~IS_ELEMENT;
+    }
+  }
+
+  void FreeData() {
    assert(refs == 0);
    if (deleter) {
      (*deleter)(key(), value);
    }
-    delete[] reinterpret_cast<char*>(this);
  }

  // Calculate the memory usage by metadata.
@ -85,13 +180,22 @@ struct LRUHandle {
    if (metadata_charge_policy != kFullChargeCacheMetadata) {
      return 0;
    } else {
-#ifdef ROCKSDB_MALLOC_USABLE_SIZE
-      return malloc_usable_size(
-          const_cast<void*>(static_cast<const void*>(this)));
-#else
-      // This is the size that is used when a new handle is created.
-      return sizeof(LRUHandle) - 1 + key_length;
-#endif
+      // #ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      //       return malloc_usable_size(
+      //           const_cast<void*>(static_cast<const void*>(this)));
+      // #else
+      // TODO(Guido) malloc_usable_size only works when we call it on
+      // a pointer allocated with malloc. Because our handles are all
+      // allocated in a single shot as an array, the user can't call
+      // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
+      // pointer returned by the cache. Moreover, malloc_usable_size
+      // expects a heap-allocated handle, but sometimes in our code we
+      // wish to pass a stack-allocated handle (this is only a performance
+      // concern).
+      // What is the right way to compute metadata charges with pre-allocated
+      // handles?
+      return sizeof(LRUHandle);
+      // #endif
    }
  }

@ -106,8 +210,23 @@ struct LRUHandle {
    assert(total_charge >= meta_charge);
    return total_charge - meta_charge;
  }
+
+  inline bool IsEmpty() {
+    return !this->IsElement() && this->displacements == 0;
+  }
+
+  inline bool IsTombstone() {
+    return !this->IsElement() && this->displacements > 0;
+  }
+
+  inline bool Matches(const Slice& some_key, uint32_t some_hash) {
+    return this->IsElement() && this->hash == some_hash &&
+           this->key() == some_key;
+  }
 };

+// TODO(Guido) Update the following comment.
+
 // We provide our own simple hash table since it removes a whole bunch
 // of porting hacks and is also faster than some of the built-in hash
 // table implementations in some of the compiler/runtime combinations
@ -115,45 +234,72 @@ struct LRUHandle {
 // 4.4.3's builtin hashtable.
 class LRUHandleTable {
 public:
-  explicit LRUHandleTable(int hash_bits);
+  explicit LRUHandleTable(uint8_t hash_bits);
  ~LRUHandleTable();

+  // Returns a pointer to a visible element matching the key/hash, or
+  // nullptr if not present.
  LRUHandle* Lookup(const Slice& key, uint32_t hash);
-  LRUHandle* Insert(LRUHandle* h);
-  LRUHandle* Remove(const Slice& key, uint32_t hash);
+
+  // Inserts a copy of h into the hash table.
+  // Returns a pointer to the inserted handle, or nullptr if no
+  // available slot was found. If an existing visible element matching
+  // the key/hash is already present in the hash table, the argument old
+  // is set to point to it; otherwise, it's set to nullptr.
+  LRUHandle* Insert(LRUHandle* h, LRUHandle** old);
+
+  // Removes h from the hash table. The handle must already be off
+  // the LRU list.
+  void Remove(LRUHandle* h);
+
+  // Turns a visible element h into a ghost (i.e., not visible).
+  void Exclude(LRUHandle* h);
+
+  // Assigns a copy of h to the given slot.
+  void Assign(int slot, LRUHandle* h);

  template <typename T>
  void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) {
    for (uint32_t i = index_begin; i < index_end; i++) {
-      LRUHandle* h = list_[i];
-      while (h != nullptr) {
-        auto n = h->next_hash;
-        assert(h->InCache());
+      LRUHandle* h = &array_[i];
+      if (h->IsVisible()) {
        func(h);
-        h = n;
      }
    }
  }

-  int GetLengthBits() const { return length_bits_; }
+  uint8_t GetLengthBits() const { return length_bits_; }

-  // Return the address of the head of the chain in the bucket given
-  // by the hash.
-  inline LRUHandle** Head(uint32_t hash);
+  uint32_t GetOccupancy() const { return occupancy_; }

 private:
-  // Return a pointer to slot that points to a cache entry that
-  // matches key/hash.  If there is no such cache entry, return a
-  // pointer to the trailing slot in the corresponding linked list.
-  LRUHandle** FindPointer(const Slice& key, uint32_t hash);
-
-  // Number of hash bits (upper because lower bits used for sharding)
-  // used for table index. Length == 1 << length_bits_
-  int length_bits_;
-
-  // The table consists of an array of buckets where each bucket is
-  // a linked list of cache entries that hash into the bucket.
-  std::unique_ptr<LRUHandle*[]> list_;
+  int FindVisibleElement(const Slice& key, uint32_t hash, int& probe,
+                         int displacement);
+
+  int FindAvailableSlot(const Slice& key, int& probe, int displacement);
+
+  int FindVisibleElementOrAvailableSlot(const Slice& key, uint32_t hash,
+                                        int& probe, int displacement);
+
+  // Returns the index of the first slot probed (hashing with
+  // the given key) with a handle e such that cond(e) is true.
+  // Otherwise, if no match is found, returns -1.
+  // For every handle e probed except the final slot, updates
+  // e->displacements += displacement.
+  // The argument probe is modified such that consecutive calls
+  // to FindSlot continue probing right after where the previous
+  // call left off.
+  int FindSlot(const Slice& key, std::function<bool(LRUHandle*)> cond,
+               int& probe, int displacement);
+
+  // Number of hash bits used for table index.
+  // The size of the table is 1 << length_bits_.
+  uint8_t length_bits_;
+
+  // Number of elements in the table.
+  uint32_t occupancy_;
+
+  std::unique_ptr<LRUHandle[]> array_;
 };

 // A single shard of sharded cache.
@ -173,6 +319,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Like Cache methods, but with an extra "hash" parameter.
+  // Insert an item into the hash table and, if handle is null, insert into
+  // the LRU list. Older items are evicted as necessary. If the cache is full
+  // and free_handle_on_fail is true, the item is deleted and handle is set to
+  // nullptr.
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                Cache::DeleterFn deleter, Cache::Handle** handle,
                Cache::Priority priority) override;
@ -217,13 +367,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {

 private:
  friend class LRUCache;
-  // Insert an item into the hash table and, if handle is null, insert into
-  // the LRU list. Older items are evicted as necessary. If the cache is full
-  // and free_handle_on_fail is true, the item is deleted and handle is set to
-  // nullptr.
-  Status InsertItem(LRUHandle* item, Cache::Handle** handle,
-                    bool free_handle_on_fail);
-
  void LRU_Remove(LRUHandle* e);
  void LRU_Insert(LRUHandle* e);

@ -231,11 +374,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  // to hold (usage_ + charge) is freed or the lru list is empty
  // This function is not thread safe - it needs to be executed while
  // holding the mutex_.
-  void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
+  void EvictFromLRU(size_t charge, autovector<LRUHandle>* deleted);

-  // Returns the number of bits used to hash an element in the per-shard
+  // Returns the number of bits used to hash an element in the hash
  // table.
-  static int GetHashBits(size_t capacity, size_t estimated_value_size,
+  static uint8_t CalcHashBits(size_t capacity, size_t estimated_value_size,
                              CacheMetadataChargePolicy metadata_charge_policy);

  // Initialized before use.
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -114,7 +114,8 @@ default_params = {
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "mock_direct_io": False,
-    "cache_type": lambda: random.choice(["fast_lru_cache", "lru_cache"]),   # clock_cache is broken
+    "cache_type": "lru_cache",  # clock_cache is broken
+                                # fast_lru_cache is currently incompatible with stress tests, because they use strict_capacity_limit = false
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    # 999 -> use Bloom API