diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 663bff953..dd36a5f06 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -441,6 +441,8 @@ class CacheBench { uint64_t total_key_size = 0; uint64_t total_charge = 0; uint64_t total_entry_count = 0; + uint64_t table_occupancy = 0; + uint64_t table_size = 0; std::set deleters; StopWatchNano timer(clock); @@ -456,6 +458,9 @@ class CacheBench { std::ostringstream ostr; ostr << "Most recent cache entry stats:\n" << "Number of entries: " << total_entry_count << "\n" + << "Table occupancy: " << table_occupancy << " / " + << table_size << " = " + << (100.0 * table_occupancy / table_size) << "%\n" << "Total charge: " << BytesToHumanString(total_charge) << "\n" << "Average key size: " << (1.0 * total_key_size / total_entry_count) << "\n" @@ -492,6 +497,8 @@ class CacheBench { Cache::ApplyToAllEntriesOptions opts; opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount(); + table_size = shared->GetCacheBench()->cache_->GetTableAddressCount(); stats_hist->Add(timer.ElapsedNanos() / 1000); } } diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 81a9d412c..14b6e44d9 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -106,6 +106,8 @@ class CacheTest : public testing::TestWithParam { std::shared_ptr cache_; std::shared_ptr cache2_; + size_t estimated_value_size_ = 1; + CacheTest() : cache_(NewCache(kCacheSize, kNumShardBits, false)), cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) { @@ -122,12 +124,12 @@ class CacheTest : public testing::TestWithParam { } if (type == kClock) { return ExperimentalNewClockCache( - capacity, 1 /*estimated_value_size*/, -1 /*num_shard_bits*/, + capacity, estimated_value_size_, -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); } if (type == kFast) { return NewFastLRUCache( - capacity, 1 /*estimated_value_size*/, -1 /*num_shard_bits*/, + capacity, estimated_value_size_, -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); } return nullptr; @@ -239,7 +241,10 @@ TEST_P(CacheTest, UsageTest) { auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(0, precise_cache->GetUsage()); + size_t baseline_meta_usage = precise_cache->GetUsage(); + if (type != kClock) { + ASSERT_EQ(0, baseline_meta_usage); + } size_t usage = 0; char value[10] = "abcdef"; @@ -258,13 +263,17 @@ TEST_P(CacheTest, UsageTest) { kv_size, DumbDeleter)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); - ASSERT_LT(usage, precise_cache->GetUsage()); + if (type == kClock) { + ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage()); + } else { + ASSERT_LT(usage, precise_cache->GetUsage()); + } } cache->EraseUnRefEntries(); precise_cache->EraseUnRefEntries(); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(0, precise_cache->GetUsage()); + ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); // make sure the cache will be overloaded for (size_t i = 1; i < kCapacity; ++i) { @@ -284,7 +293,15 @@ TEST_P(CacheTest, UsageTest) { ASSERT_GT(kCapacity, cache->GetUsage()); ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); - ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); + if (type 
!= kClock) { + ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); + } else { + // estimated value size of 1 is weird for clock cache, because + // almost all of the capacity will be used for metadata, and due to only + // using power of 2 table sizes, we might hit strict occupancy limit + // before hitting capacity limit. + ASSERT_LT(kCapacity * 0.80, precise_cache->GetUsage()); + } } // TODO: This test takes longer than expected on ClockCache. This is @@ -301,6 +318,10 @@ TEST_P(CacheTest, PinnedUsageTest) { const size_t kCapacity = 200000; auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); + size_t baseline_meta_usage = precise_cache->GetUsage(); + if (type != kClock) { + ASSERT_EQ(0, baseline_meta_usage); + } size_t pinned_usage = 0; char value[10] = "abcdef"; @@ -390,7 +411,7 @@ TEST_P(CacheTest, PinnedUsageTest) { cache->EraseUnRefEntries(); precise_cache->EraseUnRefEntries(); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(0, precise_cache->GetUsage()); + ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); } TEST_P(CacheTest, HitAndMiss) { @@ -407,16 +428,30 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(300)); Insert(100, 102); - ASSERT_EQ(102, Lookup(100)); + if (GetParam() == kClock) { + // ClockCache usually doesn't overwrite on Insert + ASSERT_EQ(101, Lookup(100)); + } else { + ASSERT_EQ(102, Lookup(100)); + } ASSERT_EQ(201, Lookup(200)); ASSERT_EQ(-1, Lookup(300)); ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); + if (GetParam() == kClock) { + ASSERT_EQ(102, deleted_values_[0]); + } else { + ASSERT_EQ(101, deleted_values_[0]); + } } TEST_P(CacheTest, InsertSameKey) { + if (GetParam() == kClock) { + ROCKSDB_GTEST_BYPASS( + "ClockCache doesn't guarantee Insert overwrite same key."); + return; + } Insert(1, 1); Insert(1, 2); ASSERT_EQ(2, Lookup(1)); @@ -442,6 +477,11 @@ TEST_P(CacheTest, Erase) { } TEST_P(CacheTest, EntriesArePinned) { + if (GetParam() == kClock) { + ROCKSDB_GTEST_BYPASS( + "ClockCache doesn't guarantee Insert overwrite same key."); + return; + } Insert(100, 101); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); @@ -474,7 +514,6 @@ TEST_P(CacheTest, EntriesArePinned) { TEST_P(CacheTest, EvictionPolicy) { Insert(100, 101); Insert(200, 201); - // Frequently used entry must be kept around for (int i = 0; i < 2 * kCacheSize; i++) { Insert(1000+i, 2000+i); @@ -503,6 +542,12 @@ TEST_P(CacheTest, ExternalRefPinsEntries) { for (int j = 0; j < 2 * kCacheSize + 100; j++) { Insert(1000 + j, 2000 + j); } + // Clock cache is even more stateful and needs more churn to evict + if (GetParam() == kClock) { + for (int j = 0; j < kCacheSize; j++) { + Insert(11000 + j, 11000 + j); + } + } if (i < 2) { ASSERT_EQ(101, Lookup(100)); } @@ -810,11 +855,6 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) { } TEST_P(CacheTest, OverCapacity) { - auto type = GetParam(); - if (type == kClock) { - ROCKSDB_GTEST_BYPASS("Requires LRU eviction policy."); - return; - } size_t n = 10; // a LRUCache with n entries and one shard only @@ -842,23 +882,34 @@ TEST_P(CacheTest, OverCapacity) { for (int i = 0; i < static_cast(n + 1); i++) { cache->Release(handles[i]); } - // Make sure eviction is triggered. 
- cache->SetCapacity(n); - // cache is under capacity now since elements were released - ASSERT_EQ(n, cache->GetUsage()); + if (GetParam() == kClock) { + // Make sure eviction is triggered. + ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0])); - // element 0 is evicted and the rest is there - // This is consistent with the LRU policy since the element 0 - // was released first - for (int i = 0; i < static_cast(n + 1); i++) { - std::string key = EncodeKey(i + 1); - auto h = cache->Lookup(key); - if (h) { - ASSERT_NE(static_cast(i), 0U); - cache->Release(h); - } else { - ASSERT_EQ(static_cast(i), 0U); + // cache is under capacity now since elements were released + ASSERT_GE(n, cache->GetUsage()); + + // clean up + cache->Release(handles[0]); + } else { + // LRUCache checks for over-capacity in Release. + + // cache is exactly at capacity now with minimal eviction + ASSERT_EQ(n, cache->GetUsage()); + + // element 0 is evicted and the rest is there + // This is consistent with the LRU policy since the element 0 + // was released first + for (int i = 0; i < static_cast(n + 1); i++) { + std::string key = EncodeKey(i + 1); + auto h = cache->Lookup(key); + if (h) { + ASSERT_NE(static_cast(i), 0U); + cache->Release(h); + } else { + ASSERT_EQ(static_cast(i), 0U); + } } } } @@ -966,19 +1017,30 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { } TEST_P(CacheTest, DefaultShardBits) { - // test1: set the flag to false. Insert more keys than capacity. See if they - // all go through. - std::shared_ptr cache = NewCache(16 * 1024L * 1024L); + // Prevent excessive allocation (to save time & space) + estimated_value_size_ = 100000; + // Implementations use different minimum shard sizes + size_t min_shard_size = (GetParam() == kClock ? 32U * 1024U : 512U) * 1024U; + + std::shared_ptr cache = NewCache(32U * min_shard_size); ShardedCache* sc = dynamic_cast(cache.get()); ASSERT_EQ(5, sc->GetNumShardBits()); - cache = NewLRUCache(511 * 1024L, -1, true); + cache = NewCache(min_shard_size / 1000U * 999U); sc = dynamic_cast(cache.get()); ASSERT_EQ(0, sc->GetNumShardBits()); - cache = NewLRUCache(1024L * 1024L * 1024L, -1, true); + cache = NewCache(3U * 1024U * 1024U * 1024U); sc = dynamic_cast(cache.get()); + // current maximum of 6 ASSERT_EQ(6, sc->GetNumShardBits()); + + if constexpr (sizeof(size_t) > 4) { + cache = NewCache(128U * min_shard_size); + sc = dynamic_cast(cache.get()); + // current maximum of 6 + ASSERT_EQ(6, sc->GetNumShardBits()); + } } TEST_P(CacheTest, GetChargeAndDeleter) { diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index edc63ae4e..3bff5feee 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -10,8 +10,6 @@ #include "cache/clock_cache.h" #include -#include -#include #include #include "monitoring/perf_context_imp.h" @@ -25,359 +23,937 @@ namespace ROCKSDB_NAMESPACE { namespace clock_cache { -ClockHandleTable::ClockHandleTable(size_t capacity, int hash_bits) +static_assert(sizeof(ClockHandle) == 64U, + "Expecting size / alignment with common cache line size"); + +ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata) : length_bits_(hash_bits), - length_bits_mask_((uint32_t{1} << length_bits_) - 1), - occupancy_limit_(static_cast((uint32_t{1} << length_bits_) * + length_bits_mask_(Lower32of64((uint64_t{1} << length_bits_) - 1)), + occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * kStrictLoadFactor)), - capacity_(capacity), - array_(new ClockHandle[size_t{1} << length_bits_]), - clock_pointer_(0), - 
occupancy_(0), - usage_(0) { - assert(hash_bits <= 32); + array_(new ClockHandle[size_t{1} << length_bits_]) { + assert(hash_bits <= 32); // FIXME: ensure no overlap with sharding bits + if (initial_charge_metadata) { + usage_ += size_t{GetTableSize()} * sizeof(ClockHandle); + } } ClockHandleTable::~ClockHandleTable() { - // Assumes there are no references (of any type) to any slot in the table. + // Assumes there are no references or active operations on any slot/element + // in the table. for (uint32_t i = 0; i < GetTableSize(); i++) { - ClockHandle* h = &array_[i]; - if (h->IsElement()) { - h->FreeData(); + ClockHandle& h = array_[i]; + switch (h.meta >> ClockHandle::kStateShift) { + case ClockHandle::kStateEmpty: + // noop + break; + case ClockHandle::kStateInvisible: // rare but possible + case ClockHandle::kStateVisible: + h.FreeData(); +#ifndef NDEBUG + Rollback(h.hash, &h); + usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); + occupancy_.fetch_sub(1U, std::memory_order_relaxed); +#endif + break; + // otherwise + default: + assert(false); + break; } } + +#ifndef NDEBUG + for (uint32_t i = 0; i < GetTableSize(); i++) { + assert(array_[i].displacements.load() == 0); + } +#endif + + assert(usage_.load() == 0 || + usage_.load() == size_t{GetTableSize()} * sizeof(ClockHandle)); + assert(occupancy_ == 0); } -ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) { +// If an entry doesn't receive clock updates but is repeatedly referenced & +// released, the acquire and release counters could overflow without some +// intervention. This is that intervention, which should be inexpensive +// because it only incurs a simple, very predictable check. (Applying a bit +// mask in addition to an increment to every Release likely would be +// relatively expensive, because it's an extra atomic update.) +// +// We do have to assume that we never have many millions of simultaneous +// references to a cache handle, because we cannot represent so many +// references with the difference in counters, masked to the number of +// counter bits. Similarly, we assume there aren't millions of threads +// holding transient references (which might be "undone" rather than +// released by the way). +// +// Consider these possible states for each counter: +// low: less than kMaxCountdown +// medium: kMaxCountdown to half way to overflow + kMaxCountdown +// high: half way to overflow + kMaxCountdown, or greater +// +// And these possible states for the combination of counters: +// acquire / release +// ------- ------- +// low low - Normal / common, with caveats (see below) +// medium low - Can happen while holding some refs +// high low - Violates assumptions (too many refs) +// low medium - Violates assumptions (refs underflow, etc.) +// medium medium - Normal (very read heavy cache) +// high medium - Can happen while holding some refs +// low high - This function is supposed to prevent +// medium high - Violates assumptions (refs underflow, etc.) +// high high - Needs CorrectNearOverflow +// +// Basically, this function detects (high, high) state (inferred from +// release alone being high) and bumps it back down to (medium, medium) +// state with the same refcount and the same logical countdown counter +// (everything > kMaxCountdown is logically the same). Note that bumping +// down to (low, low) would modify the countdown counter, so is "reserved" +// in a sense. 
+// +// If near-overflow correction is triggered here, there's no guarantee +// that another thread hasn't freed the entry and replaced it with another. +// Therefore, it must be the case that the correction does not affect +// entries unless they are very old (many millions of acquire-release cycles). +// (Our bit manipulation is indeed idempotent and only affects entries in +// exceptional cases.) We assume a pre-empted thread will not stall that long. +// If it did, the state could be corrupted in the (unlikely) case that the top +// bit of the acquire counter is set but not the release counter, and thus +// we only clear the top bit of the acquire counter on resumption. It would +// then appear that there are too many refs and the entry would be permanently +// pinned (which is not terrible for an exceptionally rare occurrence), unless +// it is referenced enough (at least kMaxCountdown more times) for the release +// counter to reach "high" state again and bumped back to "medium." (This +// motivates only checking for release counter in high state, not both in high +// state.) +inline void CorrectNearOverflow(uint64_t old_meta, + std::atomic& meta) { + // We clear both top-most counter bits at the same time. + constexpr uint64_t kCounterTopBit = uint64_t{1} + << (ClockHandle::kCounterNumBits - 1); + constexpr uint64_t kClearBits = + (kCounterTopBit << ClockHandle::kAcquireCounterShift) | + (kCounterTopBit << ClockHandle::kReleaseCounterShift); + // A simple check that allows us to initiate clearing the top bits for + // a large portion of the "high" state space on release counter. + constexpr uint64_t kCheckBits = + (kCounterTopBit | (ClockHandle::kMaxCountdown + 1)) + << ClockHandle::kReleaseCounterShift; + + if (UNLIKELY(old_meta & kCheckBits)) { + meta.fetch_and(~kClearBits, std::memory_order_relaxed); + } +} + +Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, + ClockHandle** handle, Cache::Priority priority, + size_t capacity, bool strict_capacity_limit) { + // Do we have the available occupancy? Optimistically assume we do + // and deal with it if we don't. + uint32_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); + auto revert_occupancy_fn = [&]() { + occupancy_.fetch_sub(1, std::memory_order_relaxed); + }; + // Whether we over-committed and need an eviction to make up for it + bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_; + + // Usage/capacity handling is somewhat different depending on + // strict_capacity_limit, but mostly pessimistic. + bool use_detached_insert = false; + const size_t total_charge = proto.total_charge; + if (strict_capacity_limit) { + if (total_charge > capacity) { + assert(!use_detached_insert); + revert_occupancy_fn(); + return Status::MemoryLimit( + "Cache entry too large for a single cache shard: " + + std::to_string(total_charge) + " > " + std::to_string(capacity)); + } + // Grab any available capacity, and free up any more required. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t new_usage; + if (LIKELY(old_usage != capacity)) { + do { + new_usage = std::min(capacity, old_usage + total_charge); + } while (!usage_.compare_exchange_weak(old_usage, new_usage, + std::memory_order_relaxed)); + } else { + new_usage = old_usage; + } + // How much do we need to evict then? 
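      // A small worked illustration of the accounting above, using
      // hypothetical numbers (not taken from this change): with
      // capacity == 100, old_usage == 95 and total_charge == 10, the CAS loop
      // settles on new_usage == std::min<size_t>(100, 95 + 10) == 100, so the
      // next line computes need_evict_charge == 95 + 10 - 100 == 5, i.e. only
      // the overshoot past capacity has to be reclaimed by Evict().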
+ size_t need_evict_charge = old_usage + total_charge - new_usage; + size_t request_evict_charge = need_evict_charge; + if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { + // Require at least 1 eviction. + request_evict_charge = 1; + } + if (request_evict_charge > 0) { + size_t evicted_charge = 0; + uint32_t evicted_count = 0; + Evict(request_evict_charge, &evicted_charge, &evicted_count); + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + if (LIKELY(evicted_charge > need_evict_charge)) { + assert(evicted_count > 0); + // Evicted more than enough + usage_.fetch_sub(evicted_charge - need_evict_charge, + std::memory_order_relaxed); + } else if (evicted_charge < need_evict_charge || + (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { + // Roll back to old usage minus evicted + usage_.fetch_sub(evicted_charge + (new_usage - old_usage), + std::memory_order_relaxed); + assert(!use_detached_insert); + revert_occupancy_fn(); + if (evicted_charge < need_evict_charge) { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "capacity limit."); + } else { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "table occupancy limit."); + } + } + // If we needed to evict something and we are proceeding, we must have + // evicted something. + assert(evicted_count > 0); + } + } else { + // Case strict_capacity_limit == false + + // For simplicity, we consider that either the cache can accept the insert + // with no evictions, or we must evict enough to make (at least) enough + // space. It could lead to unnecessary failures or excessive evictions in + // some extreme cases, but allows a fast, simple protocol. If we allow a + // race to get us over capacity, then we might never get back to capacity + // limit if the sizes of entries allow each insertion to evict the minimum + // charge. Thus, we should evict some extra if it's not a signifcant + // portion of the shard capacity. This can have the side benefit of + // involving fewer threads in eviction. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t need_evict_charge; + // NOTE: if total_charge > old_usage, there isn't yet enough to evict + // `total_charge` amount. Even if we only try to evict `old_usage` amount, + // there's likely something referenced and we would eat CPU looking for + // enough to evict. + if (old_usage + total_charge <= capacity || total_charge > old_usage) { + // Good enough for me (might run over with a race) + need_evict_charge = 0; + } else { + // Try to evict enough space, and maybe some extra + need_evict_charge = total_charge; + if (old_usage > capacity) { + // Not too much to avoid thundering herd while avoiding strict + // synchronization + need_evict_charge += std::min(capacity / 1024, total_charge) + 1; + } + } + if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) { + // Special case: require at least 1 eviction if we only have to + // deal with occupancy + need_evict_charge = 1; + } + size_t evicted_charge = 0; + uint32_t evicted_count = 0; + if (need_evict_charge > 0) { + Evict(need_evict_charge, &evicted_charge, &evicted_count); + // Deal with potential occupancy deficit + if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) { + assert(evicted_charge == 0); + revert_occupancy_fn(); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry + // inserted into cache and evicted immediately. 
+ proto.FreeData(); + return Status::OK(); + } else { + use_detached_insert = true; + } + } else { + // Update occupancy for evictions + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + } + } + // Track new usage even if we weren't able to evict enough + usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + } + auto revert_usage_fn = [&]() { + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + }; + + if (!use_detached_insert) { + // Attempt a table insert, but abort if we find an existing entry for the + // key. If we were to overwrite old entries, we would either + // * Have to gain ownership over an existing entry to overwrite it, which + // would only work if there are no outstanding (read) references and would + // create a small gap in availability of the entry (old or new) to lookups. + // * Have to insert into a suboptimal location (more probes) so that the + // old entry can be kept around as well. + + // Set initial clock data from priority + // TODO: configuration parameters for priority handling and clock cycle + // count? + uint64_t initial_countdown; + switch (priority) { + case Cache::Priority::HIGH: + initial_countdown = ClockHandle::kHighCountdown; + break; + default: + assert(false); + FALLTHROUGH_INTENDED; + case Cache::Priority::LOW: + initial_countdown = ClockHandle::kLowCountdown; + break; + case Cache::Priority::BOTTOM: + initial_countdown = ClockHandle::kBottomCountdown; + break; + } + assert(initial_countdown > 0); + + uint32_t probe = 0; + ClockHandle* e = FindSlot( + proto.hash, + [&](ClockHandle* h) { + // Optimistically transition the slot from "empty" to + // "under construction" (no effect on other states) + uint64_t old_meta = + h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit} + << ClockHandle::kStateShift, + std::memory_order_acq_rel); + uint64_t old_state = old_meta >> ClockHandle::kStateShift; + + if (old_state == ClockHandle::kStateEmpty) { + // We've started inserting into an available slot, and taken + // ownership Save data fields + ClockHandleMoreData* h_alias = h; + *h_alias = proto; + + // Transition from "under construction" state to "visible" state + uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} + << ClockHandle::kStateShift; + + // Maybe with an outstanding reference + new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; + new_meta |= (initial_countdown - (handle != nullptr)) + << ClockHandle::kReleaseCounterShift; + +#ifndef NDEBUG + // Save the state transition, with assertion + old_meta = h->meta.exchange(new_meta, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Save the state transition + h->meta.store(new_meta, std::memory_order_release); +#endif + return true; + } else if (old_state != ClockHandle::kStateVisible) { + // Slot not usable / touchable now + return false; + } + // Existing, visible entry, which might be a match. + // But first, we need to acquire a ref to read it. In fact, number of + // refs for initial countdown, so that we boost the clock state if + // this is a match. 
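            // Hedged illustration with a hypothetical initial_countdown of 3:
            // the fetch_add below raises the acquire counter by 3 in a single
            // atomic step; if the key then matches, the matching release
            // (also scaled by 3) leaves the reference count unchanged while
            // pushing both counters up, giving the existing entry roughly the
            // same clock boost a fresh insert at this priority would receive.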
+ old_meta = h->meta.fetch_add( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + // Like Lookup + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->key == proto.key) { + // Match. Release in a way that boosts the clock state + old_meta = h->meta.fetch_add( + ClockHandle::kReleaseIncrement * initial_countdown, + std::memory_order_acq_rel); + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + // Insert detached instead (only if return handle needed) + use_detached_insert = true; + return true; + } else { + // Mismatch. Pretend we never took the reference + old_meta = h->meta.fetch_sub( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + old_meta = h->meta.fetch_sub( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. + // Slot not usable / touchable now. + } + (void)old_meta; + return false; + }, + [&](ClockHandle* /*h*/) { return false; }, + [&](ClockHandle* h) { + h->displacements.fetch_add(1, std::memory_order_relaxed); + }, + probe); + if (e == nullptr) { + // Occupancy check and never abort FindSlot above should generally + // prevent this, except it's theoretically possible for other threads + // to evict and replace entries in the right order to hit every slot + // when it is populated. Assuming random hashing, the chance of that + // should be no higher than pow(kStrictLoadFactor, n) for n slots. + // That should be infeasible for roughly n >= 256, so if this assertion + // fails, that suggests something is going wrong. + assert(GetTableSize() < 256); + use_detached_insert = true; + } + if (!use_detached_insert) { + // Successfully inserted + if (handle) { + *handle = e; + } + return Status::OK(); + } + // Roll back table insertion + Rollback(proto.hash, e); + revert_occupancy_fn(); + // Maybe fall back on detached insert + if (handle == nullptr) { + revert_usage_fn(); + // As if unrefed entry immdiately evicted + proto.FreeData(); + return Status::OK(); + } + } + + // Run detached insert + assert(use_detached_insert); + + ClockHandle* h = new ClockHandle(); + ClockHandleMoreData* h_alias = h; + *h_alias = proto; + h->detached = true; + // Single reference (detached entries only created if returning a refed + // Handle back to user) + uint64_t meta = uint64_t{ClockHandle::kStateInvisible} + << ClockHandle::kStateShift; + meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; + h->meta.store(meta, std::memory_order_release); + // Keep track of usage + detached_usage_.fetch_add(total_charge, std::memory_order_relaxed); + + *handle = h; + // The OkOverwritten status is used to count "redundant" insertions into + // block cache. This implementation doesn't strictly check for redundant + // insertions, but we instead are probably interested in how many insertions + // didn't go into the table (instead "detached"), which could be redundant + // Insert or some other reason (use_detached_insert reasons above). 
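  // A hedged note for callers (not part of this change): OkOverwritten still
  // satisfies Status::ok(), so existing success checks keep working; a caller
  // that wants to count these events can test the subcode explicitly, e.g.
  // something like `if (s.IsOkOverwritten()) { ++not_inserted_into_table; }`.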
+ return Status::OkOverwritten(); +} + +ClockHandle* ClockHandleTable::Lookup(const CacheKeyBytes& key, uint32_t hash) { uint32_t probe = 0; ClockHandle* e = FindSlot( - key, + hash, [&](ClockHandle* h) { - if (h->TryInternalRef()) { - if (h->IsElement() && h->Matches(key, hash)) { + // Mostly branch-free version (similar performance) + /* + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; + bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; + bool match = (h->key == key) & visible; + h->meta.fetch_sub(static_cast(Shareable & !match) << + ClockHandle::kAcquireCounterShift, std::memory_order_release); return + match; + */ + // Optimistic lookup should pay off when the table is relatively + // sparse. + constexpr bool kOptimisticLookup = true; + uint64_t old_meta; + if (!kOptimisticLookup) { + old_meta = h->meta.load(std::memory_order_acquire); + if ((old_meta >> ClockHandle::kStateShift) != + ClockHandle::kStateVisible) { + return false; + } + } + // (Optimistically) increment acquire counter + old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->key == key) { + // Match return true; + } else { + // Mismatch. Pretend we never took the reference + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); } - h->ReleaseInternalRef(); + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. } + (void)old_meta; return false; }, - [&](ClockHandle* h) { return h->displacements == 0; }, + [&](ClockHandle* h) { + return h->displacements.load(std::memory_order_relaxed) == 0; + }, [&](ClockHandle* /*h*/) {}, probe); - if (e != nullptr) { - // TODO(Guido) Comment from #10347: Here it looks like we have three atomic - // updates where it would be possible to combine into one CAS (more metadata - // under one atomic field) or maybe two atomic updates (one arithmetic, one - // bitwise). Something to think about optimizing. - e->SetHit(); - // The handle is now referenced, so we take it out of clock. - ClockOff(e); - e->InternalToExternalRef(); - } - return e; } -ClockHandle* ClockHandleTable::Insert(ClockHandle* h, - autovector* deleted, - bool take_reference) { - uint32_t probe = 0; - ClockHandle* e = FindAvailableSlot(h->key(), h->hash, probe, deleted); - if (e == nullptr) { - // No available slot to place the handle. - return nullptr; - } - - // The slot is empty or is a tombstone. And we have an exclusive ref. - Assign(e, h); - // TODO(Guido) The following RemoveAll can probably be run outside of - // the exclusive ref. I had a bad case in mind: multiple inserts could - // annihilate each. Although I think this is impossible, I'm not sure - // my mental proof covers every case. 
- if (e->displacements != 0) { - // It used to be a tombstone, so there may already be copies of the - // key in the table. - RemoveAll(h->key(), h->hash, probe, deleted); - } +bool ClockHandleTable::Release(ClockHandle* h, bool useful, + bool erase_if_last_ref) { + // In contrast with LRUCache's Release, this function won't delete the handle + // when the cache is above capacity and the reference is the last one. Space + // is only freed up by EvictFromClock (called by Insert when space is needed) + // and Erase. We do this to avoid an extra atomic read of the variable usage_. - if (take_reference) { - // The user wants to take a reference. - e->ExclusiveToExternalRef(); + uint64_t old_meta; + if (useful) { + // Increment release counter to indicate was used + old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement, + std::memory_order_release); } else { - // The user doesn't want to immediately take a reference, so we make - // it evictable. - ClockOn(e); - e->ReleaseExclusiveRef(); + // Decrement acquire counter to pretend it never happened + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); } - return e; -} -void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) { - // DON'T touch displacements and refs. - dst->value = src->value; - dst->deleter = src->deleter; - dst->hash = src->hash; - dst->total_charge = src->total_charge; - dst->key_data = src->key_data; - dst->flags.store(0); - dst->SetIsElement(true); - dst->SetCachePriority(src->GetCachePriority()); - usage_ += dst->total_charge; - occupancy_++; -} - -bool ClockHandleTable::TryRemove(ClockHandle* h, - autovector* deleted) { - if (h->TryExclusiveRef()) { - if (h->WillBeDeleted()) { - Remove(h, deleted); - return true; + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // No underflow + assert(((old_meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask) != + ((old_meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask)); + + if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateInvisible)) { + // Update for last fetch_add op + if (useful) { + old_meta += ClockHandle::kReleaseIncrement; + } else { + old_meta -= ClockHandle::kAcquireIncrement; + } + // Take ownership if no refs + do { + uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) - + (old_meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + if (refcount != 0) { + // Not last ref at some point in time during this Release call + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; + } + if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift)) == 0) { + // Someone else took ownership + return false; + } + // Note that there's a small chance that we release, another thread + // replaces this entry with another, reaches zero refs, and then we end + // up erasing that other entry. That's an acceptable risk / imprecision. + } while (!h->meta.compare_exchange_weak( + old_meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)); + // Took ownership + // TODO? Delay freeing? 
+ h->FreeData(); + size_t total_charge = h->total_charge; + if (UNLIKELY(h->detached)) { + // Delete detached handle + delete h; + detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); + } else { + uint32_t hash = h->hash; +#ifndef NDEBUG + // Mark slot as empty, with assertion + old_meta = h->meta.exchange(0, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h->meta.store(0, std::memory_order_release); +#endif + occupancy_.fetch_sub(1U, std::memory_order_release); + Rollback(hash, h); } - h->ReleaseExclusiveRef(); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + return true; + } else { + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; } - return false; } -bool ClockHandleTable::SpinTryRemove(ClockHandle* h, - autovector* deleted) { - if (h->SpinTryExclusiveRef()) { - if (h->WillBeDeleted()) { - Remove(h, deleted); - return true; - } - h->ReleaseExclusiveRef(); - } - return false; +void ClockHandleTable::Ref(ClockHandle& h) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; } -void ClockHandleTable::ClockOff(ClockHandle* h) { - h->SetClockPriority(ClockHandle::ClockPriority::NONE); +void ClockHandleTable::TEST_RefN(ClockHandle& h, size_t n) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; } -void ClockHandleTable::ClockOn(ClockHandle* h) { - assert(!h->IsInClock()); - bool is_high_priority = - h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH; - h->SetClockPriority(static_cast( - is_high_priority ? ClockHandle::ClockPriority::HIGH - : ClockHandle::ClockPriority::MEDIUM)); +void ClockHandleTable::TEST_ReleaseN(ClockHandle* h, size_t n) { + if (n > 0) { + // Split into n - 1 and 1 steps. + uint64_t old_meta = h->meta.fetch_add( + (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire); + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; + + Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); + } } -void ClockHandleTable::Remove(ClockHandle* h, - autovector* deleted) { - deleted->push_back(*h); - ClockOff(h); +void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { uint32_t probe = 0; - FindSlot( - h->key(), [&](ClockHandle* e) { return e == h; }, - [&](ClockHandle* /*e*/) { return false; }, - [&](ClockHandle* e) { e->displacements--; }, probe); - h->SetWillBeDeleted(false); - h->SetIsElement(false); -} - -void ClockHandleTable::RemoveAll(const Slice& key, uint32_t hash, - uint32_t& probe, - autovector* deleted) { - FindSlot( - key, + (void)FindSlot( + hash, [&](ClockHandle* h) { - if (h->TryInternalRef()) { - if (h->IsElement() && h->Matches(key, hash)) { - h->SetWillBeDeleted(true); - h->ReleaseInternalRef(); - if (TryRemove(h, deleted)) { - h->ReleaseExclusiveRef(); + // Could be multiple entries in rare cases. Erase them all. 
+ // Optimistically increment acquire counter + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->key == key) { + // Match. Set invisible. + old_meta = + h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift), + std::memory_order_acq_rel); + // Apply update to local copy + old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift); + for (;;) { + uint64_t refcount = + ((old_meta >> ClockHandle::kAcquireCounterShift) - + (old_meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + assert(refcount > 0); + if (refcount > 1) { + // Not last ref at some point in time during this Erase call + // Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + break; + } else if (h->meta.compare_exchange_weak( + old_meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)) { + // Took ownership + assert(hash == h->hash); + // TODO? Delay freeing? + h->FreeData(); + usage_.fetch_sub(h->total_charge, std::memory_order_relaxed); + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); +#ifndef NDEBUG + // Mark slot as empty, with assertion + old_meta = h->meta.exchange(0, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h->meta.store(0, std::memory_order_release); +#endif + occupancy_.fetch_sub(1U, std::memory_order_release); + Rollback(hash, h); + break; + } } - return false; + } else { + // Mismatch. Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); } - h->ReleaseInternalRef(); + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. } return false; }, - [&](ClockHandle* h) { return h->displacements == 0; }, + [&](ClockHandle* h) { + return h->displacements.load(std::memory_order_relaxed) == 0; + }, [&](ClockHandle* /*h*/) {}, probe); } -void ClockHandleTable::Free(autovector* deleted) { - if (deleted->size() == 0) { - // Avoid unnecessarily reading usage_ and occupancy_. 
- return; +void ClockHandleTable::ConstApplyToEntriesRange( + std::function func, uint32_t index_begin, + uint32_t index_end, bool apply_if_will_be_deleted) const { + uint64_t check_state_mask = ClockHandle::kStateShareableBit; + if (!apply_if_will_be_deleted) { + check_state_mask |= ClockHandle::kStateVisibleBit; } - size_t deleted_charge = 0; - for (auto& h : *deleted) { - deleted_charge += h.total_charge; - h.FreeData(); + for (uint32_t i = index_begin; i < index_end; i++) { + ClockHandle& h = array_[i]; + + uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + // Increment acquire counter + old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Double-check + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + func(h); + } + // Pretend we never took the reference + h.meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + // No net change, so don't need to check for overflow + } } - assert(usage_ >= deleted_charge); - usage_ -= deleted_charge; - occupancy_ -= static_cast(deleted->size()); } -ClockHandle* ClockHandleTable::FindAvailableSlot( - const Slice& key, uint32_t hash, uint32_t& probe, - autovector* deleted) { - ClockHandle* e = FindSlot( - key, - [&](ClockHandle* h) { - // To read the handle, first acquire a shared ref. - if (h->TryInternalRef()) { - if (h->IsElement()) { - // The slot is not available. - // TODO(Guido) Is it worth testing h->WillBeDeleted()? - if (h->WillBeDeleted() || h->Matches(key, hash)) { - // The slot can be freed up, or the key we're inserting is already - // in the table, so we try to delete it. When the attempt is - // successful, the slot becomes available, so we stop probing. - // Notice that in that case TryRemove returns an exclusive ref. - h->SetWillBeDeleted(true); - h->ReleaseInternalRef(); - if (TryRemove(h, deleted)) { - return true; - } - return false; - } - h->ReleaseInternalRef(); - return false; - } - - // Available slot. - h->ReleaseInternalRef(); - // Try to acquire an exclusive ref. If we fail, continue probing. - if (h->SpinTryExclusiveRef()) { - // Check that the slot is still available. 
- if (!h->IsElement()) { - return true; - } - h->ReleaseExclusiveRef(); - } - } - return false; - }, - [&](ClockHandle* /*h*/) { return false; }, - [&](ClockHandle* h) { h->displacements++; }, probe); - if (e == nullptr) { - Rollback(key, probe); +void ClockHandleTable::EraseUnRefEntries() { + for (uint32_t i = 0; i <= this->length_bits_mask_; i++) { + ClockHandle& h = array_[i]; + + uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) - + (old_meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift) && + refcount == 0 && + h.meta.compare_exchange_strong(old_meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership + uint32_t hash = h.hash; + h.FreeData(); + usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); +#ifndef NDEBUG + // Mark slot as empty, with assertion + old_meta = h.meta.exchange(0, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif + occupancy_.fetch_sub(1U, std::memory_order_release); + Rollback(hash, &h); + } } - return e; } +namespace { +inline uint32_t Remix1(uint32_t hash) { + return Lower32of64((uint64_t{hash} * 0xbc9f1d35) >> 29); +} + +inline uint32_t Remix2(uint32_t hash) { + return Lower32of64((uint64_t{hash} * 0x7a2bb9d5) >> 29); +} +} // namespace + ClockHandle* ClockHandleTable::FindSlot( - const Slice& key, std::function match, - std::function abort, - std::function update, uint32_t& probe) { + uint32_t hash, std::function match_fn, + std::function abort_fn, + std::function update_fn, uint32_t& probe) { // We use double-hashing probing. Every probe in the sequence is a // pseudorandom integer, computed as a linear function of two random hashes, // which we call base and increment. Specifically, the i-th probe is base + i // * increment modulo the table size. - uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); + uint32_t base = ModTableSize(Remix1(hash)); // We use an odd increment, which is relatively prime with the power-of-two // table size. This implies that we cycle back to the first probe only // after probing every slot exactly once. - uint32_t increment = - ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); + // TODO: we could also reconsider linear probing, though locality benefits + // are limited because each slot is a full cache line + uint32_t increment = Remix2(hash) | 1U; uint32_t current = ModTableSize(base + probe * increment); - while (true) { + while (probe <= length_bits_mask_) { ClockHandle* h = &array_[current]; - if (current == base && probe > 0) { - // We looped back. - return nullptr; - } - if (match(h)) { + if (match_fn(h)) { probe++; return h; } - if (abort(h)) { + if (abort_fn(h)) { return nullptr; } probe++; - update(h); + update_fn(h); current = ModTableSize(current + increment); } + // We looped back. 
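  // Why "looped back" means every slot was already tried (illustrative
  // numbers): with a table of 8 slots (length_bits_ == 3), base == 2 and the
  // odd increment 5, the probe sequence visits 2, 7, 4, 1, 6, 3, 0, 5, i.e.
  // each of the 8 slots exactly once, before the probe <= length_bits_mask_
  // condition stops the loop, so there is nothing left to return here.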
+ return nullptr; } -void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) { - uint32_t current = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1)); - uint32_t increment = - ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1); - for (uint32_t i = 0; i < probe; i++) { - array_[current].displacements--; +void ClockHandleTable::Rollback(uint32_t hash, const ClockHandle* h) { + uint32_t current = ModTableSize(Remix1(hash)); + uint32_t increment = Remix2(hash) | 1U; + for (uint32_t i = 0; &array_[current] != h; i++) { + array_[current].displacements.fetch_sub(1, std::memory_order_relaxed); current = ModTableSize(current + increment); } } -void ClockHandleTable::ClockRun(size_t charge) { - // TODO(Guido) When an element is in the probe sequence of a - // hot element, it will be hard to get an exclusive ref. - // Do we need a mechanism to prevent an element from sitting - // for a long time in cache waiting to be evicted? - autovector deleted; - uint32_t max_iterations = - ClockHandle::ClockPriority::HIGH * - (1 + - static_cast( - GetTableSize() * - kLoadFactor)); // It may take up to HIGH passes to evict an element. - size_t usage_local = usage_; - size_t capacity_local = capacity_; - while (usage_local + charge > capacity_local && max_iterations--) { - uint32_t steps = 1 + static_cast(1 / kLoadFactor); - uint32_t clock_pointer_local = (clock_pointer_ += steps) - steps; - for (uint32_t i = 0; i < steps; i++) { - ClockHandle* h = &array_[ModTableSize(clock_pointer_local + i)]; - if (h->TryExclusiveRef()) { - if (h->WillBeDeleted()) { - Remove(h, &deleted); - usage_local -= h->total_charge; - } else { - if (!h->IsInClock() && h->IsElement()) { - // We adjust the clock priority to make the element evictable again. - // Why? Elements that are not in clock are either currently - // externally referenced or used to be. Because we are holding an - // exclusive ref, we know we are in the latter case. This can only - // happen when the last external reference to an element was - // released, and the element was not immediately removed. - ClockOn(h); - } - ClockHandle::ClockPriority priority = h->GetClockPriority(); - if (priority == ClockHandle::ClockPriority::LOW) { - Remove(h, &deleted); - usage_local -= h->total_charge; - } else if (priority > ClockHandle::ClockPriority::LOW) { - h->DecreaseClockPriority(); - } - } - h->ReleaseExclusiveRef(); +void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, + uint32_t* freed_count) { + // precondition + assert(requested_charge > 0); + + // TODO: make a tuning parameter? + constexpr uint32_t step_size = 4; + + // First (concurrent) increment clock pointer + uint64_t old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + + // Cap the eviction effort at this thread (along with those operating in + // parallel) circling through the whole structure kMaxCountdown times. + // In other words, this eviction run must find something/anything that is + // unreferenced at start of and during the eviction run that isn't reclaimed + // by a concurrent eviction run. 
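  // Rough scale of that cap, with hypothetical values (kMaxCountdown == 3 is
  // an assumption, not taken from this diff): for a 1024-slot table
  // (length_bits_ == 10), max_clock_pointer ends up 3 << 10 == 3072 steps
  // ahead of old_clock_pointer, i.e. this run gives up once the shared clock
  // pointer has swept the table about three times without freeing enough.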
+ uint64_t max_clock_pointer = + old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_); + + for (;;) { + for (uint32_t i = 0; i < step_size; i++) { + ClockHandle& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; + uint64_t meta = h.meta.load(std::memory_order_relaxed); + + uint64_t acquire_count = (meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask; + uint64_t release_count = (meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask; + if (acquire_count != release_count) { + // Only clock update entries with no outstanding refs + continue; + } + if (!(meta >> ClockHandle::kStateShift & + ClockHandle::kStateShareableBit)) { + // Only clock update Shareable entries + continue; + } + // ModTableSize(old_clock_pointer + i)); + if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible && + acquire_count > 0) { + // Decrement clock + uint64_t new_count = std::min(acquire_count - 1, + uint64_t{ClockHandle::kMaxCountdown} - 1); + // Compare-exchange in the decremented clock info, but + // not aggressively + uint64_t new_meta = + (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | + (new_count << ClockHandle::kReleaseCounterShift) | + (new_count << ClockHandle::kAcquireCounterShift); + h.meta.compare_exchange_strong(meta, new_meta, + std::memory_order_relaxed); + continue; + } + // Otherwise, remove entry (either unreferenced invisible or + // unreferenced and expired visible). Compare-exchange failing probably + // indicates the entry was used, so skip it in that case. + if (h.meta.compare_exchange_strong( + meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership + uint32_t hash = h.hash; + // TODO? Delay freeing? + h.FreeData(); + *freed_charge += h.total_charge; +#ifndef NDEBUG + // Mark slot as empty, with assertion + meta = h.meta.exchange(0, std::memory_order_release); + assert(meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif + *freed_count += 1; + Rollback(hash, &h); } } - } - Free(&deleted); + // Loop exit condition + if (*freed_charge >= requested_charge) { + return; + } + if (old_clock_pointer >= max_clock_pointer) { + return; + } + + // Advance clock pointer (concurrently) + old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + } } ClockCacheShard::ClockCacheShard( size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : strict_capacity_limit_(strict_capacity_limit), - detached_usage_(0), - table_(capacity, CalcHashBits(capacity, estimated_value_size, - metadata_charge_policy)) { - set_metadata_charge_policy(metadata_charge_policy); + : CacheShard(metadata_charge_policy), + table_( + CalcHashBits(capacity, estimated_value_size, metadata_charge_policy), + /*initial_charge_metadata*/ metadata_charge_policy == + kFullChargeCacheMetadata), + capacity_(capacity), + strict_capacity_limit_(strict_capacity_limit) { + // Initial charge metadata should not exceed capacity + assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(ClockHandle)); } -void ClockCacheShard::EraseUnRefEntries() { - autovector deleted; - - table_.ApplyToEntriesRange( - [this, &deleted](ClockHandle* h) { - // Externally unreferenced element. 
- table_.Remove(h, &deleted); - }, - 0, table_.GetTableSize(), true); - - table_.Free(&deleted); -} +void ClockCacheShard::EraseUnRefEntries() { table_.EraseUnRefEntries(); } void ClockCacheShard::ApplyToSomeEntries( const std::functionkey(), h->value, h->GetCharge(metadata_charge_policy), - h->deleter); + [callback](const ClockHandle& h) { + callback(h.KeySlice(), h.value, h.total_charge, h.deleter); }, index_begin, index_end, false); } -ClockHandle* ClockCacheShard::DetachedInsert(ClockHandle* h) { - ClockHandle* e = new ClockHandle(); - *e = *h; - e->SetDetached(); - e->TryExternalRef(); - detached_usage_ += h->total_charge; - return e; -} - -size_t ClockCacheShard::CalcEstimatedHandleCharge( - size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { - ClockHandle h; - h.CalcTotalCharge(estimated_value_size, metadata_charge_policy); - return h.total_charge; -} - int ClockCacheShard::CalcHashBits( size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { - size_t handle_charge = - CalcEstimatedHandleCharge(estimated_value_size, metadata_charge_policy); - assert(handle_charge > 0); - uint32_t num_entries = - static_cast(capacity / (kLoadFactor * handle_charge)) + 1; - assert(num_entries <= uint32_t{1} << 31); - return FloorLog2((num_entries << 1) - 1); + double average_slot_charge = estimated_value_size * kLoadFactor; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + average_slot_charge += sizeof(ClockHandle); + } + assert(average_slot_charge > 0.0); + uint64_t num_slots = + static_cast(capacity / average_slot_charge + 0.999999); + + int hash_bits = std::min(FloorLog2((num_slots << 1) - 1), 32); + if (metadata_charge_policy == kFullChargeCacheMetadata) { + // For very small estimated value sizes, it's possible to overshoot + while (hash_bits > 0 && + uint64_t{sizeof(ClockHandle)} << hash_bits > capacity) { + hash_bits--; + } + } + return hash_bits; } void ClockCacheShard::SetCapacity(size_t capacity) { - if (capacity > table_.GetCapacity()) { - assert(false); // Not supported. - } - table_.SetCapacity(capacity); - table_.ClockRun(detached_usage_); + capacity_.store(capacity, std::memory_order_relaxed); + // next Insert will take care of any necessary evictions } void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { - strict_capacity_limit_ = strict_capacity_limit; + strict_capacity_limit_.store(strict_capacity_limit, + std::memory_order_relaxed); + // next Insert will take care of any necessary evictions } Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, Cache::Priority priority) { - if (key.size() != kCacheKeySize) { + if (UNLIKELY(key.size() != kCacheKeySize)) { return Status::NotSupported("ClockCache only supports key size " + std::to_string(kCacheKeySize) + "B"); } - - ClockHandle tmp; - tmp.value = value; - tmp.deleter = deleter; - tmp.hash = hash; - tmp.CalcTotalCharge(charge, metadata_charge_policy_); - tmp.SetCachePriority(priority); - for (int i = 0; i < kCacheKeySize; i++) { - tmp.key_data[i] = key.data()[i]; - } - - Status s = Status::OK(); - - // Use a local copy to minimize cache synchronization. - size_t detached_usage = detached_usage_; - - // Free space with the clock policy until enough space is freed or there are - // no evictable elements. 
- table_.ClockRun(tmp.total_charge + detached_usage); - - // Use local copies to minimize cache synchronization - // (occupancy_ and usage_ are read and written by all insertions). - uint32_t occupancy_local = table_.GetOccupancy(); - size_t total_usage = table_.GetUsage() + detached_usage; - - // TODO: Currently we support strict_capacity_limit == false as long as the - // number of pinned elements is below table_.GetOccupancyLimit(). We can - // always support it as follows: whenever we exceed this limit, we dynamically - // allocate a handle and return it (when the user provides a handle pointer, - // of course). Then, Release checks whether the handle was dynamically - // allocated, or is stored in the table. - if (total_usage + tmp.total_charge > table_.GetCapacity() && - (strict_capacity_limit_ || handle == nullptr)) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - tmp.FreeData(); - } else { - if (occupancy_local + 1 > table_.GetOccupancyLimit()) { - // TODO: Consider using a distinct status for this case, but usually - // it will be handled the same way as reaching charge capacity limit - s = Status::MemoryLimit( - "Insert failed because all slots in the hash table are full."); - } else { - s = Status::MemoryLimit( - "Insert failed because the total charge has exceeded the " - "capacity."); - } - } - } else { - ClockHandle* h = nullptr; - if (handle != nullptr && occupancy_local + 1 > table_.GetOccupancyLimit()) { - // Even if the user wishes to overload the cache, we can't insert into - // the hash table. Instead, we dynamically allocate a new handle. - h = DetachedInsert(&tmp); - // TODO: Return special status? - } else { - // Insert into the cache. Note that the cache might get larger than its - // capacity if not enough space was freed up. - autovector deleted; - h = table_.Insert(&tmp, &deleted, handle != nullptr); - if (h == nullptr && handle != nullptr) { - // The table is full. This can happen when many threads simultaneously - // attempt an insert, and the table is operating close to full capacity. - h = DetachedInsert(&tmp); - } - // Notice that if handle == nullptr, we don't insert the entry but still - // return ok. 
- if (deleted.size() > 0) { - s = Status::OkOverwritten(); - } - table_.Free(&deleted); - } - if (handle != nullptr) { - *handle = reinterpret_cast(h); - } - } - + ClockHandleMoreData proto; + proto.key = *reinterpret_cast(key.data()); + proto.hash = hash; + proto.value = value; + proto.deleter = deleter; + proto.total_charge = charge; + Status s = + table_.Insert(proto, reinterpret_cast(handle), priority, + capacity_.load(std::memory_order_relaxed), + strict_capacity_limit_.load(std::memory_order_relaxed)); return s; } Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { - return reinterpret_cast(table_.Lookup(key, hash)); + if (UNLIKELY(key.size() != kCacheKeySize)) { + return nullptr; + } + auto key_bytes = reinterpret_cast(key.data()); + return reinterpret_cast(table_.Lookup(*key_bytes, hash)); } bool ClockCacheShard::Ref(Cache::Handle* h) { - ClockHandle* e = reinterpret_cast(h); - assert(e->ExternalRefs() > 0); - return e->TryExternalRef(); + if (h == nullptr) { + return false; + } + table_.Ref(*reinterpret_cast(h)); + return true; } -bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { - // In contrast with LRUCache's Release, this function won't delete the handle - // when the cache is above capacity and the reference is the last one. Space - // is only freed up by EvictFromClock (called by Insert when space is needed) - // and Erase. We do this to avoid an extra atomic read of the variable usage_. +bool ClockCacheShard::Release(Cache::Handle* handle, bool useful, + bool erase_if_last_ref) { if (handle == nullptr) { return false; } + return table_.Release(reinterpret_cast(handle), useful, + erase_if_last_ref); +} - ClockHandle* h = reinterpret_cast(handle); - - if (UNLIKELY(h->IsDetached())) { - h->ReleaseExternalRef(); - if (h->TryExclusiveRef()) { - // Only the last reference will succeed. - // Don't bother releasing the exclusive ref. - h->FreeData(); - detached_usage_ -= h->total_charge; - delete h; - return true; - } - return false; - } +void ClockCacheShard::TEST_RefN(Cache::Handle* h, size_t n) { + table_.TEST_RefN(*reinterpret_cast(h), n); +} - uint32_t refs = h->refs; - bool last_reference = ((refs & ClockHandle::EXTERNAL_REFS) == 1); - bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED; - - if (last_reference && (will_be_deleted || erase_if_last_ref)) { - autovector deleted; - h->SetWillBeDeleted(true); - h->ReleaseExternalRef(); - if (table_.SpinTryRemove(h, &deleted)) { - h->ReleaseExclusiveRef(); - table_.Free(&deleted); - return true; - } - } else { - h->ReleaseExternalRef(); - } +void ClockCacheShard::TEST_ReleaseN(Cache::Handle* h, size_t n) { + table_.TEST_ReleaseN(reinterpret_cast(h), n); +} - return false; +bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { + return Release(handle, /*useful=*/true, erase_if_last_ref); } void ClockCacheShard::Erase(const Slice& key, uint32_t hash) { - autovector deleted; - uint32_t probe = 0; - table_.RemoveAll(key, hash, probe, &deleted); - table_.Free(&deleted); + if (UNLIKELY(key.size() != kCacheKeySize)) { + return; + } + auto key_bytes = reinterpret_cast(key.data()); + table_.Erase(*key_bytes, hash); } size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); } @@ -613,18 +1095,35 @@ size_t ClockCacheShard::GetPinnedUsage() const { // Why avoid this counter? Because Lookup removes elements from the clock // list, so it would need to update the pinned usage every time, // which creates additional synchronization costs. 
- size_t clock_usage = 0; - + size_t table_pinned_usage = 0; + const bool charge_metadata = + metadata_charge_policy_ == kFullChargeCacheMetadata; table_.ConstApplyToEntriesRange( - [&clock_usage](const ClockHandle* h) { - if (h->ExternalRefs() > 1) { - // We check > 1 because we are holding an external ref. - clock_usage += h->total_charge; + [&table_pinned_usage, charge_metadata](const ClockHandle& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) - + (meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; + // Holding one ref for ConstApplyToEntriesRange + assert(refcount > 0); + if (refcount > 1) { + table_pinned_usage += h.total_charge; + if (charge_metadata) { + table_pinned_usage += sizeof(ClockHandle); + } } }, 0, table_.GetTableSize(), true); - return clock_usage + detached_usage_; + return table_pinned_usage + table_.GetDetachedUsage(); +} + +size_t ClockCacheShard::GetOccupancyCount() const { + return table_.GetOccupancy(); +} + +size_t ClockCacheShard::GetTableAddressCount() const { + return table_.GetTableSize(); } ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, @@ -634,6 +1133,8 @@ ClockCache::ClockCache(size_t capacity, size_t estimated_value_size, num_shards_(1 << num_shard_bits) { assert(estimated_value_size > 0 || metadata_charge_policy != kDontChargeCacheMetadata); + // TODO: should not need to go through two levels of pointer indirection to + // get to table entries shards_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; @@ -667,12 +1168,7 @@ void* ClockCache::Value(Handle* handle) { } size_t ClockCache::GetCharge(Handle* handle) const { - CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata; - if (num_shards_ > 0) { - metadata_charge_policy = shards_[0].metadata_charge_policy_; - } - return reinterpret_cast(handle)->GetCharge( - metadata_charge_policy); + return reinterpret_cast(handle)->total_charge; } Cache::DeleterFn ClockCache::GetDeleter(Handle* handle) const { @@ -711,7 +1207,10 @@ std::shared_ptr ExperimentalNewClockCache( return nullptr; // The cache cannot be sharded into too many fine pieces. } if (num_shard_bits < 0) { - num_shard_bits = GetDefaultCacheShardBits(capacity); + // Use larger shard size to reduce risk of large entries clustering + // or skewing individual shards. + constexpr size_t min_shard_size = 32U * 1024U * 1024U; + num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); } return std::make_shared( capacity, estimated_value_size, num_shard_bits, strict_capacity_limit, diff --git a/cache/clock_cache.h b/cache/clock_cache.h index e495f1c04..8ceb46478 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -9,10 +9,9 @@ #pragma once -#include - #include #include +#include #include #include #include @@ -33,140 +32,262 @@ namespace clock_cache { // Forward declaration of friend class. class ClockCacheTest; -// An experimental alternative to LRUCache, using a lock-free, open-addressed -// hash table and clock eviction. - -// ---------------------------------------------------------------------------- -// 1. INTRODUCTION +// ClockCache is an experimental alternative to LRUCache. +// +// Benefits +// -------- +// * Fully lock free (no waits or spins) for efficiency under high concurrency +// * Optimized for hot path reads. 
For concurrency control, most Lookup() and +// essentially all Release() are a single atomic add operation. +// * Uses a generalized + aging variant of CLOCK eviction that might outperform +// LRU in some cases. (For background, see +// https://en.wikipedia.org/wiki/Page_replacement_algorithm) +// * Eviction on insertion is fully parallel and lock-free. +// +// Costs +// ----- +// * Hash table is not resizable (for lock-free efficiency) so capacity is not +// dynamically changeable. Rely on an estimated average value (block) size for +// space+time efficiency. (See estimated_entry_charge option details.) +// * Insert usually does not (but might) overwrite a previous entry associated +// with a cache key. This is OK for RocksDB uses of Cache. +// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for +// block cache (not row cache or table cache). +// * SecondaryCache is not supported. +// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough +// transient LOW or BOTTOM priority items can evict HIGH priority entries that +// are not referenced recently (or often) enough. +// * If pinned entries leave little or nothing eligible for eviction, +// performance can degrade substantially, because of clock eviction eating +// CPU looking for evictable entries and because Release does not +// pro-actively delete unreferenced entries when the cache is over-full. +// Specifically, this makes this implementation more susceptible to the +// following combination: +// * num_shard_bits is high (e.g. 6) +// * capacity small (e.g. some MBs) +// * some large individual entries (e.g. non-partitioned filters) +// where individual entries occupy a large portion of their shard capacity. +// This should be mostly mitigated by the implementation picking a lower +// number of cache shards than LRUCache for a given capacity (when +// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()). +// * With strict_capacity_limit=false, respecting the capacity limit is not as +// aggressive as LRUCache. The limit might be transiently exceeded by a very +// small number of entries even when not strictly necessary, and slower to +// recover after pinning forces limit to be substantially exceeded. (Even with +// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate +// memory before discovering it is over the block cache capacity, so this +// should not be a detectable regression in respecting memory limits, except +// on exceptionally small caches.) +// * In some cases, erased or duplicated entries might not be freed +// immediately. They will eventually be freed by eviction from further Inserts. +// * Internal metadata can overflow if the number of simultaneous references +// to a cache handle reaches many millions. +// +// High-level eviction algorithm +// ----------------------------- +// A score (or "countdown") is maintained for each entry, initially determined +// by priority. The score is incremented on each Lookup, up to a max of 3, +// though is easily returned to previous state if useful=false with Release. +// During CLOCK-style eviction iteration, entries with score > 0 are +// decremented if currently unreferenced and entries with score == 0 are +// evicted if currently unreferenced. Note that scoring might not be perfect +// because entries can be referenced transiently within the cache even when +// there are no outside references to the entry. 
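// To make the countdown scheme above concrete, the following standalone,
// non-atomic sketch shows one CLOCK visit. SimpleEntry and ClockStepSketch
// are hypothetical names for illustration only; the actual implementation
// packs this state into a single atomic word per handle (ClockHandle::meta,
// further down) rather than plain fields.

#include <cstddef>
#include <cstdint>
#include <vector>

struct SimpleEntry {
  uint32_t refs = 0;       // outstanding references (pinned if > 0)
  uint32_t countdown = 0;  // 0 means: evict on next unreferenced visit
  bool occupied = false;
};

// One CLOCK visit of the entry at `pos`; returns true if it was evicted.
inline bool ClockStepSketch(std::vector<SimpleEntry>& table, size_t pos) {
  SimpleEntry& e = table[pos];
  if (!e.occupied || e.refs > 0) {
    return false;  // empty or currently referenced: skip, do not age
  }
  if (e.countdown > 0) {
    --e.countdown;  // age the unreferenced entry
    return false;
  }
  e.occupied = false;  // countdown exhausted while unreferenced: evict
  return true;
}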
+// +// Cache sharding like LRUCache is used to reduce contention on usage+eviction +// state, though here the performance improvement from more shards is small, +// and (as noted above) potentially detrimental if shard capacity is too close +// to largest entry size. Here cache sharding mostly only affects cache update +// (Insert / Erase) performance, not read performance. +// +// Read efficiency (hot path) +// -------------------------- +// Mostly to minimize the cost of accessing metadata blocks with +// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and +// Release. In terms of concurrency, at a minimum, these operations have +// to do reference counting (and Lookup has to compare full keys in a safe +// way). Can we fold in all the other metadata tracking *for free* with +// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume +// for the moment that Lookup succeeds on the first probe.) +// +// We have a clever way of encoding an entry's reference count and countdown +// clock so that Lookup and Release are each usually a single atomic addition. +// In a single metadata word we have both an "acquire" count, incremented by +// Lookup, and a "release" count, incremented by Release. If useful=false, +// Release can instead decrement the acquire count. Thus the current ref +// count is (acquires - releases), and the countdown clock is min(3, acquires). +// Note that only unreferenced entries (acquires == releases) are eligible +// for CLOCK manipulation and eviction. We tolerate use of more expensive +// compare_exchange operations for cache writes (insertions and erasures). +// +// In a cache receiving many reads and little or no writes, it is possible +// for the acquire and release counters to overflow. Assuming the *current* +// refcount never reaches to many millions, we only have to correct for +// overflow in both counters in Release, not in Lookup. The overflow check +// should be only 1-2 CPU cycles per Release because it is a predictable +// branch on a simple condition on data already in registers. +// +// Slot states +// ----------- +// We encode a state indicator into the same metadata word with the +// acquire and release counters. This allows bigger state transitions to +// be atomic. States: // -// In RocksDB, a Cache is a concurrent unordered dictionary that supports -// external references (a.k.a. user references). A ClockCache is a type of Cache -// that uses the clock algorithm as its eviction policy. Internally, a -// ClockCache is an open-addressed hash table that stores all KV pairs in a -// large array. Every slot in the hash table is a ClockHandle, which holds a KV -// pair plus some additional metadata that controls the different aspects of the -// cache: external references, the hashing mechanism, concurrent access and the -// clock algorithm. +// * Empty - slot is not in use and unowned. All other metadata and data is +// in an undefined state. +// * Construction - slot is exclusively owned by one thread, the thread +// successfully entering this state, for populating or freeing data. +// * Shareable (group) - slot holds an entry with counted references for +// pinning and reading, including +// * Visible - slot holds an entry that can be returned by Lookup +// * Invisible - slot holds an entry that is not visible to Lookup +// (erased by user) but can be read by existing references, and ref count +// changed by Ref and Release. 
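// As a concrete illustration of the counter encoding described above, the
// following standalone sketch mirrors the 30-bit acquire / 30-bit release
// layout of ClockHandle::meta (defined later in this header). RefCountSketch
// and AcquireRefSketch are hypothetical helpers; the real Lookup folds a
// visibility check into the same single atomic add.

#include <atomic>
#include <cstdint>

constexpr int kSketchCounterNumBits = 30;
constexpr uint64_t kSketchCounterMask =
    (uint64_t{1} << kSketchCounterNumBits) - 1;

// Current ref count is (acquires - releases), both counters taken mod 2^30.
inline uint64_t RefCountSketch(uint64_t meta) {
  uint64_t acquires = meta & kSketchCounterMask;
  uint64_t releases = (meta >> kSketchCounterNumBits) & kSketchCounterMask;
  return (acquires - releases) & kSketchCounterMask;
}

// Taking a reference on the hot path is then a single atomic add of one
// acquire unit, which also advances the countdown clock of an unreferenced
// entry toward its maximum.
inline void AcquireRefSketch(std::atomic<uint64_t>& meta) {
  meta.fetch_add(uint64_t{1}, std::memory_order_acquire);
}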
// +// A special case is "detached" entries, which are heap-allocated handles +// not in the table. They are always Invisible and freed on zero refs. // -// 2. EXTERNAL REFERENCES +// State transitions: +// Empty -> Construction (in Insert): The encoding of state enables Insert to +// perform an optimistic atomic bitwise-or to take ownership if a slot is +// empty, or otherwise make no state change. // -// An externally referenced handle can't be deleted (either evicted by the clock -// algorithm, or explicitly deleted) or replaced by a new version (via an insert -// of the same key) until all external references to it have been released by -// the users. ClockHandles have two members to support external references: -// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0, -// the handle is externally referenced. Updates that intend to modify the -// handle will refrain from doing so. Eventually, when all references are -// released, we have EXTERNAL_REFS == 0, and updates can operate normally on -// the handle. -// - WILL_BE_DELETED flag: An handle is marked for deletion when an operation -// decides the handle should be deleted. This happens either when the last -// reference to a handle is released (and the release operation is instructed -// to delete on last reference) or on when a delete operation is called on -// the item. This flag is needed because an externally referenced handle -// can't be immediately deleted. In these cases, the flag will be later read -// and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is -// used not only to defer deletions, but also as a barrier for external -// references: once WILL_BE_DELETED is set, lookups (which are the most -// common way to acquire new external references) will ignore the handle. -// For this reason, when WILL_BE_DELETED is set, we say the handle is -// invisible (and, otherwise, that it's visible). +// Construction -> Visible (in Insert): This can be a simple assignment to the +// metadata word because the current thread has exclusive ownership and other +// metadata is meaningless. // +// Visible -> Invisible (in Erase): This can be a bitwise-and while holding +// a shared reference, which is safe because the change is idempotent (in case +// of parallel Erase). By the way, we never go Invisible->Visible. // -// 3. HASHING AND COLLISION RESOLUTION +// Shareable -> Construction (in Evict part of Insert, in Erase, and in +// Release if Invisible): This is for starting to freeing/deleting an +// unreferenced entry. We have to use compare_exchange to ensure we only make +// this transition when there are zero refs. // -// ClockCache uses an open-addressed hash table to store the handles. -// We use a variant of tombstones to manage collisions: every slot keeps a -// count of how many KV pairs that are currently in the cache have probed the -// slot in an attempt to insert. Probes are generated with double-hashing -// (although the code can be easily modified to use other probing schemes, like -// linear probing). +// Construction -> Empty (in same places): This is for completing free/delete +// of an entry. A "release" atomic store suffices, as we have exclusive +// ownership of the slot but have to ensure none of the data member reads are +// re-ordered after committing the state transition. // -// A slot in the hash table can be in a few different states: -// - Element: The slot contains an element. This is indicated with the -// IS_ELEMENT flag. 
Element can be sub-classified depending on the -// value of WILL_BE_DELETED: -// * Visible element. -// * Invisible element. -// - Tombstone: The slot doesn't contain an element, but there is some other -// element that probed this slot during its insertion. -// - Empty: The slot is unused---it's neither an element nor a tombstone. +// Insert +// ------ +// If Insert were to guarantee replacing an existing entry for a key, there +// would be complications for concurrency and efficiency. First, consider how +// many probes to get to an entry. To ensure Lookup never waits and +// availability of a key is uninterrupted, we would need to use a different +// slot for a new entry for the same key. This means it is most likely in a +// later probing position than the old version, which should soon be removed. +// (Also, an entry is too big to replace atomically, even if no current refs.) // -// A slot cycles through the following sequence of states: -// empty or tombstone --> visible element --> invisible element --> -// empty or tombstone. Initially a slot is available---it's either -// empty or a tombstone. As soon as a KV pair is written into the slot, it -// becomes a visible element. At some point, the handle will be deleted -// by an explicit delete operation, the eviction algorithm, or an overwriting -// insert. In either case, the handle is marked for deletion. When the an -// attempt to delete the element finally succeeds, the slot is freed up -// and becomes available again. +// However, overwrite capability is not really needed by RocksDB. Also, we +// know from our "redundant" stats that overwrites are very rare for the block +// cache, so we should not spend much to make them effective. // +// So instead we Insert as soon as we find an empty slot in the probing +// sequence without seeing an existing (visible) entry for the same key. This +// way we only insert if we can improve the probing performance, and we don't +// need to probe beyond our insert position, assuming we are willing to let +// the previous entry for the same key die of old age (eventual eviction from +// not being used). We can reach a similar state with concurrent insertions, +// where one will pass over the other while it is "under construction." +// This temporary duplication is acceptable for RocksDB block cache because +// we know redundant insertion is rare. // -// 4. CONCURRENCY +// Another problem to solve is what to return to the caller when we find an +// existing entry whose probing position we cannot improve on, or when the +// table occupancy limit has been reached. If strict_capacity_limit=false, +// we must never fail Insert, and if a Handle* is provided, we have to return +// a usable Cache handle on success. The solution to this (typically rare) +// problem is "detached" handles, which are usable by the caller but not +// actually available for Lookup in the Cache. Detached handles are allocated +// independently on the heap and specially marked so that they are freed on +// the heap when their last reference is released. // -// ClockCache is lock-free. At a high level, we synchronize the operations -// using a read-prioritized, non-blocking variant of RW locks on every slot of -// the hash table. To do this we generalize the concept of reference: -// - Internal reference: Taken by a thread that is attempting to read a slot -// or do a very precise type of update. -// - Exclusive reference: Taken by a thread that is attempting to write a -// a slot extensively. 
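// The "detached" handle fallback described above can be pictured with this
// standalone sketch. DetachedEntrySketch, MakeDetachedSketch, and
// FreeDetachedSketch are hypothetical names; the real DetachedInsert
// (declared on ClockCacheShard below) presumably does the analogous
// bookkeeping using ClockHandleMoreData, the `detached` flag, and the
// shard's detached-usage counter.

#include <cstddef>

struct DetachedEntrySketch {
  void* value = nullptr;
  size_t charge = 0;
  bool detached = false;  // mirrors ClockHandle::detached
};

inline DetachedEntrySketch* MakeDetachedSketch(void* value, size_t charge,
                                               size_t& detached_usage) {
  auto* h = new DetachedEntrySketch{value, charge, /*detached=*/true};
  detached_usage += charge;  // tracked separately from table usage
  return h;
}

inline void FreeDetachedSketch(DetachedEntrySketch* h,
                               size_t& detached_usage) {
  detached_usage -= h->charge;
  delete h;  // in the real cache this happens when the last ref is released
}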
+// Usage on capacity +// ----------------- +// Insert takes different approaches to usage tracking depending on +// strict_capacity_limit setting. If true, we enforce a kind of strong +// consistency where compare-exchange is used to ensure the usage number never +// exceeds its limit, and provide threads with an authoritative signal on how +// much "usage" they have taken ownership of. With strict_capacity_limit=false, +// we use a kind of "eventual consistency" where all threads Inserting to the +// same cache shard might race on reserving the same space, but the +// over-commitment will be worked out in later insertions. It is kind of a +// dance because we don't want threads racing each other too much on paying +// down the over-commitment (with eviction) either. // -// We defer the precise definitions to the comments in the code below. -// A crucial feature of our references is that attempting to take one never -// blocks the thread. Another important feature is that readers are -// prioritized, as they use extremely fast synchronization primitives---they -// use atomic arithmetic/bit operations, but no compare-and-swaps (which are -// much slower). +// Eviction +// -------- +// A key part of Insert is evicting some entries currently unreferenced to +// make room for new entries. The high-level eviction algorithm is described +// above, but the details are also interesting. A key part is parallelizing +// eviction with a single CLOCK pointer. This works by each thread working on +// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK- +// updating or evicting the incremented-over slot(s). To reduce contention at +// the cost of possibly evicting too much, each thread increments the clock +// pointer by 4, so commits to updating at least 4 slots per batch. As +// described above, a CLOCK update will decrement the "countdown" of +// unreferenced entries, or evict unreferenced entries with zero countdown. +// Referenced entries are not updated, because we (presumably) don't want +// long-referenced entries to age while referenced. Note however that we +// cannot distinguish transiently referenced entries from cache user +// references, so some CLOCK updates might be somewhat arbitrarily skipped. +// This is OK as long as it is rare enough that eviction order is still +// pretty good. // -// Internal references are used by threads to read slots during a probing -// sequence, making them the most common references (probing is performed -// in almost every operation, not just lookups). During a lookup, once -// the target element is found, and just before the handle is handed over -// to the user, an internal reference is converted into an external reference. -// During an update operation, once the target slot is found, an internal -// reference is converted into an exclusive reference. Interestingly, we -// can't atomically upgrade from internal to exclusive, or we may run into a -// deadlock. Releasing the internal reference and then taking an exclusive -// reference avoids the deadlock, but then the handle may change inbetween. -// One of the key observations we use in our implementation is that we can -// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED. +// There is no synchronization on the completion of the CLOCK updates, so it +// is theoretically possible for another thread to cycle back around and have +// two threads racing on CLOCK updates to the same slot. 
Thus, we cannot rely +// on any implied exclusivity to make the updates or eviction more efficient. +// These updates use an opportunistic compare-exchange (no loop), where a +// racing thread might cause the update to be skipped without retry, but in +// such case the update is likely not needed because the most likely update +// to an entry is that it has become referenced. (TODO: test efficiency of +// avoiding compare-exchange loop) // -// Distinguishing internal from external references is useful for two reasons: -// - Internal references are short lived, but external references are typically -// not. This is helpful when acquiring an exclusive ref: if there are any -// external references to the item, it's probably not worth waiting until -// they go away. -// - We can precisely determine when there are no more external references to a -// handle, and proceed to mark it for deletion. This is useful when users -// release external references. +// Release +// ------- +// In the common case, Release is a simple atomic increment of the release +// counter. There is a simple overflow check that only does another atomic +// update in extremely rare cases, so costs almost nothing. // +// If the Release specifies "not useful", we can instead decrement the +// acquire counter, which returns to the same CLOCK state as before Lookup +// or Ref. // -// 5. CLOCK ALGORITHM +// Adding a check for over-full cache on every release to zero-refs would +// likely be somewhat expensive, increasing read contention on cache shard +// metadata. Instead we are less aggressive about deleting entries right +// away in those cases. // -// The clock algorithm circularly sweeps through the hash table to find the next -// victim. Recall that handles that are referenced are not evictable; the clock -// algorithm never picks those. We use different clock priorities: NONE, LOW, -// MEDIUM and HIGH. Priorities LOW, MEDIUM and HIGH represent how close an -// element is from being evicted, LOW being the closest to evicted. NONE means -// the slot is not evictable. NONE priority is used in one of the following -// cases: -// (a) the slot doesn't contain an element, or -// (b) the slot contains an externally referenced element, or -// (c) the slot contains an element that used to be externally referenced, -// and the clock pointer has not swept through the slot since the element -// stopped being externally referenced. -// ---------------------------------------------------------------------------- +// However Release tries to immediately delete entries reaching zero refs +// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already +// marked invisible. Both of these are checks on values already in CPU +// registers so do not increase cross-CPU contention when not applicable. +// When applicable, they use a compare-exchange loop to take exclusive +// ownership of the slot for freeing the entry. These are rare cases +// that should not usually affect performance. +// +// Erase +// ----- +// Searches for an entry like Lookup but moves it to Invisible state if found. +// This state transition is with bit operations so is idempotent and safely +// done while only holding a shared "read" reference. Like Release, it makes +// a best effort to immediately release an Invisible entry that reaches zero +// refs, but there are some corner cases where it will only be freed by the +// clock eviction process. 
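// To illustrate the Release fast paths just described, here is a standalone
// sketch using the same 30-bit counter layout (ReleaseSketch is a
// hypothetical helper; the real code additionally handles counter overflow
// correction and the erase_if_last_ref / Invisible cases):

#include <atomic>
#include <cstdint>

inline void ReleaseSketch(std::atomic<uint64_t>& meta, bool useful) {
  constexpr int kCounterNumBits = 30;
  constexpr uint64_t kAcquireIncrement = uint64_t{1};
  constexpr uint64_t kReleaseIncrement = uint64_t{1} << kCounterNumBits;
  if (useful) {
    // Common case: bump the release counter. The ref count
    // (acquires - releases) drops by one and the countdown credit earned by
    // the preceding Lookup is kept.
    meta.fetch_add(kReleaseIncrement, std::memory_order_release);
  } else {
    // useful=false: take back the acquire increment instead, returning the
    // countdown clock to its pre-Lookup state.
    meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
  }
}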
+ +// ----------------------------------------------------------------------- // // The load factor p is a real number in (0, 1) such that at all // times at most a fraction p of all slots, without counting tombstones, -// are occupied by elements. This means that the probability that a -// random probe hits an empty slot is at most p, and thus at most 1/p probes +// are occupied by elements. This means that the probability that a random +// probe hits an occupied slot is at most p, and thus at most 1/p probes // are required on average. For example, p = 70% implies that between 1 and 2 // probes are needed on average (bear in mind that this reasoning doesn't -// consider the effects of clustering over time). +// consider the effects of clustering over time, which should be negligible +// with double hashing). // Because the size of the hash table is always rounded up to the next // power of 2, p is really an upper bound on the actual load factor---the // actual load factor is anywhere between p/2 and p. This is a bit wasteful, @@ -174,440 +295,119 @@ class ClockCacheTest; // Since space cost is dominated by the values (the LSM blocks), // overprovisioning the table with metadata only increases the total cache space // usage by a tiny fraction. -constexpr double kLoadFactor = 0.35; +constexpr double kLoadFactor = 0.7; // The user can exceed kLoadFactor if the sizes of the inserted values don't -// match estimated_value_size, or if strict_capacity_limit == false. To -// avoid a performance drop, we set a strict upper bound on the load factor. -constexpr double kStrictLoadFactor = 0.7; - -// Maximum number of spins when trying to acquire a ref. -// TODO(Guido) This value was set arbitrarily. Is it appropriate? -// What's the best way to bound the spinning? -constexpr uint32_t kSpinsPerTry = 100000; - -// Arbitrary seeds. -constexpr uint32_t kProbingSeed1 = 0xbc9f1d34; -constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5; - -struct ClockHandle { - void* value; - Cache::DeleterFn deleter; - uint32_t hash; - size_t total_charge; - std::array key_data; - - static constexpr uint8_t kIsElementOffset = 0; - static constexpr uint8_t kClockPriorityOffset = 1; - static constexpr uint8_t kIsHitOffset = 3; - static constexpr uint8_t kCachePriorityOffset = 4; - - enum Flags : uint8_t { - // Whether the slot is in use by an element. - IS_ELEMENT = 1 << kIsElementOffset, - // Clock priorities. Represents how close a handle is from being evictable. - CLOCK_PRIORITY = 3 << kClockPriorityOffset, - // Whether the handle has been looked up after its insertion. - HAS_HIT = 1 << kIsHitOffset, - // The value of Cache::Priority of the handle. - CACHE_PRIORITY = 1 << kCachePriorityOffset, - }; - - std::atomic flags; - - enum ClockPriority : uint8_t { - NONE = (0 << kClockPriorityOffset), - LOW = (1 << kClockPriorityOffset), - MEDIUM = (2 << kClockPriorityOffset), - HIGH = (3 << kClockPriorityOffset) - }; - - // The number of elements that hash to this slot or a lower one, but wind - // up in this slot or a higher one. - std::atomic displacements; - - static constexpr uint8_t kExternalRefsOffset = 0; - static constexpr uint8_t kSharedRefsOffset = 15; - static constexpr uint8_t kExclusiveRefOffset = 30; - static constexpr uint8_t kWillBeDeletedOffset = 31; - - enum Refs : uint32_t { - // Synchronization model: - // - An external reference guarantees that hash, value, key_data - // and the IS_ELEMENT flag are not modified. Doesn't allow - // any writes. 
- // - An internal reference has the same guarantees as an - // external reference, and additionally allows the following - // idempotent updates on the handle: - // * set CLOCK_PRIORITY to NONE; - // * set the HAS_HIT bit; - // * set the WILL_BE_DELETED bit. - // - A shared reference is either an external reference or an - // internal reference. - // - An exclusive reference guarantees that no other thread has a shared - // or exclusive reference to the handle, and allows writes - // on the handle. - - // Number of external references to the slot. - EXTERNAL_REFS = ((uint32_t{1} << 15) - 1) - << kExternalRefsOffset, // Bits 0, ..., 14 - // Number of internal references plus external references to the slot. - SHARED_REFS = ((uint32_t{1} << 15) - 1) - << kSharedRefsOffset, // Bits 15, ..., 29 - // Whether a thread has an exclusive reference to the slot. - EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30 - // Whether the handle will be deleted soon. When this bit is set, new - // internal references to this handle stop being accepted. - // External references may still be granted---they can be created from - // existing external references, or converting from existing internal - // references. - WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31 - - // Having these 4 fields in a single variable allows us to support the - // following operations efficiently: - // - Convert an internal reference into an external reference in a single - // atomic arithmetic operation. - // - Attempt to take a shared reference using a single atomic arithmetic - // operation. This is because we can increment the internal ref count - // as well as checking whether the entry is marked for deletion using a - // single atomic arithmetic operation (and one non-atomic comparison). - }; - - static constexpr uint32_t kOneInternalRef = 0x8000; - static constexpr uint32_t kOneExternalRef = 0x8001; - - std::atomic refs; +// match estimated_value_size, or in some rare cases with +// strict_capacity_limit == false. To avoid degenerate performance, we set a +// strict upper bound on the load factor. +constexpr double kStrictLoadFactor = 0.84; - // True iff the handle is allocated separately from hash table. - bool detached; - - ClockHandle() - : value(nullptr), - deleter(nullptr), - hash(0), - total_charge(0), - flags(0), - displacements(0), - refs(0), - detached(false) { - SetWillBeDeleted(false); - SetIsElement(false); - SetClockPriority(ClockPriority::NONE); - SetCachePriority(Cache::Priority::LOW); - key_data.fill(0); - } +using CacheKeyBytes = std::array; - // The copy ctor and assignment operator are only used to copy a handle - // for immediate deletion. (We need to copy because the slot may become - // re-used before the deletion is completed.) We only copy the necessary - // members to carry out the deletion. In particular, we don't need - // the atomic members. 
- ClockHandle(const ClockHandle& other) { *this = other; } - - void operator=(const ClockHandle& other) { - value = other.value; - deleter = other.deleter; - key_data = other.key_data; - hash = other.hash; - total_charge = other.total_charge; - } +struct ClockHandleBasicData { + void* value = nullptr; + Cache::DeleterFn deleter = nullptr; + CacheKeyBytes key = {}; + size_t total_charge = 0; - Slice key() const { return Slice(key_data.data(), kCacheKeySize); } + Slice KeySlice() const { return Slice(key.data(), kCacheKeySize); } - void FreeData() { + void FreeData() const { if (deleter) { - (*deleter)(key(), value); + (*deleter)(KeySlice(), value); } } +}; + +struct ClockHandleMoreData : public ClockHandleBasicData { + uint32_t hash = 0; +}; + +// Target size to be exactly a common cache line size (see static_assert in +// clock_cache.cc) +struct ALIGN_AS(64U) ClockHandle : public ClockHandleMoreData { + // Constants for handling the atomic `meta` word, which tracks most of the + // state of the handle. The meta word looks like this: + // low bits high bits + // ----------------------------------------------------------------------- + // | acquire counter | release counter | state marker | + // ----------------------------------------------------------------------- + + // For reading or updating counters in meta word. + static constexpr uint8_t kCounterNumBits = 30; + static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1; + + static constexpr uint8_t kAcquireCounterShift = 0; + static constexpr uint64_t kAcquireIncrement = uint64_t{1} + << kAcquireCounterShift; + static constexpr uint8_t kReleaseCounterShift = kCounterNumBits; + static constexpr uint64_t kReleaseIncrement = uint64_t{1} + << kReleaseCounterShift; + + // For reading or updating the state marker in meta word + static constexpr uint8_t kStateShift = 2U * kCounterNumBits; + + // Bits contribution to state marker. + // Occupied means any state other than empty + static constexpr uint8_t kStateOccupiedBit = 0b100; + // Shareable means the entry is reference counted (visible or invisible) + // (only set if also occupied) + static constexpr uint8_t kStateShareableBit = 0b010; + // Visible is only set if also shareable + static constexpr uint8_t kStateVisibleBit = 0b001; + + // Complete state markers (not shifted into full word) + static constexpr uint8_t kStateEmpty = 0b000; + static constexpr uint8_t kStateConstruction = kStateOccupiedBit; + static constexpr uint8_t kStateInvisible = + kStateOccupiedBit | kStateShareableBit; + static constexpr uint8_t kStateVisible = + kStateOccupiedBit | kStateShareableBit | kStateVisibleBit; + + // Constants for initializing the countdown clock. (Countdown clock is only + // in effect with zero refs, acquire counter == release counter, and in that + // case the countdown clock == both of those counters.) + static constexpr uint8_t kHighCountdown = 3; + static constexpr uint8_t kLowCountdown = 2; + static constexpr uint8_t kBottomCountdown = 1; + // During clock update, treat any countdown clock value greater than this + // value the same as this value. + static constexpr uint8_t kMaxCountdown = kHighCountdown; + // TODO: make these coundown values tuning parameters for eviction? + + // See above + std::atomic meta{}; + // The number of elements that hash to this slot or a lower one, but wind + // up in this slot or a higher one. + std::atomic displacements{}; - // Calculate the memory usage by metadata. 
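// Given the bit layout above, the Empty -> Construction transition described
// in the "Slot states" overview can be attempted with a single fetch_or, as
// in this standalone sketch (TryBeginConstructionSketch is a hypothetical
// helper, not a member of ClockHandle):

#include <atomic>
#include <cstdint>

inline bool TryBeginConstructionSketch(std::atomic<uint64_t>& meta) {
  constexpr int kStateShift = 60;  // 2 * 30 counter bits, as above
  constexpr uint64_t kOccupiedBit = uint64_t{0b100} << kStateShift;
  uint64_t old_meta = meta.fetch_or(kOccupiedBit, std::memory_order_acq_rel);
  // If the occupied bit was already set, the or changed nothing and the slot
  // belongs to someone else; ownership is taken only from a prior Empty state.
  return (old_meta & kOccupiedBit) == 0;
}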
- inline size_t CalcMetaCharge( - CacheMetadataChargePolicy metadata_charge_policy) const { - if (metadata_charge_policy != kFullChargeCacheMetadata) { - return 0; - } else { - // #ifdef ROCKSDB_MALLOC_USABLE_SIZE - // return malloc_usable_size( - // const_cast(static_cast(this))); - // #else - // TODO(Guido) malloc_usable_size only works when we call it on - // a pointer allocated with malloc. Because our handles are all - // allocated in a single shot as an array, the user can't call - // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle - // pointer returned by the cache. Moreover, malloc_usable_size - // expects a heap-allocated handle, but sometimes in our code we - // wish to pass a stack-allocated handle (this is only a performance - // concern). - // What is the right way to compute metadata charges with pre-allocated - // handles? - return sizeof(ClockHandle); - // #endif - } - } - - inline void CalcTotalCharge( - size_t charge, CacheMetadataChargePolicy metadata_charge_policy) { - total_charge = charge + CalcMetaCharge(metadata_charge_policy); - } - - inline size_t GetCharge( - CacheMetadataChargePolicy metadata_charge_policy) const { - size_t meta_charge = CalcMetaCharge(metadata_charge_policy); - assert(total_charge >= meta_charge); - return total_charge - meta_charge; - } - - // flags functions. - - bool IsElement() const { return flags & Flags::IS_ELEMENT; } - - void SetIsElement(bool is_element) { - if (is_element) { - flags |= Flags::IS_ELEMENT; - } else { - flags &= static_cast(~Flags::IS_ELEMENT); - } - } - - bool HasHit() const { return flags & HAS_HIT; } - - void SetHit() { flags |= HAS_HIT; } - - Cache::Priority GetCachePriority() const { - return static_cast(flags & CACHE_PRIORITY); - } - - void SetCachePriority(Cache::Priority priority) { - if (priority == Cache::Priority::HIGH) { - flags |= Flags::CACHE_PRIORITY; - } else { - flags &= static_cast(~Flags::CACHE_PRIORITY); - } - } - - bool IsInClock() const { - return GetClockPriority() != ClockHandle::ClockPriority::NONE; - } - - ClockPriority GetClockPriority() const { - return static_cast(flags & Flags::CLOCK_PRIORITY); - } - - void SetClockPriority(ClockPriority priority) { - flags &= static_cast(~Flags::CLOCK_PRIORITY); - flags |= priority; - } - - void DecreaseClockPriority() { - uint8_t p = static_cast(flags & Flags::CLOCK_PRIORITY) >> - kClockPriorityOffset; - assert(p > 0); - p--; - flags &= static_cast(~Flags::CLOCK_PRIORITY); - ClockPriority new_priority = - static_cast(p << kClockPriorityOffset); - flags |= new_priority; - } - - bool IsDetached() { return detached; } - - void SetDetached() { detached = true; } - - inline bool IsEmpty() const { - return !this->IsElement() && this->displacements == 0; - } - - inline bool IsTombstone() const { - return !this->IsElement() && this->displacements > 0; - } - - inline bool Matches(const Slice& some_key, uint32_t some_hash) const { - return this->hash == some_hash && this->key() == some_key; - } - - // refs functions. - - inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; } - - void SetWillBeDeleted(bool will_be_deleted) { - if (will_be_deleted) { - refs |= WILL_BE_DELETED; - } else { - refs &= ~WILL_BE_DELETED; - } - } - - uint32_t ExternalRefs() const { - return (refs & EXTERNAL_REFS) >> kExternalRefsOffset; - } - - // Tries to take an internal ref. Returns true iff it succeeds. 
- inline bool TryInternalRef() { - if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) { - return true; - } - refs -= kOneInternalRef; - return false; - } - - // Tries to take an external ref. Returns true iff it succeeds. - inline bool TryExternalRef() { - if (!((refs += kOneExternalRef) & EXCLUSIVE_REF)) { - return true; - } - refs -= kOneExternalRef; - return false; - } - - // Tries to take an exclusive ref. Returns true iff it succeeds. - // TODO(Guido) After every TryExclusiveRef call, we always call - // WillBeDeleted(). We could save an atomic read by having an output parameter - // with the last value of refs. - inline bool TryExclusiveRef() { - uint32_t will_be_deleted = refs & WILL_BE_DELETED; - uint32_t expected = will_be_deleted; - return refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted); - } - - // Repeatedly tries to take an exclusive reference, but aborts as soon - // as an external or exclusive reference is detected (since the wait - // would presumably be too long). - inline bool SpinTryExclusiveRef() { - uint32_t expected = 0; - uint32_t will_be_deleted = 0; - uint32_t spins = kSpinsPerTry; - while (!refs.compare_exchange_strong(expected, - EXCLUSIVE_REF | will_be_deleted) && - spins--) { - std::this_thread::yield(); - if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) { - return false; - } - will_be_deleted = expected & WILL_BE_DELETED; - expected = will_be_deleted; - } - return true; - } - - // Take an external ref, assuming there is already one external ref - // to the handle. - void Ref() { - // TODO(Guido) Is it okay to assume that the existing external reference - // survives until this function returns? - refs += kOneExternalRef; - } - - inline void ReleaseExternalRef() { refs -= kOneExternalRef; } - - inline void ReleaseInternalRef() { refs -= kOneInternalRef; } - - inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); } - - // Downgrade an exclusive ref to external. - inline void ExclusiveToExternalRef() { - refs += kOneExternalRef; - ReleaseExclusiveRef(); - } - - // Convert an internal ref into external. - inline void InternalToExternalRef() { - refs += kOneExternalRef - kOneInternalRef; - } - + // True iff the handle is allocated separately from hash table. + bool detached = false; }; // struct ClockHandle class ClockHandleTable { public: - explicit ClockHandleTable(size_t capacity, int hash_bits); + explicit ClockHandleTable(int hash_bits, bool initial_charge_metadata); ~ClockHandleTable(); - // Returns a pointer to a visible handle matching the key/hash, or - // nullptr if not present. When an actual handle is produced, an - // internal reference is handed over. - ClockHandle* Lookup(const Slice& key, uint32_t hash); - - // Inserts a copy of h into the hash table. Returns a pointer to the - // inserted handle, or nullptr if no available slot was found. Every - // existing visible handle matching the key is already present in the - // hash table is marked as WILL_BE_DELETED. The deletion is also attempted, - // and, if the attempt is successful, the handle is inserted into the - // autovector deleted. When take_reference is true, the function hands - // over an external reference on the handle, and otherwise no reference is - // produced. - ClockHandle* Insert(ClockHandle* h, autovector* deleted, - bool take_reference); - - // Assigns h the appropriate clock priority, making it evictable. - void ClockOn(ClockHandle* h); - - // Makes h non-evictable. 
- void ClockOff(ClockHandle* h); - - // Runs the clock eviction algorithm until usage_ + charge is at most - // capacity_. - void ClockRun(size_t charge); - - // Remove h from the hash table. Requires an exclusive ref to h. - void Remove(ClockHandle* h, autovector* deleted); - - // Remove from the hash table all handles with matching key/hash along a - // probe sequence, starting from the given probe number. Doesn't - // require any references. - void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe, - autovector* deleted); - - void RemoveAll(const Slice& key, uint32_t hash, - autovector* deleted) { - uint32_t probe = 0; - RemoveAll(key, hash, probe, deleted); - } + Status Insert(const ClockHandleMoreData& proto, ClockHandle** handle, + Cache::Priority priority, size_t capacity, + bool strict_capacity_limit); - // Tries to remove h from the hash table. If the attempt is successful, - // the function hands over an exclusive ref to h. - bool TryRemove(ClockHandle* h, autovector* deleted); - - // Similar to TryRemove, except that it spins, increasing the chances of - // success. Requires that the caller thread has no shared ref to h. - bool SpinTryRemove(ClockHandle* h, autovector* deleted); - - // Call this function after an Insert, Remove, RemoveAll, TryRemove - // or SpinTryRemove. It frees the deleted values and updates the hash table - // metadata. - void Free(autovector* deleted); - - void ApplyToEntriesRange(std::function func, - uint32_t index_begin, uint32_t index_end, - bool apply_if_will_be_deleted) { - for (uint32_t i = index_begin; i < index_end; i++) { - ClockHandle* h = &array_[i]; - if (h->TryExclusiveRef()) { - if (h->IsElement() && - (apply_if_will_be_deleted || !h->WillBeDeleted())) { - func(h); - } - h->ReleaseExclusiveRef(); - } - } - } + ClockHandle* Lookup(const CacheKeyBytes& key, uint32_t hash); + + bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref); - void ConstApplyToEntriesRange(std::function func, + void Ref(ClockHandle& handle); + + void Erase(const CacheKeyBytes& key, uint32_t hash); + + void ConstApplyToEntriesRange(std::function func, uint32_t index_begin, uint32_t index_end, - bool apply_if_will_be_deleted) const { - for (uint32_t i = index_begin; i < index_end; i++) { - ClockHandle* h = &array_[i]; - // We take an external ref because we are handing over control - // to a user-defined function, and because the handle will not be - // modified. - if (h->TryExternalRef()) { - if (h->IsElement() && - (apply_if_will_be_deleted || !h->WillBeDeleted())) { - func(h); - } - h->ReleaseExternalRef(); - } - } - } + bool apply_if_will_be_deleted) const; + + void EraseUnRefEntries(); uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; } @@ -615,22 +415,29 @@ class ClockHandleTable { uint32_t GetOccupancyLimit() const { return occupancy_limit_; } - uint32_t GetOccupancy() const { return occupancy_; } + uint32_t GetOccupancy() const { + return occupancy_.load(std::memory_order_relaxed); + } - size_t GetUsage() const { return usage_; } + size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); } - size_t GetCapacity() const { return capacity_; } + size_t GetDetachedUsage() const { + return detached_usage_.load(std::memory_order_relaxed); + } - void SetCapacity(size_t capacity) { capacity_ = capacity; } + // Acquire/release N references + void TEST_RefN(ClockHandle& handle, size_t n); + void TEST_ReleaseN(ClockHandle* handle, size_t n); + private: // functions // Returns x mod 2^{length_bits_}. 
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; } - private: - // Extracts the element information from a handle (src), and assigns it - // to a hash table slot (dst). Doesn't touch displacements and refs, - // which are maintained by the hash table algorithm. - void Assign(ClockHandle* dst, ClockHandle* src); + // Runs the clock eviction algorithm trying to reclaim at least + // requested_charge. Returns how much is evicted, which could be less + // if it appears impossible to evict the requested amount without blocking. + void Evict(size_t requested_charge, size_t* freed_charge, + uint32_t* freed_count); // Returns the first slot in the probe sequence, starting from the given // probe number, with a handle e such that match(e) is true. At every @@ -643,26 +450,17 @@ class ClockHandleTable { // value of probe is one more than the last non-aborting probe during the // call. This is so that that the variable can be used to keep track of // progress across consecutive calls to FindSlot. - inline ClockHandle* FindSlot(const Slice& key, + inline ClockHandle* FindSlot(uint32_t hash, std::function match, std::function stop, std::function update, uint32_t& probe); - // Returns an available slot for the given key. All copies of the - // key found along the probing sequence until an available slot is - // found are marked for deletion. On each of them, a deletion is - // attempted, and when the attempt succeeds the slot is assigned to - // the new copy of the element. - ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash, - uint32_t& probe, - autovector* deleted); - - // After a failed FindSlot call (i.e., with answer -1) in - // FindAvailableSlot, this function fixes all displacements's - // starting from the 0-th probe, until the given probe. - void Rollback(const Slice& key, uint32_t probe); + // Re-decrement all displacements in probe path starting from beginning + // until (not including) the given handle + void Rollback(uint32_t hash, const ClockHandle* h); + private: // data // Number of hash bits used for table index. // The size of the table is 1 << length_bits_. const int length_bits_; @@ -673,27 +471,26 @@ class ClockHandleTable { // Maximum number of elements the user can store in the table. const uint32_t occupancy_limit_; - // Maximum total charge of all elements stored in the table. - size_t capacity_; + // Array of slots comprising the hash table. + const std::unique_ptr array_; // We partition the following members into different cache lines // to avoid false sharing among Lookup, Release, Erase and Insert // operations in ClockCacheShard. - ALIGN_AS(CACHE_LINE_SIZE) - // Array of slots comprising the hash table. - std::unique_ptr array_; - ALIGN_AS(CACHE_LINE_SIZE) // Clock algorithm sweep pointer. - std::atomic clock_pointer_; + std::atomic clock_pointer_{}; ALIGN_AS(CACHE_LINE_SIZE) // Number of elements in the table. - std::atomic occupancy_; + std::atomic occupancy_{}; - // Memory size for entries residing in the cache. - std::atomic usage_; + // Memory usage by entries tracked by the cache (including detached) + std::atomic usage_{}; + + // Part of usage by detached entries (not in table) + std::atomic detached_usage_{}; }; // class ClockHandleTable // A single shard of sharded cache. 
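// The probing arithmetic behind FindSlot and Rollback can be sketched
// standalone: with a power-of-two table, "mod table size" is a bit mask, and
// double hashing advances by an odd increment so the sequence visits every
// slot before repeating. ModTableSizeSketch and ProbeSlotSketch are
// hypothetical helpers, and the increment derivation here is illustrative
// only; the real FindSlot also threads match/stop/update callbacks and a
// shared probe counter through the loop.

#include <cstdint>

inline uint32_t ModTableSizeSketch(uint32_t x, int length_bits) {
  return x & ((uint32_t{1} << length_bits) - 1);
}

inline uint32_t ProbeSlotSketch(uint32_t hash, uint32_t probe,
                                int length_bits) {
  uint32_t base = ModTableSizeSketch(hash, length_bits);
  // Any odd increment is coprime with the power-of-two table size, so all
  // slots are reachable.
  uint32_t increment = (hash >> 16) | 1u;
  return ModTableSizeSketch(base + probe * increment, length_bits);
}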
@@ -704,58 +501,34 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { CacheMetadataChargePolicy metadata_charge_policy); ~ClockCacheShard() override = default; - // Separate from constructor so caller can easily make an array of ClockCache - // if current usage is more than new capacity, the function will attempt to - // free the needed space. + // TODO: document limitations void SetCapacity(size_t capacity) override; - // Set the flag to reject insertion if cache if full. void SetStrictCapacityLimit(bool strict_capacity_limit) override; - // Like Cache methods, but with an extra "hash" parameter. - // Insert an item into the hash table and, if handle is null, make it - // evictable by the clock algorithm. Older items are evicted as necessary. - // If the cache is full and free_handle_on_fail is true, the item is deleted - // and handle is set to nullptr. Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, Cache::Priority priority) override; - Status Insert(const Slice& key, uint32_t hash, void* value, - const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, Cache::Priority priority) override { - return Insert(key, hash, value, charge, helper->del_cb, handle, priority); - } - - Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* /*helper*/, - const Cache::CreateCallback& /*create_cb*/, - Cache::Priority /*priority*/, bool /*wait*/, - Statistics* /*stats*/) override { - return Lookup(key, hash); - } - Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; - bool Release(Cache::Handle* handle, bool /*useful*/, - bool erase_if_last_ref) override { - return Release(handle, erase_if_last_ref); - } + bool Release(Cache::Handle* handle, bool useful, + bool erase_if_last_ref) override; - bool IsReady(Cache::Handle* /*handle*/) override { return true; } - - void Wait(Cache::Handle* /*handle*/) override {} + bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; bool Ref(Cache::Handle* handle) override; - bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; - void Erase(const Slice& key, uint32_t hash) override; size_t GetUsage() const override; size_t GetPinnedUsage() const override; + size_t GetOccupancyCount() const override; + + size_t GetTableAddressCount() const override; + void ApplyToSomeEntries( const std::function& callback, @@ -765,29 +538,48 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { std::string GetPrintableOptions() const override { return std::string{}; } - private: + // SecondaryCache not yet supported + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } + + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) override { + return Lookup(key, hash); + } + + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + + void Wait(Cache::Handle* /*handle*/) override {} + + // Acquire/release N references + void TEST_RefN(Cache::Handle* handle, size_t n); + void TEST_ReleaseN(Cache::Handle* handle, size_t n); + + private: // functions friend class ClockCache; friend 
class ClockCacheTest; - ClockHandle* DetachedInsert(ClockHandle* h); - - // Returns the charge of a single handle. - static size_t CalcEstimatedHandleCharge( - size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy); + ClockHandle* DetachedInsert(const ClockHandleMoreData& h); // Returns the number of bits used to hash an element in the hash // table. static int CalcHashBits(size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy); - // Whether to reject insertion if cache reaches its full capacity. - std::atomic strict_capacity_limit_; + private: // data + ClockHandleTable table_; - // Handles allocated separately from the table. - std::atomic detached_usage_; + // Maximum total charge of all elements stored in the table. + std::atomic capacity_; - ClockHandleTable table_; + // Whether to reject insertion if cache reaches its full capacity. + std::atomic strict_capacity_limit_; }; // class ClockCacheShard class ClockCache diff --git a/cache/fast_lru_cache.cc b/cache/fast_lru_cache.cc index 817f3be18..f5f93800d 100644 --- a/cache/fast_lru_cache.cc +++ b/cache/fast_lru_cache.cc @@ -173,13 +173,13 @@ inline int LRUHandleTable::FindSlot(const Slice& key, LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : capacity_(capacity), + : CacheShard(metadata_charge_policy), + capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), table_( CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)), usage_(0), lru_usage_(0) { - set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list. lru_.next = &lru_; lru_.prev = &lru_; @@ -525,6 +525,16 @@ size_t LRUCacheShard::GetPinnedUsage() const { return usage_ - lru_usage_; } +size_t LRUCacheShard::GetOccupancyCount() const { + DMutexLock l(mutex_); + return table_.GetOccupancy(); +} + +size_t LRUCacheShard::GetTableAddressCount() const { + DMutexLock l(mutex_); + return table_.GetTableSize(); +} + std::string LRUCacheShard::GetPrintableOptions() const { return std::string{}; } LRUCache::LRUCache(size_t capacity, size_t estimated_value_size, diff --git a/cache/fast_lru_cache.h b/cache/fast_lru_cache.h index a02422beb..77aff8bab 100644 --- a/cache/fast_lru_cache.h +++ b/cache/fast_lru_cache.h @@ -368,6 +368,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { size_t GetUsage() const override; size_t GetPinnedUsage() const override; + size_t GetOccupancyCount() const override; + size_t GetTableAddressCount() const override; void ApplyToSomeEntries( const std::function& secondary_cache) - : capacity_(0), + : CacheShard(metadata_charge_policy), + capacity_(0), high_pri_pool_usage_(0), low_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), @@ -128,7 +129,6 @@ LRUCacheShard::LRUCacheShard( lru_usage_(0), mutex_(use_adaptive_mutex), secondary_cache_(secondary_cache) { - set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list. 
lru_.next = &lru_; lru_.prev = &lru_; @@ -759,6 +759,16 @@ size_t LRUCacheShard::GetPinnedUsage() const { return usage_ - lru_usage_; } +size_t LRUCacheShard::GetOccupancyCount() const { + DMutexLock l(mutex_); + return table_.GetOccupancyCount(); +} + +size_t LRUCacheShard::GetTableAddressCount() const { + DMutexLock l(mutex_); + return size_t{1} << table_.GetLengthBits(); +} + std::string LRUCacheShard::GetPrintableOptions() const { const int kBufferSize = 200; char buffer[kBufferSize]; diff --git a/cache/lru_cache.h b/cache/lru_cache.h index b60d5ac7b..6e642d04d 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -305,6 +305,8 @@ class LRUHandleTable { int GetLengthBits() const { return length_bits_; } + size_t GetOccupancyCount() const { return elems_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -394,6 +396,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; + virtual size_t GetOccupancyCount() const override; + virtual size_t GetTableAddressCount() const override; virtual void ApplyToSomeEntries( const std::function( port::cacheline_aligned_alloc(sizeof(ClockCacheShard))); - new (shard_) ClockCacheShard(capacity, 1, true /*strict_capacity_limit*/, + new (shard_) ClockCacheShard(capacity, 1, strict_capacity_limit, kDontChargeCacheMetadata); } @@ -539,21 +539,26 @@ class ClockCacheTest : public testing::Test { return Insert(std::string(kCacheKeySize, key), priority); } - Status Insert(char key, size_t len) { return Insert(std::string(len, key)); } + Status InsertWithLen(char key, size_t len) { + return Insert(std::string(len, key)); + } - bool Lookup(const std::string& key) { + bool Lookup(const std::string& key, bool useful = true) { auto handle = shard_->Lookup(key, 0 /*hash*/); if (handle) { - shard_->Release(handle); + shard_->Release(handle, useful, /*erase_if_last_ref=*/false); return true; } return false; } - bool Lookup(char key) { return Lookup(std::string(kCacheKeySize, key)); } + bool Lookup(char key, bool useful = true) { + return Lookup(std::string(kCacheKeySize, key), useful); + } void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); } +#if 0 // FIXME size_t CalcEstimatedHandleChargeWrapper( size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { @@ -583,106 +588,419 @@ class ClockCacheTest : public testing::Test { (1 << (hash_bits - 1) <= max_occupancy); } } +#endif - private: ClockCacheShard* shard_ = nullptr; }; -TEST_F(ClockCacheTest, Validate) { +TEST_F(ClockCacheTest, Misc) { NewShard(3); - EXPECT_OK(Insert('a', 16)); - EXPECT_NOK(Insert('b', 15)); - EXPECT_OK(Insert('b', 16)); - EXPECT_NOK(Insert('c', 17)); - EXPECT_NOK(Insert('d', 1000)); - EXPECT_NOK(Insert('e', 11)); - EXPECT_NOK(Insert('f', 0)); -} -TEST_F(ClockCacheTest, ClockPriorityTest) { - ClockHandle handle; - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE); - handle.SetClockPriority(ClockHandle::ClockPriority::HIGH); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::HIGH); - handle.DecreaseClockPriority(); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM); - handle.DecreaseClockPriority(); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::LOW); - handle.SetClockPriority(ClockHandle::ClockPriority::MEDIUM); - EXPECT_EQ(handle.GetClockPriority(), 
ClockHandle::ClockPriority::MEDIUM); - handle.SetClockPriority(ClockHandle::ClockPriority::NONE); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE); - handle.SetClockPriority(ClockHandle::ClockPriority::MEDIUM); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM); - handle.DecreaseClockPriority(); - handle.DecreaseClockPriority(); - EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE); + // Key size stuff + EXPECT_OK(InsertWithLen('a', 16)); + EXPECT_NOK(InsertWithLen('b', 15)); + EXPECT_OK(InsertWithLen('b', 16)); + EXPECT_NOK(InsertWithLen('c', 17)); + EXPECT_NOK(InsertWithLen('d', 1000)); + EXPECT_NOK(InsertWithLen('e', 11)); + EXPECT_NOK(InsertWithLen('f', 0)); + + // Some of this is motivated by code coverage + std::string wrong_size_key(15, 'x'); + EXPECT_FALSE(Lookup(wrong_size_key)); + EXPECT_FALSE(shard_->Ref(nullptr)); + EXPECT_FALSE(shard_->Release(nullptr)); + shard_->Erase(wrong_size_key, /*hash*/ 42); // no-op } -TEST_F(ClockCacheTest, CalcHashBitsTest) { - size_t capacity; - size_t estimated_value_size; - double max_occupancy; - int hash_bits; - CacheMetadataChargePolicy metadata_charge_policy; +TEST_F(ClockCacheTest, Limits) { + NewShard(3, false /*strict_capacity_limit*/); + for (bool strict_capacity_limit : {false, true, false}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); - // Vary the cache capacity, fix the element charge. - for (int i = 0; i < 2048; i++) { - capacity = i; - estimated_value_size = 0; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, - metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); + // Also tests switching between strict limit and not + shard_->SetStrictCapacityLimit(strict_capacity_limit); + + std::string key(16, 'x'); + + // Single entry charge beyond capacity + { + Status s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, + 5 /*charge*/, nullptr /*deleter*/, + nullptr /*handle*/, Cache::Priority::LOW); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + } + + // Single entry fills capacity + { + Cache::Handle* h; + ASSERT_OK(shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 3 /*charge*/, + nullptr /*deleter*/, &h, Cache::Priority::LOW)); + // Try to insert more + Status s = Insert('a'); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + // Release entry filling capacity. + // Cover useful = false case. + shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/); + } + + // Insert more than table size can handle (cleverly using zero-charge + // entries) to exceed occupancy limit. 
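The zero-charge insertions in the block below target the occupancy limit rather than the charge limit: an open-addressed table can only track on the order of load_factor * GetTableAddressCount() entries no matter how cheap they are, so Insert must evict, refuse (Status::MemoryLimit under the strict limit, which the test checks via IsMemoryLimit()), or succeed without the table ever becoming completely full. A rough sketch of such a gate, assuming the limit is derived from the address count and a fixed load factor; the function name and constant are illustrative:

#include <cstddef>
#include "rocksdb/status.h"

// Illustrative only: the real shard derives its occupancy limit internally.
constexpr double kSketchLoadFactor = 0.7;

ROCKSDB_NAMESPACE::Status SketchOccupancyGate(size_t occupancy,
                                              size_t table_address_count,
                                              bool strict_capacity_limit) {
  const size_t occupancy_limit =
      static_cast<size_t>(table_address_count * kSketchLoadFactor);
  if (occupancy + 1 > occupancy_limit) {
    if (strict_capacity_limit) {
      // Matches what the test expects via IsMemoryLimit().
      return ROCKSDB_NAMESPACE::Status::MemoryLimit();
    }
    // Non-strict: the real Insert would try to evict here, and may proceed
    // without handing back a pinned handle.
  }
  return ROCKSDB_NAMESPACE::Status::OK();
}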
+ { + size_t n = shard_->GetTableAddressCount() + 1; + std::unique_ptr ha { new Cache::Handle* [n] {} }; + Status s; + for (size_t i = 0; i < n && s.ok(); ++i) { + EncodeFixed64(&key[0], i); + s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 0 /*charge*/, + nullptr /*deleter*/, &ha[i], Cache::Priority::LOW); + if (i == 0) { + EXPECT_OK(s); + } + } + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + // Same result if not keeping a reference + s = Insert('a'); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + + // Regardless, we didn't allow table to actually get full + EXPECT_LT(shard_->GetOccupancyCount(), shard_->GetTableAddressCount()); + + // Release handles + for (size_t i = 0; i < n; ++i) { + if (ha[i]) { + shard_->Release(ha[i]); + } + } + } } +} - // Fix the cache capacity, vary the element charge. - for (int i = 0; i < 1024; i++) { - capacity = 1024; - estimated_value_size = i; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, - metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); +TEST_F(ClockCacheTest, ClockEvictionTest) { + for (bool strict_capacity_limit : {false, true}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); + + NewShard(6, strict_capacity_limit); + EXPECT_OK(Insert('a', Cache::Priority::BOTTOM)); + EXPECT_OK(Insert('b', Cache::Priority::LOW)); + EXPECT_OK(Insert('c', Cache::Priority::HIGH)); + EXPECT_OK(Insert('d', Cache::Priority::BOTTOM)); + EXPECT_OK(Insert('e', Cache::Priority::LOW)); + EXPECT_OK(Insert('f', Cache::Priority::HIGH)); + + EXPECT_TRUE(Lookup('a', /*use*/ false)); + EXPECT_TRUE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_TRUE(Lookup('d', /*use*/ false)); + EXPECT_TRUE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + + // Ensure bottom are evicted first, even if new entries are low + EXPECT_OK(Insert('g', Cache::Priority::LOW)); + EXPECT_OK(Insert('h', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('a', /*use*/ false)); + EXPECT_TRUE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('d', /*use*/ false)); + EXPECT_TRUE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + // Mark g & h useful + EXPECT_TRUE(Lookup('g', /*use*/ true)); + EXPECT_TRUE(Lookup('h', /*use*/ true)); + + // Then old LOW entries + EXPECT_OK(Insert('i', Cache::Priority::LOW)); + EXPECT_OK(Insert('j', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + // Mark g & h useful once again + EXPECT_TRUE(Lookup('g', /*use*/ true)); + EXPECT_TRUE(Lookup('h', /*use*/ true)); + EXPECT_TRUE(Lookup('i', /*use*/ false)); + EXPECT_TRUE(Lookup('j', /*use*/ false)); + + // Then old HIGH entries + EXPECT_OK(Insert('k', Cache::Priority::LOW)); + EXPECT_OK(Insert('l', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('f', /*use*/ false)); + EXPECT_TRUE(Lookup('g', /*use*/ false)); + EXPECT_TRUE(Lookup('h', /*use*/ false)); + EXPECT_TRUE(Lookup('i', /*use*/ false)); + EXPECT_TRUE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(Lookup('k', /*use*/ false)); + 
EXPECT_TRUE(Lookup('l', /*use*/ false)); + + // Then the (roughly) least recently useful + EXPECT_OK(Insert('m', Cache::Priority::HIGH)); + EXPECT_OK(Insert('n', Cache::Priority::HIGH)); + + EXPECT_TRUE(Lookup('g', /*use*/ false)); + EXPECT_TRUE(Lookup('h', /*use*/ false)); + EXPECT_FALSE(Lookup('i', /*use*/ false)); + EXPECT_FALSE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(Lookup('k', /*use*/ false)); + EXPECT_TRUE(Lookup('l', /*use*/ false)); + + // Now try changing capacity down + shard_->SetCapacity(4); + // Insert to ensure evictions happen + EXPECT_OK(Insert('o', Cache::Priority::LOW)); + EXPECT_OK(Insert('p', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('g', /*use*/ false)); + EXPECT_FALSE(Lookup('h', /*use*/ false)); + EXPECT_FALSE(Lookup('k', /*use*/ false)); + EXPECT_FALSE(Lookup('l', /*use*/ false)); + EXPECT_TRUE(Lookup('m', /*use*/ false)); + EXPECT_TRUE(Lookup('n', /*use*/ false)); + EXPECT_TRUE(Lookup('o', /*use*/ false)); + EXPECT_TRUE(Lookup('p', /*use*/ false)); + + // Now try changing capacity up + EXPECT_TRUE(Lookup('m', /*use*/ true)); + EXPECT_TRUE(Lookup('n', /*use*/ true)); + shard_->SetCapacity(6); + EXPECT_OK(Insert('q', Cache::Priority::HIGH)); + EXPECT_OK(Insert('r', Cache::Priority::HIGH)); + EXPECT_OK(Insert('s', Cache::Priority::HIGH)); + EXPECT_OK(Insert('t', Cache::Priority::HIGH)); + + EXPECT_FALSE(Lookup('o', /*use*/ false)); + EXPECT_FALSE(Lookup('p', /*use*/ false)); + EXPECT_TRUE(Lookup('m', /*use*/ false)); + EXPECT_TRUE(Lookup('n', /*use*/ false)); + EXPECT_TRUE(Lookup('q', /*use*/ false)); + EXPECT_TRUE(Lookup('r', /*use*/ false)); + EXPECT_TRUE(Lookup('s', /*use*/ false)); + EXPECT_TRUE(Lookup('t', /*use*/ false)); } +} - // Zero-capacity cache, and only values have charge. - capacity = 0; - estimated_value_size = 1; - metadata_charge_policy = kDontChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); +void IncrementIntDeleter(const Slice& /*key*/, void* value) { + *reinterpret_cast(value) += 1; +} - // Zero-capacity cache, and only metadata has charge. 
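The eviction order pinned down above (BOTTOM before LOW before HIGH, and untouched entries before recently useful ones) is what a clock sweep produces when Lookup(..., useful=true) re-arms an entry for another pass. A textbook second-chance sketch of that sweep, not the shard's actual state machine, which also folds in priorities and reference counts:

#include <cstddef>
#include <cstdint>
#include <vector>

struct SketchSlot {
  bool occupied = false;
  bool useful = false;  // re-armed by a successful Lookup marked "useful"
};

// Advance the clock hand until a victim is found; a useful entry gets its bit
// cleared and survives one more sweep. Returns SIZE_MAX if nothing is
// evictable.
size_t SketchClockEvictOne(std::vector<SketchSlot>& slots, size_t& hand) {
  // Two full sweeps suffice: the first clears useful bits, the second must
  // find a victim if any occupied slot exists.
  for (size_t step = 0; step < 2 * slots.size(); ++step) {
    SketchSlot& s = slots[hand];
    const size_t current = hand;
    hand = (hand + 1) % slots.size();
    if (!s.occupied) {
      continue;
    }
    if (s.useful) {
      s.useful = false;  // second chance
      continue;
    }
    s.occupied = false;
    return current;
  }
  return SIZE_MAX;
}

Priorities can then be modeled as different initial counter values, which would explain why BOTTOM entries fall out before LOW and HIGH entries of the same age.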
- capacity = 0; - estimated_value_size = 0; - metadata_charge_policy = kFullChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); +// Testing calls to CorrectNearOverflow in Release +TEST_F(ClockCacheTest, ClockCounterOverflowTest) { + NewShard(6, /*strict_capacity_limit*/ false); + Cache::Handle* h; + int deleted = 0; + std::string my_key(kCacheKeySize, 'x'); + uint32_t my_hash = 42; + ASSERT_OK(shard_->Insert(my_key, my_hash, &deleted, 1, IncrementIntDeleter, + &h, Cache::Priority::HIGH)); + + // Some large number outstanding + shard_->TEST_RefN(h, 123456789); + // Simulate many lookup/ref + release, plenty to overflow counters + for (int i = 0; i < 10000; ++i) { + shard_->TEST_RefN(h, 1234567); + shard_->TEST_ReleaseN(h, 1234567); + } + // Mark it invisible (to reach a different CorrectNearOverflow() in Release) + shard_->Erase(my_key, my_hash); + // Simulate many more lookup/ref + release (one-by-one would be too + // expensive for unit test) + for (int i = 0; i < 10000; ++i) { + shard_->TEST_RefN(h, 1234567); + shard_->TEST_ReleaseN(h, 1234567); + } + // Free all but last 1 + shard_->TEST_ReleaseN(h, 123456789); + // Still alive + ASSERT_EQ(deleted, 0); + // Free last ref, which will finalize erasure + shard_->Release(h); + // Deleted + ASSERT_EQ(deleted, 1); +} - // Small cache, large elements. - capacity = 1024; - estimated_value_size = 8192; - metadata_charge_policy = kFullChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); +// This test is mostly to exercise some corner case logic, by forcing two +// keys to have the same hash, and more +TEST_F(ClockCacheTest, CollidingInsertEraseTest) { + NewShard(6, /*strict_capacity_limit*/ false); + int deleted = 0; + std::string key1(kCacheKeySize, 'x'); + std::string key2(kCacheKeySize, 'y'); + std::string key3(kCacheKeySize, 'z'); + uint32_t my_hash = 42; + Cache::Handle* h1; + ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, &h1, + Cache::Priority::HIGH)); + Cache::Handle* h2; + ASSERT_OK(shard_->Insert(key2, my_hash, &deleted, 1, IncrementIntDeleter, &h2, + Cache::Priority::HIGH)); + Cache::Handle* h3; + ASSERT_OK(shard_->Insert(key3, my_hash, &deleted, 1, IncrementIntDeleter, &h3, + Cache::Priority::HIGH)); + + // Can repeatedly lookup+release despite the hash collision + Cache::Handle* tmp_h; + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key1, my_hash); + ASSERT_EQ(h1, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Make h1 invisible + shard_->Erase(key1, my_hash); + // Redundant erase + shard_->Erase(key1, my_hash); + + // All still alive + ASSERT_EQ(deleted, 0); + + // Invisible to Lookup + tmp_h = shard_->Lookup(key1, my_hash); + ASSERT_EQ(nullptr, tmp_h); + + // Can still find h2, h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, my_hash); + 
ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Also Insert with invisible entry there + ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, + nullptr, Cache::Priority::HIGH)); + tmp_h = shard_->Lookup(key1, my_hash); + // Found but distinct handle + ASSERT_NE(nullptr, tmp_h); + ASSERT_NE(h1, tmp_h); + ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true)); + + // tmp_h deleted + ASSERT_EQ(deleted--, 1); + + // Release last ref on h1 (already invisible) + ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false)); + + // h1 deleted + ASSERT_EQ(deleted--, 1); + h1 = nullptr; + + // Can still find h2, h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Release last ref on h2 + ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false)); + + // h2 still not deleted (unreferenced in cache) + ASSERT_EQ(deleted, 0); + + // Can still find it + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(h2, tmp_h); + + // Release last ref on h2, with erase + ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true)); + + // h2 deleted + ASSERT_EQ(deleted--, 1); + tmp_h = shard_->Lookup(key2, my_hash); + ASSERT_EQ(nullptr, tmp_h); + + // Can still find h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } - // Large capacity. - capacity = 31924172; - estimated_value_size = 8192; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = - CalcMaxOccupancy(capacity, estimated_value_size, metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); + // Release last ref on h3, without erase + ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false)); + + // h3 still not deleted (unreferenced in cache) + ASSERT_EQ(deleted, 0); + + // Explicit erase + shard_->Erase(key3, my_hash); + + // h3 deleted + ASSERT_EQ(deleted--, 1); + tmp_h = shard_->Lookup(key3, my_hash); + ASSERT_EQ(nullptr, tmp_h); +} + +// This uses the public API to effectively test CalcHashBits etc. +TEST_F(ClockCacheTest, TableSizesTest) { + for (size_t est_val_size : {1U, 5U, 123U, 2345U, 345678U}) { + SCOPED_TRACE("est_val_size = " + std::to_string(est_val_size)); + for (double est_count : {1.1, 2.2, 511.9, 512.1, 2345.0}) { + SCOPED_TRACE("est_count = " + std::to_string(est_count)); + size_t capacity = static_cast(est_val_size * est_count); + // kDontChargeCacheMetadata + auto cache = ExperimentalNewClockCache( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, kDontChargeCacheMetadata); + // Table sizes are currently only powers of two + EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0); + EXPECT_EQ(cache->GetUsage(), 0); + + // kFullChargeMetaData + // Because table sizes are currently only powers of two, sizes get + // really weird when metadata is a huge portion of capacity. For example, + // doubling the table size could cut by 90% the space available to + // values. 
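The collision test also pins down the deferred-deletion contract: Erase only hides an entry from Lookup, and the deleter (IncrementIntDeleter here) runs once the last reference is released, or immediately on a Release with erase_if_last_ref when no other references remain. A simplified single-threaded sketch of that life cycle; the field names are illustrative and the real ClockHandle packs this state into atomics:

#include <cstdint>

// Illustrative handle state for the erase-while-referenced life cycle.
struct SketchHandle {
  uint64_t refs = 0;
  bool visible = true;  // cleared by Erase even while references remain
  void (*deleter)(void* value) = nullptr;
  void* value = nullptr;
};

// Mirrors the Release(handle, erase_if_last_ref) contract exercised above:
// returns true only when the entry is actually destroyed.
bool SketchRelease(SketchHandle& h, bool erase_if_last_ref) {
  --h.refs;
  if (h.refs == 0 && (!h.visible || erase_if_last_ref)) {
    if (h.deleter != nullptr) {
      h.deleter(h.value);
    }
    return true;  // slot is free for reuse
  }
  // Either still referenced, or left in the cache for future Lookups.
  return false;
}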
Therefore, we omit those weird cases for now. + if (est_val_size >= 512) { + cache = ExperimentalNewClockCache( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, kFullChargeCacheMetadata); + double est_count_after_meta = + (capacity - cache->GetUsage()) * 1.0 / est_val_size; + EXPECT_GE(cache->GetTableAddressCount(), + est_count_after_meta / kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), + est_count_after_meta / kLoadFactor * 2.0); + } + } + } } } // namespace clock_cache diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 6a5fbebdc..3e6d6a4f7 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -213,9 +213,9 @@ std::string ShardedCache::GetPrintableOptions() const { ret.append(GetShard(0)->GetPrintableOptions()); return ret; } -int GetDefaultCacheShardBits(size_t capacity) { + +int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) { int num_shard_bits = 0; - size_t min_shard_size = 512L * 1024L; // Every shard is at least 512KB. size_t num_shards = capacity / min_shard_size; while (num_shards >>= 1) { if (++num_shard_bits >= 6) { @@ -230,4 +230,21 @@ int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } +size_t ShardedCache::GetOccupancyCount() const { + size_t oc = 0; + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { + oc += GetShard(s)->GetOccupancyCount(); + } + return oc; +} +size_t ShardedCache::GetTableAddressCount() const { + size_t tac = 0; + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { + tac += GetShard(s)->GetTableAddressCount(); + } + return tac; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index c0bb60a21..8713d1dce 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -20,7 +20,8 @@ namespace ROCKSDB_NAMESPACE { // Single cache shard interface. class CacheShard { public: - CacheShard() = default; + explicit CacheShard(CacheMetadataChargePolicy metadata_charge_policy) + : metadata_charge_policy_(metadata_charge_policy) {} virtual ~CacheShard() = default; using DeleterFn = Cache::DeleterFn; @@ -47,6 +48,8 @@ class CacheShard { virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; virtual size_t GetUsage() const = 0; virtual size_t GetPinnedUsage() const = 0; + virtual size_t GetOccupancyCount() const = 0; + virtual size_t GetTableAddressCount() const = 0; // Handles iterating over roughly `average_entries_per_lock` entries, using // `state` to somehow record where it last ended up. Caller initially uses // *state == 0 and implementation sets *state = UINT32_MAX to indicate @@ -57,13 +60,9 @@ class CacheShard { uint32_t average_entries_per_lock, uint32_t* state) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } - void set_metadata_charge_policy( - CacheMetadataChargePolicy metadata_charge_policy) { - metadata_charge_policy_ = metadata_charge_policy; - } protected: - CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata; + const CacheMetadataChargePolicy metadata_charge_policy_; }; // Generic cache interface which shards cache by hash of keys. 
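GetDefaultCacheShardBits now takes min_shard_size instead of hard-coding the 512KB floor, and the arithmetic in the loop above is easy to spot-check by hand. A small usage sketch; the include path follows the RocksDB tree and the expected values are worked out informally:

#include <cstddef>
#include "cache/sharded_cache.h"

// Spot checks of the shard sizing heuristic (capped at 6 bits = 64 shards).
void SketchShardBits() {
  using ROCKSDB_NAMESPACE::GetDefaultCacheShardBits;
  // 8MB / 512KB = 16 shards -> 4 shard bits.
  int bits_8mb = GetDefaultCacheShardBits(8 << 20, 512 * 1024);
  // 32MB / 512KB = 64 shards -> hits the cap of 6 shard bits.
  int bits_32mb = GetDefaultCacheShardBits(32 << 20, 512 * 1024);
  // Raising the minimum shard size to 1MB halves the shard count: 3 bits.
  int bits_big_shards = GetDefaultCacheShardBits(8 << 20, 1024 * 1024);
  (void)bits_8mb;
  (void)bits_32mb;
  (void)bits_big_shards;
}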
2^num_shard_bits @@ -106,6 +105,8 @@ class ShardedCache : public Cache { virtual size_t GetUsage() const override; virtual size_t GetUsage(Handle* handle) const override; virtual size_t GetPinnedUsage() const override; + virtual size_t GetOccupancyCount() const override; + virtual size_t GetTableAddressCount() const override; virtual void ApplyToAllEntries( const std::function& callback, @@ -127,6 +128,8 @@ class ShardedCache : public Cache { std::atomic last_id_; }; -extern int GetDefaultCacheShardBits(size_t capacity); +// 512KB is traditional minimum shard size. +int GetDefaultCacheShardBits(size_t capacity, + size_t min_shard_size = 512U * 1024U); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 699873e9f..d550c5225 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -939,11 +939,15 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) { for (std::shared_ptr base_cache : {NewLRUCache(capacity, num_shard_bits), ExperimentalNewClockCache( - capacity, 1 /*estimated_value_size*/, num_shard_bits, - false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy), - NewFastLRUCache(capacity, 1 /*estimated_value_size*/, num_shard_bits, - false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy)}) { + capacity, + BlockBasedTableOptions().block_size /*estimated_value_size*/, + num_shard_bits, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy), + NewFastLRUCache( + capacity, + BlockBasedTableOptions().block_size /*estimated_value_size*/, + num_shard_bits, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy)}) { if (!base_cache) { // Skip clock cache when not supported continue; @@ -1298,10 +1302,11 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { for (bool partition : {false, true}) { for (std::shared_ptr cache : {NewLRUCache(capacity), - ExperimentalNewClockCache(capacity, 1 /*estimated_value_size*/, - -1 /*num_shard_bits*/, - false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy)}) { + ExperimentalNewClockCache( + capacity, + BlockBasedTableOptions().block_size /*estimated_value_size*/, + -1 /*num_shard_bits*/, false /*strict_capacity_limit*/, + kDefaultCacheMetadataChargePolicy)}) { if (!cache) { // Skip clock cache when not supported continue; diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 3b2aba22f..5a8a24324 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -671,6 +671,9 @@ void InternalStats::CacheEntryRoleStats::BeginCollection( << port::GetProcessID(); cache_id = str.str(); cache_capacity = cache->GetCapacity(); + cache_usage = cache->GetUsage(); + table_size = cache->GetTableAddressCount(); + occupancy = cache->GetOccupancyCount(); } void InternalStats::CacheEntryRoleStats::EndCollection( @@ -695,6 +698,8 @@ std::string InternalStats::CacheEntryRoleStats::ToString( std::ostringstream str; str << "Block cache " << cache_id << " capacity: " << BytesToHumanString(cache_capacity) + << " usage: " << BytesToHumanString(cache_usage) + << " table_size: " << table_size << " occupancy: " << occupancy << " collections: " << collection_count << " last_copies: " << copies_of_last_collection << " last_secs: " << (GetLastDurationMicros() / 1000000.0) diff --git a/db/internal_stats.h b/db/internal_stats.h index 73c1f29e7..7091877bb 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -453,6 +453,9 @@ class InternalStats { // For use with CacheEntryStatsCollector struct CacheEntryRoleStats { uint64_t cache_capacity = 0; + 
uint64_t cache_usage = 0; + size_t table_size = 0; + size_t occupancy = 0; std::string cache_id; std::array total_charges; std::array entry_counts; diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 1e87f5f72..119cf959c 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -404,6 +404,16 @@ class Cache { // Returns the memory size for the entries residing in the cache. virtual size_t GetUsage() const = 0; + // Returns the number of entries currently tracked in the table. SIZE_MAX + // means "not supported." This is used for inspecting the load factor, along + // with GetTableAddressCount(). + virtual size_t GetOccupancyCount() const { return SIZE_MAX; } + + // Returns the number of ways the hash function is divided for addressing + // entries. Zero means "not supported." This is used for inspecting the load + // factor, along with GetOccupancyCount(). + virtual size_t GetTableAddressCount() const { return 0; } + // Returns the memory size for a specific entry in the cache. virtual size_t GetUsage(Handle* handle) const = 0; diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 67671a960..50c143f5f 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -560,7 +560,7 @@ DEFINE_bool(universal_incremental, false, DEFINE_int64(cache_size, 8 << 20, // 8MB "Number of bytes to use as a cache of uncompressed data"); -DEFINE_int32(cache_numshardbits, 6, +DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache" " is 2 ** cache_numshardbits. Negative means use default settings." " This is applied only if FLAGS_cache_size is non-negative."); @@ -3618,6 +3618,9 @@ class Benchmark { } fresh_db = true; method = &Benchmark::TimeSeries; + } else if (name == "block_cache_entry_stats") { + // DB::Properties::kBlockCacheEntryStats + PrintStats("rocksdb.block-cache-entry-stats"); } else if (name == "stats") { PrintStats("rocksdb.stats"); } else if (name == "resetstats") {
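Because the base-class defaults above are sentinels (SIZE_MAX for occupancy, 0 for the address count), a generic consumer such as a stats printer should check for "not supported" before dividing. A hedged helper in that spirit, using only the public Cache API declared above:

#include <cstddef>
#include <cstdint>
#include "rocksdb/cache.h"

// Returns the cache's table load factor in [0, 1], or a negative value when
// the implementation does not report occupancy / table address counts.
double LoadFactorOrNegative(const ROCKSDB_NAMESPACE::Cache& cache) {
  const size_t occupancy = cache.GetOccupancyCount();
  const size_t table_size = cache.GetTableAddressCount();
  if (occupancy == SIZE_MAX || table_size == 0) {
    return -1.0;  // not supported by this Cache implementation
  }
  return static_cast<double>(occupancy) / static_cast<double>(table_size);
}

With -cache_numshardbits defaulting to -1, db_bench falls back to the default shard sizing, and the block_cache_entry_stats entry prints the usage, table_size, and occupancy fields carried by CacheEntryRoleStats through rocksdb.block-cache-entry-stats.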