Towards a production-quality ClockCache (#10418)

Summary: In this PR we bring ClockCache closer to production quality. We implement the following changes: 1. Fixed a few bugs in ClockCache. 2. ClockCache now fully supports ``strict_capacity_limit == false``: When an insertion over capacity is requested, we allocate a handle separately from the hash table. 3. ClockCache now runs on almost every test in cache_test. The only exceptions are a test that requires the LRU policy and a test that dynamically increases the table capacity. 4. ClockCache now supports dynamically decreasing capacity via SetCapacity. (This is easy: we shrink the capacity upper bound and run the clock algorithm.) 5. Old FastLRUCache tests in lru_cache_test.cc are now also used on ClockCache. As a byproduct of 1. and 2. we are able to turn on ClockCache in the stress tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10418 Test Plan: - ``make -j24 USE_CLANG=1 COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 check`` - ``make -j24 USE_CLANG=1 COMPILE_WITH_TSAN=1 check`` - ``make -j24 USE_CLANG=1 COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CRASH_TEST_EXT_ARGS="--duration=960 --cache_type=clock_cache" blackbox_crash_test_with_atomic_flush`` - ``make -j24 USE_CLANG=1 COMPILE_WITH_TSAN=1 CRASH_TEST_EXT_ARGS="--duration=960 --cache_type=clock_cache" blackbox_crash_test_with_atomic_flush`` Reviewed By: pdillinger Differential Revision: D38170673 Pulled By: guidotag fbshipit-source-id: 508987b9dc9d9d68f1a03eefac769820b680340a
3 years ago · 9d7de6517c
parent 8db8b98f98
commit 9d7de6517c
6 changed files with 350 additions and 193 deletions
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@ -23,6 +23,11 @@
 #include "util/coding.h"
 #include "util/string_util.h"

+// FastLRUCache and ClockCache only support 16-byte keys, so some of
+// the tests originally written for LRUCache do not work on the other caches.
+// Those tests were adapted to use 16-byte keys. We kept the original ones.
+// TODO: Remove the original tests if they ever become unused.
+
 namespace ROCKSDB_NAMESPACE {

 namespace {
@ -58,17 +63,22 @@ int DecodeValue(void* v) {
  return static_cast<int>(reinterpret_cast<uintptr_t>(v));
 }

-const std::string kLRU = "lru";
-const std::string kClock = "clock";
-const std::string kFast = "fast";
+void DumbDeleter(const Slice& /*key*/, void* /*value*/) {}

-void dumbDeleter(const Slice& /*key*/, void* /*value*/) {}
-
-void eraseDeleter(const Slice& /*key*/, void* value) {
+void EraseDeleter1(const Slice& /*key*/, void* value) {
  Cache* cache = reinterpret_cast<Cache*>(value);
  cache->Erase("foo");
 }

+void EraseDeleter2(const Slice& /*key*/, void* value) {
+  Cache* cache = reinterpret_cast<Cache*>(value);
+  cache->Erase(EncodeKey16Bytes(1234));
+}
+
+const std::string kLRU = "lru";
+const std::string kClock = "clock";
+const std::string kFast = "fast";
+
 }  // anonymous namespace

 class CacheTest : public testing::TestWithParam<std::string> {
@ -223,13 +233,9 @@ class LRUCacheTest : public CacheTest {};

 TEST_P(CacheTest, UsageTest) {
  auto type = GetParam();
-  if (type == kFast || type == kClock) {
-    ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
-    return;
-  }

  // cache is std::shared_ptr and will be automatically cleaned up.
-  const uint64_t kCapacity = 100000;
+  const size_t kCapacity = 100000;
  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
  auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata);
  ASSERT_EQ(0, cache->GetUsage());
@ -239,12 +245,17 @@ TEST_P(CacheTest, UsageTest) {
  char value[10] = "abcdef";
  // make sure everything will be cached
  for (int i = 1; i < 100; ++i) {
-    std::string key(i, 'a');
+    std::string key;
+    if (type == kLRU) {
+      key = std::string(i, 'a');
+    } else {
+      key = EncodeKey(i);
+    }
    auto kv_size = key.size() + 5;
    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
-                            dumbDeleter));
+                            DumbDeleter));
    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    kv_size, dumbDeleter));
+                                    kv_size, DumbDeleter));
    usage += kv_size;
    ASSERT_EQ(usage, cache->GetUsage());
    ASSERT_LT(usage, precise_cache->GetUsage());
@ -256,12 +267,17 @@ TEST_P(CacheTest, UsageTest) {
  ASSERT_EQ(0, precise_cache->GetUsage());

  // make sure the cache will be overloaded
-  for (uint64_t i = 1; i < kCapacity; ++i) {
-    auto key = std::to_string(i);
+  for (size_t i = 1; i < kCapacity; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::to_string(i);
+    } else {
+      key = EncodeKey(static_cast<int>(1000 + i));
+    }
    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
-                            dumbDeleter));
+                            DumbDeleter));
    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    key.size() + 5, dumbDeleter));
+                                    key.size() + 5, DumbDeleter));
  }

  // the usage should be close to the capacity
@ -271,15 +287,18 @@ TEST_P(CacheTest, UsageTest) {
  ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage());
 }

+// TODO: This test takes longer than expected on ClockCache. This is
+// because the value size estimate at construction is too sloppy.
+// Fix this.
+// Why is it so slow? The cache is constructed with an estimate of 1, but
+// then the charge is claimed to be 21. This will cause the hash table
+// to be extremely sparse, which in turn means clock needs to scan too
+// many slots to find victims.
 TEST_P(CacheTest, PinnedUsageTest) {
  auto type = GetParam();
-  if (type == kFast || type == kClock) {
-    ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
-    return;
-  }

  // cache is std::shared_ptr and will be automatically cleaned up.
-  const uint64_t kCapacity = 200000;
+  const size_t kCapacity = 200000;
  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
  auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata);

@ -292,15 +311,20 @@ TEST_P(CacheTest, PinnedUsageTest) {
  // Add entries. Unpin some of them after insertion. Then, pin some of them
  // again. Check GetPinnedUsage().
  for (int i = 1; i < 100; ++i) {
-    std::string key(i, 'a');
+    std::string key;
+    if (type == kLRU) {
+      key = std::string(i, 'a');
+    } else {
+      key = EncodeKey(i);
+    }
    auto kv_size = key.size() + 5;
    Cache::Handle* handle;
    Cache::Handle* handle_in_precise_cache;
    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
-                            dumbDeleter, &handle));
+                            DumbDeleter, &handle));
    assert(handle);
    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    kv_size, dumbDeleter,
+                                    kv_size, DumbDeleter,
                                    &handle_in_precise_cache));
    assert(handle_in_precise_cache);
    pinned_usage += kv_size;
@ -334,12 +358,17 @@ TEST_P(CacheTest, PinnedUsageTest) {
  ASSERT_LT(pinned_usage, precise_cache_pinned_usage);

  // check that overloading the cache does not change the pinned usage
-  for (uint64_t i = 1; i < 2 * kCapacity; ++i) {
-    auto key = std::to_string(i);
+  for (size_t i = 1; i < 2 * kCapacity; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::to_string(i);
+    } else {
+      key = EncodeKey(static_cast<int>(1000 + i));
+    }
    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
-                            dumbDeleter));
+                            DumbDeleter));
    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    key.size() + 5, dumbDeleter));
+                                    key.size() + 5, DumbDeleter));
  }
  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
@ -447,7 +476,7 @@ TEST_P(CacheTest, EvictionPolicy) {
  Insert(200, 201);

  // Frequently used entry must be kept around
-  for (int i = 0; i < kCacheSize * 2; i++) {
+  for (int i = 0; i < 2 * kCacheSize; i++) {
    Insert(1000+i, 2000+i);
    ASSERT_EQ(101, Lookup(100));
  }
@ -500,9 +529,7 @@ TEST_P(CacheTest, EvictionPolicyRef) {
  Insert(303, 104);

  // Insert entries much more than cache capacity.
-  double load_factor =
-      std::min(fast_lru_cache::kLoadFactor, clock_cache::kLoadFactor);
-  for (int i = 0; i < 2 * static_cast<int>(kCacheSize / load_factor); i++) {
+  for (int i = 0; i < 100 * kCacheSize; i++) {
    Insert(1000 + i, 2000 + i);
  }

@ -533,31 +560,40 @@ TEST_P(CacheTest, EvictionPolicyRef) {

 TEST_P(CacheTest, EvictEmptyCache) {
  auto type = GetParam();
-  if (type == kFast || type == kClock) {
-    ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
-    return;
-  }

  // Insert item large than capacity to trigger eviction on empty cache.
  auto cache = NewCache(1, 0, false);
-  ASSERT_OK(cache->Insert("foo", nullptr, 10, dumbDeleter));
+  if (type == kLRU) {
+    ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter));
+  } else {
+    ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter));
+  }
 }

 TEST_P(CacheTest, EraseFromDeleter) {
  auto type = GetParam();
-  if (type == kFast || type == kClock) {
-    ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
-    return;
-  }

  // Have deleter which will erase item from cache, which will re-enter
  // the cache at that point.
  std::shared_ptr<Cache> cache = NewCache(10, 0, false);
-  ASSERT_OK(cache->Insert("foo", nullptr, 1, dumbDeleter));
-  ASSERT_OK(cache->Insert("bar", cache.get(), 1, eraseDeleter));
-  cache->Erase("bar");
-  ASSERT_EQ(nullptr, cache->Lookup("foo"));
-  ASSERT_EQ(nullptr, cache->Lookup("bar"));
+  std::string foo, bar;
+  Cache::DeleterFn erase_deleter;
+  if (type == kLRU) {
+    foo = "foo";
+    bar = "bar";
+    erase_deleter = EraseDeleter1;
+  } else {
+    foo = EncodeKey(1234);
+    bar = EncodeKey(5678);
+    erase_deleter = EraseDeleter2;
+  }
+
+  ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter));
+  ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter));
+
+  cache->Erase(bar);
+  ASSERT_EQ(nullptr, cache->Lookup(foo));
+  ASSERT_EQ(nullptr, cache->Lookup(bar));
 }

 TEST_P(CacheTest, ErasedHandleState) {
@ -590,9 +626,9 @@ TEST_P(CacheTest, HeavyEntries) {
  const int kHeavy = 10;
  int added = 0;
  int index = 0;
-  while (added < 2*kCacheSize) {
+  while (added < 2 * kCacheSize) {
    const int weight = (index & 1) ? kLight : kHeavy;
-    Insert(index, 1000+index, weight);
+    Insert(index, 1000 + index, weight);
    added += weight;
    index++;
  }
@ -603,7 +639,7 @@ TEST_P(CacheTest, HeavyEntries) {
    int r = Lookup(i);
    if (r >= 0) {
      cached_weight += weight;
-      ASSERT_EQ(1000+i, r);
+      ASSERT_EQ(1000 + i, r);
    }
  }
  ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
@ -615,7 +651,6 @@ TEST_P(CacheTest, NewId) {
  ASSERT_NE(a, b);
 }

-
 class Value {
 public:
  explicit Value(int v) : v_(v) {}
@ -664,7 +699,8 @@ TEST_P(CacheTest, SetCapacity) {
  auto type = GetParam();
  if (type == kFast || type == kClock) {
    ROCKSDB_GTEST_BYPASS(
-        "FastLRUCache and ClockCache don't support capacity adjustments.");
+        "FastLRUCache and ClockCache don't support arbitrary capacity "
+        "adjustments.");
    return;
  }
  // test1: increase capacity
@ -716,9 +752,9 @@ TEST_P(CacheTest, SetCapacity) {

 TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
  auto type = GetParam();
-  if (type == kFast || type == kClock) {
+  if (type == kFast) {
    ROCKSDB_GTEST_BYPASS(
-        "FastLRUCache and ClockCache don't support an unbounded number of "
+        "FastLRUCache only supports a limited number of "
        "inserts beyond "
        "capacity.");
    return;
@ -775,9 +811,8 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) {

 TEST_P(CacheTest, OverCapacity) {
  auto type = GetParam();
-  if (type == kFast || type == kClock) {
-    ROCKSDB_GTEST_BYPASS(
-        "FastLRUCache and ClockCache don't support capacity adjustments.");
+  if (type == kClock) {
+    ROCKSDB_GTEST_BYPASS("Requires LRU eviction policy.");
    return;
  }
  size_t n = 10;
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@ -69,10 +69,10 @@ ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) {
    // updates where it would be possible to combine into one CAS (more metadata
    // under one atomic field) or maybe two atomic updates (one arithmetic, one
    // bitwise). Something to think about optimizing.
-    e->InternalToExternalRef();
    e->SetHit();
    // The handle is now referenced, so we take it out of clock.
    ClockOff(e);
+    e->InternalToExternalRef();
  }

  return e;
@ -312,17 +312,20 @@ void ClockHandleTable::ClockRun(size_t charge) {
  // hot element, it will be hard to get an exclusive ref.
  // Do we need a mechanism to prevent an element from sitting
  // for a long time in cache waiting to be evicted?
-  assert(charge <= capacity_);
  autovector<ClockHandle> deleted;
  uint32_t max_iterations =
-      1 + static_cast<uint32_t>(GetTableSize() * kLoadFactor);
+      ClockHandle::ClockPriority::HIGH *
+      (1 +
+       static_cast<uint32_t>(
+           GetTableSize() *
+           kLoadFactor));  // It may take up to HIGH passes to evict an element.
  size_t usage_local = usage_;
-  while (usage_local + charge > capacity_ && max_iterations--) {
+  size_t capacity_local = capacity_;
+  while (usage_local + charge > capacity_local && max_iterations--) {
    uint32_t steps = 1 + static_cast<uint32_t>(1 / kLoadFactor);
    uint32_t clock_pointer_local = (clock_pointer_ += steps) - steps;
    for (uint32_t i = 0; i < steps; i++) {
      ClockHandle* h = &array_[ModTableSize(clock_pointer_local + i)];
-
      if (h->TryExclusiveRef()) {
        if (h->WillBeDeleted()) {
          Remove(h, &deleted);
@ -335,7 +338,6 @@ void ClockHandleTable::ClockRun(size_t charge) {
            // exclusive ref, we know we are in the latter case. This can only
            // happen when the last external reference to an element was
            // released, and the element was not immediately removed.
-
            ClockOn(h);
          }
          ClockHandle::ClockPriority priority = h->GetClockPriority();
@ -358,6 +360,7 @@ ClockCacheShard::ClockCacheShard(
    size_t capacity, size_t estimated_value_size, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy)
    : strict_capacity_limit_(strict_capacity_limit),
+      detached_usage_(0),
      table_(capacity, CalcHashBits(capacity, estimated_value_size,
                                    metadata_charge_policy)) {
  set_metadata_charge_policy(metadata_charge_policy);
@ -430,12 +433,16 @@ int ClockCacheShard::CalcHashBits(
  return FloorLog2((num_entries << 1) - 1);
 }

-void ClockCacheShard::SetCapacity(size_t /*capacity*/) {
+void ClockCacheShard::SetCapacity(size_t capacity) {
+  if (capacity > table_.GetCapacity()) {
    assert(false);  // Not supported.
+  }
+  table_.SetCapacity(capacity);
+  table_.ClockRun(detached_usage_);
 }

-void ClockCacheShard::SetStrictCapacityLimit(bool /*strict_capacity_limit*/) {
-  assert(false);  // Not supported.
+void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  strict_capacity_limit_ = strict_capacity_limit;
 }

 Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
@ -459,27 +466,32 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,

  Status s = Status::OK();

+  // Use a local copy to minimize cache synchronization.
+  size_t detached_usage = detached_usage_;
+
  // Free space with the clock policy until enough space is freed or there are
  // no evictable elements.
-  table_.ClockRun(tmp.total_charge);
+  table_.ClockRun(tmp.total_charge + detached_usage);

-  // occupancy_ and usage_ are contended members across concurrent updates
-  // on the same shard, so we use a single copy to reduce cache synchronization.
+  // Use local copies to minimize cache synchronization
+  // (occupancy_ and usage_ are read and written by all insertions).
  uint32_t occupancy_local = table_.GetOccupancy();
-  size_t usage_local = table_.GetUsage();
-  assert(occupancy_local <= table_.GetOccupancyLimit());
-
-  autovector<ClockHandle> deleted;
-
-  if ((usage_local + tmp.total_charge > table_.GetCapacity() &&
-       (strict_capacity_limit_ || handle == nullptr)) ||
-      occupancy_local > table_.GetOccupancyLimit()) {
+  size_t total_usage = table_.GetUsage() + detached_usage;
+
+  // TODO: Currently we support strict_capacity_limit == false as long as the
+  // number of pinned elements is below table_.GetOccupancyLimit(). We can
+  // always support it as follows: whenever we exceed this limit, we dynamically
+  // allocate a handle and return it (when the user provides a handle pointer,
+  // of course). Then, Release checks whether the handle was dynamically
+  // allocated, or is stored in the table.
+  if (total_usage + tmp.total_charge > table_.GetCapacity() &&
+      (strict_capacity_limit_ || handle == nullptr)) {
    if (handle == nullptr) {
      // Don't insert the entry but still return ok, as if the entry inserted
      // into cache and get evicted immediately.
-      deleted.push_back(tmp);
+      tmp.FreeData();
    } else {
-      if (occupancy_local > table_.GetOccupancyLimit()) {
+      if (occupancy_local + 1 > table_.GetOccupancyLimit()) {
        // TODO: Consider using a distinct status for this case, but usually
        // it will be handled the same way as reaching charge capacity limit
        s = Status::MemoryLimit(
@ -490,22 +502,33 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
            "capacity.");
      }
    }
+  } else {
+    ClockHandle* h;
+    if (occupancy_local + 1 > table_.GetOccupancyLimit()) {
+      // Even if the user wishes to overload the cache, we can't insert into
+      // the hash table. Instead, we dynamically allocate a new handle.
+      h = new ClockHandle();
+      *h = tmp;
+      h->SetDetached();
+      h->TryExternalRef();
+      detached_usage_ += h->total_charge;
+      // TODO: Return special status?
    } else {
      // Insert into the cache. Note that the cache might get larger than its
      // capacity if not enough space was freed up.
-    ClockHandle* h = table_.Insert(&tmp, &deleted, handle != nullptr);
-    assert(h != nullptr);  // The occupancy is way below the table size, so this
-                           // insertion should never fail.
-    if (handle != nullptr) {
-      *handle = reinterpret_cast<Cache::Handle*>(h);
-    }
-
+      autovector<ClockHandle> deleted;
+      h = table_.Insert(&tmp, &deleted, handle != nullptr);
+      assert(h != nullptr);  // The occupancy is way below the table size, so
+                             // this insertion should never fail.
      if (deleted.size() > 0) {
        s = Status::OkOverwritten();
      }
-  }
-
      table_.Free(&deleted);
+    }
+    if (handle != nullptr) {
+      *handle = reinterpret_cast<Cache::Handle*>(h);
+    }
+  }

  return s;
 }
@ -516,7 +539,7 @@ Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {

 bool ClockCacheShard::Ref(Cache::Handle* h) {
  ClockHandle* e = reinterpret_cast<ClockHandle*>(h);
-  assert(e->HasExternalRefs());
+  assert(e->ExternalRefs() > 0);
  return e->TryExternalRef();
 }

@ -530,6 +553,20 @@ bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
  }

  ClockHandle* h = reinterpret_cast<ClockHandle*>(handle);
+
+  if (UNLIKELY(h->IsDetached())) {
+    h->ReleaseExternalRef();
+    if (h->TryExclusiveRef()) {
+      // Only the last reference will succeed.
+      // Don't bother releasing the exclusive ref.
+      h->FreeData();
+      detached_usage_ -= h->total_charge;
+      delete h;
+      return true;
+    }
+    return false;
+  }
+
  uint32_t refs = h->refs;
  bool last_reference = ((refs & ClockHandle::EXTERNAL_REFS) == 1);
  bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED;
@ -570,13 +607,14 @@ size_t ClockCacheShard::GetPinnedUsage() const {

  table_.ConstApplyToEntriesRange(
      [&clock_usage](ClockHandle* h) {
-        if (h->HasExternalRefs()) {
+        if (h->ExternalRefs() > 1) {
+          // We check > 1 because we are holding an external ref.
          clock_usage += h->total_charge;
        }
      },
      0, table_.GetTableSize(), true);

-  return clock_usage;
+  return clock_usage + detached_usage_;
 }

 ClockCache::ClockCache(size_t capacity, size_t estimated_value_size,
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@ -9,6 +9,8 @@

 #pragma once

+#include <sys/types.h>
+
 #include <array>
 #include <atomic>
 #include <cstdint>
@ -28,6 +30,9 @@ namespace ROCKSDB_NAMESPACE {

 namespace clock_cache {

+// Forward declaration of friend class.
+class ClockCacheTest;
+
 // An experimental alternative to LRUCache, using a lock-free, open-addressed
 // hash table and clock eviction.

@ -63,10 +68,10 @@ namespace clock_cache {
 //    can't be immediately deleted. In these cases, the flag will be later read
 //    and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is
 //    used not only to defer deletions, but also as a barrier for external
-//    references: once WILL_BE_DELETED is set, lookups (which are the means to
-//    acquire new external references) will ignore the handle. For this reason,
-//    when WILL_BE_DELETED is set, we say the handle is invisible (and
-//    otherwise, that it's visible).
+//    references: once WILL_BE_DELETED is set, lookups (which are the most
+//    common way to acquire new external references) will ignore the handle.
+//    For this reason, when WILL_BE_DELETED is set, we say the handle is
+//    invisible (and, otherwise, that it's visible).
 //
 //
 // 3. HASHING AND COLLISION RESOLUTION
@ -192,10 +197,10 @@ struct ClockHandle {
  size_t total_charge;
  std::array<char, kCacheKeySize> key_data;

-  static constexpr uint8_t kIsElementOffset = 1;
-  static constexpr uint8_t kClockPriorityOffset = 2;
-  static constexpr uint8_t kIsHitOffset = 4;
-  static constexpr uint8_t kCachePriorityOffset = 5;
+  static constexpr uint8_t kIsElementOffset = 0;
+  static constexpr uint8_t kClockPriorityOffset = 1;
+  static constexpr uint8_t kIsHitOffset = 3;
+  static constexpr uint8_t kCachePriorityOffset = 4;

  enum Flags : uint8_t {
    // Whether the slot is in use by an element.
@ -252,9 +257,8 @@ struct ClockHandle {
    // Whether a thread has an exclusive reference to the slot.
    EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset,  // Bit 30
    // Whether the handle will be deleted soon. When this bit is set, new
-    // internal
-    // or external references to this handle stop being accepted.
-    // There is an exception: external references can be created from
+    // internal references to this handle stop being accepted.
+    // External references may still be granted---they can be created from
    // existing external references, or converting from existing internal
    // references.
    WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset  // Bit 31
@ -274,6 +278,9 @@ struct ClockHandle {

  std::atomic<uint32_t> refs;

+  // True iff the handle is allocated separately from the hash table.
+  bool detached;
+
  ClockHandle()
      : value(nullptr),
        deleter(nullptr),
@ -281,7 +288,8 @@ struct ClockHandle {
        total_charge(0),
        flags(0),
        displacements(0),
-        refs(0) {
+        refs(0),
+        detached(false) {
    SetWillBeDeleted(false);
    SetIsElement(false);
    SetClockPriority(ClockPriority::NONE);
@ -300,6 +308,7 @@ struct ClockHandle {
    value = other.value;
    deleter = other.deleter;
    key_data = other.key_data;
+    hash = other.hash;
    total_charge = other.total_charge;
  }

@ -350,13 +359,13 @@ struct ClockHandle {

  // flags functions.

-  bool IsElement() const { return flags & IS_ELEMENT; }
+  bool IsElement() const { return flags & Flags::IS_ELEMENT; }

  void SetIsElement(bool is_element) {
    if (is_element) {
-      flags |= IS_ELEMENT;
+      flags |= Flags::IS_ELEMENT;
    } else {
-      flags &= static_cast<uint8_t>(~IS_ELEMENT);
+      flags &= static_cast<uint8_t>(~Flags::IS_ELEMENT);
    }
  }

@ -400,6 +409,10 @@ struct ClockHandle {
    flags |= new_priority;
  }

+  bool IsDetached() { return detached; }
+
+  void SetDetached() { detached = true; }
+
  inline bool IsEmpty() const {
    return !this->IsElement() && this->displacements == 0;
  }
@ -424,7 +437,9 @@ struct ClockHandle {
    }
  }

-  bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }
+  uint32_t ExternalRefs() const {
+    return (refs & EXTERNAL_REFS) >> kExternalRefsOffset;
+  }

  // Tries to take an internal ref. Returns true iff it succeeds.
  inline bool TryInternalRef() {
@ -437,7 +452,7 @@ struct ClockHandle {

  // Tries to take an external ref. Returns true iff it succeeds.
  inline bool TryExternalRef() {
-    if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
+    if (!((refs += kOneExternalRef) & EXCLUSIVE_REF)) {
      return true;
    }
    refs -= kOneExternalRef;
@ -529,8 +544,8 @@ class ClockHandleTable {
  // Makes h non-evictable.
  void ClockOff(ClockHandle* h);

-  // Runs the clock eviction algorithm until there is enough space to
-  // insert an element with the given charge.
+  // Runs the clock eviction algorithm until usage_ + charge is at most
+  // capacity_.
  void ClockRun(size_t charge);

  // Remove h from the hash table. Requires an exclusive ref to h.
@ -548,8 +563,6 @@ class ClockHandleTable {
    RemoveAll(key, hash, probe, deleted);
  }

-  void Free(autovector<ClockHandle>* deleted);
-
  // Tries to remove h from the hash table. If the attempt is successful,
  // the function hands over an exclusive ref to h.
  bool TryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
@ -558,6 +571,11 @@ class ClockHandleTable {
  // success. Requires that the caller thread has no shared ref to h.
  bool SpinTryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);

+  // Call this function after an Insert, Remove, RemoveAll, TryRemove
+  // or SpinTryRemove. It frees the deleted values and updates the hash table
+  // metadata.
+  void Free(autovector<ClockHandle>* deleted);
+
  template <typename T>
  void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
                           bool apply_if_will_be_deleted) {
@ -579,12 +597,15 @@ class ClockHandleTable {
                                bool apply_if_will_be_deleted) const {
    for (uint32_t i = index_begin; i < index_end; i++) {
      ClockHandle* h = &array_[i];
-      if (h->TryExclusiveRef()) {
+      // We take an external ref because we are handing over control
+      // to a user-defined function, and because the handle will not be
+      // modified.
+      if (h->TryExternalRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
          func(h);
        }
-        h->ReleaseExclusiveRef();
+        h->ReleaseExternalRef();
      }
    }
  }
@ -601,6 +622,8 @@ class ClockHandleTable {

  size_t GetCapacity() const { return capacity_; }

+  void SetCapacity(size_t capacity) { capacity_ = capacity; }
+
  // Returns x mod 2^{length_bits_}.
  uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }

@ -652,7 +675,7 @@ class ClockHandleTable {
  const uint32_t occupancy_limit_;

  // Maximum total charge of all elements stored in the table.
-  const size_t capacity_;
+  size_t capacity_;

  // We partition the following members into different cache lines
  // to avoid false sharing among Lookup, Release, Erase and Insert
@ -745,6 +768,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {

 private:
  friend class ClockCache;
+  friend class ClockCacheTest;

  // Free some space following strict clock policy until enough space
  // to hold (usage_ + charge) is freed or there are no evictable elements.
@ -763,6 +787,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  // Whether to reject insertion if cache reaches its full capacity.
  std::atomic<bool> strict_capacity_limit_;

+  // Total charge of the handles allocated separately from the table.
+  std::atomic<size_t> detached_usage_;
+
  ClockHandleTable table_;
 };  // class ClockCacheShard

@ -797,6 +824,7 @@ class ClockCache

 private:
  ClockCacheShard* shards_ = nullptr;
+
  int num_shards_;
 };  // class ClockCache

--- a/cache/fast_lru_cache.cc
+++ b/cache/fast_lru_cache.cc
@ -299,10 +299,12 @@ int LRUCacheShard::CalcHashBits(
 }

 void LRUCacheShard::SetCapacity(size_t capacity) {
-  assert(false);  // Not supported. TODO(Guido) Support it?
  autovector<LRUHandle> last_reference_list;
  {
    DMutexLock l(mutex_);
+    if (capacity > capacity_) {
+      assert(false);  // Not supported.
+    }
    capacity_ = capacity;
    EvictFromLRU(0, &last_reference_list);
  }
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@ -207,6 +207,9 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
  ValidateLRUList({"e", "f", "g", "Z", "d"}, 2);
 }

+// TODO: FastLRUCache and ClockCache use the same tests. We can probably remove
+// them from FastLRUCache once ClockCache is production-ready, since we don't
+// plan to use or maintain FastLRUCache any more.
 namespace fast_lru_cache {

 // TODO(guido) Replicate LRU policy tests from LRUCache here.
@ -225,10 +228,10 @@ class FastLRUCacheTest : public testing::Test {

  void NewCache(size_t capacity) {
    DeleteCache();
-    cache_ = reinterpret_cast<fast_lru_cache::LRUCacheShard*>(
-        port::cacheline_aligned_alloc(sizeof(fast_lru_cache::LRUCacheShard)));
-    new (cache_) fast_lru_cache::LRUCacheShard(
-        capacity, 1 /*estimated_value_size*/, false /*strict_capacity_limit*/,
+    cache_ = reinterpret_cast<LRUCacheShard*>(
+        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
+    new (cache_) LRUCacheShard(capacity, 1 /*estimated_value_size*/,
+                               false /*strict_capacity_limit*/,
                               kDontChargeCacheMetadata);
  }

@ -243,25 +246,23 @@ class FastLRUCacheTest : public testing::Test {
  size_t CalcEstimatedHandleChargeWrapper(
      size_t estimated_value_size,
      CacheMetadataChargePolicy metadata_charge_policy) {
-    return fast_lru_cache::LRUCacheShard::CalcEstimatedHandleCharge(
-        estimated_value_size, metadata_charge_policy);
+    return LRUCacheShard::CalcEstimatedHandleCharge(estimated_value_size,
+                                                    metadata_charge_policy);
  }

  int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size,
                          CacheMetadataChargePolicy metadata_charge_policy) {
-    return fast_lru_cache::LRUCacheShard::CalcHashBits(
-        capacity, estimated_value_size, metadata_charge_policy);
+    return LRUCacheShard::CalcHashBits(capacity, estimated_value_size,
+                                       metadata_charge_policy);
  }

  // Maximum number of items that a shard can hold.
  double CalcMaxOccupancy(size_t capacity, size_t estimated_value_size,
                          CacheMetadataChargePolicy metadata_charge_policy) {
-    size_t handle_charge =
-        fast_lru_cache::LRUCacheShard::CalcEstimatedHandleCharge(
+    size_t handle_charge = LRUCacheShard::CalcEstimatedHandleCharge(
        estimated_value_size, metadata_charge_policy);
-    return capacity / (fast_lru_cache::kLoadFactor * handle_charge);
+    return capacity / (kLoadFactor * handle_charge);
  }
-
  bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) {
    if (hash_bits == 0) {
      return max_occupancy <= 1;
@ -272,7 +273,7 @@ class FastLRUCacheTest : public testing::Test {
  }

 private:
-  fast_lru_cache::LRUCacheShard* cache_ = nullptr;
+  LRUCacheShard* cache_ = nullptr;
 };

 TEST_F(FastLRUCacheTest, ValidateKeySize) {
@ -292,7 +293,6 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
  double max_occupancy;
  int hash_bits;
  CacheMetadataChargePolicy metadata_charge_policy;
-
  // Vary the cache capacity, fix the element charge.
  for (int i = 0; i < 2048; i++) {
    capacity = i;
@ -304,7 +304,6 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
                                    metadata_charge_policy);
    EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
  }
-
  // Fix the cache capacity, vary the element charge.
  for (int i = 0; i < 1024; i++) {
    capacity = 1024;
@ -316,7 +315,6 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
                                    metadata_charge_policy);
    EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
  }
-
  // Zero-capacity cache, and only values have charge.
  capacity = 0;
  estimated_value_size = 1;
@ -324,7 +322,6 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
                                  metadata_charge_policy);
  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */));
-
  // Zero-capacity cache, and only metadata has charge.
  capacity = 0;
  estimated_value_size = 0;
@ -332,7 +329,6 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
                                  metadata_charge_policy);
  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */));
-
  // Small cache, large elements.
  capacity = 1024;
  estimated_value_size = 8192;
@ -340,7 +336,6 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
                                  metadata_charge_policy);
  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */));
-
  // Large capacity.
  capacity = 31924172;
  estimated_value_size = 8192;
@ -402,37 +397,38 @@ class ClockCacheTest : public testing::Test {

  void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); }

-  // void ValidateLRUList(std::vector<std::string> keys,
-  //                      size_t num_high_pri_pool_keys = 0) {
-  // LRUHandle* lru;
-  // LRUHandle* lru_low_pri;
-  // cache_->TEST_GetLRUList(&lru, &lru_low_pri);
-  // LRUHandle* iter = lru;
-  // bool in_high_pri_pool = false;
-  // size_t high_pri_pool_keys = 0;
-  // if (iter == lru_low_pri) {
-  //   in_high_pri_pool = true;
-  // }
-  // for (const auto& key : keys) {
-  //   iter = iter->next;
-  //   ASSERT_NE(lru, iter);
-  //   ASSERT_EQ(key, iter->key().ToString());
-  //   ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool());
-  //   if (in_high_pri_pool) {
-  //     high_pri_pool_keys++;
-  //   }
-  //   if (iter == lru_low_pri) {
-  //     ASSERT_FALSE(in_high_pri_pool);
-  //     in_high_pri_pool = true;
-  //   }
-  // }
-  // ASSERT_EQ(lru, iter->next);
-  // ASSERT_TRUE(in_high_pri_pool);
-  // ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys);
-  // }
+  size_t CalcEstimatedHandleChargeWrapper(
+      size_t estimated_value_size,
+      CacheMetadataChargePolicy metadata_charge_policy) {
+    return ClockCacheShard::CalcEstimatedHandleCharge(estimated_value_size,
+                                                      metadata_charge_policy);
+  }
+
+  int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size,
+                          CacheMetadataChargePolicy metadata_charge_policy) {
+    return ClockCacheShard::CalcHashBits(capacity, estimated_value_size,
+                                         metadata_charge_policy);
+  }
+
+  // Max occupancy as a double: capacity / (kLoadFactor * handle charge).
+  double CalcMaxOccupancy(size_t capacity, size_t estimated_value_size,
+                          CacheMetadataChargePolicy metadata_charge_policy) {
+    size_t handle_charge = ClockCacheShard::CalcEstimatedHandleCharge(
+        estimated_value_size, metadata_charge_policy);
+    return capacity / (kLoadFactor * handle_charge);
+  }
+
+  bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) {
+    if (hash_bits == 0) {
+      return max_occupancy <= 1;
+    } else {
+      return (1 << hash_bits >= max_occupancy) &&
+             (1 << (hash_bits - 1) <= max_occupancy);
+    }
+  }

 private:
-  clock_cache::ClockCacheShard* shard_ = nullptr;
+  ClockCacheShard* shard_ = nullptr;
 };

 TEST_F(ClockCacheTest, Validate) {
@ -447,31 +443,89 @@ TEST_F(ClockCacheTest, Validate) {
 }

 TEST_F(ClockCacheTest, ClockPriorityTest) {
-  clock_cache::ClockHandle handle;
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::NONE);
-  handle.SetClockPriority(clock_cache::ClockHandle::ClockPriority::HIGH);
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::HIGH);
+  ClockHandle handle;
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE);
+  handle.SetClockPriority(ClockHandle::ClockPriority::HIGH);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::HIGH);
  handle.DecreaseClockPriority();
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::MEDIUM);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM);
  handle.DecreaseClockPriority();
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::LOW);
-  handle.SetClockPriority(clock_cache::ClockHandle::ClockPriority::MEDIUM);
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::MEDIUM);
-  handle.SetClockPriority(clock_cache::ClockHandle::ClockPriority::NONE);
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::NONE);
-  handle.SetClockPriority(clock_cache::ClockHandle::ClockPriority::MEDIUM);
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::MEDIUM);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::LOW);
+  handle.SetClockPriority(ClockHandle::ClockPriority::MEDIUM);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM);
+  handle.SetClockPriority(ClockHandle::ClockPriority::NONE);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE);
+  handle.SetClockPriority(ClockHandle::ClockPriority::MEDIUM);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::MEDIUM);
  handle.DecreaseClockPriority();
  handle.DecreaseClockPriority();
-  EXPECT_EQ(handle.GetClockPriority(),
-            clock_cache::ClockHandle::ClockPriority::NONE);
+  EXPECT_EQ(handle.GetClockPriority(), ClockHandle::ClockPriority::NONE);
+}
+
+TEST_F(ClockCacheTest, CalcHashBitsTest) {
+  size_t capacity;
+  size_t estimated_value_size;
+  double max_occupancy;
+  int hash_bits;
+  CacheMetadataChargePolicy metadata_charge_policy;
+
+  // Vary the cache capacity, fix the element charge.
+  for (int i = 0; i < 2048; i++) {
+    capacity = i;
+    estimated_value_size = 0;
+    metadata_charge_policy = kFullChargeCacheMetadata;
+    max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size,
+                                     metadata_charge_policy);
+    hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
+                                    metadata_charge_policy);
+    EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
+  }
+
+  // Fix the cache capacity, vary the element charge.
+  for (int i = 0; i < 1024; i++) {
+    capacity = 1024;
+    estimated_value_size = i;
+    metadata_charge_policy = kFullChargeCacheMetadata;
+    max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size,
+                                     metadata_charge_policy);
+    hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
+                                    metadata_charge_policy);
+    EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
+  }
+
+  // Zero-capacity cache, and only values have charge.
+  capacity = 0;
+  estimated_value_size = 1;
+  metadata_charge_policy = kDontChargeCacheMetadata;
+  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
+                                  metadata_charge_policy);
+  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */));
+
+  // Zero-capacity cache, and only metadata has charge.
+  capacity = 0;
+  estimated_value_size = 0;
+  metadata_charge_policy = kFullChargeCacheMetadata;
+  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
+                                  metadata_charge_policy);
+  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */));
+
+  // Small cache, large elements.
+  capacity = 1024;
+  estimated_value_size = 8192;
+  metadata_charge_policy = kFullChargeCacheMetadata;
+  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
+                                  metadata_charge_policy);
+  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */));
+
+  // Large capacity.
+  capacity = 31924172;
+  estimated_value_size = 8192;
+  metadata_charge_policy = kFullChargeCacheMetadata;
+  max_occupancy =
+      CalcMaxOccupancy(capacity, estimated_value_size, metadata_charge_policy);
+  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
+                                  metadata_charge_policy);
+  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
 }

 }  // namespace clock_cache
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -116,8 +116,8 @@ default_params = {
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "mock_direct_io": False,
-    "cache_type": "lru_cache",  # fast_lru_cache and clock_cache are currently incompatible
-                                # with stress tests, because they use strict_capacity_limit = false
+    "cache_type": lambda: random.choice(["lru_cache", "clock_cache"]),
+        # fast_lru_cache is left out of the choices above: it is incompatible with the stress tests because it does not support strict_capacity_limit == false.
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    # 999 -> use Bloom API