Fix use-after-free threading bug in ClockCache (#8261)

Summary: In testing for https://github.com/facebook/rocksdb/issues/8225 I found cache_bench would crash with -use_clock_cache, as well as db_bench -use_clock_cache, but not single-threaded. Smaller cache size hits failure much faster. ASAN reported the failuer as calling malloc_usable_size on the `key` pointer of a ClockCache handle after it was reportedly freed. On detailed inspection I found this bad sequence of operations for a cache entry: state=InCache=1,refs=1 [thread 1] Start ClockCacheShard::Unref (from Release, no mutex) [thread 1] Decrement ref count state=InCache=1,refs=0 [thread 1] Suspend before CalcTotalCharge (no mutex) [thread 2] Start UnsetInCache (from Insert, mutex held) [thread 2] clear InCache bit state=InCache=0,refs=0 [thread 2] Calls RecycleHandle (based on pre-updated state) [thread 2] Returns to Insert which calls Cleanup which deletes `key` [thread 1] Resume ClockCacheShard::Unref [thread 1] Read `key` in CalcTotalCharge To fix this, I've added a field to the handle to store the metadata charge so that we can efficiently remember everything we need from the handle in Unref. We must not read from the handle again if we decrement the count to zero with InCache=1, which means we don't own the entry and someone else could eject/overwrite it immediately. Note before this change, on amd64 sizeof(Handle) == 56 even though there are only 48 bytes of data. Grouping together the uint32_t fields would cut it down to 48, but I've added another uint32_t, which takes it back up to 56. Not a big deal. Also fixed DisownData to cooperate with ASAN as in LRUCache. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8261 Test Plan: Manual + adding use_clock_cache to db_crashtest.py Base performance ./cache_bench -use_clock_cache Complete in 17.060 s; QPS = 2458513 New performance ./cache_bench -use_clock_cache Complete in 17.052 s; QPS = 2459695 Any difference is easily buried in small noise. Crash test shows still more bug(s) in ClockCache, so I'm expecting to disable ClockCache from production code in a follow-up PR (if we can't find and fix the bug(s)) Reviewed By: mrambacher Differential Revision: D28207358 Pulled By: pdillinger fbshipit-source-id: aa7a9322afc6f18f30e462c75dbbe4a1206eb294
5 years ago · 3b981eaa1d
parent c70bae1b05
commit 3b981eaa1d
4 changed files with 47 additions and 18 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -19,6 +19,7 @@
 ### Public API change
 * Removed a parameter from TableFactory::NewTableBuilder, which should not be called by user code because TableBuilder is not a public API.
 * The `skip_filters` parameter to SstFileWriter is now considered deprecated. Use `BlockBasedTableOptions::filter_policy` to control generation of filters.
+* ClockCache is known to have bugs that could lead to crash or corruption, so should not be used until fixed. Use NewLRUCache instead.

 ## 6.20.0 (04/16/2021)
 ### Behavior Changes
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@ -176,10 +176,13 @@ namespace {
 // Cache entry meta data.
 struct CacheHandle {
  Slice key;
-  uint32_t hash;
  void* value;
  size_t charge;
  void (*deleter)(const Slice&, void* value);
+  uint32_t hash;
+
+  // Addition to "charge" to get "total charge" under metadata policy.
+  uint32_t meta_charge;

  // Flags and counters associated with the cache handle:
  //   lowest bit: in-cache bit
@ -205,9 +208,8 @@ struct CacheHandle {
    return *this;
  }

-  inline static size_t CalcTotalCharge(
-      Slice key, size_t charge,
-      CacheMetadataChargePolicy metadata_charge_policy) {
+  inline static uint32_t CalcMetadataCharge(
+      Slice key, CacheMetadataChargePolicy metadata_charge_policy) {
    size_t meta_charge = 0;
    if (metadata_charge_policy == kFullChargeCacheMetadata) {
      meta_charge += sizeof(CacheHandle);
@ -218,13 +220,11 @@ struct CacheHandle {
      meta_charge += key.size();
 #endif
    }
-    return charge + meta_charge;
+    assert(meta_charge <= UINT32_MAX);
+    return static_cast<uint32_t>(meta_charge);
  }

-  inline size_t CalcTotalCharge(
-      CacheMetadataChargePolicy metadata_charge_policy) {
-    return CalcTotalCharge(key, charge, metadata_charge_policy);
-  }
+  inline size_t GetTotalCharge() { return charge + meta_charge; }
 };

 // Key of hash map. We store hash value with the key for convenience.
@ -428,10 +428,8 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle,
  assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0);
  context->to_delete_key.push_back(handle->key.data());
  context->to_delete_value.emplace_back(*handle);
-  size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
-  handle->key.clear();
-  handle->value = nullptr;
-  handle->deleter = nullptr;
+  size_t total_charge = handle->GetTotalCharge();
+  // clearing `handle` fields would go here but not strictly required
  recycle_.push_back(handle);
  usage_.fetch_sub(total_charge, std::memory_order_relaxed);
 }
@ -459,7 +457,7 @@ bool ClockCacheShard::Ref(Cache::Handle* h) {
                                            std::memory_order_relaxed)) {
      if (CountRefs(flags) == 0) {
        // No reference count before the operation.
-        size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
+        size_t total_charge = handle->GetTotalCharge();
        pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
      }
      return true;
@ -473,6 +471,11 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
  if (set_usage) {
    handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed);
  }
+  // If the handle reaches state refs=0 and InCache=true after this
+  // atomic operation then we cannot access `handle` afterward, because
+  // it could be evicted before we access the `handle`.
+  size_t total_charge = handle->GetTotalCharge();
+
  // Use acquire-release semantics as previous operations on the cache entry
  // has to be order before reference count is decreased, and potential cleanup
  // of the entry has to be order after.
@ -480,7 +483,6 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
  assert(CountRefs(flags) > 0);
  if (CountRefs(flags) == 1) {
    // this is the last reference.
-    size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
    pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
    // Cleanup if it is the last reference.
    if (!InCache(flags)) {
@ -567,8 +569,9 @@ CacheHandle* ClockCacheShard::Insert(
    void (*deleter)(const Slice& key, void* value), bool hold_reference,
    CleanupContext* context, bool* overwritten) {
  assert(overwritten != nullptr && *overwritten == false);
-  size_t total_charge =
-      CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_);
+  uint32_t meta_charge =
+      CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_);
+  size_t total_charge = charge + meta_charge;
  MutexLock l(&mutex_);
  bool success = EvictFromCache(total_charge, context);
  bool strict = strict_capacity_limit_.load(std::memory_order_relaxed);
@ -594,8 +597,18 @@ CacheHandle* ClockCacheShard::Insert(
  handle->hash = hash;
  handle->value = value;
  handle->charge = charge;
+  handle->meta_charge = meta_charge;
  handle->deleter = deleter;
  uint32_t flags = hold_reference ? kInCacheBit + kOneRef : kInCacheBit;
+
+  // TODO investigate+fix suspected race condition:
+  // [thread 1] Lookup starts, up to Ref()
+  // [thread 2] Erase/evict the entry just looked up
+  // [thread 1] Ref() the handle, even though it's in the recycle bin
+  // [thread 2] Insert with recycling that handle
+  // Here we obliterate the other thread's Ref
+  // Possible fix: never blindly overwrite the flags, but only make
+  // relative updates (fetch_add, etc).
  handle->flags.store(flags, std::memory_order_relaxed);
  HashTable::accessor accessor;
  if (table_.find(accessor, CacheKey(key, hash))) {
@ -746,7 +759,17 @@ class ClockCache final : public ShardedCache {
    return reinterpret_cast<const CacheHandle*>(handle)->hash;
  }

-  void DisownData() override { shards_ = nullptr; }
+  void DisownData() override {
+#if defined(__clang__)
+#if !defined(__has_feature) || !__has_feature(address_sanitizer)
+    shards_ = nullptr;
+#endif
+#else  // __clang__
+#ifndef __SANITIZE_ADDRESS__
+    shards_ = nullptr;
+#endif  // !__SANITIZE_ADDRESS__
+#endif  // __clang__
+  }

 private:
  ClockCacheShard* shards_;
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@ -126,11 +126,15 @@ extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
 // more detail.
 //
 // Return nullptr if it is not supported.
+//
+// BROKEN: ClockCache is known to have bugs that could lead to crash or
+// corruption, so should not be used until fixed. Use NewLRUCache instead.
 extern std::shared_ptr<Cache> NewClockCache(
    size_t capacity, int num_shard_bits = -1,
    bool strict_capacity_limit = false,
    CacheMetadataChargePolicy metadata_charge_policy =
        kDefaultCacheMetadataChargePolicy);
+
 class Cache {
 public:
  // Depending on implementation, cache entries with high priority could be less
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -100,6 +100,7 @@ default_params = {
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "mock_direct_io": False,
+    "use_clock_cache": lambda: random.choice([0, 0, 0, 1]),
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    "use_ribbon_filter": lambda: random.randint(0, 1),