Some small improvements to HyperClockCache (#11601)

Summary:
Stacked on https://github.com/facebook/rocksdb/issues/11572
* Minimize use of std::function and lambdas to reduce the chance of the
compiler heap-allocating closures (unnecessary stress on the allocator). It
appears that converting FindSlot to a template enables inlining of the
lambda parameters, avoiding heap allocations (see the sketch after this list).
* Clean up some logic with FindSlot (FIXMEs from https://github.com/facebook/rocksdb/issues/11572)
* Fix handling of the rare case of probing all slots, with a new unit test.
(Previously, Insert would not roll back displacements in that case, which
would kill performance if it ever happened.)
* Add an -early_exit option to cache_bench for gathering memory stats
before deallocation.
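
For readers unfamiliar with the allocation issue, here is a minimal standalone sketch (illustrative only, not RocksDB's actual FindSlot; the function names `ProbeWithStdFunction` and `ProbeWithTemplate` are made up) of why passing callables as template parameters instead of `std::function` avoids heap-allocated closures:

```
// Illustrative sketch only (not RocksDB code).
#include <cstddef>
#include <functional>

// A probe loop taking its predicate via std::function: the lambda's type is
// erased, and if the captured state does not fit std::function's small-object
// buffer, constructing the std::function may heap-allocate a copy of the
// closure at every call site.
inline int ProbeWithStdFunction(const int* slots, size_t n,
                                const std::function<bool(int)>& match_fn) {
  for (size_t i = 0; i < n; ++i) {
    if (match_fn(slots[i])) {
      return static_cast<int>(i);
    }
  }
  return -1;
}

// The same loop with the callable as a template parameter: the lambda's
// concrete type is visible to the compiler, so the call can be inlined and
// the closure stays on the stack, with no type erasure and no heap allocation.
template <typename MatchFn>
inline int ProbeWithTemplate(const int* slots, size_t n, MatchFn match_fn) {
  for (size_t i = 0; i < n; ++i) {
    if (match_fn(slots[i])) {
      return static_cast<int>(i);
    }
  }
  return -1;
}
```

The new `-early_exit` flag complements this kind of measurement: presumably the intended invocation is along the lines of `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache -early_exit`, so that jemalloc prints its statistics while the cache's memory is still allocated.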

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11601

Test Plan:
Unit test added for probing all slots (ClockCacheTest.ClockTableFull in cache/lru_cache_test.cc).

## Seeing heap allocations
Run `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache`
on a build from before https://github.com/facebook/rocksdb/issues/11572 and on one with this change. The
32-byte size class is where small heap-allocated closures would typically land. Before, the interesting
bin statistics are:

```
size  nrequests
----  ---------
  32     578460
  64      24340
8192     578460
```
And after:
```
size  nrequests
----  ---------
  32  (insignificant)
  64      24370
8192     579130
```

## Performance test
Build with `make USE_CLANG=1 PORTABLE=0 DEBUG_LEVEL=0 -j32 cache_bench`

Run `./cache_bench -cache_type=hyper_clock_cache -ops_per_thread=5000000`
in the before and after configurations, simultaneously:

```
Before: Complete in 33.244 s; Rough parallel ops/sec = 2406442
After:  Complete in 32.773 s; Rough parallel ops/sec = 2441019
```

Reviewed By: jowlyzhang

Differential Revision: D47375092

Pulled By: pdillinger

fbshipit-source-id: 46f0f57257ddb374290a0a38c651764ea60ba410
Author: Peter Dillinger (committed by Facebook GitHub Bot)
Commit: b1b6f87fbe
Parent: bc0db33483
Files changed:
* cache/cache_bench_tool.cc (8 changed lines)
* cache/clock_cache.cc (102 changed lines)
* cache/clock_cache.h (40 changed lines)
* cache/lru_cache_test.cc (56 changed lines)
* unreleased_history/performance_improvements/hcc_perf (1 changed line)

## cache/cache_bench_tool.cc

```
@@ -77,6 +77,10 @@ DEFINE_bool(lean, false,
             "If true, no additional computation is performed besides cache "
             "operations.");
 
+DEFINE_bool(early_exit, false,
+            "Exit before deallocating most memory. Good for malloc stats, e.g."
+            "MALLOC_CONF=\"stats_print:true\"");
+
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -593,6 +597,10 @@ class CacheBench {
       }
       thread->latency_ns_hist.Add(timer.ElapsedNanos());
     }
+    if (FLAGS_early_exit) {
+      MutexLock l(thread->shared->GetMutex());
+      exit(0);
+    }
     if (handle) {
       cache_->Release(handle);
       handle = nullptr;
```

## cache/clock_cache.cc

```
@@ -195,7 +195,6 @@ inline void CorrectNearOverflow(uint64_t old_meta,
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                             uint64_t initial_countdown, bool* already_matches) {
   assert(*already_matches == false);
-
   // Optimistically transition the slot from "empty" to
   // "under construction" (no effect on other states)
   uint64_t old_meta = h.meta.fetch_or(
@@ -486,9 +485,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   // Do we have the available occupancy? Optimistically assume we do
   // and deal with it if we don't.
   size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
-  auto revert_occupancy_fn = [&]() {
-    occupancy_.fetch_sub(1, std::memory_order_relaxed);
-  };
   // Whether we over-committed and need an eviction to make up for it
   bool need_evict_for_occupancy =
       !derived.GrowIfNeeded(old_occupancy + 1, state);
@@ -501,7 +497,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     Status s = ChargeUsageMaybeEvictStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!s.ok()) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       return s;
     }
   } else {
@@ -509,7 +506,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!success) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       if (handle == nullptr) {
         // Don't insert the entry but still return ok, as if the entry
         // inserted into cache and evicted immediately.
@@ -522,11 +520,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       }
     }
   }
-  auto revert_usage_fn = [&]() {
-    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
-    // No underflow
-    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
-  };
 
   if (!use_standalone_insert) {
     // Attempt a table insert, but abort if we find an existing entry for the
@@ -551,10 +544,14 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       return Status::OK();
     }
     // Not inserted
-    revert_occupancy_fn();
+    // Revert occupancy
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
     // Maybe fall back on standalone insert
     if (handle == nullptr) {
-      revert_usage_fn();
+      // Revert usage
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+      // No underflow
+      assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
       // As if unrefed entry immdiately evicted
       proto.FreeData(allocator_);
       return Status::OK();
@@ -680,47 +677,52 @@ bool HyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 HyperClockTable::HandleImpl* HyperClockTable::DoInsert(
     const ClockHandleBasicData& proto, uint64_t initial_countdown,
     bool keep_ref, InsertState&) {
-  size_t probe = 0;
   bool already_matches = false;
   HandleImpl* e = FindSlot(
       proto.hashed_key,
       [&](HandleImpl* h) {
-        // FIXME: simplify and handle in abort_fn below?
-        bool inserted =
-            TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches);
-        return inserted || already_matches;
+        return TryInsert(proto, *h, initial_countdown, keep_ref,
+                         &already_matches);
       },
-      [&](HandleImpl* /*h*/) { return false; },
       [&](HandleImpl* h) {
-        h->displacements.fetch_add(1, std::memory_order_relaxed);
+        if (already_matches) {
+          // Stop searching & roll back displacements
+          Rollback(proto.hashed_key, h);
+          return true;
+        } else {
+          // Keep going
+          return false;
+        }
       },
-      probe);
-  if (e == nullptr) {
-    // Occupancy check and never abort FindSlot above should generally
-    // prevent this, except it's theoretically possible for other threads
-    // to evict and replace entries in the right order to hit every slot
-    // when it is populated. Assuming random hashing, the chance of that
-    // should be no higher than pow(kStrictLoadFactor, n) for n slots.
-    // That should be infeasible for roughly n >= 256, so if this assertion
-    // fails, that suggests something is going wrong.
-    assert(GetTableSize() < 256);
-    // WART/FIXME: need to roll back every slot
-    already_matches = true;
+      [&](HandleImpl* h, bool is_last) {
+        if (is_last) {
+          // Search is ending. Roll back displacements
+          Rollback(proto.hashed_key, h);
+        } else {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        }
+      });
+  if (already_matches) {
+    // Insertion skipped
+    return nullptr;
   }
-  if (!already_matches) {
+  if (e != nullptr) {
     // Successfully inserted
-    assert(e);
     return e;
   }
-  // Roll back displacements from failed table insertion
-  Rollback(proto.hashed_key, e);
-  // Insertion skipped
+  // Else, no available slot found. Occupancy check should generally prevent
+  // this, except it's theoretically possible for other threads to evict and
+  // replace entries in the right order to hit every slot when it is populated.
+  // Assuming random hashing, the chance of that should be no higher than
+  // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
+  // roughly n >= 256, so if this assertion fails, that suggests something is
+  // going wrong.
+  assert(GetTableSize() < 256);
   return nullptr;
 }
 
 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
     const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   HandleImpl* e = FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -780,7 +782,7 @@ HyperClockTable::HandleImpl* HyperClockTable::Lookup(
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 
   return e;
 }
@@ -873,7 +875,6 @@ void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
 #endif
 
 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   (void)FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -940,7 +941,7 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 }
 
 void HyperClockTable::ConstApplyToEntriesRange(
@@ -1005,10 +1006,10 @@ void HyperClockTable::EraseUnRefEntries() {
   }
 }
 
+template <typename MatchFn, typename AbortFn, typename UpdateFn>
 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
-    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
-    std::function<bool(HandleImpl*)> abort_fn,
-    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+    const UniqueId64x2& hashed_key, MatchFn match_fn, AbortFn abort_fn,
+    UpdateFn update_fn) {
   // NOTE: upper 32 bits of hashed_key[0] is used for sharding
   //
   // We use double-hashing probing. Every probe in the sequence is a
@@ -1022,20 +1023,21 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
   // TODO: we could also reconsider linear probing, though locality benefits
   // are limited because each slot is a full cache line
   size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
-  size_t current = ModTableSize(base + probe * increment);
-  while (probe <= length_bits_mask_) {
+  size_t first = ModTableSize(base);
+  size_t current = first;
+  bool is_last;
+  do {
     HandleImpl* h = &array_[current];
     if (match_fn(h)) {
-      probe++;
       return h;
     }
     if (abort_fn(h)) {
       return nullptr;
     }
-    probe++;
-    update_fn(h);
     current = ModTableSize(current + increment);
-  }
+    is_last = current == first;
+    update_fn(h, is_last);
+  } while (!is_last);
   // We looped back.
   return nullptr;
 }
```

## cache/clock_cache.h

```
@@ -549,7 +549,12 @@ class HyperClockTable : public BaseClockTable {
   size_t GetOccupancyLimit() const { return occupancy_limit_; }
 
 #ifndef NDEBUG
-  void TEST_ReleaseN(HandleImpl* h, size_t n);
+  size_t& TEST_MutableOccupancyLimit() const {
+    return const_cast<size_t&>(occupancy_limit_);
+  }
+
+  // Release N references
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif
 
  private:  // functions
@@ -558,22 +563,18 @@ class HyperClockTable : public BaseClockTable {
     return static_cast<size_t>(x) & length_bits_mask_;
   }
 
-  // Returns the first slot in the probe sequence, starting from the given
-  // probe number, with a handle e such that match(e) is true. At every
-  // step, the function first tests whether match(e) holds. If this is false,
-  // it evaluates abort(e) to decide whether the search should be aborted,
-  // and in the affirmative returns -1. For every handle e probed except
-  // the last one, the function runs update(e).
-  // The probe parameter is modified as follows. We say a probe to a handle
-  // e is aborting if match(e) is false and abort(e) is true. Then the final
-  // value of probe is one more than the last non-aborting probe during the
-  // call. This is so that that the variable can be used to keep track of
-  // progress across consecutive calls to FindSlot.
-  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
-                              std::function<bool(HandleImpl*)> match,
-                              std::function<bool(HandleImpl*)> stop,
-                              std::function<void(HandleImpl*)> update,
-                              size_t& probe);
+  // Returns the first slot in the probe sequence with a handle e such that
+  // match_fn(e) is true. At every step, the function first tests whether
+  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
+  // whether the search should be aborted, and if so, FindSlot immediately
+  // returns nullptr. For every handle e that is not a match and not aborted,
+  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
+  // slot will be the last probed because the next would cycle back to the first
+  // slot probed. This function uses templates instead of std::function to
+  // minimize the risk of heap-allocated closures being created.
+  template <typename MatchFn, typename AbortFn, typename UpdateFn>
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, MatchFn match_fn,
+                              AbortFn abort_fn, UpdateFn update_fn);
 
   // Re-decrement all displacements in probe path starting from beginning
   // until (not including) the given handle
@@ -704,9 +705,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
     return Lookup(key, hashed_key);
   }
 
+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return table_.TEST_MutableOccupancyLimit();
+  }
   // Acquire/release N references
   void TEST_RefN(HandleImpl* handle, size_t n);
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif
 
  private:  // data
   Table table_;
```

## cache/lru_cache_test.cc

```
@@ -715,6 +715,62 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
   ASSERT_EQ(val.deleted, 1);
 }
 
+TEST_F(ClockCacheTest, ClockTableFull) {
+  // Force clock cache table to fill up (not usually allowed) in order
+  // to test full probe sequence that is theoretically possible due to
+  // parallel operations
+  NewShard(6, /*strict_capacity_limit*/ false);
+  size_t size = shard_->GetTableAddressCount();
+  ASSERT_LE(size + 3, 256);  // for using char keys
+  // Modify occupancy and capacity limits to attempt insert on full
+  shard_->TEST_MutableOccupancyLimit() = size + 100;
+  shard_->SetCapacity(size + 100);
+
+  DeleteCounter val;
+  std::vector<HandleImpl*> handles;
+  // NOTE: the three extra insertions should create standalone entries
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper,
+                             1, &handles.emplace_back(),
+                             Cache::Priority::HIGH));
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    HandleImpl* h = shard_->Lookup(TestKey(hkey), hkey);
+    if (i < size) {
+      ASSERT_NE(h, nullptr);
+      shard_->Release(h);
+    } else {
+      // Standalone entries not visible by lookup
+      ASSERT_EQ(h, nullptr);
+    }
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    ASSERT_NE(handles[i], nullptr);
+    shard_->Release(handles[i]);
+    if (i < size) {
+      // Everything still in cache
+      ASSERT_EQ(val.deleted, 0);
+    } else {
+      // Standalone entries freed on release
+      ASSERT_EQ(val.deleted, i + 1 - size);
+    }
+  }
+
+  for (size_t i = size + 3; i > 0; --i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i - 1));
+    shard_->Erase(TestKey(hkey), hkey);
+    if (i - 1 > size) {
+      ASSERT_EQ(val.deleted, 3);
+    } else {
+      ASSERT_EQ(val.deleted, 3 + size - (i - 1));
+    }
+  }
+}
+
 // This test is mostly to exercise some corner case logic, by forcing two
 // keys to have the same hash, and more
 TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
```

## unreleased_history/performance_improvements/hcc_perf

```
@@ -0,0 +1 @@
+Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
```