Fix bug in HyperClockCache ApplyToEntries; cleanup (#10768)

Summary:
We have seen some rare crash test failures in HyperClockCache, and the source could well be a bug fixed in this change, in ClockHandleTable::ConstApplyToEntriesRange: it was not accounting for the fact that incrementing the acquire counter can be ineffective due to parallel updates, and when the increment is ineffective, it is incorrect to then decrement it.
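The essence of the fix, shown below as a minimal standalone sketch (with simplified, assumed constants and names rather than the real ClockHandle layout), is that the acquire-counter increment only counts as a taken reference when the pre-increment state had the Shareable bit set; only in that case may it be undone with a decrement.

```cpp
// Minimal sketch of the corrected scan pattern; NOT the actual RocksDB
// code. Bit layout, constant values, and names are assumptions made for
// illustration only.
#include <atomic>
#include <cstdint>
#include <functional>

namespace sketch {

constexpr int kStateShift = 61;                 // assumed: state in top bits
constexpr uint64_t kStateShareableBit = 0b010;  // assumed state bit
constexpr uint64_t kAcquireIncrement = 1;       // assumed counter step

struct Handle {
  std::atomic<uint64_t> meta{0};
};

// Visit one handle the way the fixed ConstApplyToEntriesRange does.
inline void ScanOne(Handle& h, uint64_t check_state_mask,
                    const std::function<void(const Handle&)>& func) {
  uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
  if ((old_meta >> kStateShift) & check_state_mask) {
    // Optimistically bump the acquire counter (always safe to do).
    old_meta = h.meta.fetch_add(kAcquireIncrement, std::memory_order_acquire);
    if ((old_meta >> kStateShift) & kStateShareableBit) {
      // The increment was effective: we hold a reference, so the entry is
      // locked in a Shareable state and can be read.
      if ((old_meta >> kStateShift) & check_state_mask) {
        func(h);
      }
      // Give the reference back.
      h.meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
    }
    // else: the increment had no effect, so there is nothing to undo, and
    // without a reference locking the entry it is not safe to decrement
    // (this unconditional decrement was the bug).
  }
}

}  // namespace sketch
```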

This change also includes some minor clean-up in HyperClockCache, and adds stats_dump_period_sec with a much lower period to the crash test. The periodic stats dump should be the primary caller of ApplyToEntries, through its collection of cache entry stats.
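As a rough illustration (stats_dump_period_sec and its 600-second default are real RocksDB options, as seen in the diffs below; the helper function itself is only a sketch), lowering the period in Options looks like this:

```cpp
// Sketch only: build Options with a much lower stats dump period so the
// periodic stats thread, and through it cache ApplyToEntries, runs far
// more often than with the 600-second default.
#include "rocksdb/options.h"

rocksdb::Options MakeOptionsWithFrequentStatsDump() {
  rocksdb::Options options;
  options.stats_dump_period_sec = 10;  // default is 600
  return options;
}
```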

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10768

Test Plan: haven't been able to reproduce the failure, but should be in a better state (bug fix and improved crash test)

Reviewed By: anand1976

Differential Revision: D40034747

Pulled By: anand1976

fbshipit-source-id: a06fcefe146e17ee35001984445cedcf3b63eb68
Branch: main
Author: Peter Dillinger, committed by Facebook GitHub Bot
Parent: f461e064ed
Commit: b205c6d029
Files changed:
1. HISTORY.md (1 line changed)
2. cache/clock_cache.cc (67 lines changed)
3. db_stress_tool/db_stress_common.h (1 line changed)
4. db_stress_tool/db_stress_gflags.cc (4 lines changed)
5. db_stress_tool/db_stress_test_base.cc (2 lines changed)
6. tools/db_crashtest.py (2 lines changed)

@ -11,6 +11,7 @@
* Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
* Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
* Fixed a memory safety bug in experimental HyperClockCache (#10768).
### Performance Improvements
* Try to align compaction output file boundaries to the next level's file boundaries, which can reduce compaction load by more than 10% for the default level compaction. The feature is enabled by default; to disable it, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.

@ -23,6 +23,12 @@ namespace ROCKSDB_NAMESPACE {
namespace hyper_clock_cache {
inline uint64_t GetRefcount(uint64_t meta) {
return ((meta >> ClockHandle::kAcquireCounterShift) -
(meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
}
static_assert(sizeof(ClockHandle) == 64U,
"Expecting size / alignment with common cache line size");
@ -49,6 +55,7 @@ ClockHandleTable::~ClockHandleTable() {
break;
case ClockHandle::kStateInvisible: // rare but possible
case ClockHandle::kStateVisible:
assert(GetRefcount(h.meta) == 0);
h.FreeData();
#ifndef NDEBUG
Rollback(h.hash, &h);
@ -562,10 +569,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
}
// Take ownership if no refs
do {
uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
if (refcount != 0) {
if (GetRefcount(old_meta) != 0) {
// Not last ref at some point in time during this Release call
// Correct for possible (but rare) overflow
CorrectNearOverflow(old_meta, h->meta);
@ -622,6 +626,8 @@ void ClockHandleTable::Ref(ClockHandle& h) {
assert((old_meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit);
// Must have already had a reference
assert(GetRefcount(old_meta) > 0);
(void)old_meta;
}
@ -671,10 +677,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
<< ClockHandle::kStateShift);
for (;;) {
uint64_t refcount =
((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
uint64_t refcount = GetRefcount(old_meta);
assert(refcount > 0);
if (refcount > 1) {
// Not last ref at some point in time during this Erase call
@ -683,8 +686,10 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
std::memory_order_release);
break;
} else if (h->meta.compare_exchange_weak(
old_meta, uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift)) {
old_meta,
uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift,
std::memory_order_acq_rel)) {
// Took ownership
assert(hash == h->hash);
// TODO? Delay freeing?
@ -740,20 +745,32 @@ void ClockHandleTable::ConstApplyToEntriesRange(
for (uint32_t i = index_begin; i < index_end; i++) {
ClockHandle& h = array_[i];
// Note: to avoid using compare_exchange, we have to be extra careful.
uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
// Check if it's an entry visible to lookups
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
// Increment acquire counter
// Increment acquire counter. Note: it's possible that the entry has
// completely changed since we loaded old_meta, but incrementing acquire
// count is always safe. (Similar to optimistic Lookup here.)
old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
std::memory_order_acquire);
// Double-check
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
func(h);
// Check whether we actually acquired a reference.
if ((old_meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit) {
// Apply func if appropriate
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
func(h);
}
// Pretend we never took the reference
h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
std::memory_order_release);
// No net change, so don't need to check for overflow
} else {
// For other states, incrementing the acquire counter has no effect
// so we don't need to undo it. Furthermore, we cannot safely undo
// it because we did not acquire a read reference to lock the
// entry in a Shareable state.
}
// Pretend we never took the reference
h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
std::memory_order_release);
// No net change, so don't need to check for overflow
}
}
}
@ -763,12 +780,9 @@ void ClockHandleTable::EraseUnRefEntries() {
ClockHandle& h = array_[i];
uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
<< ClockHandle::kStateShift) &&
refcount == 0 &&
GetRefcount(old_meta) == 0 &&
h.meta.compare_exchange_strong(old_meta,
uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift,
@ -877,13 +891,12 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
// Only clock update entries with no outstanding refs
continue;
}
if (!(meta >> ClockHandle::kStateShift &
if (!((meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit)) {
// Only clock update Shareable entries
continue;
}
// ModTableSize(old_clock_pointer + i));
if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible &&
if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
acquire_count > 0) {
// Decrement clock
uint64_t new_count = std::min(acquire_count - 1,
@ -1101,9 +1114,7 @@ size_t ClockCacheShard::GetPinnedUsage() const {
table_.ConstApplyToEntriesRange(
[&table_pinned_usage, charge_metadata](const ClockHandle& h) {
uint64_t meta = h.meta.load(std::memory_order_relaxed);
uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) -
(meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
uint64_t refcount = GetRefcount(meta);
// Holding one ref for ConstApplyToEntriesRange
assert(refcount > 0);
if (refcount > 1) {

@ -174,6 +174,7 @@ DECLARE_bool(mock_direct_io);
DECLARE_bool(statistics);
DECLARE_bool(sync);
DECLARE_bool(use_fsync);
DECLARE_uint64(stats_dump_period_sec);
DECLARE_uint64(bytes_per_sync);
DECLARE_uint64(wal_bytes_per_sync);
DECLARE_int32(kill_random_test);

@ -1063,4 +1063,8 @@ DEFINE_bool(
"be preserved similarly under `FLAGS_expected_values_dir/unverified` when "
"`--expected_values_dir` is nonempty.");
DEFINE_uint64(stats_dump_period_sec,
ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
"Gap between printing stats to log in seconds");
#endif // GFLAGS

@ -3095,6 +3095,8 @@ void InitializeOptionsFromFlags(
options.experimental_mempurge_threshold =
FLAGS_experimental_mempurge_threshold;
options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
options.stats_dump_period_sec =
static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
options.ttl = FLAGS_compaction_ttl;
options.enable_pipelined_write = FLAGS_enable_pipelined_write;
options.enable_write_thread_adaptive_yield =

@ -137,6 +137,8 @@ default_params = {
"index_block_restart_interval": lambda: random.choice(range(1, 16)),
"use_multiget": lambda: random.randint(0, 1),
"periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
# 0 = never (used by some), 10 = often (for threading bugs), 600 = default
"stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
"compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
# Test small max_manifest_file_size with a small probability, as most of the
# time we want manifest history to be preserved to help debug
