From b205c6d0296096cc41117037ecbbb09eaa6237b8 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Thu, 6 Oct 2022 14:54:21 -0700
Subject: [PATCH] Fix bug in HyperClockCache ApplyToEntries; cleanup (#10768)

Summary:
We have seen some rare crash test failures in HyperClockCache, and the cause
may well be a bug fixed in this change, in
ClockHandleTable::ConstApplyToEntriesRange: it wasn't accounting for the fact
that incrementing the acquire counter can be ineffective, due to parallel
updates of the entry's metadata. (When incrementing the acquire counter is
ineffective, it is incorrect to then decrement it.)

This change also includes some minor clean-up in HyperClockCache, and adds
stats_dump_period_sec, with a much lower period than the default, to the
crash test. The periodic stats dump should be the primary caller of
ApplyToEntries, in collecting cache entry stats.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10768

Test Plan: haven't been able to reproduce the failure, but we should be in a
better state now (bug fix and improved crash test)

Reviewed By: anand1976

Differential Revision: D40034747

Pulled By: anand1976

fbshipit-source-id: a06fcefe146e17ee35001984445cedcf3b63eb68
---
 HISTORY.md                            |  1 +
 cache/clock_cache.cc                  | 67 ++++++++++++++++-----------
 db_stress_tool/db_stress_common.h     |  1 +
 db_stress_tool/db_stress_gflags.cc    |  4 ++
 db_stress_tool/db_stress_test_base.cc |  2 +
 tools/db_crashtest.py                 |  2 +
 6 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 4a639a60a..4122a82c3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,6 +11,7 @@
 * Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
 * Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinniung (#10770).
 * Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
+* Fixed a memory safety bug in experimental HyperClockCache (#10768)
 
 ### Performance Improvements
 * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.
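
For context on the Summary above: each ClockHandle packs an acquire counter, a
release counter, and a few state bits into one atomic 64-bit meta word, and the
refcount is the modular difference of the two counters. The sketch below
illustrates that arithmetic and why a "lost" acquire increment must not be
paired with a later decrement. The shift, mask, and state constants here are
illustrative stand-ins, not the real values from cache/clock_cache.h.

    // Minimal sketch of the refcount arithmetic (stand-in constants; the real
    // shifts, masks, and state encoding live in cache/clock_cache.h).
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    constexpr int kCounterNumBits = 30;
    constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
    constexpr int kAcquireCounterShift = 0;
    constexpr int kReleaseCounterShift = kCounterNumBits;
    constexpr int kStateShift = 2 * kCounterNumBits;
    constexpr uint64_t kAcquireIncrement = uint64_t{1} << kAcquireCounterShift;
    constexpr uint64_t kReleaseIncrement = uint64_t{1} << kReleaseCounterShift;

    // Same arithmetic as the GetRefcount() helper introduced by this patch:
    // (acquire counter - release counter) mod 2^kCounterNumBits.
    uint64_t GetRefcount(uint64_t meta) {
      return ((meta >> kAcquireCounterShift) - (meta >> kReleaseCounterShift)) &
             kCounterMask;
    }

    int main() {
      // A Shareable entry (nonzero state bits) with 2 acquires and 1 release:
      // exactly one outstanding reference.
      uint64_t meta = (uint64_t{1} << kStateShift) | (2 * kAcquireIncrement) |
                      kReleaseIncrement;
      assert(GetRefcount(meta) == 1);

      // The hazard: if the optimistic acquire-increment lands while another
      // thread owns the slot (state not Shareable), that thread later rewrites
      // meta wholesale and the increment is simply lost...
      uint64_t rewritten_by_owner = uint64_t{1} << kStateShift;  // fresh counters
      // ...so a blind release-decrement afterwards underflows the refcount.
      uint64_t bogus = rewritten_by_owner + kReleaseIncrement;
      std::printf("refcount after lost increment + blind decrement: %llu\n",
                  static_cast<unsigned long long>(GetRefcount(bogus)));
      return 0;
    }
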
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 0b07542c4..58a7f94bb 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -23,6 +23,12 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace hyper_clock_cache {
 
+inline uint64_t GetRefcount(uint64_t meta) {
+  return ((meta >> ClockHandle::kAcquireCounterShift) -
+          (meta >> ClockHandle::kReleaseCounterShift)) &
+         ClockHandle::kCounterMask;
+}
+
 static_assert(sizeof(ClockHandle) == 64U,
               "Expecting size / alignment with common cache line size");
 
@@ -49,6 +55,7 @@ ClockHandleTable::~ClockHandleTable() {
         break;
       case ClockHandle::kStateInvisible:  // rare but possible
       case ClockHandle::kStateVisible:
+        assert(GetRefcount(h.meta) == 0);
         h.FreeData();
 #ifndef NDEBUG
         Rollback(h.hash, &h);
@@ -562,10 +569,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
   }
   // Take ownership if no refs
   do {
-    uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
-                         (old_meta >> ClockHandle::kReleaseCounterShift)) &
-                        ClockHandle::kCounterMask;
-    if (refcount != 0) {
+    if (GetRefcount(old_meta) != 0) {
      // Not last ref at some point in time during this Release call
      // Correct for possible (but rare) overflow
      CorrectNearOverflow(old_meta, h->meta);
@@ -622,6 +626,8 @@ void ClockHandleTable::Ref(ClockHandle& h) {
 
   assert((old_meta >> ClockHandle::kStateShift) &
          ClockHandle::kStateShareableBit);
+  // Must have already had a reference
+  assert(GetRefcount(old_meta) > 0);
   (void)old_meta;
 }
 
@@ -671,10 +677,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
         old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
                       << ClockHandle::kStateShift);
         for (;;) {
-          uint64_t refcount =
-              ((old_meta >> ClockHandle::kAcquireCounterShift) -
-               (old_meta >> ClockHandle::kReleaseCounterShift)) &
-              ClockHandle::kCounterMask;
+          uint64_t refcount = GetRefcount(old_meta);
           assert(refcount > 0);
           if (refcount > 1) {
             // Not last ref at some point in time during this Erase call
@@ -683,8 +686,10 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
                               std::memory_order_release);
             break;
           } else if (h->meta.compare_exchange_weak(
-                         old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift)) {
+                         old_meta,
+                         uint64_t{ClockHandle::kStateConstruction}
+                             << ClockHandle::kStateShift,
+                         std::memory_order_acq_rel)) {
             // Took ownership
             assert(hash == h->hash);
             // TODO? Delay freeing?
@@ -740,20 +745,32 @@ void ClockHandleTable::ConstApplyToEntriesRange(
   for (uint32_t i = index_begin; i < index_end; i++) {
     ClockHandle& h = array_[i];
 
+    // Note: to avoid using compare_exchange, we have to be extra careful.
     uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
     // Check if it's an entry visible to lookups
     if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-      // Increment acquire counter
+      // Increment acquire counter. Note: it's possible that the entry has
+      // completely changed since we loaded old_meta, but incrementing acquire
+      // count is always safe. (Similar to optimistic Lookup here.)
       old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
                                   std::memory_order_acquire);
-      // Double-check
-      if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-        func(h);
+      // Check whether we actually acquired a reference.
+      if ((old_meta >> ClockHandle::kStateShift) &
+          ClockHandle::kStateShareableBit) {
+        // Apply func if appropriate
+        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+          func(h);
+        }
+        // Pretend we never took the reference
+        h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                         std::memory_order_release);
+        // No net change, so don't need to check for overflow
+      } else {
+        // For other states, incrementing the acquire counter has no effect
+        // so we don't need to undo it. Furthermore, we cannot safely undo
+        // it because we did not acquire a read reference to lock the
+        // entry in a Shareable state.
       }
-      // Pretend we never took the reference
-      h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
-                       std::memory_order_release);
-      // No net change, so don't need to check for overflow
     }
   }
 }
@@ -763,12 +780,9 @@ void ClockHandleTable::EraseUnRefEntries() {
     ClockHandle& h = array_[i];
 
     uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
-    uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
-                         (old_meta >> ClockHandle::kReleaseCounterShift)) &
-                        ClockHandle::kCounterMask;
     if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                     << ClockHandle::kStateShift) &&
-        refcount == 0 &&
+        GetRefcount(old_meta) == 0 &&
         h.meta.compare_exchange_strong(old_meta,
                                        uint64_t{ClockHandle::kStateConstruction}
                                            << ClockHandle::kStateShift,
@@ -877,13 +891,12 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
       // Only clock update entries with no outstanding refs
      continue;
     }
-    if (!(meta >> ClockHandle::kStateShift &
+    if (!((meta >> ClockHandle::kStateShift) &
          ClockHandle::kStateShareableBit)) {
      // Only clock update Shareable entries
      continue;
    }
-    // ModTableSize(old_clock_pointer + i));
-    if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible &&
+    if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
        acquire_count > 0) {
      // Decrement clock
      uint64_t new_count = std::min(acquire_count - 1,
@@ -1101,9 +1114,7 @@ size_t ClockCacheShard::GetPinnedUsage() const {
   table_.ConstApplyToEntriesRange(
       [&table_pinned_usage, charge_metadata](const ClockHandle& h) {
         uint64_t meta = h.meta.load(std::memory_order_relaxed);
-        uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) -
-                             (meta >> ClockHandle::kReleaseCounterShift)) &
-                            ClockHandle::kCounterMask;
+        uint64_t refcount = GetRefcount(meta);
         // Holding one ref for ConstApplyToEntriesRange
         assert(refcount > 0);
         if (refcount > 1) {
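
The ConstApplyToEntriesRange hunk above is the heart of the fix: take a
reference optimistically, but only call func and give the reference back when
the returned old state was Shareable, i.e. when the increment actually took
hold. Below is a condensed restatement of that control flow against a bare
std::atomic<uint64_t>, using the same illustrative stand-in constants as the
earlier sketch rather than the real ClockHandle encoding.

    // Condensed restatement of the fixed scan logic (stand-in constants).
    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <functional>

    constexpr int kCounterNumBits = 30;
    constexpr int kStateShift = 2 * kCounterNumBits;
    constexpr uint64_t kStateVisibleBit = 1;    // stand-in state bit
    constexpr uint64_t kStateShareableBit = 2;  // stand-in state bit
    constexpr uint64_t kAcquireIncrement = 1;

    void ApplyIfVisible(std::atomic<uint64_t>& meta, uint64_t check_state_mask,
                        const std::function<void()>& func) {
      uint64_t old_meta = meta.load(std::memory_order_relaxed);
      if (!((old_meta >> kStateShift) & check_state_mask)) {
        return;  // Racy pre-check, as in the original: not an entry of interest
      }
      // Optimistically take a reference; safe even if the slot changed meanwhile.
      old_meta = meta.fetch_add(kAcquireIncrement, std::memory_order_acquire);
      if ((old_meta >> kStateShift) & kStateShareableBit) {
        // The increment took hold, so we really hold a reference and the slot
        // cannot be freed or repurposed under us.
        if ((old_meta >> kStateShift) & check_state_mask) {
          func();
        }
        // Only in this branch is it correct to give the reference back.
        meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
      }
      // else: the increment was ineffective (another thread owns the slot and
      // will rewrite meta); decrementing here is exactly the bug removed above.
    }

    int main() {
      // A Visible+Shareable entry with zero references.
      std::atomic<uint64_t> meta{(kStateShareableBit | kStateVisibleBit)
                                 << kStateShift};
      ApplyIfVisible(meta, kStateShareableBit | kStateVisibleBit,
                     [] { std::printf("applied func to visible entry\n"); });
      return 0;
    }
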
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 53e1db42d..e849b770e 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -174,6 +174,7 @@ DECLARE_bool(mock_direct_io);
 DECLARE_bool(statistics);
 DECLARE_bool(sync);
 DECLARE_bool(use_fsync);
+DECLARE_uint64(stats_dump_period_sec);
 DECLARE_uint64(bytes_per_sync);
 DECLARE_uint64(wal_bytes_per_sync);
 DECLARE_int32(kill_random_test);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 067600ee9..abc07beec 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1063,4 +1063,8 @@ DEFINE_bool(
     "be preserved similarly under `FLAGS_expected_values_dir/unverified` when "
     "`--expected_values_dir` is nonempty.");
 
+DEFINE_uint64(stats_dump_period_sec,
+              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
+              "Gap between printing stats to log in seconds");
+
 #endif  // GFLAGS
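
The new stats_dump_period_sec flag (declared and defined above, wired into
Options below) matters because the periodic stats dump collects cache entry
stats and is therefore expected to be the primary caller of ApplyToEntries. As
a rough usage sketch outside the stress test, and not part of this patch, this
is how the same option can be set on an ordinary DB that uses HyperClockCache
as its block cache; the path, capacity, charge, and period values are
arbitrary illustration choices.

    // Usage sketch: dump stats every 10 seconds on a DB whose block cache is
    // the experimental HyperClockCache (values below are arbitrary).
    #include <cassert>
    #include <memory>

    #include "rocksdb/cache.h"
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/statistics.h"
    #include "rocksdb/table.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.statistics = rocksdb::CreateDBStatistics();
      // Low period so the periodic stats dump (and hence cache entry stat
      // collection via ApplyToEntries) runs frequently.
      options.stats_dump_period_sec = 10;

      rocksdb::HyperClockCacheOptions cache_opts(
          64 << 20 /* capacity */, 32 * 1024 /* estimated_entry_charge */);
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = cache_opts.MakeSharedCache();
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/hcc_stats_demo", &db);
      assert(s.ok());
      // ... workload ...
      delete db;
      return 0;
    }
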
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 020380dc2..d5af0c30a 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3095,6 +3095,8 @@ void InitializeOptionsFromFlags(
   options.experimental_mempurge_threshold =
       FLAGS_experimental_mempurge_threshold;
   options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
+  options.stats_dump_period_sec =
+      static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
   options.ttl = FLAGS_compaction_ttl;
   options.enable_pipelined_write = FLAGS_enable_pipelined_write;
   options.enable_write_thread_adaptive_yield =
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f194a0ddb..f803b060b 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -137,6 +137,8 @@ default_params = {
     "index_block_restart_interval": lambda: random.choice(range(1, 16)),
     "use_multiget": lambda: random.randint(0, 1),
     "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
+    # 0 = never (used by some), 10 = often (for threading bugs), 600 = default
+    "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
     "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
     # Test small max_manifest_file_size in a smaller chance, as most of the
     # time we wnat manifest history to be preserved to help debug