Pin CacheEntryStatsCollector to fix performance bug (#8385)

Summary: If the block Cache is full with strict_capacity_limit=false, then our CacheEntryStatsCollector could be immediately evicted on release, so iterating through column families with shared block cache could trigger re-scan for each CF. This change fixes that problem by pinning the CacheEntryStatsCollector from InternalStats so that it's not evicted. I had originally thought that this object could participate in LRU like everything else, but even though a re-load+re-scan only touches memory, it can be orders of magnitude more expensive than other cache misses. One service in Facebook has scans that take ~20s over 100GB block cache that is mostly 4KB entries. (The up-side of this bug and https://github.com/facebook/rocksdb/issues/8369 is that we had a natural experiment on the effect on some service metrics even with block cache scans running continuously in the background--a kind of worst case scenario. Metrics like latency were not affected enough to trigger warnings.) Other smaller fixes: 20s is already a sizable portion of 600s stats dump period, or 180s default max age to force re-scan, so added logic to ensure that (for each block cache) we don't spend more than 0.2% of our background thread time scanning it. Nevertheless, "foreground" requests for cache entry stats (calls to `db->GetMapProperty(DB::Properties::kBlockCacheEntryStats)`) are permitted to consume more CPU. Renamed field to cache_entry_stats_ to match code style. This change is intended for patching in 6.21 release. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8385 Test Plan: unit test expanded to cover new logic (detect regression), some manual testing with db_bench Reviewed By: ajkr Differential Revision: D29042759 Pulled By: pdillinger fbshipit-source-id: 236faa902397f50038c618f50fbc8cf3f277308c
4 years ago · d5a46c40e5
parent d60ae5b1c7
commit d5a46c40e5
4 changed files with 150 additions and 48 deletions
--- a/cache/cache_entry_stats.h
+++ b/cache/cache_entry_stats.h
@ -52,15 +52,29 @@ template <class Stats>
 class CacheEntryStatsCollector {
 public:
  // Gathers stats and saves results into `stats`
-  void GetStats(Stats *stats, int maximum_age_in_seconds = 180) {
+  //
+  // Maximum allowed age for a "hit" on saved results is determined by the
+  // two interval parameters. Both set to 0 forces a re-scan. For example
+  // with min_interval_seconds=300 and min_interval_factor=100, if the last
+  // scan took 10s, we would only rescan ("miss") if the age in seconds of
+  // the saved results is > max(300, 100*10).
+  // Justification: scans can vary wildly in duration, e.g. from 0.02 sec
+  // to as much as 20 seconds, so we want to be able to cap the absolute
+  // and relative frequency of scans.
+  void GetStats(Stats *stats, int min_interval_seconds,
+                int min_interval_factor) {
    // Waits for any pending reader or writer (collector)
    std::lock_guard<std::mutex> lock(mutex_);

-    // Maximum allowed age is nominally given by the parameter, but
-    // to limit the possibility of accidental repeated scans, impose
-    // a minimum TTL of 1 second.
    uint64_t max_age_micros =
-        static_cast<uint64_t>(std::max(maximum_age_in_seconds, 1)) * 1000000U;
+        static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
+
+    if (last_end_time_micros_ > last_start_time_micros_ &&
+        min_interval_factor > 0) {
+      max_age_micros = std::max(
+          max_age_micros, min_interval_factor * (last_end_time_micros_ -
+                                                 last_start_time_micros_));
+    }

    uint64_t start_time_micros = clock_->NowMicros();
    if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
@ -68,6 +82,8 @@ class CacheEntryStatsCollector {
      saved_stats_.BeginCollection(cache_, clock_, start_time_micros);

      cache_->ApplyToAllEntries(saved_stats_.GetEntryCallback(), {});
+      TEST_SYNC_POINT_CALLBACK(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);

      uint64_t end_time_micros = clock_->NowMicros();
      last_end_time_micros_ = end_time_micros;
@ -109,7 +125,8 @@ class CacheEntryStatsCollector {
        // usage to go flaky. Fix the problem somehow so we can use an
        // accurate charge.
        size_t charge = 0;
-        Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h);
+        Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
+                                 Cache::Priority::HIGH);
        if (!s.ok()) {
          assert(h == nullptr);
          return s;
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@ -152,12 +152,13 @@ class DBBlockCacheTest : public DBTestBase {
  }

 #ifndef ROCKSDB_LITE
-  const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCounts() {
+  const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCountsBg() {
    // Verify in cache entry role stats
    ColumnFamilyHandleImpl* cfh =
        static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
    InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats();
-    return internal_stats_ptr->TEST_GetCacheEntryRoleStats().entry_counts;
+    return internal_stats_ptr->TEST_GetCacheEntryRoleStats(/*foreground=*/false)
+        .entry_counts;
  }
 #endif  // ROCKSDB_LITE
 };
@ -935,6 +936,8 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
      options.max_open_files = 13;
      options.table_cache_numshardbits = 0;
+      // If this wakes up, it could interfere with test
+      options.stats_dump_period_sec = 0;

      BlockBasedTableOptions table_options;
      table_options.block_cache = cache;
@ -970,7 +973,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
      std::array<size_t, kNumCacheEntryRoles> expected{};
      // For CacheEntryStatsCollector
      expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
-      EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());

      std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;

@ -981,13 +984,13 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
        expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
      }
      // Within some time window, we will get cached entry stats
-      EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
      // Not enough to force a miss
-      env_->MockSleepForSeconds(10);
-      EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
+      env_->MockSleepForSeconds(45);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
      // Enough to force a miss
-      env_->MockSleepForSeconds(1000);
-      EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+      env_->MockSleepForSeconds(601);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());

      // Now access index and data block
      ASSERT_EQ("value", Get("foo"));
@ -997,8 +1000,22 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
        expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
      }
      expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
-      env_->MockSleepForSeconds(1000);
-      EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(601);
+      // But inject a simulated long scan so that we need a longer
+      // interval to force a miss next time.
+      SyncPoint::GetInstance()->SetCallBack(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+          [this](void*) {
+            // To spend no more than 0.2% of time scanning, we would need
+            // interval of at least 10000s
+            env_->MockSleepForSeconds(20);
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      prev_expected = expected;
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();

      // The same for other file
      ASSERT_EQ("value", Get("zfoo"));
@ -1008,8 +1025,14 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
        expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
      }
      expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
-      env_->MockSleepForSeconds(1000);
-      EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+      // Because of the simulated long scan, this is not enough to force
+      // a miss
+      env_->MockSleepForSeconds(601);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // But this is enough
+      env_->MockSleepForSeconds(10000);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      prev_expected = expected;

      // Also check the GetProperty interface
      std::map<std::string, std::string> values;
@ -1025,8 +1048,54 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
      EXPECT_EQ(
          ToString(expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)]),
          values["count.filter-block"]);
+      EXPECT_EQ(
+          ToString(
+              prev_expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+          values["count.write-buffer"]);
      EXPECT_EQ(ToString(expected[static_cast<size_t>(CacheEntryRole::kMisc)]),
                values["count.misc"]);
+
+      // Add one for kWriteBuffer
+      {
+        WriteBufferManager wbm(size_t{1} << 20, cache);
+        wbm.ReserveMem(1024);
+        expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
+        // Now we check that the GetProperty interface is more agressive about
+        // re-scanning stats, but not totally aggressive.
+        // Within some time window, we will get cached entry stats
+        env_->MockSleepForSeconds(1);
+        EXPECT_EQ(ToString(prev_expected[static_cast<size_t>(
+                      CacheEntryRole::kWriteBuffer)]),
+                  values["count.write-buffer"]);
+        // Not enough for a "background" miss but enough for a "foreground" miss
+        env_->MockSleepForSeconds(45);
+
+        ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+                                        &values));
+        EXPECT_EQ(
+            ToString(
+                expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+            values["count.write-buffer"]);
+      }
+      prev_expected = expected;
+
+      // With collector pinned in cache, we should be able to hit
+      // even if the cache is full
+      ClearCache(cache.get());
+      Cache::Handle* h = nullptr;
+      ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
+                              GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+                              &h, Cache::Priority::HIGH));
+      ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
+      expected = {};
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
+      // Still able to hit on saved stats
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(1000);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      cache->Release(h);
    }
    EXPECT_GE(iterations_tested, 1);
  }
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@ -528,26 +528,37 @@ InternalStats::InternalStats(int num_levels, SystemClock* clock,
      cfd_(cfd),
      started_at_(clock->NowMicros()) {}

-Status InternalStats::CollectCacheEntryStats() {
-  using Collector = CacheEntryStatsCollector<CacheEntryRoleStats>;
-  Cache* block_cache;
-  bool ok = HandleBlockCacheStat(&block_cache);
-  if (ok) {
-    // Extract or create stats collector.
-    std::shared_ptr<Collector> collector;
-    Status s = Collector::GetShared(block_cache, clock_, &collector);
-    if (s.ok()) {
-      // TODO: use a max age like stats_dump_period_sec / 2, but it's
-      // difficult to access that setting from here with just cfd_
-      collector->GetStats(&cache_entry_stats);
+Status InternalStats::CollectCacheEntryStats(bool foreground) {
+  // Lazy initialize/reference the collector. It is pinned in cache (through
+  // a shared_ptr) so that it does not get immediately ejected from a full
+  // cache, which would force a re-scan on the next GetStats.
+  if (!cache_entry_stats_collector_) {
+    Cache* block_cache;
+    bool ok = HandleBlockCacheStat(&block_cache);
+    if (ok) {
+      // Extract or create stats collector.
+      Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
+          block_cache, clock_, &cache_entry_stats_collector_);
+      if (!s.ok()) {
+        // Block cache likely under pressure. Scanning could make it worse,
+        // so skip.
+        return s;
+      }
    } else {
-      // Block cache likely under pressure. Scanning could make it worse,
-      // so skip.
+      return Status::NotFound("block cache not configured");
    }
-    return s;
-  } else {
-    return Status::NotFound("block cache not configured");
  }
+  assert(cache_entry_stats_collector_);
+
+  // For "background" collections, strictly cap the collection time by
+  // expanding effective cache TTL. For foreground, be more aggressive about
+  // getting latest data.
+  int min_interval_seconds = foreground ? 10 : 180;
+  // 1/500 = max of 0.2% of one CPU thread
+  int min_interval_factor = foreground ? 10 : 500;
+  cache_entry_stats_collector_->GetStats(
+      &cache_entry_stats_, min_interval_seconds, min_interval_factor);
+  return Status::OK();
 }

 std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
@ -638,21 +649,21 @@ void InternalStats::CacheEntryRoleStats::ToMap(

 bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
                                               Slice /*suffix*/) {
-  Status s = CollectCacheEntryStats();
+  Status s = CollectCacheEntryStats(/*foreground*/ true);
  if (!s.ok()) {
    return false;
  }
-  *value = cache_entry_stats.ToString(clock_);
+  *value = cache_entry_stats_.ToString(clock_);
  return true;
 }

 bool InternalStats::HandleBlockCacheEntryStatsMap(
    std::map<std::string, std::string>* values, Slice /*suffix*/) {
-  Status s = CollectCacheEntryStats();
+  Status s = CollectCacheEntryStats(/*foreground*/ true);
  if (!s.ok()) {
    return false;
  }
-  cache_entry_stats.ToMap(values, clock_);
+  cache_entry_stats_.ToMap(values, clock_);
  return true;
 }

@ -1608,9 +1619,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
  cf_stats_snapshot_.comp_stats = compaction_stats_sum;
  cf_stats_snapshot_.stall_count = total_stall_count;

-  Status s = CollectCacheEntryStats();
+  // Always treat CFStats context as "background"
+  Status s = CollectCacheEntryStats(/*foreground=*/false);
  if (s.ok()) {
-    value->append(cache_entry_stats.ToString(clock_));
+    value->append(cache_entry_stats_.ToString(clock_));
  } else {
    value->append("Block cache: ");
    value->append(s.ToString());
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@ -23,6 +23,8 @@ class ColumnFamilyData;

 namespace ROCKSDB_NAMESPACE {

+template <class Stats>
+class CacheEntryStatsCollector;
 class DBImpl;
 class MemTableList;

@ -390,7 +392,7 @@ class InternalStats {
      cf_stats_count_[i] = 0;
      cf_stats_value_[i] = 0;
    }
-    cache_entry_stats.Clear();
+    cache_entry_stats_.Clear();
    for (auto& comp_stat : comp_stats_) {
      comp_stat.Clear();
    }
@ -463,13 +465,13 @@ class InternalStats {
    return comp_stats_;
  }

-  const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats() {
-    Status s = CollectCacheEntryStats();
+  const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats(bool foreground) {
+    Status s = CollectCacheEntryStats(foreground);
    if (!s.ok()) {
      assert(false);
-      cache_entry_stats.Clear();
+      cache_entry_stats_.Clear();
    }
-    return cache_entry_stats;
+    return cache_entry_stats_;
  }

  // Store a mapping from the user-facing DB::Properties string to our
@ -492,14 +494,16 @@ class InternalStats {

  bool HandleBlockCacheStat(Cache** block_cache);

-  Status CollectCacheEntryStats();
+  Status CollectCacheEntryStats(bool foreground);

  // Per-DB stats
  std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
  // Per-ColumnFamily stats
  uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
  uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
-  CacheEntryRoleStats cache_entry_stats;
+  CacheEntryRoleStats cache_entry_stats_;
+  std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+      cache_entry_stats_collector_;
  // Per-ColumnFamily/level compaction stats
  std::vector<CompactionStats> comp_stats_;
  std::vector<CompactionStats> comp_stats_by_pri_;