From d5a46c40e58a62b94d46c6739bcd44d3d459ef44 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Mon, 14 Jun 2021 08:14:21 -0700
Subject: [PATCH] Pin CacheEntryStatsCollector to fix performance bug (#8385)

Summary:
If the block Cache is full with strict_capacity_limit=false, then our
CacheEntryStatsCollector could be immediately evicted on release, so
iterating through column families with a shared block cache could trigger
a re-scan for each CF. This change fixes that problem by pinning the
CacheEntryStatsCollector from InternalStats so that it's not evicted.

I had originally thought that this object could participate in LRU like
everything else, but even though a re-load+re-scan only touches memory, it
can be orders of magnitude more expensive than other cache misses. One
service at Facebook has scans that take ~20s over a 100GB block cache that
is mostly 4KB entries. (The up-side of this bug and
https://github.com/facebook/rocksdb/issues/8369 is that we had a natural
experiment on the effect on some service metrics even with block cache
scans running continuously in the background--a kind of worst case
scenario. Metrics like latency were not affected enough to trigger
warnings.)

Other smaller fixes:

* 20s is already a sizable portion of the 600s stats dump period, or the
  180s default max age to force a re-scan, so added logic to ensure that
  (for each block cache) we don't spend more than 0.2% of our background
  thread time scanning it. Nevertheless, "foreground" requests for cache
  entry stats (calls to
  `db->GetMapProperty(DB::Properties::kBlockCacheEntryStats)`) are
  permitted to consume more CPU.

* Renamed the field to cache_entry_stats_ to match code style.

This change is intended for patching into the 6.21 release.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8385

Test Plan: unit test expanded to cover new logic (detect regression),
some manual testing with db_bench

Reviewed By: ajkr

Differential Revision: D29042759

Pulled By: pdillinger

fbshipit-source-id: 236faa902397f50038c618f50fbc8cf3f277308c
---
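
Note on the throttling math: the two new GetStats() parameters bound both
the absolute and the relative frequency of scans. Below is a minimal
standalone sketch of that arithmetic (illustration only, not code from
this patch; MaxAgeMicros and the sample values in main() are made up,
but the 180/500 and 10/10 defaults come from CollectCacheEntryStats
further down):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Maximum age (in micros) of saved stats that still counts as a
    // "hit": at least min_interval_seconds old, and at least
    // min_interval_factor times the duration of the last scan.
    uint64_t MaxAgeMicros(int min_interval_seconds, int min_interval_factor,
                          uint64_t last_start_micros,
                          uint64_t last_end_micros) {
      uint64_t max_age_micros =
          static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
      if (last_end_micros > last_start_micros && min_interval_factor > 0) {
        max_age_micros = std::max(
            max_age_micros,
            min_interval_factor * (last_end_micros - last_start_micros));
      }
      return max_age_micros;
    }

    int main() {
      uint64_t start = 0;              // last scan began here...
      uint64_t end = 20 * 1000000ULL;  // ...and took 20 seconds
      // Background caller (180s, factor 500): a 20s scan forces a 10000s
      // interval, i.e. at most 0.2% of one thread spent scanning.
      std::cout << MaxAgeMicros(180, 500, start, end) / 1000000ULL << "s\n";
      // Foreground caller (10s, factor 10): may re-scan after only 200s.
      std::cout << MaxAgeMicros(10, 10, start, end) / 1000000ULL << "s\n";
      return 0;
    }

With those defaults, the simulated 20-second scan injected by the updated
unit test is exactly what pushes the background re-scan interval to
10000s.
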
 cache/cache_entry_stats.h | 29 +++++++++---
 db/db_block_cache_test.cc | 93 ++++++++++++++++++++++++++++++++++-----
 db/internal_stats.cc      | 58 ++++++++++++++----------
 db/internal_stats.h       | 18 +++++---
 4 files changed, 150 insertions(+), 48 deletions(-)

diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h
index f602b448e..ecd387f85 100644
--- a/cache/cache_entry_stats.h
+++ b/cache/cache_entry_stats.h
@@ -52,15 +52,29 @@ template <class Stats>
 class CacheEntryStatsCollector {
  public:
   // Gathers stats and saves results into `stats`
-  void GetStats(Stats *stats, int maximum_age_in_seconds = 180) {
+  //
+  // Maximum allowed age for a "hit" on saved results is determined by the
+  // two interval parameters. Both set to 0 forces a re-scan. For example
+  // with min_interval_seconds=300 and min_interval_factor=100, if the last
+  // scan took 10s, we would only rescan ("miss") if the age in seconds of
+  // the saved results is > max(300, 100*10).
+  // Justification: scans can vary wildly in duration, e.g. from 0.02 sec
+  // to as much as 20 seconds, so we want to be able to cap the absolute
+  // and relative frequency of scans.
+  void GetStats(Stats *stats, int min_interval_seconds,
+                int min_interval_factor) {
     // Waits for any pending reader or writer (collector)
     std::lock_guard<std::mutex> lock(mutex_);
-    // Maximum allowed age is nominally given by the parameter, but
-    // to limit the possibility of accidental repeated scans, impose
-    // a minimum TTL of 1 second.
     uint64_t max_age_micros =
-        static_cast<uint64_t>(std::max(maximum_age_in_seconds, 1)) * 1000000U;
+        static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
+
+    if (last_end_time_micros_ > last_start_time_micros_ &&
+        min_interval_factor > 0) {
+      max_age_micros = std::max(
+          max_age_micros, min_interval_factor * (last_end_time_micros_ -
+                                                 last_start_time_micros_));
+    }
 
     uint64_t start_time_micros = clock_->NowMicros();
     if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
@@ -68,6 +82,8 @@ class CacheEntryStatsCollector {
       saved_stats_.BeginCollection(cache_, clock_, start_time_micros);
 
       cache_->ApplyToAllEntries(saved_stats_.GetEntryCallback(), {});
+      TEST_SYNC_POINT_CALLBACK(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);
 
       uint64_t end_time_micros = clock_->NowMicros();
       last_end_time_micros_ = end_time_micros;
@@ -109,7 +125,8 @@ class CacheEntryStatsCollector {
     // usage to go flaky. Fix the problem somehow so we can use an
     // accurate charge.
     size_t charge = 0;
-    Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h);
+    Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
+                             Cache::Priority::HIGH);
     if (!s.ok()) {
       assert(h == nullptr);
       return s;
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 45dae285c..fe1cb58d6 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -152,12 +152,13 @@ class DBBlockCacheTest : public DBTestBase {
   }
 
 #ifndef ROCKSDB_LITE
-  const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCounts() {
+  const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCountsBg() {
     // Verify in cache entry role stats
     ColumnFamilyHandleImpl* cfh =
         static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
     InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats();
-    return internal_stats_ptr->TEST_GetCacheEntryRoleStats().entry_counts;
+    return internal_stats_ptr->TEST_GetCacheEntryRoleStats(/*foreground=*/false)
+        .entry_counts;
   }
 #endif  // ROCKSDB_LITE
 };
@@ -935,6 +936,8 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   options.max_open_files = 13;
   options.table_cache_numshardbits = 0;
+  // If this wakes up, it could interfere with test
+  options.stats_dump_period_sec = 0;
 
   BlockBasedTableOptions table_options;
   table_options.block_cache = cache;
@@ -970,7 +973,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
     std::array<size_t, kNumCacheEntryRoles> expected{};
     // For CacheEntryStatsCollector
     expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
-    EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+    EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
 
     std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
@@ -981,13 +984,13 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
       expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
     }
     // Within some time window, we will get cached entry stats
-    EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
+    EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
     // Not enough to force a miss
-    env_->MockSleepForSeconds(10);
-    EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
+    env_->MockSleepForSeconds(45);
+    EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
     // Enough to force a miss
-    env_->MockSleepForSeconds(1000);
-    EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+    env_->MockSleepForSeconds(601);
+    EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
 
     // Now access index and data block
     ASSERT_EQ("value", Get("foo"));
@@ -997,8 +1000,22 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
       expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
     }
     expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
-    env_->MockSleepForSeconds(1000);
-    EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+    // Enough to force a miss
+    env_->MockSleepForSeconds(601);
+    // But inject a simulated long scan so that we need a longer
+    // interval to force a miss next time.
+    SyncPoint::GetInstance()->SetCallBack(
+        "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+        [this](void*) {
+          // To spend no more than 0.2% of time scanning, we would need
+          // an interval of at least 10000s
+          env_->MockSleepForSeconds(20);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+    prev_expected = expected;
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
 
     // The same for other file
     ASSERT_EQ("value", Get("zfoo"));
@@ -1008,8 +1025,14 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
       expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
     }
     expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
-    env_->MockSleepForSeconds(1000);
-    EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+    // Because of the simulated long scan, this is not enough to force
+    // a miss
+    env_->MockSleepForSeconds(601);
+    EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+    // But this is enough
+    env_->MockSleepForSeconds(10000);
+    EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+    prev_expected = expected;
 
     // Also check the GetProperty interface
     std::map<std::string, std::string> values;
@@ -1025,8 +1048,54 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
     EXPECT_EQ(
         ToString(expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)]),
         values["count.filter-block"]);
+    EXPECT_EQ(
+        ToString(
+            prev_expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+        values["count.write-buffer"]);
     EXPECT_EQ(ToString(expected[static_cast<size_t>(CacheEntryRole::kMisc)]),
               values["count.misc"]);
+
+    // Add one for kWriteBuffer
+    {
+      WriteBufferManager wbm(size_t{1} << 20, cache);
+      wbm.ReserveMem(1024);
+      expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
+      // Now we check that the GetProperty interface is more aggressive about
+      // re-scanning stats, but not totally aggressive.
+      // Within some time window, we will get cached entry stats
+      env_->MockSleepForSeconds(1);
+      EXPECT_EQ(ToString(prev_expected[static_cast<size_t>(
+                    CacheEntryRole::kWriteBuffer)]),
+                values["count.write-buffer"]);
+      // Not enough for a "background" miss but enough for a "foreground" miss
+      env_->MockSleepForSeconds(45);
+
+      ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+                                      &values));
+      EXPECT_EQ(
+          ToString(
+              expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+          values["count.write-buffer"]);
+    }
+    prev_expected = expected;
+
+    // With collector pinned in cache, we should be able to hit
+    // even if the cache is full
+    ClearCache(cache.get());
+    Cache::Handle* h = nullptr;
+    ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
+                            GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+                            &h, Cache::Priority::HIGH));
+    ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
+    expected = {};
+    expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
+    // Still able to hit on saved stats
+    EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+    // Enough to force a miss
+    env_->MockSleepForSeconds(1000);
+    EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+    cache->Release(h);
   }
   EXPECT_GE(iterations_tested, 1);
 }
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index b609add47..5f1f06fa5 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -528,26 +528,37 @@ InternalStats::InternalStats(int num_levels, SystemClock* clock,
       cfd_(cfd),
       started_at_(clock->NowMicros()) {}
 
-Status InternalStats::CollectCacheEntryStats() {
-  using Collector = CacheEntryStatsCollector<CacheEntryRoleStats>;
-  Cache* block_cache;
-  bool ok = HandleBlockCacheStat(&block_cache);
-  if (ok) {
-    // Extract or create stats collector.
-    std::shared_ptr<Collector> collector;
-    Status s = Collector::GetShared(block_cache, clock_, &collector);
-    if (s.ok()) {
-      // TODO: use a max age like stats_dump_period_sec / 2, but it's
-      // difficult to access that setting from here with just cfd_
-      collector->GetStats(&cache_entry_stats);
+Status InternalStats::CollectCacheEntryStats(bool foreground) {
+  // Lazy initialize/reference the collector. It is pinned in cache (through
+  // a shared_ptr) so that it does not get immediately ejected from a full
+  // cache, which would force a re-scan on the next GetStats.
+  if (!cache_entry_stats_collector_) {
+    Cache* block_cache;
+    bool ok = HandleBlockCacheStat(&block_cache);
+    if (ok) {
+      // Extract or create stats collector.
+      Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
+          block_cache, clock_, &cache_entry_stats_collector_);
+      if (!s.ok()) {
+        // Block cache likely under pressure. Scanning could make it worse,
+        // so skip.
+        return s;
+      }
     } else {
-      // Block cache likely under pressure. Scanning could make it worse,
-      // so skip.
+      return Status::NotFound("block cache not configured");
     }
-    return s;
-  } else {
-    return Status::NotFound("block cache not configured");
   }
+  assert(cache_entry_stats_collector_);
+
+  // For "background" collections, strictly cap the collection time by
+  // expanding effective cache TTL. For foreground, be more aggressive about
+  // getting latest data.
+  int min_interval_seconds = foreground ? 10 : 180;
+  // 1/500 = max of 0.2% of one CPU thread
+  int min_interval_factor = foreground ? 10 : 500;
+  cache_entry_stats_collector_->GetStats(
+      &cache_entry_stats_, min_interval_seconds, min_interval_factor);
+  return Status::OK();
 }
 
 std::function<void(const Slice& key, void* value, size_t charge,
@@ -638,21 +649,21 @@ void InternalStats::CacheEntryRoleStats::ToMap(
 
 bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
                                                Slice /*suffix*/) {
-  Status s = CollectCacheEntryStats();
+  Status s = CollectCacheEntryStats(/*foreground*/ true);
   if (!s.ok()) {
     return false;
   }
-  *value = cache_entry_stats.ToString(clock_);
+  *value = cache_entry_stats_.ToString(clock_);
   return true;
 }
 
 bool InternalStats::HandleBlockCacheEntryStatsMap(
     std::map<std::string, std::string>* values, Slice /*suffix*/) {
-  Status s = CollectCacheEntryStats();
+  Status s = CollectCacheEntryStats(/*foreground*/ true);
   if (!s.ok()) {
     return false;
   }
-  cache_entry_stats.ToMap(values, clock_);
+  cache_entry_stats_.ToMap(values, clock_);
   return true;
 }
 
@@ -1608,9 +1619,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
   cf_stats_snapshot_.comp_stats = compaction_stats_sum;
   cf_stats_snapshot_.stall_count = total_stall_count;
 
-  Status s = CollectCacheEntryStats();
+  // Always treat CFStats context as "background"
+  Status s = CollectCacheEntryStats(/*foreground=*/false);
   if (s.ok()) {
-    value->append(cache_entry_stats.ToString(clock_));
+    value->append(cache_entry_stats_.ToString(clock_));
   } else {
     value->append("Block cache: ");
     value->append(s.ToString());
diff --git a/db/internal_stats.h b/db/internal_stats.h
index a9672258a..023bf3b5e 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -23,6 +23,8 @@ class ColumnFamilyData;
 
 namespace ROCKSDB_NAMESPACE {
+template <class Stats>
+class CacheEntryStatsCollector;
 class DBImpl;
 class MemTableList;
@@ -390,7 +392,7 @@ class InternalStats {
       cf_stats_count_[i] = 0;
       cf_stats_value_[i] = 0;
     }
-    cache_entry_stats.Clear();
+    cache_entry_stats_.Clear();
     for (auto& comp_stat : comp_stats_) {
       comp_stat.Clear();
     }
@@ -463,13 +465,13 @@ class InternalStats {
     return comp_stats_;
   }
 
-  const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats() {
-    Status s = CollectCacheEntryStats();
+  const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats(bool foreground) {
+    Status s = CollectCacheEntryStats(foreground);
     if (!s.ok()) {
       assert(false);
-      cache_entry_stats.Clear();
+      cache_entry_stats_.Clear();
     }
-    return cache_entry_stats;
+    return cache_entry_stats_;
   }
 
   // Store a mapping from the user-facing DB::Properties string to our
@@ -492,14 +494,16 @@ class InternalStats {
 
   bool HandleBlockCacheStat(Cache** block_cache);
 
-  Status CollectCacheEntryStats();
+  Status CollectCacheEntryStats(bool foreground);
 
   // Per-DB stats
   std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
 
   // Per-ColumnFamily stats
   uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
   uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
-  CacheEntryRoleStats cache_entry_stats;
+  CacheEntryRoleStats cache_entry_stats_;
+  std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+      cache_entry_stats_collector_;
   // Per-ColumnFamily/level compaction stats
   std::vector<CompactionStats> comp_stats_;
   std::vector<CompactionStats> comp_stats_by_pri_;
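
For reference, the "foreground" path exercised above is reached through
the public property API mentioned in the summary. A minimal sketch of a
caller (illustration only; the DB path and options are made up, while
GetMapProperty and DB::Properties::kBlockCacheEntryStats are the real
API this patch's summary refers to):

    #include <iostream>
    #include <map>
    #include <string>

    #include "rocksdb/db.h"

    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/entry_stats_demo", &db);
      if (!s.ok()) return 1;

      // Foreground request: per this patch it uses min_interval_seconds=10
      // and min_interval_factor=10, so it is more willing to re-scan than
      // the periodic stats dump, but still bounded.
      std::map<std::string, std::string> values;
      if (db->GetMapProperty(rocksdb::DB::Properties::kBlockCacheEntryStats,
                             &values)) {
        // Prints entries such as count.data-block, count.filter-block,
        // count.misc, as checked in the test above.
        for (const auto& kv : values) {
          std::cout << kv.first << " = " << kv.second << "\n";
        }
      }
      delete db;
      return 0;
    }
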