Pin CacheEntryStatsCollector to fix performance bug (#8385)

Summary:
If the block Cache is full with strict_capacity_limit=false,
then our CacheEntryStatsCollector could be immediately evicted on
release, so iterating through column families with shared block cache
could trigger re-scan for each CF. This change fixes that problem by
pinning the CacheEntryStatsCollector from InternalStats so that it's not
evicted.

I had originally thought that this object could participate in LRU like
everything else, but even though a re-load+re-scan only touches memory,
it can be orders of magnitude more expensive than other cache misses.
One service in Facebook has scans that take ~20s over 100GB block cache
that is mostly 4KB entries. (The up-side of this bug and https://github.com/facebook/rocksdb/issues/8369 is that
we had a natural experiment on the effect on some service metrics even
with block cache scans running continuously in the background--a kind
of worst case scenario. Metrics like latency were not affected enough
to trigger warnings.)

Other smaller fixes:

20s is already a sizable portion of 600s stats dump period, or 180s
default max age to force re-scan, so added logic to ensure that (for
each block cache) we don't spend more than 0.2% of our background thread
time scanning it. Nevertheless, "foreground" requests for cache entry
stats (calls to `db->GetMapProperty(DB::Properties::kBlockCacheEntryStats)`)
are permitted to consume more CPU.

Renamed field to cache_entry_stats_ to match code style.

This change is intended for patching in 6.21 release.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8385

Test Plan:
unit test expanded to cover new logic (detect regression),
some manual testing with db_bench

Reviewed By: ajkr

Differential Revision: D29042759

Pulled By: pdillinger

fbshipit-source-id: 236faa902397f50038c618f50fbc8cf3f277308c
main
Peter Dillinger 3 years ago committed by Facebook GitHub Bot
parent d60ae5b1c7
commit d5a46c40e5
  1. 29
      cache/cache_entry_stats.h
  2. 93
      db/db_block_cache_test.cc
  3. 58
      db/internal_stats.cc
  4. 18
      db/internal_stats.h

@ -52,15 +52,29 @@ template <class Stats>
class CacheEntryStatsCollector {
public:
// Gathers stats and saves results into `stats`
void GetStats(Stats *stats, int maximum_age_in_seconds = 180) {
//
// Maximum allowed age for a "hit" on saved results is determined by the
// two interval parameters. Both set to 0 forces a re-scan. For example
// with min_interval_seconds=300 and min_interval_factor=100, if the last
// scan took 10s, we would only rescan ("miss") if the age in seconds of
// the saved results is > max(300, 100*10).
// Justification: scans can vary wildly in duration, e.g. from 0.02 sec
// to as much as 20 seconds, so we want to be able to cap the absolute
// and relative frequency of scans.
void GetStats(Stats *stats, int min_interval_seconds,
int min_interval_factor) {
// Waits for any pending reader or writer (collector)
std::lock_guard<std::mutex> lock(mutex_);
// Maximum allowed age is nominally given by the parameter, but
// to limit the possibility of accidental repeated scans, impose
// a minimum TTL of 1 second.
uint64_t max_age_micros =
static_cast<uint64_t>(std::max(maximum_age_in_seconds, 1)) * 1000000U;
static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
if (last_end_time_micros_ > last_start_time_micros_ &&
min_interval_factor > 0) {
max_age_micros = std::max(
max_age_micros, min_interval_factor * (last_end_time_micros_ -
last_start_time_micros_));
}
uint64_t start_time_micros = clock_->NowMicros();
if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
@ -68,6 +82,8 @@ class CacheEntryStatsCollector {
saved_stats_.BeginCollection(cache_, clock_, start_time_micros);
cache_->ApplyToAllEntries(saved_stats_.GetEntryCallback(), {});
TEST_SYNC_POINT_CALLBACK(
"CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);
uint64_t end_time_micros = clock_->NowMicros();
last_end_time_micros_ = end_time_micros;
@ -109,7 +125,8 @@ class CacheEntryStatsCollector {
// usage to go flaky. Fix the problem somehow so we can use an
// accurate charge.
size_t charge = 0;
Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h);
Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
Cache::Priority::HIGH);
if (!s.ok()) {
assert(h == nullptr);
return s;

@ -152,12 +152,13 @@ class DBBlockCacheTest : public DBTestBase {
}
#ifndef ROCKSDB_LITE
const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCounts() {
const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCountsBg() {
// Verify in cache entry role stats
ColumnFamilyHandleImpl* cfh =
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats();
return internal_stats_ptr->TEST_GetCacheEntryRoleStats().entry_counts;
return internal_stats_ptr->TEST_GetCacheEntryRoleStats(/*foreground=*/false)
.entry_counts;
}
#endif // ROCKSDB_LITE
};
@ -935,6 +936,8 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
options.max_open_files = 13;
options.table_cache_numshardbits = 0;
// If this wakes up, it could interfere with test
options.stats_dump_period_sec = 0;
BlockBasedTableOptions table_options;
table_options.block_cache = cache;
@ -970,7 +973,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
std::array<size_t, kNumCacheEntryRoles> expected{};
// For CacheEntryStatsCollector
expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
@ -981,13 +984,13 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
}
// Within some time window, we will get cached entry stats
EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
// Not enough to force a miss
env_->MockSleepForSeconds(10);
EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
env_->MockSleepForSeconds(45);
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
// Enough to force a miss
env_->MockSleepForSeconds(1000);
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
env_->MockSleepForSeconds(601);
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
// Now access index and data block
ASSERT_EQ("value", Get("foo"));
@ -997,8 +1000,22 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
}
expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
env_->MockSleepForSeconds(1000);
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
// Enough to force a miss
env_->MockSleepForSeconds(601);
// But inject a simulated long scan so that we need a longer
// interval to force a miss next time.
SyncPoint::GetInstance()->SetCallBack(
"CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
[this](void*) {
// To spend no more than 0.2% of time scanning, we would need
// interval of at least 10000s
env_->MockSleepForSeconds(20);
});
SyncPoint::GetInstance()->EnableProcessing();
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
prev_expected = expected;
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// The same for other file
ASSERT_EQ("value", Get("zfoo"));
@ -1008,8 +1025,14 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
}
expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
env_->MockSleepForSeconds(1000);
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
// Because of the simulated long scan, this is not enough to force
// a miss
env_->MockSleepForSeconds(601);
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
// But this is enough
env_->MockSleepForSeconds(10000);
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
prev_expected = expected;
// Also check the GetProperty interface
std::map<std::string, std::string> values;
@ -1025,8 +1048,54 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
EXPECT_EQ(
ToString(expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)]),
values["count.filter-block"]);
EXPECT_EQ(
ToString(
prev_expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
values["count.write-buffer"]);
EXPECT_EQ(ToString(expected[static_cast<size_t>(CacheEntryRole::kMisc)]),
values["count.misc"]);
// Add one for kWriteBuffer
{
WriteBufferManager wbm(size_t{1} << 20, cache);
wbm.ReserveMem(1024);
expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
// Now we check that the GetProperty interface is more agressive about
// re-scanning stats, but not totally aggressive.
// Within some time window, we will get cached entry stats
env_->MockSleepForSeconds(1);
EXPECT_EQ(ToString(prev_expected[static_cast<size_t>(
CacheEntryRole::kWriteBuffer)]),
values["count.write-buffer"]);
// Not enough for a "background" miss but enough for a "foreground" miss
env_->MockSleepForSeconds(45);
ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
&values));
EXPECT_EQ(
ToString(
expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
values["count.write-buffer"]);
}
prev_expected = expected;
// With collector pinned in cache, we should be able to hit
// even if the cache is full
ClearCache(cache.get());
Cache::Handle* h = nullptr;
ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
&h, Cache::Priority::HIGH));
ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
expected = {};
expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
// Still able to hit on saved stats
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
// Enough to force a miss
env_->MockSleepForSeconds(1000);
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
cache->Release(h);
}
EXPECT_GE(iterations_tested, 1);
}

@ -528,26 +528,37 @@ InternalStats::InternalStats(int num_levels, SystemClock* clock,
cfd_(cfd),
started_at_(clock->NowMicros()) {}
Status InternalStats::CollectCacheEntryStats() {
using Collector = CacheEntryStatsCollector<CacheEntryRoleStats>;
Cache* block_cache;
bool ok = HandleBlockCacheStat(&block_cache);
if (ok) {
// Extract or create stats collector.
std::shared_ptr<Collector> collector;
Status s = Collector::GetShared(block_cache, clock_, &collector);
if (s.ok()) {
// TODO: use a max age like stats_dump_period_sec / 2, but it's
// difficult to access that setting from here with just cfd_
collector->GetStats(&cache_entry_stats);
Status InternalStats::CollectCacheEntryStats(bool foreground) {
// Lazy initialize/reference the collector. It is pinned in cache (through
// a shared_ptr) so that it does not get immediately ejected from a full
// cache, which would force a re-scan on the next GetStats.
if (!cache_entry_stats_collector_) {
Cache* block_cache;
bool ok = HandleBlockCacheStat(&block_cache);
if (ok) {
// Extract or create stats collector.
Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
block_cache, clock_, &cache_entry_stats_collector_);
if (!s.ok()) {
// Block cache likely under pressure. Scanning could make it worse,
// so skip.
return s;
}
} else {
// Block cache likely under pressure. Scanning could make it worse,
// so skip.
return Status::NotFound("block cache not configured");
}
return s;
} else {
return Status::NotFound("block cache not configured");
}
assert(cache_entry_stats_collector_);
// For "background" collections, strictly cap the collection time by
// expanding effective cache TTL. For foreground, be more aggressive about
// getting latest data.
int min_interval_seconds = foreground ? 10 : 180;
// 1/500 = max of 0.2% of one CPU thread
int min_interval_factor = foreground ? 10 : 500;
cache_entry_stats_collector_->GetStats(
&cache_entry_stats_, min_interval_seconds, min_interval_factor);
return Status::OK();
}
std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
@ -638,21 +649,21 @@ void InternalStats::CacheEntryRoleStats::ToMap(
bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
Slice /*suffix*/) {
Status s = CollectCacheEntryStats();
Status s = CollectCacheEntryStats(/*foreground*/ true);
if (!s.ok()) {
return false;
}
*value = cache_entry_stats.ToString(clock_);
*value = cache_entry_stats_.ToString(clock_);
return true;
}
bool InternalStats::HandleBlockCacheEntryStatsMap(
std::map<std::string, std::string>* values, Slice /*suffix*/) {
Status s = CollectCacheEntryStats();
Status s = CollectCacheEntryStats(/*foreground*/ true);
if (!s.ok()) {
return false;
}
cache_entry_stats.ToMap(values, clock_);
cache_entry_stats_.ToMap(values, clock_);
return true;
}
@ -1608,9 +1619,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
cf_stats_snapshot_.comp_stats = compaction_stats_sum;
cf_stats_snapshot_.stall_count = total_stall_count;
Status s = CollectCacheEntryStats();
// Always treat CFStats context as "background"
Status s = CollectCacheEntryStats(/*foreground=*/false);
if (s.ok()) {
value->append(cache_entry_stats.ToString(clock_));
value->append(cache_entry_stats_.ToString(clock_));
} else {
value->append("Block cache: ");
value->append(s.ToString());

@ -23,6 +23,8 @@ class ColumnFamilyData;
namespace ROCKSDB_NAMESPACE {
template <class Stats>
class CacheEntryStatsCollector;
class DBImpl;
class MemTableList;
@ -390,7 +392,7 @@ class InternalStats {
cf_stats_count_[i] = 0;
cf_stats_value_[i] = 0;
}
cache_entry_stats.Clear();
cache_entry_stats_.Clear();
for (auto& comp_stat : comp_stats_) {
comp_stat.Clear();
}
@ -463,13 +465,13 @@ class InternalStats {
return comp_stats_;
}
const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats() {
Status s = CollectCacheEntryStats();
const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats(bool foreground) {
Status s = CollectCacheEntryStats(foreground);
if (!s.ok()) {
assert(false);
cache_entry_stats.Clear();
cache_entry_stats_.Clear();
}
return cache_entry_stats;
return cache_entry_stats_;
}
// Store a mapping from the user-facing DB::Properties string to our
@ -492,14 +494,16 @@ class InternalStats {
bool HandleBlockCacheStat(Cache** block_cache);
Status CollectCacheEntryStats();
Status CollectCacheEntryStats(bool foreground);
// Per-DB stats
std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
// Per-ColumnFamily stats
uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
CacheEntryRoleStats cache_entry_stats;
CacheEntryRoleStats cache_entry_stats_;
std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
cache_entry_stats_collector_;
// Per-ColumnFamily/level compaction stats
std::vector<CompactionStats> comp_stats_;
std::vector<CompactionStats> comp_stats_by_pri_;

Loading…
Cancel
Save