From 01a2e202994ffe540d55181f1da1d70d955956c6 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Mon, 25 Jul 2022 13:03:55 -0700
Subject: [PATCH] Account for DB ID in stress testing block cache keys (#10388)

Summary:
I recently discovered that block cache keys are slightly lower quality than
previously thought, because my stress testing tool failed to simulate the
effect of DB ID differences. This change updates the tool and gives us data
to guide future developments. (No changes to production code here and now.)

Nevertheless, the following promise still holds:

```
// In fact, if our SST files are all < 4TB (see
// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
// in a single process are guaranteed to have unique cache keys, unless/until
// number session ids * max file number = 2**86 ...
```

because although different DB IDs could cause a collision in file number and
offset data, the colliding files would also have to share the same DB session
(lower bits) to produce a block cache key collision, and that is not possible
within a single process. (A session is associated with only one DB ID.)

This change fixes cache_bench -stress_cache_key to set and reset DB IDs in a
parameterized way so we can evaluate the effect. Previous results, assumed to
be representative (using -sck_keep_bits=43):

```
15 collisions after 15 x 90 days, est 90 days between (1.03763e+20 corrected)
```

or an expected collision on a single machine every 104 billion billion days
(see the "corrected" value).

After accounting for DB IDs, testing IDs that essentially never change, that
change at an intermediate rate, and that change very frequently (using the
default -sck_db_count=100):

```
-sck_newdb_nreopen=1000000000: 15 collisions after 2 x 90 days, est 12 days between (1.38351e+19 corrected)
-sck_newdb_nreopen=10000: 17 collisions after 2 x 90 days, est 10.5882 days between (1.22074e+19 corrected)
-sck_newdb_nreopen=100: 19 collisions after 2 x 90 days, est 9.47368 days between (1.09224e+19 corrected)
```

or roughly 10x more often than previously thought (still extremely rare, if
not impossibly so), and better than random base cache keys (with
-sck_randomize), though < 10x better than random:

```
31 collisions after 1 x 90 days, est 2.90323 days between (3.34719e+18 corrected)
```

If we simply fixed this by ignoring DB ID for cache keys, we would
potentially have a shortage of entropy for some cases, such as small file
numbers and offsets (e.g. many short-lived processes each using
SstFileWriter to create a small file), because existing DB session IDs only
provide ~103 bits of entropy (see the arithmetic sketch below). We could
upgrade the entropy in DB session IDs to accommodate, but it's not known
what else would be affected by changing from 20 digit session IDs to
something larger.

Instead, my plan is to
1) Move to block cache keys derived from SST unique IDs (so that we can
derive block cache keys from manifest data without reading the file on
storage), and show no significant regression in expected collision rate.
2) Generate better SST unique IDs in format_version=6
(https://github.com/facebook/rocksdb/issues/9058), which should have ~100x
lower expected/predicted collision rate, based on simulations with this
stress test:

```
./cache_bench -stress_cache_key -sck_keep_bits=39 -sck_newdb_nreopen=100 -sck_footer_unique_id
...
15 collisions after 19 x 90 days, est 114 days between (2.10293e+21 corrected)
```
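As a sanity check on two of the figures above (illustrative arithmetic only,
not part of this patch): the "est N days between" values are consistent with
(number of simulated 90-day runs * 90) / collisions, and the ~103 bit figure
follows from DB session IDs being 20 base-36 characters:

```
// Standalone arithmetic check; does not depend on RocksDB headers.
#include <cmath>
#include <cstdio>

int main() {
  // e.g. 17 collisions after 2 x 90 days -> (2 * 90) / 17 days between,
  // matching the "est 10.5882 days between" figure above
  std::printf("est days between: %g\n", 2 * 90.0 / 17);

  // 20 base-36 characters carry 20 * log2(36) ~= 103.4 bits of entropy
  std::printf("session id entropy: %.1f bits\n", 20 * std::log2(36.0));
  return 0;
}
```

(The same arithmetic reproduces the other rows, e.g. 19 x 90 / 15 = 114 for
the -sck_footer_unique_id run.)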
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10388

Test Plan: no production changes

Reviewed By: jay-zhuang

Differential Revision: D37986714

Pulled By: pdillinger

fbshipit-source-id: e759b2469e3365cb01c6661a69e0ab849ef4c3df
---
 cache/cache_bench_tool.cc | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 2fc5ed34b..56db2fea3 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -114,6 +114,8 @@ DEFINE_uint32(
     "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
 DEFINE_uint32(sck_reopen_nfiles, 100,
               "(-stress_cache_key) Simulate DB re-open average every n files");
+DEFINE_uint32(sck_newdb_nreopen, 1000,
+              "(-stress_cache_key) Simulate new DB average every n re-opens");
 DEFINE_uint32(sck_restarts_per_day, 24,
               "(-stress_cache_key) Average simulated process restarts per day "
               "(across DBs)");
@@ -780,7 +782,7 @@ class StressCacheKey {
 
   void RunOnce() {
     // Re-initialized simulated state
-    const size_t db_count = FLAGS_sck_db_count;
+    const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1});
     dbs_.reset(new TableProperties[db_count]{});
     const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
     table_.reset(new uint64_t[table_mask + 1]{});
@@ -797,7 +799,8 @@
 
     process_count_ = 0;
     session_count_ = 0;
-    ResetProcess();
+    newdb_count_ = 0;
+    ResetProcess(/*newdbs*/ true);
 
     Random64 r{std::random_device{}()};
 
@@ -816,9 +819,9 @@
       }
      // Any other periodic actions before simulating next file
      if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
-        ResetSession(db_i);
+        ResetSession(db_i, /*newdb*/ r.OneIn(FLAGS_sck_newdb_nreopen));
      } else if (r.OneIn(restart_nfiles_)) {
-        ResetProcess();
+        ResetProcess(/*newdbs*/ false);
      }
      // Simulate next file
      OffsetableCacheKey ock;
@@ -870,7 +873,7 @@ class StressCacheKey {
         // Our goal is to predict probability of no collisions, not expected
        // number of collisions. To make the distinction, we have to get rid
        // of observing correlated collisions, which this takes care of:
-        ResetProcess();
+        ResetProcess(/*newdbs*/ false);
      } else {
        // Replace (end of lifetime for file that was in this slot)
        table_[pos] = reduced_key;
@@ -888,10 +891,11 @@
      }
      // Report
      printf(
-          "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64
-          " sess, %u coll, occ %g%%, ejected %g%% \r",
+          "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 " sess, %" PRIu64
+          " newdb, %u coll, occ %g%%, ejected %g%% \r",
          file_count / FLAGS_sck_files_per_day, process_count_,
-          session_count_, collisions_this_run, 100.0 * sampled_count / 1000.0,
+          session_count_, newdb_count_ - FLAGS_sck_db_count,
+          collisions_this_run, 100.0 * sampled_count / 1000.0,
          100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count));
      fflush(stdout);
    }
@@ -899,16 +903,27 @@
    collisions_ += collisions_this_run;
  }
 
-  void ResetSession(size_t i) {
+  void ResetSession(size_t i, bool newdb) {
    dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+    if (newdb) {
+      ++newdb_count_;
+      if (FLAGS_sck_footer_unique_id) {
+        // Simulate how footer id would behave
+        dbs_[i].db_id = "none";
+      } else {
+        // db_id might be ignored, depending on the implementation details
+        dbs_[i].db_id = std::to_string(newdb_count_);
+        dbs_[i].orig_file_number = 0;
+      }
+    }
    session_count_++;
  }
 
-  void ResetProcess() {
+  void ResetProcess(bool newdbs) {
    process_count_++;
    DBImpl::TEST_ResetDbSessionIdGen();
    for (size_t i = 0; i < FLAGS_sck_db_count; ++i) {
-      ResetSession(i);
+      ResetSession(i, newdbs);
    }
    if (FLAGS_sck_footer_unique_id) {
      // For footer unique ID, this tracks process-wide generated SST file
@@ -923,6 +938,7 @@
  std::unique_ptr<uint64_t[]> table_;
  uint64_t process_count_ = 0;
  uint64_t session_count_ = 0;
+  uint64_t newdb_count_ = 0;
  uint64_t collisions_ = 0;
  uint32_t restart_nfiles_ = 0;
  double multiplier_ = 0.0;