From 01a2e202994ffe540d55181f1da1d70d955956c6 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Mon, 25 Jul 2022 13:03:55 -0700
Subject: [PATCH] Account for DB ID in stress testing block cache keys (#10388)

Summary:
I recently discovered that block cache keys are slightly lower quality than
previously thought, because my stress testing tool failed to simulate the
effect of DB ID differences. This change updates the tool and gives us data
to guide future developments. (No changes to production code here and now.)

Nevertheless, the following promise still holds:

```
// In fact, if our SST files are all < 4TB (see
// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
// in a single process are guaranteed to have unique cache keys, unless/until
// number session ids * max file number = 2**86 ...
```

because although different DB IDs could cause a collision in file number and
offset data, the colliding files would also have to share the same DB session
(lower bits) to produce a block cache key collision, and that is not possible
within a single process. (A session is associated with only one DB ID.)

This change fixes cache_bench -stress_cache_key to set and reset DB IDs in a
parameterized way so we can evaluate the effect. Previous results, assumed to
be representative (using -sck_keep_bits=43):

```
15 collisions after 15 x 90 days, est 90 days between (1.03763e+20 corrected)
```

or an expected collision on a single machine every 104 billion billion days
(see the "corrected" value).

After accounting for DB IDs, testing IDs that essentially never change, that
change at an intermediate rate, and that change very frequently (using the
default -sck_db_count=100):

```
-sck_newdb_nreopen=1000000000: 15 collisions after 2 x 90 days, est 12 days between (1.38351e+19 corrected)
-sck_newdb_nreopen=10000: 17 collisions after 2 x 90 days, est 10.5882 days between (1.22074e+19 corrected)
-sck_newdb_nreopen=100: 19 collisions after 2 x 90 days, est 9.47368 days between (1.09224e+19 corrected)
```

or roughly 10x more often than previously thought (still extremely rare, if
not impossibly so), and better than random base cache keys (with
-sck_randomize), though < 10x better than random:

```
31 collisions after 1 x 90 days, est 2.90323 days between (3.34719e+18 corrected)
```

If we simply fixed this by ignoring DB ID for cache keys, we would
potentially have a shortage of entropy for some cases, such as small file
numbers and offsets (e.g. many short-lived processes each using
SstFileWriter to create a small file), because existing DB session IDs only
provide ~103 bits of entropy (see the arithmetic sketch below). We could
upgrade the entropy in DB session IDs to accommodate, but it's not known
what else would be affected by changing from 20 digit session IDs to
something larger.

Instead, my plan is to
1) Move to block cache keys derived from SST unique IDs (so that we can
derive block cache keys from manifest data without reading the file on
storage), and show no significant regression in expected collision rate.
2) Generate better SST unique IDs in format_version=6
(https://github.com/facebook/rocksdb/issues/9058), which should have ~100x
lower expected/predicted collision rate, based on simulations with this
stress test:

```
./cache_bench -stress_cache_key -sck_keep_bits=39 -sck_newdb_nreopen=100 -sck_footer_unique_id
...
15 collisions after 19 x 90 days, est 114 days between (2.10293e+21 corrected)
```
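As a sanity check on two of the figures above (illustrative arithmetic only,
not part of this patch): the "est N days between" values are consistent with
(number of simulated 90-day runs * 90) / collisions, and the ~103 bit figure
follows from DB session IDs being 20 base-36 characters:

```
// Standalone arithmetic check; does not depend on RocksDB headers.
#include <cmath>
#include <cstdio>

int main() {
  // e.g. 17 collisions after 2 x 90 days -> (2 * 90) / 17 days between,
  // matching the "est 10.5882 days between" figure above
  std::printf("est days between: %g\n", 2 * 90.0 / 17);

  // 20 base-36 characters carry 20 * log2(36) ~= 103.4 bits of entropy
  std::printf("session id entropy: %.1f bits\n", 20 * std::log2(36.0));
  return 0;
}
```

(The same arithmetic reproduces the other rows, e.g. 19 x 90 / 15 = 114 for
the -sck_footer_unique_id run.)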
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10388

Test Plan: no production changes

Reviewed By: jay-zhuang

Differential Revision: D37986714

Pulled By: pdillinger

fbshipit-source-id: e759b2469e3365cb01c6661a69e0ab849ef4c3df
---
 cache/cache_bench_tool.cc | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 2fc5ed34b..56db2fea3 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -114,6 +114,8 @@ DEFINE_uint32(
     "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
 DEFINE_uint32(sck_reopen_nfiles, 100,
               "(-stress_cache_key) Simulate DB re-open average every n files");
+DEFINE_uint32(sck_newdb_nreopen, 1000,
+              "(-stress_cache_key) Simulate new DB average every n re-opens");
 DEFINE_uint32(sck_restarts_per_day, 24,
               "(-stress_cache_key) Average simulated process restarts per day "
               "(across DBs)");
@@ -780,7 +782,7 @@ class StressCacheKey {
 
   void RunOnce() {
     // Re-initialized simulated state
-    const size_t db_count = FLAGS_sck_db_count;
+    const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1});
     dbs_.reset(new TableProperties[db_count]{});
     const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
     table_.reset(new uint64_t[table_mask + 1]{});
@@ -797,7 +799,8 @@
 
     process_count_ = 0;
     session_count_ = 0;
-    ResetProcess();
+    newdb_count_ = 0;
+    ResetProcess(/*newdbs*/ true);
 
     Random64 r{std::random_device{}()};
 
@@ -816,9 +819,9 @@
       }
      // Any other periodic actions before simulating next file
      if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
-        ResetSession(db_i);
+        ResetSession(db_i, /*newdb*/ r.OneIn(FLAGS_sck_newdb_nreopen));
      } else if (r.OneIn(restart_nfiles_)) {
-        ResetProcess();
+        ResetProcess(/*newdbs*/ false);
      }
      // Simulate next file
      OffsetableCacheKey ock;
@@ -870,7 +873,7 @@ class StressCacheKey {
         // Our goal is to predict probability of no collisions, not expected
        // number of collisions. To make the distinction, we have to get rid
        // of observing correlated collisions, which this takes care of:
-        ResetProcess();
+        ResetProcess(/*newdbs*/ false);
      } else {
        // Replace (end of lifetime for file that was in this slot)
        table_[pos] = reduced_key;
@@ -888,10 +891,11 @@
      }
      // Report
      printf(
-          "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64
-          " sess, %u coll, occ %g%%, ejected %g%% \r",
+          "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 " sess, %" PRIu64
+          " newdb, %u coll, occ %g%%, ejected %g%% \r",
          file_count / FLAGS_sck_files_per_day, process_count_,
-          session_count_, collisions_this_run, 100.0 * sampled_count / 1000.0,
+          session_count_, newdb_count_ - FLAGS_sck_db_count,
+          collisions_this_run, 100.0 * sampled_count / 1000.0,
          100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count));
      fflush(stdout);
    }
@@ -899,16 +903,27 @@
    collisions_ += collisions_this_run;
  }
 
-  void ResetSession(size_t i) {
+  void ResetSession(size_t i, bool newdb) {
    dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+    if (newdb) {
+      ++newdb_count_;
+      if (FLAGS_sck_footer_unique_id) {
+        // Simulate how footer id would behave
+        dbs_[i].db_id = "none";
+      } else {
+        // db_id might be ignored, depending on the implementation details
+        dbs_[i].db_id = std::to_string(newdb_count_);
+        dbs_[i].orig_file_number = 0;
+      }
+    }
    session_count_++;
  }
 
-  void ResetProcess() {
+  void ResetProcess(bool newdbs) {
    process_count_++;
    DBImpl::TEST_ResetDbSessionIdGen();
    for (size_t i = 0; i < FLAGS_sck_db_count; ++i) {
-      ResetSession(i);
+      ResetSession(i, newdbs);
    }
    if (FLAGS_sck_footer_unique_id) {
      // For footer unique ID, this tracks process-wide generated SST file
@@ -923,6 +938,7 @@
  std::unique_ptr<uint64_t[]> table_;
  uint64_t process_count_ = 0;
  uint64_t session_count_ = 0;
+  uint64_t newdb_count_ = 0;
  uint64_t collisions_ = 0;
  uint32_t restart_nfiles_ = 0;
  double multiplier_ = 0.0;