@ -79,31 +79,52 @@ static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
DEFINE_bool(use_clock_cache, false, "");

// ## BEGIN stress_cache_key sub-tool options ##
// See class StressCacheKey below.
DEFINE_bool(stress_cache_key, false,
            "If true, run cache key stress test instead");
DEFINE_uint32(
    sck_files_per_day, 2500000,
    "(-stress_cache_key) Simulated files generated per simulated day");
// NOTE: Giving each run a specified lifetime, rather than e.g. "until
// first collision" ensures equal skew from start-up, when collisions are
// less likely.
DEFINE_uint32(sck_days_per_run, 90,
              "(-stress_cache_key) Number of days to simulate in each run");
// NOTE: The number of observed collisions directly affects the relative
// accuracy of the predicted probabilities. 15 observations should be well
// within factor-of-2 accuracy.
DEFINE_uint32(
    sck_min_collision, 15,
    "(-stress_cache_key) Keep running until this many collisions seen");
// sck_file_size_mb can be thought of as average file size. The simulation is
// not precise enough to care about the distribution of file sizes; other
// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
// indicate the distribution only makes a small difference (e.g. < 2x factor)
DEFINE_uint32(
    sck_file_size_mb, 32,
    "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
DEFINE_uint32(sck_reopen_nfiles, 100,
              "(-stress_cache_key) Simulate DB re-open average every n files");
DEFINE_uint32(
    sck_restarts_per_day, 24,
    "(-stress_cache_key) Simulated process restarts per day (across DBs)");
DEFINE_uint32(
    sck_db_count, 100,
    "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
DEFINE_uint32(
    sck_table_bits, 20,
    "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
// sck_keep_bits being well below full 128 bits amplifies the collision
// probability so that the true probability can be estimated through observed
// collisions. (More explanation below.)
DEFINE_uint32(
    sck_keep_bits, 50,
    "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
// sck_randomize is used to validate whether cache key is performing "better
// than random." Even with this setting, file offsets are not randomized.
DEFINE_bool(sck_randomize, false,
            "(-stress_cache_key) Randomize (hash) cache key");
// See https://github.com/facebook/rocksdb/pull/9058
DEFINE_bool(sck_footer_unique_id, false,
            "(-stress_cache_key) Simulate using proposed footer unique id");
// ## END stress_cache_key sub-tool options ##
@ -583,20 +604,97 @@ class CacheBench {
}
} ;
// TODO: better description (see PR #9126 for some info)
// cache_bench -stress_cache_key is an independent embedded tool for
// estimating the probability of CacheKey collisions through simulation.
// At a high level, it simulates generating SST files over many months,
// keeping them in the DB and/or cache for some lifetime while staying
// under resource caps, and checking for any cache key collisions that
// arise among the set of live files. For efficient simulation, we make
// some simplifying "pessimistic" assumptions (that only increase the
// chance of the simulation reporting a collision relative to the chance
// of collision in practice):
// * Every generated file has a cache entry for every byte offset in the
// file (contiguous range of cache keys)
// * All of every file is cached for its entire lifetime. (Here "lifetime"
// is technically the union of DB and Cache lifetime, though we only
// model a generous DB lifetime, where space usage is always maximized.
// In an effective Cache, lifetime in cache can only substantially exceed
// lifetime in DB if there is little cache activity; cache activity is
// required to hit cache key collisions.)
//
// It would be possible to track an exact set of cache key ranges for the
// set of live files, but we would have no hope of observing collisions
// (overlap in live files) in our simulation. We need to employ some way
// of amplifying collision probability that allows us to predict the real
// collision probability by extrapolation from observed collisions. Our
// basic approach is to reduce each cache key range down to some smaller
// number of bits, and limiting to bits that are shared over the whole
// range. Now we can observe collisions using a set of smaller stripped-down
// (reduced) cache keys. Let's do some case analysis to understand why this
// works:
// * No collision in reduced key - because the reduction is a pure function
// this implies no collision in the full keys
// * Collision detected between two reduced keys - either
// * The reduction has dropped some structured uniqueness info (from one of
// session counter or file number; file offsets are never materialized here).
// This can only artificially inflate the observed and extrapolated collision
// probabilities. We only have to worry about this in designing the reduction.
// * The reduction has preserved all the structured uniqueness in the cache
// key, which means either
// * REJECTED: We have a uniqueness bug in generating cache keys, where
// structured uniqueness info should have been different but isn't. In such a
// case, increasing by 1 the number of bits kept after reduction would not
// reduce observed probabilities by half. (In our observations, the
// probabilities are reduced approximately by half.)
// * ACCEPTED: The lost unstructured uniqueness in the key determines the
// probability that an observed collision would imply an overlap in ranges.
// In short, dropping n bits from key would increase collision probability by
// 2**n, assuming those n bits have full entropy in unstructured uniqueness.
//
// But we also have to account for the key ranges based on file size. If file
// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
// "ranges", we know from other simulations (see
// https://github.com/pdillinger/unique_id/) that that's roughly equivalent to
// (less than 2x higher collision probability) using a cache key of size
// 128 - b bits for the whole file. (This is the only place we make an
// "optimistic" assumption, which is more than offset by the real
// implementation stripping off 2 lower bits from block byte offsets for cache
// keys. The simulation assumes byte offsets, which is net pessimistic.)
//
// So to accept the extrapolation as valid, we need to be confident that all
// "lost" bits, excluding those covered by file offset, are full entropy.
// Recall that we have assumed (verifiably, safely) that other structured data
// (file number and session counter) are kept, not lost. Based on the
// implementation comments for OffsetableCacheKey, the only potential hole here
// is that we only have ~103 bits of entropy in "all new" session IDs, and in
// extreme cases, there might be only 1 DB ID. However, because the upper ~39
// bits of session ID are hashed, the combination of file number and file
// offset only has to add to 25 bits (or more) to ensure full entropy in
// unstructured uniqueness lost in the reduction. Typical file size of 32MB
// suffices (at least for simulation purposes where we assume each file offset
// occupies a cache key).
//
// Example results in comments on OffsetableCacheKey.
class StressCacheKey {
public :
void Run ( ) {
if ( FLAGS_sck_footer_unique_id ) {
// Proposed footer unique IDs are DB-independent and session-independent
// (but process-dependent) which is most easily simulated here by
// assuming 1 DB and (later below) no session resets without process
// reset.
FLAGS_sck_db_count = 1 ;
}
// Describe the simulated workload
uint64_t mb_per_day =
uint64_t { FLAGS_sck_files_per_day } * FLAGS_sck_file_size_mb ;
printf ( " Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day \n " ,
FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
std : : pow ( 2.0 , FLAGS_sck_table_bits ) ,
mb_per_day / 86400.0 , mb_per_day / 1024.0 / 1024.0 ) ;
// For extrapolating probability of any collisions from a number of
// observed collisions
multiplier_ = std : : pow ( 2.0 , 128 - FLAGS_sck_keep_bits ) /
( FLAGS_sck_file_size_mb * 1024.0 * 1024.0 ) ;
printf (
@ -606,6 +704,9 @@ class StressCacheKey {
restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day ;
double without_ejection =
std : : pow ( 1.414214 , FLAGS_sck_keep_bits ) / FLAGS_sck_files_per_day ;
// This should be a lower bound for -sck_randomize, usually a terribly
// rough lower bound.
// If observation is worse than this, then something has gone wrong.
printf (
" Without ejection, expect random collision after %g days (%g "
" corrected) \n " ,
@ -613,30 +714,36 @@ class StressCacheKey {
double with_full_table =
std : : pow ( 2.0 , FLAGS_sck_keep_bits - FLAGS_sck_table_bits ) /
FLAGS_sck_files_per_day ;
// This is an alternate lower bound for -sck_randomize, usually pretty
// accurate. Our cache keys should usually perform "better than random"
// but always no worse. (If observation is substantially worse than this,
// then something has gone wrong.)
printf (
" With ejection and full table, expect random collision after %g "
" days (%g corrected) \n " ,
with_full_table , with_full_table * multiplier_ ) ;
collisions_ = 0 ;
// Run until sufficient number of observed collisions.
for ( int i = 1 ; collisions_ < FLAGS_sck_min_collision ; i + + ) {
RunOnce ( ) ;
if ( collisions_ = = 0 ) {
printf (
" No collisions after %d x %u days "
" \n " ,
i , FLAGS_sck_duratio n ) ;
i , FLAGS_sck_days_per_ru n ) ;
} else {
double est = 1.0 * i * FLAGS_sck_duratio n / collisions_ ;
double est = 1.0 * i * FLAGS_sck_days_per_ru n / collisions_ ;
printf ( " % " PRIu64
" collisions after %d x %u days, est %g days between (%g "
" corrected) \n " ,
collisions_ , i , FLAGS_sck_duratio n , est , est * multiplier_ ) ;
collisions_ , i , FLAGS_sck_days_per_ru n , est , est * multiplier_ ) ;
}
}
}
void RunOnce ( ) {
// Re-initialized simulated state
const size_t db_count = FLAGS_sck_db_count ;
dbs_ . reset ( new TableProperties [ db_count ] { } ) ;
const size_t table_mask = ( size_t { 1 } < < FLAGS_sck_table_bits ) - 1 ;
@ -644,7 +751,11 @@ class StressCacheKey {
if ( FLAGS_sck_keep_bits > 64 ) {
FLAGS_sck_keep_bits = 64 ;
}
// Details of which bits are dropped in reduction
uint32_t shift_away = 64 - FLAGS_sck_keep_bits ;
// Shift away fewer potential file number bits (b) than potential
// session counter bits (a).
uint32_t shift_away_b = shift_away / 3 ;
uint32_t shift_away_a = shift_away - shift_away_b ;
@ -655,62 +766,78 @@ class StressCacheKey {
Random64 r { std : : random_device { } ( ) } ;
uint64_t max_file_count =
uint64_t { FLAGS_sck_files_per_day } * FLAGS_sck_duratio n ;
uint64_t file_count = 0 ;
uint64_t { FLAGS_sck_files_per_day } * FLAGS_sck_days_per_ru n ;
uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t { 1024 } * 1024U ;
uint32_t report_count = 0 ;
uint32_t collisions_this_run = 0 ;
// Round robin through DBs
for ( size_t db_i = 0 ; ; + + db_i ) {
size_t db_i = 0 ;
for ( uint64_t file_count = 1 ; file_count < = max_file_count ;
+ + file_count , + + db_i ) {
// Round-robin through DBs (this is faster than %)
if ( db_i > = db_count ) {
db_i = 0 ;
}
if ( file_count > = max_file_count ) {
break ;
}
// Any other periodic actions before simulating next file
if ( ! FLAGS_sck_footer_unique_id & & r . OneIn ( FLAGS_sck_reopen_nfiles ) ) {
ResetSession ( db_i ) ;
} else if ( r . OneIn ( restart_nfiles_ ) ) {
ResetProcess ( ) ;
}
// Simulate next file
OffsetableCacheKey ock ;
dbs_ [ db_i ] . orig_file_number + = 1 ;
// skip some file numbers, unless 1 DB so that that can simulate
// better (DB-independent) unique IDs
if ( db_count > 1 ) {
// skip some file numbers for other file kinds, except in footer unique
// ID, orig_file_number here tracks process-wide generated SST file
// count.
if ( ! FLAGS_sck_footer_unique_id ) {
dbs_ [ db_i ] . orig_file_number + = ( r . Next ( ) & 3 ) ;
}
BlockBasedTable : : SetupBaseCacheKey ( & dbs_ [ db_i ] , " " , 42 , 42 , & ock ) ;
bool is_stable ;
BlockBasedTable : : SetupBaseCacheKey ( & dbs_ [ db_i ] , /* ignored */ " " ,
/* ignored */ 42 , file_size , & ock ,
& is_stable ) ;
assert ( is_stable ) ;
// Get a representative cache key, which later we analytically generalize
// to a range.
CacheKey ck = ock . WithOffset ( 0 ) ;
uint64_t stripped ;
uint64_t reduced_key ;
if ( FLAGS_sck_randomize ) {
stripped = GetSliceHash64 ( ck . AsSlice ( ) ) > > shift_away ;
reduced_key = GetSliceHash64 ( ck . AsSlice ( ) ) > > shift_away ;
} else if ( FLAGS_sck_footer_unique_id ) {
// Special case: keep only file number, not session counter
uint32_t a = DecodeFixed32 ( ck . AsSlice ( ) . data ( ) + 4 ) > > shift_away_a ;
uint32_t b = DecodeFixed32 ( ck . AsSlice ( ) . data ( ) + 12 ) > > shift_away_b ;
stripped = ( uint64_t { a } < < 32 ) + b ;
reduced_key = ( uint64_t { a } < < 32 ) + b ;
} else {
// Try to keep file number and session counter (shift away other bits)
uint32_t a = DecodeFixed32 ( ck . AsSlice ( ) . data ( ) ) < < shift_away_a ;
uint32_t b = DecodeFixed32 ( ck . AsSlice ( ) . data ( ) + 12 ) > > shift_away_b ;
stripped = ( uint64_t { a } < < 32 ) + b ;
reduced_key = ( uint64_t { a } < < 32 ) + b ;
}
if ( stripped = = 0 ) {
// Unlikely, but we need to exclude tracking this value
if ( reduced_key = = 0 ) {
// Unlikely, but we need to exclude tracking this value because we
// use it to mean "empty" in table. This case is OK as long as we
// don't hit it often.
printf ( " Hit Zero! \n " ) ;
file_count - - ;
continue ;
}
file_count + + ;
uint64_t h = NPHash64 ( reinterpret_cast < char * > ( & stripped ) , 8 ) ;
// Skew lifetimes
uint64_t h =
NPHash64 ( reinterpret_cast < char * > ( & reduced_key ) , sizeof ( reduced_key ) ) ;
// Skew expected lifetimes, for high (super-Poisson) variance
// in actual lifetimes.
size_t pos =
std : : min ( Lower32of64 ( h ) & table_mask , Upper32of64 ( h ) & table_mask ) ;
if ( table_ [ pos ] = = stripped ) {
if ( table_ [ pos ] = = reduced_key ) {
collisions_this_run + + ;
// To predict probability of no collisions, we have to get rid of
// correlated collisions, which this takes care of:
// Our goal is to predict probability of no collisions, not expected
// number of collisions. To make the distinction, we have to get rid
// of observing correlated collisions, which this takes care of:
ResetProcess ( ) ;
} else {
// Replace
table_ [ pos ] = stripped ;
// Replace (end of lifetime for file that was in this slot)
table_ [ pos ] = reduced_key ;
}
if ( + + report_count = = FLAGS_sck_files_per_day ) {
@ -748,6 +875,8 @@ class StressCacheKey {
ResetSession ( i ) ;
}
if ( FLAGS_sck_footer_unique_id ) {
// For footer unique ID, this tracks process-wide generated SST file
// count.
dbs_ [ 0 ] . orig_file_number = 0 ;
}
}