Derive cache keys from SST unique IDs (#10394)

Summary:
... so that cache keys can be derived from DB manifest data
before reading the file from storage, which means every part of the
file can potentially go in a persistent cache.

See updated comments in cache_key.cc for technical details. Importantly,
the new cache key encoding uses some fancy but efficient math to pack
data into the cache key without depending on the sizes of the various
pieces. This simplifies some existing code creating cache keys, like
cache warming before the file size is known.
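
As an illustration of that math (a minimal sketch, not code from this PR; it assumes the `DownwardInvolution()` and `ReverseBits()` helpers in `util/math.h` that this change relies on), the size-oblivious packing documented in `cache/cache_key.cc` combines three structured values `a`, `b`, `c` into 128 bits like so:

```cpp
#include <cstdint>
#include <utility>

#include "util/math.h"  // DownwardInvolution(), ReverseBits()

// Sketch: pack three values, each up to 64 bits, into two 64-bit words.
// Per the comment added in cache/cache_key.cc, the result is unique as
// long as some bounds 2**i, 2**j, 2**k with i + j + k <= 128 hold for
// a, b, and c, even though those bounds are never passed in.
inline std::pair<uint64_t, uint64_t> PackThreeSketch(uint64_t a, uint64_t b,
                                                     uint64_t c) {
  uint64_t lower64 = ROCKSDB_NAMESPACE::DownwardInvolution(a) ^
                     ROCKSDB_NAMESPACE::ReverseBits(b);
  uint64_t upper64 = c ^ ROCKSDB_NAMESPACE::ReverseBits(a);
  return {lower64, upper64};
}
```

The inversion of this packing (given the bounds) is what the new `CacheKeyDecoder` code in `db_block_cache_test.cc` exercises.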

This should provide us an essentially permanent mapping between SST
unique IDs and base cache keys, with the ability to "upgrade" SST
unique IDs (and thus cache keys) with new SST format_versions.

These cache keys are of similar, perhaps indistinguishable, quality to
the previous generation's. Before this change (see "corrected" days
between collisions):

```
./cache_bench -stress_cache_key -sck_keep_bits=43
18 collisions after 2 x 90 days, est 10 days between (1.15292e+19 corrected)
```

After this change (keeping 43 bits, and up through 50 bits, to validate
that the "trajectory" of "corrected" days between collisions is OK):
```
19 collisions after 3 x 90 days, est 14.2105 days between (1.63836e+19 corrected)
16 collisions after 5 x 90 days, est 28.125 days between (1.6213e+19 corrected)
15 collisions after 7 x 90 days, est 42 days between (1.21057e+19 corrected)
15 collisions after 17 x 90 days, est 102 days between (1.46997e+19 corrected)
15 collisions after 49 x 90 days, est 294 days between (2.11849e+19 corrected)
15 collisions after 62 x 90 days, est 372 days between (1.34027e+19 corrected)
15 collisions after 53 x 90 days, est 318 days between (5.72858e+18 corrected)
15 collisions after 309 x 90 days, est 1854 days between (1.66994e+19 corrected)
```
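
For context on the "corrected" figures: per the updated comment in `cache/cache_key.cc`, keeping only `-sck_keep_bits=43` of the 128-bit base key and treating each file as 2**25 contiguous keys makes the simulation roughly 2**(128-43-25) = 2**60 ≈ 1.15292e18 times more collision-prone than reality, so the corrected value is the estimated days between collisions times that factor (e.g. 14.2105 × 1.15292e18 ≈ 1.64e19 for the first line above; the factor halves for each additional kept bit).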

However, the change does modify (and probably weakens) the "guaranteed unique" promise from this:

> SST files generated in a single process are guaranteed to have unique cache keys, unless/until number session ids * max file number = 2**86

to this (see https://github.com/facebook/rocksdb/issues/10388):

> With the DB id limitation, we only have nice guaranteed unique cache keys for files generated in a single process until biggest session_id_counter and offset_in_file reach combined 64 bits

I don't think this is a practical concern, though.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10394

Test Plan: unit tests updated, see simulation results above

Reviewed By: jay-zhuang

Differential Revision: D38667529

Pulled By: pdillinger

fbshipit-source-id: 49af3fe7f47e5b61162809a78b76c769fd519fba
Branch: main
Author: Peter Dillinger (committed by Facebook GitHub Bot)
Commit: 86a1e3e0e7 (parent 9fa5c146d7)
Files changed (20; number of changed lines in parentheses):

  HISTORY.md (3)
  cache/cache_bench_tool.cc (10)
  cache/cache_key.cc (348)
  cache/cache_key.h (67)
  cache/cache_reservation_manager_test.cc (8)
  cache/lru_cache_test.cc (144)
  db/blob/blob_file_builder.cc (8)
  db/blob/blob_source.cc (5)
  db/blob/blob_source.h (5)
  db/blob/blob_source_test.cc (6)
  db/db_block_cache_test.cc (400)
  table/block_based/block_based_table_builder.cc (8)
  table/block_based/block_based_table_reader.cc (6)
  table/block_based/block_based_table_reader.h (8)
  table/unique_id.cc (46)
  table/unique_id_impl.h (3)
  util/hash_test.cc (67)
  util/math.h (41)
  util/math128.h (6)
  utilities/cache_dump_load_impl.cc (9)

HISTORY.md

@@ -39,6 +39,9 @@
 * Improve read performance by avoiding dynamic memory allocation.
 * When using iterators with the integrated BlobDB implementation, blob cache handles are now released immediately when the iterator's position changes.
+## Behavior Change
+* Block cache keys have changed, which will cause any persistent caches to miss between versions.
 ## 7.5.0 (07/15/2022)
 ### New Features
 * Mempurge option flag `experimental_mempurge_threshold` is now a ColumnFamilyOptions and can now be dynamically configured using `SetOptions()`.

cache/cache_bench_tool.cc

@@ -806,7 +806,6 @@ class StressCacheKey {
     uint64_t max_file_count =
         uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
-    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
     uint32_t report_count = 0;
     uint32_t collisions_this_run = 0;
     size_t db_i = 0;
@@ -834,8 +833,7 @@
       }
       bool is_stable;
       BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
-                                         /* ignored */ 42, file_size, &ock,
-                                         &is_stable);
+                                         /* ignored */ 42, &ock, &is_stable);
       assert(is_stable);
       // Get a representative cache key, which later we analytically generalize
       // to a range.
@@ -845,13 +843,11 @@
         reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
       } else if (FLAGS_sck_footer_unique_id) {
         // Special case: keep only file number, not session counter
-        uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
-        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
-        reduced_key = (uint64_t{a} << 32) + b;
+        reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
       } else {
         // Try to keep file number and session counter (shift away other bits)
         uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
-        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
+        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
         reduced_key = (uint64_t{a} << 32) + b;
       }
       if (reduced_key == 0) {

cache/cache_key.cc (348 lines changed)

@@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE {
 // Value space plan for CacheKey:
 //
-// session_etc64_  | offset_etc64_ | Only generated by
+// file_num_etc64_ | offset_etc64_ | Only generated by
 // ---------------+---------------+------------------------------------------
 // 0              | 0             | Reserved for "empty" CacheKey()
 // 0              | > 0, < 1<<63  | CreateUniqueForCacheLifetime
@@ -44,7 +44,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
   return CacheKey(0, id);
 }
 
-// Value plan for CacheKeys from OffsetableCacheKey, assuming that
+// How we generate CacheKeys and base OffsetableCacheKey, assuming that
 // db_session_ids are generated from a base_session_id and
 // session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
 // in DBImpl::GenerateDbSessionId):
@@ -56,63 +56,108 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   base_session_id (unstructured, from GenerateRawUniqueId)
 //   session_id_counter (structured)
 //     * usually much smaller than 2**24
-//   file_number (structured)
+//   orig_file_number (structured)
 //     * usually smaller than 2**24
 //   offset_in_file (structured, might skip lots of values)
 //     * usually smaller than 2**32
-// max_offset determines placement of file_number to prevent
-// overlapping with offset
 //
-// Outputs come from bitwise-xor of the constituent pieces, low bits on left:
-//
-// |------------------------- session_etc64 -------------------------|
-// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ |
-// |-----------------------------------------------------------------|
-// | session_id_counter ...|                                         |
-// |-----------------------------------------------------------------|
-// |                                               | ... file_number |
-// |                                               | overflow & meta |
-// |-----------------------------------------------------------------|
-//
-//
-// |------------------------- offset_etc64 --------------------------|
-// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
-// |  * base_session_id (upper ~39 bits)                             |
-// |  * db_id (~122 bits entropy)                                    |
-// |-----------------------------------------------------------------|
-// | offset_in_file ............... |                                |
-// |-----------------------------------------------------------------|
-// |                                        | file_number, 0-3       |
-// |                                        | lower bytes            |
-// |-----------------------------------------------------------------|
-//
-// Based on max_offset, a maximal number of bytes 0..3 is chosen for
-// including from lower bits of file_number in offset_etc64. The choice
-// is encoded in two bits of metadata going into session_etc64, though
-// the common case of 3 bytes is encoded as 0 so that session_etc64
-// is unmodified by file_number concerns in the common case.
-//
-// There is nothing preventing "file number overflow & meta" from meeting
-// and overlapping with session_id_counter, but reaching such a case requires
-// an intractable combination of large file offsets (thus at least some large
-// files), large file numbers (thus large number of files generated), and
-// large number of session IDs generated in a single process. A trillion each
-// (2**40) of session ids, offsets, and file numbers comes to 120 bits.
-// With two bits of metadata and byte granularity, this is on the verge of
-// overlap, but even in the overlap case, it doesn't seem likely that
-// a file from billions of files or session ids ago will still be live
-// or cached.
-//
-// In fact, if our SST files are all < 4TB (see
-// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
-// in a single process are guaranteed to have unique cache keys, unless/until
-// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in
-// a single process and 64 trillion files generated. Even at that point, to
-// see a collision we would need a miraculous re-synchronization of session
-// id and file number, along with a live file or stale cache entry from
-// trillions of files ago.
-//
-// How https://github.com/pdillinger/unique_id applies here:
+// Overall approach (see https://github.com/pdillinger/unique_id for
+// background):
+//
+// First, we have three "structured" values, up to 64 bits each, that we
+// need to fit, without losses, into 128 bits. In practice, the values will
+// be small enough that they should fit. For example, applications generating
+// large SST files (large offsets) will naturally produce fewer files (small
+// file numbers). But we don't know ahead of time what bounds the values will
+// have.
+//
+// Second, we have unstructured inputs that enable distinct RocksDB processes
+// to pick a random point in space, likely very different from others. Xoring
+// the structured with the unstructured give us a cache key that is
+// structurally distinct between related keys (e.g. same file or same RocksDB
+// process) and distinct with high probability between unrelated keys.
+//
+// The problem of packing three structured values into the space for two is
+// complicated by the fact that we want to derive cache keys from SST unique
+// IDs, which have already combined structured and unstructured inputs in a
+// practically inseparable way. And we want a base cache key that works
+// with an offset of any size. So basically, we need to encode these three
+// structured values, each up to 64 bits, into 128 bits without knowing any
+// of their sizes. The DownwardInvolution() function gives us a mechanism to
+// accomplish this. (See its properties in math.h.) Specifically, for inputs
+// a, b, and c:
+//   lower64 = DownwardInvolution(a) ^ ReverseBits(b);
+//   upper64 = c ^ ReverseBits(a);
+// The 128-bit output is unique assuming there exist some i, j, and k
+// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
+// i + j + k <= 128. In other words, as long as there exist some bounds
+// that would allow us to pack the bits of a, b, and c into the output
+// if we know the bound, we can generate unique outputs without knowing
+// those bounds. To validate this claim, the inversion function (given
+// the bounds) has been implemented in CacheKeyDecoder in
+// db_block_cache_test.cc.
+//
+// With that in mind, the outputs in terms of the conceptual inputs look
+// like this, using bitwise-xor of the constituent pieces, low bits on left:
+//
+// |------------------------- file_num_etc64 -------------------------|
+// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
+// |-----------------------------------------------------------------|
+// | session_id_counter (involution) ..... |                         |
+// |-----------------------------------------------------------------|
+// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
+// |  * base_session_id (upper ~39 bits)                             |
+// |  * db_id (~122 bits entropy)                                    |
+// |-----------------------------------------------------------------|
+// |                             | ..... orig_file_number (reversed) |
+// |-----------------------------------------------------------------|
+//
+//
+// |------------------------- offset_etc64 --------------------------|
+// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
+// |-----------------------------------------------------------------|
+// |                           | ..... session_id_counter (reversed) |
+// |-----------------------------------------------------------------|
+// | offset_in_file ............... |                                |
+// |-----------------------------------------------------------------|
+//
+// Some oddities or inconveniences of this layout are due to deriving
+// the "base" cache key (without offset) from the SST unique ID (see
+// GetSstInternalUniqueId). Specifically,
+// * Lower 64 of base_session_id occurs in both output words (ok but
+//   weird)
+// * The inclusion of db_id is bad for the conditions under which we
+//   can guarantee uniqueness, but could be useful in some cases with
+//   few small files per process, to make up for db session id only having
+//   ~103 bits of entropy.
+//
+// In fact, if DB ids were not involved, we would be guaranteed unique
+// cache keys for files generated in a single process until total bits for
+// biggest session_id_counter, orig_file_number, and offset_in_file
+// reach 128 bits.
+//
+// With the DB id limitation, we only have nice guaranteed unique cache
+// keys for files generated in a single process until biggest
+// session_id_counter and offset_in_file reach combined 64 bits. This
+// is quite good in practice because we can have millions of DB Opens
+// with terabyte size SST files, or billions of DB Opens with gigabyte
+// size SST files.
+//
+// One of the considerations in the translation between existing SST unique
+// IDs and base cache keys is supporting better SST unique IDs in a future
+// format_version. If we use a process-wide file counter instead of
+// session counter and file numbers, we only need to combine two 64-bit values
+// instead of three. But we don't want to track unique ID versions in the
+// manifest, so we want to keep the same translation layer between SST unique
+// IDs and base cache keys, even with updated SST unique IDs. If the new
+// unique IDs put the file counter where the orig_file_number was, and
+// use no structured field where session_id_counter was, then our translation
+// layer works fine for two structured fields as well as three (for
+// compatibility). The small computation for the translation (one
+// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
+// is negligible for computing as part of SST file reader open.
+//
+// More on how https://github.com/pdillinger/unique_id applies here:
 // Every bit of output always includes "unstructured" uniqueness bits and
 // often combines with "structured" uniqueness bits. The "unstructured" bits
 // change infrequently: only when we cannot guarantee our state tracking for
@@ -141,12 +186,11 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   128 bits cache key size
 //   - 55 <- ideal size for byte offsets + file numbers
 //   - 2 <- bits for offsets and file numbers not exactly powers of two
-//   - 2 <- bits for file number encoding metadata
 //   + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
 //   ----
-//   71 <- bits remaining for distinguishing session IDs
-// The probability of a collision in 71 bits of session ID data is less than
-// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
+//   73 <- bits remaining for distinguishing session IDs
+// The probability of a collision in 73 bits of session ID data is less than
+// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
 // data from the last 180 days is in cache for potential collision, and that
 // cache keys under each session id exhaustively cover the remaining 57 bits
 // while in reality they'll only cover a small fraction of it.
@@ -160,7 +204,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // Now suppose we have many DBs per host, say 2**10, with same host-wide write
 // rate and process/session lifetime. File numbers will be ~10 bits smaller
 // and we will have 2**10 times as many session IDs because of simultaneous
-// lifetimes. So now collision chance is less than 1 in 2**(81 - (2 * 26)),
+// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
 // or roughly 1 in a billion.
 //
 // Suppose instead we generated random or hashed cache keys for each
@@ -176,17 +220,17 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // activity over many months, by making some pessimistic simplifying
 // assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
 // Here is some sample output with
-// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
+// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
 //
 // Total cache or DBs size: 32TiB  Writing 925.926 MiB/s or 76.2939TiB/day
-// Multiply by 9.22337e+18 to correct for simulation losses (but still
+// Multiply by 1.15292e+18 to correct for simulation losses (but still
 // assume whole file cached)
 //
 // These come from default settings of 2.5M files per day of 32 MB each, and
-// `-sck_keep_bits=40` means that to represent a single file, we are only
-// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25
-// contiguous keys (pessimistic), our simulation is about 2\*\*(128-40-25) or
-// about 9 billion billion times more prone to collision than reality.
+// `-sck_keep_bits=43` means that to represent a single file, we are only
+// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2\*\*(128-43-25) or
+// about 1 billion billion times more prone to collision than reality.
 //
 // More default assumptions, relatively pessimistic:
 // * 100 DBs in same process (doesn't matter much)
@@ -194,49 +238,55 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   average every 100 files generated
 // * Restart process (all new session IDs unrelated to old) 24 times per day
 //
-// After enough data, we get a result at the end (-sck_keep_bits=40):
+// After enough data, we get a result at the end (-sck_keep_bits=43):
 //
-// (keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between
-// (9.76592e+19 corrected)
+// (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between
+// (1.15292e+19 corrected)
 //
 // If we believe the (pessimistic) simulation and the mathematical
-// extrapolation, we would need to run a billion machines all for 97 billion
+// extrapolation, we would need to run a billion machines all for 11 billion
 // days to expect a cache key collision. To help verify that our extrapolation
-// ("corrected") is robust, we can make our simulation more precise with
-// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
+// ("corrected") is robust, we can make our simulation more precise by
+// increasing the "keep" bits, which takes more running time to get enough
 // collision data:
 //
-// (keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between
-// (1.03763e+20 corrected)
-// (keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between
-// (1.09224e+20 corrected)
+// (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between
+// (1.6213e+19 corrected)
+// (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between
+// (1.21057e+19 corrected)
+// (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between
+// (1.46997e+19 corrected)
+// (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between
+// (2.11849e+19 corrected)
 //
-// The extrapolated prediction is very close. If anything, we might have some
-// very small losses of structured data (see class StressCacheKey in
-// cache_bench_tool.cc) leading to more accurate & more attractive prediction
-// with more bits kept.
+// The extrapolated prediction seems to be within noise (sampling error).
 //
 // With the `-sck_randomize` option, we can see that typical workloads like
 // above have lower collision probability than "random" cache keys (note:
-// offsets still non-randomized) by a modest amount (roughly 20x less collision
-// prone than random), which should make us reasonably comfortable even in
-// "degenerate" cases (e.g. repeatedly launch a process to generate 1 file
-// with SstFileWriter):
+// offsets still non-randomized) by a modest amount (roughly 2-3x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launch a process to generate
+// one file with SstFileWriter):
 //
-// (rand 40 bits) 197 collisions after 1 x 90 days, est 0.456853 days between
-// (4.21372e+18 corrected)
+// (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between
+// (4.7165e+18 corrected)
 //
-// We can see that with more frequent process restarts (all new session IDs),
-// we get closer to the "random" cache key performance:
+// We can see that with more frequent process restarts,
+// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
+// closer to the "random" cache key performance:
 //
-// (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
-// (5.92931e+18 corrected)
+// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
+//
+// And with less frequent process restarts and re-opens,
+// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
+// probability:
+//
+// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
 //
 // Other tests have been run to validate other conditions behave as expected,
 // never behaving "worse than random" unless we start chopping off structured
 // data.
 //
+//
 // Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
 // that only arise when a new process is started, the chance of any cache key
 // collisions in a giant fleet of machines is negligible. Especially when
@@ -249,96 +299,66 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // quantify) block cache corruptions, including collisions, should be added.
 
 OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                        const std::string &db_session_id,
-                                       uint64_t file_number,
-                                       uint64_t max_offset) {
-#ifndef NDEBUG
-  max_offset_ = max_offset;
-#endif
-  // Closely related to GetSstInternalUniqueId, but only need 128 bits and
-  // need to include an offset within the file.
-  // See also https://github.com/pdillinger/unique_id for background.
-  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
-  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
-  {
-    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
-    if (!s.ok()) {
-      // A reasonable fallback in case malformed
-      Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
-               &session_lower);
-    }
-  }
-
-  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
-  // for more global uniqueness entropy.
-  // (It is possible that many DBs descended from one common DB id are copied
-  // around and proliferate, in which case session id is critical, but it is
-  // more common for different DBs to have different DB ids.)
-  uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper);
-
-  // This establishes the db+session id part of the cache key.
-  //
-  // Exactly preserve (in common cases; see modifiers below) session lower to
-  // ensure that session ids generated during the same process lifetime are
-  // guaranteed unique.
-  //
-  // We put this first for CommonPrefixSlice(), so that a small-ish set of
-  // cache key prefixes to cover entries relevant to any DB.
-  session_etc64_ = session_lower;
-  // This provides extra entopy in case of different DB id or process
-  // generating a session id, but is also partly/variably obscured by
-  // file_number and offset (see below).
-  offset_etc64_ = db_hash;
-
-  // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and
-  // a file_number, but we might need the file_number to overflow into
-  // session_etc64_. (There must only be one session_etc64_ value per
-  // file, and preferably shared among many files.)
-  //
-  // Figure out how many bytes of file_number we are going to be able to
-  // pack in with max_offset, though our encoding will only support packing
-  // in up to 3 bytes of file_number. (16M file numbers is enough for a new
-  // file number every second for half a year.)
-  int file_number_bytes_in_offset_etc =
-      (63 - FloorLog2(max_offset | 0x100000000U)) / 8;
-  int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8;
-
-  // Assert two bits of metadata
-  assert(file_number_bytes_in_offset_etc >= 0 &&
-         file_number_bytes_in_offset_etc <= 3);
-  // Assert we couldn't have used a larger allowed number of bytes (shift
-  // would chop off bytes).
-  assert(file_number_bytes_in_offset_etc == 3 ||
-         (max_offset << (file_number_bits_in_offset_etc + 8) >>
-          (file_number_bits_in_offset_etc + 8)) != max_offset);
-
-  uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1;
-  // Pack into high bits of etc so that offset can go in low bits of etc
-  // TODO: could be EndianSwapValue?
-  uint64_t offset_etc_modifier = ReverseBits(file_number & mask);
-  assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U);
-
-  // Overflow and 3 - byte count (likely both zero) go into session_id part
-  uint64_t session_etc_modifier =
-      (file_number >> file_number_bits_in_offset_etc << 2) |
-      static_cast<uint64_t>(3 - file_number_bytes_in_offset_etc);
-  // Packed into high bits to minimize interference with session id counter.
-  session_etc_modifier = ReverseBits(session_etc_modifier);
-
-  // Assert session_id part is only modified in extreme cases
-  assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU ||
-         max_offset > /*5 bytes*/ 0xffffffffffU);
-
-  // Xor in the modifiers
-  session_etc64_ ^= session_etc_modifier;
-  offset_etc64_ ^= offset_etc_modifier;
-
-  // Although DBImpl guarantees (in recent versions) that session_lower is not
-  // zero, that's not entirely sufficient to guarantee that session_etc64_ is
-  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
-  if (session_etc64_ == 0U) {
-    session_etc64_ = session_upper | 1U;
-  }
-  assert(session_etc64_ != 0);
+                                       uint64_t file_number) {
+  UniqueId64x2 internal_id;
+  Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                    &internal_id, /*force=*/true);
+  assert(s.ok());
+  *this = FromInternalUniqueId(&internal_id);
+}
+
+OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
+  uint64_t session_lower = id.ptr[0];
+  uint64_t file_num_etc = id.ptr[1];
+
+#ifndef NDEBUG
+  bool is_empty = session_lower == 0 && file_num_etc == 0;
+#endif
+
+  // Although DBImpl guarantees (in recent versions) that session_lower is not
+  // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
+  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
+  // However, if we are given an "empty" id as input, then we should produce
+  // "empty" as output.
+  // As a consequence, this function is only bijective assuming
+  // id[0] == 0 only if id[1] == 0.
+  if (session_lower == 0U) {
+    session_lower = file_num_etc;
+  }
+
+  // See comments above for how DownwardInvolution and ReverseBits
+  // make this function invertible under various assumptions.
+  OffsetableCacheKey rv;
+  rv.file_num_etc64_ =
+      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
+  rv.offset_etc64_ = ReverseBits(session_lower);
+
+  // Because of these transformations and needing to allow arbitrary
+  // offset (thus, second 64 bits of cache key might be 0), we need to
+  // make some correction to ensure the first 64 bits is not 0.
+  // Fortunately, the transformation ensures the second 64 bits is not 0
+  // for non-empty base key, so we can swap in the case one is 0 without
+  // breaking bijectivity (assuming condition above).
+  assert(is_empty || rv.offset_etc64_ > 0);
+  if (rv.file_num_etc64_ == 0) {
+    std::swap(rv.file_num_etc64_, rv.offset_etc64_);
+  }
+  assert(is_empty || rv.file_num_etc64_ > 0);
+  return rv;
+}
+
+// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
+// offset_etc64 == 0)
+UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
+  uint64_t a = file_num_etc64_;
+  uint64_t b = offset_etc64_;
+  if (b == 0) {
+    std::swap(a, b);
+  }
+  UniqueId64x2 rv;
+  rv[0] = ReverseBits(b);
+  rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
+  return rv;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
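
To make the new translation concrete, here is a hedged sketch (assuming `GetSstInternalUniqueId()` and `UniqueId64x2` from `table/unique_id_impl.h`, as used by the constructor above) showing that the base cache key can be derived from manifest-level data and round-tripped back to the SST unique ID:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

#include "cache/cache_key.h"
#include "rocksdb/status.h"
#include "table/unique_id_impl.h"

using namespace ROCKSDB_NAMESPACE;

void CacheKeyRoundTripSketch(const std::string& db_id,
                             const std::string& db_session_id,
                             uint64_t file_number) {
  // Same inputs the OffsetableCacheKey constructor feeds to
  // GetSstInternalUniqueId().
  UniqueId64x2 internal_id;
  Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
                                    &internal_id, /*force=*/true);
  assert(s.ok());
  (void)s;

  // Base cache key derived from the unique ID; no file size needed.
  OffsetableCacheKey base =
      OffsetableCacheKey::FromInternalUniqueId(&internal_id);

  // ToInternalUniqueId() is the inverse transformation (for a non-empty
  // id), so the original unique ID comes back out.
  UniqueId64x2 recovered = base.ToInternalUniqueId();
  assert(recovered == internal_id);
  (void)recovered;

  // Per-block cache keys are then just offsets against the base key.
  CacheKey block_key = base.WithOffset(/*offset=*/4096);
  (void)block_key;
}
```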

cache/cache_key.h (67 lines changed)

@@ -9,6 +9,7 @@
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/slice.h"
+#include "table/unique_id_impl.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -33,10 +34,10 @@ class CacheKey {
  public:
   // For convenience, constructs an "empty" cache key that is never returned
   // by other means.
-  inline CacheKey() : session_etc64_(), offset_etc64_() {}
+  inline CacheKey() : file_num_etc64_(), offset_etc64_() {}
 
   inline bool IsEmpty() const {
-    return (session_etc64_ == 0) & (offset_etc64_ == 0);
+    return (file_num_etc64_ == 0) & (offset_etc64_ == 0);
   }
 
   // Use this cache key as a Slice (byte order is endianness-dependent)
@@ -59,9 +60,9 @@ class CacheKey {
  protected:
   friend class OffsetableCacheKey;
-  CacheKey(uint64_t session_etc64, uint64_t offset_etc64)
-      : session_etc64_(session_etc64), offset_etc64_(offset_etc64) {}
-  uint64_t session_etc64_;
+  CacheKey(uint64_t file_num_etc64, uint64_t offset_etc64)
+      : file_num_etc64_(file_num_etc64), offset_etc64_(offset_etc64) {}
+  uint64_t file_num_etc64_;
   uint64_t offset_etc64_;
 };
@@ -85,50 +86,58 @@ class OffsetableCacheKey : private CacheKey {
   inline OffsetableCacheKey() : CacheKey() {}
 
   // Constructs an OffsetableCacheKey with the given information about a file.
-  // max_offset is based on file size (see WithOffset) and is required here to
-  // choose an appropriate (sub-)encoding. This constructor never generates an
-  // "empty" base key.
+  // This constructor never generates an "empty" base key.
   OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id,
-                     uint64_t file_number, uint64_t max_offset);
+                     uint64_t file_number);
+
+  // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys
+  // can be derived from DB manifest data before reading the file from
+  // storage--so that every part of the file can potentially go in a persistent
+  // cache.
+  //
+  // Calling GetSstInternalUniqueId() on a db_id, db_session_id, and
+  // file_number and passing the result to this function produces the same
+  // base cache key as feeding those inputs directly to the constructor.
+  //
+  // This is a bijective transformation assuming either id is empty or
+  // lower 64 bits is non-zero:
+  // * Empty (all zeros) input -> empty (all zeros) output
+  // * Lower 64 input is non-zero -> lower 64 output (file_num_etc64_) is
+  //   non-zero
+  static OffsetableCacheKey FromInternalUniqueId(UniqueIdPtr id);
+
+  // This is the inverse transformation to the above, assuming either empty
+  // or lower 64 bits (file_num_etc64_) is non-zero. Perhaps only useful for
+  // testing.
+  UniqueId64x2 ToInternalUniqueId();
 
   inline bool IsEmpty() const {
-    bool result = session_etc64_ == 0;
+    bool result = file_num_etc64_ == 0;
     assert(!(offset_etc64_ > 0 && result));
     return result;
   }
 
-  // Construct a CacheKey for an offset within a file, which must be
-  // <= max_offset provided in constructor. An offset is not necessarily a
-  // byte offset if a smaller unique identifier of keyable offsets is used.
+  // Construct a CacheKey for an offset within a file. An offset is not
+  // necessarily a byte offset if a smaller unique identifier of keyable
+  // offsets is used.
   //
   // This class was designed to make this hot code extremely fast.
   inline CacheKey WithOffset(uint64_t offset) const {
     assert(!IsEmpty());
-    assert(offset <= max_offset_);
-    return CacheKey(session_etc64_, offset_etc64_ ^ offset);
+    return CacheKey(file_num_etc64_, offset_etc64_ ^ offset);
   }
 
-  // The "common prefix" is a shared prefix for all the returned CacheKeys,
-  // that also happens to usually be the same among many files in the same DB,
-  // so is efficient and highly accurate (not perfectly) for DB-specific cache
-  // dump selection (but not file-specific).
+  // The "common prefix" is a shared prefix for all the returned CacheKeys.
+  // It is specific to the file but the same for all offsets within the file.
   static constexpr size_t kCommonPrefixSize = 8;
   inline Slice CommonPrefixSlice() const {
-    static_assert(sizeof(session_etc64_) == kCommonPrefixSize,
+    static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize,
                   "8 byte common prefix expected");
     assert(!IsEmpty());
-    assert(&this->session_etc64_ == static_cast<const void *>(this));
+    assert(&this->file_num_etc64_ == static_cast<const void *>(this));
 
     return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize);
   }
-
-  // For any max_offset <= this value, the same encoding scheme is guaranteed.
-  static constexpr uint64_t kMaxOffsetStandardEncoding = 0xffffffffffU;
-
- private:
-#ifndef NDEBUG
-  uint64_t max_offset_ = 0;
-#endif
 };
 
 }  // namespace ROCKSDB_NAMESPACE
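
A minimal usage sketch of the updated interface (placeholder values, not from this PR): the base key now needs only `db_id`, `db_session_id`, and `file_number`, so it can be built from manifest data alone, e.g. for cache warming before the file size is known.

```cpp
#include <cassert>

#include "cache/cache_key.h"

using namespace ROCKSDB_NAMESPACE;

void BuildBlockKeysSketch() {
  // "db_id"/"sess"/42 are hypothetical inputs for illustration only.
  OffsetableCacheKey base("db_id", "sess", /*file_number=*/42);

  // One CacheKey per block, keyed by a (not necessarily byte) offset.
  CacheKey index_block_key = base.WithOffset(0);
  CacheKey data_block_key = base.WithOffset(1);

  // All keys derived from this file share the same 8-byte prefix
  // (the file_num_etc64_ half), which CommonPrefixSlice() exposes.
  Slice prefix = base.CommonPrefixSlice();
  assert(index_block_key.AsSlice().starts_with(prefix));
  assert(data_block_key.AsSlice().starts_with(prefix));
}
```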

cache/cache_reservation_manager_test.cc

@@ -48,13 +48,13 @@ TEST_F(CacheReservationManagerTest, GenerateCacheKey) {
   // Next unique Cache key
   CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get());
   // Get to the underlying values
-  using PairU64 = std::array<uint64_t, 2>;
-  auto& ckey_pair = *reinterpret_cast<PairU64*>(&ckey);
+  uint64_t* ckey_data = reinterpret_cast<uint64_t*>(&ckey);
   // Back it up to the one used by CRM (using CacheKey implementation details)
-  ckey_pair[1]--;
+  ckey_data[1]--;
   // Specific key (subject to implementation details)
-  EXPECT_EQ(ckey_pair, PairU64({0, 2}));
+  EXPECT_EQ(ckey_data[0], 0);
+  EXPECT_EQ(ckey_data[1], 2);
 
   Cache::Handle* handle = cache->Lookup(ckey.AsSlice());
   EXPECT_NE(handle, nullptr)

cache/lru_cache_test.cc

@@ -558,19 +558,12 @@ class TestSecondaryCache : public SecondaryCache {
   void ResetInjectFailure() { inject_failure_ = false; }
 
-  void SetDbSessionId(const std::string& db_session_id) {
-    // NOTE: we assume the file is smaller than kMaxFileSizeStandardEncoding
-    // for this to work, but that's safe in a test.
-    auto base = OffsetableCacheKey("unknown", db_session_id, 1, 1);
-    ckey_prefix_ = base.CommonPrefixSlice().ToString();
-  }
-
   Status Insert(const Slice& key, void* value,
                 const Cache::CacheItemHelper* helper) override {
     if (inject_failure_) {
       return Status::Corruption("Insertion Data Corrupted");
     }
-    EXPECT_TRUE(IsDbSessionLowerAsKeyPrefix(key));
+    CheckCacheKeyCommonPrefix(key);
     size_t size;
     char* buf;
     Status s;
@@ -648,8 +641,13 @@ class TestSecondaryCache : public SecondaryCache {
   uint32_t num_lookups() { return num_lookups_; }
 
-  bool IsDbSessionLowerAsKeyPrefix(const Slice& key) {
-    return key.starts_with(ckey_prefix_);
+  void CheckCacheKeyCommonPrefix(const Slice& key) {
+    Slice current_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize);
+    if (ckey_prefix_.empty()) {
+      ckey_prefix_ = current_prefix.ToString();
+    } else {
+      EXPECT_EQ(ckey_prefix_, current_prefix.ToString());
+    }
   }
 
  private:
@@ -794,28 +792,30 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicTest) {
   opts.secondary_cache = secondary_cache;
   std::shared_ptr<Cache> cache = NewLRUCache(opts);
   std::shared_ptr<Statistics> stats = CreateDBStatistics();
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
 
   Random rnd(301);
   std::string str1 = rnd.RandomString(1020);
   TestItem* item1 = new TestItem(str1.data(), str1.length());
-  ASSERT_OK(cache->Insert("k1", item1, &LRUCacheSecondaryCacheTest::helper_,
-                          str1.length()));
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
   std::string str2 = rnd.RandomString(1020);
   TestItem* item2 = new TestItem(str2.data(), str2.length());
   // k1 should be demoted to NVM
-  ASSERT_OK(cache->Insert("k2", item2, &LRUCacheSecondaryCacheTest::helper_,
-                          str2.length()));
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
                          &LRUCacheSecondaryCacheTest::helper_, str2.length()));
 
   get_perf_context()->Reset();
   Cache::Handle* handle;
   handle =
-      cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_,
+      cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                     test_item_creator, Cache::Priority::LOW, true, stats.get());
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
   // This lookup should promote k1 and demote k2
   handle =
-      cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_,
+      cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                     test_item_creator, Cache::Priority::LOW, true, stats.get());
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
@@ -837,21 +837,23 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) {
       std::make_shared<TestSecondaryCache>(2048);
   opts.secondary_cache = secondary_cache;
   std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
 
   Random rnd(301);
   std::string str1 = rnd.RandomString(1020);
   auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
-  ASSERT_TRUE(cache->Insert("k1", item1.get(), nullptr, str1.length())
+  ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, str1.length())
                   .IsInvalidArgument());
-  ASSERT_OK(cache->Insert("k1", item1.get(),
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(),
                           &LRUCacheSecondaryCacheTest::helper_, str1.length()));
   item1.release();  // Appease clang-analyze "potential memory leak"
 
   Cache::Handle* handle;
-  handle = cache->Lookup("k2", nullptr, test_item_creator, Cache::Priority::LOW,
-                         true);
+  handle = cache->Lookup(k2.AsSlice(), nullptr, test_item_creator,
+                         Cache::Priority::LOW, true);
   ASSERT_EQ(handle, nullptr);
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_,
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                          test_item_creator, Cache::Priority::LOW, false);
   ASSERT_EQ(handle, nullptr);
@@ -866,30 +868,37 @@ TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) {
       std::make_shared<TestSecondaryCache>(2048);
   opts.secondary_cache = secondary_cache;
   std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
 
   Random rnd(301);
   std::string str1 = rnd.RandomString(1020);
   TestItem* item1 = new TestItem(str1.data(), str1.length());
-  ASSERT_OK(cache->Insert(
-      "k1", item1, &LRUCacheSecondaryCacheTest::helper_fail_, str1.length()));
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+                          &LRUCacheSecondaryCacheTest::helper_fail_,
+                          str1.length()));
   std::string str2 = rnd.RandomString(1020);
   TestItem* item2 = new TestItem(str2.data(), str2.length());
   // k1 should be demoted to NVM
-  ASSERT_OK(cache->Insert(
-      "k2", item2, &LRUCacheSecondaryCacheTest::helper_fail_, str2.length()));
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+                          &LRUCacheSecondaryCacheTest::helper_fail_,
+                          str2.length()));
 
   Cache::Handle* handle;
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_fail_,
-                         test_item_creator, Cache::Priority::LOW, true);
+  handle =
+      cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+                    test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
   // This lookup should fail, since k1 demotion would have failed
-  handle = cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_fail_,
-                         test_item_creator, Cache::Priority::LOW, true);
+  handle =
+      cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+                    test_item_creator, Cache::Priority::LOW, true);
   ASSERT_EQ(handle, nullptr);
   // Since k1 didn't get promoted, k2 should still be in cache
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_fail_,
-                         test_item_creator, Cache::Priority::LOW, true);
+  handle =
+      cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
                    test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
   ASSERT_EQ(secondary_cache->num_inserts(), 1u);
@@ -906,30 +915,32 @@ TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) {
       std::make_shared<TestSecondaryCache>(2048);
   opts.secondary_cache = secondary_cache;
   std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
 
   Random rnd(301);
   std::string str1 = rnd.RandomString(1020);
   TestItem* item1 = new TestItem(str1.data(), str1.length());
-  ASSERT_OK(cache->Insert("k1", item1, &LRUCacheSecondaryCacheTest::helper_,
-                          str1.length()));
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
   std::string str2 = rnd.RandomString(1020);
   TestItem* item2 = new TestItem(str2.data(), str2.length());
   // k1 should be demoted to NVM
-  ASSERT_OK(cache->Insert("k2", item2, &LRUCacheSecondaryCacheTest::helper_,
-                          str2.length()));
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
                          &LRUCacheSecondaryCacheTest::helper_, str2.length()));
 
   Cache::Handle* handle;
   SetFailCreate(true);
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_,
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                          test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
   // This lookup should fail, since k1 creation would have failed
-  handle = cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_,
+  handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                          test_item_creator, Cache::Priority::LOW, true);
   ASSERT_EQ(handle, nullptr);
   // Since k1 didn't get promoted, k2 should still be in cache
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_,
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                          test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
@@ -947,32 +958,34 @@ TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) {
      std::make_shared<TestSecondaryCache>(2048);
   opts.secondary_cache = secondary_cache;
   std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
 
   Random rnd(301);
   std::string str1 = rnd.RandomString(1020);
   TestItem* item1 = new TestItem(str1.data(), str1.length());
-  ASSERT_OK(cache->Insert("k1", item1, &LRUCacheSecondaryCacheTest::helper_,
-                          str1.length()));
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
   std::string str2 = rnd.RandomString(1020);
   TestItem* item2 = new TestItem(str2.data(), str2.length());
   // k1 should be demoted to NVM
-  ASSERT_OK(cache->Insert("k2", item2, &LRUCacheSecondaryCacheTest::helper_,
-                          str2.length()));
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
                          &LRUCacheSecondaryCacheTest::helper_, str2.length()));
   Cache::Handle* handle;
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_,
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                          test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle, nullptr);
   // k1 promotion should fail due to the block cache being at capacity,
   // but the lookup should still succeed
   Cache::Handle* handle2;
-  handle2 = cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_,
+  handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                           test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle2, nullptr);
   // Since k1 didn't get inserted, k2 should still be in cache
   cache->Release(handle);
   cache->Release(handle2);
-  handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_,
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
                          test_item_creator, Cache::Priority::LOW, true);
   ASSERT_NE(handle, nullptr);
   cache->Release(handle);
@@ -1009,9 +1022,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) {
   // all the blocks will be accessed.
   options.paranoid_file_checks = true;
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 6;
   for (int i = 0; i < N; i++) {
@@ -1103,9 +1113,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
   options.env = fault_env_.get();
   fault_fs_->SetFailGetUniqueId(true);
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 6;
   for (int i = 0; i < N; i++) {
@@ -1197,9 +1204,6 @@ TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) {
   fault_fs_->SetFailGetUniqueId(true);
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 6;
   for (int i = 0; i < N; i++) {
@@ -1249,9 +1253,6 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) {
   options.env = fault_env_.get();
   fault_fs_->SetFailGetUniqueId(true);
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 256;
   for (int i = 0; i < N; i++) {
@@ -1299,9 +1300,6 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) {
   options.env = fault_env_.get();
   fault_fs_->SetFailGetUniqueId(true);
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 6;
   for (int i = 0; i < N; i++) {
@@ -1382,6 +1380,7 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) {
   opts.secondary_cache = secondary_cache;
   std::shared_ptr<Cache> cache = NewLRUCache(opts);
   const int num_keys = 32;
+  OffsetableCacheKey ock{"foo", "bar", 1};
 
   Random rnd(301);
   std::vector<std::string> values;
@@ -1389,7 +1388,7 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) {
     std::string str = rnd.RandomString(1020);
     values.emplace_back(str);
     TestItem* item = new TestItem(str.data(), str.length());
-    ASSERT_OK(cache->Insert("k" + std::to_string(i), item,
+    ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item,
                             &LRUCacheSecondaryCacheTest::helper_,
                             str.length()));
   }
@@ -1399,13 +1398,16 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) {
   cache->SetCapacity(32 * 1024);
 
   secondary_cache->SetResultMap(
-      {{"k3", TestSecondaryCache::ResultType::DEFER},
-       {"k4", TestSecondaryCache::ResultType::DEFER_AND_FAIL},
-       {"k5", TestSecondaryCache::ResultType::FAIL}});
+      {{ock.WithOffset(3).AsSlice().ToString(),
+        TestSecondaryCache::ResultType::DEFER},
+       {ock.WithOffset(4).AsSlice().ToString(),
+        TestSecondaryCache::ResultType::DEFER_AND_FAIL},
+       {ock.WithOffset(5).AsSlice().ToString(),
+        TestSecondaryCache::ResultType::FAIL}});
   std::vector<Cache::Handle*> results;
   for (int i = 0; i < 6; ++i) {
     results.emplace_back(cache->Lookup(
-        "k" + std::to_string(i), &LRUCacheSecondaryCacheTest::helper_,
+        ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
        test_item_creator, Cache::Priority::LOW, false));
   }
   cache->WaitAll(results);
@@ -1891,9 +1893,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) {
   // all the blocks will be accessed.
   options.paranoid_file_checks = true;
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 6;
   for (int i = 0; i < N; i++) {
@@ -1986,9 +1985,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) {
   // all the blocks will be accessed.
   options.paranoid_file_checks = true;
   DestroyAndReopen(options);
-  std::string session_id;
-  ASSERT_OK(db_->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
   Random rnd(301);
   const int N = 6;
   for (int i = 0; i < N; i++) {
@@ -2087,12 +2083,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
   ASSERT_OK(DB::Open(options2, dbname2, &db2));
   fault_fs_->SetFailGetUniqueId(true);
 
-  // Set the file paranoid check, so after flush, the file will be read
-  // all the blocks will be accessed.
-  std::string session_id;
-  ASSERT_OK(db1->GetDbSessionId(session_id));
-  secondary_cache->SetDbSessionId(session_id);
-
   WriteOptions wo;
   Random rnd(301);
   const int N = 6;

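The updated tests above build cache keys straight from the new three-argument OffsetableCacheKey instead of string keys. A minimal sketch of that usage pattern follows; it is not part of the commit, it assumes compilation inside the RocksDB tree (cache/cache_key.h), and the id strings are placeholders.

```
// Sketch (not part of the diff): deriving per-offset cache keys from the new
// three-argument OffsetableCacheKey, as the updated tests above do. Assumes
// the RocksDB-internal header cache/cache_key.h; the id strings below are
// placeholders.
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_set>

#include "cache/cache_key.h"

using ROCKSDB_NAMESPACE::CacheKey;
using ROCKSDB_NAMESPACE::OffsetableCacheKey;

void OffsetableCacheKeySketch() {
  // db id, db session id, and file number are available from manifest data,
  // so no file size is needed up front anymore.
  OffsetableCacheKey base("placeholder_db_id", "placeholder_session_id",
                          /*file_number=*/1);

  std::unordered_set<std::string> seen;
  for (uint64_t offset = 0; offset < 6; ++offset) {
    CacheKey key = base.WithOffset(offset);
    assert(!key.IsEmpty());
    // Each offset maps to a distinct fixed-size key usable with
    // Cache::Insert / Cache::Lookup, as in BasicWaitAllTest above.
    assert(seen.insert(key.AsSlice().ToString()).second);
  }
}
```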
@ -399,12 +399,8 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
creation_reason_ == BlobFileCreationReason::kFlush; creation_reason_ == BlobFileCreationReason::kFlush;
if (blob_cache && warm_cache) { if (blob_cache && warm_cache) {
// The blob file during flush is unknown to be exactly how big it is. const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
// Therefore, we set the file size to kMaxOffsetStandardEncoding. For any blob_file_number);
// max_offset <= this value, the same encoding scheme is guaranteed.
const OffsetableCacheKey base_cache_key(
db_id_, db_session_id_, blob_file_number,
OffsetableCacheKey::kMaxOffsetStandardEncoding);
const CacheKey cache_key = base_cache_key.WithOffset(blob_offset); const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
const Slice key = cache_key.AsSlice(); const Slice key = cache_key.AsSlice();

@ -286,7 +286,7 @@ void BlobSource::MultiGetBlob(const ReadOptions& read_options,
void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t /*file_size*/,
autovector<BlobReadRequest>& blob_reqs, autovector<BlobReadRequest>& blob_reqs,
uint64_t* bytes_read) { uint64_t* bytes_read) {
const size_t num_blobs = blob_reqs.size(); const size_t num_blobs = blob_reqs.size();
@ -303,8 +303,7 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
Mask cache_hit_mask = 0; Mask cache_hit_mask = 0;
uint64_t total_bytes = 0; uint64_t total_bytes = 0;
const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number, const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
file_size);
if (blob_cache_) { if (blob_cache_) {
size_t cached_blob_count = 0; size_t cached_blob_count = 0;

@ -118,10 +118,9 @@ class BlobSource {
size_t charge, Cache::Handle** cache_handle, size_t charge, Cache::Handle** cache_handle,
Cache::Priority priority) const; Cache::Priority priority) const;
inline CacheKey GetCacheKey(uint64_t file_number, uint64_t file_size, inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
uint64_t offset) const { uint64_t offset) const {
OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number, OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
file_size);
return base_cache_key.WithOffset(offset); return base_cache_key.WithOffset(offset);
} }

@ -1168,8 +1168,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
ASSERT_TRUE( ASSERT_TRUE(
blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1])); blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1]));
OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number, OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
file_size);
// blob_cache here only looks at the primary cache since we didn't provide // blob_cache here only looks at the primary cache since we didn't provide
// the cache item helper for the secondary cache. However, since key0 is // the cache item helper for the secondary cache. However, since key0 is
@ -1412,8 +1411,7 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
} }
{ {
OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber, OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber);
blob_file_size_);
size_t blob_bytes = options_.blob_cache->GetUsage(); size_t blob_bytes = options_.blob_cache->GetUsage();
for (size_t i = 0; i < kNumBlobs; ++i) { for (size_t i = 0; i < kNumBlobs; ++i) {

@ -1736,51 +1736,60 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
class CacheKeyTest : public testing::Test { class CacheKeyTest : public testing::Test {
public: public:
void SetupStableBase() { CacheKey GetBaseCacheKey() {
CacheKey rv = GetOffsetableCacheKey(0, /*min file_number*/ 1).WithOffset(0);
// Correct for file_number_ == 1
*reinterpret_cast<uint64_t*>(&rv) ^= ReverseBits(uint64_t{1});
return rv;
}
CacheKey GetCacheKey(uint64_t session_counter, uint64_t file_number,
uint64_t offset) {
OffsetableCacheKey offsetable =
GetOffsetableCacheKey(session_counter, file_number);
// * 4 to counteract optimization that strips lower 2 bits in encoding
// the offset in BlockBasedTable::GetCacheKey (which we prefer to include
// in unit tests to maximize functional coverage).
EXPECT_GE(offset * 4, offset); // no overflow
return BlockBasedTable::GetCacheKey(offsetable,
BlockHandle(offset * 4, /*size*/ 5));
}
protected:
OffsetableCacheKey GetOffsetableCacheKey(uint64_t session_counter,
uint64_t file_number) {
// Like SemiStructuredUniqueIdGen::GenerateNext // Like SemiStructuredUniqueIdGen::GenerateNext
tp_.db_session_id = EncodeSessionId(base_session_upper_, tp_.db_session_id = EncodeSessionId(base_session_upper_,
base_session_lower_ ^ session_counter_); base_session_lower_ ^ session_counter);
tp_.db_id = std::to_string(db_id_); tp_.db_id = std::to_string(db_id_);
tp_.orig_file_number = file_number_; tp_.orig_file_number = file_number;
bool is_stable; bool is_stable;
std::string cur_session_id = ""; // ignored std::string cur_session_id = ""; // ignored
uint64_t cur_file_number = 42; // ignored uint64_t cur_file_number = 42; // ignored
OffsetableCacheKey rv;
BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number, BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
file_size_, &base_cache_key_, &rv, &is_stable);
&is_stable); EXPECT_TRUE(is_stable);
ASSERT_TRUE(is_stable); EXPECT_TRUE(!rv.IsEmpty());
} // BEGIN some assertions in relation to SST unique IDs
CacheKey WithOffset(uint64_t offset) { std::string external_unique_id_str;
return BlockBasedTable::GetCacheKey(base_cache_key_, EXPECT_OK(GetUniqueIdFromTableProperties(tp_, &external_unique_id_str));
BlockHandle(offset, /*size*/ 5)); UniqueId64x2 sst_unique_id = {};
EXPECT_OK(DecodeUniqueIdBytes(external_unique_id_str, &sst_unique_id));
ExternalUniqueIdToInternal(&sst_unique_id);
OffsetableCacheKey ock =
OffsetableCacheKey::FromInternalUniqueId(&sst_unique_id);
EXPECT_EQ(rv.WithOffset(0).AsSlice(), ock.WithOffset(0).AsSlice());
EXPECT_EQ(ock.ToInternalUniqueId(), sst_unique_id);
// END some assertions in relation to SST unique IDs
return rv;
} }
protected:
OffsetableCacheKey base_cache_key_;
TableProperties tp_; TableProperties tp_;
uint64_t file_size_ = 0;
uint64_t base_session_upper_ = 0; uint64_t base_session_upper_ = 0;
uint64_t base_session_lower_ = 0; uint64_t base_session_lower_ = 0;
uint64_t session_counter_ = 0;
uint64_t file_number_ = 0;
uint64_t db_id_ = 0; uint64_t db_id_ = 0;
}; };
namespace {
template <typename T>
int CountBitsDifferent(const T& t1, const T& t2) {
int diff = 0;
const uint8_t* p1 = reinterpret_cast<const uint8_t*>(&t1);
const uint8_t* p2 = reinterpret_cast<const uint8_t*>(&t2);
static_assert(sizeof(*p1) == 1, "Expecting uint8_t byte");
for (size_t i = 0; i < sizeof(T); ++i) {
diff += BitsSetToOne(p1[i] ^ p2[i]);
}
return diff;
}
} // namespace
TEST_F(CacheKeyTest, DBImplSessionIdStructure) { TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
// We have to generate our own session IDs for simulation purposes in other // We have to generate our own session IDs for simulation purposes in other
// tests. Here we verify that the DBImpl implementation seems to match // tests. Here we verify that the DBImpl implementation seems to match
@ -1799,171 +1808,202 @@ TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2)); ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
} }
TEST_F(CacheKeyTest, StandardEncodingLimit) { namespace {
base_session_upper_ = 1234; // Deconstruct cache key, based on knowledge of implementation details.
base_session_lower_ = 5678; void DeconstructNonemptyCacheKey(const CacheKey& key, uint64_t* file_num_etc64,
session_counter_ = 42; uint64_t* offset_etc64) {
file_number_ = 42; *file_num_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data());
db_id_ = 1234; *offset_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data() + 8);
assert(*file_num_etc64 != 0);
file_size_ = 42; if (*offset_etc64 == 0) {
SetupStableBase(); std::swap(*file_num_etc64, *offset_etc64);
CacheKey ck1; }
ASSERT_TRUE(ck1.IsEmpty()); assert(*offset_etc64 != 0);
ck1 = WithOffset(0); }
ASSERT_FALSE(ck1.IsEmpty());
// Make a bit mask of 0 to 64 bits
// Should use same encoding uint64_t MakeMask64(int bits) {
file_size_ = BlockBasedTable::kMaxFileSizeStandardEncoding; if (bits >= 64) {
SetupStableBase(); return uint64_t{0} - 1;
CacheKey ck2 = WithOffset(0); } else {
ASSERT_EQ(CountBitsDifferent(ck1, ck2), 0); return (uint64_t{1} << bits) - 1;
}
// Should use different encoding
++file_size_;
SetupStableBase();
CacheKey ck3 = WithOffset(0);
ASSERT_GT(CountBitsDifferent(ck2, ck3), 0);
} }
TEST_F(CacheKeyTest, Encodings) { // See CacheKeyTest::Encodings
// Claim from cache_key.cc: struct CacheKeyDecoder {
// In fact, if our SST files are all < 4TB (see // Inputs
// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated uint64_t base_file_num_etc64, base_offset_etc64;
// in a single process are guaranteed to have unique cache keys, unless/until int session_counter_bits, file_number_bits, offset_bits;
// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in
// a single process and 64 trillion files generated.
// We can generalize that. For
// * z bits in maximum file size
// * n bits in maximum file number
// * s bits in maximum session counter
// uniqueness is guaranteed at least when all of these hold:
// * z + n + s <= 121 (128 - 2 meta + 2 offset trim - (8-1) byte granularity
// in encoding)
// * n + s <= 86 (encoding limitation)
// * s <= 62 (because of 2-bit metadata)
// We can verify this indirectly by how input bits get into the cache key,
// but we have to be mindful that for sufficiently large file sizes,
// different encodings might be used. But for cases mixing large and small
// files, we have to verify uniqueness between encodings.
// Going through all combinations would be a little expensive, so we test
// only one random "stripe" of the configuration space per run.
constexpr uint32_t kStripeBits = 8;
constexpr uint32_t kStripeMask = (uint32_t{1} << kStripeBits) - 1;
// Also cycle through stripes on repeated runs (not thread safe)
static uint32_t stripe =
static_cast<uint32_t>(std::random_device{}()) & kStripeMask;
stripe = (stripe + 1) & kStripeMask;
fprintf(stderr, "%u\n", stripe);
// We are going to randomly initialize some values which *should* not affect
// result
Random64 r{std::random_device{}()};
int max_num_encodings = 0; // Derived
uint32_t config_num = 0; uint64_t session_counter_mask, file_number_mask, offset_mask;
uint32_t session_counter_bits, file_number_bits, max_file_size_bits;
// Outputs
// Inner loop body, used later in a loop over configurations uint64_t decoded_session_counter, decoded_file_num, decoded_offset;
auto TestConfig = [&]() {
base_session_upper_ = r.Next();
base_session_lower_ = r.Next();
session_counter_ = r.Next();
if (session_counter_bits < 64) {
// Avoid shifting UB
session_counter_ = session_counter_ >> 1 >> (63 - session_counter_bits);
}
file_number_ = r.Next() >> (64 - file_number_bits);
// Need two bits set to avoid temporary zero below
if (BitsSetToOne(file_number_) < 2) {
file_number_ = 3;
}
db_id_ = r.Next();
// Work-around clang-analyzer which thinks empty last_base is garbage
CacheKey last_base = CacheKey::CreateUniqueForProcessLifetime();
std::unordered_set<std::string> seen;
int num_encodings = 0;
// Loop over encodings by increasing file size bits
for (uint32_t file_size_bits = 1; file_size_bits <= max_file_size_bits;
++file_size_bits) {
file_size_ = uint64_t{1} << (file_size_bits - 1);
SetupStableBase();
CacheKey new_base = WithOffset(0);
if (CountBitsDifferent(last_base, new_base) == 0) {
// Same as previous encoding
continue;
}
// New encoding void SetBaseCacheKey(const CacheKey& base) {
++num_encodings; DeconstructNonemptyCacheKey(base, &base_file_num_etc64, &base_offset_etc64);
ASSERT_TRUE(seen.insert(new_base.AsSlice().ToString()).second); }
last_base = new_base;
for (uint32_t i = 0; i < file_size_bits; ++i) { void SetRanges(int _session_counter_bits, int _file_number_bits,
CacheKey ck = WithOffset(uint64_t{1} << i); int _offset_bits) {
if (i < 2) { session_counter_bits = _session_counter_bits;
// These cases are not relevant and optimized by dropping two session_counter_mask = MakeMask64(session_counter_bits);
// lowest bits because there's always at least 5 bytes between file_number_bits = _file_number_bits;
// blocks. file_number_mask = MakeMask64(file_number_bits);
ASSERT_EQ(CountBitsDifferent(ck, new_base), 0); offset_bits = _offset_bits;
} else { offset_mask = MakeMask64(offset_bits);
// Normal case }
// 1 bit different from base and never been seen implies the bit
// is encoded into cache key without overlapping other structured void Decode(const CacheKey& key) {
// data. uint64_t file_num_etc64, offset_etc64;
ASSERT_EQ(CountBitsDifferent(ck, new_base), 1); DeconstructNonemptyCacheKey(key, &file_num_etc64, &offset_etc64);
ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
// First decode session counter
if (offset_bits + session_counter_bits <= 64) {
// fully recoverable from offset_etc64
decoded_session_counter =
ReverseBits((offset_etc64 ^ base_offset_etc64)) &
session_counter_mask;
} else if (file_number_bits + session_counter_bits <= 64) {
// fully recoverable from file_num_etc64
decoded_session_counter = DownwardInvolution(
(file_num_etc64 ^ base_file_num_etc64) & session_counter_mask);
} else {
// Need to combine parts from each word.
// Piece1 will contain some correct prefix of the bottom bits of
// session counter.
uint64_t piece1 =
ReverseBits((offset_etc64 ^ base_offset_etc64) & ~offset_mask);
int piece1_bits = 64 - offset_bits;
      // Piece2 will contain involuted bits that we can combine with piece1
// to infer rest of session counter
int piece2_bits = std::min(64 - file_number_bits, 64 - piece1_bits);
ASSERT_LT(piece2_bits, 64);
uint64_t piece2_mask = MakeMask64(piece2_bits);
uint64_t piece2 = (file_num_etc64 ^ base_file_num_etc64) & piece2_mask;
// Cancel out the part of piece2 that we can infer from piece1
// (DownwardInvolution distributes over xor)
piece2 ^= DownwardInvolution(piece1) & piece2_mask;
// Now we need to solve for the unknown original bits in higher
// positions than piece1 provides. We use Gaussian elimination
// because we know that a piece2_bits X piece2_bits submatrix of
// the matrix underlying DownwardInvolution times the vector of
// unknown original bits equals piece2.
//
// Build an augmented row matrix for that submatrix, built column by
// column.
std::array<uint64_t, 64> aug_rows{};
for (int i = 0; i < piece2_bits; ++i) { // over columns
uint64_t col_i = DownwardInvolution(uint64_t{1} << piece1_bits << i);
ASSERT_NE(col_i & 1U, 0);
for (int j = 0; j < piece2_bits; ++j) { // over rows
aug_rows[j] |= (col_i & 1U) << i;
col_i >>= 1;
} }
} }
for (uint32_t i = 0; i < session_counter_bits; ++i) { // Augment with right hand side
SaveAndRestore<uint64_t> tmp(&session_counter_, for (int j = 0; j < piece2_bits; ++j) { // over rows
session_counter_ ^ (uint64_t{1} << i)); aug_rows[j] |= (piece2 & 1U) << piece2_bits;
SetupStableBase(); piece2 >>= 1;
CacheKey ck = WithOffset(0);
ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
} }
for (uint32_t i = 0; i < file_number_bits; ++i) { // Run Gaussian elimination
SaveAndRestore<uint64_t> tmp(&file_number_, for (int i = 0; i < piece2_bits; ++i) { // over columns
file_number_ ^ (uint64_t{1} << i)); // Find a row that can be used to cancel others
SetupStableBase(); uint64_t canceller = 0;
CacheKey ck = WithOffset(0); // Note: Rows 0 through i-1 contain 1s in columns already eliminated
ASSERT_EQ(CountBitsDifferent(ck, new_base), 1); for (int j = i; j < piece2_bits; ++j) { // over rows
ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second); if (aug_rows[j] & (uint64_t{1} << i)) {
// Swap into appropriate row
std::swap(aug_rows[i], aug_rows[j]);
// Keep a handy copy for row reductions
canceller = aug_rows[i];
break;
}
}
ASSERT_NE(canceller, 0);
for (int j = 0; j < piece2_bits; ++j) { // over rows
if (i != j && ((aug_rows[j] >> i) & 1) != 0) {
// Row reduction
aug_rows[j] ^= canceller;
}
}
}
// Extract result
decoded_session_counter = piece1;
for (int j = 0; j < piece2_bits; ++j) { // over rows
ASSERT_EQ(aug_rows[j] & piece2_mask, uint64_t{1} << j);
decoded_session_counter |= aug_rows[j] >> piece2_bits << piece1_bits
<< j;
} }
max_num_encodings = std::max(max_num_encodings, num_encodings);
} }
};
// Loop over configurations and test those in stripe decoded_offset =
for (session_counter_bits = 0; session_counter_bits <= 62; offset_etc64 ^ base_offset_etc64 ^ ReverseBits(decoded_session_counter);
decoded_file_num = ReverseBits(file_num_etc64 ^ base_file_num_etc64 ^
DownwardInvolution(decoded_session_counter));
}
};
} // namespace
TEST_F(CacheKeyTest, Encodings) {
// This test primarily verifies this claim from cache_key.cc:
// // In fact, if DB ids were not involved, we would be guaranteed unique
// // cache keys for files generated in a single process until total bits for
// // biggest session_id_counter, orig_file_number, and offset_in_file
// // reach 128 bits.
//
// To demonstrate this, CacheKeyDecoder can reconstruct the structured inputs
// to the cache key when provided an output cache key, the unstructured
// inputs, and bounds on the structured inputs.
//
// See OffsetableCacheKey comments in cache_key.cc.
// We are going to randomly initialize some values that *should* not affect
// result
Random64 r{std::random_device{}()};
CacheKeyDecoder decoder;
db_id_ = r.Next();
base_session_upper_ = r.Next();
base_session_lower_ = r.Next();
if (base_session_lower_ == 0) {
base_session_lower_ = 1;
}
decoder.SetBaseCacheKey(GetBaseCacheKey());
// Loop over configurations and test those
for (int session_counter_bits = 0; session_counter_bits <= 64;
++session_counter_bits) { ++session_counter_bits) {
uint32_t max_file_number_bits = for (int file_number_bits = 1; file_number_bits <= 64; ++file_number_bits) {
std::min(uint32_t{64}, uint32_t{86} - session_counter_bits); // 62 bits max because unoptimized offset will be 64 bits in that case
// Start with 2 to avoid file_number_ == 0 in testing for (int offset_bits = 0; offset_bits <= 62; ++offset_bits) {
for (file_number_bits = 2; file_number_bits <= max_file_number_bits; if (session_counter_bits + file_number_bits + offset_bits > 128) {
++file_number_bits) { break;
uint32_t max_max_file_size_bits = }
std::min(uint32_t{64},
uint32_t{121} - file_number_bits - session_counter_bits); decoder.SetRanges(session_counter_bits, file_number_bits, offset_bits);
for (max_file_size_bits = 1; max_file_size_bits <= max_max_file_size_bits;
++max_file_size_bits) { uint64_t session_counter = r.Next() & decoder.session_counter_mask;
if ((config_num++ & kStripeMask) == stripe) { uint64_t file_number = r.Next() & decoder.file_number_mask;
TestConfig(); if (file_number == 0) {
// Minimum
file_number = 1;
} }
uint64_t offset = r.Next() & decoder.offset_mask;
decoder.Decode(GetCacheKey(session_counter, file_number, offset));
EXPECT_EQ(decoder.decoded_session_counter, session_counter);
EXPECT_EQ(decoder.decoded_file_num, file_number);
EXPECT_EQ(decoder.decoded_offset, offset);
} }
} }
} }
// Make sure the current implementation is exercised
ASSERT_EQ(max_num_encodings, 4);
} }
INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest, INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,

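For readers skimming the Encodings test above, the simplest decoding case can be restated compactly. This is only a condensed restatement of CacheKeyDecoder's implementation-detail view (bytes 0-7 of a key are file_num_etc64, bytes 8-15 are offset_etc64; ReverseBits comes from util/math.h), not additional API.

```
// Condensed restatement of CacheKeyDecoder's simplest case
// (offset_bits + session_counter_bits <= 64): xor-ing out the base key
// isolates the structured inputs, and ReverseBits undoes how the session
// counter was mixed into the offset word, so the inputs are recoverable
// (hence the uniqueness claim).
void DecodeSimpleCase(uint64_t offset_etc64, uint64_t base_offset_etc64,
                      uint64_t session_counter_mask,
                      uint64_t* decoded_session_counter,
                      uint64_t* decoded_offset) {
  *decoded_session_counter =
      ReverseBits(offset_etc64 ^ base_offset_etc64) & session_counter_mask;
  *decoded_offset = offset_etc64 ^ base_offset_etc64 ^
                    ReverseBits(*decoded_session_counter);
}
```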
@ -895,12 +895,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
"BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
const_cast<TableProperties*>(&rep_->props)); const_cast<TableProperties*>(&rep_->props));
// Extremely large files use atypical cache key encoding, and we don't BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id,
// know ahead of time how big the file will be. But assuming it's less tbo.cur_file_num, &rep_->base_cache_key);
// than 4TB, we will correctly predict the cache keys.
BlockBasedTable::SetupBaseCacheKey(
&rep_->props, tbo.db_session_id, tbo.cur_file_num,
BlockBasedTable::kMaxFileSizeStandardEncoding, &rep_->base_cache_key);
if (rep_->IsParallelCompressionEnabled()) { if (rep_->IsParallelCompressionEnabled()) {
StartParallelCompression(); StartParallelCompression();

@ -521,7 +521,6 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties,
void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
const std::string& cur_db_session_id, const std::string& cur_db_session_id,
uint64_t cur_file_number, uint64_t cur_file_number,
uint64_t file_size,
OffsetableCacheKey* out_base_cache_key, OffsetableCacheKey* out_base_cache_key,
bool* out_is_stable) { bool* out_is_stable) {
// Use a stable cache key if sufficient data is in table properties // Use a stable cache key if sufficient data is in table properties
@ -565,8 +564,7 @@ void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
// Minimum block size is 5 bytes; therefore we can trim off two lower bits // Minimum block size is 5 bytes; therefore we can trim off two lower bits
// from offsets. See GetCacheKey. // from offsets. See GetCacheKey.
*out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num, *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num);
/*max_offset*/ file_size >> 2);
} }
CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key, CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key,
@ -717,7 +715,7 @@ Status BlockBasedTable::Open(
// With properties loaded, we can set up portable/stable cache keys // With properties loaded, we can set up portable/stable cache keys
SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id, SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id,
cur_file_num, file_size, &rep->base_cache_key); cur_file_num, &rep->base_cache_key);
rep->persistent_cache_options = rep->persistent_cache_options =
PersistentCacheOptions(rep->table_options.persistent_cache, PersistentCacheOptions(rep->table_options.persistent_cache,

@ -231,15 +231,9 @@ class BlockBasedTable : public TableReader {
class IndexReaderCommon; class IndexReaderCommon;
// Maximum SST file size that uses standard CacheKey encoding scheme.
// See GetCacheKey to explain << 2. + 3 is permitted because it is trimmed
// off by >> 2 in GetCacheKey.
static constexpr uint64_t kMaxFileSizeStandardEncoding =
(OffsetableCacheKey::kMaxOffsetStandardEncoding << 2) + 3;
static void SetupBaseCacheKey(const TableProperties* properties, static void SetupBaseCacheKey(const TableProperties* properties,
const std::string& cur_db_session_id, const std::string& cur_db_session_id,
uint64_t cur_file_number, uint64_t file_size, uint64_t cur_file_number,
OffsetableCacheKey* out_base_cache_key, OffsetableCacheKey* out_base_cache_key,
bool* out_is_stable = nullptr); bool* out_is_stable = nullptr);

@ -58,22 +58,34 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
Status GetSstInternalUniqueId(const std::string &db_id, Status GetSstInternalUniqueId(const std::string &db_id,
const std::string &db_session_id, const std::string &db_session_id,
uint64_t file_number, UniqueIdPtr out) { uint64_t file_number, UniqueIdPtr out,
if (db_id.empty()) { bool force) {
return Status::NotSupported("Missing db_id"); if (!force) {
} if (db_id.empty()) {
if (file_number == 0) { return Status::NotSupported("Missing db_id");
return Status::NotSupported("Missing or bad file number"); }
} if (file_number == 0) {
if (db_session_id.empty()) { return Status::NotSupported("Missing or bad file number");
return Status::NotSupported("Missing db_session_id"); }
if (db_session_id.empty()) {
return Status::NotSupported("Missing db_session_id");
}
} }
uint64_t session_upper = 0; // Assignment to appease clang-analyze uint64_t session_upper = 0; // Assignment to appease clang-analyze
uint64_t session_lower = 0; // Assignment to appease clang-analyze uint64_t session_lower = 0; // Assignment to appease clang-analyze
{ {
Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
if (!s.ok()) { if (!s.ok()) {
return s; if (!force) {
return s;
} else {
// A reasonable fallback in case malformed
Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
&session_lower);
if (session_lower == 0) {
session_lower = session_upper | 1;
}
}
} }
} }
@ -107,20 +119,6 @@ Status GetSstInternalUniqueId(const std::string &db_id,
return Status::OK(); return Status::OK();
} }
Status GetSstInternalUniqueId(const std::string &db_id,
const std::string &db_session_id,
uint64_t file_number, UniqueId64x2 *out) {
UniqueId64x3 tmp{};
Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number, &tmp);
if (s.ok()) {
(*out)[0] = tmp[0];
(*out)[1] = tmp[1];
} else {
*out = {0, 0};
}
return s;
}
namespace { namespace {
// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all // For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all
// zeros in first 128 bits to map to itself, so that excluding zero in // zeros in first 128 bits to map to itself, so that excluding zero in

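The force parameter added above relaxes the input checks so a caller can always obtain an internal unique id, hashing a malformed session id instead of failing. A hedged sketch of exercising that path (assuming the RocksDB-internal table/unique_id_impl.h; the id strings are placeholders):

```
// Sketch (not part of the diff): with force=true, a session id that fails
// DecodeSessionId no longer causes an error; it is hashed into usable
// session words instead.
#include <cassert>
#include <string>

#include "rocksdb/status.h"
#include "table/unique_id_impl.h"

using ROCKSDB_NAMESPACE::GetSstInternalUniqueId;
using ROCKSDB_NAMESPACE::Status;
using ROCKSDB_NAMESPACE::UniqueId64x3;

void ForcedUniqueIdSketch() {
  UniqueId64x3 id{};
  // "not-a-valid-session-id" does not decode as a session id, so without
  // force this call would return a non-OK status; with force it falls back
  // to hashing the raw string and is expected to succeed.
  Status s = GetSstInternalUniqueId("placeholder_db_id",
                                    "not-a-valid-session-id",
                                    /*file_number=*/1, &id, /*force=*/true);
  assert(s.ok());
}
```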
@ -47,7 +47,8 @@ struct UniqueIdPtr {
// is long term stable. // is long term stable.
Status GetSstInternalUniqueId(const std::string &db_id, Status GetSstInternalUniqueId(const std::string &db_id,
const std::string &db_session_id, const std::string &db_session_id,
uint64_t file_number, UniqueIdPtr out); uint64_t file_number, UniqueIdPtr out,
bool force = false);
// Helper for GetUniqueIdFromTableProperties. External unique ids go through // Helper for GetUniqueIdFromTableProperties. External unique ids go through
// this extra hashing layer so that prefixes of the unique id have predictable // this extra hashing layer so that prefixes of the unique id have predictable

@ -569,6 +569,7 @@ using ROCKSDB_NAMESPACE::ConstexprFloorLog2;
using ROCKSDB_NAMESPACE::CountTrailingZeroBits; using ROCKSDB_NAMESPACE::CountTrailingZeroBits;
using ROCKSDB_NAMESPACE::DecodeFixed128; using ROCKSDB_NAMESPACE::DecodeFixed128;
using ROCKSDB_NAMESPACE::DecodeFixedGeneric; using ROCKSDB_NAMESPACE::DecodeFixedGeneric;
using ROCKSDB_NAMESPACE::DownwardInvolution;
using ROCKSDB_NAMESPACE::EncodeFixed128; using ROCKSDB_NAMESPACE::EncodeFixed128;
using ROCKSDB_NAMESPACE::EncodeFixedGeneric; using ROCKSDB_NAMESPACE::EncodeFixedGeneric;
using ROCKSDB_NAMESPACE::FloorLog2; using ROCKSDB_NAMESPACE::FloorLog2;
@ -577,6 +578,8 @@ using ROCKSDB_NAMESPACE::Multiply64to128;
using ROCKSDB_NAMESPACE::Unsigned128; using ROCKSDB_NAMESPACE::Unsigned128;
using ROCKSDB_NAMESPACE::Upper64of128; using ROCKSDB_NAMESPACE::Upper64of128;
int blah(int x) { return DownwardInvolution(x); }
template <typename T> template <typename T>
static void test_BitOps() { static void test_BitOps() {
// This complex code is to generalize to 128-bit values. Otherwise // This complex code is to generalize to 128-bit values. Otherwise
@ -640,6 +643,70 @@ static void test_BitOps() {
EXPECT_EQ(ReverseBits(vm1), static_cast<T>(rv * ~T{1})); EXPECT_EQ(ReverseBits(vm1), static_cast<T>(rv * ~T{1}));
} }
#endif #endif
// DownwardInvolution
{
T misc = static_cast<T>(/*random*/ 0xc682cd153d0e3279U +
i * /*random*/ 0x9b3972f3bea0baa3U);
if constexpr (sizeof(T) > 8) {
misc = (misc << 64) | (/*random*/ 0x52af031a38ced62dU +
i * /*random*/ 0x936f803d9752ddc3U);
}
T misc_masked = misc & vm1;
EXPECT_LE(misc_masked, vm1);
T di_misc_masked = DownwardInvolution(misc_masked);
EXPECT_LE(di_misc_masked, vm1);
if (misc_masked > 0) {
// Highest-order 1 in same position
EXPECT_EQ(FloorLog2(misc_masked), FloorLog2(di_misc_masked));
}
// Validate involution property on short value
EXPECT_EQ(DownwardInvolution(di_misc_masked), misc_masked);
// Validate involution property on large value
T di_misc = DownwardInvolution(misc);
EXPECT_EQ(DownwardInvolution(di_misc), misc);
// Highest-order 1 in same position
if (misc > 0) {
EXPECT_EQ(FloorLog2(misc), FloorLog2(di_misc));
}
// Validate distributes over xor.
// static_casts to avoid numerical promotion effects.
EXPECT_EQ(DownwardInvolution(static_cast<T>(misc_masked ^ vm1)),
static_cast<T>(di_misc_masked ^ DownwardInvolution(vm1)));
T misc2 = static_cast<T>(misc >> 1);
EXPECT_EQ(DownwardInvolution(static_cast<T>(misc ^ misc2)),
static_cast<T>(di_misc ^ DownwardInvolution(misc2)));
// Choose some small number of bits to pull off to test combined
// uniqueness guarantee
int in_bits = i % 7;
unsigned in_mask = (unsigned{1} << in_bits) - 1U;
// IMPLICIT: int out_bits = 8 - in_bits;
std::vector<bool> seen(256, false);
for (int j = 0; j < 255; ++j) {
T t_in = misc ^ static_cast<T>(j);
unsigned in = static_cast<unsigned>(t_in);
unsigned out = static_cast<unsigned>(DownwardInvolution(t_in));
unsigned val = ((out << in_bits) | (in & in_mask)) & 255U;
EXPECT_FALSE(seen[val]);
seen[val] = true;
}
if (i + 8 < int{8 * sizeof(T)}) {
// Also test manipulating bits in the middle of input is
// bijective in bottom of output
seen = std::vector<bool>(256, false);
for (int j = 0; j < 255; ++j) {
T in = misc ^ (static_cast<T>(j) << i);
unsigned val = static_cast<unsigned>(DownwardInvolution(in)) & 255U;
EXPECT_FALSE(seen[val]);
seen[val] = true;
}
}
}
vm1 = (vm1 << 1) | 1; vm1 = (vm1 << 1) | 1;
} }

@ -250,4 +250,45 @@ inline T ReverseBits(T v) {
return r; return r;
} }
// Every output bit depends on many input bits in the same and higher
// positions, but not lower positions. Specifically, this function
// * Output highest bit set to 1 is same as input (same FloorLog2, or
// equivalently, same number of leading zeros)
// * Is its own inverse (an involution)
// * Guarantees that b bottom bits of v and c bottom bits of
// DownwardInvolution(v) uniquely identify b + c bottom bits of v
// (which is all of v if v < 2**(b + c)).
// ** A notable special case is that modifying c adjacent bits at
// some chosen position in the input is bijective with the bottom c
// output bits.
// * Distributes over xor, as in DI(a ^ b) == DI(a) ^ DI(b)
//
// This transformation is equivalent to a matrix*vector multiplication in
// GF(2) where the matrix is recursively defined by the pattern matrix
// P = | 1 1 |
// | 0 1 |
// and replacing 1's with P and 0's with 2x2 zero matrices to some depth,
// e.g. depth of 6 for 64-bit T. An essential feature of this matrix
// is that all square sub-matrices that include the top row are invertible.
template <typename T>
inline T DownwardInvolution(T v) {
static_assert(std::is_integral<T>::value, "non-integral type");
static_assert(sizeof(T) <= 8, "only supported up to 64 bits");
uint64_t r = static_cast<uint64_t>(v);
if constexpr (sizeof(T) > 4) {
r ^= r >> 32;
}
if constexpr (sizeof(T) > 2) {
r ^= (r & 0xffff0000ffff0000U) >> 16;
}
if constexpr (sizeof(T) > 1) {
r ^= (r & 0xff00ff00ff00ff00U) >> 8;
}
r ^= (r & 0xf0f0f0f0f0f0f0f0U) >> 4;
r ^= (r & 0xccccccccccccccccU) >> 2;
r ^= (r & 0xaaaaaaaaaaaaaaaaU) >> 1;
return static_cast<T>(r);
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

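Since DownwardInvolution is new, a small self-check of the properties documented in its comment may help. This is a sketch, not part of the commit; it assumes compilation inside the RocksDB tree (util/math.h provides DownwardInvolution and FloorLog2), and the constants are arbitrary values borrowed from the hash test above.

```
// Self-check sketch for the DownwardInvolution properties documented above.
#include <cassert>
#include <cstdint>

#include "util/math.h"

using ROCKSDB_NAMESPACE::DownwardInvolution;
using ROCKSDB_NAMESPACE::FloorLog2;

void DownwardInvolutionSelfCheck() {
  const uint64_t a = 0xc682cd153d0e3279U;  // arbitrary values
  const uint64_t b = 0x9b3972f3bea0baa3U;

  // Involution: applying the transform twice returns the input.
  assert(DownwardInvolution(DownwardInvolution(a)) == a);

  // Distributes over xor (linear over GF(2)).
  assert(DownwardInvolution(a ^ b) ==
         (DownwardInvolution(a) ^ DownwardInvolution(b)));

  // The highest set bit stays in place (same FloorLog2).
  assert(FloorLog2(DownwardInvolution(a)) == FloorLog2(a));

  // Special case of the bottom-bits bijection with c = 1: flipping a single
  // input bit (here bit 40) flips the bottom output bit.
  assert(((DownwardInvolution(a ^ (uint64_t{1} << 40)) ^
           DownwardInvolution(a)) & 1) == 1);
}
```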
@ -230,6 +230,12 @@ inline Unsigned128 ReverseBits(Unsigned128 v) {
ReverseBits(Upper64of128(v)); ReverseBits(Upper64of128(v));
} }
template <>
inline Unsigned128 DownwardInvolution(Unsigned128 v) {
return (Unsigned128{DownwardInvolution(Upper64of128(v))} << 64) |
DownwardInvolution(Upper64of128(v) ^ Lower64of128(v));
}
template <typename T> template <typename T>
struct IsUnsignedUpTo128 struct IsUnsignedUpTo128
: std::integral_constant<bool, std::is_unsigned<T>::value || : std::integral_constant<bool, std::is_unsigned<T>::value ||

@ -39,12 +39,9 @@ Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) {
// We only want to save cache entries that are portable to another // We only want to save cache entries that are portable to another
// DB::Open, so only save entries with stable keys. // DB::Open, so only save entries with stable keys.
bool is_stable; bool is_stable;
// WART: if the file is extremely large (> kMaxFileSizeStandardEncoding) BlockBasedTable::SetupBaseCacheKey(id->second.get(),
// then the prefix will be different. But this should not be a concern /*cur_db_session_id*/ "",
// in practice because that limit is currently 4TB on a single file. /*cur_file_num*/ 0, &base, &is_stable);
BlockBasedTable::SetupBaseCacheKey(
id->second.get(), /*cur_db_session_id*/ "", /*cur_file_num*/ 0,
/*file_size*/ 42, &base, &is_stable);
if (is_stable) { if (is_stable) {
Slice prefix_slice = base.CommonPrefixSlice(); Slice prefix_slice = base.CommonPrefixSlice();
assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize); assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize);
