diff --git a/HISTORY.md b/HISTORY.md index a038d7817..9cd260be2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -39,6 +39,9 @@ * Improve read performance by avoiding dynamic memory allocation. * When using iterators with the integrated BlobDB implementation, blob cache handles are now released immediately when the iterator's position changes. +## Behavior Change +* Block cache keys have changed, which will cause any persistent caches to miss between versions. + ## 7.5.0 (07/15/2022) ### New Features * Mempurge option flag `experimental_mempurge_threshold` is now a ColumnFamilyOptions and can now be dynamically configured using `SetOptions()`. diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 56db2fea3..ccdb90e49 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -806,7 +806,6 @@ class StressCacheKey { uint64_t max_file_count = uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run; - uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U; uint32_t report_count = 0; uint32_t collisions_this_run = 0; size_t db_i = 0; @@ -834,8 +833,7 @@ class StressCacheKey { } bool is_stable; BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "", - /* ignored */ 42, file_size, &ock, - &is_stable); + /* ignored */ 42, &ock, &is_stable); assert(is_stable); // Get a representative cache key, which later we analytically generalize // to a range. @@ -845,13 +843,11 @@ class StressCacheKey { reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away; } else if (FLAGS_sck_footer_unique_id) { // Special case: keep only file number, not session counter - uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a; - uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b; - reduced_key = (uint64_t{a} << 32) + b; + reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away; } else { // Try to keep file number and session counter (shift away other bits) uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a; - uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b; + uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b; reduced_key = (uint64_t{a} << 32) + b; } if (reduced_key == 0) { diff --git a/cache/cache_key.cc b/cache/cache_key.cc index f99921b88..a79328972 100644 --- a/cache/cache_key.cc +++ b/cache/cache_key.cc @@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { // Value space plan for CacheKey: // -// session_etc64_ | offset_etc64_ | Only generated by +// file_num_etc64_ | offset_etc64_ | Only generated by // ---------------+---------------+------------------------------------------ // 0 | 0 | Reserved for "empty" CacheKey() // 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime @@ -44,7 +44,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { return CacheKey(0, id); } -// Value plan for CacheKeys from OffsetableCacheKey, assuming that +// How we generate CacheKeys and base OffsetableCacheKey, assuming that // db_session_ids are generated from a base_session_id and // session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId // in DBImpl::GenerateDbSessionId): @@ -56,63 +56,108 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // base_session_id (unstructured, from GenerateRawUniqueId) // session_id_counter (structured) // * usually much smaller than 2**24 -// file_number (structured) +// orig_file_number (structured) // * usually smaller than 2**24 // offset_in_file (structured, might skip lots of values) // * usually smaller than 2**32 -// max_offset determines 
placement of file_number to prevent -// overlapping with offset // -// Outputs come from bitwise-xor of the constituent pieces, low bits on left: -// -// |------------------------- session_etc64 -------------------------| -// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ | +// Overall approach (see https://github.com/pdillinger/unique_id for +// background): +// +// First, we have three "structured" values, up to 64 bits each, that we +// need to fit, without losses, into 128 bits. In practice, the values will +// be small enough that they should fit. For example, applications generating +// large SST files (large offsets) will naturally produce fewer files (small +// file numbers). But we don't know ahead of time what bounds the values will +// have. +// +// Second, we have unstructured inputs that enable distinct RocksDB processes +// to pick a random point in space, likely very different from others. Xoring +// the structured with the unstructured give us a cache key that is +// structurally distinct between related keys (e.g. same file or same RocksDB +// process) and distinct with high probability between unrelated keys. +// +// The problem of packing three structured values into the space for two is +// complicated by the fact that we want to derive cache keys from SST unique +// IDs, which have already combined structured and unstructured inputs in a +// practically inseparable way. And we want a base cache key that works +// with an offset of any size. So basically, we need to encode these three +// structured values, each up to 64 bits, into 128 bits without knowing any +// of their sizes. The DownwardInvolution() function gives us a mechanism to +// accomplish this. (See its properties in math.h.) Specifically, for inputs +// a, b, and c: +// lower64 = DownwardInvolution(a) ^ ReverseBits(b); +// upper64 = c ^ ReverseBits(a); +// The 128-bit output is unique assuming there exist some i, j, and k +// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and +// i + j + k <= 128. In other words, as long as there exist some bounds +// that would allow us to pack the bits of a, b, and c into the output +// if we know the bound, we can generate unique outputs without knowing +// those bounds. To validate this claim, the inversion function (given +// the bounds) has been implemented in CacheKeyDecoder in +// db_block_cache_test.cc. +// +// With that in mind, the outputs in terms of the conceptual inputs look +// like this, using bitwise-xor of the constituent pieces, low bits on left: +// +// |------------------------- file_num_etc64 -------------------------| +// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ | +// |-----------------------------------------------------------------| +// | session_id_counter (involution) ..... | | // |-----------------------------------------------------------------| -// | session_id_counter ...| | +// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ | +// | * base_session_id (upper ~39 bits) | +// | * db_id (~122 bits entropy) | // |-----------------------------------------------------------------| -// | | ... file_number | -// | | overflow & meta | +// | | ..... 
orig_file_number (reversed) | // |-----------------------------------------------------------------| // // // |------------------------- offset_etc64 --------------------------| -// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ | -// | * base_session_id (upper ~39 bits) | -// | * db_id (~122 bits entropy) | +// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ | // |-----------------------------------------------------------------| -// | offset_in_file ............... | | +// | | ..... session_id_counter (reversed) | // |-----------------------------------------------------------------| -// | | file_number, 0-3 | -// | | lower bytes | +// | offset_in_file ............... | | // |-----------------------------------------------------------------| // -// Based on max_offset, a maximal number of bytes 0..3 is chosen for -// including from lower bits of file_number in offset_etc64. The choice -// is encoded in two bits of metadata going into session_etc64, though -// the common case of 3 bytes is encoded as 0 so that session_etc64 -// is unmodified by file_number concerns in the common case. -// -// There is nothing preventing "file number overflow & meta" from meeting -// and overlapping with session_id_counter, but reaching such a case requires -// an intractable combination of large file offsets (thus at least some large -// files), large file numbers (thus large number of files generated), and -// large number of session IDs generated in a single process. A trillion each -// (2**40) of session ids, offsets, and file numbers comes to 120 bits. -// With two bits of metadata and byte granularity, this is on the verge of -// overlap, but even in the overlap case, it doesn't seem likely that -// a file from billions of files or session ids ago will still be live -// or cached. -// -// In fact, if our SST files are all < 4TB (see -// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated -// in a single process are guaranteed to have unique cache keys, unless/until -// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in -// a single process and 64 trillion files generated. Even at that point, to -// see a collision we would need a miraculous re-synchronization of session -// id and file number, along with a live file or stale cache entry from -// trillions of files ago. -// -// How https://github.com/pdillinger/unique_id applies here: +// Some oddities or inconveniences of this layout are due to deriving +// the "base" cache key (without offset) from the SST unique ID (see +// GetSstInternalUniqueId). Specifically, +// * Lower 64 of base_session_id occurs in both output words (ok but +// weird) +// * The inclusion of db_id is bad for the conditions under which we +// can guarantee uniqueness, but could be useful in some cases with +// few small files per process, to make up for db session id only having +// ~103 bits of entropy. +// +// In fact, if DB ids were not involved, we would be guaranteed unique +// cache keys for files generated in a single process until total bits for +// biggest session_id_counter, orig_file_number, and offset_in_file +// reach 128 bits. +// +// With the DB id limitation, we only have nice guaranteed unique cache +// keys for files generated in a single process until biggest +// session_id_counter and offset_in_file reach combined 64 bits. 
This +// is quite good in practice because we can have millions of DB Opens +// with terabyte size SST files, or billions of DB Opens with gigabyte +// size SST files. +// +// One of the considerations in the translation between existing SST unique +// IDs and base cache keys is supporting better SST unique IDs in a future +// format_version. If we use a process-wide file counter instead of +// session counter and file numbers, we only need to combine two 64-bit values +// instead of three. But we don't want to track unique ID versions in the +// manifest, so we want to keep the same translation layer between SST unique +// IDs and base cache keys, even with updated SST unique IDs. If the new +// unique IDs put the file counter where the orig_file_number was, and +// use no structured field where session_id_counter was, then our translation +// layer works fine for two structured fields as well as three (for +// compatibility). The small computation for the translation (one +// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep) +// is negligible for computing as part of SST file reader open. +// +// More on how https://github.com/pdillinger/unique_id applies here: // Every bit of output always includes "unstructured" uniqueness bits and // often combines with "structured" uniqueness bits. The "unstructured" bits // change infrequently: only when we cannot guarantee our state tracking for @@ -141,12 +186,11 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // 128 bits cache key size // - 55 <- ideal size for byte offsets + file numbers // - 2 <- bits for offsets and file numbers not exactly powers of two -// - 2 <- bits for file number encoding metadata // + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey // ---- -// 71 <- bits remaining for distinguishing session IDs -// The probability of a collision in 71 bits of session ID data is less than -// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. And this assumes all +// 73 <- bits remaining for distinguishing session IDs +// The probability of a collision in 73 bits of session ID data is less than +// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all // data from the last 180 days is in cache for potential collision, and that // cache keys under each session id exhaustively cover the remaining 57 bits // while in reality they'll only cover a small fraction of it. @@ -160,7 +204,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // Now suppose we have many DBs per host, say 2**10, with same host-wide write // rate and process/session lifetime. File numbers will be ~10 bits smaller // and we will have 2**10 times as many session IDs because of simultaneous -// lifetimes. So now collision chance is less than 1 in 2**(81 - (2 * 26)), +// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)), // or roughly 1 in a billion. // // Suppose instead we generated random or hashed cache keys for each @@ -176,17 +220,17 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // activity over many months, by making some pessimistic simplifying // assumptions. See class StressCacheKey in cache_bench_tool.cc for details. 
// Here is some sample output with -// `./cache_bench -stress_cache_key -sck_keep_bits=40`: +// `./cache_bench -stress_cache_key -sck_keep_bits=43`: // // Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day -// Multiply by 9.22337e+18 to correct for simulation losses (but still +// Multiply by 1.15292e+18 to correct for simulation losses (but still // assume whole file cached) // // These come from default settings of 2.5M files per day of 32 MB each, and -// `-sck_keep_bits=40` means that to represent a single file, we are only -// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25 -// contiguous keys (pessimistic), our simulation is about 2\*\*(128-40-25) or -// about 9 billion billion times more prone to collision than reality. +// `-sck_keep_bits=43` means that to represent a single file, we are only +// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25 +// contiguous keys (pessimistic), our simulation is about 2\*\*(128-43-25) or +// about 1 billion billion times more prone to collision than reality. // // More default assumptions, relatively pessimistic: // * 100 DBs in same process (doesn't matter much) @@ -194,49 +238,55 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // average every 100 files generated // * Restart process (all new session IDs unrelated to old) 24 times per day // -// After enough data, we get a result at the end (-sck_keep_bits=40): +// After enough data, we get a result at the end (-sck_keep_bits=43): // -// (keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between -// (9.76592e+19 corrected) +// (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between +// (1.15292e+19 corrected) // // If we believe the (pessimistic) simulation and the mathematical -// extrapolation, we would need to run a billion machines all for 97 billion +// extrapolation, we would need to run a billion machines all for 11 billion // days to expect a cache key collision. To help verify that our extrapolation -// ("corrected") is robust, we can make our simulation more precise with -// `-sck_keep_bits=41` and `42`, which takes more running time to get enough +// ("corrected") is robust, we can make our simulation more precise by +// increasing the "keep" bits, which takes more running time to get enough // collision data: // -// (keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between -// (1.03763e+20 corrected) -// (keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between -// (1.09224e+20 corrected) +// (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between +// (1.6213e+19 corrected) +// (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between +// (1.21057e+19 corrected) +// (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between +// (1.46997e+19 corrected) +// (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between +// (2.11849e+19 corrected) // -// The extrapolated prediction is very close. If anything, we might have some -// very small losses of structured data (see class StressCacheKey in -// cache_bench_tool.cc) leading to more accurate & more attractive prediction -// with more bits kept. +// The extrapolated prediction seems to be within noise (sampling error). 
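+// (Arithmetic note on the "corrected" figures, derived from the simulation
+// loss factor described above: corrected = estimated days between collisions
+// multiplied by 2**(128 - sck_keep_bits - 25). For -sck_keep_bits=43 that
+// factor is 2**60 ~= 1.15292e+18, matching the "Multiply by 1.15292e+18"
+// line in the sample output, so 10 days between collisions extrapolates to
+// the quoted 1.15292e+19 corrected.)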
// // With the `-sck_randomize` option, we can see that typical workloads like // above have lower collision probability than "random" cache keys (note: -// offsets still non-randomized) by a modest amount (roughly 20x less collision -// prone than random), which should make us reasonably comfortable even in -// "degenerate" cases (e.g. repeatedly launch a process to generate 1 file -// with SstFileWriter): +// offsets still non-randomized) by a modest amount (roughly 2-3x less +// collision prone than random), which should make us reasonably comfortable +// even in "degenerate" cases (e.g. repeatedly launch a process to generate +// one file with SstFileWriter): +// +// (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between +// (4.7165e+18 corrected) +// +// We can see that with more frequent process restarts, +// -sck_restarts_per_day=5000, which means more all-new session IDs, we get +// closer to the "random" cache key performance: // -// (rand 40 bits) 197 collisions after 1 x 90 days, est 0.456853 days between -// (4.21372e+18 corrected) +// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected) // -// We can see that with more frequent process restarts (all new session IDs), -// we get closer to the "random" cache key performance: +// And with less frequent process restarts and re-opens, +// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision +// probability: // -// (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ... -// (5.92931e+18 corrected) +// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected) // // Other tests have been run to validate other conditions behave as expected, // never behaving "worse than random" unless we start chopping off structured // data. // -// // Conclusion: Even in extreme cases, rapidly burning through "all new" IDs // that only arise when a new process is started, the chance of any cache key // collisions in a giant fleet of machines is negligible. Especially when @@ -249,96 +299,66 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // quantify) block cache corruptions, including collisions, should be added. OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id, - uint64_t file_number, - uint64_t max_offset) { -#ifndef NDEBUG - max_offset_ = max_offset; -#endif - // Closely related to GetSstInternalUniqueId, but only need 128 bits and - // need to include an offset within the file. - // See also https://github.com/pdillinger/unique_id for background. - uint64_t session_upper = 0; // Assignment to appease clang-analyze - uint64_t session_lower = 0; // Assignment to appease clang-analyze - { - Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); - if (!s.ok()) { - // A reasonable fallback in case malformed - Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper, - &session_lower); - } - } - - // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) - // for more global uniqueness entropy. - // (It is possible that many DBs descended from one common DB id are copied - // around and proliferate, in which case session id is critical, but it is - // more common for different DBs to have different DB ids.) - uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper); - - // This establishes the db+session id part of the cache key. 
- // - // Exactly preserve (in common cases; see modifiers below) session lower to - // ensure that session ids generated during the same process lifetime are - // guaranteed unique. - // - // We put this first for CommonPrefixSlice(), so that a small-ish set of - // cache key prefixes to cover entries relevant to any DB. - session_etc64_ = session_lower; - // This provides extra entopy in case of different DB id or process - // generating a session id, but is also partly/variably obscured by - // file_number and offset (see below). - offset_etc64_ = db_hash; - - // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and - // a file_number, but we might need the file_number to overflow into - // session_etc64_. (There must only be one session_etc64_ value per - // file, and preferably shared among many files.) - // - // Figure out how many bytes of file_number we are going to be able to - // pack in with max_offset, though our encoding will only support packing - // in up to 3 bytes of file_number. (16M file numbers is enough for a new - // file number every second for half a year.) - int file_number_bytes_in_offset_etc = - (63 - FloorLog2(max_offset | 0x100000000U)) / 8; - int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8; + uint64_t file_number) { + UniqueId64x2 internal_id; + Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number, + &internal_id, /*force=*/true); + assert(s.ok()); + *this = FromInternalUniqueId(&internal_id); +} - // Assert two bits of metadata - assert(file_number_bytes_in_offset_etc >= 0 && - file_number_bytes_in_offset_etc <= 3); - // Assert we couldn't have used a larger allowed number of bytes (shift - // would chop off bytes). - assert(file_number_bytes_in_offset_etc == 3 || - (max_offset << (file_number_bits_in_offset_etc + 8) >> - (file_number_bits_in_offset_etc + 8)) != max_offset); +OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) { + uint64_t session_lower = id.ptr[0]; + uint64_t file_num_etc = id.ptr[1]; - uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1; - // Pack into high bits of etc so that offset can go in low bits of etc - // TODO: could be EndianSwapValue? - uint64_t offset_etc_modifier = ReverseBits(file_number & mask); - assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U); +#ifndef NDEBUG + bool is_empty = session_lower == 0 && file_num_etc == 0; +#endif - // Overflow and 3 - byte count (likely both zero) go into session_id part - uint64_t session_etc_modifier = - (file_number >> file_number_bits_in_offset_etc << 2) | - static_cast(3 - file_number_bytes_in_offset_etc); - // Packed into high bits to minimize interference with session id counter. - session_etc_modifier = ReverseBits(session_etc_modifier); + // Although DBImpl guarantees (in recent versions) that session_lower is not + // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is + // not zero (so that the 0 case can be used by CacheKey::CreateUnique*) + // However, if we are given an "empty" id as input, then we should produce + // "empty" as output. + // As a consequence, this function is only bijective assuming + // id[0] == 0 only if id[1] == 0. 
+ if (session_lower == 0U) { + session_lower = file_num_etc; + } - // Assert session_id part is only modified in extreme cases - assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU || - max_offset > /*5 bytes*/ 0xffffffffffU); + // See comments above for how DownwardInvolution and ReverseBits + // make this function invertible under various assumptions. + OffsetableCacheKey rv; + rv.file_num_etc64_ = + DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc); + rv.offset_etc64_ = ReverseBits(session_lower); - // Xor in the modifiers - session_etc64_ ^= session_etc_modifier; - offset_etc64_ ^= offset_etc_modifier; + // Because of these transformations and needing to allow arbitrary + // offset (thus, second 64 bits of cache key might be 0), we need to + // make some correction to ensure the first 64 bits is not 0. + // Fortunately, the transformation ensures the second 64 bits is not 0 + // for non-empty base key, so we can swap in the case one is 0 without + // breaking bijectivity (assuming condition above). + assert(is_empty || rv.offset_etc64_ > 0); + if (rv.file_num_etc64_ == 0) { + std::swap(rv.file_num_etc64_, rv.offset_etc64_); + } + assert(is_empty || rv.file_num_etc64_ > 0); + return rv; +} - // Although DBImpl guarantees (in recent versions) that session_lower is not - // zero, that's not entirely sufficient to guarantee that session_etc64_ is - // not zero (so that the 0 case can be used by CacheKey::CreateUnique*) - if (session_etc64_ == 0U) { - session_etc64_ = session_upper | 1U; +// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if +// offset_etc64 == 0) +UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() { + uint64_t a = file_num_etc64_; + uint64_t b = offset_etc64_; + if (b == 0) { + std::swap(a, b); } - assert(session_etc64_ != 0); + UniqueId64x2 rv; + rv[0] = ReverseBits(b); + rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0])); + return rv; } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_key.h b/cache/cache_key.h index 0858f3c8e..0b93c6bd9 100644 --- a/cache/cache_key.h +++ b/cache/cache_key.h @@ -9,6 +9,7 @@ #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/slice.h" +#include "table/unique_id_impl.h" namespace ROCKSDB_NAMESPACE { @@ -33,10 +34,10 @@ class CacheKey { public: // For convenience, constructs an "empty" cache key that is never returned // by other means. - inline CacheKey() : session_etc64_(), offset_etc64_() {} + inline CacheKey() : file_num_etc64_(), offset_etc64_() {} inline bool IsEmpty() const { - return (session_etc64_ == 0) & (offset_etc64_ == 0); + return (file_num_etc64_ == 0) & (offset_etc64_ == 0); } // Use this cache key as a Slice (byte order is endianness-dependent) @@ -59,9 +60,9 @@ class CacheKey { protected: friend class OffsetableCacheKey; - CacheKey(uint64_t session_etc64, uint64_t offset_etc64) - : session_etc64_(session_etc64), offset_etc64_(offset_etc64) {} - uint64_t session_etc64_; + CacheKey(uint64_t file_num_etc64, uint64_t offset_etc64) + : file_num_etc64_(file_num_etc64), offset_etc64_(offset_etc64) {} + uint64_t file_num_etc64_; uint64_t offset_etc64_; }; @@ -85,50 +86,58 @@ class OffsetableCacheKey : private CacheKey { inline OffsetableCacheKey() : CacheKey() {} // Constructs an OffsetableCacheKey with the given information about a file. - // max_offset is based on file size (see WithOffset) and is required here to - // choose an appropriate (sub-)encoding. This constructor never generates an - // "empty" base key. 
+ // This constructor never generates an "empty" base key. OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id, - uint64_t file_number, uint64_t max_offset); + uint64_t file_number); + + // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys + // can be derived from DB manifest data before reading the file from + // storage--so that every part of the file can potentially go in a persistent + // cache. + // + // Calling GetSstInternalUniqueId() on a db_id, db_session_id, and + // file_number and passing the result to this function produces the same + // base cache key as feeding those inputs directly to the constructor. + // + // This is a bijective transformation assuming either id is empty or + // lower 64 bits is non-zero: + // * Empty (all zeros) input -> empty (all zeros) output + // * Lower 64 input is non-zero -> lower 64 output (file_num_etc64_) is + // non-zero + static OffsetableCacheKey FromInternalUniqueId(UniqueIdPtr id); + + // This is the inverse transformation to the above, assuming either empty + // or lower 64 bits (file_num_etc64_) is non-zero. Perhaps only useful for + // testing. + UniqueId64x2 ToInternalUniqueId(); inline bool IsEmpty() const { - bool result = session_etc64_ == 0; + bool result = file_num_etc64_ == 0; assert(!(offset_etc64_ > 0 && result)); return result; } - // Construct a CacheKey for an offset within a file, which must be - // <= max_offset provided in constructor. An offset is not necessarily a - // byte offset if a smaller unique identifier of keyable offsets is used. + // Construct a CacheKey for an offset within a file. An offset is not + // necessarily a byte offset if a smaller unique identifier of keyable + // offsets is used. // // This class was designed to make this hot code extremely fast. inline CacheKey WithOffset(uint64_t offset) const { assert(!IsEmpty()); - assert(offset <= max_offset_); - return CacheKey(session_etc64_, offset_etc64_ ^ offset); + return CacheKey(file_num_etc64_, offset_etc64_ ^ offset); } - // The "common prefix" is a shared prefix for all the returned CacheKeys, - // that also happens to usually be the same among many files in the same DB, - // so is efficient and highly accurate (not perfectly) for DB-specific cache - // dump selection (but not file-specific). + // The "common prefix" is a shared prefix for all the returned CacheKeys. + // It is specific to the file but the same for all offsets within the file. static constexpr size_t kCommonPrefixSize = 8; inline Slice CommonPrefixSlice() const { - static_assert(sizeof(session_etc64_) == kCommonPrefixSize, + static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize, "8 byte common prefix expected"); assert(!IsEmpty()); - assert(&this->session_etc64_ == static_cast(this)); + assert(&this->file_num_etc64_ == static_cast(this)); return Slice(reinterpret_cast(this), kCommonPrefixSize); } - - // For any max_offset <= this value, the same encoding scheme is guaranteed. 
- static constexpr uint64_t kMaxOffsetStandardEncoding = 0xffffffffffU; - - private: -#ifndef NDEBUG - uint64_t max_offset_ = 0; -#endif }; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_reservation_manager_test.cc b/cache/cache_reservation_manager_test.cc index e39b974b8..3d000b5ad 100644 --- a/cache/cache_reservation_manager_test.cc +++ b/cache/cache_reservation_manager_test.cc @@ -48,13 +48,13 @@ TEST_F(CacheReservationManagerTest, GenerateCacheKey) { // Next unique Cache key CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get()); // Get to the underlying values - using PairU64 = std::array; - auto& ckey_pair = *reinterpret_cast(&ckey); + uint64_t* ckey_data = reinterpret_cast(&ckey); // Back it up to the one used by CRM (using CacheKey implementation details) - ckey_pair[1]--; + ckey_data[1]--; // Specific key (subject to implementation details) - EXPECT_EQ(ckey_pair, PairU64({0, 2})); + EXPECT_EQ(ckey_data[0], 0); + EXPECT_EQ(ckey_data[1], 2); Cache::Handle* handle = cache->Lookup(ckey.AsSlice()); EXPECT_NE(handle, nullptr) diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 6295ffa1e..9c00b2150 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -558,19 +558,12 @@ class TestSecondaryCache : public SecondaryCache { void ResetInjectFailure() { inject_failure_ = false; } - void SetDbSessionId(const std::string& db_session_id) { - // NOTE: we assume the file is smaller than kMaxFileSizeStandardEncoding - // for this to work, but that's safe in a test. - auto base = OffsetableCacheKey("unknown", db_session_id, 1, 1); - ckey_prefix_ = base.CommonPrefixSlice().ToString(); - } - Status Insert(const Slice& key, void* value, const Cache::CacheItemHelper* helper) override { if (inject_failure_) { return Status::Corruption("Insertion Data Corrupted"); } - EXPECT_TRUE(IsDbSessionLowerAsKeyPrefix(key)); + CheckCacheKeyCommonPrefix(key); size_t size; char* buf; Status s; @@ -648,8 +641,13 @@ class TestSecondaryCache : public SecondaryCache { uint32_t num_lookups() { return num_lookups_; } - bool IsDbSessionLowerAsKeyPrefix(const Slice& key) { - return key.starts_with(ckey_prefix_); + void CheckCacheKeyCommonPrefix(const Slice& key) { + Slice current_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize); + if (ckey_prefix_.empty()) { + ckey_prefix_ = current_prefix.ToString(); + } else { + EXPECT_EQ(ckey_prefix_, current_prefix.ToString()); + } } private: @@ -794,28 +792,30 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); std::shared_ptr stats = CreateDBStatistics(); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1, &LRUCacheSecondaryCacheTest::helper_, - str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_, str1.length())); std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_OK(cache->Insert("k2", item2, &LRUCacheSecondaryCacheTest::helper_, - str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_, str2.length())); get_perf_context()->Reset(); Cache::Handle* handle; handle = - cache->Lookup("k2", 
&LRUCacheSecondaryCacheTest::helper_, + cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true, stats.get()); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should promote k1 and demote k2 handle = - cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_, + cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true, stats.get()); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -837,21 +837,23 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) { std::make_shared(2048); opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); auto item1 = std::make_unique(str1.data(), str1.length()); - ASSERT_TRUE(cache->Insert("k1", item1.get(), nullptr, str1.length()) + ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, str1.length()) .IsInvalidArgument()); - ASSERT_OK(cache->Insert("k1", item1.get(), + ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(), &LRUCacheSecondaryCacheTest::helper_, str1.length())); item1.release(); // Appease clang-analyze "potential memory leak" Cache::Handle* handle; - handle = cache->Lookup("k2", nullptr, test_item_creator, Cache::Priority::LOW, - true); + handle = cache->Lookup(k2.AsSlice(), nullptr, test_item_creator, + Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); - handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_, + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, false); ASSERT_EQ(handle, nullptr); @@ -866,30 +868,37 @@ TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) { std::make_shared(2048); opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert( - "k1", item1, &LRUCacheSecondaryCacheTest::helper_fail_, str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_fail_, + str1.length())); std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_OK(cache->Insert( - "k2", item2, &LRUCacheSecondaryCacheTest::helper_fail_, str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_fail_, + str2.length())); Cache::Handle* handle; - handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + handle = + cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 demotion would have failed - handle = cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + handle = + cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache - 
handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + handle = + cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -906,30 +915,32 @@ TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) { std::make_shared(2048); opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1, &LRUCacheSecondaryCacheTest::helper_, - str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_, str1.length())); std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_OK(cache->Insert("k2", item2, &LRUCacheSecondaryCacheTest::helper_, - str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_, str2.length())); Cache::Handle* handle; SetFailCreate(true); - handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_, + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 creation would have failed - handle = cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_, + handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache - handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_, + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -947,32 +958,34 @@ TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) { std::make_shared(2048); opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1, &LRUCacheSecondaryCacheTest::helper_, - str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_, str1.length())); std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_OK(cache->Insert("k2", item2, &LRUCacheSecondaryCacheTest::helper_, - str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_, str2.length())); Cache::Handle* handle; - handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_, + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); // k1 promotion should fail due to the block cache being at capacity, // but the lookup should still succeed 
Cache::Handle* handle2; - handle2 = cache->Lookup("k1", &LRUCacheSecondaryCacheTest::helper_, + handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle2, nullptr); // Since k1 didn't get inserted, k2 should still be in cache cache->Release(handle); cache->Release(handle2); - handle = cache->Lookup("k2", &LRUCacheSecondaryCacheTest::helper_, + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -1009,9 +1022,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { // all the blocks will be accessed. options.paranoid_file_checks = true; DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 6; for (int i = 0; i < N; i++) { @@ -1103,9 +1113,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { options.env = fault_env_.get(); fault_fs_->SetFailGetUniqueId(true); DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 6; for (int i = 0; i < N; i++) { @@ -1197,9 +1204,6 @@ TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { fault_fs_->SetFailGetUniqueId(true); DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 6; for (int i = 0; i < N; i++) { @@ -1249,9 +1253,6 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { options.env = fault_env_.get(); fault_fs_->SetFailGetUniqueId(true); DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 256; for (int i = 0; i < N; i++) { @@ -1299,9 +1300,6 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { options.env = fault_env_.get(); fault_fs_->SetFailGetUniqueId(true); DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 6; for (int i = 0; i < N; i++) { @@ -1382,6 +1380,7 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); const int num_keys = 32; + OffsetableCacheKey ock{"foo", "bar", 1}; Random rnd(301); std::vector values; @@ -1389,7 +1388,7 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { std::string str = rnd.RandomString(1020); values.emplace_back(str); TestItem* item = new TestItem(str.data(), str.length()); - ASSERT_OK(cache->Insert("k" + std::to_string(i), item, + ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item, &LRUCacheSecondaryCacheTest::helper_, str.length())); } @@ -1399,13 +1398,16 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { cache->SetCapacity(32 * 1024); secondary_cache->SetResultMap( - {{"k3", TestSecondaryCache::ResultType::DEFER}, - {"k4", TestSecondaryCache::ResultType::DEFER_AND_FAIL}, - {"k5", TestSecondaryCache::ResultType::FAIL}}); + {{ock.WithOffset(3).AsSlice().ToString(), + TestSecondaryCache::ResultType::DEFER}, + {ock.WithOffset(4).AsSlice().ToString(), + TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {ock.WithOffset(5).AsSlice().ToString(), + 
TestSecondaryCache::ResultType::FAIL}}); std::vector results; for (int i = 0; i < 6; ++i) { results.emplace_back(cache->Lookup( - "k" + std::to_string(i), &LRUCacheSecondaryCacheTest::helper_, + ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_, test_item_creator, Cache::Priority::LOW, false)); } cache->WaitAll(results); @@ -1891,9 +1893,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { // all the blocks will be accessed. options.paranoid_file_checks = true; DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 6; for (int i = 0; i < N; i++) { @@ -1986,9 +1985,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { // all the blocks will be accessed. options.paranoid_file_checks = true; DestroyAndReopen(options); - std::string session_id; - ASSERT_OK(db_->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); Random rnd(301); const int N = 6; for (int i = 0; i < N; i++) { @@ -2087,12 +2083,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { ASSERT_OK(DB::Open(options2, dbname2, &db2)); fault_fs_->SetFailGetUniqueId(true); - // Set the file paranoid check, so after flush, the file will be read - // all the blocks will be accessed. - std::string session_id; - ASSERT_OK(db1->GetDbSessionId(session_id)); - secondary_cache->SetDbSessionId(session_id); - WriteOptions wo; Random rnd(301); const int N = 6; diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 0e6fa46aa..214c2a49b 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -399,12 +399,8 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob, creation_reason_ == BlobFileCreationReason::kFlush; if (blob_cache && warm_cache) { - // The blob file during flush is unknown to be exactly how big it is. - // Therefore, we set the file size to kMaxOffsetStandardEncoding. For any - // max_offset <= this value, the same encoding scheme is guaranteed. 
- const OffsetableCacheKey base_cache_key( - db_id_, db_session_id_, blob_file_number, - OffsetableCacheKey::kMaxOffsetStandardEncoding); + const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, + blob_file_number); const CacheKey cache_key = base_cache_key.WithOffset(blob_offset); const Slice key = cache_key.AsSlice(); diff --git a/db/blob/blob_source.cc b/db/blob/blob_source.cc index a57d089e5..2ddf12feb 100644 --- a/db/blob/blob_source.cc +++ b/db/blob/blob_source.cc @@ -286,7 +286,7 @@ void BlobSource::MultiGetBlob(const ReadOptions& read_options, void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, uint64_t file_number, - uint64_t file_size, + uint64_t /*file_size*/, autovector& blob_reqs, uint64_t* bytes_read) { const size_t num_blobs = blob_reqs.size(); @@ -303,8 +303,7 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, Mask cache_hit_mask = 0; uint64_t total_bytes = 0; - const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number, - file_size); + const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); if (blob_cache_) { size_t cached_blob_count = 0; diff --git a/db/blob/blob_source.h b/db/blob/blob_source.h index 4999a3120..ffc8ae45f 100644 --- a/db/blob/blob_source.h +++ b/db/blob/blob_source.h @@ -118,10 +118,9 @@ class BlobSource { size_t charge, Cache::Handle** cache_handle, Cache::Priority priority) const; - inline CacheKey GetCacheKey(uint64_t file_number, uint64_t file_size, + inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/, uint64_t offset) const { - OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number, - file_size); + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); return base_cache_key.WithOffset(offset); } diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index 9f84a190f..3676e9d3a 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -1168,8 +1168,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { ASSERT_TRUE( blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1])); - OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number, - file_size); + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); // blob_cache here only looks at the primary cache since we didn't provide // the cache item helper for the secondary cache. 
However, since key0 is @@ -1412,8 +1411,7 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { } { - OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber, - blob_file_size_); + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber); size_t blob_bytes = options_.blob_cache->GetUsage(); for (size_t i = 0; i < kNumBlobs; ++i) { diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 89c351fd2..04e0dfdc0 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -1736,51 +1736,60 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { class CacheKeyTest : public testing::Test { public: - void SetupStableBase() { + CacheKey GetBaseCacheKey() { + CacheKey rv = GetOffsetableCacheKey(0, /*min file_number*/ 1).WithOffset(0); + // Correct for file_number_ == 1 + *reinterpret_cast(&rv) ^= ReverseBits(uint64_t{1}); + return rv; + } + CacheKey GetCacheKey(uint64_t session_counter, uint64_t file_number, + uint64_t offset) { + OffsetableCacheKey offsetable = + GetOffsetableCacheKey(session_counter, file_number); + // * 4 to counteract optimization that strips lower 2 bits in encoding + // the offset in BlockBasedTable::GetCacheKey (which we prefer to include + // in unit tests to maximize functional coverage). + EXPECT_GE(offset * 4, offset); // no overflow + return BlockBasedTable::GetCacheKey(offsetable, + BlockHandle(offset * 4, /*size*/ 5)); + } + + protected: + OffsetableCacheKey GetOffsetableCacheKey(uint64_t session_counter, + uint64_t file_number) { // Like SemiStructuredUniqueIdGen::GenerateNext tp_.db_session_id = EncodeSessionId(base_session_upper_, - base_session_lower_ ^ session_counter_); + base_session_lower_ ^ session_counter); tp_.db_id = std::to_string(db_id_); - tp_.orig_file_number = file_number_; + tp_.orig_file_number = file_number; bool is_stable; std::string cur_session_id = ""; // ignored uint64_t cur_file_number = 42; // ignored + OffsetableCacheKey rv; BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number, - file_size_, &base_cache_key_, - &is_stable); - ASSERT_TRUE(is_stable); - } - CacheKey WithOffset(uint64_t offset) { - return BlockBasedTable::GetCacheKey(base_cache_key_, - BlockHandle(offset, /*size*/ 5)); + &rv, &is_stable); + EXPECT_TRUE(is_stable); + EXPECT_TRUE(!rv.IsEmpty()); + // BEGIN some assertions in relation to SST unique IDs + std::string external_unique_id_str; + EXPECT_OK(GetUniqueIdFromTableProperties(tp_, &external_unique_id_str)); + UniqueId64x2 sst_unique_id = {}; + EXPECT_OK(DecodeUniqueIdBytes(external_unique_id_str, &sst_unique_id)); + ExternalUniqueIdToInternal(&sst_unique_id); + OffsetableCacheKey ock = + OffsetableCacheKey::FromInternalUniqueId(&sst_unique_id); + EXPECT_EQ(rv.WithOffset(0).AsSlice(), ock.WithOffset(0).AsSlice()); + EXPECT_EQ(ock.ToInternalUniqueId(), sst_unique_id); + // END some assertions in relation to SST unique IDs + return rv; } - protected: - OffsetableCacheKey base_cache_key_; TableProperties tp_; - uint64_t file_size_ = 0; uint64_t base_session_upper_ = 0; uint64_t base_session_lower_ = 0; - uint64_t session_counter_ = 0; - uint64_t file_number_ = 0; uint64_t db_id_ = 0; }; -namespace { -template -int CountBitsDifferent(const T& t1, const T& t2) { - int diff = 0; - const uint8_t* p1 = reinterpret_cast(&t1); - const uint8_t* p2 = reinterpret_cast(&t2); - static_assert(sizeof(*p1) == 1, "Expecting uint8_t byte"); - for (size_t i = 0; i < sizeof(T); ++i) { - diff += BitsSetToOne(p1[i] ^ p2[i]); - } - return diff; -} - -} // 
namespace - TEST_F(CacheKeyTest, DBImplSessionIdStructure) { // We have to generate our own session IDs for simulation purposes in other // tests. Here we verify that the DBImpl implementation seems to match @@ -1799,171 +1808,202 @@ TEST_F(CacheKeyTest, DBImplSessionIdStructure) { ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2)); } -TEST_F(CacheKeyTest, StandardEncodingLimit) { - base_session_upper_ = 1234; - base_session_lower_ = 5678; - session_counter_ = 42; - file_number_ = 42; - db_id_ = 1234; - - file_size_ = 42; - SetupStableBase(); - CacheKey ck1; - ASSERT_TRUE(ck1.IsEmpty()); - ck1 = WithOffset(0); - ASSERT_FALSE(ck1.IsEmpty()); - - // Should use same encoding - file_size_ = BlockBasedTable::kMaxFileSizeStandardEncoding; - SetupStableBase(); - CacheKey ck2 = WithOffset(0); - ASSERT_EQ(CountBitsDifferent(ck1, ck2), 0); - - // Should use different encoding - ++file_size_; - SetupStableBase(); - CacheKey ck3 = WithOffset(0); - ASSERT_GT(CountBitsDifferent(ck2, ck3), 0); +namespace { +// Deconstruct cache key, based on knowledge of implementation details. +void DeconstructNonemptyCacheKey(const CacheKey& key, uint64_t* file_num_etc64, + uint64_t* offset_etc64) { + *file_num_etc64 = *reinterpret_cast(key.AsSlice().data()); + *offset_etc64 = *reinterpret_cast(key.AsSlice().data() + 8); + assert(*file_num_etc64 != 0); + if (*offset_etc64 == 0) { + std::swap(*file_num_etc64, *offset_etc64); + } + assert(*offset_etc64 != 0); +} + +// Make a bit mask of 0 to 64 bits +uint64_t MakeMask64(int bits) { + if (bits >= 64) { + return uint64_t{0} - 1; + } else { + return (uint64_t{1} << bits) - 1; + } } -TEST_F(CacheKeyTest, Encodings) { - // Claim from cache_key.cc: - // In fact, if our SST files are all < 4TB (see - // BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated - // in a single process are guaranteed to have unique cache keys, unless/until - // number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in - // a single process and 64 trillion files generated. - - // We can generalize that. For - // * z bits in maximum file size - // * n bits in maximum file number - // * s bits in maximum session counter - // uniqueness is guaranteed at least when all of these hold: - // * z + n + s <= 121 (128 - 2 meta + 2 offset trim - (8-1) byte granularity - // in encoding) - // * n + s <= 86 (encoding limitation) - // * s <= 62 (because of 2-bit metadata) - - // We can verify this indirectly by how input bits get into the cache key, - // but we have to be mindful that for sufficiently large file sizes, - // different encodings might be used. But for cases mixing large and small - // files, we have to verify uniqueness between encodings. - - // Going through all combinations would be a little expensive, so we test - // only one random "stripe" of the configuration space per run. 
- constexpr uint32_t kStripeBits = 8; - constexpr uint32_t kStripeMask = (uint32_t{1} << kStripeBits) - 1; - - // Also cycle through stripes on repeated runs (not thread safe) - static uint32_t stripe = - static_cast(std::random_device{}()) & kStripeMask; - stripe = (stripe + 1) & kStripeMask; - - fprintf(stderr, "%u\n", stripe); - - // We are going to randomly initialize some values which *should* not affect - // result - Random64 r{std::random_device{}()}; +// See CacheKeyTest::Encodings +struct CacheKeyDecoder { + // Inputs + uint64_t base_file_num_etc64, base_offset_etc64; + int session_counter_bits, file_number_bits, offset_bits; - int max_num_encodings = 0; - uint32_t config_num = 0; - uint32_t session_counter_bits, file_number_bits, max_file_size_bits; - - // Inner loop body, used later in a loop over configurations - auto TestConfig = [&]() { - base_session_upper_ = r.Next(); - base_session_lower_ = r.Next(); - session_counter_ = r.Next(); - if (session_counter_bits < 64) { - // Avoid shifting UB - session_counter_ = session_counter_ >> 1 >> (63 - session_counter_bits); - } - file_number_ = r.Next() >> (64 - file_number_bits); - // Need two bits set to avoid temporary zero below - if (BitsSetToOne(file_number_) < 2) { - file_number_ = 3; - } - db_id_ = r.Next(); - - // Work-around clang-analyzer which thinks empty last_base is garbage - CacheKey last_base = CacheKey::CreateUniqueForProcessLifetime(); - - std::unordered_set seen; - int num_encodings = 0; - - // Loop over encodings by increasing file size bits - for (uint32_t file_size_bits = 1; file_size_bits <= max_file_size_bits; - ++file_size_bits) { - file_size_ = uint64_t{1} << (file_size_bits - 1); - SetupStableBase(); - CacheKey new_base = WithOffset(0); - if (CountBitsDifferent(last_base, new_base) == 0) { - // Same as previous encoding - continue; - } + // Derived + uint64_t session_counter_mask, file_number_mask, offset_mask; + + // Outputs + uint64_t decoded_session_counter, decoded_file_num, decoded_offset; - // New encoding - ++num_encodings; - ASSERT_TRUE(seen.insert(new_base.AsSlice().ToString()).second); - last_base = new_base; - for (uint32_t i = 0; i < file_size_bits; ++i) { - CacheKey ck = WithOffset(uint64_t{1} << i); - if (i < 2) { - // These cases are not relevant and optimized by dropping two - // lowest bits because there's always at least 5 bytes between - // blocks. - ASSERT_EQ(CountBitsDifferent(ck, new_base), 0); - } else { - // Normal case - // 1 bit different from base and never been seen implies the bit - // is encoded into cache key without overlapping other structured - // data. 
- ASSERT_EQ(CountBitsDifferent(ck, new_base), 1); - ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second); + void SetBaseCacheKey(const CacheKey& base) { + DeconstructNonemptyCacheKey(base, &base_file_num_etc64, &base_offset_etc64); + } + + void SetRanges(int _session_counter_bits, int _file_number_bits, + int _offset_bits) { + session_counter_bits = _session_counter_bits; + session_counter_mask = MakeMask64(session_counter_bits); + file_number_bits = _file_number_bits; + file_number_mask = MakeMask64(file_number_bits); + offset_bits = _offset_bits; + offset_mask = MakeMask64(offset_bits); + } + + void Decode(const CacheKey& key) { + uint64_t file_num_etc64, offset_etc64; + DeconstructNonemptyCacheKey(key, &file_num_etc64, &offset_etc64); + + // First decode session counter + if (offset_bits + session_counter_bits <= 64) { + // fully recoverable from offset_etc64 + decoded_session_counter = + ReverseBits((offset_etc64 ^ base_offset_etc64)) & + session_counter_mask; + } else if (file_number_bits + session_counter_bits <= 64) { + // fully recoverable from file_num_etc64 + decoded_session_counter = DownwardInvolution( + (file_num_etc64 ^ base_file_num_etc64) & session_counter_mask); + } else { + // Need to combine parts from each word. + // Piece1 will contain some correct prefix of the bottom bits of + // session counter. + uint64_t piece1 = + ReverseBits((offset_etc64 ^ base_offset_etc64) & ~offset_mask); + int piece1_bits = 64 - offset_bits; + // Piece2 will contain involuded bits that we can combine with piece1 + // to infer rest of session counter + int piece2_bits = std::min(64 - file_number_bits, 64 - piece1_bits); + ASSERT_LT(piece2_bits, 64); + uint64_t piece2_mask = MakeMask64(piece2_bits); + uint64_t piece2 = (file_num_etc64 ^ base_file_num_etc64) & piece2_mask; + + // Cancel out the part of piece2 that we can infer from piece1 + // (DownwardInvolution distributes over xor) + piece2 ^= DownwardInvolution(piece1) & piece2_mask; + + // Now we need to solve for the unknown original bits in higher + // positions than piece1 provides. We use Gaussian elimination + // because we know that a piece2_bits X piece2_bits submatrix of + // the matrix underlying DownwardInvolution times the vector of + // unknown original bits equals piece2. + // + // Build an augmented row matrix for that submatrix, built column by + // column. 
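// To make the packed representation built below concrete: aug_rows[j] holds
// row j of the piece2_bits x piece2_bits submatrix of DownwardInvolution's
// matrix (rows 0 through piece2_bits-1, columns piece1_bits through
// piece1_bits+piece2_bits-1) in its low piece2_bits bits, with
// right-hand-side bit j of piece2 stored at bit position piece2_bits.
// Row swaps and row xors on these packed words then solve M * x = piece2
// over GF(2), where x is the unknown block of session counter bits just
// above the piece1 prefix.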
+ std::array<uint64_t, 64> aug_rows{}; + for (int i = 0; i < piece2_bits; ++i) { // over columns + uint64_t col_i = DownwardInvolution(uint64_t{1} << piece1_bits << i); + ASSERT_NE(col_i & 1U, 0); + for (int j = 0; j < piece2_bits; ++j) { // over rows + aug_rows[j] |= (col_i & 1U) << i; + col_i >>= 1; } } - for (uint32_t i = 0; i < session_counter_bits; ++i) { - SaveAndRestore<uint64_t> tmp(&session_counter_, - session_counter_ ^ (uint64_t{1} << i)); - SetupStableBase(); - CacheKey ck = WithOffset(0); - ASSERT_EQ(CountBitsDifferent(ck, new_base), 1); - ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second); + // Augment with right hand side + for (int j = 0; j < piece2_bits; ++j) { // over rows + aug_rows[j] |= (piece2 & 1U) << piece2_bits; + piece2 >>= 1; } - for (uint32_t i = 0; i < file_number_bits; ++i) { - SaveAndRestore<uint64_t> tmp(&file_number_, - file_number_ ^ (uint64_t{1} << i)); - SetupStableBase(); - CacheKey ck = WithOffset(0); - ASSERT_EQ(CountBitsDifferent(ck, new_base), 1); - ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second); + // Run Gaussian elimination + for (int i = 0; i < piece2_bits; ++i) { // over columns + // Find a row that can be used to cancel others + uint64_t canceller = 0; + // Note: Rows 0 through i-1 contain 1s in columns already eliminated + for (int j = i; j < piece2_bits; ++j) { // over rows + if (aug_rows[j] & (uint64_t{1} << i)) { + // Swap into appropriate row + std::swap(aug_rows[i], aug_rows[j]); + // Keep a handy copy for row reductions + canceller = aug_rows[i]; + break; + } + } + ASSERT_NE(canceller, 0); + for (int j = 0; j < piece2_bits; ++j) { // over rows + if (i != j && ((aug_rows[j] >> i) & 1) != 0) { + // Row reduction + aug_rows[j] ^= canceller; + } + } + } + // Extract result + decoded_session_counter = piece1; + for (int j = 0; j < piece2_bits; ++j) { // over rows + ASSERT_EQ(aug_rows[j] & piece2_mask, uint64_t{1} << j); + decoded_session_counter |= aug_rows[j] >> piece2_bits << piece1_bits + << j; - } - max_num_encodings = std::max(max_num_encodings, num_encodings); } - }; - // Loop over configurations and test those in stripe - for (session_counter_bits = 0; session_counter_bits <= 62, + decoded_offset = + offset_etc64 ^ base_offset_etc64 ^ ReverseBits(decoded_session_counter); + + decoded_file_num = ReverseBits(file_num_etc64 ^ base_file_num_etc64 ^ + DownwardInvolution(decoded_session_counter)); + } +}; +} // namespace + +TEST_F(CacheKeyTest, Encodings) { + // This test primarily verifies this claim from cache_key.cc: + // // In fact, if DB ids were not involved, we would be guaranteed unique + // // cache keys for files generated in a single process until total bits for + // // biggest session_id_counter, orig_file_number, and offset_in_file + // // reach 128 bits. + // + // To demonstrate this, CacheKeyDecoder can reconstruct the structured inputs + // to the cache key when provided an output cache key, the unstructured + // inputs, and bounds on the structured inputs. + // + // See OffsetableCacheKey comments in cache_key.cc.
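// Informally (reading Decode() above in reverse), the relationship this test
// exercises is, for session counter s, file number f, and offset o:
//   file_num_etc64 = base_file_num_etc64 ^ DownwardInvolution(s) ^ ReverseBits(f)
//   offset_etc64   = base_offset_etc64 ^ ReverseBits(s) ^ o
// so xoring out the base key and inverting the bit transforms recovers s, f,
// and o whenever their bit widths sum to at most 128.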
+ + // We are going to randomly initialize some values that *should* not affect + // result + Random64 r{std::random_device{}()}; + + CacheKeyDecoder decoder; + db_id_ = r.Next(); + base_session_upper_ = r.Next(); + base_session_lower_ = r.Next(); + if (base_session_lower_ == 0) { + base_session_lower_ = 1; + } + + decoder.SetBaseCacheKey(GetBaseCacheKey()); + + // Loop over configurations and test those + for (int session_counter_bits = 0; session_counter_bits <= 64; ++session_counter_bits) { - uint32_t max_file_number_bits = - std::min(uint32_t{64}, uint32_t{86} - session_counter_bits); - // Start with 2 to avoid file_number_ == 0 in testing - for (file_number_bits = 2; file_number_bits <= max_file_number_bits; - ++file_number_bits) { - uint32_t max_max_file_size_bits = - std::min(uint32_t{64}, - uint32_t{121} - file_number_bits - session_counter_bits); - for (max_file_size_bits = 1; max_file_size_bits <= max_max_file_size_bits; - ++max_file_size_bits) { - if ((config_num++ & kStripeMask) == stripe) { - TestConfig(); + for (int file_number_bits = 1; file_number_bits <= 64; ++file_number_bits) { + // 62 bits max because unoptimized offset will be 64 bits in that case + for (int offset_bits = 0; offset_bits <= 62; ++offset_bits) { + if (session_counter_bits + file_number_bits + offset_bits > 128) { + break; + } + + decoder.SetRanges(session_counter_bits, file_number_bits, offset_bits); + + uint64_t session_counter = r.Next() & decoder.session_counter_mask; + uint64_t file_number = r.Next() & decoder.file_number_mask; + if (file_number == 0) { + // Minimum + file_number = 1; } + uint64_t offset = r.Next() & decoder.offset_mask; + decoder.Decode(GetCacheKey(session_counter, file_number, offset)); + + EXPECT_EQ(decoder.decoded_session_counter, session_counter); + EXPECT_EQ(decoder.decoded_file_num, file_number); + EXPECT_EQ(decoder.decoded_offset, offset); } } } - - // Make sure the current implementation is exercised - ASSERT_EQ(max_num_encodings, 4); } INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest, diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 565cd8ec8..da81cb254 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -895,12 +895,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", const_cast(&rep_->props)); - // Extremely large files use atypical cache key encoding, and we don't - // know ahead of time how big the file will be. But assuming it's less - // than 4TB, we will correctly predict the cache keys. 
- BlockBasedTable::SetupBaseCacheKey( - &rep_->props, tbo.db_session_id, tbo.cur_file_num, - BlockBasedTable::kMaxFileSizeStandardEncoding, &rep_->base_cache_key); + BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id, + tbo.cur_file_num, &rep_->base_cache_key); if (rep_->IsParallelCompressionEnabled()) { StartParallelCompression(); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 6983e1f7d..52606f6ba 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -521,7 +521,6 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties, void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, const std::string& cur_db_session_id, uint64_t cur_file_number, - uint64_t file_size, OffsetableCacheKey* out_base_cache_key, bool* out_is_stable) { // Use a stable cache key if sufficient data is in table properties @@ -565,8 +564,7 @@ void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, // Minimum block size is 5 bytes; therefore we can trim off two lower bits // from offsets. See GetCacheKey. - *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num, - /*max_offset*/ file_size >> 2); + *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num); } CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key, @@ -717,7 +715,7 @@ Status BlockBasedTable::Open( // With properties loaded, we can set up portable/stable cache keys SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id, - cur_file_num, file_size, &rep->base_cache_key); + cur_file_num, &rep->base_cache_key); rep->persistent_cache_options = PersistentCacheOptions(rep->table_options.persistent_cache, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index fee26f3c7..a1e4e56fd 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -231,15 +231,9 @@ class BlockBasedTable : public TableReader { class IndexReaderCommon; - // Maximum SST file size that uses standard CacheKey encoding scheme. - // See GetCacheKey to explain << 2. + 3 is permitted because it is trimmed - // off by >> 2 in GetCacheKey. 
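// A small worked example of the ">> 2" trimming mentioned above (illustrative
// only): with a minimum block size of 5 bytes, two distinct block start
// offsets differ by at least 5, so they remain distinct after the two low
// bits are dropped, e.g. offsets 0, 5, 10, 16 map to 0, 1, 2, 4. The offset
// folded into the cache key therefore needs two fewer bits.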
- static constexpr uint64_t kMaxFileSizeStandardEncoding = - (OffsetableCacheKey::kMaxOffsetStandardEncoding << 2) + 3; - static void SetupBaseCacheKey(const TableProperties* properties, const std::string& cur_db_session_id, - uint64_t cur_file_number, uint64_t file_size, + uint64_t cur_file_number, OffsetableCacheKey* out_base_cache_key, bool* out_is_stable = nullptr); diff --git a/table/unique_id.cc b/table/unique_id.cc index cce0d7584..fcdd75650 100644 --- a/table/unique_id.cc +++ b/table/unique_id.cc @@ -58,22 +58,34 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, Status GetSstInternalUniqueId(const std::string &db_id, const std::string &db_session_id, - uint64_t file_number, UniqueIdPtr out) { - if (db_id.empty()) { - return Status::NotSupported("Missing db_id"); - } - if (file_number == 0) { - return Status::NotSupported("Missing or bad file number"); - } - if (db_session_id.empty()) { - return Status::NotSupported("Missing db_session_id"); + uint64_t file_number, UniqueIdPtr out, + bool force) { + if (!force) { + if (db_id.empty()) { + return Status::NotSupported("Missing db_id"); + } + if (file_number == 0) { + return Status::NotSupported("Missing or bad file number"); + } + if (db_session_id.empty()) { + return Status::NotSupported("Missing db_session_id"); + } } uint64_t session_upper = 0; // Assignment to appease clang-analyze uint64_t session_lower = 0; // Assignment to appease clang-analyze { Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); if (!s.ok()) { - return s; + if (!force) { + return s; + } else { + // A reasonable fallback in case malformed + Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper, + &session_lower); + if (session_lower == 0) { + session_lower = session_upper | 1; + } + } } } @@ -107,20 +119,6 @@ Status GetSstInternalUniqueId(const std::string &db_id, return Status::OK(); } -Status GetSstInternalUniqueId(const std::string &db_id, - const std::string &db_session_id, - uint64_t file_number, UniqueId64x2 *out) { - UniqueId64x3 tmp{}; - Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number, &tmp); - if (s.ok()) { - (*out)[0] = tmp[0]; - (*out)[1] = tmp[1]; - } else { - *out = {0, 0}; - } - return s; -} - namespace { // For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all // zeros in first 128 bits to map to itself, so that excluding zero in diff --git a/table/unique_id_impl.h b/table/unique_id_impl.h index 9f5f97704..6e3dc62c7 100644 --- a/table/unique_id_impl.h +++ b/table/unique_id_impl.h @@ -47,7 +47,8 @@ struct UniqueIdPtr { // is long term stable. Status GetSstInternalUniqueId(const std::string &db_id, const std::string &db_session_id, - uint64_t file_number, UniqueIdPtr out); + uint64_t file_number, UniqueIdPtr out, + bool force = false); // Helper for GetUniqueIdFromTableProperties. 
External unique ids go through // this extra hashing layer so that prefixes of the unique id have predictable diff --git a/util/hash_test.cc b/util/hash_test.cc index 594d2e1bb..b82b5a4af 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -569,6 +569,7 @@ using ROCKSDB_NAMESPACE::ConstexprFloorLog2; using ROCKSDB_NAMESPACE::CountTrailingZeroBits; using ROCKSDB_NAMESPACE::DecodeFixed128; using ROCKSDB_NAMESPACE::DecodeFixedGeneric; +using ROCKSDB_NAMESPACE::DownwardInvolution; using ROCKSDB_NAMESPACE::EncodeFixed128; using ROCKSDB_NAMESPACE::EncodeFixedGeneric; using ROCKSDB_NAMESPACE::FloorLog2; @@ -577,6 +578,8 @@ using ROCKSDB_NAMESPACE::Multiply64to128; using ROCKSDB_NAMESPACE::Unsigned128; using ROCKSDB_NAMESPACE::Upper64of128; +int blah(int x) { return DownwardInvolution(x); } + template <typename T> static void test_BitOps() { // This complex code is to generalize to 128-bit values. Otherwise @@ -640,6 +643,70 @@ static void test_BitOps() { EXPECT_EQ(ReverseBits(vm1), static_cast<T>(rv * ~T{1})); } #endif + + // DownwardInvolution + { + T misc = static_cast<T>(/*random*/ 0xc682cd153d0e3279U + + i * /*random*/ 0x9b3972f3bea0baa3U); + if constexpr (sizeof(T) > 8) { + misc = (misc << 64) | (/*random*/ 0x52af031a38ced62dU + + i * /*random*/ 0x936f803d9752ddc3U); + } + T misc_masked = misc & vm1; + EXPECT_LE(misc_masked, vm1); + T di_misc_masked = DownwardInvolution(misc_masked); + EXPECT_LE(di_misc_masked, vm1); + if (misc_masked > 0) { + // Highest-order 1 in same position + EXPECT_EQ(FloorLog2(misc_masked), FloorLog2(di_misc_masked)); + } + // Validate involution property on short value + EXPECT_EQ(DownwardInvolution(di_misc_masked), misc_masked); + + // Validate involution property on large value + T di_misc = DownwardInvolution(misc); + EXPECT_EQ(DownwardInvolution(di_misc), misc); + // Highest-order 1 in same position + if (misc > 0) { + EXPECT_EQ(FloorLog2(misc), FloorLog2(di_misc)); + } + + // Validate distributes over xor. + // static_casts to avoid numerical promotion effects. + EXPECT_EQ(DownwardInvolution(static_cast<T>(misc_masked ^ vm1)), + static_cast<T>(di_misc_masked ^ DownwardInvolution(vm1))); + T misc2 = static_cast<T>(misc >> 1); + EXPECT_EQ(DownwardInvolution(static_cast<T>(misc ^ misc2)), + static_cast<T>(di_misc ^ DownwardInvolution(misc2))); + + // Choose some small number of bits to pull off to test combined + // uniqueness guarantee + int in_bits = i % 7; + unsigned in_mask = (unsigned{1} << in_bits) - 1U; + // IMPLICIT: int out_bits = 8 - in_bits; + std::vector<bool> seen(256, false); + for (int j = 0; j < 255; ++j) { + T t_in = misc ^ static_cast<T>(j); + unsigned in = static_cast<unsigned>(t_in); + unsigned out = static_cast<unsigned>(DownwardInvolution(t_in)); + unsigned val = ((out << in_bits) | (in & in_mask)) & 255U; + EXPECT_FALSE(seen[val]); + seen[val] = true; + } + + if (i + 8 < int{8 * sizeof(T)}) { + // Also test manipulating bits in the middle of input is + // bijective in bottom of output + seen = std::vector<bool>(256, false); + for (int j = 0; j < 255; ++j) { + T in = misc ^ (static_cast<T>(j) << i); + unsigned val = static_cast<unsigned>(DownwardInvolution(in)) & 255U; + EXPECT_FALSE(seen[val]); + seen[val] = true; + } + } + } + vm1 = (vm1 << 1) | 1; } diff --git a/util/math.h b/util/math.h index a445216bf..da31b43ec 100644 --- a/util/math.h +++ b/util/math.h @@ -250,4 +250,45 @@ inline T ReverseBits(T v) { return r; } +// Every output bit depends on many input bits in the same and higher +// positions, but not lower positions.
Specifically, this function +// * Output highest bit set to 1 is same as input (same FloorLog2, or +// equivalently, same number of leading zeros) +// * Is its own inverse (an involution) +// * Guarantees that b bottom bits of v and c bottom bits of +// DownwardInvolution(v) uniquely identify b + c bottom bits of v +// (which is all of v if v < 2**(b + c)). +// ** A notable special case is that modifying c adjacent bits at +// some chosen position in the input is bijective with the bottom c +// output bits. +// * Distributes over xor, as in DI(a ^ b) == DI(a) ^ DI(b) +// +// This transformation is equivalent to a matrix*vector multiplication in +// GF(2) where the matrix is recursively defined by the pattern matrix +// P = | 1 1 | +// | 0 1 | +// and replacing 1's with P and 0's with 2x2 zero matrices to some depth, +// e.g. depth of 6 for 64-bit T. An essential feature of this matrix +// is that all square sub-matrices that include the top row are invertible. +template <typename T> +inline T DownwardInvolution(T v) { + static_assert(std::is_integral<T>::value, "non-integral type"); + static_assert(sizeof(T) <= 8, "only supported up to 64 bits"); + + uint64_t r = static_cast<uint64_t>(v); + if constexpr (sizeof(T) > 4) { + r ^= r >> 32; + } + if constexpr (sizeof(T) > 2) { + r ^= (r & 0xffff0000ffff0000U) >> 16; + } + if constexpr (sizeof(T) > 1) { + r ^= (r & 0xff00ff00ff00ff00U) >> 8; + } + r ^= (r & 0xf0f0f0f0f0f0f0f0U) >> 4; + r ^= (r & 0xccccccccccccccccU) >> 2; + r ^= (r & 0xaaaaaaaaaaaaaaaaU) >> 1; + return static_cast<T>(r); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/util/math128.h b/util/math128.h index 54ccf11dc..ae490051a 100644 --- a/util/math128.h +++ b/util/math128.h @@ -230,6 +230,12 @@ inline Unsigned128 ReverseBits(Unsigned128 v) { ReverseBits(Upper64of128(v)); } +template <> +inline Unsigned128 DownwardInvolution(Unsigned128 v) { + return (Unsigned128{DownwardInvolution(Upper64of128(v))} << 64) | + DownwardInvolution(Upper64of128(v) ^ Lower64of128(v)); +} + template <typename T> struct IsUnsignedUpTo128 : std::integral_constant<bool, std::is_unsigned<T>::value || diff --git a/utilities/cache_dump_load_impl.cc b/utilities/cache_dump_load_impl.cc index 7745e1618..33b0d37ec 100644 --- a/utilities/cache_dump_load_impl.cc +++ b/utilities/cache_dump_load_impl.cc @@ -39,12 +39,9 @@ Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) { // We only want to save cache entries that are portable to another // DB::Open, so only save entries with stable keys. bool is_stable; - // WART: if the file is extremely large (> kMaxFileSizeStandardEncoding) - // then the prefix will be different. But this should not be a concern - // in practice because that limit is currently 4TB on a single file. - BlockBasedTable::SetupBaseCacheKey( - id->second.get(), /*cur_db_session_id*/ "", /*cur_file_num*/ 0, - /*file_size*/ 42, &base, &is_stable); + BlockBasedTable::SetupBaseCacheKey(id->second.get(), + /*cur_db_session_id*/ "", + /*cur_file_num*/ 0, &base, &is_stable); if (is_stable) { Slice prefix_slice = base.CommonPrefixSlice(); assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize);
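// A standalone sketch (illustrative; it assumes the util/math.h addition
// above is available, and CheckDownwardInvolution is a hypothetical helper,
// not part of this change) of the DownwardInvolution properties that the new
// cache key scheme relies on:
#include <cassert>
#include <cstdint>
#include "util/math.h"

void CheckDownwardInvolution(uint64_t a, uint64_t b) {
  using ROCKSDB_NAMESPACE::DownwardInvolution;
  // Involution: applying the transform twice returns the input.
  assert(DownwardInvolution(DownwardInvolution(a)) == a);
  // Linear over GF(2): distributes over xor.
  assert(DownwardInvolution(a ^ b) ==
         (DownwardInvolution(a) ^ DownwardInvolution(b)));
  // Zero maps to zero, which follows from linearity.
  assert(DownwardInvolution(uint64_t{0}) == 0);
}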