Enhance new cache key testing & comments (#9329)

Summary:
Follow-up to https://github.com/facebook/rocksdb/issues/9126

Added new unit tests to validate some of the claims of guaranteed uniqueness
within certain large bounds.

Also cleaned up the cache_bench -stress-cache-key tool with better comments
and description.
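
For reference, the sub-tool is invoked as shown in the new cache_key.cc comments
(flags are defined in cache_bench_tool.cc below), e.g.:

  ./cache_bench -stress_cache_key -sck_keep_bits=40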

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9329

Test Plan: no changes to production code

Reviewed By: mrambacher

Differential Revision: D33269328

Pulled By: pdillinger

fbshipit-source-id: 3a2b684a6b2b15f79dc872e563e3d16563be26de
Peter Dillinger authored 3 years ago, committed by Facebook GitHub Bot
parent 42e0751b3a
commit afc280fdfd

Files changed:
  1. cache/cache_bench_tool.cc (211)
  2. cache/cache_key.cc (91)
  3. db/db_block_cache_test.cc (241)

cache/cache_bench_tool.cc

@@ -79,31 +79,52 @@ static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
DEFINE_bool(use_clock_cache, false, "");

// ## BEGIN stress_cache_key sub-tool options ##
// See class StressCacheKey below.
DEFINE_bool(stress_cache_key, false,
            "If true, run cache key stress test instead");
DEFINE_uint32(
    sck_files_per_day, 2500000,
    "(-stress_cache_key) Simulated files generated per simulated day");
// NOTE: Giving each run a specified lifetime, rather than e.g. "until
// first collision" ensures equal skew from start-up, when collisions are
// less likely.
DEFINE_uint32(sck_days_per_run, 90,
              "(-stress_cache_key) Number of days to simulate in each run");
// NOTE: The number of observed collisions directly affects the relative
// accuracy of the predicted probabilities. 15 observations should be well
// within factor-of-2 accuracy.
DEFINE_uint32(
    sck_min_collision, 15,
    "(-stress_cache_key) Keep running until this many collisions seen");
// sck_file_size_mb can be thought of as average file size. The simulation is
// not precise enough to care about the distribution of file sizes; other
// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
// indicate the distribution only makes a small difference (e.g. < 2x factor)
DEFINE_uint32(
    sck_file_size_mb, 32,
    "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
DEFINE_uint32(sck_reopen_nfiles, 100,
              "(-stress_cache_key) Simulate DB re-open average every n files");
DEFINE_uint32(sck_restarts_per_day, 24,
              "(-stress_cache_key) Average simulated process restarts per day "
              "(across DBs)");
DEFINE_uint32(
    sck_db_count, 100,
    "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
DEFINE_uint32(
    sck_table_bits, 20,
    "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
// sck_keep_bits being well below full 128 bits amplifies the collision
// probability so that the true probability can be estimated through observed
// collisions. (More explanation below.)
DEFINE_uint32(
    sck_keep_bits, 50,
    "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
// sck_randomize is used to validate whether cache key is performing "better
// than random." Even with this setting, file offsets are not randomized.
DEFINE_bool(sck_randomize, false,
            "(-stress_cache_key) Randomize (hash) cache key");
// See https://github.com/facebook/rocksdb/pull/9058
DEFINE_bool(sck_footer_unique_id, false,
            "(-stress_cache_key) Simulate using proposed footer unique id");
// ## END stress_cache_key sub-tool options ##
@@ -583,20 +604,97 @@ class CacheBench {
  }
};

// cache_bench -stress_cache_key is an independent embedded tool for
// estimating the probability of CacheKey collisions through simulation.
// At a high level, it simulates generating SST files over many months,
// keeping them in the DB and/or cache for some lifetime while staying
// under resource caps, and checking for any cache key collisions that
// arise among the set of live files. For efficient simulation, we make
// some simplifying "pessimistic" assumptions (that only increase the
// chance of the simulation reporting a collision relative to the chance
// of collision in practice):
// * Every generated file has a cache entry for every byte offset in the
// file (contiguous range of cache keys)
// * All of every file is cached for its entire lifetime. (Here "lifetime"
// is technically the union of DB and Cache lifetime, though we only
// model a generous DB lifetime, where space usage is always maximized.
// In an effective Cache, lifetime in cache can only substantially exceed
// lifetime in DB if there is little cache activity; cache activity is
// required to hit cache key collisions.)
//
// It would be possible to track an exact set of cache key ranges for the
// set of live files, but we would have no hope of observing collisions
// (overlap in live files) in our simulation. We need to employ some way
// of amplifying collision probability that allows us to predict the real
// collision probability by extrapolation from observed collisions. Our
// basic approach is to reduce each cache key range down to some smaller
// number of bits, keeping only bits that are shared over the whole range.
// Now we can observe collisions using a set of smaller stripped-down
// (reduced) cache keys. Let's do some case analysis to understand why this
// works:
// * No collision in reduced key - because the reduction is a pure function,
// this implies no collision in the full keys.
// * Collision detected between two reduced keys - either
//   * The reduction has dropped some structured uniqueness info (from one of
//   session counter or file number; file offsets are never materialized
//   here). This can only artificially inflate the observed and extrapolated
//   collision probabilities. We only have to worry about this in designing
//   the reduction.
//   * The reduction has preserved all the structured uniqueness in the cache
//   key, which means either
//     * REJECTED: We have a uniqueness bug in generating cache keys, where
//     structured uniqueness info should have been different but isn't. In
//     such a case, increasing by 1 the number of bits kept after reduction
//     would not reduce observed probabilities by half. (In our observations,
//     the probabilities are reduced approximately by half.)
//     * ACCEPTED: The lost unstructured uniqueness in the key determines the
//     probability that an observed collision would imply an overlap in
//     ranges. In short, dropping n bits from the key would increase collision
//     probability by 2**n, assuming those n bits have full entropy in
//     unstructured uniqueness.
//
// But we also have to account for the key ranges based on file size. If file
// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
// "ranges", we know from other simulations (see
// https://github.com/pdillinger/unique_id/) that this is roughly equivalent
// to (less than 2x higher collision probability than) using a cache key of
// size 128 - b bits for the whole file. (This is the only place we make an
// "optimistic" assumption, which is more than offset by the real
// implementation stripping off 2 lower bits from block byte offsets for cache
// keys. The simulation assumes byte offsets, which is net pessimistic.)
//
// So to accept the extrapolation as valid, we need to be confident that all
// "lost" bits, excluding those covered by file offset, are full entropy.
// Recall that we have assumed (verifiably, safely) that other structured data
// (file number and session counter) are kept, not lost. Based on the
// implementation comments for OffsetableCacheKey, the only potential hole
// here is that we only have ~103 bits of entropy in "all new" session IDs,
// and in extreme cases, there might be only 1 DB ID. However, because the
// upper ~39 bits of session ID are hashed, the combination of file number and
// file offset only has to add to 25 bits (or more) to ensure full entropy in
// unstructured uniqueness lost in the reduction. Typical file size of 32MB
// suffices (at least for simulation purposes where we assume each file offset
// occupies a cache key).
//
// Example results in comments on OffsetableCacheKey.
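//
// As a rough illustration of the extrapolation arithmetic (using the sample
// numbers quoted in cache_key.cc, not new results): with -sck_keep_bits=40
// and 32 MiB files (2**25 byte offsets), the simulation is about
// 2**(128 - 40 - 25) = 2**63 ~= 9.2e18 times more collision-prone than the
// real implementation, which matches the "Multiply by ..." correction factor
// the tool prints.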
class StressCacheKey {
 public:
  void Run() {
    if (FLAGS_sck_footer_unique_id) {
      // Proposed footer unique IDs are DB-independent and session-independent
      // (but process-dependent) which is most easily simulated here by
      // assuming 1 DB and (later below) no session resets without process
      // reset.
      FLAGS_sck_db_count = 1;
    }

    // Describe the simulated workload
    uint64_t mb_per_day =
        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
    printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n",
           FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
               std::pow(2.0, FLAGS_sck_table_bits),
           mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
    // For extrapolating probability of any collisions from a number of
    // observed collisions
    multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
                  (FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
    printf(
@@ -606,6 +704,9 @@ class StressCacheKey {
    restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day;
    double without_ejection =
        std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day;
    // This should be a lower bound for -sck_randomize, usually a terribly
    // rough lower bound.
    // If observation is worse than this, then something has gone wrong.
    printf(
        "Without ejection, expect random collision after %g days (%g "
        "corrected)\n",
@@ -613,30 +714,36 @@
    double with_full_table =
        std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
        FLAGS_sck_files_per_day;
    // This is an alternate lower bound for -sck_randomize, usually pretty
    // accurate. Our cache keys should usually perform "better than random"
    // but always no worse. (If observation is substantially worse than this,
    // then something has gone wrong.)
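    // (For example, with the default flags above: 2**(50 - 20) / 2,500,000
    // files per day ~= 430 simulated days expected between collisions for
    // randomized keys, before the "corrected" multiplier is applied.)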
    printf(
        "With ejection and full table, expect random collision after %g "
        "days (%g corrected)\n",
        with_full_table, with_full_table * multiplier_);

    collisions_ = 0;
    // Run until sufficient number of observed collisions.
    for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
      RunOnce();
      if (collisions_ == 0) {
        printf(
            "No collisions after %d x %u days "
            " \n",
            i, FLAGS_sck_days_per_run);
      } else {
        double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
        printf("%" PRIu64
               " collisions after %d x %u days, est %g days between (%g "
               "corrected) \n",
               collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
      }
    }
  }

  void RunOnce() {
    // Re-initialize simulated state
    const size_t db_count = FLAGS_sck_db_count;
    dbs_.reset(new TableProperties[db_count]{});
    const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
@@ -644,7 +751,11 @@
    if (FLAGS_sck_keep_bits > 64) {
      FLAGS_sck_keep_bits = 64;
    }
    // Details of which bits are dropped in reduction
    uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
    // Shift away fewer potential file number bits (b) than potential
    // session counter bits (a).
    uint32_t shift_away_b = shift_away / 3;
    uint32_t shift_away_a = shift_away - shift_away_b;
@@ -655,62 +766,78 @@
    Random64 r{std::random_device{}()};

    uint64_t max_file_count =
        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
    uint32_t report_count = 0;
    uint32_t collisions_this_run = 0;
    size_t db_i = 0;

    for (uint64_t file_count = 1; file_count <= max_file_count;
         ++file_count, ++db_i) {
      // Round-robin through DBs (this is faster than %)
      if (db_i >= db_count) {
        db_i = 0;
      }
      // Any other periodic actions before simulating next file
      if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
        ResetSession(db_i);
      } else if (r.OneIn(restart_nfiles_)) {
        ResetProcess();
      }
      // Simulate next file
      OffsetableCacheKey ock;
      dbs_[db_i].orig_file_number += 1;
      // Skip some file numbers for other file kinds, except in footer unique
      // ID mode, where orig_file_number tracks the process-wide count of
      // generated SST files.
      if (!FLAGS_sck_footer_unique_id) {
        dbs_[db_i].orig_file_number += (r.Next() & 3);
      }
      bool is_stable;
      BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
                                         /* ignored */ 42, file_size, &ock,
                                         &is_stable);
      assert(is_stable);
      // Get a representative cache key, which later we analytically generalize
      // to a range.
      CacheKey ck = ock.WithOffset(0);
      uint64_t reduced_key;
      if (FLAGS_sck_randomize) {
        reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
      } else if (FLAGS_sck_footer_unique_id) {
        // Special case: keep only file number, not session counter
        uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
        reduced_key = (uint64_t{a} << 32) + b;
      } else {
        // Try to keep file number and session counter (shift away other bits)
        uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
        reduced_key = (uint64_t{a} << 32) + b;
      }
      if (reduced_key == 0) {
        // Unlikely, but we need to exclude tracking this value because we
        // use it to mean "empty" in table. This case is OK as long as we
        // don't hit it often.
        printf("Hit Zero! \n");
        file_count--;
        continue;
      }
      uint64_t h =
          NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
      // Skew expected lifetimes, for high (super-Poisson) variance in actual
      // lifetimes.
      size_t pos =
          std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask);
      if (table_[pos] == reduced_key) {
        collisions_this_run++;
        // Our goal is to predict probability of no collisions, not expected
        // number of collisions. To make the distinction, we have to get rid
        // of observing correlated collisions, which this takes care of:
        ResetProcess();
      } else {
        // Replace (end of lifetime for file that was in this slot)
        table_[pos] = reduced_key;
      }
      if (++report_count == FLAGS_sck_files_per_day) {
@@ -748,6 +875,8 @@ class StressCacheKey {
      ResetSession(i);
    }
    if (FLAGS_sck_footer_unique_id) {
      // For footer unique ID, this tracks process-wide generated SST file
      // count.
      dbs_[0].orig_file_number = 0;
    }
  }

cache/cache_key.cc
@@ -35,7 +35,8 @@ CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
CacheKey CacheKey::CreateUniqueForProcessLifetime() {
  // To avoid colliding with CreateUniqueForCacheLifetime, assuming
  // Cache::NewId counts up from zero, here we count down from UINT64_MAX.
  // If this ever becomes a point of contention, we could sub-divide the
  // space and use CoreLocalArray.
  static std::atomic<uint64_t> counter{UINT64_MAX};
  uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
  // Ensure we don't collide with CreateUniqueForCacheLifetime
@@ -118,9 +119,10 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
// "structured" uniqueness hasn't been cloned. Using a static
// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
// "all new" session id when a new process uses RocksDB. (Between processes,
// we don't know if a DB or other persistent storage has been cloned. We
// assume that if VM hot cloning is used, subsequently generated SST files
// do not interact.) Within a process, only the session_lower of the
// db_session_id changes incrementally ("structured" uniqueness).
//
// This basically means that our offsets, counters and file numbers allow us
// to do somewhat "better than random" (birthday paradox) while in the
@@ -168,12 +170,83 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
// data from the last 180 days is in cache, but NOT the other assumption
// for the 1 in a trillion estimate above).
//
// Collision probability estimation through simulation:
// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
// activity over many months, by making some pessimistic simplifying
// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
// Here is some sample output with
// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
//
// Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
// Multiply by 9.22337e+18 to correct for simulation losses (but still
// assume whole file cached)
//
// These come from default settings of 2.5M files per day of 32 MB each, and
// `-sck_keep_bits=40` means that to represent a single file, we are only
// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25
// contiguous keys (pessimistic), our simulation is about 2**(128-40-25) or
// about 9 billion billion times more prone to collision than reality.
//
// More default assumptions, relatively pessimistic:
// * 100 DBs in same process (doesn't matter much)
// * Re-open DB in same process (new session ID related to old session ID) on
// average every 100 files generated
// * Restart process (all new session IDs unrelated to old) 24 times per day
//
// After enough data, we get a result at the end (-sck_keep_bits=40):
//
// (keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between
// (9.76592e+19 corrected)
//
// If we believe the (pessimistic) simulation and the mathematical
// extrapolation, we would need to run a billion machines all for 97 billion
// days to expect a cache key collision. To help verify that our extrapolation
// ("corrected") is robust, we can make our simulation more precise with
// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
// collision data:
//
// (keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between
// (1.03763e+20 corrected)
// (keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between
// (1.09224e+20 corrected)
//
// The extrapolated prediction is very close. If anything, we might have some
// very small losses of structured data (see class StressCacheKey in
// cache_bench_tool.cc) leading to more accurate & more attractive prediction
// with more bits kept.
//
// With the `-sck_randomize` option, we can see that typical workloads like
// above have lower collision probability than "random" cache keys (note:
// offsets still non-randomized) by a modest amount (roughly 20x less
// collision prone than random), which should make us reasonably comfortable
// even in "degenerate" cases (e.g. repeatedly launch a process to generate 1
// file with SstFileWriter):
//
// (rand 40 bits) 197 collisions after 1 x 90 days, est 0.456853 days between
// (4.21372e+18 corrected)
//
// We can see that with more frequent process restarts (all new session IDs),
// we get closer to the "random" cache key performance:
//
// (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
// (5.92931e+18 corrected)
//
// Other tests have been run to validate other conditions behave as expected,
// never behaving "worse than random" unless we start chopping off structured
// data.
//
// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
// that only arise when a new process is started, the chance of any cache key
// collisions in a giant fleet of machines is negligible. Especially when
// processes live for hours or days, the chance of a cache key collision is
// likely more plausibly due to bad hardware than to bad luck in random
// session ID data. Software defects are surely more likely to cause
// corruption than both of those.
//
// TODO: Nevertheless / regardless, an efficient way to detect (and thus
// quantify) block cache corruptions, including collisions, should be added.
OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                       const std::string &db_session_id,
                                       uint64_t file_number,

db/db_block_cache_test.cc

@@ -9,17 +9,26 @@
#include <cstdlib>
#include <functional>
#include <memory>
#include <unordered_set>

#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "cache/lru_cache.h"
#include "db/column_family.h"
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "env/unique_id_gen.h"
#include "port/stack_trace.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/unique_id_impl.h"
#include "util/compression.h"
#include "util/defer.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/random.h"
#include "utilities/fault_injection_fs.h"
@@ -1714,6 +1723,238 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

class CacheKeyTest : public testing::Test {
 public:
  void SetupStableBase() {
    // Like SemiStructuredUniqueIdGen::GenerateNext
    tp_.db_session_id = EncodeSessionId(
        base_session_upper_, base_session_lower_ ^ session_counter_);
    tp_.db_id = ToString(db_id_);
    tp_.orig_file_number = file_number_;
    bool is_stable;
    std::string cur_session_id = "";  // ignored
    uint64_t cur_file_number = 42;    // ignored
    BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
                                       file_size_, &base_cache_key_,
                                       &is_stable);
    ASSERT_TRUE(is_stable);
  }
  CacheKey WithOffset(uint64_t offset) {
    return BlockBasedTable::GetCacheKey(base_cache_key_,
                                        BlockHandle(offset, /*size*/ 5));
  }

 protected:
  OffsetableCacheKey base_cache_key_;
  TableProperties tp_;
  uint64_t file_size_ = 0;
  uint64_t base_session_upper_ = 0;
  uint64_t base_session_lower_ = 0;
  uint64_t session_counter_ = 0;
  uint64_t file_number_ = 0;
  uint64_t db_id_ = 0;
};

namespace {
template <typename T>
int CountBitsDifferent(const T& t1, const T& t2) {
  int diff = 0;
  const uint8_t* p1 = reinterpret_cast<const uint8_t*>(&t1);
  const uint8_t* p2 = reinterpret_cast<const uint8_t*>(&t2);
  static_assert(sizeof(*p1) == 1, "Expecting uint8_t byte");
  for (size_t i = 0; i < sizeof(T); ++i) {
    diff += BitsSetToOne(p1[i] ^ p2[i]);
  }
  return diff;
}
}  // namespace

TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
  // We have to generate our own session IDs for simulation purposes in other
  // tests. Here we verify that the DBImpl implementation seems to match
  // our construction here, by using lowest XORed-in bits for "session
  // counter."
  std::string session_id1 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
  std::string session_id2 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
  uint64_t upper1, upper2, lower1, lower2;
  ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
  ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));
  // Because generated in same process
  ASSERT_EQ(upper1, upper2);
  // Unless we generate > 4 billion session IDs in this process...
  ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
  // But they must be different somewhere
  ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
}

TEST_F(CacheKeyTest, StandardEncodingLimit) {
  base_session_upper_ = 1234;
  base_session_lower_ = 5678;
  session_counter_ = 42;
  file_number_ = 42;
  db_id_ = 1234;

  file_size_ = 42;
  SetupStableBase();
  CacheKey ck1;
  ASSERT_TRUE(ck1.IsEmpty());
  ck1 = WithOffset(0);
  ASSERT_FALSE(ck1.IsEmpty());

  // Should use same encoding
  file_size_ = BlockBasedTable::kMaxFileSizeStandardEncoding;
  SetupStableBase();
  CacheKey ck2 = WithOffset(0);
  ASSERT_EQ(CountBitsDifferent(ck1, ck2), 0);

  // Should use different encoding
  ++file_size_;
  SetupStableBase();
  CacheKey ck3 = WithOffset(0);
  ASSERT_GT(CountBitsDifferent(ck2, ck3), 0);
}

TEST_F(CacheKeyTest, Encodings) {
  // Claim from cache_key.cc:
  // In fact, if our SST files are all < 4TB (see
  // BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
  // in a single process are guaranteed to have unique cache keys, unless/until
  // number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in
  // a single process and 64 trillion files generated.
  //
  // We can generalize that. For
  // * z bits in maximum file size
  // * n bits in maximum file number
  // * s bits in maximum session counter
  // uniqueness is guaranteed at least when all of these hold:
  // * z + n + s <= 121 (128 - 2 meta + 2 offset trim - (8-1) byte granularity
  //   in encoding)
  // * n + s <= 86 (encoding limitation)
  // * s <= 62 (because of 2-bit metadata)
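  // As a concrete instance of these bounds: s = 0 (a single session), n = 64,
  // and z = 57 (file sizes up to 2**57 bytes) satisfy all three, since
  // 57 + 64 + 0 = 121 and 64 + 0 <= 86.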
  // We can verify this indirectly by how input bits get into the cache key,
  // but we have to be mindful that for sufficiently large file sizes,
  // different encodings might be used. But for cases mixing large and small
  // files, we have to verify uniqueness between encodings.

  // Going through all combinations would be a little expensive, so we test
  // only one random "stripe" of the configuration space per run.
  constexpr uint32_t kStripeBits = 8;
  constexpr uint32_t kStripeMask = (uint32_t{1} << kStripeBits) - 1;

  // Also cycle through stripes on repeated runs (not thread safe)
  static uint32_t stripe =
      static_cast<uint32_t>(std::random_device{}()) & kStripeMask;
  stripe = (stripe + 1) & kStripeMask;
  fprintf(stderr, "%u\n", stripe);

  // We are going to randomly initialize some values which *should* not affect
  // the result
  Random64 r{std::random_device{}()};

  int max_num_encodings = 0;
  uint32_t config_num = 0;
  uint32_t session_counter_bits, file_number_bits, max_file_size_bits;

  // Inner loop body, used later in a loop over configurations
  auto TestConfig = [&]() {
    base_session_upper_ = r.Next();
    base_session_lower_ = r.Next();
    session_counter_ = r.Next();
    if (session_counter_bits < 64) {
      // Avoid shifting UB
      session_counter_ = session_counter_ >> 1 >> (63 - session_counter_bits);
    }
    file_number_ = r.Next() >> (64 - file_number_bits);
    // Need two bits set to avoid temporary zero below
    if (BitsSetToOne(file_number_) < 2) {
      file_number_ = 3;
    }
    db_id_ = r.Next();

    // Work-around clang-analyzer which thinks empty last_base is garbage
    CacheKey last_base = CacheKey::CreateUniqueForProcessLifetime();
    std::unordered_set<std::string> seen;
    int num_encodings = 0;

    // Loop over encodings by increasing file size bits
    for (uint32_t file_size_bits = 1; file_size_bits <= max_file_size_bits;
         ++file_size_bits) {
      file_size_ = uint64_t{1} << (file_size_bits - 1);
      SetupStableBase();
      CacheKey new_base = WithOffset(0);
      if (CountBitsDifferent(last_base, new_base) == 0) {
        // Same as previous encoding
        continue;
      }
      // New encoding
      ++num_encodings;
      ASSERT_TRUE(seen.insert(new_base.AsSlice().ToString()).second);
      last_base = new_base;
      for (uint32_t i = 0; i < file_size_bits; ++i) {
        CacheKey ck = WithOffset(uint64_t{1} << i);
        if (i < 2) {
          // These cases are not relevant and optimized by dropping two
          // lowest bits because there's always at least 5 bytes between
          // blocks.
          ASSERT_EQ(CountBitsDifferent(ck, new_base), 0);
        } else {
          // Normal case
          // 1 bit different from base and never been seen implies the bit
          // is encoded into cache key without overlapping other structured
          // data.
          ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
          ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
        }
      }
      for (uint32_t i = 0; i < session_counter_bits; ++i) {
        SaveAndRestore<uint64_t> tmp(&session_counter_,
                                     session_counter_ ^ (uint64_t{1} << i));
        SetupStableBase();
        CacheKey ck = WithOffset(0);
        ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
        ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
      }
      for (uint32_t i = 0; i < file_number_bits; ++i) {
        SaveAndRestore<uint64_t> tmp(&file_number_,
                                     file_number_ ^ (uint64_t{1} << i));
        SetupStableBase();
        CacheKey ck = WithOffset(0);
        ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
        ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
      }
      max_num_encodings = std::max(max_num_encodings, num_encodings);
    }
  };

  // Loop over configurations and test those in stripe
  for (session_counter_bits = 0; session_counter_bits <= 62;
       ++session_counter_bits) {
    uint32_t max_file_number_bits =
        std::min(uint32_t{64}, uint32_t{86} - session_counter_bits);
    // Start with 2 to avoid file_number_ == 0 in testing
    for (file_number_bits = 2; file_number_bits <= max_file_number_bits;
         ++file_number_bits) {
      uint32_t max_max_file_size_bits =
          std::min(uint32_t{64},
                   uint32_t{121} - file_number_bits - session_counter_bits);
      for (max_file_size_bits = 1; max_file_size_bits <= max_max_file_size_bits;
           ++max_file_size_bits) {
        if ((config_num++ & kStripeMask) == stripe) {
          TestConfig();
        }
      }
    }
  }

  // Make sure the current implementation is exercised
  ASSERT_EQ(max_num_encodings, 4);
}

INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
                        ::testing::Combine(::testing::Bool(),
                                           ::testing::Bool()));
