@@ -79,31 +79,52 @@ static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
DEFINE_bool(use_clock_cache, false, "");

// ## BEGIN stress_cache_key sub-tool options ##
// See class StressCacheKey below.
DEFINE_bool(stress_cache_key, false,
            "If true, run cache key stress test instead");
DEFINE_uint32(
    sck_files_per_day, 2500000,
    "(-stress_cache_key) Simulated files generated per simulated day");
// NOTE: Giving each run a specified lifetime, rather than e.g. "until
// first collision", ensures equal skew from start-up, when collisions are
// less likely.
DEFINE_uint32(sck_days_per_run, 90,
              "(-stress_cache_key) Number of days to simulate in each run");
// NOTE: The number of observed collisions directly affects the relative
// accuracy of the predicted probabilities. 15 observations should be well
// within factor-of-2 accuracy.
DEFINE_uint32(
    sck_min_collision, 15,
    "(-stress_cache_key) Keep running until this many collisions seen");
// sck_file_size_mb can be thought of as the average file size. The simulation
// is not precise enough to care about the distribution of file sizes; other
// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
// indicate the distribution only makes a small difference (e.g. < 2x factor).
DEFINE_uint32(
    sck_file_size_mb, 32,
    "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
DEFINE_uint32(sck_reopen_nfiles, 100,
              "(-stress_cache_key) Simulate DB re-open average every n files");
DEFINE_uint32(sck_restarts_per_day, 24,
              "(-stress_cache_key) Average simulated process restarts per day "
              "(across DBs)");
DEFINE_uint32(
    sck_db_count, 100,
    "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
DEFINE_uint32(
    sck_table_bits, 20,
    "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
// sck_keep_bits being well below the full 128 bits amplifies the collision
// probability so that the true probability can be estimated through observed
// collisions. (More explanation below.)
DEFINE_uint32(
    sck_keep_bits, 50,
    "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
// sck_randomize is used to validate whether the cache key is performing
// "better than random." Even with this setting, file offsets are not
// randomized.
DEFINE_bool(sck_randomize, false,
            "(-stress_cache_key) Randomize (hash) cache key");
// See https://github.com/facebook/rocksdb/pull/9058
DEFINE_bool(sck_footer_unique_id, false,
            "(-stress_cache_key) Simulate using proposed footer unique id");
// ## END stress_cache_key sub-tool options ##
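//
// Example invocation (our sketch; it uses only the flags defined above, with
// non-default values chosen for a quicker, less precise run):
//   ./cache_bench -stress_cache_key -sck_keep_bits=40 -sck_min_collision=5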
@@ -583,20 +604,97 @@ class CacheBench {
  }
};

// TODO: better description (see PR #9126 for some info)
// cache_bench -stress_cache_key is an independent embedded tool for
// estimating the probability of CacheKey collisions through simulation.
// At a high level, it simulates generating SST files over many months,
// keeping them in the DB and/or cache for some lifetime while staying
// under resource caps, and checking for any cache key collisions that
// arise among the set of live files. For efficient simulation, we make
// some simplifying "pessimistic" assumptions (that only increase the
// chance of the simulation reporting a collision relative to the chance
// of collision in practice):
// * Every generated file has a cache entry for every byte offset in the
// file (a contiguous range of cache keys).
// * All of every file is cached for its entire lifetime. (Here "lifetime"
// is technically the union of DB and Cache lifetime, though we only
// model a generous DB lifetime, where space usage is always maximized.
// In an effective Cache, lifetime in cache can only substantially exceed
// lifetime in DB if there is little cache activity; cache activity is
// required to hit cache key collisions.)
//
// It would be possible to track an exact set of cache key ranges for the
// set of live files, but we would have no hope of observing collisions
// (overlap in live files) in our simulation. We need to employ some way
// of amplifying collision probability that allows us to predict the real
// collision probability by extrapolation from observed collisions. Our
// basic approach is to reduce each cache key range down to some smaller
// number of bits, limited to bits that are shared over the whole range.
// Now we can observe collisions using a set of smaller stripped-down
// (reduced) cache keys. Let's do some case analysis to understand why this
// works:
// * No collision in reduced key - because the reduction is a pure function,
// this implies no collision in the full keys.
// * Collision detected between two reduced keys - either
//   * The reduction has dropped some structured uniqueness info (from one of
// session counter or file number; file offsets are never materialized here).
// This can only artificially inflate the observed and extrapolated collision
// probabilities. We only have to worry about this in designing the reduction.
//   * The reduction has preserved all the structured uniqueness in the cache
// key, which means either
//     * REJECTED: We have a uniqueness bug in generating cache keys, where
// structured uniqueness info should have been different but isn't. In such a
// case, increasing by 1 the number of bits kept after reduction would not
// reduce observed probabilities by half. (In our observations, the
// probabilities are reduced approximately by half.)
//     * ACCEPTED: The lost unstructured uniqueness in the key determines the
// probability that an observed collision would imply an overlap in ranges.
// In short, dropping n bits from the key would increase collision probability
// by 2**n, assuming those n bits have full entropy in unstructured uniqueness.
//
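// As a worked example of the amplification (our arithmetic, with the default
// -sck_keep_bits=50): 128 - 50 = 78 bits are dropped, so assuming full
// entropy in the dropped unstructured bits, collisions among reduced keys
// are roughly 2**78 times more likely than among full keys, which is what
// makes astronomically rare events observable in a ~90 day run.
//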
// But we also have to account for the key ranges based on file size. If file
// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
// "ranges", we know from other simulations (see
// https://github.com/pdillinger/unique_id/) that it is roughly equivalent
// (less than 2x higher collision probability) to using a cache key of size
// 128 - b bits for the whole file. (This is the only place we make an
// "optimistic" assumption, which is more than offset by the real
// implementation stripping off the 2 lower bits from block byte offsets for
// cache keys. The simulation assumes byte offsets, which is net pessimistic.)
//
// So to accept the extrapolation as valid, we need to be confident that all
// "lost" bits, excluding those covered by file offset, are full entropy.
// Recall that we have assumed (verifiably, safely) that other structured data
// (file number and session counter) are kept, not lost. Based on the
// implementation comments for OffsetableCacheKey, the only potential hole here
// is that we only have ~103 bits of entropy in "all new" session IDs, and in
// extreme cases, there might be only 1 DB ID. However, because the upper ~39
// bits of session ID are hashed, the combination of file number and file
// offset only has to add up to 25 bits (or more) to ensure full entropy in
// unstructured uniqueness lost in the reduction. A typical file size of 32 MB
// suffices (at least for simulation purposes, where we assume each file
// offset occupies a cache key).
//
// Example results in comments on OffsetableCacheKey.
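//
// As a worked instance of the correction factor (our arithmetic, from the
// multiplier_ formula in Run() below): with default -sck_keep_bits=50 and
// -sck_file_size_mb=32,
//   multiplier_ = 2**(128 - 50) / (32 * 2**20) = 2**78 / 2**25 = 2**53
// so estimated days between collisions in simulation are multiplied by
// roughly 9.0e15 to extrapolate to full 128-bit cache keys.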
class StressCacheKey {
 public:
  void Run() {
    if (FLAGS_sck_footer_unique_id) {
      // Proposed footer unique IDs are DB-independent and session-independent
      // (but process-dependent), which is most easily simulated here by
      // assuming 1 DB and (later below) no session resets without process
      // reset.
      FLAGS_sck_db_count = 1;
    }

    // Describe the simulated workload
    uint64_t mb_per_day =
        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
    printf("Total cache or DBs size: %gTiB  Writing %g MiB/s or %gTiB/day\n",
           FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
               std::pow(2.0, FLAGS_sck_table_bits),
           mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
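    // (With the default flags this prints 32 TiB total, ~926 MiB/s, and
    // ~76.3 TiB/day; our arithmetic, for illustration.)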
    // For extrapolating probability of any collisions from a number of
    // observed collisions
    multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
                  (FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
    printf(
@@ -606,6 +704,9 @@ class StressCacheKey {
    restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day;
    double without_ejection =
        std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day;
    // This should be a lower bound for -sck_randomize, usually a terribly
    // rough lower bound.
    // If observation is worse than this, then something has gone wrong.
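    // (1.414214 approximates sqrt(2), so this is the birthday bound:
    // ~2**(keep_bits / 2) keys expected before any collision. With default
    // flags, 2**25 / 2500000 files per day ~= 13.4 days; our arithmetic.)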
    printf(
        "Without ejection, expect random collision after %g days (%g "
        "corrected)\n",
@@ -613,30 +714,36 @@ class StressCacheKey {
    double with_full_table =
        std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
        FLAGS_sck_files_per_day;
    // This is an alternate lower bound for -sck_randomize, usually pretty
    // accurate. Our cache keys should usually perform "better than random"
    // but always no worse. (If observation is substantially worse than this,
    // then something has gone wrong.)
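    // (With default flags: 2**(50 - 20) / 2500000 ~= 429 days between
    // collisions expected for random keys; our arithmetic.)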
    printf(
        "With ejection and full table, expect random collision after %g "
        "days (%g corrected)\n",
        with_full_table, with_full_table * multiplier_);
    collisions_ = 0;
    // Run until sufficient number of observed collisions.
    for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
      RunOnce();
      if (collisions_ == 0) {
        printf(
            "No collisions after %d x %u days "
            "\n",
            i, FLAGS_sck_days_per_run);
      } else {
        double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
        printf("%" PRIu64
               " collisions after %d x %u days, est %g days between (%g "
               "corrected)\n",
               collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
      }
    }
  }

  void RunOnce() {
    // Re-initialize simulated state
    const size_t db_count = FLAGS_sck_db_count;
    dbs_.reset(new TableProperties[db_count]{});
    const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
@@ -644,7 +751,11 @@ class StressCacheKey {
    if (FLAGS_sck_keep_bits > 64) {
      FLAGS_sck_keep_bits = 64;
    }
    // Details of which bits are dropped in reduction
    uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
    // Shift away fewer potential file number bits (b) than potential
    // session counter bits (a).
    uint32_t shift_away_b = shift_away / 3;
    uint32_t shift_away_a = shift_away - shift_away_b;
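    // (With default -sck_keep_bits=50: shift_away = 14, so b drops
    // 14 / 3 = 4 potential file number bits and a drops the remaining 10
    // session counter bits; our arithmetic.)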
@@ -655,62 +766,78 @@ class StressCacheKey {
    Random64 r{std::random_device{}()};

    uint64_t max_file_count =
        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
    uint32_t report_count = 0;
    uint32_t collisions_this_run = 0;
    size_t db_i = 0;

    for (uint64_t file_count = 1; file_count <= max_file_count;
         ++file_count, ++db_i) {
      // Round-robin through DBs (this is faster than %)
      if (db_i >= db_count) {
        db_i = 0;
      }
      // Any other periodic actions before simulating next file
      if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
        ResetSession(db_i);
      } else if (r.OneIn(restart_nfiles_)) {
        ResetProcess();
      }
      // Simulate next file
      OffsetableCacheKey ock;
      dbs_[db_i].orig_file_number += 1;
      // Skip some file numbers, as if for other file kinds. Exception: with
      // footer unique ID, orig_file_number tracks the process-wide count of
      // generated SST files, so don't skip.
      if (!FLAGS_sck_footer_unique_id) {
        dbs_[db_i].orig_file_number += (r.Next() & 3);
      }
      bool is_stable;
      BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
                                         /* ignored */ 42, file_size, &ock,
                                         &is_stable);
      assert(is_stable);
      // Get a representative cache key, which later we analytically generalize
      // to a range.
      CacheKey ck = ock.WithOffset(0);
      uint64_t reduced_key;
      if (FLAGS_sck_randomize) {
        reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
      } else if (FLAGS_sck_footer_unique_id) {
        // Special case: keep only file number, not session counter
        uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
        reduced_key = (uint64_t{a} << 32) + b;
      } else {
        // Try to keep file number and session counter (shift away other bits)
        uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
        reduced_key = (uint64_t{a} << 32) + b;
      }
      if (reduced_key == 0) {
        // Unlikely, but we need to exclude tracking this value because we
        // use it to mean "empty" in the table. This case is OK as long as we
        // don't hit it often.
        printf("Hit Zero!\n");
        file_count--;
        continue;
      }
      uint64_t h =
          NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
      // Skew expected lifetimes, for high (super-Poisson) variance in
      // actual lifetimes.
      size_t pos =
          std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask);
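      // (Taking the min of two uniform draws biases pos toward low indices,
      // so low table slots turn over quickly (short-lived files) while high
      // slots turn over rarely (long-lived files); our gloss of the skew.)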
      if (table_[pos] == reduced_key) {
        collisions_this_run++;
        // Our goal is to predict probability of no collisions, not expected
        // number of collisions. To make the distinction, we have to get rid
        // of observing correlated collisions, which this takes care of:
        ResetProcess();
      } else {
        // Replace (end of lifetime for file that was in this slot)
        table_[pos] = reduced_key;
      }
      if (++report_count == FLAGS_sck_files_per_day) {
@@ -748,6 +875,8 @@ class StressCacheKey {
      ResetSession(i);
    }
    if (FLAGS_sck_footer_unique_id) {
      // For footer unique ID, this tracks process-wide generated SST file
      // count.
      dbs_[0].orig_file_number = 0;
    }
  }