@ -1644,44 +1644,41 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
const size_t kSampleBytes = r - > compression_opts . zstd_max_train_bytes > 0
const size_t kSampleBytes = r - > compression_opts . zstd_max_train_bytes > 0
? r - > compression_opts . zstd_max_train_bytes
? r - > compression_opts . zstd_max_train_bytes
: r - > compression_opts . max_dict_bytes ;
: r - > compression_opts . max_dict_bytes ;
const size_t kNumBlocksBuffered = r - > data_block_and_keys_buffers . size ( ) ;
// Abstract algebra teaches us that a finite cyclic group (such as the
// additive group of integers modulo N) can be generated by a number that is
// coprime with N. Since N is variable (number of buffered data blocks), we
// must then pick a prime number in order to guarantee coprimeness with any N.
//
// One downside of this approach is the spread will be poor when
// `kPrimeGeneratorRemainder` is close to zero or close to
// `kNumBlocksBuffered`.
//
// Picked a random number between one and one trillion and then chose the
// next prime number greater than or equal to it.
const uint64_t kPrimeGenerator = 545055921143ull ;
// Can avoid repeated division by just adding the remainder repeatedly.
const size_t kPrimeGeneratorRemainder = static_cast < size_t > (
kPrimeGenerator % static_cast < uint64_t > ( kNumBlocksBuffered ) ) ;
const size_t kInitSampleIdx = kNumBlocksBuffered / 2 ;
// If buffer size is reasonable, we pre-generate a permutation to enforce
// uniqueness. This prevents wasting samples on duplicates, which is
// particularly likely when not many blocks were buffered.
std : : vector < uint16_t > data_block_order ;
size_t data_block_order_idx = 0 ;
if ( r - > data_block_and_keys_buffers . size ( ) < = ( ( 1 < < 16 ) - 1 ) ) {
data_block_order . resize ( r - > data_block_and_keys_buffers . size ( ) ) ;
std : : iota ( data_block_order . begin ( ) , data_block_order . end ( ) ,
static_cast < uint16_t > ( 0 ) ) ;
// We could be smarter and interleave the shuffling and sample appending
// logic. Then we could terminate as soon as `kSampleBytes` is reached,
// saving some shuffling computation.
RandomShuffle ( data_block_order . begin ( ) , data_block_order . end ( ) ,
static_cast < uint32_t > ( r - > creation_time ) ) ;
}
Random64 generator { r - > creation_time } ;
std : : string compression_dict_samples ;
std : : string compression_dict_samples ;
std : : vector < size_t > compression_dict_sample_lens ;
std : : vector < size_t > compression_dict_sample_lens ;
if ( ! r - > data_block_and_keys_buffers . empty ( ) ) {
size_t buffer_idx = kInitSampleIdx ;
while ( ( data_block_order . empty ( ) | |
for ( size_t i = 0 ;
data_block_order_idx < data_block_order . size ( ) ) & &
i < kNumBlocksBuffered & & compression_dict_samples . size ( ) < kSampleBytes ;
compression_dict_samples . size ( ) < kSampleBytes ) {
+ + i ) {
size_t rand_idx ;
if ( data_block_order . empty ( ) ) {
rand_idx = static_cast < size_t > (
generator . Uniform ( r - > data_block_and_keys_buffers . size ( ) ) ) ;
} else {
rand_idx = data_block_order [ data_block_order_idx ] ;
+ + data_block_order_idx ;
}
size_t copy_len =
size_t copy_len =
std : : min ( kSampleBytes - compression_dict_samples . size ( ) ,
std : : min ( kSampleBytes - compression_dict_samples . size ( ) ,
r - > data_block_and_keys_buffers [ rand _idx ] . first . size ( ) ) ;
r - > data_block_and_keys_buffers [ buffer_idx ] . first . size ( ) ) ;
compression_dict_samples . append (
compression_dict_samples . append (
r - > data_block_and_keys_buffers [ rand _idx ] . first , 0 , copy_len ) ;
r - > data_block_and_keys_buffers [ buffer_idx ] . first , 0 , copy_len ) ;
compression_dict_sample_lens . emplace_back ( copy_len ) ;
compression_dict_sample_lens . emplace_back ( copy_len ) ;
buffer_idx + = kPrimeGeneratorRemainder ;
if ( buffer_idx > = kNumBlocksBuffered ) {
buffer_idx - = kNumBlocksBuffered ;
}
}
}
}