diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3776004ca..bc7c113c6 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1644,44 +1644,41 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
   const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
                                   ? r->compression_opts.zstd_max_train_bytes
                                   : r->compression_opts.max_dict_bytes;
+  const size_t kNumBlocksBuffered = r->data_block_and_keys_buffers.size();
+
+  // Abstract algebra teaches us that a finite cyclic group (such as the
+  // additive group of integers modulo N) can be generated by a number that is
+  // coprime with N. Since N is variable (number of buffered data blocks), we
+  // must then pick a prime number in order to guarantee coprimeness with any N.
+  //
+  // One downside of this approach is the spread will be poor when
+  // `kPrimeGeneratorRemainder` is close to zero or close to
+  // `kNumBlocksBuffered`.
+  //
+  // Picked a random number between one and one trillion and then chose the
+  // next prime number greater than or equal to it.
+  const uint64_t kPrimeGenerator = 545055921143ull;
+  // Can avoid repeated division by just adding the remainder repeatedly.
+  const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
+      kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
+  const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
-  // If buffer size is reasonable, we pre-generate a permutation to enforce
-  // uniqueness. This prevents wasting samples on duplicates, which is
-  // particularly likely when not many blocks were buffered.
-  std::vector<uint16_t> data_block_order;
-  size_t data_block_order_idx = 0;
-  if (r->data_block_and_keys_buffers.size() <= ((1 << 16) - 1)) {
-    data_block_order.resize(r->data_block_and_keys_buffers.size());
-    std::iota(data_block_order.begin(), data_block_order.end(),
-              static_cast<uint16_t>(0));
-    // We could be smarter and interleave the shuffling and sample appending
-    // logic. Then we could terminate as soon as `kSampleBytes` is reached,
-    // saving some shuffling computation.
-    RandomShuffle(data_block_order.begin(), data_block_order.end(),
-                  static_cast<uint32_t>(r->creation_time));
-  }
-
-  Random64 generator{r->creation_time};
   std::string compression_dict_samples;
   std::vector<size_t> compression_dict_sample_lens;
-  if (!r->data_block_and_keys_buffers.empty()) {
-    while ((data_block_order.empty() ||
-            data_block_order_idx < data_block_order.size()) &&
-           compression_dict_samples.size() < kSampleBytes) {
-      size_t rand_idx;
-      if (data_block_order.empty()) {
-        rand_idx = static_cast<size_t>(
-            generator.Uniform(r->data_block_and_keys_buffers.size()));
-      } else {
-        rand_idx = data_block_order[data_block_order_idx];
-        ++data_block_order_idx;
-      }
-      size_t copy_len =
-          std::min(kSampleBytes - compression_dict_samples.size(),
-                   r->data_block_and_keys_buffers[rand_idx].first.size());
-      compression_dict_samples.append(
-          r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
-      compression_dict_sample_lens.emplace_back(copy_len);
+  size_t buffer_idx = kInitSampleIdx;
+  for (size_t i = 0;
+       i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+       ++i) {
+    size_t copy_len =
+        std::min(kSampleBytes - compression_dict_samples.size(),
+                 r->data_block_and_keys_buffers[buffer_idx].first.size());
+    compression_dict_samples.append(
+        r->data_block_and_keys_buffers[buffer_idx].first, 0, copy_len);
+    compression_dict_sample_lens.emplace_back(copy_len);
+
+    buffer_idx += kPrimeGeneratorRemainder;
+    if (buffer_idx >= kNumBlocksBuffered) {
+      buffer_idx -= kNumBlocksBuffered;
     }
   }
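
A quick standalone check of the cyclic-group claim in the new comment: because 545055921143 is prime, its remainder modulo any buffered-block count N (barring an N that is a multiple of the prime itself) is coprime with N, so repeatedly adding that remainder modulo N visits all N block indices exactly once before repeating. The sketch below mirrors the patch's constants and stride arithmetic; `CheckFullCycle` and the sample block counts are hypothetical scaffolding for this demo, not part of the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <vector>

// Returns true iff striding by (kPrimeGenerator % num_blocks) from the
// midpoint touches every index in [0, num_blocks) exactly once, i.e. the
// stride generates the additive group of integers modulo num_blocks.
bool CheckFullCycle(size_t num_blocks) {
  const uint64_t kPrimeGenerator = 545055921143ull;
  const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
      kPrimeGenerator % static_cast<uint64_t>(num_blocks));
  std::vector<bool> visited(num_blocks, false);
  size_t idx = num_blocks / 2;  // kInitSampleIdx in the patch
  for (size_t i = 0; i < num_blocks; ++i) {
    if (visited[idx]) {
      return false;  // cycled back before covering every block
    }
    visited[idx] = true;
    // Same overflow-free advance as the patch: the remainder is less than
    // num_blocks, so one conditional subtraction replaces a modulo.
    idx += kPrimeGeneratorRemainder;
    if (idx >= num_blocks) {
      idx -= num_blocks;
    }
  }
  return true;
}

int main() {
  for (size_t n : {1u, 2u, 7u, 64u, 1000u, 65535u, 1u << 20}) {
    assert(CheckFullCycle(n));
  }
  printf("stride visits every buffered block exactly once\n");
  return 0;
}

This also makes the comment's caveat concrete: when the remainder lands near 0 or near N, consecutive samples come from nearly adjacent blocks, even though coverage of all N blocks remains complete.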
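
One behavioral difference worth flagging in review: the removed code guarded sampling with `if (!r->data_block_and_keys_buffers.empty())`, while the replacement evaluates `kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered)` unconditionally, which divides by zero whenever no data blocks were buffered. Unless callers of EnterUnbuffered() already guarantee a non-empty buffer, an early return right after computing kNumBlocksBuffered (a sketch, not part of the patch) would restore the old safety:

  if (kNumBlocksBuffered == 0) {
    // Nothing was buffered, so there is nothing to sample or flush, and the
    // modulo below would divide by zero.
    return;
  }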