@@ -702,15 +702,18 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       sub_compact->compaction->mutable_cf_options();
 
   // To build compression dictionary, we sample the first output file, assuming
-  // it'll reach the maximum length, and then use the dictionary for compressing
-  // subsequent output files. The dictionary may be less than max_dict_bytes if
-  // the first output file's length is less than the maximum.
+  // it'll reach the maximum length. We optionally pass these samples through
+  // zstd's dictionary trainer, or just use them directly. Then, the dictionary
+  // is used for compressing subsequent output files in the same subcompaction.
+  const bool kUseZstdTrainer =
+      cfd->ioptions()->compression_opts.zstd_max_train_bytes > 0;
+  const size_t kSampleBytes =
+      kUseZstdTrainer ? cfd->ioptions()->compression_opts.zstd_max_train_bytes
+                      : cfd->ioptions()->compression_opts.max_dict_bytes;
   const int kSampleLenShift = 6;  // 2^6 = 64-byte samples
   std::set<size_t> sample_begin_offsets;
-  if (bottommost_level_ &&
-      cfd->ioptions()->compression_opts.max_dict_bytes > 0) {
-    const size_t kMaxSamples =
-        cfd->ioptions()->compression_opts.max_dict_bytes >> kSampleLenShift;
+  if (bottommost_level_ && kSampleBytes > 0) {
+    const size_t kMaxSamples = kSampleBytes >> kSampleLenShift;
     const size_t kOutFileLen = mutable_cf_options->MaxFileSizeForLevel(
         compact_->compaction->output_level());
     if (kOutFileLen != port::kMaxSizet) {
@@ -780,11 +783,11 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   }
   const auto& c_iter_stats = c_iter->iter_stats();
   auto sample_begin_offset_iter = sample_begin_offsets.cbegin();
-  // data_begin_offset and compression_dict are only valid while generating
+  // data_begin_offset and dict_sample_data are only valid while generating
   // dictionary from the first output file.
   size_t data_begin_offset = 0;
-  std::string compression_dict;
-  compression_dict.reserve(cfd->ioptions()->compression_opts.max_dict_bytes);
+  std::string dict_sample_data;
+  dict_sample_data.reserve(kSampleBytes);
 
   while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
     // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
@@ -856,7 +859,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
           data_elmt_copy_len =
               data_end_offset - (data_begin_offset + data_elmt_copy_offset);
         }
-        compression_dict.append(&data_elmt.data()[data_elmt_copy_offset],
+        dict_sample_data.append(&data_elmt.data()[data_elmt_copy_offset],
                                 data_elmt_copy_len);
         if (sample_end_offset > data_end_offset) {
           // Didn't finish sample. Try to finish it with the next data_elmt.
@@ -911,9 +914,15 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       RecordDroppedKeys(range_del_out_stats,
                         &sub_compact->compaction_job_stats);
       if (sub_compact->outputs.size() == 1) {
-        // Use dictionary from first output file for compression of subsequent
-        // files.
-        sub_compact->compression_dict = std::move(compression_dict);
+        // Use samples from first output file to create dictionary for
+        // compression of subsequent files.
+        if (kUseZstdTrainer) {
+          sub_compact->compression_dict = ZSTD_TrainDictionary(
+              dict_sample_data, kSampleLenShift,
+              cfd->ioptions()->compression_opts.max_dict_bytes);
+        } else {
+          sub_compact->compression_dict = std::move(dict_sample_data);
+        }
       }
     }
   }