From c8c8104d7ef5451c8ae9d5073c4f01dc99987acf Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Thu, 14 Feb 2019 10:16:12 -0800 Subject: [PATCH] Dictionary compression for files written by SstFileWriter (#4978) Summary: If `CompressionOptions::max_dict_bytes` and/or `CompressionOptions::zstd_max_train_bytes` are set, `SstFileWriter` will now generate files respecting those options. I refactored the logic a bit for deciding when to use dictionary compression. Previously we plumbed `is_bottommost_level` down to the table builder and used that. However it was kind of confusing in `SstFileWriter`'s context since we don't know what level the file will be ingested to. Instead, now the higher-level callers (e.g., flush, compaction, file writer) are responsible for building the right `CompressionOptions` to give the table builder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4978 Differential Revision: D14060763 Pulled By: ajkr fbshipit-source-id: dc802c327896df2b319dc162d6acc82b9cdb452a --- HISTORY.md | 1 + db/builder.cc | 15 ++++++++++----- db/builder.h | 3 +-- db/compaction.cc | 6 ++++++ db/compaction_job.cc | 2 +- db/external_sst_file_test.cc | 31 ++++++++++++++++++++++++++++++ table/block_based_table_builder.cc | 12 ++++-------- table/block_based_table_builder.h | 4 +--- table/block_based_table_factory.cc | 1 - table/table_builder.h | 4 +--- 10 files changed, 56 insertions(+), 23 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 13d09aa40..42b6e8b5f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * Add support for trace sampling. * Enable properties block checksum verification for block-based tables. * For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries. +* Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`. ### Public API Change * Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped. diff --git a/db/builder.cc b/db/builder.cc index 678b7a449..a60eb7ece 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -49,8 +49,7 @@ TableBuilder* NewTableBuilder( WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, int level, const bool skip_filters, const uint64_t creation_time, - const uint64_t oldest_key_time, const bool is_bottommost_level, - const uint64_t target_file_size) { + const uint64_t oldest_key_time, const uint64_t target_file_size) { assert((column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == column_family_name.empty()); @@ -59,7 +58,7 @@ TableBuilder* NewTableBuilder( int_tbl_prop_collector_factories, compression_type, compression_opts, skip_filters, column_family_name, level, creation_time, oldest_key_time, - is_bottommost_level, target_file_size), + target_file_size), column_family_id, file); } @@ -106,6 +105,11 @@ Status BuildTable( if (iter->Valid() || !range_del_agg->IsEmpty()) { TableBuilder* builder; std::unique_ptr file_writer; + // Currently we only enable dictionary compression during compaction to the + // bottommost level. + CompressionOptions compression_opts_for_flush(compression_opts); + compression_opts_for_flush.max_dict_bytes = 0; + compression_opts_for_flush.zstd_max_train_bytes = 0; { std::unique_ptr file; #ifndef NDEBUG @@ -128,8 +132,9 @@ Status BuildTable( builder = NewTableBuilder( ioptions, mutable_cf_options, internal_comparator, int_tbl_prop_collector_factories, column_family_id, - column_family_name, file_writer.get(), compression, compression_opts, - level, false /* skip_filters */, creation_time, oldest_key_time); + column_family_name, file_writer.get(), compression, + compression_opts_for_flush, level, false /* skip_filters */, + creation_time, oldest_key_time); } MergeHelper merge(env, internal_comparator.user_comparator(), diff --git a/db/builder.h b/db/builder.h index 84d9f941c..95985a558 100644 --- a/db/builder.h +++ b/db/builder.h @@ -49,8 +49,7 @@ TableBuilder* NewTableBuilder( WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, int level, const bool skip_filters = false, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const bool is_bottommost_level = false, - const uint64_t target_file_size = 0); + const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of diff --git a/db/compaction.cc b/db/compaction.cc index a22379258..f8805376f 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -250,6 +250,12 @@ Compaction::Compaction(VersionStorageInfo* vstorage, if (max_subcompactions_ == 0) { max_subcompactions_ = immutable_cf_options_.max_subcompactions; } + if (!bottommost_level_) { + // Currently we only enable dictionary compression during compaction to the + // bottommost level. + output_compression_opts_.max_dict_bytes = 0; + output_compression_opts_.zstd_max_train_bytes = 0; + } #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index c8000ae8b..72380f1eb 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1501,7 +1501,7 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->output_compression(), sub_compact->compaction->output_compression_opts(), sub_compact->compaction->output_level(), skip_filters, - output_file_creation_time, 0 /* oldest_key_time */, bottommost_level_, + output_file_creation_time, 0 /* oldest_key_time */, sub_compact->compaction->max_output_file_size())); LogFlush(db_options_.info_log); return s; diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index a4e4dd326..e2ef64ce8 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -2318,6 +2318,37 @@ TEST_F(ExternalSSTFileTest, SkipBloomFilter) { } } +TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) { + if (!ZSTD_Supported()) { + return; + } + const int kNumEntries = 1 << 10; + const int kNumBytesPerEntry = 1 << 10; + Options options = CurrentOptions(); + options.compression = kZSTD; + options.compression_opts.max_dict_bytes = 1 << 14; // 16KB + options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB + DestroyAndReopen(options); + + std::atomic num_compression_dicts(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + [&](void* /* arg */) { + ++num_compression_dicts; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::vector> random_data; + for (int i = 0; i < kNumEntries; i++) { + std::string val; + test::RandomString(&rnd, kNumBytesPerEntry, &val); + random_data.emplace_back(Key(i), std::move(val)); + } + ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data))); + ASSERT_EQ(1, num_compression_dicts); +} + TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index e8e360092..071795430 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -317,7 +317,6 @@ struct BlockBasedTableBuilder::Rep { const std::string& column_family_name; uint64_t creation_time = 0; uint64_t oldest_key_time = 0; - const bool is_bottommost_level; const uint64_t target_file_size; std::vector> table_properties_collectors; @@ -331,8 +330,7 @@ struct BlockBasedTableBuilder::Rep { const CompressionType _compression_type, const CompressionOptions& _compression_opts, const bool skip_filters, const std::string& _column_family_name, const uint64_t _creation_time, - const uint64_t _oldest_key_time, const bool _is_bottommost_level, - const uint64_t _target_file_size) + const uint64_t _oldest_key_time, const uint64_t _target_file_size) : ioptions(_ioptions), moptions(_moptions), table_options(table_opt), @@ -356,7 +354,7 @@ struct BlockBasedTableBuilder::Rep { compression_dict(), compression_ctx(_compression_type), verify_dict(), - state((_is_bottommost_level && _compression_opts.max_dict_bytes > 0) + state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered : State::kUnbuffered), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && @@ -369,7 +367,6 @@ struct BlockBasedTableBuilder::Rep { column_family_name(_column_family_name), creation_time(_creation_time), oldest_key_time(_oldest_key_time), - is_bottommost_level(_is_bottommost_level), target_file_size(_target_file_size) { if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { @@ -421,8 +418,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters, const std::string& column_family_name, const uint64_t creation_time, - const uint64_t oldest_key_time, const bool is_bottommost_level, - const uint64_t target_file_size) { + const uint64_t oldest_key_time, const uint64_t target_file_size) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { @@ -439,7 +435,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( internal_comparator, int_tbl_prop_collector_factories, column_family_id, file, compression_type, compression_opts, skip_filters, column_family_name, creation_time, - oldest_key_time, is_bottommost_level, target_file_size); + oldest_key_time, target_file_size); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 127601474..b82bec16f 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -47,9 +47,7 @@ class BlockBasedTableBuilder : public TableBuilder { const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters, const std::string& column_family_name, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, - const bool is_bottommost_level = false, - const uint64_t target_file_size = 0); + const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 48f5cc1d6..35aeba252 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -219,7 +219,6 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( table_builder_options.column_family_name, table_builder_options.creation_time, table_builder_options.oldest_key_time, - table_builder_options.is_bottommost_level, table_builder_options.target_file_size); return table_builder; diff --git a/table/table_builder.h b/table/table_builder.h index ff65636fc..2ec06f773 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -77,7 +77,7 @@ struct TableBuilderOptions { const CompressionOptions& _compression_opts, bool _skip_filters, const std::string& _column_family_name, int _level, const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, - bool _is_bottommost_level = false, const uint64_t _target_file_size = 0) + const uint64_t _target_file_size = 0) : ioptions(_ioptions), moptions(_moptions), internal_comparator(_internal_comparator), @@ -89,7 +89,6 @@ struct TableBuilderOptions { level(_level), creation_time(_creation_time), oldest_key_time(_oldest_key_time), - is_bottommost_level(_is_bottommost_level), target_file_size(_target_file_size) {} const ImmutableCFOptions& ioptions; const MutableCFOptions& moptions; @@ -103,7 +102,6 @@ struct TableBuilderOptions { int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; const int64_t oldest_key_time; - const bool is_bottommost_level; const uint64_t target_file_size; };