Dictionary compression for files written by SstFileWriter (#4978)

Summary:
If `CompressionOptions::max_dict_bytes` and/or `CompressionOptions::zstd_max_train_bytes` are set, `SstFileWriter` will now generate files respecting those options.

I refactored the logic a bit for deciding when to use dictionary compression. Previously we plumbed `is_bottommost_level` down to the table builder and used that. However, that flag was confusing in `SstFileWriter`'s context, since we don't know what level the file will eventually be ingested to. Instead, the higher-level callers (e.g., flush, compaction, file writer) are now responsible for building the right `CompressionOptions` to give the table builder.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4978

Differential Revision: D14060763

Pulled By: ajkr

fbshipit-source-id: dc802c327896df2b319dc162d6acc82b9cdb452a
main
Andrew Kryczka 5 years ago committed by Facebook Github Bot
parent 4fc442029a
commit c8c8104d7e
  1. 1
      HISTORY.md
  2. 15
      db/builder.cc
  3. 3
      db/builder.h
  4. 6
      db/compaction.cc
  5. 2
      db/compaction_job.cc
  6. 31
      db/external_sst_file_test.cc
  7. 12
      table/block_based_table_builder.cc
  8. 4
      table/block_based_table_builder.h
  9. 1
      table/block_based_table_factory.cc
  10. 4
      table/table_builder.h

@ -11,6 +11,7 @@
* Add support for trace sampling.
* Enable properties block checksum verification for block-based tables.
* For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries.
* Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`.
### Public API Change
* Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped.

@ -49,8 +49,7 @@ TableBuilder* NewTableBuilder(
WritableFileWriter* file, const CompressionType compression_type,
const CompressionOptions& compression_opts, int level,
const bool skip_filters, const uint64_t creation_time,
const uint64_t oldest_key_time, const bool is_bottommost_level,
const uint64_t target_file_size) {
const uint64_t oldest_key_time, const uint64_t target_file_size) {
assert((column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
column_family_name.empty());
@ -59,7 +58,7 @@ TableBuilder* NewTableBuilder(
int_tbl_prop_collector_factories, compression_type,
compression_opts, skip_filters, column_family_name,
level, creation_time, oldest_key_time,
is_bottommost_level, target_file_size),
target_file_size),
column_family_id, file);
}
@ -106,6 +105,11 @@ Status BuildTable(
if (iter->Valid() || !range_del_agg->IsEmpty()) {
TableBuilder* builder;
std::unique_ptr<WritableFileWriter> file_writer;
// Currently we only enable dictionary compression during compaction to the
// bottommost level.
CompressionOptions compression_opts_for_flush(compression_opts);
compression_opts_for_flush.max_dict_bytes = 0;
compression_opts_for_flush.zstd_max_train_bytes = 0;
{
std::unique_ptr<WritableFile> file;
#ifndef NDEBUG
@ -128,8 +132,9 @@ Status BuildTable(
builder = NewTableBuilder(
ioptions, mutable_cf_options, internal_comparator,
int_tbl_prop_collector_factories, column_family_id,
column_family_name, file_writer.get(), compression, compression_opts,
level, false /* skip_filters */, creation_time, oldest_key_time);
column_family_name, file_writer.get(), compression,
compression_opts_for_flush, level, false /* skip_filters */,
creation_time, oldest_key_time);
}
MergeHelper merge(env, internal_comparator.user_comparator(),

@ -49,8 +49,7 @@ TableBuilder* NewTableBuilder(
WritableFileWriter* file, const CompressionType compression_type,
const CompressionOptions& compression_opts, int level,
const bool skip_filters = false, const uint64_t creation_time = 0,
const uint64_t oldest_key_time = 0, const bool is_bottommost_level = false,
const uint64_t target_file_size = 0);
const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0);
// Build a Table file from the contents of *iter. The generated file
// will be named according to number specified in meta. On success, the rest of

@ -250,6 +250,12 @@ Compaction::Compaction(VersionStorageInfo* vstorage,
if (max_subcompactions_ == 0) {
max_subcompactions_ = immutable_cf_options_.max_subcompactions;
}
if (!bottommost_level_) {
// Currently we only enable dictionary compression during compaction to the
// bottommost level.
output_compression_opts_.max_dict_bytes = 0;
output_compression_opts_.zstd_max_train_bytes = 0;
}
#ifndef NDEBUG
for (size_t i = 1; i < inputs_.size(); ++i) {

@ -1501,7 +1501,7 @@ Status CompactionJob::OpenCompactionOutputFile(
sub_compact->compaction->output_compression(),
sub_compact->compaction->output_compression_opts(),
sub_compact->compaction->output_level(), skip_filters,
output_file_creation_time, 0 /* oldest_key_time */, bottommost_level_,
output_file_creation_time, 0 /* oldest_key_time */,
sub_compact->compaction->max_output_file_size()));
LogFlush(db_options_.info_log);
return s;

@ -2318,6 +2318,37 @@ TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
}
}
// Verifies that an SST file produced by SstFileWriter honors the dictionary
// compression settings in CompressionOptions: exactly one compression
// dictionary should be trained and written for the generated file.
TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
// Dictionary training here requires ZSTD; skip silently when the build
// lacks ZSTD support.
if (!ZSTD_Supported()) {
return;
}
const int kNumEntries = 1 << 10;
const int kNumBytesPerEntry = 1 << 10;
Options options = CurrentOptions();
options.compression = kZSTD;
options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
DestroyAndReopen(options);
// Count dictionary writes by hooking the table builder's sync point; each
// hit corresponds to one raw dictionary block being written.
std::atomic<int> num_compression_dicts(0);
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
[&](void* /* arg */) {
++num_compression_dicts;
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
// ~1MB of random (incompressible without a dictionary) key/value data,
// enough to exceed zstd_max_train_bytes and trigger dictionary training.
Random rnd(301);
std::vector<std::pair<std::string, std::string>> random_data;
for (int i = 0; i < kNumEntries; i++) {
std::string val;
test::RandomString(&rnd, kNumBytesPerEntry, &val);
random_data.emplace_back(Key(i), std::move(val));
}
ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data)));
// Exactly one dictionary: the externally written file gets its own.
ASSERT_EQ(1, num_compression_dicts);
// NOTE(review): the sync-point callback is never cleared/disabled here;
// confirm against the full test file that later tests are not affected.
}
TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) {
std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
new FaultInjectionTestEnv(env_));

@ -317,7 +317,6 @@ struct BlockBasedTableBuilder::Rep {
const std::string& column_family_name;
uint64_t creation_time = 0;
uint64_t oldest_key_time = 0;
const bool is_bottommost_level;
const uint64_t target_file_size;
std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;
@ -331,8 +330,7 @@ struct BlockBasedTableBuilder::Rep {
const CompressionType _compression_type,
const CompressionOptions& _compression_opts, const bool skip_filters,
const std::string& _column_family_name, const uint64_t _creation_time,
const uint64_t _oldest_key_time, const bool _is_bottommost_level,
const uint64_t _target_file_size)
const uint64_t _oldest_key_time, const uint64_t _target_file_size)
: ioptions(_ioptions),
moptions(_moptions),
table_options(table_opt),
@ -356,7 +354,7 @@ struct BlockBasedTableBuilder::Rep {
compression_dict(),
compression_ctx(_compression_type),
verify_dict(),
state((_is_bottommost_level && _compression_opts.max_dict_bytes > 0)
state((_compression_opts.max_dict_bytes > 0)
? State::kBuffered
: State::kUnbuffered),
use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
@ -369,7 +367,6 @@ struct BlockBasedTableBuilder::Rep {
column_family_name(_column_family_name),
creation_time(_creation_time),
oldest_key_time(_oldest_key_time),
is_bottommost_level(_is_bottommost_level),
target_file_size(_target_file_size) {
if (table_options.index_type ==
BlockBasedTableOptions::kTwoLevelIndexSearch) {
@ -421,8 +418,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
const CompressionType compression_type,
const CompressionOptions& compression_opts, const bool skip_filters,
const std::string& column_family_name, const uint64_t creation_time,
const uint64_t oldest_key_time, const bool is_bottommost_level,
const uint64_t target_file_size) {
const uint64_t oldest_key_time, const uint64_t target_file_size) {
BlockBasedTableOptions sanitized_table_options(table_options);
if (sanitized_table_options.format_version == 0 &&
sanitized_table_options.checksum != kCRC32c) {
@ -439,7 +435,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
internal_comparator, int_tbl_prop_collector_factories,
column_family_id, file, compression_type, compression_opts,
skip_filters, column_family_name, creation_time,
oldest_key_time, is_bottommost_level, target_file_size);
oldest_key_time, target_file_size);
if (rep_->filter_builder != nullptr) {
rep_->filter_builder->StartBlock(0);

@ -47,9 +47,7 @@ class BlockBasedTableBuilder : public TableBuilder {
const CompressionType compression_type,
const CompressionOptions& compression_opts, const bool skip_filters,
const std::string& column_family_name, const uint64_t creation_time = 0,
const uint64_t oldest_key_time = 0,
const bool is_bottommost_level = false,
const uint64_t target_file_size = 0);
const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0);
// REQUIRES: Either Finish() or Abandon() has been called.
~BlockBasedTableBuilder();

@ -219,7 +219,6 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder(
table_builder_options.column_family_name,
table_builder_options.creation_time,
table_builder_options.oldest_key_time,
table_builder_options.is_bottommost_level,
table_builder_options.target_file_size);
return table_builder;

@ -77,7 +77,7 @@ struct TableBuilderOptions {
const CompressionOptions& _compression_opts, bool _skip_filters,
const std::string& _column_family_name, int _level,
const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0,
bool _is_bottommost_level = false, const uint64_t _target_file_size = 0)
const uint64_t _target_file_size = 0)
: ioptions(_ioptions),
moptions(_moptions),
internal_comparator(_internal_comparator),
@ -89,7 +89,6 @@ struct TableBuilderOptions {
level(_level),
creation_time(_creation_time),
oldest_key_time(_oldest_key_time),
is_bottommost_level(_is_bottommost_level),
target_file_size(_target_file_size) {}
const ImmutableCFOptions& ioptions;
const MutableCFOptions& moptions;
@ -103,7 +102,6 @@ struct TableBuilderOptions {
int level; // what level this table/file is on, -1 for "not set, don't know"
const uint64_t creation_time;
const int64_t oldest_key_time;
const bool is_bottommost_level;
const uint64_t target_file_size;
};

Loading…
Cancel
Save