Add DBOptions::skip_sats_update_on_db_open

Summary: UpdateAccumulatedStats() is used to optimize compaction decision esp. when the number of deletion entries are high, but this function can slowdown DBOpen esp. in disk environment. This patch adds DBOptions::skip_sats_update_on_db_open, which skips UpdateAccumulatedStats() in DB::Open() time when it's set to true. Test Plan: Add DBCompactionTest.SkipStatsUpdateTest Reviewers: igor, anthony, IslamAbdelRahman, sdong Reviewed By: sdong Subscribers: tnovak, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D42843
10 years ago · 14d0bfa429
parent e2a3bfe74b
commit 14d0bfa429
8 changed files with 182 additions and 42 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -3,6 +3,7 @@
 ## Unreleased

 ### New Features
+* Add DBOptions::skip_stats_update_on_db_open.  When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction.
 * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex

 ### Public API Changes
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@ -196,12 +196,16 @@ const SstFileMetaData* PickFileRandomly(
 }  // anonymous namespace

 TEST_F(DBCompactionTest, CompactionDeletionTrigger) {
-  for (int tid = 0; tid < 2; ++tid) {
+  for (int tid = 0; tid < 3; ++tid) {
    uint64_t db_size[2];
    Options options = CurrentOptions(DeletionTriggerOptions());

    if (tid == 1) {
-      // second pass with universal compaction
+      // the following only disable stats update in DB::Open()
+      // and should not affect the result of this test.
+      options.skip_stats_update_on_db_open = true;
+    } else if (tid == 2) {
+      // third pass with universal compaction
      options.compaction_style = kCompactionStyleUniversal;
      options.num_levels = 1;
    }
@ -231,6 +235,64 @@ TEST_F(DBCompactionTest, CompactionDeletionTrigger) {
  }
 }

+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verify UpdateAccumulatedStats is not on by observing
+  // the compaction behavior when there are many of deletion entries.
+  // The test will need to be updated if the internal behavior changes.
+
+  Options options = DeletionTriggerOptions();
+  options = CurrentOptions(options);
+  options.env = env_;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 512;
+  std::vector<std::string> values;
+  for (int k = 0; k < kTestSize; ++k) {
+    values.push_back(RandomString(&rnd, kCDTValueSize));
+    ASSERT_OK(Put(Key(k), values[k]));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  uint64_t db_size[2];
+  db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+  for (int k = 0; k < kTestSize; ++k) {
+    ASSERT_OK(Delete(Key(k)));
+  }
+
+  // Reopen the DB with stats-update disabled
+  options.skip_stats_update_on_db_open = true;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+  // As stats-update is disabled, we expect a very low
+  // number of random file open.
+  ASSERT_LT(env_->random_file_open_counter_.load(), 5);
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+  db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+  // As stats update is disabled, we expect the deletion
+  // entries are not properly processed.
+  ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+  // Repeat the reopen process, but this time we enable
+  // stats-update.
+  options.skip_stats_update_on_db_open = false;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+  // Since we do a normal stats update on db-open, there
+  // will be more random open files.
+  ASSERT_GT(env_->random_file_open_counter_.load(), 5);
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+  db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+  // and we expect the deleiton entries being handled.
+  ASSERT_GT(db_size[0] / 3, db_size[1]);
+}
+
 TEST_F(DBCompactionTest, CompactionDeletionTriggerReopen) {
  for (int tid = 0; tid < 2; ++tid) {
    uint64_t db_size[3];
@ -287,6 +349,63 @@ TEST_F(DBCompactionTest, CompactionDeletionTriggerReopen) {
  }
 }

+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+  uint64_t db_size[3];
+  for (int test = 0; test < 2; ++test) {
+    Options options = CurrentOptions(DeletionTriggerOptions());
+    options.skip_stats_update_on_db_open = (test == 0);
+
+    env_->random_read_counter_.Reset();
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+
+    env_->random_read_counter_.Reset();
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+    // as auto_compaction is off, we shouldn't see too much reduce
+    // in db size.
+    ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still work.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+    if (options.skip_stats_update_on_db_open) {
+      // If update stats on DB::Open is disable, we don't expect
+      // deletion entries taking effect.
+      ASSERT_LT(db_size[0] / 3, db_size[2]);
+    } else {
+      // Otherwise, we should see a significant drop in db size.
+      ASSERT_GT(db_size[0] / 3, db_size[2]);
+    }
+  }
+}
+
 TEST_F(DBCompactionTest, CompactionTrigger) {
  Options options;
  options.write_buffer_size = 100 << 10;  // 100KB
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -873,8 +873,10 @@ void VersionStorageInfo::GenerateLevelFilesBrief() {
  }
 }

-void Version::PrepareApply(const MutableCFOptions& mutable_cf_options) {
-  UpdateAccumulatedStats();
+void Version::PrepareApply(
+    const MutableCFOptions& mutable_cf_options,
+    bool update_stats) {
+  UpdateAccumulatedStats(update_stats);
  storage_info_.UpdateNumNonEmptyLevels();
  storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
  storage_info_.UpdateFilesBySize();
@ -917,42 +919,45 @@ void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
  num_samples_++;
 }

-void Version::UpdateAccumulatedStats() {
-  // maximum number of table properties loaded from files.
-  const int kMaxInitCount = 20;
-  int init_count = 0;
-  // here only the first kMaxInitCount files which haven't been
-  // initialized from file will be updated with num_deletions.
-  // The motivation here is to cap the maximum I/O per Version creation.
-  // The reason for choosing files from lower-level instead of higher-level
-  // is that such design is able to propagate the initialization from
-  // lower-level to higher-level:  When the num_deletions of lower-level
-  // files are updated, it will make the lower-level files have accurate
-  // compensated_file_size, making lower-level to higher-level compaction
-  // will be triggered, which creates higher-level files whose num_deletions
-  // will be updated here.
-  for (int level = 0;
-       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
-       ++level) {
-    for (auto* file_meta : storage_info_.files_[level]) {
-      if (MaybeInitializeFileMetaData(file_meta)) {
-        // each FileMeta will be initialized only once.
-        storage_info_.UpdateAccumulatedStats(file_meta);
-        if (++init_count >= kMaxInitCount) {
-          break;
+void Version::UpdateAccumulatedStats(bool update_stats) {
+  if (update_stats) {
+    // maximum number of table properties loaded from files.
+    const int kMaxInitCount = 20;
+    int init_count = 0;
+    // here only the first kMaxInitCount files which haven't been
+    // initialized from file will be updated with num_deletions.
+    // The motivation here is to cap the maximum I/O per Version creation.
+    // The reason for choosing files from lower-level instead of higher-level
+    // is that such design is able to propagate the initialization from
+    // lower-level to higher-level:  When the num_deletions of lower-level
+    // files are updated, it will make the lower-level files have accurate
+    // compensated_file_size, making lower-level to higher-level compaction
+    // will be triggered, which creates higher-level files whose num_deletions
+    // will be updated here.
+    for (int level = 0;
+         level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+         ++level) {
+      for (auto* file_meta : storage_info_.files_[level]) {
+        if (MaybeInitializeFileMetaData(file_meta)) {
+          // each FileMeta will be initialized only once.
+          storage_info_.UpdateAccumulatedStats(file_meta);
+          if (++init_count >= kMaxInitCount) {
+            break;
+          }
        }
      }
    }
-  }
-  // In case all sampled-files contain only deletion entries, then we
-  // load the table-property of a file in higher-level to initialize
-  // that value.
-  for (int level = storage_info_.num_levels_ - 1;
-       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
-    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
-         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
-      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
-        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+    // In case all sampled-files contain only deletion entries, then we
+    // load the table-property of a file in higher-level to initialize
+    // that value.
+    for (int level = storage_info_.num_levels_ - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+         --level) {
+      for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+           storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+        if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+          storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+        }
      }
    }
  }
@ -1967,7 +1972,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,

    if (!edit->IsColumnFamilyManipulation()) {
      // This is cpu-heavy operations, which should be called outside mutex.
-      v->PrepareApply(mutable_cf_options);
+      v->PrepareApply(mutable_cf_options, true);
    }

    // Write new record to MANIFEST log
@ -2398,7 +2403,8 @@ Status VersionSet::Recover(
      builder->SaveTo(v->storage_info());

      // Install recovered version
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+          !(db_options_->skip_stats_update_on_db_open));
      AppendVersion(cfd, v);
    }

@ -2748,7 +2754,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,

      Version* v = new Version(cfd, this, current_version_number_++);
      builder->SaveTo(v->storage_info());
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);

      printf("--------------- Column family \"%s\"  (ID %u) --------------\n",
             cfd->GetName().c_str(), (unsigned int)cfd->GetID());
--- a/db/version_set.h
+++ b/db/version_set.h
@ -417,7 +417,8 @@ class Version {

  // Loads some stats information from files. Call without mutex held. It needs
  // to be called before applying the version to the version set.
-  void PrepareApply(const MutableCFOptions& mutable_cf_options);
+  void PrepareApply(const MutableCFOptions& mutable_cf_options,
+                    bool update_stats);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
@ -490,7 +491,7 @@ class Version {

  // Update the accumulated stats associated with the current version.
  // This accumulated stats will be used in compaction.
-  void UpdateAccumulatedStats();
+  void UpdateAccumulatedStats(bool update_stats);

  // Sort all files for this version based on their file size and
  // record results in files_by_size_. The largest files are listed first.
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -1058,6 +1058,14 @@ struct DBOptions {
  // Default: 1MB/s
  uint64_t delayed_write_rate;

+  // If true, then DB::Open() will not update the statistics used to optimize
+  // compaction decision by loading table properties from many files.
+  // Turning off this feature will improve DBOpen time espcially in
+  // disk environment.
+  //
+  // Default: false
+  bool skip_stats_update_on_db_open;
+
  // Recovery mode to control the consistency while replaying WAL
  // Default: kTolerateCorruptedTailRecords
  WALRecoveryMode wal_recovery_mode;
--- a/util/db_test_util.cc
+++ b/util/db_test_util.cc
@ -28,6 +28,7 @@ SpecialEnv::SpecialEnv(Env* base)
  manifest_sync_error_.store(false, std::memory_order_release);
  manifest_write_error_.store(false, std::memory_order_release);
  log_write_error_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
  log_write_slowdown_ = 0;
  bytes_written_ = 0;
  sync_counter_ = 0;
--- a/util/db_test_util.h
+++ b/util/db_test_util.h
@ -281,6 +281,7 @@ class SpecialEnv : public EnvWrapper {
    };

    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    random_file_open_counter_++;
    if (s.ok() && count_random_reads_) {
      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
    }
@ -367,6 +368,7 @@ class SpecialEnv : public EnvWrapper {

  bool count_random_reads_;
  anon::AtomicCounter random_read_counter_;
+  std::atomic<int> random_file_open_counter_;

  bool count_sequential_reads_;
  anon::AtomicCounter sequential_read_counter_;
--- a/util/options.cc
+++ b/util/options.cc
@ -239,6 +239,7 @@ DBOptions::DBOptions()
      listeners(),
      enable_thread_tracking(false),
      delayed_write_rate(1024U * 1024U),
+      skip_stats_update_on_db_open(false),
      wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) {
 }

@ -287,6 +288,7 @@ DBOptions::DBOptions(const Options& options)
      listeners(options.listeners),
      enable_thread_tracking(options.enable_thread_tracking),
      delayed_write_rate(options.delayed_write_rate),
+      skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
      wal_recovery_mode(options.wal_recovery_mode),
      row_cache(options.row_cache) {}