diff --git a/HISTORY.md b/HISTORY.md
index 0e6db1347..d401d8153 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,7 @@
 ## Unreleased
 
 ### New Features
+* Add DBOptions::skip_stats_update_on_db_open. When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction.
 * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex
 
 ### Public API Changes
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 5747dc963..c910b77fa 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -196,12 +196,16 @@ const SstFileMetaData* PickFileRandomly(
 }  // anonymous namespace
 
 TEST_F(DBCompactionTest, CompactionDeletionTrigger) {
-  for (int tid = 0; tid < 2; ++tid) {
+  for (int tid = 0; tid < 3; ++tid) {
     uint64_t db_size[2];
     Options options = CurrentOptions(DeletionTriggerOptions());
 
     if (tid == 1) {
-      // second pass with universal compaction
+      // the following only disables stats update in DB::Open()
+      // and should not affect the result of this test.
+      options.skip_stats_update_on_db_open = true;
+    } else if (tid == 2) {
+      // third pass with universal compaction
       options.compaction_style = kCompactionStyleUniversal;
       options.num_levels = 1;
     }
@@ -231,6 +235,64 @@ TEST_F(DBCompactionTest, CompactionDeletionTrigger) {
   }
 }
 
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verifies UpdateAccumulatedStats is not on by observing
+  // the compaction behavior when there are many deletion entries.
+  // The test will need to be updated if the internal behavior changes.
+
+  Options options = DeletionTriggerOptions();
+  options = CurrentOptions(options);
+  options.env = env_;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 512;
+  std::vector<std::string> values;
+  for (int k = 0; k < kTestSize; ++k) {
+    values.push_back(RandomString(&rnd, kCDTValueSize));
+    ASSERT_OK(Put(Key(k), values[k]));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  uint64_t db_size[2];
+  db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+  for (int k = 0; k < kTestSize; ++k) {
+    ASSERT_OK(Delete(Key(k)));
+  }
+
+  // Reopen the DB with stats-update disabled
+  options.skip_stats_update_on_db_open = true;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+  // As stats-update is disabled, we expect a very low
+  // number of random file opens.
+  ASSERT_LT(env_->random_file_open_counter_.load(), 5);
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+  db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+  // As stats update is disabled, we expect the deletion
+  // entries are not properly processed.
+  ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+  // Repeat the reopen process, but this time we enable
+  // stats-update.
+  options.skip_stats_update_on_db_open = false;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+  // Since we do a normal stats update on db-open, there
+  // will be more random file opens.
+  ASSERT_GT(env_->random_file_open_counter_.load(), 5);
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+  db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+  // and we expect the deletion entries to be handled.
+  ASSERT_GT(db_size[0] / 3, db_size[1]);
+}
+
 TEST_F(DBCompactionTest, CompactionDeletionTriggerReopen) {
   for (int tid = 0; tid < 2; ++tid) {
     uint64_t db_size[3];
@@ -287,6 +349,63 @@ TEST_F(DBCompactionTest, CompactionDeletionTriggerReopen) {
   }
 }
 
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+  uint64_t db_size[3];
+  for (int test = 0; test < 2; ++test) {
+    Options options = CurrentOptions(DeletionTriggerOptions());
+    options.skip_stats_update_on_db_open = (test == 0);
+
+    env_->random_read_counter_.Reset();
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+
+    env_->random_read_counter_.Reset();
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+    // as auto_compaction is off, we shouldn't see too much reduction
+    // in db size.
+    ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+    if (options.skip_stats_update_on_db_open) {
+      // If updating stats on DB::Open is disabled, we don't expect
+      // deletion entries to take effect.
+      ASSERT_LT(db_size[0] / 3, db_size[2]);
+    } else {
+      // Otherwise, we should see a significant drop in db size.
+      ASSERT_GT(db_size[0] / 3, db_size[2]);
+    }
+  }
+}
+
 TEST_F(DBCompactionTest, CompactionTrigger) {
   Options options;
   options.write_buffer_size = 100 << 10;  // 100KB
diff --git a/db/version_set.cc b/db/version_set.cc
index 79c0b2e88..c42b3b728 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -873,8 +873,10 @@ void VersionStorageInfo::GenerateLevelFilesBrief() {
   }
 }
 
-void Version::PrepareApply(const MutableCFOptions& mutable_cf_options) {
-  UpdateAccumulatedStats();
+void Version::PrepareApply(
+    const MutableCFOptions& mutable_cf_options,
+    bool update_stats) {
+  UpdateAccumulatedStats(update_stats);
   storage_info_.UpdateNumNonEmptyLevels();
   storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
   storage_info_.UpdateFilesBySize();
@@ -917,42 +919,45 @@ void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
   num_samples_++;
 }
 
-void Version::UpdateAccumulatedStats() {
-  // maximum number of table properties loaded from files.
-  const int kMaxInitCount = 20;
-  int init_count = 0;
-  // here only the first kMaxInitCount files which haven't been
-  // initialized from file will be updated with num_deletions.
-  // The motivation here is to cap the maximum I/O per Version creation.
-  // The reason for choosing files from lower-level instead of higher-level
-  // is that such design is able to propagate the initialization from
-  // lower-level to higher-level: When the num_deletions of lower-level
-  // files are updated, it will make the lower-level files have accurate
-  // compensated_file_size, making lower-level to higher-level compaction
-  // will be triggered, which creates higher-level files whose num_deletions
-  // will be updated here.
-  for (int level = 0;
-       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
-       ++level) {
-    for (auto* file_meta : storage_info_.files_[level]) {
-      if (MaybeInitializeFileMetaData(file_meta)) {
-        // each FileMeta will be initialized only once.
-        storage_info_.UpdateAccumulatedStats(file_meta);
-        if (++init_count >= kMaxInitCount) {
-          break;
+void Version::UpdateAccumulatedStats(bool update_stats) {
+  if (update_stats) {
+    // maximum number of table properties loaded from files.
+    const int kMaxInitCount = 20;
+    int init_count = 0;
+    // here only the first kMaxInitCount files which haven't been
+    // initialized from file will be updated with num_deletions.
+    // The motivation here is to cap the maximum I/O per Version creation.
+    // The reason for choosing files from lower-level instead of higher-level
+    // is that such design is able to propagate the initialization from
+    // lower-level to higher-level: When the num_deletions of lower-level
+    // files are updated, it will make the lower-level files have accurate
+    // compensated_file_size, making lower-level to higher-level compaction
+    // will be triggered, which creates higher-level files whose num_deletions
+    // will be updated here.
+    for (int level = 0;
+         level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+         ++level) {
+      for (auto* file_meta : storage_info_.files_[level]) {
+        if (MaybeInitializeFileMetaData(file_meta)) {
+          // each FileMeta will be initialized only once.
+          storage_info_.UpdateAccumulatedStats(file_meta);
+          if (++init_count >= kMaxInitCount) {
+            break;
+          }
         }
       }
     }
-  }
-  // In case all sampled-files contain only deletion entries, then we
-  // load the table-property of a file in higher-level to initialize
-  // that value.
-  for (int level = storage_info_.num_levels_ - 1;
-       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
-    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
-         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
-      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
-        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+    // In case all sampled-files contain only deletion entries, then we
+    // load the table-property of a file in higher-level to initialize
+    // that value.
+    for (int level = storage_info_.num_levels_ - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+         --level) {
+      for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+           storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+        if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+          storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+        }
       }
     }
   }
@@ -1967,7 +1972,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
 
   if (!edit->IsColumnFamilyManipulation()) {
     // This is cpu-heavy operations, which should be called outside mutex.
-    v->PrepareApply(mutable_cf_options);
+    v->PrepareApply(mutable_cf_options, true);
   }
 
   // Write new record to MANIFEST log
@@ -2398,7 +2403,8 @@ Status VersionSet::Recover(
       builder->SaveTo(v->storage_info());
 
       // Install recovered version
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+                      !(db_options_->skip_stats_update_on_db_open));
       AppendVersion(cfd, v);
     }
 
@@ -2748,7 +2754,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
 
       Version* v = new Version(cfd, this, current_version_number_++);
       builder->SaveTo(v->storage_info());
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);
 
       printf("--------------- Column family \"%s\"  (ID %u) --------------\n",
              cfd->GetName().c_str(), (unsigned int)cfd->GetID());
diff --git a/db/version_set.h b/db/version_set.h
index 425925a05..abc42e74b 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -417,7 +417,8 @@ class Version {
 
   // Loads some stats information from files. Call without mutex held. It needs
   // to be called before applying the version to the version set.
-  void PrepareApply(const MutableCFOptions& mutable_cf_options);
+  void PrepareApply(const MutableCFOptions& mutable_cf_options,
+                    bool update_stats);
 
   // Reference count management (so Versions do not disappear out from
   // under live iterators)
@@ -490,7 +491,7 @@ class Version {
 
   // Update the accumulated stats associated with the current version.
   // This accumulated stats will be used in compaction.
-  void UpdateAccumulatedStats();
+  void UpdateAccumulatedStats(bool update_stats);
 
   // Sort all files for this version based on their file size and
   // record results in files_by_size_. The largest files are listed first.
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 0a112348e..c9246b4b3 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1058,6 +1058,14 @@ struct DBOptions {
   // Default: 1MB/s
   uint64_t delayed_write_rate;
 
+  // If true, then DB::Open() will not update the statistics used to optimize
+  // compaction decision by loading table properties from many files.
+  // Turning off this feature will improve DBOpen time especially in
+  // disk environment.
+  //
+  // Default: false
+  bool skip_stats_update_on_db_open;
+
   // Recovery mode to control the consistency while replaying WAL
   // Default: kTolerateCorruptedTailRecords
   WALRecoveryMode wal_recovery_mode;
diff --git a/util/db_test_util.cc b/util/db_test_util.cc
index bca50da3b..3a5d089b1 100644
--- a/util/db_test_util.cc
+++ b/util/db_test_util.cc
@@ -28,6 +28,7 @@ SpecialEnv::SpecialEnv(Env* base)
   manifest_sync_error_.store(false, std::memory_order_release);
   manifest_write_error_.store(false, std::memory_order_release);
   log_write_error_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
   log_write_slowdown_ = 0;
   bytes_written_ = 0;
   sync_counter_ = 0;
diff --git a/util/db_test_util.h b/util/db_test_util.h
index 164839a5b..819fa7c0a 100644
--- a/util/db_test_util.h
+++ b/util/db_test_util.h
@@ -281,6 +281,7 @@ class SpecialEnv : public EnvWrapper {
     };
 
     Status s = target()->NewRandomAccessFile(f, r, soptions);
+    random_file_open_counter_++;
     if (s.ok() && count_random_reads_) {
       r->reset(new CountingFile(std::move(*r), &random_read_counter_));
     }
@@ -367,6 +368,7 @@ class SpecialEnv : public EnvWrapper {
 
   bool count_random_reads_;
   anon::AtomicCounter random_read_counter_;
+  std::atomic<int> random_file_open_counter_;
 
   bool count_sequential_reads_;
   anon::AtomicCounter sequential_read_counter_;
diff --git a/util/options.cc b/util/options.cc
index 466a69bfb..76aa3c8a0 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -239,6 +239,7 @@
DBOptions::DBOptions()
       listeners(),
       enable_thread_tracking(false),
       delayed_write_rate(1024U * 1024U),
+      skip_stats_update_on_db_open(false),
       wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) {
 }
 
@@ -287,6 +288,7 @@ DBOptions::DBOptions(const Options& options)
       listeners(options.listeners),
       enable_thread_tracking(options.enable_thread_tracking),
       delayed_write_rate(options.delayed_write_rate),
+      skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
       wal_recovery_mode(options.wal_recovery_mode),
       row_cache(options.row_cache) {}