From 637e64b9ac798d88097102d922b7284c6deaad54 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Tue, 4 Feb 2020 01:24:29 -0800 Subject: [PATCH] Add an option to prevent DB::Open() from querying sizes of all sst files (#6353) Summary: When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open. If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes. We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353 Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble. Differential Revision: D19656425 Pulled By: al13n321 fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82 --- HISTORY.md | 1 + db/c.cc | 5 +++ db/db_impl/db_impl.cc | 74 ++++++++++++++++++++++++-------- db/db_impl/db_impl_open.cc | 7 +++ db/db_impl/db_impl_secondary.cc | 5 +++ db/db_sst_test.cc | 14 ++++++ include/rocksdb/c.h | 3 ++ include/rocksdb/options.h | 10 +++++ java/rocksjni/options.cc | 48 +++++++++++++++++++++ options/db_options.cc | 2 + options/db_options.h | 1 + options/options_helper.cc | 5 +++ options/options_settable_test.cc | 1 + test_util/testutil.cc | 1 + 14 files changed, 160 insertions(+), 17 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index a8560d0b1..bdd2a0aec 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev(). * Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot. * Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space. +* Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open(). ### Public API Change * The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`. diff --git a/db/c.cc b/db/c.cc index 16703fb9f..b14a36742 100644 --- a/db/c.cc +++ b/db/c.cc @@ -2320,6 +2320,11 @@ void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, opt->rep.skip_stats_update_on_db_open = val; } +void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val) { + opt->rep.skip_checking_sst_file_sizes_on_db_open = val; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 890ea0994..be204e64c 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3318,27 +3318,67 @@ Status DBImpl::CheckConsistency() { TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); std::string corruption_messages; - for (const auto& md : metadata) { - // md.name has a leading "/". - std::string file_path = md.db_path + md.name; - uint64_t fsize = 0; - TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); - Status s = env_->GetFileSize(file_path, &fsize); - if (!s.ok() && - env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { - s = Status::OK(); + if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { + // Instead of calling GetFileSize() for each expected file, call + // GetChildren() for the DB directory and check that all expected files + // are listed, without checking their sizes. + // Since sst files might be in different directories, do it for each + // directory separately. + std::map> files_by_directory; + for (const auto& md : metadata) { + // md.name has a leading "/". Remove it. + std::string fname = md.name; + if (!fname.empty() && fname[0] == '/') { + fname = fname.substr(1); + } + files_by_directory[md.db_path].push_back(fname); } - if (!s.ok()) { - corruption_messages += - "Can't access " + md.name + ": " + s.ToString() + "\n"; - } else if (fsize != md.size) { - corruption_messages += "Sst file size mismatch: " + file_path + - ". Size recorded in manifest " + - ToString(md.size) + ", actual size " + - ToString(fsize) + "\n"; + for (const auto& dir_files : files_by_directory) { + std::string directory = dir_files.first; + std::vector existing_files; + Status s = env_->GetChildren(directory, &existing_files); + if (!s.ok()) { + corruption_messages += + "Can't list files in " + directory + ": " + s.ToString() + "\n"; + continue; + } + std::sort(existing_files.begin(), existing_files.end()); + + for (const std::string& fname : dir_files.second) { + if (!std::binary_search(existing_files.begin(), existing_files.end(), + fname) && + !std::binary_search(existing_files.begin(), existing_files.end(), + Rocks2LevelTableFileName(fname))) { + corruption_messages += + "Missing sst file " + fname + " in " + directory + "\n"; + } + } + } + } else { + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); + Status s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } else if (fsize != md.size) { + corruption_messages += "Sst file size mismatch: " + file_path + + ". Size recorded in manifest " + + ToString(md.size) + ", actual size " + + ToString(fsize) + "\n"; + } } } + if (corruption_messages.size() == 0) { return Status::OK(); } else { diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index f574981f8..202ae777b 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -172,6 +172,13 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.sst_file_manager = sst_file_manager; } #endif + + if (!result.paranoid_checks) { + result.skip_checking_sst_file_sizes_on_db_open = true; + ROCKS_LOG_INFO(result.info_log, + "file size check will be skipped during open."); + } + return result; } diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 4a3b0e035..937b6e90e 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -468,6 +468,11 @@ Status DBImplSecondary::CheckConsistency() { // approach and just proceed. TEST_SYNC_POINT_CALLBACK( "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); + + if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { + return Status::OK(); + } + std::vector metadata; versions_->GetLiveFilesMetaData(&metadata); diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 37adee467..478a6bdab 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -124,6 +124,20 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { Destroy(options); } +// Check that we don't crash when opening DB with +// DBOptions::skip_checking_sst_file_sizes_on_db_open = true. +TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) { + ASSERT_OK(Put("pika", "choo")); + ASSERT_OK(Flush()); + + // Just open the DB with the option set to true and check that we don't crash. + Options options; + options.skip_checking_sst_file_sizes_on_db_open = true; + Reopen(options); + + ASSERT_EQ("choo", Get("pika")); +} + #ifndef ROCKSDB_LITE TEST_F(DBSSTTest, DontDeleteMovedFile) { // This test triggers move compaction and verifies that the file is not diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 97e881b67..315b82ba8 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -854,6 +854,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val); /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index e0c7f2f94..c5c205847 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -985,6 +985,16 @@ struct DBOptions { // Default: false bool skip_stats_update_on_db_open = false; + // If true, then DB::Open() will not fetch and check sizes of all sst files. + // This may significantly speed up startup if there are many sst files, + // especially when using non-default Env with expensive GetFileSize(). + // We'll still check that all required sst files exist. + // If paranoid_checks is false, this option is ignored, and sst files are + // not checked at all. + // + // Default: false + bool skip_checking_sst_file_sizes_on_db_open = false; + // Recovery mode to control the consistency while replaying WAL // Default: kPointInTimeRecovery WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index cb23e2e82..181c5c996 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1738,6 +1738,30 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen( return static_cast(opt->skip_stats_update_on_db_open); } +/* + * Class: org_rocksdb_Options + * Method: setSkipCheckingSstFileSizesOnDbOpen + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen( + JNIEnv*, jobject, jlong jhandle, + jboolean jskip_checking_sst_file_sizes_on_db_open) { + auto* opt = reinterpret_cast(jhandle); + opt->skip_checking_sst_file_sizes_on_db_open = + static_cast(jskip_checking_sst_file_sizes_on_db_open); +} + +/* + * Class: org_rocksdb_Options + * Method: skipCheckingSstFileSizesOnDbOpen + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); +} + /* * Class: org_rocksdb_Options * Method: setWalRecoveryMode @@ -6010,6 +6034,30 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen( return static_cast(opt->skip_stats_update_on_db_open); } +/* + * Class: org_rocksdb_DBOptions + * Method: setSkipCheckingSstFileSizesOnDbOpen + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen( + JNIEnv*, jobject, jlong jhandle, + jboolean jskip_checking_sst_file_sizes_on_db_open) { + auto* opt = reinterpret_cast(jhandle); + opt->skip_checking_sst_file_sizes_on_db_open = + static_cast(jskip_checking_sst_file_sizes_on_db_open); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: skipCheckingSstFileSizesOnDbOpen + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); +} + /* * Class: org_rocksdb_DBOptions * Method: setWalRecoveryMode diff --git a/options/db_options.cc b/options/db_options.cc index 1c8776fa1..d850ff66b 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -74,6 +74,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) write_thread_max_yield_usec(options.write_thread_max_yield_usec), write_thread_slow_yield_usec(options.write_thread_slow_yield_usec), skip_stats_update_on_db_open(options.skip_stats_update_on_db_open), + skip_checking_sst_file_sizes_on_db_open( + options.skip_checking_sst_file_sizes_on_db_open), wal_recovery_mode(options.wal_recovery_mode), allow_2pc(options.allow_2pc), row_cache(options.row_cache), diff --git a/options/db_options.h b/options/db_options.h index 9e8e40058..f345a7b0e 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -68,6 +68,7 @@ struct ImmutableDBOptions { uint64_t write_thread_max_yield_usec; uint64_t write_thread_slow_yield_usec; bool skip_stats_update_on_db_open; + bool skip_checking_sst_file_sizes_on_db_open; WALRecoveryMode wal_recovery_mode; bool allow_2pc; std::shared_ptr row_cache; diff --git a/options/options_helper.cc b/options/options_helper.cc index 446c9c707..685c0b867 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -119,6 +119,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.write_thread_slow_yield_usec; options.skip_stats_update_on_db_open = immutable_db_options.skip_stats_update_on_db_open; + options.skip_checking_sst_file_sizes_on_db_open = + immutable_db_options.skip_checking_sst_file_sizes_on_db_open; options.wal_recovery_mode = immutable_db_options.wal_recovery_mode; options.allow_2pc = immutable_db_options.allow_2pc; options.row_cache = immutable_db_options.row_cache; @@ -1473,6 +1475,9 @@ std::unordered_map {"skip_stats_update_on_db_open", {offsetof(struct DBOptions, skip_stats_update_on_db_open), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"skip_checking_sst_file_sizes_on_db_open", + {offsetof(struct DBOptions, skip_checking_sst_file_sizes_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"new_table_reader_for_compaction_inputs", {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 4d13585f7..a7f57dc93 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -248,6 +248,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "new_table_reader_for_compaction_inputs=false;" "keep_log_file_num=4890;" "skip_stats_update_on_db_open=false;" + "skip_checking_sst_file_sizes_on_db_open=false;" "max_manifest_file_size=4295009941;" "db_log_dir=path/to/db_log_dir;" "skip_log_error_on_recovery=true;" diff --git a/test_util/testutil.cc b/test_util/testutil.cc index e309df088..f27387ace 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -265,6 +265,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->paranoid_checks = rnd->Uniform(2); db_opt->skip_log_error_on_recovery = rnd->Uniform(2); db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); + db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2); db_opt->use_adaptive_mutex = rnd->Uniform(2); db_opt->use_fsync = rnd->Uniform(2); db_opt->recycle_log_file_num = rnd->Uniform(2);