From 3d0d6b814051944056488cf0cdfc1313d2fc5c3e Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 22 Nov 2022 22:53:31 -0800 Subject: [PATCH] Make best-efforts recovery verify SST unique ID before Version construction (#10962) Summary: The check for SST unique IDs added to best-efforts recovery (`Options::best_efforts_recovery` is true). With best_efforts_recovery being true, RocksDB will recover to the latest point in MANIFEST such that all valid SST files included up to this point pass unique ID checks as well. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10962 Test Plan: make check Reviewed By: pdillinger Differential Revision: D41378241 Pulled By: riversand963 fbshipit-source-id: a036064e2c17dec13d080a24ef2a9f85d607b16c --- HISTORY.md | 2 ++ db/db_test2.cc | 74 ++++++++++++++++++++++++++++++++++++++ db/version_edit_handler.cc | 33 ++++++++++++++--- db/version_edit_handler.h | 16 +++++---- db/version_set.cc | 37 +++++++++++++++++-- db/version_set.h | 4 +-- include/rocksdb/options.h | 41 ++++++++++++--------- 7 files changed, 175 insertions(+), 32 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index abe16a799..571bc565a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased +### Behavior changes +* Make best-efforts recovery verify SST unique ID before Version construction (#10962) ## 7.9.0 (11/21/2022) ### Performance Improvements diff --git a/db/db_test2.cc b/db/db_test2.cc index 97d671c14..779b8bf13 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -7509,6 +7509,80 @@ TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) { ASSERT_TRUE(s.IsCorruption()); } +TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) { + const auto tamper_with_uniq_id = [&](void* arg) { + auto props = static_cast(arg); + assert(props); + // update table property session_id to a different one + props->db_session_id = DBImpl::GenerateDbSessionId(nullptr); + }; + + const auto assert_db = [&](size_t expected_count, + const std::string& expected_v) { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + size_t cnt = 0; + for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) { + ASSERT_EQ(std::to_string(cnt), it->key()); + ASSERT_EQ(expected_v, it->value()); + } + ASSERT_EQ(expected_count, cnt); + }; + + const int num_l0_compaction_trigger = 8; + const int num_l0 = num_l0_compaction_trigger - 1; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = num_l0_compaction_trigger; + + for (int k = 0; k < num_l0; ++k) { + // Allow mismatch for now + options.verify_sst_unique_id_in_manifest = false; + + DestroyAndReopen(options); + + constexpr size_t num_keys_per_file = 10; + for (int i = 0; i < num_l0; ++i) { + for (size_t j = 0; j < num_keys_per_file; ++j) { + ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i))); + } + if (i == k) { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", + tamper_with_uniq_id); + SyncPoint::GetInstance()->EnableProcessing(); + } + ASSERT_OK(Flush()); + } + + options.verify_sst_unique_id_in_manifest = true; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + + options.best_efforts_recovery = true; + Reopen(options); + assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1)); + + // Reopen with regular recovery + options.best_efforts_recovery = false; + Reopen(options); + assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (size_t i = 0; i < num_keys_per_file; ++i) { + ASSERT_OK(Put(std::to_string(i), "v")); + } + ASSERT_OK(Flush()); + Reopen(options); + { + for (size_t i = 0; i < num_keys_per_file; ++i) { + ASSERT_EQ("v", Get(std::to_string(i))); + } + } + } +} + #ifndef ROCKSDB_LITE TEST_F(DBTest2, GetLatestSeqAndTsForKey) { Destroy(last_options_); diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 08d04c7c0..145e78789 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -734,12 +734,13 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( assert(!cfd->ioptions()->cf_paths.empty()); Status s; for (const auto& elem : edit.GetNewFiles()) { + int level = elem.first; const FileMetaData& meta = elem.second; const FileDescriptor& fd = meta.fd; uint64_t file_num = fd.GetNumber(); const std::string fpath = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); - s = VerifyFile(fpath, meta); + s = VerifyFile(cfd, fpath, level, meta); if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { missing_files.insert(file_num); s = Status::OK(); @@ -804,6 +805,18 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, version_set_->current_version_number_++); + s = builder->LoadTableHandlers( + cfd->internal_stats(), + version_set_->db_options_->max_file_opening_threads, false, true, + cfd->GetLatestMutableCFOptions()->prefix_extractor, + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + if (!s.ok()) { + delete version; + if (s.IsCorruption()) { + s = Status::OK(); + } + return s; + } s = builder->SaveTo(version->storage_info()); if (s.ok()) { version->PrepareAppend( @@ -823,9 +836,11 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( return s; } -Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, +Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd, + const std::string& fpath, + int level, const FileMetaData& fmeta) { - return version_set_->VerifyFileMetadata(fpath, fmeta); + return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta); } Status VersionEditHandlerPointInTime::VerifyBlobFile( @@ -843,6 +858,12 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile( return s; } +Status VersionEditHandlerPointInTime::LoadTables( + ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/, + bool /*is_initial_load*/) { + return Status::OK(); +} + Status ManifestTailer::Initialize() { if (Mode::kRecovery == mode_) { return VersionEditHandler::Initialize(); @@ -930,9 +951,11 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader, } } -Status ManifestTailer::VerifyFile(const std::string& fpath, +Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd, + const std::string& fpath, int level, const FileMetaData& fmeta) { - Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); + Status s = + VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta); // TODO: Open file or create hard link to prevent the file from being // deleted. return s; diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index 665e0f0d4..fd2379b07 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -164,9 +164,9 @@ class VersionEditHandler : public VersionEditHandlerBase { ColumnFamilyData* cfd, bool force_create_version); - Status LoadTables(ColumnFamilyData* cfd, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load); + virtual Status LoadTables(ColumnFamilyData* cfd, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load); virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } @@ -213,11 +213,15 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) override; - virtual Status VerifyFile(const std::string& fpath, - const FileMetaData& fmeta); + virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, + int level, const FileMetaData& fmeta); virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, const BlobFileAddition& blob_addition); + Status LoadTables(ColumnFamilyData* cfd, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load) override; + std::unordered_map versions_; }; @@ -250,7 +254,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime { void CheckIterationResult(const log::Reader& reader, Status* s) override; - Status VerifyFile(const std::string& fpath, + Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& fmeta) override; enum Mode : uint8_t { diff --git a/db/version_set.cc b/db/version_set.cc index cac3a0a9e..082e55217 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -6704,8 +6704,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { return all_versions_blob_file_size; } -Status VersionSet::VerifyFileMetadata(const std::string& fpath, - const FileMetaData& meta) const { +Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, + const std::string& fpath, int level, + const FileMetaData& meta) { uint64_t fsize = 0; Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); if (status.ok()) { @@ -6713,6 +6714,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath, status = Status::Corruption("File size mismatch: " + fpath); } } + if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) { + assert(cfd); + TableCache* table_cache = cfd->table_cache(); + assert(table_cache); + + const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions(); + assert(cf_opts); + std::shared_ptr pe = cf_opts->prefix_extractor; + size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts); + + const FileOptions& file_opts = file_options(); + + Version* version = cfd->current(); + assert(version); + VersionStorageInfo& storage_info = version->storage_info_; + const InternalKeyComparator* icmp = storage_info.InternalComparator(); + assert(icmp); + + InternalStats* internal_stats = cfd->internal_stats(); + + FileMetaData meta_copy = meta; + status = table_cache->FindTable( + ReadOptions(), file_opts, *icmp, meta_copy, + &(meta_copy.table_reader_handle), pe, + /*no_io=*/false, /*record_read_stats=*/true, + internal_stats->GetFileReadHist(level), false, level, + /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, + meta_copy.temperature); + if (meta_copy.table_reader_handle) { + table_cache->ReleaseHandle(meta_copy.table_reader_handle); + } + } return status; } diff --git a/db/version_set.h b/db/version_set.h index cf6f9af36..03176a8b5 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1501,8 +1501,8 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, const VersionEdit* edit); - Status VerifyFileMetadata(const std::string& fpath, - const FileMetaData& meta) const; + Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath, + int level, const FileMetaData& meta); // Protected by DB mutex. WalSet wals_; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 3ba13d129..cd6644c00 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1285,30 +1285,37 @@ struct DBOptions { // Default: nullptr std::shared_ptr file_checksum_gen_factory = nullptr; - // By default, RocksDB recovery fails if any table/blob file referenced in + // By default, RocksDB recovery fails if any table/blob file referenced in the + // final version reconstructed from the // MANIFEST are missing after scanning the MANIFEST pointed to by the - // CURRENT file. - // Best-efforts recovery is another recovery mode that tolerates missing or - // corrupted table or blob files. + // CURRENT file. It can also fail if verification of unique SST id fails. + // Best-efforts recovery is another recovery mode that does not necessarily + // fail when certain table/blob files are missing/corrupted or have mismatched + // unique id table property. Instead, best-efforts recovery recovers each + // column family to a point in the MANIFEST that corresponds to a version. In + // such a version, all valid table/blob files referenced have the expected + // file size. For table files, their unique id table property match the + // MANIFEST. + // // Best-efforts recovery does not need a valid CURRENT file, and tries to // recover the database using one of the available MANIFEST files in the db // directory. - // Best-efforts recovery recovers database to a state in which the database - // includes only table and blob files whose actual sizes match the - // information in the chosen MANIFEST without holes in the history. // Best-efforts recovery tries the available MANIFEST files from high file // numbers (newer) to low file numbers (older), and stops after finding the // first MANIFEST file from which the db can be recovered to a state without - // invalid (missing/file-mismatch) table and blob files. - // It is possible that the database can be restored to an empty state with no - // table or blob files. - // Regardless of this option, the IDENTITY file is updated if needed during - // recovery to match the DB ID in the MANIFEST (if previously using - // write_dbid_to_manifest) or to be in some valid state (non-empty DB ID). - // Currently, not compatible with atomic flush. Furthermore, WAL files will - // not be used for recovery if best_efforts_recovery is true. - // Also requires either 1) LOCK file exists or 2) underlying env's LockFile() - // call returns ok even for non-existing LOCK file. + // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob + // files. It is possible that the database can be restored to an empty state + // with no table or blob files. + // + // Regardless of this option, the IDENTITY file + // is updated if needed during recovery to match the DB ID in the MANIFEST (if + // previously using write_dbid_to_manifest) or to be in some valid state + // (non-empty DB ID). Currently, not compatible with atomic flush. + // Furthermore, WAL files will not be used for recovery if + // best_efforts_recovery is true. Also requires either 1) LOCK file exists or + // 2) underlying env's LockFile() call returns ok even for non-existing LOCK + // file. + // // Default: false bool best_efforts_recovery = false;