From 9aad24da55570bb37da57496f2718fe7ae2a1937 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 26 Aug 2020 10:37:59 -0700 Subject: [PATCH] Real fix for race in backup custom checksum checking (#7309) Summary: This is a "real" fix for the issue worked around in https://github.com/facebook/rocksdb/issues/7294. To get DB checksum info for live files, we now read the manifest file that will become part of the checkpoint/backup. This requires a little extra handling in taking a custom checkpoint, including only reading the manifest file up to the size prescribed by the checkpoint. This moves GetFileChecksumsFromManifest from backup code to file_checksum_helper.{h,cc} and removes apparently unnecessary checking related to column families. Updated HISTORY.md and warned potential future users of DB::GetLiveFilesChecksumInfo() Pull Request resolved: https://github.com/facebook/rocksdb/pull/7309 Test Plan: updated unit test, before and after Reviewed By: ajkr Differential Revision: D23311994 Pulled By: pdillinger fbshipit-source-id: 741e30a2dc1830e8208f7648fcc8c5f000d4e2d5 --- HISTORY.md | 2 +- db/log_reader.cc | 4 + db/log_reader.h | 5 ++ include/rocksdb/db.h | 4 +- util/file_checksum_helper.cc | 68 ++++++++++++++++ util/file_checksum_helper.h | 6 ++ utilities/backupable/backupable_db.cc | 91 ++------------------- utilities/backupable/backupable_db_test.cc | 26 +++++- utilities/checkpoint/checkpoint_impl.cc | 93 ++++++++++++++-------- utilities/checkpoint/checkpoint_impl.h | 2 +- 10 files changed, 176 insertions(+), 125 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 2f15dd33c..dc20cad36 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,7 +5,7 @@ * Fix a possible corruption to the LSM state (overlapping files within a level) when a `CompactRange()` for refitting levels (`CompactRangeOptions::change_level == true`) and another manual compaction are executed in parallel. * Sanitize `recycle_log_file_num` to zero when the user attempts to enable it in combination with `WALRecoveryMode::kTolerateCorruptedTailRecords`. Previously the two features were allowed together, which compromised the user's configured crash-recovery guarantees. * Fix a bug where a level refitting in CompactRange() might race with an automatic compaction that puts the data to the target level of the refitting. The bug has been there for years. -* BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory. This issue has been worked-around such that CreateNewBackup should succeed, but (until fully fixed) BackupEngine might not see all checksums available in the DB. +* Fixed a bug in version 6.12 in which BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory. * Fix a bug where immutable flushed memtable is never destroyed because a memtable is not added to delete list because of refernce hold by super version and super version doesn't switch because of empty delete list. So memory usage increases beyond write_buffer_size + max_write_buffer_size_to_maintain. * Fix useless no-op compactions scheduled upon snapshot release when options.disable-auto-compactions = true. diff --git a/db/log_reader.cc b/db/log_reader.cc index 24daceb31..6e71eaa2f 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -202,6 +202,10 @@ uint64_t Reader::LastRecordOffset() { return last_record_offset_; } +uint64_t Reader::LastRecordEnd() { + return end_of_buffer_offset_ - buffer_.size(); +} + void Reader::UnmarkEOF() { if (read_error_) { return; diff --git a/db/log_reader.h b/db/log_reader.h index 45cfbb5a2..1ddc4abdd 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -71,6 +71,11 @@ class Reader { // Undefined before the first call to ReadRecord. uint64_t LastRecordOffset(); + // Returns the first physical offset after the last record returned by + // ReadRecord, or zero before first call to ReadRecord. This can also be + // thought of as the "current" position in processing the file bytes. + uint64_t LastRecordEnd(); + // returns true if the reader has encountered an eof condition. bool IsEOF() { return eof_; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 498d81d6e..6a23629ce 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1354,7 +1354,9 @@ class DB { virtual void GetLiveFilesMetaData( std::vector* /*metadata*/) {} - // Return a list of all table checksum info + // Return a list of all table file checksum info. + // Note: This function might be of limited use because it cannot be + // synchronized with GetLiveFiles. virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; // Obtains the meta data of the specified column family of the DB. diff --git a/util/file_checksum_helper.cc b/util/file_checksum_helper.cc index e8f32fb84..761307b3e 100644 --- a/util/file_checksum_helper.cc +++ b/util/file_checksum_helper.cc @@ -9,6 +9,12 @@ #include "util/file_checksum_helper.h" +#include + +#include "db/log_reader.h" +#include "db/version_edit.h" +#include "file/sequence_file_reader.h" + namespace ROCKSDB_NAMESPACE { void FileChecksumListImpl::reset() { checksum_map_.clear(); } @@ -83,4 +89,66 @@ std::shared_ptr GetFileChecksumGenCrc32cFactory() { return default_crc32c_gen_factory; } +Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, + uint64_t manifest_file_size, + FileChecksumList* checksum_list) { + if (checksum_list == nullptr) { + return Status::InvalidArgument("checksum_list is nullptr"); + } + + checksum_list->reset(); + Status s; + + std::unique_ptr file_reader; + { + std::unique_ptr file; + const std::shared_ptr& fs = src_env->GetFileSystem(); + s = fs->NewSequentialFile(abs_path, + fs->OptimizeForManifestRead(FileOptions()), &file, + nullptr /* dbg */); + if (!s.ok()) { + return s; + } + file_reader.reset(new SequentialFileReader(std::move(file), abs_path)); + } + + struct LogReporter : public log::Reader::Reporter { + Status* status_ptr; + virtual void Corruption(size_t /*bytes*/, const Status& st) override { + if (status_ptr->ok()) { + *status_ptr = st; + } + } + } reporter; + reporter.status_ptr = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + Slice record; + std::string scratch; + while (reader.LastRecordEnd() < manifest_file_size && + reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + // Remove the deleted files from the checksum_list + for (const auto& deleted_file : edit.GetDeletedFiles()) { + checksum_list->RemoveOneFileChecksum(deleted_file.second); + } + + // Add the new files to the checksum_list + for (const auto& new_file : edit.GetNewFiles()) { + checksum_list->InsertOneFileChecksum( + new_file.second.fd.GetNumber(), new_file.second.file_checksum, + new_file.second.file_checksum_func_name); + } + } + assert(!s.ok() || + manifest_file_size == std::numeric_limits::max() || + reader.LastRecordEnd() == manifest_file_size); + return s; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/util/file_checksum_helper.h b/util/file_checksum_helper.h index ce56f7353..448927f55 100644 --- a/util/file_checksum_helper.h +++ b/util/file_checksum_helper.h @@ -89,4 +89,10 @@ class FileChecksumListImpl : public FileChecksumList { checksum_map_; }; +// If manifest_file_size < std::numeric_limits::max(), only use +// that length prefix of the manifest file. +Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, + uint64_t manifest_file_size, + FileChecksumList* checksum_list); + } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 9dde42af7..c787e3056 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -42,6 +42,7 @@ #include "util/channel.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_checksum_helper.h" #include "util/string_util.h" #include "utilities/checkpoint/checkpoint_impl.h" @@ -485,9 +486,6 @@ class BackupEngineImpl : public BackupEngine { const BackupMeta* backup, FileChecksumList* checksum_list); - Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, - FileChecksumList* checksum_list); - Status VerifyFileWithCrc32c(Env* src_env, const BackupMeta* backup, const std::string& rel_path); @@ -2159,6 +2157,7 @@ Status BackupEngineImpl::GetFileDbIdentities(Env* src_env, return s; } } + Status BackupEngineImpl::GetFileChecksumsFromManifestInBackup( Env* src_env, const BackupID& backup_id, const BackupMeta* backup, FileChecksumList* checksum_list) { @@ -2197,87 +2196,11 @@ Status BackupEngineImpl::GetFileChecksumsFromManifestInBackup( return s; } - s = GetFileChecksumsFromManifest(src_env, GetAbsolutePath(manifest_rel_path), - checksum_list); - return s; -} - -Status BackupEngineImpl::GetFileChecksumsFromManifest( - Env* src_env, const std::string& abs_path, - FileChecksumList* checksum_list) { - if (checksum_list == nullptr) { - return Status::InvalidArgument("checksum_list is nullptr"); - } - - checksum_list->reset(); - Status s; - - std::unique_ptr file_reader; - { - std::unique_ptr file; - const std::shared_ptr& fs = src_env->GetFileSystem(); - s = fs->NewSequentialFile(abs_path, - fs->OptimizeForManifestRead(FileOptions()), &file, - nullptr /* dbg */); - if (!s.ok()) { - return s; - } - file_reader.reset(new SequentialFileReader(std::move(file), abs_path)); - } - - LogReporter reporter; - reporter.status = &s; - log::Reader reader(nullptr, std::move(file_reader), &reporter, - true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - // Set of column families initialized with default CF - std::unordered_set cf_set = {0}; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - // Check current CF status - uint32_t column_family = edit.GetColumnFamily(); - auto cf_set_itr = cf_set.find(column_family); - bool cf_exist = (cf_set_itr != cf_set.end()); - if (edit.IsColumnFamilyAdd()) { - if (cf_exist) { - s = Status::Corruption("Manifest adding the same column family twice"); - break; - } - cf_set.insert(column_family); - } else if (edit.IsColumnFamilyDrop()) { - if (!cf_exist) { - s = Status::Corruption( - "Manifest dropping non-existing column family: " + - ToString(column_family)); - break; - } - cf_set.erase(cf_set_itr); - } else { - if (!cf_exist) { - s = Status::Corruption("Manifest referencing unknown column family: " + - ToString(column_family)); - break; - } - assert(cf_set.find(column_family) != cf_set.end()); - - // Remove the deleted files from the checksum_list - for (const auto& deleted_file : edit.GetDeletedFiles()) { - checksum_list->RemoveOneFileChecksum(deleted_file.second); - } - - // Add the new files to the checksum_list - for (const auto& new_file : edit.GetNewFiles()) { - checksum_list->InsertOneFileChecksum( - new_file.second.fd.GetNumber(), new_file.second.file_checksum, - new_file.second.file_checksum_func_name); - } - } - } + // Read whole manifest file in backup + s = GetFileChecksumsFromManifest( + src_env, GetAbsolutePath(manifest_rel_path), + std::numeric_limits::max() /*manifest_file_size*/, + checksum_list); return s; } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index da5cb9f5e..278c0e3a3 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -1823,13 +1824,13 @@ TEST_F(BackupableDBTest, FlushCompactDuringBackupCheckpoint) { // That FillDB leaves a mix of flushed and unflushed data SyncPoint::GetInstance()->LoadDependency( {{"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1", - "BackupableDBTest::FlushDuringBackupCheckpoint:BeforeFlush"}, - {"BackupableDBTest::FlushDuringBackupCheckpoint:AfterFlush", + "BackupableDBTest::FlushCompactDuringBackupCheckpoint:Before"}, + {"BackupableDBTest::FlushCompactDuringBackupCheckpoint:After", "CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"}}); SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::port::Thread flush_thread{[this]() { TEST_SYNC_POINT( - "BackupableDBTest::FlushDuringBackupCheckpoint:BeforeFlush"); + "BackupableDBTest::FlushCompactDuringBackupCheckpoint:Before"); FillDB(db_.get(), keys_iteration, 2 * keys_iteration); ASSERT_OK(db_->Flush(FlushOptions())); DBImpl* dbi = static_cast(db_.get()); @@ -1837,11 +1838,28 @@ TEST_F(BackupableDBTest, FlushCompactDuringBackupCheckpoint) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); dbi->TEST_WaitForCompact(); TEST_SYNC_POINT( - "BackupableDBTest::FlushDuringBackupCheckpoint:AfterFlush"); + "BackupableDBTest::FlushCompactDuringBackupCheckpoint:After"); }}; ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); flush_thread.join(); CloseDBAndBackupEngine(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + if (sopt == kShareWithChecksum) { + // Ensure we actually got DB manifest checksums by inspecting + // shared_checksum file names for hex checksum component + std::regex expected("[^_]+_[0-9A-F]{8}_[^_]+.sst"); + std::vector children; + const std::string dir = backupdir_ + "/shared_checksum"; + ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children)); + for (const auto& child : children) { + if (child.name == "." || child.name == ".." || child.size_bytes == 0) { + continue; + } + const std::string match("match"); + EXPECT_EQ(match, std::regex_replace(child.name, expected, match)); + } + } AssertBackupConsistency(0, 0, keys_iteration); } } diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 81ee48fdd..0ce3222d8 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -27,6 +27,7 @@ #include "rocksdb/utilities/checkpoint.h" #include "test_util/sync_point.h" #include "util/cast_util.h" +#include "util/file_checksum_helper.h" namespace ROCKSDB_NAMESPACE { @@ -180,7 +181,7 @@ Status CheckpointImpl::CreateCustomCheckpoint( FileType type)> create_file_cb, uint64_t* sequence_number, uint64_t log_size_for_flush, - const bool& get_live_table_checksum) { + bool get_live_table_checksum) { Status s; std::vector live_files; uint64_t manifest_file_size = 0; @@ -262,19 +263,17 @@ Status CheckpointImpl::CreateCustomCheckpoint( size_t wal_size = live_wal_files.size(); - // get table file checksums if get_live_table_checksum is true - std::unique_ptr checksum_list(NewFileChecksumList()); - Status checksum_status; - if (get_live_table_checksum) { - checksum_status = db_->GetLiveFilesChecksumInfo(checksum_list.get()); - } - - // copy/hard link live_files + // process live files, non-table files first std::string manifest_fname, current_fname; - for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + // record table files for processing next + std::vector> live_table_files; + for (auto& live_file : live_files) { + if (!s.ok()) { + break; + } uint64_t number; FileType type; - bool ok = ParseFileName(live_files[i], &number, &type); + bool ok = ParseFileName(live_file, &number, &type); if (!ok) { s = Status::Corruption("Can't parse file name. This is very bad"); break; @@ -282,54 +281,80 @@ Status CheckpointImpl::CreateCustomCheckpoint( // we should only get sst, options, manifest and current files here assert(type == kTableFile || type == kDescriptorFile || type == kCurrentFile || type == kOptionsFile); - assert(live_files[i].size() > 0 && live_files[i][0] == '/'); + assert(live_file.size() > 0 && live_file[0] == '/'); if (type == kCurrentFile) { // We will craft the current file manually to ensure it's consistent with // the manifest number. This is necessary because current's file contents // can change during checkpoint creation. - current_fname = live_files[i]; + current_fname = live_file; continue; } else if (type == kDescriptorFile) { - manifest_fname = live_files[i]; + manifest_fname = live_file; + } + if (type != kTableFile) { + // copy non-table files here + // * if it's kDescriptorFile, limit the size to manifest_file_size + s = copy_file_cb(db_->GetName(), live_file, + (type == kDescriptorFile) ? manifest_file_size : 0, type, + kUnknownFileChecksumFuncName, kUnknownFileChecksum); + } else { + // process table files below + live_table_files.push_back(make_pair(live_file, number)); + } + } + + // get checksum info for table files + // get table file checksums if get_live_table_checksum is true + std::unique_ptr checksum_list; + + if (s.ok() && get_live_table_checksum) { + checksum_list.reset(NewFileChecksumList()); + // should succeed even without checksum info present, else manifest + // is corrupt + s = GetFileChecksumsFromManifest(db_->GetEnv(), + db_->GetName() + manifest_fname, + manifest_file_size, checksum_list.get()); + } + + // copy/hard link live table files + for (auto& ltf : live_table_files) { + if (!s.ok()) { + break; } - std::string src_fname = live_files[i]; + std::string& src_fname = ltf.first; + uint64_t number = ltf.second; // rules: - // * if it's kTableFile, then it's shared - // * if it's kDescriptorFile, limit the size to manifest_file_size - // * always copy if cross-device link - if ((type == kTableFile) && same_fs) { - s = link_file_cb(db_->GetName(), src_fname, type); + // * for kTableFile, attempt hard link instead of copy. + // * but can't hard link across filesystems. + if (same_fs) { + s = link_file_cb(db_->GetName(), src_fname, kTableFile); if (s.IsNotSupported()) { same_fs = false; s = Status::OK(); } } - if ((type != kTableFile) || (!same_fs)) { + if (!same_fs) { std::string checksum_name = kUnknownFileChecksumFuncName; std::string checksum_value = kUnknownFileChecksum; + // we ignore the checksums either they are not required or we failed to // obtain the checksum lsit for old table files that have no file // checksums - if (type == kTableFile && get_live_table_checksum && - checksum_status.ok()) { + if (get_live_table_checksum) { // find checksum info for table files - s = checksum_list->SearchOneFileChecksum(number, &checksum_value, - &checksum_name); + Status search = checksum_list->SearchOneFileChecksum( + number, &checksum_value, &checksum_name); - // XXX/FIXME(peterd): There's currently a race between GetLiveFiles - // and GetLiveFilesChecksumInfo that could lead to not finding - // checksum info on a file that has it. For now, we can accept - // that and treat it like a legacy file lacking checksum info. - if (!s.ok()) { + // could be a legacy file lacking checksum info. overall OK if + // not found + if (!search.ok()) { assert(checksum_name == kUnknownFileChecksumFuncName); assert(checksum_value == kUnknownFileChecksum); - s = Status::OK(); } } - s = copy_file_cb(db_->GetName(), src_fname, - (type == kDescriptorFile) ? manifest_file_size : 0, type, - checksum_name, checksum_value); + s = copy_file_cb(db_->GetName(), src_fname, 0, kTableFile, checksum_name, + checksum_value); } } if (s.ok() && !current_fname.empty() && !manifest_fname.empty()) { diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h index 728477577..dfa8460f9 100644 --- a/utilities/checkpoint/checkpoint_impl.h +++ b/utilities/checkpoint/checkpoint_impl.h @@ -58,7 +58,7 @@ class CheckpointImpl : public Checkpoint { const std::string& contents, FileType type)> create_file_cb, uint64_t* sequence_number, uint64_t log_size_for_flush, - const bool& get_live_table_checksum = false); + bool get_live_table_checksum = false); private: void CleanStagingDirectory(const std::string& path, Logger* info_log);