From 6eafdf135ae19f1db8f73a3f4959739dd759cb24 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 31 Mar 2022 20:00:52 -0700 Subject: [PATCH] Encode min_log_number_to_keep and delete_wals_before in one version edit (#9766) Summary: min_log_number_to_keep denotes that the WALs whose numbers are below this value **will** be deleted by RocksDB. delete_wals_before will be used by RocksDB if track_and_verify_wals_in_manifest is set to true. During recovery, RocksDB uses the info encoded in delete_wals_before to reconstruct its knowledge about what WALs to expect existing. If these two tags are not encoded in the same VersionEdit, then it's possible for min_log_number_to_keep=100 to exist, but delete_wals_before=100 to be lost due to power failure. Subsequent recovery will delete 99.log. If the db crashes again, the following recovery will expect to see 99.log since there is no delete_wals_before=100 in the MANIFEST, but the WAL is already deleted. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9766 Test Plan: First of all, make check. Second, format compatibility. SHORT_TEST=1 ./tools/check_format_compatible.sh Reviewed By: ltamasi Differential Revision: D35203623 Pulled By: riversand963 fbshipit-source-id: 45623fc4b4b50d299d5e0f9559a3a4c5e9522c8f --- HISTORY.md | 1 + db/db_flush_test.cc | 2 +- db/memtable_list.cc | 25 +++++++++++-------------- db/version_edit.cc | 3 +++ 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 154ee23f1..20576489e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ * Fixed a heap use-after-free race with DropColumnFamily. * Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722). * Fixed `file_type`, `relative_filename` and `directory` fields returned by `GetLiveFilesMetaData()`, which were added in inheriting from `FileStorageInfo`. +* Fixed a bug affecting `track_and_verify_wals_in_manifest`. 
Without the fix, application may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#9766). ### New Features * For db_bench when --seed=0 or --seed is not set then it uses the current time as the seed value. Previously it used the value 1000. diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index a8ffb9ec5..76442086d 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -2271,7 +2271,7 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { // The recovered min log number with prepared data should be non-zero. // In 2pc mode, MinLogNumberToKeep returns the - // VersionSet::min_log_number_to_keep_2pc recovered from MANIFEST, if it's 0, + // VersionSet::min_log_number_to_keep recovered from MANIFEST, if it's 0, // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST. cfs.push_back(kDefaultColumnFamilyName); ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index b0d29bcd2..f447ee735 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -505,21 +505,20 @@ Status MemTableList::TryInstallMemtableFlushResults( min_wal_number_to_keep = PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); } - edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); - std::unique_ptr<VersionEdit> wal_deletion; + VersionEdit wal_deletion; + wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep); if (vset->db_options()->track_and_verify_wals_in_manifest) { if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { - wal_deletion.reset(new VersionEdit); - wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); - edit_list.push_back(wal_deletion.get()); + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); } TEST_SYNC_POINT_CALLBACK( "MemTableList::TryInstallMemtableFlushResults:" "AfterComputeMinWalToKeep", nullptr); } + edit_list.push_back(&wal_deletion); const auto manifest_write_cb = 
[this, cfd, batch_count, log_buffer, to_delete, mu](const Status& status) { @@ -805,17 +804,15 @@ Status InstallMemtableAtomicFlushResults( min_wal_number_to_keep = PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); } - edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); - std::unique_ptr<VersionEdit> wal_deletion; - if (vset->db_options()->track_and_verify_wals_in_manifest) { - if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { - wal_deletion.reset(new VersionEdit); - wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); - edit_lists.back().push_back(wal_deletion.get()); - ++num_entries; - } + VersionEdit wal_deletion; + wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep); + if (vset->db_options()->track_and_verify_wals_in_manifest && + min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); } + edit_lists.back().push_back(&wal_deletion); + ++num_entries; // Mark the version edits as an atomic group if the number of version edits // exceeds 1. diff --git a/db/version_edit.cc b/db/version_edit.cc index e7cd0a6be..619f67db0 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -120,6 +120,9 @@ bool VersionEdit::EncodeTo(std::string* dst) const { if (has_max_column_family_) { PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_); } + if (has_min_log_number_to_keep_) { + PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_); + } if (has_last_sequence_) { PutVarint32Varint64(dst, kLastSequence, last_sequence_); }