From 1d6700f9e6e7d066ef0f13e399613cd425a0088d Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 22 Sep 2017 17:24:14 -0700 Subject: [PATCH] Add test kPointInTimeRecoveryCFConsistency Summary: Context/problem: - CFs may be flushed at different times - A WAL can only be deleted after all CFs have flushed beyond end of that WAL. - Point-in-time recovery might stop upon reaching the first corruption. - Some CFs may have already flushed beyond that point, while others haven't. We should fail the Open() instead of proceeding with inconsistent CFs. Closes https://github.com/facebook/rocksdb/pull/2900 Differential Revision: D5863281 Pulled By: miasantreble fbshipit-source-id: 180dbaf83d96c804cff49b3c406312a4ae61313e --- HISTORY.md | 1 + db/db_impl_open.cc | 21 +++++++++++++++++++++ db/db_wal_test.cc | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index ed7eb1898..6e64270ef 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ ### New Features ### Bug Fixes +* Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery. ## 5.8.0 (08/30/2017) ### Public API Change diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index d95c1bf7d..1da1465c4 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -493,6 +493,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool stop_replay_by_wal_filter = false; bool stop_replay_for_corruption = false; bool flushed = false; + uint64_t corrupted_log_number = kMaxSequenceNumber; for (auto log_number : log_numbers) { // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually @@ -720,6 +721,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // We should ignore the error but not continue replaying status = Status::OK(); stop_replay_for_corruption = true; + corrupted_log_number = log_number; ROCKS_LOG_INFO(immutable_db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, @@ -741,6 +743,25 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, versions_->SetLastSequence(last_sequence); } } + // Compare the corrupted log number to all columnfamily's current log number. + // Abort Open() if any column family's log number is greater than + // the corrupted log number, which means CF contains data beyond the point of + // corruption. This could during PIT recovery when the WAL is corrupted and + // some (but not all) CFs are flushed + if (stop_replay_for_corruption == true && + (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->GetLogNumber() > corrupted_log_number) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Column family inconsistency: SST file contains data" + " beyond the point of corruption."); + return Status::Corruption("SST file is ahead of WALs"); + } + } + } // True if there's any data in the WALs; if not, we can skip re-processing // them later diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 1f3cd3636..507d8ffbe 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -876,6 +876,39 @@ TEST_F(DBWALTest, kAbsoluteConsistency) { } } +// Test scope: +// We don't expect the data store to be opened if there is any inconsistency +// between WAL and SST files +TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) { + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + + // Create DB with multiple column families. + CreateAndReopenWithCF({"one", "two"}, options); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(2, "key2", "val2")); + + // Record the offset at this point + Env* env = options.env; + int wal_file_id = RecoveryTestHelper::kWALFileOffset + 1; + std::string fname = LogFileName(dbname_, wal_file_id); + uint64_t offset_to_corrupt; + ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt)); + ASSERT_GT(offset_to_corrupt, 0); + + ASSERT_OK(Put(1, "key3", "val3")); + // Corrupt WAL at location of key3 + RecoveryTestHelper::InduceCorruption( + fname, static_cast(offset_to_corrupt), static_cast(4)); + ASSERT_OK(Put(2, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); + Flush(2); + + // PIT recovery & verify + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options)); +} + // Test scope: // - We expect to open data store under all circumstances // - We expect only data upto the point where the first error was encountered