From 1d6700f9e6e7d066ef0f13e399613cd425a0088d Mon Sep 17 00:00:00 2001
From: Zhongyi Xie <xiez@fb.com>
Date: Fri, 22 Sep 2017 17:24:14 -0700
Subject: [PATCH] Add test kPointInTimeRecoveryCFConsistency

Summary:
Context/problem:

- CFs may be flushed at different times
- A WAL can only be deleted after all CFs have flushed beyond end of that WAL.
- Point-in-time recovery might stop upon reaching the first corruption.
- Some CFs may have already flushed beyond that point, while others haven't. We should fail the Open() instead of proceeding with inconsistent CFs.
Closes https://github.com/facebook/rocksdb/pull/2900

Differential Revision: D5863281

Pulled By: miasantreble

fbshipit-source-id: 180dbaf83d96c804cff49b3c406312a4ae61313e
---
 HISTORY.md         |  1 +
 db/db_impl_open.cc | 21 +++++++++++++++++++++
 db/db_wal_test.cc  | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index ed7eb1898..6e64270ef 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,6 +5,7 @@
 
 ### New Features
 ### Bug Fixes
+* Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery.
 
 ## 5.8.0 (08/30/2017)
 ### Public API Change
diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc
index d95c1bf7d..1da1465c4 100644
--- a/db/db_impl_open.cc
+++ b/db/db_impl_open.cc
@@ -493,6 +493,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
   bool stop_replay_by_wal_filter = false;
   bool stop_replay_for_corruption = false;
   bool flushed = false;
+  uint64_t corrupted_log_number = kMaxSequenceNumber;
   for (auto log_number : log_numbers) {
     // The previous incarnation may not have written any MANIFEST
     // records after allocating this log number.  So we manually
@@ -720,6 +721,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
         // We should ignore the error but not continue replaying
         status = Status::OK();
         stop_replay_for_corruption = true;
+        corrupted_log_number = log_number;
         ROCKS_LOG_INFO(immutable_db_options_.info_log,
                        "Point in time recovered to log #%" PRIu64
                        " seq #%" PRIu64,
@@ -741,6 +743,25 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
       versions_->SetLastSequence(last_sequence);
     }
   }
+  // Compare the corrupted log number to all columnfamily's current log number.
+  // Abort Open() if any column family's log number is greater than
+  // the corrupted log number, which means CF contains data beyond the point of
+  // corruption. This could during PIT recovery when the WAL is corrupted and
+  // some (but not all) CFs are flushed
+  if (stop_replay_for_corruption == true &&
+      (immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kPointInTimeRecovery ||
+       immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->GetLogNumber() > corrupted_log_number) {
+        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                        "Column family inconsistency: SST file contains data"
+                        " beyond the point of corruption.");
+        return Status::Corruption("SST file is ahead of WALs");
+      }
+    }
+  }
 
   // True if there's any data in the WALs; if not, we can skip re-processing
   // them later
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index 1f3cd3636..507d8ffbe 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -876,6 +876,39 @@ TEST_F(DBWALTest, kAbsoluteConsistency) {
   }
 }
 
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+
+  // Create DB with multiple column families.
+  CreateAndReopenWithCF({"one", "two"}, options);
+  ASSERT_OK(Put(1, "key1", "val1"));
+  ASSERT_OK(Put(2, "key2", "val2"));
+
+  // Record the offset at this point
+  Env* env = options.env;
+  int wal_file_id = RecoveryTestHelper::kWALFileOffset + 1;
+  std::string fname = LogFileName(dbname_, wal_file_id);
+  uint64_t offset_to_corrupt;
+  ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+  ASSERT_GT(offset_to_corrupt, 0);
+
+  ASSERT_OK(Put(1, "key3", "val3"));
+  // Corrupt WAL at location of key3
+  RecoveryTestHelper::InduceCorruption(
+      fname, static_cast<size_t>(offset_to_corrupt), static_cast<size_t>(4));
+  ASSERT_OK(Put(2, "key4", "val4"));
+  ASSERT_OK(Put(1, "key5", "val5"));
+  Flush(2);
+
+  // PIT recovery & verify
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
 // Test scope:
 // - We expect to open data store under all circumstances
 // - We expect only data upto the point where the first error was encountered