Fix kPointInTimeRecovery handling of truncated WAL (#7701)

Summary: WAL may be truncated to an incomplete record due to crash while writing the last record or corruption. In the former case, no hole will be produced since no ACK'd data was lost. In the latter case, a hole could be produced without this PR since we proceeded to recover the next WAL as if nothing happened. This PR changes the record reading code to always report a corruption for incomplete records in `kPointInTimeRecovery` mode, and the upper layer will only ignore them if the next WAL has consecutive seqnum (i.e., we are guaranteed no hole). While this solves the hole problem for the case of incomplete records, the possibility is still there if the WAL is corrupted by truncation to an exact record boundary. This PR also regresses how much data can be recovered when writes are mixed with/without `WriteOptions::disableWAL`, as then we can not distinguish between a seqnum gap caused by corruption and a seqnum gap caused by a `disableWAL` write. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7701 Test Plan: Interestingly there already was a test for this case (`DBWALTestWithParams.kPointInTimeRecovery`); it just had a typo bug in the verification that prevented it from noticing holes in recovery. Reviewed By: anand1976 Differential Revision: D25111765 Pulled By: ajkr fbshipit-source-id: 5e330b13b1ee2b5be096cea9d0ff6075843e57b6
5 years ago · eb65d673fe
parent cc431ece37
commit eb65d673fe
4 changed files with 58 additions and 26 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -3,6 +3,9 @@
 ### Behavior Changes
 * Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation.

+### Bug Fixes
+* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`.
+
 ### Bug Fixes
 * Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected.

--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@ -1348,13 +1348,19 @@ TEST_P(DBWALTestWithParams, kPointInTimeRecovery) {
  size_t recovered_row_count = RecoveryTestHelper::GetData(this);
  ASSERT_LT(recovered_row_count, row_count);

-  bool expect_data = true;
-  for (size_t k = 0; k < maxkeys; ++k) {
-    bool found = Get("key" + ToString(corrupt_offset)) != "NOT_FOUND";
-    if (expect_data && !found) {
-      expect_data = false;
+  // Verify a prefix of keys were recovered. But not in the case of full WAL
+  // truncation, because we have no way to know there was a corruption when
+  // truncation happened on record boundaries (preventing recovery holes in
+  // that case requires using `track_and_verify_wals_in_manifest`).
+  if (!trunc || corrupt_offset != 0) {
+    bool expect_data = true;
+    for (size_t k = 0; k < maxkeys; ++k) {
+      bool found = Get("key" + ToString(k)) != "NOT_FOUND";
+      if (expect_data && !found) {
+        expect_data = false;
+      }
+      ASSERT_EQ(found, expect_data);
    }
-    ASSERT_EQ(found, expect_data);
  }

  const size_t min = RecoveryTestHelper::kKeysPerWALFile *
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@ -119,16 +119,26 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
        break;

      case kBadHeader:
-        if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
-          // in clean shutdown we don't expect any error in the log files
+        if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+            wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+          // In clean shutdown we don't expect any error in the log files.
+          // In point-in-time recovery an incomplete record at the end could
+          // produce a hole in the recovered data. Report an error here, which
+          // higher layers can choose to ignore when it's provable there is no
+          // hole.
          ReportCorruption(drop_size, "truncated header");
        }
        FALLTHROUGH_INTENDED;

      case kEof:
        if (in_fragmented_record) {
-          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
-            // in clean shutdown we don't expect any error in the log files
+          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+              wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+            // In clean shutdown we don't expect any error in the log files.
+            // In point-in-time recovery an incomplete record at the end could
+            // produce a hole in the recovered data. Report an error here, which
+            // higher layers can choose to ignore when it's provable there is no
+            // hole.
            ReportCorruption(scratch->size(), "error reading trailing data");
          }
          // This can be caused by the writer dying immediately after
@ -142,8 +152,13 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
        if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
          // Treat a record from a previous instance of the log as EOF.
          if (in_fragmented_record) {
-            if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
-              // in clean shutdown we don't expect any error in the log files
+            if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+                wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+              // In clean shutdown we don't expect any error in the log files.
+              // In point-in-time recovery an incomplete record at the end could
+              // produce a hole in the recovered data. Report an error here,
+              // which higher layers can choose to ignore when it's provable
+              // there is no hole.
              ReportCorruption(scratch->size(), "error reading trailing data");
            }
            // This can be caused by the writer dying immediately after
@ -164,6 +179,20 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
        break;

      case kBadRecordLen:
+        if (eof_) {
+          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+              wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+            // In clean shutdown we don't expect any error in the log files.
+            // In point-in-time recovery an incomplete record at the end could
+            // produce a hole in the recovered data. Report an error here, which
+            // higher layers can choose to ignore when it's provable there is no
+            // hole.
+            ReportCorruption(drop_size, "truncated record body");
+          }
+          return false;
+        }
+        FALLTHROUGH_INTENDED;
+
      case kBadRecordChecksum:
        if (recycled_ &&
            wal_recovery_mode ==
@ -355,18 +384,14 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
      }
    }
    if (header_size + length > buffer_.size()) {
+      assert(buffer_.size() >= static_cast<size_t>(header_size));
      *drop_size = buffer_.size();
      buffer_.clear();
-      if (!eof_) {
-        return kBadRecordLen;
-      }
-      // If the end of the file has been reached without reading |length|
-      // bytes of payload, assume the writer died in the middle of writing the
-      // record. Don't report a corruption unless requested.
-      if (*drop_size) {
-        return kBadHeader;
-      }
-      return kEof;
+      // If the end of the read has been reached without seeing
+      // `header_size + length` bytes of payload, report a corruption. The
+      // higher layers can decide how to handle it based on the recovery mode,
+      // whether this occurred at EOF, whether this is the final WAL, etc.
+      return kBadRecordLen;
    }

    if (type == kZeroType && length == 0) {
--- a/db/log_test.cc
+++ b/db/log_test.cc
@ -465,7 +465,7 @@ TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
  ShrinkSize(1);
  ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
  ASSERT_GT(DroppedBytes(), 0U);
-  ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+  ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
 }

 TEST_P(LogTest, ChecksumMismatch) {
@ -573,9 +573,7 @@ TEST_P(LogTest, PartialLastIsNotIgnored) {
  ShrinkSize(1);
  ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
  ASSERT_GT(DroppedBytes(), 0U);
-  ASSERT_EQ("OK", MatchError(
-                      "Corruption: truncated headerCorruption: "
-                      "error reading trailing data"));
+  ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
 }

 TEST_P(LogTest, ErrorJoinsRecords) {