From fb05b5a65213aaa47e816dd2cef05224a7f08dcf Mon Sep 17 00:00:00 2001
From: anand76
Date: Thu, 30 Jan 2020 10:53:46 -0800
Subject: [PATCH] Force a new manifest file if append to current one fails (#6331)

Summary:
Fix for issue https://github.com/facebook/rocksdb/issues/6316

When an append/sync of the manifest file fails due to an IO error such
as NoSpace, we don't always put the DB in read-only mode. This is true
for flushes and compactions, as well as for foreground operations such
as column family add/drop, CompactFiles, etc. Subsequent changes to the
DB will be recorded in the same manifest file, which would then have a
corrupted record in the middle due to the previous failure. On the next
DB::Open(), the DB will fail to process the full manifest and data will
be lost.

To fix this, we reset VersionSet::descriptor_log_ on append/sync
failure, which will force a new manifest file to be written on the next
append.
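
The core of the fix is small. A simplified sketch of the new error path
in VersionSet::ProcessManifestWrites() (condensed from the diff below,
not a verbatim excerpt; `s` is the status of the failed append/sync):

    if (!s.ok()) {
      // The failed append/sync may have left a partial record in the
      // middle of the current MANIFEST, so never append to it again.
      // Resetting the writer forces the next LogAndApply() to roll a
      // brand new manifest file.
      descriptor_log_.reset();
    }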

Pull Request resolved: https://github.com/facebook/rocksdb/pull/6331

Test Plan: Add new unit tests in error_handler_test.cc
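
The new tests all follow the same sync-point fault-injection pattern:
a FaultInjectionTestEnv starts returning NoSpace exactly when
VersionSet::LogAndApply() is about to append to the MANIFEST, and the
test then verifies that Resume() rolls a new manifest file. Condensed
from the tests below (names such as fault_env and dbfull() come from
the test fixture):

    SyncPoint::GetInstance()->SetCallBack(
        "VersionSet::LogAndApply:WriteManifest", [&](void*) {
          fault_env->SetFilesystemActive(false,
                                         Status::NoSpace("Out of space"));
        });
    SyncPoint::GetInstance()->EnableProcessing();
    s = Flush();  // fails while appending to the MANIFEST
    ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError);

    SyncPoint::GetInstance()->ClearAllCallBacks();
    SyncPoint::GetInstance()->DisableProcessing();
    fault_env->SetFilesystemActive(true);
    s = dbfull()->Resume();       // recovery writes a brand new MANIFEST
    ASSERT_EQ(s, Status::OK());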

Differential Revision: D19632951

Pulled By: anand1976

fbshipit-source-id: 68d527cb6e59a94cbbbf9f5a17a7f464381d51e3
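
From the application's point of view, recovery stays the same; only the
on-disk outcome changes. An illustrative sketch (the `db` handle and
the space-freeing step are assumptions for illustration, not part of
this patch): once the NoSpace condition is resolved, DB::Resume()
clears the background error, and with this fix the DB switches to a
fresh MANIFEST instead of appending after a corrupted record:

    // Illustrative application-side handling; assumes `db` is an open
    // rocksdb::DB* that hit a NoSpace error during a manifest write.
    // ... free up disk space here ...
    rocksdb::Status s = db->Resume();
    if (s.ok()) {
      // Background work is re-enabled; subsequent version edits go to
      // a newly created MANIFEST file.
    }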

---
 HISTORY.md                    |   1 +
 db/error_handler.cc           |  13 ++-
 db/error_handler_test.cc      | 178 ++++++++++++++++++++++++++++++++++
 db/version_set.cc             |   5 +-
 file/sst_file_manager_impl.cc |   1 +
 5 files changed, 190 insertions(+), 8 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index a616cda94..1fa0cb0a9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,7 @@
 ### Bug Fixes
 * Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
 * Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot.
+* Fixed issue #6316 that can cause the MANIFEST file to be corrupted in the middle when a write to it fails due to no disk space.
 
 ### Public API Change
 * The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`.
diff --git a/db/error_handler.cc b/db/error_handler.cc
index 9e1bf5cc1..317f00723 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -166,12 +166,6 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reas
     return Status::OK();
   }
 
-  // Check if recovery is currently in progress. If it is, we will save this
-  // error so we can check it at the end to see if recovery succeeded or not
-  if (recovery_in_prog_ && recovery_error_.ok()) {
-    recovery_error_ = bg_err;
-  }
-
   bool paranoid = db_options_.paranoid_checks;
   Status::Severity sev = Status::Severity::kFatalError;
   Status new_bg_err;
@@ -204,10 +198,15 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reas
 
   new_bg_err = Status(bg_err, sev);
 
+  // Check if recovery is currently in progress. If it is, we will save this
+  // error so we can check it at the end to see if recovery succeeded or not
+  if (recovery_in_prog_ && recovery_error_.ok()) {
+    recovery_error_ = new_bg_err;
+  }
+
   bool auto_recovery = auto_recovery_;
   if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
     auto_recovery = false;
-    ;
   }
 
   // Allow some error specific overrides
diff --git a/db/error_handler_test.cc b/db/error_handler_test.cc
index 879686e9f..3af5c26f5 100644
--- a/db/error_handler_test.cc
+++ b/db/error_handler_test.cc
@@ -22,6 +22,21 @@ namespace rocksdb {
 class DBErrorHandlingTest : public DBTestBase {
  public:
   DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {}
+
+  std::string GetManifestNameFromLiveFiles() {
+    std::vector<std::string> live_files;
+    uint64_t manifest_size;
+
+    dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+    for (auto& file : live_files) {
+      uint64_t num = 0;
+      FileType type;
+      if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+        return file;
+      }
+    }
+    return "";
+  }
 };
 
 class DBErrorHandlingEnv : public EnvWrapper {
@@ -161,6 +176,169 @@ TEST_F(DBErrorHandlingTest, FLushWriteError) {
   Destroy(options);
 }
 
+TEST_F(DBErrorHandlingTest, ManifestWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  Put(Key(0), "val");
+  Flush();
+  Put(Key(1), "val");
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+        fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_env->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_EQ(s, Status::OK());
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, DoubleManifestWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  Put(Key(0), "val");
+  Flush();
+  Put(Key(1), "val");
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+        fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError);
+  fault_env->SetFilesystemActive(true);
+
+  // This Resume() will attempt to create a new manifest file and fail again
+  s = dbfull()->Resume();
+  ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError);
+  fault_env->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // A successful Resume() will create a new manifest file
+  s = dbfull()->Resume();
+  ASSERT_EQ(s, Status::OK());
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionManifestWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.env = fault_env.get();
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+  std::atomic<bool> fail_manifest(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  Put(Key(0), "val");
+  Put(Key(2), "val");
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      // Wait for flush of 2nd L0 file before starting compaction
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       // Wait for compaction to detect manifest write error
+       {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+       // Make compaction thread wait for error to be cleared
+       {"CompactionManifestWriteError:1",
+        "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+       // Wait for DB instance to clear bg_error before calling
+       // TEST_WaitForCompact
+       {"SstFileManagerImpl::ClearError",
+        "CompactionManifestWriteError:2"}});
+  // trigger manifest write failure in compaction thread
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void *) { fail_manifest.store(true); });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+        if (fail_manifest.load()) {
+          fault_env->SetFilesystemActive(false,
+                                         Status::NoSpace("Out of space"));
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Put(Key(1), "val");
+  // This Flush will trigger a compaction, which will fail when appending to
+  // the manifest
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+
+  TEST_SYNC_POINT("CompactionManifestWriteError:0");
+  // Clear all errors so when the compaction is retried, it will succeed
+  fault_env->SetFilesystemActive(true);
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  TEST_SYNC_POINT("CompactionManifestWriteError:1");
+  TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+  s = dbfull()->TEST_WaitForCompact();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(s, Status::OK());
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  ASSERT_EQ("val", Get(Key(2)));
+  Close();
+}
+
 TEST_F(DBErrorHandlingTest, CompactionWriteError) {
   std::unique_ptr<FaultInjectionTestEnv> fault_env(
       new FaultInjectionTestEnv(Env::Default()));
diff --git a/db/version_set.cc b/db/version_set.cc
index 5ac0bb0d5..2257f7170 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3956,12 +3956,15 @@ Status VersionSet::ProcessManifestWrites(
     for (auto v : versions) {
       delete v;
     }
+    // If manifest append failed for whatever reason, the file could be
+    // corrupted. So we need to force the next version update to start a
+    // new manifest file.
+    descriptor_log_.reset();
     if (new_descriptor_log) {
       ROCKS_LOG_INFO(db_options_->info_log,
                      "Deleting manifest %" PRIu64 " current manifest %" PRIu64
                      "\n",
                      manifest_file_number_, pending_manifest_file_number_);
-      descriptor_log_.reset();
       env_->DeleteFile(
           DescriptorFileName(dbname_, pending_manifest_file_number_));
     }
diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc
index 48567d907..e9ed265ac 100644
--- a/file/sst_file_manager_impl.cc
+++ b/file/sst_file_manager_impl.cc
@@ -308,6 +308,7 @@ void SstFileManagerImpl::ClearError() {
       // since the ErrorHandler::recovery_in_prog_ flag would be true
       cur_instance_ = error_handler;
       mu_.Unlock();
+      TEST_SYNC_POINT("SstFileManagerImpl::ClearError");
       s = error_handler->RecoverFromBGError();
       mu_.Lock();
       // The DB instance might have been deleted while we were