From ecf2bec613a63745a058ae74170f2a989840f999 Mon Sep 17 00:00:00 2001
From: anand76
Date: Wed, 8 Dec 2021 14:22:41 -0800
Subject: [PATCH] Add a listener callback for end of auto error recovery
 (#9244)

Summary:
Previously, the OnErrorRecoveryCompleted callback was called when RocksDB
was able to successfully recover from a retryable error. However, if the
recovery failed and was eventually stopped, there was no indication of the
status. To fix that, a new OnErrorRecoveryEnd callback is introduced that
deprecates the OnErrorRecoveryCompleted callback. The new callback is called
with the original error and the new error status.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9244

Test Plan: Add a new unit test in error_handler_fs_test

Reviewed By: zhichao-cao

Differential Revision: D32922303

Pulled By: anand1976

fbshipit-source-id: f04e77a9cb92c5ea6385590682d3fcf559971b99
---
 HISTORY.md                  |  1 +
 db/error_handler.cc         | 27 +++++++++++++++++----
 db/error_handler_fs_test.cc | 47 +++++++++++++++++++++++++++++++++++++
 db/event_helpers.cc         | 13 +++++++---
 db/event_helpers.h          |  5 ++--
 include/rocksdb/listener.h  | 19 +++++++++++++++
 6 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index c795b62e9..edeb54f99 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 * MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371.
 ### Public API change
 * Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
+* Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
 ### Performance Improvements
 * Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory.

diff --git a/db/error_handler.cc b/db/error_handler.cc
index 925dd87ff..ba8a8a91f 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -540,6 +540,8 @@ Status ErrorHandler::ClearBGError() {
   // Signal that recovery succeeded
   if (recovery_error_.ok()) {
     Status old_bg_error = bg_error_;
+    // old_bg_error is only for notifying listeners, so may not be checked
+    old_bg_error.PermitUncheckedError();
     // Clear and check the recovery IO and BG error
     bg_error_ = Status::OK();
     recovery_io_error_ = IOStatus::OK();
@@ -547,8 +549,8 @@
     recovery_io_error_.PermitUncheckedError();
     recovery_in_prog_ = false;
     soft_error_no_bg_work_ = false;
-    EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
-                                                 old_bg_error, db_mutex_);
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+                                           bg_error_, db_mutex_);
   }
   return recovery_error_;
 #else
@@ -665,6 +667,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
   TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
   InstrumentedMutexLock l(db_mutex_);
   if (end_recovery_) {
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                           Status::ShutdownInProgress(),
+                                           db_mutex_);
     return;
   }
   DBRecoverContext context = recover_context_;
@@ -674,6 +679,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
   // Recover from the retryable error. Create a separate thread to do it.
   while (resume_count > 0) {
     if (end_recovery_) {
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             Status::ShutdownInProgress(),
+                                             db_mutex_);
       return;
     }
     TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
@@ -695,6 +703,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
         RecordInHistogram(bg_error_stats_.get(),
                           ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
       }
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             bg_error_, db_mutex_);
       return;
     }
     if (!recovery_io_error_.ok() &&
@@ -718,8 +728,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
         Status old_bg_error = bg_error_;
         bg_error_ = Status::OK();
         bg_error_.PermitUncheckedError();
-        EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
-                                                     old_bg_error, db_mutex_);
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
         if (bg_error_stats_ != nullptr) {
           RecordTick(bg_error_stats_.get(),
                      ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
@@ -739,12 +749,21 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
          RecordInHistogram(bg_error_stats_.get(),
                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
        }
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, bg_error_,
+            !recovery_io_error_.ok()
+                ? recovery_io_error_
+                : (!recovery_error_.ok() ? recovery_error_ : s),
+            db_mutex_);
        return;
      }
    }
    resume_count--;
  }
  recovery_in_prog_ = false;
+  EventHelpers::NotifyOnErrorRecoveryEnd(
+      db_options_.listeners, bg_error_,
+      Status::Aborted("Exceeded resume retry count"), db_mutex_);
  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
  if (bg_error_stats_ != nullptr) {
    RecordInHistogram(bg_error_stats_.get(),
diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc
index 3dff3ee36..3283ffcf0 100644
--- a/db/error_handler_fs_test.cc
+++ b/db/error_handler_fs_test.cc
@@ -66,6 +66,7 @@ class ErrorHandlerFSListener : public EventListener {
   ~ErrorHandlerFSListener() {
     file_creation_error_.PermitUncheckedError();
     bg_error_.PermitUncheckedError();
+    new_bg_error_.PermitUncheckedError();
   }
 
   void OnTableFileCreationStarted(
@@ -96,6 +97,13 @@ class ErrorHandlerFSListener : public EventListener {
     old_bg_error.PermitUncheckedError();
   }
 
+  void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
+    InstrumentedMutexLock l(&mutex_);
+    recovery_complete_ = true;
+    cv_.SignalAll();
+    new_bg_error_ = info.new_bg_error;
+  }
+
   bool WaitForRecovery(uint64_t /*abs_time_us*/) {
     InstrumentedMutexLock l(&mutex_);
     while (!recovery_complete_) {
@@ -138,6 +146,8 @@ class ErrorHandlerFSListener : public EventListener {
     file_creation_error_ = io_s;
   }
 
+  Status new_bg_error() { return new_bg_error_; }
+
  private:
   InstrumentedMutex mutex_;
   InstrumentedCondVar cv_;
@@ -148,6 +158,7 @@ class ErrorHandlerFSListener : public EventListener {
   int file_count_;
   IOStatus file_creation_error_;
   Status bg_error_;
+  Status new_bg_error_;
   FaultInjectionTestFS* fault_fs_;
 };
 
@@ -2428,6 +2439,42 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
   Close();
 }
 
+// Fail auto resume from a flush retryable error and verify that
+// OnErrorRecoveryEnd listener callback is called
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+
+  Destroy(options);
+}
+
 class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
                                    public testing::WithParamInterface {};
 
diff --git a/db/event_helpers.cc b/db/event_helpers.cc
index fd1c6f013..88bf8cc69 100644
--- a/db/event_helpers.cc
+++ b/db/event_helpers.cc
@@ -232,23 +232,30 @@ void EventHelpers::LogAndNotifyTableFileDeletion(
 #endif  // !ROCKSDB_LITE
 }
 
-void EventHelpers::NotifyOnErrorRecoveryCompleted(
+void EventHelpers::NotifyOnErrorRecoveryEnd(
     const std::vector<std::shared_ptr<EventListener>>& listeners,
-    Status old_bg_error, InstrumentedMutex* db_mutex) {
+    const Status& old_bg_error, const Status& new_bg_error,
+    InstrumentedMutex* db_mutex) {
 #ifndef ROCKSDB_LITE
   if (!listeners.empty()) {
     db_mutex->AssertHeld();
     // release lock while notifying events
     db_mutex->Unlock();
     for (auto& listener : listeners) {
+      BackgroundErrorRecoveryInfo info;
+      info.old_bg_error = old_bg_error;
+      info.new_bg_error = new_bg_error;
       listener->OnErrorRecoveryCompleted(old_bg_error);
+      listener->OnErrorRecoveryEnd(info);
+      info.old_bg_error.PermitUncheckedError();
+      info.new_bg_error.PermitUncheckedError();
     }
     db_mutex->Lock();
   }
-  old_bg_error.PermitUncheckedError();
 #else
   (void)listeners;
   (void)old_bg_error;
+  (void)new_bg_error;
   (void)db_mutex;
 #endif  // ROCKSDB_LITE
 }
diff --git a/db/event_helpers.h b/db/event_helpers.h
index f8b7f1d51..ad299670f 100644
--- a/db/event_helpers.h
+++ b/db/event_helpers.h
@@ -43,9 +43,10 @@ class EventHelpers {
       uint64_t file_number, const std::string& file_path,
       const Status& status, const std::string& db_name,
       const std::vector<std::shared_ptr<EventListener>>& listeners);
-  static void NotifyOnErrorRecoveryCompleted(
+  static void NotifyOnErrorRecoveryEnd(
       const std::vector<std::shared_ptr<EventListener>>& listeners,
-      Status bg_error, InstrumentedMutex* db_mutex);
+      const Status& old_bg_error, const Status& new_bg_error,
+      InstrumentedMutex* db_mutex);
 
 #ifndef ROCKSDB_LITE
   static void NotifyBlobFileCreationStarted(
diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h
index 304e8928d..1ffbc73a4 100644
--- a/include/rocksdb/listener.h
+++ b/include/rocksdb/listener.h
@@ -453,6 +453,16 @@ struct ExternalFileIngestionInfo {
   TableProperties table_properties;
 };
 
+// Result of auto background error recovery
+struct BackgroundErrorRecoveryInfo {
+  // The original error that triggered the recovery
+  Status old_bg_error;
+
+  // The final bg_error after all recovery attempts. Status::OK() means
+  // the recovery was successful and the database is fully operational.
+  Status new_bg_error;
+};
+
 struct IOErrorInfo {
   IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
               const std::string& _file_path, size_t _length, uint64_t _offset)
@@ -691,12 +701,21 @@ class EventListener : public Customizable {
                                Status /* bg_error */,
                                bool* /* auto_recovery */) {}
 
+  // DEPRECATED
   // A callback function for RocksDB which will be called once the database
   // is recovered from read-only mode after an error. When this is called, it
   // means normal writes to the database can be issued and the user can
   // initiate any further recovery actions needed
   virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
 
+  // A callback function for RocksDB which will be called once the recovery
+  // attempt from a background retryable error is completed. The recovery
+  // may have been successful or not. In either case, the callback is called
+  // with the old and new error. If info.new_bg_error is Status::OK(), that
+  // means the recovery succeeded.
+  virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
+  }
+
   // A callback function for RocksDB which will be called before
   // a blob file is being created. It will follow by OnBlobFileCreated after
   // the creation finishes.
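
For illustration only (not part of the patch): a minimal sketch of an
application-side listener that consumes the new callback. The class name
RecoveryEndLogger and the logging are assumptions; the OnErrorRecoveryEnd()
signature and the BackgroundErrorRecoveryInfo fields are those added to
include/rocksdb/listener.h above.

#include <iostream>

#include "rocksdb/listener.h"

class RecoveryEndLogger : public ROCKSDB_NAMESPACE::EventListener {
 public:
  const char* Name() const override { return "RecoveryEndLogger"; }

  void OnErrorRecoveryEnd(
      const ROCKSDB_NAMESPACE::BackgroundErrorRecoveryInfo& info) override {
    if (info.new_bg_error.ok()) {
      // Recovery succeeded; the DB accepts writes again.
      std::cout << "Recovered from: " << info.old_bg_error.ToString() << "\n";
    } else {
      // Recovery ended without success, e.g. Status::Aborted() once the
      // resume retry count is exhausted, or Status::ShutdownInProgress().
      // Manual intervention (DB::Resume() or a restart) may be needed.
      std::cout << "Recovery gave up: " << info.new_bg_error.ToString() << "\n";
    }
  }
};

// Typical registration before opening the DB:
//   options.listeners.emplace_back(std::make_shared<RecoveryEndLogger>());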