From ecf2bec613a63745a058ae74170f2a989840f999 Mon Sep 17 00:00:00 2001
From: anand76
Date: Wed, 8 Dec 2021 14:22:41 -0800
Subject: [PATCH] Add a listener callback for end of auto error recovery
 (#9244)

Summary:
Previously, the OnErrorRecoveryCompleted callback was called when RocksDB
was able to successfully recover from a retryable error. However, if the
recovery failed and was eventually stopped, there was no indication of the
status. To fix that, a new OnErrorRecoveryEnd callback is introduced that
deprecates the OnErrorRecoveryCompleted callback. The new callback is called
with the original error and the new error status.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9244

Test Plan: Add a new unit test in error_handler_fs_test

Reviewed By: zhichao-cao

Differential Revision: D32922303

Pulled By: anand1976

fbshipit-source-id: f04e77a9cb92c5ea6385590682d3fcf559971b99
---
 HISTORY.md                  |  1 +
 db/error_handler.cc         | 27 +++++++++++++++++----
 db/error_handler_fs_test.cc | 47 +++++++++++++++++++++++++++++++++++++
 db/event_helpers.cc         | 13 +++++++---
 db/event_helpers.h          |  5 ++--
 include/rocksdb/listener.h  | 19 +++++++++++++++
 6 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index c795b62e9..edeb54f99 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 * MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371.
 ### Public API change
 * Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
+* Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
 ### Performance Improvements
 * Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory.

diff --git a/db/error_handler.cc b/db/error_handler.cc
index 925dd87ff..ba8a8a91f 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -540,6 +540,8 @@ Status ErrorHandler::ClearBGError() {
   // Signal that recovery succeeded
   if (recovery_error_.ok()) {
     Status old_bg_error = bg_error_;
+    // old_bg_error is only for notifying listeners, so may not be checked
+    old_bg_error.PermitUncheckedError();
     // Clear and check the recovery IO and BG error
     bg_error_ = Status::OK();
     recovery_io_error_ = IOStatus::OK();
@@ -547,8 +549,8 @@
     recovery_io_error_.PermitUncheckedError();
     recovery_in_prog_ = false;
     soft_error_no_bg_work_ = false;
-    EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
-                                                 old_bg_error, db_mutex_);
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+                                           bg_error_, db_mutex_);
   }
   return recovery_error_;
 #else
@@ -665,6 +667,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
   TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
   InstrumentedMutexLock l(db_mutex_);
   if (end_recovery_) {
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                           Status::ShutdownInProgress(),
+                                           db_mutex_);
     return;
   }
   DBRecoverContext context = recover_context_;
@@ -674,6 +679,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
   // Recover from the retryable error. Create a separate thread to do it.
   while (resume_count > 0) {
     if (end_recovery_) {
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             Status::ShutdownInProgress(),
+                                             db_mutex_);
       return;
     }
     TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
@@ -695,6 +703,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
         RecordInHistogram(bg_error_stats_.get(),
                           ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
       }
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             bg_error_, db_mutex_);
       return;
     }
     if (!recovery_io_error_.ok() &&
@@ -718,8 +728,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
         Status old_bg_error = bg_error_;
         bg_error_ = Status::OK();
         bg_error_.PermitUncheckedError();
-        EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
-                                                     old_bg_error, db_mutex_);
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
         if (bg_error_stats_ != nullptr) {
           RecordTick(bg_error_stats_.get(),
                      ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
@@ -739,12 +749,21 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
          RecordInHistogram(bg_error_stats_.get(),
                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
        }
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, bg_error_,
+            !recovery_io_error_.ok()
+                ? recovery_io_error_
+                : (!recovery_error_.ok() ? recovery_error_ : s),
+            db_mutex_);
        return;
      }
    }
    resume_count--;
  }
  recovery_in_prog_ = false;
+  EventHelpers::NotifyOnErrorRecoveryEnd(
+      db_options_.listeners, bg_error_,
+      Status::Aborted("Exceeded resume retry count"), db_mutex_);
  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
  if (bg_error_stats_ != nullptr) {
    RecordInHistogram(bg_error_stats_.get(),
diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc
index 3dff3ee36..3283ffcf0 100644
--- a/db/error_handler_fs_test.cc
+++ b/db/error_handler_fs_test.cc
@@ -66,6 +66,7 @@ class ErrorHandlerFSListener : public EventListener {
   ~ErrorHandlerFSListener() {
     file_creation_error_.PermitUncheckedError();
     bg_error_.PermitUncheckedError();
+    new_bg_error_.PermitUncheckedError();
   }
 
   void OnTableFileCreationStarted(
@@ -96,6 +97,13 @@ class ErrorHandlerFSListener : public EventListener {
     old_bg_error.PermitUncheckedError();
   }
 
+  void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
+    InstrumentedMutexLock l(&mutex_);
+    recovery_complete_ = true;
+    cv_.SignalAll();
+    new_bg_error_ = info.new_bg_error;
+  }
+
   bool WaitForRecovery(uint64_t /*abs_time_us*/) {
     InstrumentedMutexLock l(&mutex_);
     while (!recovery_complete_) {
@@ -138,6 +146,8 @@ class ErrorHandlerFSListener : public EventListener {
     file_creation_error_ = io_s;
   }
 
+  Status new_bg_error() { return new_bg_error_; }
+
  private:
   InstrumentedMutex mutex_;
   InstrumentedCondVar cv_;
@@ -148,6 +158,7 @@ class ErrorHandlerFSListener : public EventListener {
   int file_count_;
   IOStatus file_creation_error_;
   Status bg_error_;
+  Status new_bg_error_;
   FaultInjectionTestFS* fault_fs_;
 };
 
@@ -2428,6 +2439,42 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
   Close();
 }
 
+// Fail auto resume from a flush retryable error and verify that
+// OnErrorRecoveryEnd listener callback is called
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+
+  Destroy(options);
+}
+
 class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
                                    public testing::WithParamInterface {};
 
diff --git a/db/event_helpers.cc b/db/event_helpers.cc
index fd1c6f013..88bf8cc69 100644
--- a/db/event_helpers.cc
+++ b/db/event_helpers.cc
@@ -232,23 +232,30 @@ void EventHelpers::LogAndNotifyTableFileDeletion(
 #endif  // !ROCKSDB_LITE
 }
 
-void EventHelpers::NotifyOnErrorRecoveryCompleted(
+void EventHelpers::NotifyOnErrorRecoveryEnd(
     const std::vector<std::shared_ptr<EventListener>>& listeners,
-    Status old_bg_error, InstrumentedMutex* db_mutex) {
+    const Status& old_bg_error, const Status& new_bg_error,
+    InstrumentedMutex* db_mutex) {
 #ifndef ROCKSDB_LITE
   if (!listeners.empty()) {
     db_mutex->AssertHeld();
     // release lock while notifying events
     db_mutex->Unlock();
     for (auto& listener : listeners) {
+      BackgroundErrorRecoveryInfo info;
+      info.old_bg_error = old_bg_error;
+      info.new_bg_error = new_bg_error;
       listener->OnErrorRecoveryCompleted(old_bg_error);
+      listener->OnErrorRecoveryEnd(info);
+      info.old_bg_error.PermitUncheckedError();
+      info.new_bg_error.PermitUncheckedError();
     }
     db_mutex->Lock();
   }
-  old_bg_error.PermitUncheckedError();
 #else
   (void)listeners;
   (void)old_bg_error;
+  (void)new_bg_error;
   (void)db_mutex;
 #endif  // ROCKSDB_LITE
 }
diff --git a/db/event_helpers.h b/db/event_helpers.h
index f8b7f1d51..ad299670f 100644
--- a/db/event_helpers.h
+++ b/db/event_helpers.h
@@ -43,9 +43,10 @@ class EventHelpers {
       uint64_t file_number, const std::string& file_path,
       const Status& status, const std::string& db_name,
       const std::vector<std::shared_ptr<EventListener>>& listeners);
-  static void NotifyOnErrorRecoveryCompleted(
+  static void NotifyOnErrorRecoveryEnd(
       const std::vector<std::shared_ptr<EventListener>>& listeners,
-      Status bg_error, InstrumentedMutex* db_mutex);
+      const Status& old_bg_error, const Status& new_bg_error,
+      InstrumentedMutex* db_mutex);
 
 #ifndef ROCKSDB_LITE
   static void NotifyBlobFileCreationStarted(
diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h
index 304e8928d..1ffbc73a4 100644
--- a/include/rocksdb/listener.h
+++ b/include/rocksdb/listener.h
@@ -453,6 +453,16 @@ struct ExternalFileIngestionInfo {
   TableProperties table_properties;
 };
 
+// Result of auto background error recovery
+struct BackgroundErrorRecoveryInfo {
+  // The original error that triggered the recovery
+  Status old_bg_error;
+
+  // The final bg_error after all recovery attempts. Status::OK() means
+  // the recovery was successful and the database is fully operational.
+  Status new_bg_error;
+};
+
 struct IOErrorInfo {
   IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
               const std::string& _file_path, size_t _length, uint64_t _offset)
@@ -691,12 +701,21 @@ class EventListener : public Customizable {
                                Status /* bg_error */,
                                bool* /* auto_recovery */) {}
 
+  // DEPRECATED
   // A callback function for RocksDB which will be called once the database
   // is recovered from read-only mode after an error. When this is called, it
   // means normal writes to the database can be issued and the user can
   // initiate any further recovery actions needed
   virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
 
+  // A callback function for RocksDB which will be called once the recovery
+  // attempt from a background retryable error is completed. The recovery
+  // may have been successful or not. In either case, the callback is called
+  // with the old and new error. If info.new_bg_error is Status::OK(), that
+  // means the recovery succeeded.
+  virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
+  }
+
   // A callback function for RocksDB which will be called before
   // a blob file is being created. It will follow by OnBlobFileCreated after
   // the creation finishes.
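
For illustration only (not part of the patch): a minimal sketch of an
application-side listener that consumes the new callback. The class name
RecoveryEndLogger and the logging are assumptions; the OnErrorRecoveryEnd()
signature and the BackgroundErrorRecoveryInfo fields are those added to
include/rocksdb/listener.h above.

#include <iostream>

#include "rocksdb/listener.h"

class RecoveryEndLogger : public ROCKSDB_NAMESPACE::EventListener {
 public:
  const char* Name() const override { return "RecoveryEndLogger"; }

  void OnErrorRecoveryEnd(
      const ROCKSDB_NAMESPACE::BackgroundErrorRecoveryInfo& info) override {
    if (info.new_bg_error.ok()) {
      // Recovery succeeded; the DB accepts writes again.
      std::cout << "Recovered from: " << info.old_bg_error.ToString() << "\n";
    } else {
      // Recovery ended without success, e.g. Status::Aborted() once the
      // resume retry count is exhausted, or Status::ShutdownInProgress().
      // Manual intervention (DB::Resume() or a restart) may be needed.
      std::cout << "Recovery gave up: " << info.new_bg_error.ToString() << "\n";
    }
  }
};

// Typical registration before opening the DB:
//   options.listeners.emplace_back(std::make_shared<RecoveryEndLogger>());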