cleanup error_handler related code (#9098)

Summary:
Remove code not in use, add comments, remove redundant code.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9098

Test Plan: make check

Reviewed By: anand1976

Differential Revision: D32027219

Pulled By: zhichao-cao

fbshipit-source-id: 253aae926c87726268af6c027bf805dc9156c8a8
main
Zhichao Cao 3 years ago committed by Facebook GitHub Bot
parent a7478070f3
commit efaef9b40a
  1. 36
      db/error_handler.cc
  2. 4
      db/error_handler.h
  3. 9
      db/error_handler_fs_test.cc

@ -406,10 +406,7 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// it can directly overwrite any existing bg_error_. // it can directly overwrite any existing bg_error_.
bool auto_recovery = false; bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
bg_error_ = bg_err; CheckAndSetRecoveryAndBGError(bg_err);
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_error_stats_ != nullptr) { if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
@ -468,24 +465,14 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// to many small memtable being generated during auto resume, the flush // to many small memtable being generated during auto resume, the flush
// reason is set to kErrorRecoveryRetryFlush. // reason is set to kErrorRecoveryRetryFlush.
Status bg_err(new_bg_io_err, Status::Severity::kSoftError); Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
if (recovery_in_prog_ && recovery_error_.ok()) { CheckAndSetRecoveryAndBGError(bg_err);
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
soft_error_no_bg_work_ = true; soft_error_no_bg_work_ = true;
context.flush_reason = FlushReason::kErrorRecoveryRetryFlush; context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
recover_context_ = context; recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err); return StartRecoverFromRetryableBGIOError(bg_io_err);
} else { } else {
Status bg_err(new_bg_io_err, Status::Severity::kHardError); Status bg_err(new_bg_io_err, Status::Severity::kHardError);
if (recovery_in_prog_ && recovery_error_.ok()) { CheckAndSetRecoveryAndBGError(bg_err);
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
recover_context_ = context; recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err); return StartRecoverFromRetryableBGIOError(bg_io_err);
} }
@ -663,7 +650,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
if (recovery_io_error_.ok() && recovery_error_.ok()) { if (recovery_io_error_.ok() && recovery_error_.ok()) {
return recovery_error_; return recovery_error_;
} else { } else {
TEST_SYNC_POINT("StartRecoverRetryableBGIOError:RecoverFail");
return bg_error_; return bg_error_;
} }
#else #else
@ -677,7 +663,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
void ErrorHandler::RecoverFromRetryableBGIOError() { void ErrorHandler::RecoverFromRetryableBGIOError() {
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart1");
InstrumentedMutexLock l(db_mutex_); InstrumentedMutexLock l(db_mutex_);
if (end_recovery_) { if (end_recovery_) {
return; return;
@ -697,8 +682,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
recovery_error_ = Status::OK(); recovery_error_ = Status::OK();
retry_count++; retry_count++;
Status s = db_->ResumeImpl(context); Status s = db_->ResumeImpl(context);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1");
if (bg_error_stats_ != nullptr) { if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT); ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
@ -707,7 +690,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
bg_error_.severity() >= Status::Severity::kFatalError) { bg_error_.severity() >= Status::Severity::kFatalError) {
// If DB shutdown in progress or the error severity is higher than // If DB shutdown in progress or the error severity is higher than
// Hard Error, stop auto resume and returns. // Hard Error, stop auto resume and returns.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0");
recovery_in_prog_ = false; recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) { if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(), RecordInHistogram(bg_error_stats_.get(),
@ -725,7 +707,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
cv_.TimedWait(wait_until); cv_.TimedWait(wait_until);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterWait0");
} else { } else {
// There are three possibility: 1) recover_io_error is set during resume // There are three possibility: 1) recover_io_error is set during resume
// and the error is not retryable, 2) recover is successful, 3) other // and the error is not retryable, 2) recover is successful, 3) other
@ -751,7 +732,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
} }
return; return;
} else { } else {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail1");
// In this case: 1) recovery_io_error is more serious or not retryable // In this case: 1) recovery_io_error is more serious or not retryable
// 2) other Non IO recovery_error happens. The auto recovery stops. // 2) other Non IO recovery_error happens. The auto recovery stops.
recovery_in_prog_ = false; recovery_in_prog_ = false;
@ -776,6 +756,16 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
#endif #endif
} }
void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
return;
}
void ErrorHandler::EndAutoRecovery() { void ErrorHandler::EndAutoRecovery() {
db_mutex_->AssertHeld(); db_mutex_->AssertHeld();
if (!end_recovery_) { if (!end_recovery_) {

@ -116,6 +116,10 @@ class ErrorHandler {
void RecoverFromNoSpace(); void RecoverFromNoSpace();
const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
void RecoverFromRetryableBGIOError(); void RecoverFromRetryableBGIOError();
// First, if it is in recovery and the recovery_error is ok. Set the
// recovery_error_ to bg_err. Second, if the severity is higher than the
// current bg_error_, overwrite it.
void CheckAndSetRecoveryAndBGError(const Status& bg_err);
}; };
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

@ -487,6 +487,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) {
Destroy(options); Destroy(options);
} }
// The flush error is injected before we finish the table build
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
std::shared_ptr<ErrorHandlerFSListener> listener( std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener()); new ErrorHandlerFSListener());
@ -542,6 +543,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
Destroy(options); Destroy(options);
} }
// The retryable IO error is injected before we sync table
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
std::shared_ptr<ErrorHandlerFSListener> listener( std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener()); new ErrorHandlerFSListener());
@ -585,6 +587,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
Destroy(options); Destroy(options);
} }
// The retryable IO error is injected before we close the table file
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
std::shared_ptr<ErrorHandlerFSListener> listener( std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener()); new ErrorHandlerFSListener());
@ -1874,6 +1877,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
Destroy(options); Destroy(options);
} }
// Auto resume fromt the flush retryable IO error. Activate the FS before the
// first resume. Resume is successful
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
// Activate the FS before the first resume // Activate the FS before the first resume
std::shared_ptr<ErrorHandlerFSListener> listener( std::shared_ptr<ErrorHandlerFSListener> listener(
@ -1914,6 +1919,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
Destroy(options); Destroy(options);
} }
// Auto resume fromt the flush retryable IO error and set the retry limit count.
// Never activate the FS and auto resume should fail at the end
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
// Fail all the resume and let user to resume // Fail all the resume and let user to resume
std::shared_ptr<ErrorHandlerFSListener> listener( std::shared_ptr<ErrorHandlerFSListener> listener(
@ -1963,6 +1970,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
Destroy(options); Destroy(options);
} }
// Auto resume fromt the flush retryable IO error and set the retry limit count.
// Fail the first resume and let the second resume be successful.
TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
// Fail the first resume and let the second resume be successful // Fail the first resume and let the second resume be successful
std::shared_ptr<ErrorHandlerFSListener> listener( std::shared_ptr<ErrorHandlerFSListener> listener(

Loading…
Cancel
Save