diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index a517a2015..94182c50d 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1555,9 +1555,7 @@ Status CompactionJob::FinishCompactionOutputFile( "CompactionJob::FinishCompactionOutputFile:" "MaxAllowedSpaceReached"); InstrumentedMutexLock l(db_mutex_); - // Should handle return error? - db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); } } #endif diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 0a7722692..b43dfe23d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4607,8 +4607,7 @@ Status DBImpl::IngestExternalFiles( // TODO: distinguish between MANIFEST write and CURRENT renaming const IOStatus& io_s = versions_->io_status(); // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index ee5756cf5..7255f6d30 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -130,12 +130,10 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { } if (!io_s.ok()) { if (total_log_size_ > 0) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); } else { // If the WAL is empty, we use different error reason - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); return io_s; @@ -192,8 +190,7 @@ Status DBImpl::FlushMemTableToOutputFile( s = io_s; if (!io_s.ok() && !io_s.IsShutdownInProgress() && !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); } } else { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); @@ -253,31 +250,23 @@ Status DBImpl::FlushMemTableToOutputFile( // be pessimistic and try write to a new MANIFEST. // TODO: distinguish between MANIFEST write and CURRENT renaming if (!versions_->io_status().ok()) { - // Should handle return error? if (total_log_size_ > 0) { // If the WAL is empty, we use different error reason - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); } else { - error_handler_ - .SetBGError(io_s, BackgroundErrorReason::kManifestWriteNoWAL) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); } } else if (total_log_size_ > 0) { - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); } else { // If the WAL is empty, we use different error reason - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } } else { Status new_bg_error = s; - // Should handle return error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } } else { // If we got here, then we decided not to care about the i_os status (either @@ -302,9 +291,7 @@ Status DBImpl::FlushMemTableToOutputFile( TEST_SYNC_POINT_CALLBACK( "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", &new_bg_error); - // Should handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } } #endif // ROCKSDB_LITE @@ -645,9 +632,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( error_handler_.GetBGError().ok()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); - // Should Handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, + BackgroundErrorReason::kFlush); } } } @@ -664,31 +650,23 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // be pessimistic and try write to a new MANIFEST. // TODO: distinguish between MANIFEST write and CURRENT renaming if (!versions_->io_status().ok()) { - // Should handle return error? if (total_log_size_ > 0) { // If the WAL is empty, we use different error reason - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); } else { - error_handler_ - .SetBGError(io_s, BackgroundErrorReason::kManifestWriteNoWAL) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); } } else if (total_log_size_ > 0) { - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); } else { // If the WAL is empty, we use different error reason - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } } else { Status new_bg_error = s; - // Should Handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } } @@ -1266,11 +1244,9 @@ Status DBImpl::CompactFilesImpl( job_context->job_id, status.ToString().c_str()); IOStatus io_s = compaction_job.io_status(); if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); } else { - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } } @@ -3121,10 +3097,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, auto err_reason = versions_->io_status().ok() ? BackgroundErrorReason::kCompaction : BackgroundErrorReason::kManifestWrite; - error_handler_.SetBGError(io_s, err_reason).PermitUncheckedError(); + error_handler_.SetBGError(io_s, err_reason); } else { - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { // Put this cfd back in the compaction queue so we can retry after some diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index c21c9fa8f..4825fb6ef 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -170,7 +170,7 @@ Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - (error_handler_.GetBGError() == Status::OK())) { + (error_handler_.GetBGError().ok())) { bg_cv_.Wait(); } return error_handler_.GetBGError(); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 02ef5671e..1a459c36a 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -850,8 +850,7 @@ void DBImpl::WriteStatusCheckOnLocked(const Status& status) { if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { // Maybe change the return status to void? - error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); } } @@ -863,8 +862,7 @@ void DBImpl::WriteStatusCheck(const Status& status) { !status.IsBusy() && !status.IsIncomplete()) { mutex_.Lock(); // Maybe change the return status to void? - error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -877,8 +875,7 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) { io_status.IsIOFenced()) { mutex_.Lock(); // Maybe change the return status to void? - error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -1810,15 +1807,10 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { } // We may have lost data from the WritableFileBuffer in-memory buffer for // the current log, so treat it as a fatal error and set bg_error - // Should handle return error? if (!io_s.ok()) { - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable); } else { - // Should handle return error? - error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable) - .PermitUncheckedError(); + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); } // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); diff --git a/db/error_handler.cc b/db/error_handler.cc index f2aabde5f..f121519f4 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -267,10 +267,11 @@ void ErrorHandler::CancelErrorRecovery() { // This can also get called as part of a recovery operation. In that case, we // also track the error separately in recovery_error_ so we can tell in the // end whether recovery succeeded or not -Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { +const Status& ErrorHandler::SetBGError(const Status& bg_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); if (bg_err.ok()) { - return Status::OK(); + return bg_err; } bool paranoid = db_options_.paranoid_checks; @@ -363,11 +364,11 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reas // c) all other errors are mapped to hard error. // 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason // reason) will be called to handle other error cases. -Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, - BackgroundErrorReason reason) { +const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); if (bg_io_err.ok()) { - return Status::OK(); + return bg_io_err; } ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", bg_io_err.ToString().c_str()); @@ -382,7 +383,6 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, } Status new_bg_io_err = bg_io_err; - Status s; DBRecoverContext context; if (bg_io_err.GetDataLoss()) { // First, data loss is treated as unrecoverable error. So it can directly @@ -393,8 +393,8 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, if (recovery_in_prog_ && recovery_error_.ok()) { recovery_error_ = bg_err; } - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, - db_mutex_, &auto_recovery); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &bg_err, db_mutex_, &auto_recovery); recover_context_ = context; return bg_error_; } else if (bg_io_err.GetRetryable()) { @@ -405,8 +405,9 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, // soft error. In other cases, treat the retryable IO error as hard // error. bool auto_recovery = false; - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, - db_mutex_, &auto_recovery); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &new_bg_io_err, db_mutex_, + &auto_recovery); if (BackgroundErrorReason::kCompaction == reason) { Status bg_err(new_bg_io_err, Status::Severity::kSoftError); if (bg_err.severity() > bg_error_.severity()) { @@ -446,12 +447,11 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, return StartRecoverFromRetryableBGIOError(bg_io_err); } } else { - s = SetBGError(new_bg_io_err, reason); + return SetBGError(new_bg_io_err, reason); } - return s; } -Status ErrorHandler::OverrideNoSpaceError(Status bg_error, +Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery) { #ifndef ROCKSDB_LITE if (bg_error.severity() >= Status::Severity::kFatalError) { @@ -507,7 +507,11 @@ Status ErrorHandler::ClearBGError() { // Signal that recovery succeeded if (recovery_error_.ok()) { Status old_bg_error = bg_error_; + // Clear and check the recovery IO and BG error bg_error_ = Status::OK(); + recovery_io_error_ = IOStatus::OK(); + bg_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); recovery_in_prog_ = false; soft_error_no_bg_work_ = false; EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, @@ -557,6 +561,7 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { // during the recovery process. While recovering, the only operations that // can generate background errors should be the flush operations recovery_error_ = Status::OK(); + recovery_error_.PermitUncheckedError(); Status s = db_->ResumeImpl(recover_context_); if (s.ok()) { soft_error_no_bg_work_ = false; @@ -578,13 +583,15 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { #endif } -Status ErrorHandler::StartRecoverFromRetryableBGIOError(IOStatus io_error) { +const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( + const IOStatus& io_error) { #ifndef ROCKSDB_LITE db_mutex_->AssertHeld(); - if (bg_error_.ok() || io_error.ok()) { - return Status::OK(); - } - if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { + if (bg_error_.ok()) { + return bg_error_; + } else if (io_error.ok()) { + return io_error; + } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { // Auto resume BG error is not enabled, directly return bg_error_. return bg_error_; } @@ -603,7 +610,7 @@ Status ErrorHandler::StartRecoverFromRetryableBGIOError(IOStatus io_error) { new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); if (recovery_io_error_.ok() && recovery_error_.ok()) { - return Status::OK(); + return recovery_error_; } else { TEST_SYNC_POINT("StartRecoverRetryableBGIOError:RecoverFail"); return bg_error_; @@ -668,6 +675,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); Status old_bg_error = bg_error_; bg_error_ = Status::OK(); + bg_error_.PermitUncheckedError(); EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, old_bg_error, db_mutex_); recovery_in_prog_ = false; diff --git a/db/error_handler.h b/db/error_handler.h index 084434101..acd09514e 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -31,17 +31,14 @@ class ErrorHandler { InstrumentedMutex* db_mutex) : db_(db), db_options_(db_options), - bg_error_(Status::OK()), - recovery_error_(Status::OK()), - recovery_io_error_(IOStatus::OK()), cv_(db_mutex), end_recovery_(false), recovery_thread_(nullptr), db_mutex_(db_mutex), auto_recovery_(false), recovery_in_prog_(false), - soft_error_no_bg_work_(false) {} - ~ErrorHandler() { + soft_error_no_bg_work_(false) { + // Clear the checked flag for uninitialized errors bg_error_.PermitUncheckedError(); recovery_error_.PermitUncheckedError(); recovery_io_error_.PermitUncheckedError(); @@ -53,13 +50,14 @@ class ErrorHandler { Status::Code code, Status::SubCode subcode); - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); - Status SetBGError(const IOStatus& bg_io_err, BackgroundErrorReason reason); + const Status& SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason); - Status GetBGError() { return bg_error_; } + Status GetBGError() const { return bg_error_; } - Status GetRecoveryError() { return recovery_error_; } + Status GetRecoveryError() const { return recovery_error_; } Status ClearBGError(); @@ -110,9 +108,9 @@ class ErrorHandler { // Used to store the context for recover, such as flush reason. DBRecoverContext recover_context_; - Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); void RecoverFromNoSpace(); - Status StartRecoverFromRetryableBGIOError(IOStatus io_error); + const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); void RecoverFromRetryableBGIOError(); };