From eb5a8c06dde5c6b470e7b5dc5aa044cd647005a0 Mon Sep 17 00:00:00 2001 From: Zhichao Cao Date: Fri, 4 Dec 2020 14:57:29 -0800 Subject: [PATCH] Fix the thread wait case in error_handler (#7700) Summary: In the error_handler auto-recovery case, if recovery_in_prog_ is false, the recovery has either finished or failed. In this case, the auto-recovery thread should have finished its execution, so recovery_thread_ should be null. However, in some cases it is not null; the caller should then not return directly. Instead, it should wait for the previous recovery thread to finish and create a new thread to execute the new recovery. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7700 Test Plan: make check, error_handler_fs_test Reviewed By: anand1976 Differential Revision: D25098233 Pulled By: zhichao-cao fbshipit-source-id: 5a1cba234ca18f6dd5d1be88e02d66e1d5ce931b --- db/error_handler.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/db/error_handler.cc b/db/error_handler.cc index 26b4b909d..f2aabde5f 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -584,12 +584,20 @@ Status ErrorHandler::StartRecoverFromRetryableBGIOError(IOStatus io_error) { if (bg_error_.ok() || io_error.ok()) { return Status::OK(); } - if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_ || - recovery_thread_) { + if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { // Auto resume BG error is not enabled, directly return bg_error_. return bg_error_; } + if (recovery_thread_) { + // In this case, if recovery_in_prog_ is false, current thread should + // wait the previous recover thread to finish and create a new thread + // to recover from the bg error. + db_mutex_->Unlock(); + recovery_thread_->join(); + db_mutex_->Lock(); + } + recovery_in_prog_ = true; recovery_thread_.reset( new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));