Add statistics and info log for the error handler (#8050)

Summary:
Add statistics and an info log for the error handler: counters for bg error, bg io error, bg retryable io error, auto resume, auto resume total retry, and auto resume success; a histogram for the auto resume retry count in each recovery call.
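For context, a minimal usage sketch (not part of this diff) of how an application could read the new tickers and histogram once Options::statistics is set. It mirrors what the new assertions in error_handler_fs_test.cc check; the DB path is made up for illustration.

#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  // Attach a Statistics object; the error handler records its counters and
  // the auto-resume retry histogram into it.
  options.statistics = CreateDBStatistics();

  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/error_handler_stats_demo", &db);
  if (!s.ok()) {
    std::cerr << s.ToString() << std::endl;
    return 1;
  }

  // ... run a workload; if background errors occur and are handled,
  // the error handler bumps the counters queried below ...

  // Counters added by this change. bg_io_error is a subset of bg_error,
  // and bg_retryable_io_error is a subset of bg_io_error.
  uint64_t bg_errors =
      options.statistics->getTickerCount(ERROR_HANDLER_BG_ERROR_COUNT);
  uint64_t resumed =
      options.statistics->getTickerCount(ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);

  // Histogram added by this change: number of retries per recovery call.
  HistogramData retries_per_recovery;
  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
                                    &retries_per_recovery);

  std::cout << "bg errors: " << bg_errors
            << ", successful auto resumes: " << resumed
            << ", max retries in one recovery: " << retries_per_recovery.max
            << std::endl;

  delete db;
  return 0;
}

The tests added in this PR query the same counters with getAndResetTickerCount(), which also clears the ticker after reading.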

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8050

Test Plan: make check and add tests to error_handler_fs_test

Reviewed By: anand1976

Differential Revision: D26990565

Pulled By: zhichao-cao

fbshipit-source-id: 49f71e8ea4e9db8b189943976404205b56ab883f
Branch: main
Author: Zhichao Cao (committed by Facebook GitHub Bot)
Parent: 27d57a035e
Commit: 08ec5e7321
Changed files (lines changed):
1. HISTORY.md (1)
2. db/error_handler.cc (63)
3. db/error_handler.h (6)
4. db/error_handler_fs_test.cc (96)
5. include/rocksdb/statistics.h (12)
6. java/rocksjni/portal.h (35)
7. java/src/main/java/org/rocksdb/HistogramType.java (5)
8. java/src/main/java/org/rocksdb/TickerType.java (10)
9. monitoring/statistics.cc (12)

@@ -21,6 +21,7 @@
* Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1.
* Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage.
* Enable backward iteration on keys with user-defined timestamps.
* Add statistics and info log for error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume success; a histogram for the auto resume retry count in each recovery call. Note that each auto resume attempt may have one or multiple retries.
## 6.18.0 (02/19/2021)
### Behavior Changes

@@ -4,9 +4,11 @@
// (found in the LICENSE.Apache file in the root directory).
//
#include "db/error_handler.h"
#include "db/db_impl/db_impl.h"
#include "db/event_helpers.h"
#include "file/sst_file_manager_impl.h"
#include "logging/logging.h"
namespace ROCKSDB_NAMESPACE {
@@ -274,6 +276,12 @@ const Status& ErrorHandler::SetBGError(const Status& bg_err,
return bg_err;
}
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
}
ROCKS_LOG_INFO(db_options_.info_log,
"ErrorHandler: Set regular background error\n");
bool paranoid = db_options_.paranoid_checks;
Status::Severity sev = Status::Severity::kFatalError;
Status new_bg_err;
@@ -399,6 +407,13 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
}
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Set background IO error as unrecoverable error\n");
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
&bg_err, db_mutex_, &auto_recovery);
recover_context_ = context;
@@ -416,12 +431,26 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
&new_bg_io_err, db_mutex_,
&auto_recovery);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
}
ROCKS_LOG_INFO(db_options_.info_log,
"ErrorHandler: Set background retryable IO error\n");
if (BackgroundErrorReason::kCompaction == reason) {
// We map the retryable IO error during compaction to soft error. Since
// compaction can reschedule by itself. We will not set the BG error in
// this case
// TODO: a better way to set or clean the retryable IO error which
// happens during compaction SST file write.
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
}
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Compaction will schedule by itself to resume\n");
return bg_error_;
} else if (BackgroundErrorReason::kFlushNoWAL == reason ||
BackgroundErrorReason::kManifestWriteNoWAL == reason) {
@@ -455,6 +484,9 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
return StartRecoverFromRetryableBGIOError(bg_io_err);
}
} else {
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
}
return SetBGError(new_bg_io_err, reason);
}
}
@@ -603,7 +635,12 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
// Auto resume BG error is not enabled, directly return bg_error_.
return bg_error_;
}
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
}
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
if (recovery_thread_) {
// In this case, if recovery_in_prog_ is false, current thread should
// wait the previous recover thread to finish and create a new thread
@@ -642,6 +679,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
DBRecoverContext context = recover_context_;
int resume_count = db_options_.max_bgerror_resume_count;
uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
uint64_t retry_count = 0;
// Recover from the retryable error. Create a separate thread to do it.
while (resume_count > 0) {
if (end_recovery_) {
@@ -651,15 +689,24 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
recovery_io_error_ = IOStatus::OK();
recovery_error_ = Status::OK();
retry_count++;
Status s = db_->ResumeImpl(context);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1");
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
}
if (s.IsShutdownInProgress() ||
bg_error_.severity() >= Status::Severity::kFatalError) {
// If DB shutdown in progress or the error severity is higher than
// Hard Error, stop auto resume and returns.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0");
recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
return;
}
if (!recovery_io_error_.ok() &&
@@ -686,6 +733,12 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
bg_error_.PermitUncheckedError();
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
old_bg_error, db_mutex_);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
recovery_in_prog_ = false;
if (soft_error_no_bg_work_) {
soft_error_no_bg_work_ = false;
@@ -696,6 +749,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
// In this case: 1) recovery_io_error is more serious or not retryable
// 2) other Non IO recovery_error happens. The auto recovery stops.
recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
return;
}
}
@@ -703,6 +760,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
}
recovery_in_prog_ = false;
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
return;
#else
return;

@@ -37,7 +37,8 @@ class ErrorHandler {
db_mutex_(db_mutex),
auto_recovery_(false),
recovery_in_prog_(false),
soft_error_no_bg_work_(false),
bg_error_stats_(db_options.statistics) {
// Clear the checked flag for uninitialized errors
bg_error_.PermitUncheckedError();
recovery_error_.PermitUncheckedError();
@@ -108,6 +109,9 @@ class ErrorHandler {
// Used to store the context for recover, such as flush reason.
DBRecoverContext recover_context_;
// The pointer of DB statistics.
std::shared_ptr<Statistics> bg_error_stats_;
Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
void RecoverFromNoSpace();
const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);

@@ -158,6 +158,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
options.env = fault_env_.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.statistics = CreateDBStatistics();
Status s;
listener->EnableAutoRecovery(false);
@@ -174,13 +175,25 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
fault_fs_->SetFilesystemActive(true);
s = dbfull()->Resume();
ASSERT_OK(s);
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_IO_ERROR_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
Reopen(options);
ASSERT_EQ("val", Get(Key(0)));
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
@@ -188,6 +201,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 0;
options.statistics = CreateDBStatistics();
Status s;
listener->EnableAutoRecovery(false);
@@ -207,6 +221,18 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
fault_fs_->SetFilesystemActive(true);
s = dbfull()->Resume();
ASSERT_OK(s);
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_IO_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
Reopen(options);
ASSERT_EQ("val1", Get(Key(1)));
@@ -241,7 +267,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
@@ -325,7 +351,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritFileScopeError) {
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError1) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
@@ -333,6 +359,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 0;
options.statistics = CreateDBStatistics();
Status s;
listener->EnableAutoRecovery(false);
@@ -363,11 +390,23 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
s = Flush();
ASSERT_OK(s);
ASSERT_EQ("val3", Get(Key(3)));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_IO_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
@@ -410,7 +449,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) {
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
@@ -1010,6 +1049,7 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) {
options.env = fault_env_.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.statistics = CreateDBStatistics();
Status s;
listener->EnableAutoRecovery();
@@ -1028,6 +1068,18 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) {
s = Put(Key(1), "val");
ASSERT_OK(s);
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_IO_ERROR_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
Reopen(options);
ASSERT_EQ("val", Get(Key(0)));
@@ -1567,6 +1619,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 2;
options.bgerror_resume_retry_interval = 100000;  // 0.1 second
options.statistics = CreateDBStatistics();
Status s;
listener->EnableAutoRecovery(false);
@@ -1594,6 +1647,22 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
ASSERT_EQ("val1", Get(Key(1)));
SyncPoint::GetInstance()->DisableProcessing();
fault_fs_->SetFilesystemActive(true);
ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_ERROR_COUNT));
ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_IO_ERROR_COUNT));
ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_COUNT));
ASSERT_EQ(2, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
HistogramData autoresume_retry;
options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
&autoresume_retry);
ASSERT_EQ(autoresume_retry.max, 2);
ASSERT_OK(Put(Key(2), "val2", wo));
s = Flush();
// Since auto resume fails, the bg error is not cleaned, flush will
@@ -1620,6 +1689,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 2;
options.bgerror_resume_retry_interval = 100000;  // 0.1 second
options.statistics = CreateDBStatistics();
Status s;
listener->EnableAutoRecovery(false);
@@ -1643,6 +1713,22 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
fault_fs_->SetFilesystemActive(true);
ASSERT_EQ(listener->WaitForRecovery(5000000), true);
ASSERT_EQ("val1", Get(Key(1)));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_IO_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
HistogramData autoresume_retry;
options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
&autoresume_retry);
ASSERT_EQ(autoresume_retry.max, 1);
ASSERT_OK(Put(Key(2), "val2", wo));
s = Flush();
// Since auto resume is successful, the bg error is cleaned, flush will

@@ -374,6 +374,15 @@ enum Tickers : uint32_t {
// # of files deleted immediately by sst file manger through delete scheduler.
FILES_DELETED_IMMEDIATELY,
// The counters for the error handler. Note that bg_io_error is a subset of
// bg_error, and bg_retryable_io_error is a subset of bg_io_error.
ERROR_HANDLER_BG_ERROR_COUNT,
ERROR_HANDLER_BG_IO_ERROR_COUNT,
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
ERROR_HANDLER_AUTORESUME_COUNT,
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
TICKER_ENUM_MAX
};
@@ -472,6 +481,9 @@ enum Histograms : uint32_t {
// Num of sst files read from file system per level.
NUM_SST_READ_PER_LEVEL,
// Error handler statistics
ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
HISTOGRAM_ENUM_MAX,
};

@@ -4982,7 +4982,20 @@ class TickerTypeJni {
return -0x14;
case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL:
return -0x15;
case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT:
return -0x16;
case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT:
return -0x17;
case ROCKSDB_NAMESPACE::Tickers::
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT:
return -0x18;
case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT:
return -0x19;
case ROCKSDB_NAMESPACE::Tickers::
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT:
return -0x1A;
case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT:
return -0x1B;
case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
// 0x5F for backwards compatibility on current minor version.
return 0x5F;
@@ -5294,6 +5307,21 @@ class TickerTypeJni {
return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC;
case -0x15:
return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL;
case -0x16:
return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT;
case -0x17:
return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT;
case -0x18:
return ROCKSDB_NAMESPACE::Tickers::
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT;
case -0x19:
return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT;
case -0x1A:
return ROCKSDB_NAMESPACE::Tickers::
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT;
case -0x1B:
return ROCKSDB_NAMESPACE::Tickers::
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT;
case 0x5F:
// 0x5F for backwards compatibility on current minor version.
return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX;
@@ -5413,6 +5441,8 @@ class HistogramTypeJni {
return 0x30;
case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL:
return 0x31;
case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT:
return 0x32;
case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
// 0x1F for backwards compatibility on current minor version.
return 0x1F;
@@ -5527,6 +5557,9 @@ class HistogramTypeJni {
return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL;
case 0x31:
return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL;
case 0x32:
return ROCKSDB_NAMESPACE::Histograms::
ERROR_HANDLER_AUTORESUME_RETRY_COUNT;
case 0x1F:
// 0x1F for backwards compatibility on current minor version.
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;

@@ -175,6 +175,11 @@ public enum HistogramType {
*/
NUM_SST_READ_PER_LEVEL((byte) 0x31),
/**
* The number of retries in auto resume
*/
ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32),
// 0x1F for backwards compatibility on current minor version.
HISTOGRAM_ENUM_MAX((byte) 0x1F);

@@ -742,6 +742,16 @@ public enum TickerType {
COMPACT_WRITE_BYTES_PERIODIC((byte) -0x14),
COMPACT_WRITE_BYTES_TTL((byte) -0x15),
/**
* DB error handler statistics
*/
ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16),
ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17),
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18),
ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19),
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A),
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B),
TICKER_ENUM_MAX((byte) 0x5F);
private final byte value;

@@ -191,6 +191,16 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
"rocksdb.block.cache.compression.dict.add.redundant"},
{FILES_MARKED_TRASH, "rocksdb.files.marked.trash"},
{FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"},
{ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"},
{ERROR_HANDLER_BG_IO_ERROR_COUNT,
"rocksdb.error.handler.bg.io.errro.count"},
{ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
"rocksdb.error.handler.bg.retryable.io.errro.count"},
{ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"},
{ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
"rocksdb.error.handler.autoresume.retry.total.count"},
{ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
"rocksdb.error.handler.autoresume.success.count"},
};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -246,6 +256,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
"rocksdb.num.index.and.filter.blocks.read.per.level"},
{NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"},
{NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"},
{ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
"rocksdb.error.handler.autoresume.retry.count"},
};
std::shared_ptr<Statistics> CreateDBStatistics() {
