From 782fcc44e16e8263be7b0c0cfaeb7ade9dca3bcb Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Mon, 20 Dec 2021 23:15:55 -0800 Subject: [PATCH] Fix race condition in `error_handler_fs_test` (#9325) Summary: We saw the below assertion failure in `error_handler_fs_test`: ``` db/error_handler_fs_test.cc:2471: Failure Expected equality of these values: listener->new_bg_error() Which is: 16-byte object <00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00> Status::Aborted() Which is: 16-byte object <0A-00 00-00 60-61 00-00 00-00 00-00 00-00 00-00> terminate called after throwing an instance of 'testing::internal::GoogleTestFailureException' what(): db/error_handler_fs_test.cc:2471: Failure Expected equality of these values: listener->new_bg_error() Which is: 16-byte object <00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00> Status::Aborted() Which is: 16-byte object <0A-00 00-00 60-61 00-00 00-00 00-00 00-00 00-00> Received signal 6 (Aborted) ``` The problem was completing `OnErrorRecoveryCompleted()` would wake up the main thread and allow it to proceed to that assertion. But that assertion assumes `OnErrorRecoveryEnd()` has completed since only `OnErrorRecoveryEnd()` affects `new_bg_error()`. The fix is just to make `OnErrorRecoveryCompleted()` not wake up the main thread, by means of not implementing it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9325 Test Plan: - ran `while TEST_TMPDIR=/dev/shm ./error_handler_fs_test ; do : ; done` for a while - injected sleep between `OnErrorRecovery{Completed,End}()` callbacks, which guaranteed repro before this PR Reviewed By: anand1976 Differential Revision: D33249200 Pulled By: ajkr fbshipit-source-id: 1659ee183cd09f90d4dbd898f65103473fcf84a8 --- db/error_handler_fs_test.cc | 7 ------- include/rocksdb/listener.h | 4 +++- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 3283ffcf0..c01273819 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -90,13 +90,6 @@ class ErrorHandlerFSListener : public EventListener { } } - void OnErrorRecoveryCompleted(Status old_bg_error) override { - InstrumentedMutexLock l(&mutex_); - recovery_complete_ = true; - cv_.SignalAll(); - old_bg_error.PermitUncheckedError(); - } - void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override { InstrumentedMutexLock l(&mutex_); recovery_complete_ = true; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 1ffbc73a4..cf54e1248 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -706,7 +706,9 @@ class EventListener : public Customizable { // is recovered from read-only mode after an error. When this is called, it // means normal writes to the database can be issued and the user can // initiate any further recovery actions needed - virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} + virtual void OnErrorRecoveryCompleted(Status old_bg_error) { + old_bg_error.PermitUncheckedError(); + } // A callback function for RocksDB which will be called once the recovery // attempt from a background retryable error is completed. The recovery