Document DB::Resume(), fix LockWALInEffect test (#11290)

Summary:
In rare cases, we were seeing failures like this:

```
[ RUN      ] DBWriteTestInstance/DBWriteTest.LockWALInEffect/2
db/db_write_test.cc:653: Failure
Put("key3", "value")
Corruption: Not active
```

in a test with no explicit threading. This is likely because of the unpredictability of background auto-resume. I wasn't really familiar with this feature, in part because DB::Resume() was undocumented. So I believe I have fixed the test (by disabling auto-resume with max_bgerror_resume_count = 0 and calling DB::Resume() manually) and documented the API function.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11290

Test Plan: 1000s of stress runs of the test with gtest-parallel

Reviewed By: anand1976

Differential Revision: D43984583

Pulled By: pdillinger

fbshipit-source-id: d30dec120b4864e193751b2e33ff16834d313db3
oxigraph-8.1.1
Peter Dillinger 2 years ago committed by Facebook GitHub Bot
parent 9aa3b6f9ae
commit 648e972f30
  1. 2
      db/db_write_test.cc
  2. 12
      include/rocksdb/db.h
  3. 4
      include/rocksdb/options.h

@ -620,6 +620,7 @@ TEST_P(DBWriteTest, LockWALInEffect) {
options.env = fault_fs_env.get();
options.disable_auto_compactions = true;
options.paranoid_checks = false;
options.max_bgerror_resume_count = 0; // manual Resume()
Reopen(options);
// try the 1st WAL created during open
ASSERT_OK(Put("key0", "value"));
@ -649,6 +650,7 @@ TEST_P(DBWriteTest, LockWALInEffect) {
ASSERT_OK(db_->UnlockWAL());
}
fault_fs->SetFilesystemActive(true);
ASSERT_OK(db_->Resume());
// Writes should work again
ASSERT_OK(Put("key3", "value"));
ASSERT_EQ(Get("key3"), "value");

@ -301,6 +301,18 @@ class DB {
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
std::string trim_ts);
// Manually, synchronously attempt to resume DB writes after a write failure
// to the underlying filesystem. See
// https://github.com/facebook/rocksdb/wiki/Background-Error-Handling
//
// Returns OK if writes are successfully resumed, or if there was no
// outstanding error to recover from. Returns the underlying write error if
// it is not recoverable.
//
// WART: Does not mix well with auto-resume. Will return Busy if an
// auto-resume is in progress, without waiting for it to complete.
// See DBOptions::max_bgerror_resume_count and
// EventListener::OnErrorRecoveryBegin
//
// This default implementation returns NotSupported; presumably concrete DB
// implementations override it (not visible here).
virtual Status Resume() { return Status::NotSupported(); }
// Close the DB by releasing resources, closing files etc. This should be

@ -1311,12 +1311,12 @@ struct DBOptions {
// Default: false
bool best_efforts_recovery = false;
// It defines how many times db resume is called by a separate thread when
// It defines how many times DB::Resume() is called by a separate thread when
// background retryable IO Error happens. When background retryable IO
// Error happens, SetBGError is called to deal with the error. If the error
// can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
// then db resume is called in background to recover from the error. If this
// value is 0 or negative, db resume will not be called.
// value is 0 or negative, DB::Resume() will not be called automatically.
//
// Default: INT_MAX
int max_bgerror_resume_count = INT_MAX;

Loading…
Cancel
Save