From 4a9185340dffe343fc9275132a4740ec2db6e0a2 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Tue, 24 Jan 2023 12:55:03 -0800 Subject: [PATCH] A better contract for best_efforts_recovery (#11085) Summary: Capture more of the original intent at a high level, without getting bogged down in low-level details. The old text made some weak promises about handling of LOCK files. There should be no specific concern for LOCK files, because we already rely on LockFile() to create the file if it's not present already. And the lock file is generally size 0, so don't have to worry about truncation. Added a unit test. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11085 Test Plan: existing tests, and a new one. Reviewed By: siying Differential Revision: D42713233 Pulled By: pdillinger fbshipit-source-id: 2fce7c974d35fac065037c9c4c7326a59c9fe340 --- db/db_basic_test.cc | 21 ++++++++++++++ include/rocksdb/options.h | 60 +++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 0fa53fead..1fcab52ee 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -679,6 +679,27 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) { } while (ChangeCompactOptions()); } +TEST_F(DBBasicTest, LockFileRecovery) { + Options options = CurrentOptions(); + // Regardless of best_efforts_recovery + for (bool ber : {false, true}) { + options.best_efforts_recovery = ber; + DestroyAndReopen(options); + std::string id1, id2; + ASSERT_OK(db_->GetDbIdentity(id1)); + Close(); + + // Should be OK to re-open DB after lock file deleted + std::string lockfilename = LockFileName(dbname_); + ASSERT_OK(env_->DeleteFile(lockfilename)); + Reopen(options); + + // Should be same DB as before + ASSERT_OK(db_->GetDbIdentity(id2)); + ASSERT_EQ(id1, id2); + } +} + #ifndef ROCKSDB_LITE TEST_F(DBBasicTest, Snapshot) { env_->SetMockSleep(); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0b4cf0893..9f2b5b138 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1288,36 +1288,36 @@ struct DBOptions { // Default: nullptr std::shared_ptr file_checksum_gen_factory = nullptr; - // By default, RocksDB recovery fails if any table/blob file referenced in the - // final version reconstructed from the - // MANIFEST are missing after scanning the MANIFEST pointed to by the - // CURRENT file. It can also fail if verification of unique SST id fails. - // Best-efforts recovery is another recovery mode that does not necessarily - // fail when certain table/blob files are missing/corrupted or have mismatched - // unique id table property. Instead, best-efforts recovery recovers each - // column family to a point in the MANIFEST that corresponds to a version. In - // such a version, all valid table/blob files referenced have the expected - // file size. For table files, their unique id table property match the - // MANIFEST. - // - // Best-efforts recovery does not need a valid CURRENT file, and tries to - // recover the database using one of the available MANIFEST files in the db - // directory. - // Best-efforts recovery tries the available MANIFEST files from high file - // numbers (newer) to low file numbers (older), and stops after finding the - // first MANIFEST file from which the db can be recovered to a state without - // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob - // files. It is possible that the database can be restored to an empty state - // with no table or blob files. - // - // Regardless of this option, the IDENTITY file - // is updated if needed during recovery to match the DB ID in the MANIFEST (if - // previously using write_dbid_to_manifest) or to be in some valid state - // (non-empty DB ID). Currently, not compatible with atomic flush. - // Furthermore, WAL files will not be used for recovery if - // best_efforts_recovery is true. Also requires either 1) LOCK file exists or - // 2) underlying env's LockFile() call returns ok even for non-existing LOCK - // file. + // By default, RocksDB will attempt to detect any data losses or corruptions + // in DB files and return an error to the user, either at DB::Open time or + // later during DB operation. The exception to this policy is the WAL file, + // whose recovery is controlled by the wal_recovery_mode option. + // + // Best-efforts recovery (this option set to true) signals a preference for + // opening the DB to any point-in-time valid state for each column family, + // including the empty/new state, versus the default of returning non-WAL + // data losses to the user as errors. In terms of RocksDB user data, this + // is like applying WALRecoveryMode::kPointInTimeRecovery to each column + // family rather than just the WAL. + // + // Best-efforts recovery (BER) is specifically designed to recover a DB with + // files that are missing or truncated to some smaller size, such as the + // result of an incomplete DB "physical" (FileSystem) copy. BER can also + // detect when an SST file has been replaced with a different one of the + // same size (assuming SST unique IDs are tracked in DB manifest). + // BER is not yet designed to produce a usable DB from other corruptions to + // DB files (which should generally be detectable by DB::VerifyChecksum()), + // and BER does not yet attempt to recover any WAL files. + // + // For example, if an SST or blob file referenced by the MANIFEST is missing, + // BER might be able to find a set of files corresponding to an old "point in + // time" version of the column family, possibly from an older MANIFEST + // file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are + // either ignored or replaced with BER, or quietly fixed regardless of BER + // setting. BER does require at least one valid MANIFEST to recover to a + // non-trivial DB state, unlike `ldb repair`. + // + // Currently, best_efforts_recovery=true is not compatible with atomic flush. // // Default: false bool best_efforts_recovery = false;