diff --git a/HISTORY.md b/HISTORY.md index cecde3144..ac49ef440 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ * Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. * Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. * Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. +* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. ### Behavior Changes * Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 35b8f648e..fce28c02c 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -98,7 +98,14 @@ Status DBImpl::GetLiveFiles(std::vector& ret, ret.emplace_back(CurrentFileName("")); ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number())); - ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. + if (versions_->options_file_number() != 0) { + ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + } // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->manifest_file_size(); diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index b6869964c..a74765942 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -258,6 +258,7 @@ DECLARE_bool(best_efforts_recovery); DECLARE_bool(skip_verifydb); DECLARE_bool(enable_compaction_filter); DECLARE_bool(paranoid_file_checks); +DECLARE_bool(fail_if_options_file_error); DECLARE_uint64(batch_protection_bytes_per_key); DECLARE_uint64(user_timestamp_size); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 6325314d9..1c3fbf4fe 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -792,6 +792,10 @@ DEFINE_bool(paranoid_file_checks, true, "After writing every SST file, reopen it and read all the keys " "and validate checksums"); +DEFINE_bool(fail_if_options_file_error, false, + "Fail operations that fail to detect or properly persist options " + "file."); + DEFINE_uint64(batch_protection_bytes_per_key, 0, "If nonzero, enables integrity protection in `WriteBatch` at the " "specified number of bytes per key. Currently the only supported " diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 6f8da9ba4..302ad2e74 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2110,6 +2110,8 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection); fprintf(stdout, "Best efforts recovery : %d\n", static_cast(FLAGS_best_efforts_recovery)); + fprintf(stdout, "Fail if OPTIONS file error: %d\n", + static_cast(FLAGS_fail_if_options_file_error)); fprintf(stdout, "User timestamp size bytes : %d\n", static_cast(FLAGS_user_timestamp_size)); @@ -2328,6 +2330,7 @@ void StressTest::Open() { options_.best_efforts_recovery = FLAGS_best_efforts_recovery; options_.paranoid_file_checks = FLAGS_paranoid_file_checks; + options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error; if ((options_.enable_blob_files || options_.enable_blob_garbage_collection || FLAGS_allow_setting_blob_options_dynamically) && diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 59a8dc2ee..0e4ed041c 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -61,6 +61,7 @@ default_params = { "enable_pipelined_write": lambda: random.randint(0, 1), "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]), "expected_values_path": lambda: setup_expected_values_file(), + "fail_if_options_file_error": lambda: random.randint(0, 1), "flush_one_in": 1000000, "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]), "get_live_files_one_in": 1000000, diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 476fde699..a8eda4e67 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -29,6 +29,7 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { class CheckpointTest : public testing::Test { @@ -793,6 +794,50 @@ TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) { db_ = nullptr; } +TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) { + // Regression test for a bug where checkpoint failed on a DB where persisting + // OPTIONS file failed and the DB was opened with + // `fail_if_options_file_error == false`. + Options options = CurrentOptions(); + options.fail_if_options_file_error = false; + auto fault_fs = std::make_shared(FileSystem::Default()); + + // Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one + // operation when inside the OPTIONS file persisting code. + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + fault_fs->SetRandomMetadataWriteError(1 /* one_in */); + SyncPoint::GetInstance()->SetCallBack( + "PersistRocksDBOptions:start", [fault_fs](void* /* arg */) { + fault_fs->EnableMetadataWriteErrorInjection(); + }); + SyncPoint::GetInstance()->SetCallBack( + "FaultInjectionTestFS::InjectMetadataWriteError:Injected", + [fault_fs](void* /* arg */) { + fault_fs->DisableMetadataWriteErrorInjection(); + }); + options.env = fault_fs_env.get(); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + ASSERT_OK(Put("key1", "val1")); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + + // Make sure it's usable. + options.env = env_; + DB* snapshot_db; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result)); + ASSERT_EQ("val1", get_result); + delete snapshot_db; + delete db_; + db_ = nullptr; +} + TEST_F(CheckpointTest, CheckpointReadOnlyDB) { ASSERT_OK(Put("foo", "foo_value")); ASSERT_OK(Flush()); diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 90c403690..570533aaf 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -22,6 +22,7 @@ #include "env/composite_env_wrapper.h" #include "port/lang.h" #include "port/stack_trace.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/random.h" @@ -708,12 +709,15 @@ IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) { } IOStatus FaultInjectionTestFS::InjectMetadataWriteError() { - MutexLock l(&mutex_); - if (!enable_metadata_write_error_injection_ || - !metadata_write_error_one_in_ || - !write_error_rand_.OneIn(metadata_write_error_one_in_)) { - return IOStatus::OK(); + { + MutexLock l(&mutex_); + if (!enable_metadata_write_error_injection_ || + !metadata_write_error_one_in_ || + !write_error_rand_.OneIn(metadata_write_error_one_in_)) { + return IOStatus::OK(); + } } + TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected"); return IOStatus::IOError(); }