From 62d84e2a2b4173e4010b76873b333d9a87e1ce2e Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 6 May 2022 11:17:08 -0700 Subject: [PATCH] db_stress fault injection in release mode (#9957) Summary: Previously all fault injection was ignored in release mode. This PR adds it back except for read fault injection (`--read_fault_one_in > 0`) since its dependency (`IGNORE_STATUS_IF_ERROR`) is unavailable in release mode. Other notable changes include: - Moved `EnableWriteErrorInjection()` for `--write_fault_one_in > 0` so it's after `DB::Open()` without depending on `SyncPoint` - Made `--read_fault_one_in > 0` return an error in release mode - Updated `db_crashtest.py` to always set `--read_fault_one_in=0` in release mode Pull Request resolved: https://github.com/facebook/rocksdb/pull/9957 Test Plan: ``` $ DEBUG_LEVEL=0 make -j24 db_stress $ DEBUG_LEVEL=0 TEST_TMPDIR=/dev/shm python3 tools/db_crashtest.py blackbox ``` Reviewed By: anand1976 Differential Revision: D36193830 Pulled By: ajkr fbshipit-source-id: 0b97946b4e3f06e3e0f6e7833c2763da08ec5321 --- db_stress_tool/db_stress_common.cc | 2 -- db_stress_tool/db_stress_common.h | 6 +----- db_stress_tool/db_stress_driver.cc | 5 +++-- db_stress_tool/db_stress_shared_state.h | 12 ++++++++++-- db_stress_tool/db_stress_test_base.cc | 8 ++------ db_stress_tool/db_stress_tool.cc | 12 ------------ db_stress_tool/multi_ops_txns_stress.cc | 2 -- db_stress_tool/no_batched_ops_stress.cc | 16 ---------------- tools/db_crashtest.py | 2 ++ 9 files changed, 18 insertions(+), 47 deletions(-) diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index 0d5e07632..2a5cbd763 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -18,11 +18,9 @@ ROCKSDB_NAMESPACE::Env* db_stress_listener_env = nullptr; ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; -#ifndef NDEBUG // If non-null, injects read error at a rate specified by the // read_fault_one_in or write_fault_one_in flag std::shared_ptr fault_fs_guard; -#endif // NDEBUG enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e = diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 8eaf38d75..8c23ac373 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -67,6 +67,7 @@ #include "util/random.h" #include "util/string_util.h" #include "utilities/blob_db/blob_db.h" +#include "utilities/fault_injection_fs.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -296,12 +297,7 @@ constexpr int kValueMaxLen = 100; // wrapped posix environment extern ROCKSDB_NAMESPACE::Env* db_stress_env; extern ROCKSDB_NAMESPACE::Env* db_stress_listener_env; -#ifndef NDEBUG -namespace ROCKSDB_NAMESPACE { -class FaultInjectionTestFS; -} // namespace ROCKSDB_NAMESPACE extern std::shared_ptr fault_fs_guard; -#endif extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e; extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e; diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index a56454abe..438a840ad 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -62,11 +62,12 @@ bool RunStressTest(StressTest* stress) { stress->InitDb(&shared); stress->FinishInitDb(&shared); -#ifndef NDEBUG if (FLAGS_sync_fault_injection) { fault_fs_guard->SetFilesystemDirectWritable(false); } -#endif + if (FLAGS_write_fault_one_in) { + fault_fs_guard->EnableWriteErrorInjection(); + } uint32_t n = FLAGS_threads; uint64_t now = clock->NowMicros(); diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index b05144548..407c15569 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -133,13 +133,21 @@ class SharedState { for (int i = 0; i < FLAGS_column_families; ++i) { key_locks_[i].reset(new port::Mutex[num_locks]); } -#ifndef NDEBUG if (FLAGS_read_fault_one_in) { +#ifdef NDEBUG + // Unsupported in release mode because it relies on + // `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to + // failure. + fprintf(stderr, + "Cannot set nonzero value for --read_fault_one_in in " + "release mode."); + exit(1); +#else // NDEBUG SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError", IgnoreReadErrorCallback); SyncPoint::GetInstance()->EnableProcessing(); +#endif // NDEBUG } -#endif // NDEBUG } ~SharedState() { diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 40260f040..edc869002 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -674,6 +674,7 @@ void StressTest::OperateDb(ThreadState* thread) { fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), FLAGS_read_fault_one_in); } +#endif // NDEBUG if (FLAGS_write_fault_one_in) { IOStatus error_msg; if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) { @@ -691,7 +692,6 @@ void StressTest::OperateDb(ThreadState* thread) { thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, /*inject_for_all_file_types=*/false, types); } -#endif // NDEBUG thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { if (thread->shared->HasVerificationFailedYet() || @@ -2621,7 +2621,6 @@ void StressTest::Open(SharedState* shared) { RegisterAdditionalListeners(); options_.create_missing_column_families = true; if (!FLAGS_use_txn) { -#ifndef NDEBUG // Determine whether we need to ingest file metadata write failures // during DB reopen. If it does, enable it. // Only ingest metadata error if it is reopening, as initial open @@ -2663,7 +2662,6 @@ void StressTest::Open(SharedState* shared) { } } while (true) { -#endif // NDEBUG #ifndef ROCKSDB_LITE // StackableDB-based BlobDB if (FLAGS_use_blob_db) { @@ -2693,7 +2691,6 @@ void StressTest::Open(SharedState* shared) { } } -#ifndef NDEBUG if (ingest_meta_error || ingest_write_error || ingest_read_error) { fault_fs_guard->SetFilesystemDirectWritable(true); fault_fs_guard->DisableMetadataWriteErrorInjection(); @@ -2705,7 +2702,7 @@ void StressTest::Open(SharedState* shared) { // wait for all compactions to finish to make sure DB is in // clean state before executing queries. s = static_cast_with_check(db_->GetRootDB()) - ->TEST_WaitForCompact(true); + ->WaitForCompact(true /* wait_unscheduled */); if (!s.ok()) { for (auto cf : column_families_) { delete cf; @@ -2738,7 +2735,6 @@ void StressTest::Open(SharedState* shared) { } break; } -#endif // NDEBUG } else { #ifndef ROCKSDB_LITE TransactionDBOptions txn_db_options; diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 2a7430bfa..45a8c99f1 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -24,9 +24,7 @@ #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_driver.h" #include "rocksdb/convenience.h" -#ifndef NDEBUG #include "utilities/fault_injection_fs.h" -#endif namespace ROCKSDB_NAMESPACE { namespace { @@ -82,7 +80,6 @@ int db_stress_tool(int argc, char** argv) { dbsl_env_wrapper_guard = std::make_shared(raw_env); db_stress_listener_env = dbsl_env_wrapper_guard.get(); -#ifndef NDEBUG if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in || FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) { @@ -98,18 +95,10 @@ int db_stress_tool(int argc, char** argv) { std::make_shared(raw_env, fault_fs_guard); raw_env = fault_env_guard.get(); } - if (FLAGS_write_fault_one_in) { - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs_guard->EnableWriteErrorInjection(); }); - SyncPoint::GetInstance()->EnableProcessing(); - } -#endif env_wrapper_guard = std::make_shared(raw_env); db_stress_env = env_wrapper_guard.get(); -#ifndef NDEBUG if (FLAGS_write_fault_one_in) { // In the write injection case, we need to use the FS interface and returns // the IOStatus with different error and flags. Therefore, @@ -118,7 +107,6 @@ int db_stress_tool(int argc, char** argv) { // CompositeEnvWrapper of env and fault_fs. db_stress_env = raw_env; } -#endif FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index e29a30980..07a1783f2 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -12,9 +12,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "util/defer.h" -#ifndef NDEBUG #include "utilities/fault_injection_fs.h" -#endif // NDEBUG #include "utilities/transactions/write_prepared_txn_db.h" namespace ROCKSDB_NAMESPACE { diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index f764bb2b9..0a5001a4b 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -9,9 +9,7 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" -#ifndef NDEBUG #include "utilities/fault_injection_fs.h" -#endif // NDEBUG #include "rocksdb/utilities/transaction_db.h" namespace ROCKSDB_NAMESPACE { @@ -234,20 +232,15 @@ class NonBatchedOpsStressTest : public StressTest { std::string from_db; int error_count = 0; -#ifndef NDEBUG if (fault_fs_guard) { fault_fs_guard->EnableErrorInjection(); SharedState::ignore_read_error = false; } -#endif // NDEBUG Status s = db_->Get(read_opts, cfh, key, &from_db); -#ifndef NDEBUG if (fault_fs_guard) { error_count = fault_fs_guard->GetAndResetErrorCount(); } -#endif // NDEBUG if (s.ok()) { -#ifndef NDEBUG if (fault_fs_guard) { if (error_count && !SharedState::ignore_read_error) { // Grab mutex so multiple thread don't try to print the @@ -259,7 +252,6 @@ class NonBatchedOpsStressTest : public StressTest { std::terminate(); } } -#endif // NDEBUG // found case thread->stats.AddGets(1, 1); } else if (s.IsNotFound()) { @@ -273,11 +265,9 @@ class NonBatchedOpsStressTest : public StressTest { thread->stats.AddVerifiedErrors(1); } } -#ifndef NDEBUG if (fault_fs_guard) { fault_fs_guard->DisableErrorInjection(); } -#endif // NDEBUG return s; } @@ -365,19 +355,15 @@ class NonBatchedOpsStressTest : public StressTest { } if (!use_txn) { -#ifndef NDEBUG if (fault_fs_guard) { fault_fs_guard->EnableErrorInjection(); SharedState::ignore_read_error = false; } -#endif // NDEBUG db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); -#ifndef NDEBUG if (fault_fs_guard) { error_count = fault_fs_guard->GetAndResetErrorCount(); } -#endif // NDEBUG } else { #ifndef ROCKSDB_LITE txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), @@ -385,7 +371,6 @@ class NonBatchedOpsStressTest : public StressTest { #endif } -#ifndef NDEBUG if (fault_fs_guard && error_count && !SharedState::ignore_read_error) { int stat_nok = 0; for (const auto& s : statuses) { @@ -409,7 +394,6 @@ class NonBatchedOpsStressTest : public StressTest { if (fault_fs_guard) { fault_fs_guard->DisableErrorInjection(); } -#endif // NDEBUG for (size_t i = 0; i < statuses.size(); ++i) { Status s = statuses[i]; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index a7c050f37..7fa5a4d20 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -415,6 +415,8 @@ multiops_wp_txn_params = { def finalize_and_sanitize(src_params): dest_params = dict([(k, v() if callable(v) else v) for (k, v) in src_params.items()]) + if is_release_mode(): + dest_params['read_fault_one_in'] = 0 if dest_params.get("compression_max_dict_bytes") == 0: dest_params["compression_zstd_max_train_bytes"] = 0 dest_params["compression_max_dict_buffer_bytes"] = 0