From 217ce20021ec1f5176887e58c2cb85b64806d002 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Wed, 18 Mar 2020 17:11:06 -0700 Subject: [PATCH] Remove GetSortedWalFiles/GetCurrentWalFile from the crash test (#6491) Summary: Currently, `db_stress` tests a randomly picked one of `GetLiveFiles`, `GetSortedWalFiles`, and `GetCurrentWalFile` with a 1/N chance when the command line parameter `get_live_files_and_wal_files_one_in` is specified. The problem is that `GetSortedWalFiles` and `GetCurrentWalFile` are unreliable in the sense that they can return errors if another thread removes a WAL file while they are executing (which is a perfectly plausible and legitimate scenario). The patch splits this command line parameter into three (one for each API), and changes the crash test script so that only `GetLiveFiles` is tested during our continuous crash test runs. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6491 Test Plan: ``` make check python tools/db_crashtest.py whitebox ``` Reviewed By: siying Differential Revision: D20312200 Pulled By: ltamasi fbshipit-source-id: e7c3481eddfe3bd3d5349476e34abc9eee5b7dc8 --- db_stress_tool/db_stress_common.h | 4 +- db_stress_tool/db_stress_gflags.cc | 19 ++++++-- db_stress_tool/db_stress_test_base.cc | 62 ++++++++++++++++----------- db_stress_tool/db_stress_test_base.h | 5 ++- tools/db_crashtest.py | 6 ++- 5 files changed, 63 insertions(+), 33 deletions(-) diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index d3d400ff2..135a613eb 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -128,7 +128,9 @@ DECLARE_int32(universal_min_merge_width); DECLARE_int32(universal_max_merge_width); DECLARE_int32(universal_max_size_amplification_percent); DECLARE_int32(clear_column_family_one_in); -DECLARE_int32(get_live_files_and_wal_files_one_in); +DECLARE_int32(get_live_files_one_in); +DECLARE_int32(get_sorted_wal_files_one_in); +DECLARE_int32(get_current_wal_file_one_in); DECLARE_int32(set_options_one_in); DECLARE_int32(set_in_place_one_in); DECLARE_int64(cache_size); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 055824fcf..48b50a9a1 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -256,10 +256,21 @@ DEFINE_int32(clear_column_family_one_in, 1000000, "it again. If N == 0, never drop/create column families. " "When test_batches_snapshots is true, this flag has no effect"); -DEFINE_int32(get_live_files_and_wal_files_one_in, 1000000, - "With a chance of 1/N, call GetLiveFiles, GetSortedWalFiles " - "and GetCurrentWalFile to verify if it returns correctly. If " - "N == 0, never call the three interfaces."); +DEFINE_int32(get_live_files_one_in, 1000000, + "With a chance of 1/N, call GetLiveFiles to verify if it returns " + "correctly. If N == 0, do not call the interface."); + +DEFINE_int32( + get_sorted_wal_files_one_in, 1000000, + "With a chance of 1/N, call GetSortedWalFiles to verify if it returns " + "correctly. (Note that this API may legitimately return an error.) If N == " + "0, do not call the interface."); + +DEFINE_int32( + get_current_wal_file_one_in, 1000000, + "With a chance of 1/N, call GetCurrentWalFile to verify if it returns " + "correctly. (Note that this API may legitimately return an error.) If N == " + "0, do not call the interface."); DEFINE_int32(set_options_one_in, 0, "With a chance of 1/N, change some random options"); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 8e736eb3b..9f5de4dcd 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -593,13 +593,28 @@ void StressTest::OperateDb(ThreadState* thread) { } #ifndef ROCKSDB_LITE - // Every 1 in N verify the one of the following: 1) GetLiveFiles - // 2) GetSortedWalFiles 3) GetCurrentWalFile. Each time, randomly select - // one of them to run the test. - if (thread->rand.OneInOpt(FLAGS_get_live_files_and_wal_files_one_in)) { - Status status = VerifyGetLiveAndWalFiles(thread); + // Verify GetLiveFiles with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in)) { + Status status = VerifyGetLiveFiles(); if (!status.ok()) { - VerificationAbort(shared, "VerifyGetLiveAndWalFiles status not OK", + VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); + } + } + + // Verify GetSortedWalFiles with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_sorted_wal_files_one_in)) { + Status status = VerifyGetSortedWalFiles(); + if (!status.ok()) { + VerificationAbort(shared, "VerifyGetSortedWalFiles status not OK", + status); + } + } + + // Verify GetCurrentWalFile with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_current_wal_file_one_in)) { + Status status = VerifyGetCurrentWalFile(); + if (!status.ok()) { + VerificationAbort(shared, "VerifyGetCurrentWalFile status not OK", status); } } @@ -978,28 +993,23 @@ Status StressTest::TestIterate(ThreadState* thread, } #ifndef ROCKSDB_LITE -// Test the return status of GetLiveFiles, GetSortedWalFiles, and -// GetCurrentWalFile. Each time, randomly select one of them to run -// and return the status. -Status StressTest::VerifyGetLiveAndWalFiles(ThreadState* thread) { - int case_num = thread->rand.Uniform(3); - if (case_num == 0) { - std::vector live_file; - uint64_t manifest_size; - return db_->GetLiveFiles(live_file, &manifest_size); - } +// Test the return status of GetLiveFiles. +Status StressTest::VerifyGetLiveFiles() const { + std::vector live_file; + uint64_t manifest_size = 0; + return db_->GetLiveFiles(live_file, &manifest_size); +} - if (case_num == 1) { - VectorLogPtr log_ptr; - return db_->GetSortedWalFiles(log_ptr); - } +// Test the return status of GetSortedWalFiles. +Status StressTest::VerifyGetSortedWalFiles() const { + VectorLogPtr log_ptr; + return db_->GetSortedWalFiles(log_ptr); +} - if (case_num == 2) { - std::unique_ptr cur_wal_file; - return db_->GetCurrentWalFile(&cur_wal_file); - } - assert(false); - return Status::Corruption("Undefined case happens!"); +// Test the return status of GetCurrentWalFile. +Status StressTest::VerifyGetCurrentWalFile() const { + std::unique_ptr cur_wal_file; + return db_->GetCurrentWalFile(&cur_wal_file); } #endif // !ROCKSDB_LITE diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index 5d88ceb42..ab80fc578 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -184,7 +184,10 @@ class StressTest { Status MaybeReleaseSnapshots(ThreadState* thread, uint64_t i); #ifndef ROCKSDB_LITE - Status VerifyGetLiveAndWalFiles(ThreadState* thread); + Status VerifyGetLiveFiles() const; + Status VerifyGetSortedWalFiles() const; + Status VerifyGetCurrentWalFile() const; + virtual Status TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index bf690b1ec..eb17ddd96 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -51,7 +51,11 @@ default_params = { "enable_pipelined_write": lambda: random.randint(0, 1), "expected_values_path": expected_values_file.name, "flush_one_in": 1000000, - "get_live_files_and_wal_files_one_in": 1000000, + "get_live_files_one_in": 1000000, + # Note: the following two are intentionally disabled as the corresponding + # APIs are not guaranteed to succeed. + "get_sorted_wal_files_one_in": 0, + "get_current_wal_file_one_in": 0, # Temporarily disable hash index "index_type": lambda: random.choice([0,2]), "max_background_compactions": 20,