From f79b3d19a77f649536da6321f40e1c0d269bde48 Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Mon, 12 Sep 2022 13:55:23 -0700 Subject: [PATCH] Inject spurious wakeup and sleep before acquiring db mutex to expose race condition (#10291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: **Context/Summary:** Previous experience with bugs and flaky tests taught us there exist features in RocksDB vulnerable to race condition caused by acquiring db mutex at a particular timing. This PR aggressively exposes those vulnerable features by injecting spurious wakeup and sleep to cause acquiring db mutex at various timing in order to expose such race condition **Testing:** - `COERCE_CONTEXT_SWITCH=1 make -j56 check / make -j56 db_stress` should reveal - flaky tests caused by db mutex related race condition - Reverted https://github.com/facebook/rocksdb/pull/9528 - A/B testing on `COMPILE_WITH_TSAN=1 make -j56 listener_test` w/ and w/o `COERCE_CONTEXT_SWITCH=1` followed by `./listener_test --gtest_filter=EventListenerTest.MultiCF --gtest_repeat=10` - `COERCE_CONTEXT_SWITCH=1` can cause expected test failure (i.e, expose target TSAN data race error) within 10 run while the other couldn't. - This proves our injection can expose flaky tests caused by db mutex related race condition faster. - known or new race-condition-type of internal bug by continuously running this PR - Performance - High ops-threads time: COERCE_CONTEXT_SWITCH=1 regressed by 4 times slower (2:01.16 vs 0:22.10 elapsed ). This PR will be run as a separate CI job so this regression won't affect any existing job. ``` TEST_TMPDIR=$db /usr/bin/time ./db_stress \ --ops_per_thread=100000 --expected_values_dir=$exp --clear_column_family_one_in=0 \ --write_buffer_size=524288 —target_file_size_base=524288 —ingest_external_file_one_in=100 —compact_files_one_in=1000 —compact_range_one_in=1000 ``` - Start-up time: COERCE_CONTEXT_SWITCH=1 didn't regress by 25% (0:01.51 vs 0:01.29 elapsed) ``` TEST_TMPDIR=$db ./db_stress -ops_per_thread=100000000 -expected_values_dir=$exp --clear_column_family_one_in=0 & sleep 120; pkill -9 db_stress TEST_TMPDIR=$db /usr/bin/time ./db_stress \ --ops_per_thread=1 -reopen=0 --expected_values_dir=$exp --clear_column_family_one_in=0 --destroy_db_initially=0 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/10291 Reviewed By: ajkr Differential Revision: D39231182 Pulled By: hx235 fbshipit-source-id: 7ab6695430460e0826727fd8c66679b32b3e44b6 --- Makefile | 10 ++++++++++ db/db_impl/db_impl.cc | 5 +++++ monitoring/instrumented_mutex.cc | 17 +++++++++++++++++ monitoring/instrumented_mutex.h | 17 ++++++++++++++--- 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 102224e1f..0b507b591 100644 --- a/Makefile +++ b/Makefile @@ -190,6 +190,16 @@ ifeq ($(USE_LTO), 1) LDFLAGS += -flto -fuse-linker-plugin endif +# `COERCE_CONTEXT_SWITCH=1` will inject spurious wakeup and +# random length of sleep or context switch at critical +# points (e.g, before acquring db mutex) in RocksDB. +# In this way, it coerces as many excution orders as possible in the hope of +# exposing the problematic excution order +COERCE_CONTEXT_SWITCH ?= 0 +ifeq ($(COERCE_CONTEXT_SWITCH), 1) +OPT += -DCOERCE_CONTEXT_SWITCH +endif + #----------------------------------------------- include src.mk diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 6ec257836..3da370da7 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -165,8 +165,13 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, fs_(immutable_db_options_.fs, io_tracer_), mutable_db_options_(initial_db_options_), stats_(immutable_db_options_.stats), +#ifdef COERCE_CONTEXT_SWITCH + mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, &bg_cv_, + immutable_db_options_.use_adaptive_mutex), +#else // COERCE_CONTEXT_SWITCH mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), +#endif // COERCE_CONTEXT_SWITCH default_cf_handle_(nullptr), error_handler_(this, immutable_db_options_, &mutex_), event_logger_(immutable_db_options_.info_log.get()), diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index adca63f26..699495a34 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -34,6 +34,23 @@ void InstrumentedMutex::Lock() { void InstrumentedMutex::LockInternal() { #ifndef NDEBUG ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif +#ifdef COERCE_CONTEXT_SWITCH + if (stats_code_ == DB_MUTEX_WAIT_MICROS) { + thread_local Random rnd(301); + if (rnd.OneIn(2)) { + if (bg_cv_) { + bg_cv_->SignalAll(); + } + sched_yield(); + } else { + uint32_t sleep_us = rnd.Uniform(11) * 1000; + if (bg_cv_) { + bg_cv_->SignalAll(); + } + SystemClock::Default()->SleepForMicroseconds(sleep_us); + } + } #endif mutex_.Lock(); } diff --git a/monitoring/instrumented_mutex.h b/monitoring/instrumented_mutex.h index ea29bb452..f2b0564bb 100644 --- a/monitoring/instrumented_mutex.h +++ b/monitoring/instrumented_mutex.h @@ -32,11 +32,19 @@ class InstrumentedMutex { clock_(clock), stats_code_(stats_code) {} +#ifdef COERCE_CONTEXT_SWITCH + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + InstrumentedCondVar* bg_cv, bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), + stats_code_(stats_code), + bg_cv_(bg_cv) {} +#endif + void Lock(); - void Unlock() { - mutex_.Unlock(); - } + void Unlock() { mutex_.Unlock(); } void AssertHeld() { mutex_.AssertHeld(); @@ -49,6 +57,9 @@ class InstrumentedMutex { Statistics* stats_; SystemClock* clock_; int stats_code_; +#ifdef COERCE_CONTEXT_SWITCH + InstrumentedCondVar* bg_cv_ = nullptr; +#endif }; class ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedInstrumentedMutex