From 9f690ec62c16592dc6bc80ddf9d227632db51441 Mon Sep 17 00:00:00 2001 From: Tomislav Novak Date: Sat, 21 Dec 2013 15:10:39 -0800 Subject: [PATCH] Fix a deadlock in CompactRange() Summary: The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions can cause it to deadlock when CompactRange() is called concurrently from multiple threads. Imagine a following scenario with only two threads (max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0): 1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction (bg_compaction_scheduled_ is now 10) and waits for it to complete. 2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_ (now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to drop to LargeNumber. 3. BG thread completes the first manual compaction, decrements bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_. Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again (now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue, thread #2 also goes to sleep (waiting on bg_cv_), without resetting bg_compaction_scheduled_. This diff attempts to address the problem by introducing a new counter bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only schedule manual compactions). Test Plan: I could pretty much consistently reproduce the deadlock with a program that calls CompactRange(nullptr, nullptr) immediately after Write() from multiple threads. This no longer happens with this patch. Tests (make check) pass. Reviewers: dhruba, igor, sdong, haobo Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14799 --- db/db_impl.cc | 71 +++++++++++++++++++++++++++------------------------ db/db_impl.h | 7 ++++- 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 48dc367da..b8ae72fd9 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -244,6 +244,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) super_version_(nullptr), tmp_batch_(), bg_compaction_scheduled_(0), + bg_manual_only_(0), bg_flush_scheduled_(0), bg_logstats_scheduled_(false), manual_compaction_(nullptr), @@ -1600,45 +1601,44 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { MutexLock l(&mutex_); - // When a manual compaction arrives, temporarily throttle down - // the number of background compaction threads to 1. This is - // needed to ensure that this manual compaction can compact - // any range of keys/files. We artificialy increase - // bg_compaction_scheduled_ by a large number, this causes - // the system to have a single background thread. Now, - // this manual compaction can progress without stomping - // on any other concurrent compactions. - const int LargeNumber = 10000000; - const int newvalue = options_.max_background_compactions-1; - bg_compaction_scheduled_ += LargeNumber; - while (bg_compaction_scheduled_ > LargeNumber) { - Log(options_.info_log, "Manual compaction request waiting for background threads to fall below 1"); + // When a manual compaction arrives, temporarily disable scheduling of + // non-manual compactions and wait until the number of scheduled compaction + // jobs drops to zero. This is needed to ensure that this manual compaction + // can compact any range of keys/files. + // + // bg_manual_only_ is non-zero when at least one thread is inside + // TEST_CompactRange(), i.e. during that time no other compaction will + // get scheduled (see MaybeScheduleFlushOrCompaction). + // + // Note that the following loop doesn't stop more that one thread calling + // TEST_CompactRange() from getting to the second while loop below. + // However, only one of them will actually schedule compaction, while + // others will wait on a condition variable until it completes. + + ++bg_manual_only_; + while (bg_compaction_scheduled_ > 0) { + Log(options_.info_log, + "Manual compaction waiting for all other scheduled background " + "compactions to finish"); bg_cv_.Wait(); } + Log(options_.info_log, "Manual compaction starting"); - while (!manual.done) { - while (manual_compaction_ != nullptr) { - bg_cv_.Wait(); - } - manual_compaction_ = &manual; - if (bg_compaction_scheduled_ == LargeNumber) { - bg_compaction_scheduled_ = newvalue; - } - MaybeScheduleFlushOrCompaction(); - while (manual_compaction_ == &manual) { + while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + assert(bg_manual_only_ > 0); + if (manual_compaction_ != nullptr) { + // Running either this or some other manual compaction bg_cv_.Wait(); + } else { + manual_compaction_ = &manual; + MaybeScheduleFlushOrCompaction(); } } - assert(!manual.in_progress); - // wait till there are no background threads scheduled - bg_compaction_scheduled_ += LargeNumber; - while (bg_compaction_scheduled_ > LargeNumber + newvalue) { - Log(options_.info_log, "Manual compaction resetting background threads"); - bg_cv_.Wait(); - } - bg_compaction_scheduled_ = 0; + assert(!manual.in_progress); + assert(bg_manual_only_ > 0); + --bg_manual_only_; } Status DBImpl::FlushMemTable(const FlushOptions& options) { @@ -1703,11 +1703,16 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); } + // Schedule BGWorkCompaction if there's a compaction pending (or a memtable + // flush, but the HIGH pool is not enabled). Do it only if + // max_background_compactions hasn't been reached and, in case + // bg_manual_only_ > 0, if it's a manual compaction. if ((manual_compaction_ || versions_->NeedsCompaction() || (is_flush_pending && (options_.max_background_flushes <= 0))) && - bg_compaction_scheduled_ < options_.max_background_compactions) { - // compaction needed, or memtable flush needed but HIGH pool not enabled. + bg_compaction_scheduled_ < options_.max_background_compactions && + (!bg_manual_only_ || manual_compaction_)) { + bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } diff --git a/db/db_impl.h b/db/db_impl.h index 52e8221f5..adeb163f0 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -388,9 +388,14 @@ class DBImpl : public DB { // part of ongoing compactions. std::set pending_outputs_; - // count how many background compaction been scheduled or is running? + // count how many background compactions are running or have been scheduled int bg_compaction_scheduled_; + // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual + // compactions (if manual_compaction_ is not null). This mechanism enables + // manual compactions to wait until all other compactions are finished. + int bg_manual_only_; + // number of background memtable flush jobs, submitted to the HIGH pool int bg_flush_scheduled_;