Fix a deadlock in CompactRange()

Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):

   1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
      bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
      (bg_compaction_scheduled_ is now 10) and waits for it to complete.
   2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
      (now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
      drop to LargeNumber.
   3. BG thread completes the first manual compaction, decrements
      bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
      Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
      (now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
      thread #2 also goes to sleep (waiting on bg_cv_), without resetting
      bg_compaction_scheduled_.

This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).

Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.

Tests (make check) pass.

Reviewers: dhruba, igor, sdong, haobo

Reviewed By: igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14799
main
Tomislav Novak 11 years ago
parent c370f5597a
commit 9f690ec62c
  1. 71
      db/db_impl.cc
  2. 7
      db/db_impl.h

@ -244,6 +244,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
super_version_(nullptr),
tmp_batch_(),
bg_compaction_scheduled_(0),
bg_manual_only_(0),
bg_flush_scheduled_(0),
bg_logstats_scheduled_(false),
manual_compaction_(nullptr),
@ -1600,45 +1601,44 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
MutexLock l(&mutex_);
// When a manual compaction arrives, temporarily throttle down
// the number of background compaction threads to 1. This is
// needed to ensure that this manual compaction can compact
// any range of keys/files. We artificialy increase
// bg_compaction_scheduled_ by a large number, this causes
// the system to have a single background thread. Now,
// this manual compaction can progress without stomping
// on any other concurrent compactions.
const int LargeNumber = 10000000;
const int newvalue = options_.max_background_compactions-1;
bg_compaction_scheduled_ += LargeNumber;
while (bg_compaction_scheduled_ > LargeNumber) {
Log(options_.info_log, "Manual compaction request waiting for background threads to fall below 1");
// When a manual compaction arrives, temporarily disable scheduling of
// non-manual compactions and wait until the number of scheduled compaction
// jobs drops to zero. This is needed to ensure that this manual compaction
// can compact any range of keys/files.
//
// bg_manual_only_ is non-zero when at least one thread is inside
// TEST_CompactRange(), i.e. during that time no other compaction will
// get scheduled (see MaybeScheduleFlushOrCompaction).
//
// Note that the following loop doesn't stop more that one thread calling
// TEST_CompactRange() from getting to the second while loop below.
// However, only one of them will actually schedule compaction, while
// others will wait on a condition variable until it completes.
++bg_manual_only_;
while (bg_compaction_scheduled_ > 0) {
Log(options_.info_log,
"Manual compaction waiting for all other scheduled background "
"compactions to finish");
bg_cv_.Wait();
}
Log(options_.info_log, "Manual compaction starting");
while (!manual.done) {
while (manual_compaction_ != nullptr) {
bg_cv_.Wait();
}
manual_compaction_ = &manual;
if (bg_compaction_scheduled_ == LargeNumber) {
bg_compaction_scheduled_ = newvalue;
}
MaybeScheduleFlushOrCompaction();
while (manual_compaction_ == &manual) {
while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
assert(bg_manual_only_ > 0);
if (manual_compaction_ != nullptr) {
// Running either this or some other manual compaction
bg_cv_.Wait();
} else {
manual_compaction_ = &manual;
MaybeScheduleFlushOrCompaction();
}
}
assert(!manual.in_progress);
// wait till there are no background threads scheduled
bg_compaction_scheduled_ += LargeNumber;
while (bg_compaction_scheduled_ > LargeNumber + newvalue) {
Log(options_.info_log, "Manual compaction resetting background threads");
bg_cv_.Wait();
}
bg_compaction_scheduled_ = 0;
assert(!manual.in_progress);
assert(bg_manual_only_ > 0);
--bg_manual_only_;
}
Status DBImpl::FlushMemTable(const FlushOptions& options) {
@ -1703,11 +1703,16 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
}
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
// flush, but the HIGH pool is not enabled). Do it only if
// max_background_compactions hasn't been reached and, in case
// bg_manual_only_ > 0, if it's a manual compaction.
if ((manual_compaction_ ||
versions_->NeedsCompaction() ||
(is_flush_pending && (options_.max_background_flushes <= 0))) &&
bg_compaction_scheduled_ < options_.max_background_compactions) {
// compaction needed, or memtable flush needed but HIGH pool not enabled.
bg_compaction_scheduled_ < options_.max_background_compactions &&
(!bg_manual_only_ || manual_compaction_)) {
bg_compaction_scheduled_++;
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
}

@ -388,9 +388,14 @@ class DBImpl : public DB {
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// count how many background compaction been scheduled or is running?
// count how many background compactions are running or have been scheduled
int bg_compaction_scheduled_;
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
// compactions (if manual_compaction_ is not null). This mechanism enables
// manual compactions to wait until all other compactions are finished.
int bg_manual_only_;
// number of background memtable flush jobs, submitted to the HIGH pool
int bg_flush_scheduled_;

Loading…
Cancel
Save