From a7d4eb2f34d3177b9c062474626503a40c530e34 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 4 Mar 2016 14:24:52 -0800 Subject: [PATCH] Fix a bug where flush does not happen when a manual compaction is running Summary: Currently, when rocksdb tries to run manual compaction to refit data into a level, there's a ReFitLevel() process that requires no bg work is currently running. When RocksDB plans to ReFitLevel(), it will do the following: 1. pause scheduling new bg work. 2. wait until all bg work finished 3. do the ReFitLevel() 4. unpause scheduling new bg work. However, as it pause scheduling new bg work at step one and waiting for all bg work finished in step 2, RocksDB will stop flushing until all bg work is done (which could take a long time.) This patch fix this issue by changing the way ReFitLevel() pause the background work: 1. pause scheduling compaction. 2. wait until all bg work finished. 3. pause scheduling flush 4. do ReFitLevel() 5. unpause both flush and compaction. The major difference is that. We only pause scheduling compaction in step 1 and wait for all bg work finished in step 2. This prevent flush being blocked for a long time. Although there's a very rare case that ReFitLevel() might be in starvation in step 2, but it's less likely the case as flush typically finish very fast. Test Plan: existing test. Reviewers: anthony, IslamAbdelRahman, kradhakrishnan, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D55029 --- db/db_impl.cc | 13 ++++++++++++- db/db_impl.h | 5 ++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 153b605a8..ebcbe1a4e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -282,6 +282,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) #endif // ROCKSDB_LITE event_logger_(db_options_.info_log.get()), bg_work_paused_(0), + bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false) { env_->GetAbsolutePath(dbname, &db_absolute_path_); @@ -1889,10 +1890,11 @@ Status DBImpl::CompactFilesImpl( Status DBImpl::PauseBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); - bg_work_paused_++; + bg_compaction_paused_++; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) { bg_cv_.Wait(); } + bg_work_paused_++; return Status::OK(); } @@ -1902,7 +1904,11 @@ Status DBImpl::ContinueBackgroundWork() { return Status::InvalidArgument(); } assert(bg_work_paused_ > 0); + assert(bg_compaction_paused_ > 0); + bg_compaction_paused_--; bg_work_paused_--; + // It's sufficient to check just bg_work_paused_ here since + // bg_work_paused_ is always no greater than bg_compaction_paused_ if (bg_work_paused_ == 0) { MaybeScheduleFlushOrCompaction(); } @@ -2485,6 +2491,11 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } } + if (bg_compaction_paused_ > 0) { + // we paused the background compaction + return; + } + if (HasExclusiveManualCompaction()) { // only manual compactions are allowed to run. don't schedule automatic // compactions diff --git a/db/db_impl.h b/db/db_impl.h index f068fe084..b9efb775b 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -844,9 +844,12 @@ class DBImpl : public DB { // Unified interface for logging events EventLogger event_logger_; - // A value of >0 temporarily disables scheduling of background work + // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; + // A value of > 0 temporarily disables scheduling of background compaction + int bg_compaction_paused_; + // Guard against multiple concurrent refitting bool refitting_level_;