From cd7c4143d795ab9a53e6eaeb4ee572b4e258313b Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Wed, 23 Nov 2016 09:19:11 -0800 Subject: [PATCH] Improve Write Stalling System Summary: Current write stalling system has the problem of lacking positive feedback if the restricted rate is already too low. Users sometimes get stuck at a very low slowdown value. With the diff, we add a positive feedback (increasing the slowdown value) if we recover from slowdown state back to normal. To prevent the positive feedback from keeping the slowdown value too high, we issue a negative feedback every time we are close to the stop condition. Experiments show it is easier to reach a relative balance than before. Also increase level0_stop_writes_trigger default from 24 to 32. Since level0_slowdown_writes_trigger default is 20, stop trigger 24 only gives four files as the buffer time to slow down writes. In order to avoid stop in four files while 20 files have been accumulated, the slowdown value must be very low, which is almost the same as stop. It also doesn't give enough time for the slowdown value to converge. Increasing it to 32 will smooth out the system. Closes https://github.com/facebook/rocksdb/pull/1562 Differential Revision: D4218519 Pulled By: siying fbshipit-source-id: 95e4088 --- HISTORY.md | 4 ++ db/column_family.cc | 118 +++++++++++++++++++++++++----------- db/column_family.h | 2 +- db/column_family_test.cc | 35 ++++++----- db/db_test.cc | 6 +- db/write_controller.h | 2 + db/write_controller_test.cc | 6 +- util/options.cc | 5 +- 8 files changed, 117 insertions(+), 61 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index a4a867e23..b16628ad0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## Unreleased +### Public API Change +* Options.level0_stop_writes_trigger default value changes from 24 to 32. + ## 5.0.0 (11/17/2016) ### Public API Change * Options::max_bytes_for_level_multiplier is now a double along with all getters and setters. 
diff --git a/db/column_family.cc b/db/column_family.cc index b4fe6c181..b27d70e71 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -492,14 +492,18 @@ ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const { return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); } -const double kSlowdownRatio = 1.2; +const double kIncSlowdownRatio = 0.8; +const double kDecSlowdownRatio = 1 / kIncSlowdownRatio; +const double kNearStopSlowdownRatio = 0.6; +const double kDelayRecoverSlowdownRatio = 1.4; namespace { +// If penalize_stop is true, we further reduce slowdown rate. std::unique_ptr SetupDelay( - WriteController* write_controller, - uint64_t compaction_needed_bytes, uint64_t prev_compaction_neeed_bytes, + WriteController* write_controller, uint64_t compaction_needed_bytes, + uint64_t prev_compaction_need_bytes, bool penalize_stop, bool auto_comapctions_disabled) { - const uint64_t kMinWriteRate = 1024u; // Minimum write rate 1KB/s. + const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s. uint64_t max_write_rate = write_controller->max_delayed_write_rate(); uint64_t write_rate = write_controller->delayed_write_rate(); @@ -524,19 +528,32 @@ std::unique_ptr SetupDelay( // insert to mem tables, so we need to actively slow down before we get // feedback signal from compaction and flushes to avoid the full stop // because of hitting the max write buffer number. - if (prev_compaction_neeed_bytes > 0 && - prev_compaction_neeed_bytes <= compaction_needed_bytes) { - write_rate = static_cast(static_cast(write_rate) / - kSlowdownRatio); + // + // If DB just falled into the stop condition, we need to further reduce + // the write rate to avoid the stop condition. + if (penalize_stop) { + // Penalize the near stop or stop condition by more agressive slowdown. + // This is to provide the long term slowdown increase signal. + // The penalty is more than the reward of recovering to the normal + // condition. 
+ write_rate = static_cast(static_cast(write_rate) * + kNearStopSlowdownRatio); if (write_rate < kMinWriteRate) { write_rate = kMinWriteRate; } - } else if (prev_compaction_neeed_bytes > compaction_needed_bytes) { + } else if (prev_compaction_need_bytes > 0 && + prev_compaction_need_bytes <= compaction_needed_bytes) { + write_rate = static_cast(static_cast(write_rate) * + kIncSlowdownRatio); + if (write_rate < kMinWriteRate) { + write_rate = kMinWriteRate; + } + } else if (prev_compaction_need_bytes > compaction_needed_bytes) { // We are speeding up by ratio of kSlowdownRatio when we have paid // compaction debt. But we'll never speed up to faster than the write rate // given by users. write_rate = static_cast(static_cast(write_rate) * - kSlowdownRatio); + kDecSlowdownRatio); if (write_rate > max_write_rate) { write_rate = max_write_rate; } @@ -589,6 +606,9 @@ void ColumnFamilyData::RecalculateWriteStallConditions( uint64_t compaction_needed_bytes = vstorage->estimated_compaction_needed_bytes(); + bool was_stopped = write_controller->IsStopped(); + bool needed_delay = write_controller->NeedsDelay(); + if (imm()->NumNotFlushed() >= mutable_cf_options.max_write_buffer_number) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); @@ -625,7 +645,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.max_write_buffer_number - 1) { write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, - prev_compaction_needed_bytes_, + prev_compaction_needed_bytes_, was_stopped, mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::MEMTABLE_SLOWDOWN, 1); Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, @@ -639,9 +659,12 @@ void ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.level0_slowdown_writes_trigger >= 0 && vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_slowdown_writes_trigger) { + 
// L0 is the last two files from stopping. + bool near_stop = vstorage->l0_delay_trigger_count() >= + mutable_cf_options.level0_stop_writes_trigger - 2; write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, - prev_compaction_needed_bytes_, + prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN_TOTAL, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { @@ -657,9 +680,20 @@ void ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.soft_pending_compaction_bytes_limit > 0 && vstorage->estimated_compaction_needed_bytes() >= mutable_cf_options.soft_pending_compaction_bytes_limit) { + // If the distance to hard limit is less than 1/4 of the gap between soft + // and + // hard bytes limit, we think it is near stop and speed up the slowdown. + bool near_stop = + mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && + (compaction_needed_bytes - + mutable_cf_options.soft_pending_compaction_bytes_limit) > + 3 * (mutable_cf_options.hard_pending_compaction_bytes_limit - + mutable_cf_options.soft_pending_compaction_bytes_limit) / + 4; + write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, - prev_compaction_needed_bytes_, + prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats( InternalStats::SOFT_PENDING_COMPACTION_BYTES_LIMIT, 1); @@ -668,31 +702,43 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "bytes %" PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), write_controller->delayed_write_rate()); - } else if (vstorage->l0_delay_trigger_count() >= - GetL0ThresholdSpeedupCompaction( - mutable_cf_options.level0_file_num_compaction_trigger, - mutable_cf_options.level0_slowdown_writes_trigger)) { - write_controller_token_ = 
write_controller->GetCompactionPressureToken(); - Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Increasing compaction threads because we have %d level-0 " - "files ", - name_.c_str(), vstorage->l0_delay_trigger_count()); - } else if (vstorage->estimated_compaction_needed_bytes() >= - mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { - // Increase compaction threads if bytes needed for compaction exceeds - // 1/4 of threshold for slowing down. - // If soft pending compaction byte limit is not set, always speed up - // compaction. - write_controller_token_ = write_controller->GetCompactionPressureToken(); - if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { + } else { + if (vstorage->l0_delay_trigger_count() >= + GetL0ThresholdSpeedupCompaction( + mutable_cf_options.level0_file_num_compaction_trigger, + mutable_cf_options.level0_slowdown_writes_trigger)) { + write_controller_token_ = + write_controller->GetCompactionPressureToken(); Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Increasing compaction threads because of estimated pending " - "compaction " - "bytes %" PRIu64, - name_.c_str(), vstorage->estimated_compaction_needed_bytes()); + "[%s] Increasing compaction threads because we have %d level-0 " + "files ", + name_.c_str(), vstorage->l0_delay_trigger_count()); + } else if (vstorage->estimated_compaction_needed_bytes() >= + mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { + // Increase compaction threads if bytes needed for compaction exceeds + // 1/4 of threshold for slowing down. + // If soft pending compaction byte limit is not set, always speed up + // compaction. 
+ write_controller_token_ = + write_controller->GetCompactionPressureToken(); + if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Increasing compaction threads because of estimated pending " + "compaction " + "bytes %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes()); + } + } else { + write_controller_token_.reset(); + } + // If the DB recovers from delay conditions, we reward with reducing + // double the slowdown ratio. This is to balance the long term slowdown + // increase signal. + if (needed_delay) { + uint64_t write_rate = write_controller->delayed_write_rate(); + write_controller->set_delayed_write_rate(static_cast( + static_cast(write_rate) * kDelayRecoverSlowdownRatio)); } - } else { - write_controller_token_.reset(); } prev_compaction_needed_bytes_ = compaction_needed_bytes; } diff --git a/db/column_family.h b/db/column_family.h index 5bf951e3a..29d297157 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -42,7 +42,7 @@ class LogBuffer; class InstrumentedMutex; class InstrumentedMutexLock; -extern const double kSlowdownRatio; +extern const double kIncSlowdownRatio; // ColumnFamilyHandleImpl is the class that clients use to access different // column families. 
It has non-trivial destructor, which gets called when client diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 2d3f2cc0d..7e5164e96 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -2440,7 +2440,7 @@ TEST_F(ColumnFamilyTest, CreateAndDropRace) { #endif // !ROCKSDB_LITE TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { - const uint64_t kBaseRate = 810000u; + const uint64_t kBaseRate = 800000u; db_options_.delayed_write_rate = kBaseRate; db_options_.base_background_compactions = 2; db_options_.max_background_compactions = 6; @@ -2475,7 +2475,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); @@ -2483,14 +2483,14 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25 / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(450); cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(205); @@ -2526,7 +2526,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); 
ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(2001); @@ -2544,7 +2544,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(100); @@ -2556,15 +2556,14 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, - dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(101); cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->set_l0_delay_trigger_count(0); @@ -2572,21 +2571,21 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25 / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->set_l0_delay_trigger_count(101); 
cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2 / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(200); cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25 / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->set_l0_delay_trigger_count(0); @@ -2627,7 +2626,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); } @@ -2744,35 +2743,35 @@ TEST_F(ColumnFamilyTest, WriteStallTwoColumnFamilies) { cfd1->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(300); cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25 / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage1->TEST_set_estimated_compaction_needed_bytes(700); cfd1->RecalculateWriteStallConditions(mutable_cf_options); 
ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->TEST_set_estimated_compaction_needed_bytes(500); cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2 / 1.2, + ASSERT_EQ(kBaseRate / 1.25 / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage1->TEST_set_estimated_compaction_needed_bytes(600); cfd1->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ASSERT_EQ(kBaseRate / 1.2, + ASSERT_EQ(kBaseRate / 1.25, dbfull()->TEST_write_controler().delayed_write_rate()); } diff --git a/db/db_test.cc b/db/db_test.cc index 2ca7de3dc..d7e597971 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5378,7 +5378,7 @@ TEST_F(DBTest, FlushesInParallelWithCompactRange) { TEST_F(DBTest, DelayedWriteRate) { const int kEntriesPerMemTable = 100; - const int kTotalFlushes = 20; + const int kTotalFlushes = 12; Options options = CurrentOptions(); env_->SetBackgroundThreads(1, Env::LOW); @@ -5428,8 +5428,8 @@ TEST_F(DBTest, DelayedWriteRate) { dbfull()->TEST_WaitForFlushMemTable(); estimated_sleep_time += size_memtable * 1000000u / cur_rate; // Slow down twice. One for memtable switch and one for flush finishes. - cur_rate = static_cast(static_cast(cur_rate) / - kSlowdownRatio / kSlowdownRatio); + cur_rate = static_cast(static_cast(cur_rate) * + kIncSlowdownRatio * kIncSlowdownRatio); } // Estimate the total sleep time fall into the rough range. 
ASSERT_GT(env_->addon_time_.load(), diff --git a/db/write_controller.h b/db/write_controller.h index 0e8047456..36a905902 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -57,6 +57,8 @@ class WriteController { // avoid divide 0 if (write_rate == 0) { write_rate = 1u; + } else if (write_rate > max_delayed_write_rate()) { + write_rate = max_delayed_write_rate(); } delayed_write_rate_ = write_rate; } diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index db9a9db1b..ae890467f 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -21,7 +21,8 @@ class TimeSetEnv : public EnvWrapper { TEST_F(WriteControllerTest, ChangeDelayRateTest) { TimeSetEnv env; - WriteController controller(10000000u); + WriteController controller(40000000u); // also set max delayed rate + controller.set_delayed_write_rate(10000000u); auto delay_token_0 = controller.GetDelayToken(controller.delayed_write_rate()); ASSERT_EQ(static_cast(2000000), @@ -35,8 +36,9 @@ TEST_F(WriteControllerTest, ChangeDelayRateTest) { auto delay_token_3 = controller.GetDelayToken(20000000u); ASSERT_EQ(static_cast(1000000), controller.GetDelay(&env, 20000000u)); + // This is more than max rate. Max delayed rate will be used. 
auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 2); + controller.GetDelayToken(controller.delayed_write_rate() * 3); ASSERT_EQ(static_cast(500000), controller.GetDelay(&env, 20000000u)); } diff --git a/util/options.cc b/util/options.cc index 05ea42a3a..b4a6d36dc 100644 --- a/util/options.cc +++ b/util/options.cc @@ -50,7 +50,7 @@ ColumnFamilyOptions::ColumnFamilyOptions() num_levels(7), level0_file_num_compaction_trigger(4), level0_slowdown_writes_trigger(20), - level0_stop_writes_trigger(24), + level0_stop_writes_trigger(32), target_file_size_base(64 * 1048576), target_file_size_multiplier(1), max_bytes_for_level_base(256 * 1048576), @@ -672,6 +672,9 @@ ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults( soft_pending_compaction_bytes_limit = 0; hard_pending_compaction_bytes_limit = 0; } + if (rocksdb_major_version < 5) { + level0_stop_writes_trigger = 24; + } compaction_pri = CompactionPri::kByCompensatedSize; return this;