From e7a60d01b2a082013a41449c61d300ba97ecb265 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 18 Mar 2021 09:45:59 -0700 Subject: [PATCH] Revamp WriteController (#8064) Summary: WriteController had a number of issues: * It could introduce a delay of 1ms even if the write rate never exceeded the configured delayed_write_rate. * The DB-wide delayed_write_rate could be exceeded in a number of ways with multiple column families: * Wiping all pending delay "debts" when another column family joins the delay with GetDelayToken(). * Resetting last_refill_time_ to (now + sleep amount) means each column family can write with delayed_write_rate for large writes. * Updating bytes_left_ for a partial refill without updating last_refill_time_ would essentially give out random bonuses, especially to medium-sized writes. Now the code is much simpler, with these issues fixed. See comments in the new code and new (replacement) tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8064 Test Plan: new tests, better than old tests Reviewed By: mrambacher Differential Revision: D27064936 Pulled By: pdillinger fbshipit-source-id: 497c23fe6819340b8f3d440bd634d8a2bc47323f --- HISTORY.md | 1 + db/db_impl/db_impl_write.cc | 8 +- db/db_test.cc | 12 +- db/write_controller.cc | 88 +++++----- db/write_controller.h | 15 +- db/write_controller_test.cc | 312 ++++++++++++++++++++++++------------ 6 files changed, 270 insertions(+), 166 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 479b3efc1..12250a579 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Bug Fixes * Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file. +* When hitting a write slowdown condition, no write delay (previously 1 millisecond) is imposed until `delayed_write_rate` is actually exceeded, with an initial burst allowance of 1 millisecond worth of bytes. Also, beyond the initial burst allowance, `delayed_write_rate` is now more strictly enforced, especially with multiple column families. ### Public API change * Changed default `BackupableDBOptions::share_files_with_checksum` to `true` and deprecated `false` because of potential for data loss. Note that accepting this change in behavior can temporarily increase backup data usage because files are not shared between backups using the two different settings. Also removed obsolete option kFlagMatchInterimNaming. diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 4afc08217..af785fa42 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1476,9 +1476,11 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); - // We will delay the write until we have slept for delay ms or - // we don't need a delay anymore - const uint64_t kDelayInterval = 1000; + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) + const uint64_t kDelayInterval = 1001; uint64_t stall_end = sw.start_time() + delay; while (write_controller_.NeedsDelay()) { if (immutable_db_options_.clock->NowMicros() >= stall_end) { diff --git a/db/db_test.cc b/db/db_test.cc index 7afe317db..2fd04af5f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -247,17 +247,21 @@ TEST_F(DBTest, SkipDelay) { wo.sync = sync; wo.disableWAL = disableWAL; wo.no_slowdown = true; - dbfull()->Put(wo, "foo", "bar"); + // Large enough to exceed allowance for one time interval + std::string large_value(1024, 'x'); + // Perhaps ideally this first write would fail because of delay, but + // the current implementation does not guarantee that. + dbfull()->Put(wo, "foo", large_value).PermitUncheckedError(); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. - ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2")); + ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value)); ASSERT_GE(sleep_count.load(), 0); ASSERT_GE(wait_count.load(), 0); token.reset(); - token = dbfull()->TEST_write_controler().GetDelayToken(1000000000); + token = dbfull()->TEST_write_controler().GetDelayToken(1000000); wo.no_slowdown = false; - ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3")); + ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); ASSERT_GE(sleep_count.load(), 1); token.reset(); } diff --git a/db/write_controller.cc b/db/write_controller.cc index 73f4f33eb..c5f744375 100644 --- a/db/write_controller.cc +++ b/db/write_controller.cc @@ -5,6 +5,7 @@ #include "db/write_controller.h" +#include #include #include #include @@ -20,10 +21,14 @@ std::unique_ptr WriteController::GetStopToken() { std::unique_ptr WriteController::GetDelayToken( uint64_t write_rate) { - total_delayed_++; - // Reset counters. - last_refill_time_ = 0; - bytes_left_ = 0; + if (0 == total_delayed_++) { + // Starting delay, so reset counters. + next_refill_time_ = 0; + credit_in_bytes_ = 0; + } + // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in + // next_refill_time_ will be based on an old rate. This rate will apply + // for subsequent additional debts and for the next refill. set_delayed_write_rate(write_rate); return std::unique_ptr(new DelayWriteToken(this)); } @@ -51,60 +56,47 @@ uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) { return 0; } - const uint64_t kMicrosPerSecond = 1000000; - const uint64_t kRefillInterval = 1024U; - - if (bytes_left_ >= num_bytes) { - bytes_left_ -= num_bytes; + if (credit_in_bytes_ >= num_bytes) { + credit_in_bytes_ -= num_bytes; return 0; } // The frequency to get time inside DB mutex is less than one per refill // interval. auto time_now = NowMicrosMonotonic(clock); - uint64_t sleep_debt = 0; - uint64_t time_since_last_refill = 0; - if (last_refill_time_ != 0) { - if (last_refill_time_ > time_now) { - sleep_debt = last_refill_time_ - time_now; - } else { - time_since_last_refill = time_now - last_refill_time_; - bytes_left_ += - static_cast(static_cast(time_since_last_refill) / - kMicrosPerSecond * delayed_write_rate_); - if (time_since_last_refill >= kRefillInterval && - bytes_left_ > num_bytes) { - // If refill interval already passed and we have enough bytes - // return without extra sleeping. - last_refill_time_ = time_now; - bytes_left_ -= num_bytes; - return 0; - } + const uint64_t kMicrosPerSecond = 1000000; + // Refill every 1 ms + const uint64_t kMicrosPerRefill = 1000; + + if (next_refill_time_ == 0) { + // Start with an initial allotment of bytes for one interval + next_refill_time_ = time_now; + } + if (next_refill_time_ <= time_now) { + // Refill based on time interval plus any extra elapsed + uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill; + credit_in_bytes_ += static_cast( + 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999); + next_refill_time_ = time_now + kMicrosPerRefill; + + if (credit_in_bytes_ >= num_bytes) { + // Avoid delay if possible, to reduce DB mutex release & re-aquire. + credit_in_bytes_ -= num_bytes; + return 0; } } - uint64_t single_refill_amount = - delayed_write_rate_ * kRefillInterval / kMicrosPerSecond; - if (bytes_left_ + single_refill_amount >= num_bytes) { - // Wait until a refill interval - // Never trigger expire for less than one refill interval to avoid to get - // time. - bytes_left_ = bytes_left_ + single_refill_amount - num_bytes; - last_refill_time_ = time_now + kRefillInterval; - return kRefillInterval + sleep_debt; - } + // We need to delay to avoid exceeding write rate. + assert(num_bytes > credit_in_bytes_); + uint64_t bytes_over_budget = num_bytes - credit_in_bytes_; + uint64_t needed_delay = static_cast( + 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond); + + credit_in_bytes_ = 0; + next_refill_time_ += needed_delay; - // Need to refill more than one interval. Need to sleep longer. Check - // whether expiration will hit - - // Sleep just until `num_bytes` is allowed. - uint64_t sleep_amount = - static_cast(num_bytes / - static_cast(delayed_write_rate_) * - kMicrosPerSecond) + - sleep_debt; - last_refill_time_ = time_now + sleep_amount; - return sleep_amount; + // Minimum delay of refill interval, to reduce DB mutex contention. + return std::max(next_refill_time_ - time_now, kMicrosPerRefill); } uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) { diff --git a/db/write_controller.h b/db/write_controller.h index dfd47cee2..88bd1417f 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -27,8 +27,8 @@ class WriteController { : total_stopped_(0), total_delayed_(0), total_compaction_pressure_(0), - bytes_left_(0), - last_refill_time_(0), + credit_in_bytes_(0), + next_refill_time_(0), low_pri_rate_limiter_( NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) { set_max_delayed_write_rate(_delayed_write_rate); @@ -95,11 +95,14 @@ class WriteController { std::atomic total_stopped_; std::atomic total_delayed_; std::atomic total_compaction_pressure_; - uint64_t bytes_left_; - uint64_t last_refill_time_; - // write rate set when initialization or by `DBImpl::SetDBOptions` + + // Number of bytes allowed to write without delay + uint64_t credit_in_bytes_; + // Next time that we can add more credit of bytes + uint64_t next_refill_time_; + // Write rate set when initialization or by `DBImpl::SetDBOptions` uint64_t max_delayed_write_rate_; - // current write rate + // Current write rate (bytes / second) uint64_t delayed_write_rate_; std::unique_ptr low_pri_rate_limiter_; diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index f24c291ee..1f7cf999a 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -5,6 +5,7 @@ // #include "db/write_controller.h" +#include #include #include "rocksdb/system_clock.h" @@ -26,115 +27,216 @@ class WriteControllerTest : public testing::Test { std::shared_ptr clock_; }; -TEST_F(WriteControllerTest, ChangeDelayRateTest) { - WriteController controller(40000000u); // also set max delayed rate - controller.set_delayed_write_rate(10000000u); +// Make tests easier to read +#define MILLION *1000000u +#define MB MILLION +#define MBPS MILLION +#define SECS MILLION // in microseconds + +TEST_F(WriteControllerTest, BasicAPI) { + WriteController controller(40 MBPS); // also set max delayed rate + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + // set, get + controller.set_delayed_write_rate(20 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + { + // set with token, get + auto delay_token_0 = controller.GetDelayToken(10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + // test with delay + EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 2 SECS; // pay the "debt" + + auto delay_token_1 = controller.GetDelayToken(2 MBPS); + EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 10 SECS; // pay the "debt" + + auto delay_token_2 = controller.GetDelayToken(1 MBPS); + EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 20 SECS; // pay the "debt" + + auto delay_token_3 = controller.GetDelayToken(20 MBPS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 1 SECS; // pay the "debt" + + // 60M is more than the max rate of 40M. Max rate will be used. + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + auto delay_token_4 = + controller.GetDelayToken(controller.delayed_write_rate() * 3); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_EQ(static_cast(0.5 SECS), + controller.GetDelay(clock_.get(), 20 MB)); + + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + + // Test stop tokens + { + auto stop_token_1 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + { + auto stop_token_2 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + // Stop tokens released + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + // pay the previous "debt" + clock_->now_micros_ += static_cast(0.5 SECS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB)); + } + + // Delay tokens released + EXPECT_FALSE(controller.NeedsDelay()); +} + +TEST_F(WriteControllerTest, StartFilled) { + WriteController controller(10 MBPS); + + // Attempt to write two things that combined would be allowed within + // a single refill interval auto delay_token_0 = controller.GetDelayToken(controller.delayed_write_rate()); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(clock_.get(), 20000000u)); - auto delay_token_1 = controller.GetDelayToken(2000000u); - ASSERT_EQ(static_cast(10000000), - controller.GetDelay(clock_.get(), 20000000u)); - auto delay_token_2 = controller.GetDelayToken(1000000u); - ASSERT_EQ(static_cast(20000000), - controller.GetDelay(clock_.get(), 20000000u)); - auto delay_token_3 = controller.GetDelayToken(20000000u); - ASSERT_EQ(static_cast(1000000), - controller.GetDelay(clock_.get(), 20000000u)); - // This is more than max rate. Max delayed rate will be used. - auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 3); - ASSERT_EQ(static_cast(500000), - controller.GetDelay(clock_.get(), 20000000u)); + + // Verify no delay because write rate has not been exceeded within + // refill interval. + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + + // Allow refill (kMicrosPerRefill) + clock_->now_micros_ += 1000; + + // Again + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + + // Control: something bigger that would exceed write rate within interval + uint64_t delay = controller.GetDelay(clock_.get(), 10 MB); + EXPECT_GT(1.0 * delay, 0.999 SECS); + EXPECT_LT(1.0 * delay, 1.001 SECS); +} + +TEST_F(WriteControllerTest, DebtAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Accumulate a time delay debt with no passage of time, like many column + // families delaying writes simultaneously. (Old versions of WriteController + // would reset the debt on every GetDelayToken.) + uint64_t debt = 0; + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; + + // Now accumulate debt with some passage of time. + for (unsigned i = 0; i < tokens.size(); ++i) { + // Debt is accumulated in time, not in bytes, so this new write + // limit is not applied to prior requested delays, even it they are + // in progress. + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + uint64_t credit = debt / 2; + clock_->now_micros_ += credit; + debt -= credit; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; // consistent state + (void)debt; // appease clang-analyze + + // Verify paid down + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + + // Accumulate another debt, without accounting, and releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + // Big and small are delayed + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB)); + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + tokens[i].reset(); + } + // All tokens released. + // Verify that releasing all tokens pays down debt, even with no time passage. + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); } -TEST_F(WriteControllerTest, SanityTest) { - WriteController controller(10000000u); - auto stop_token_1 = controller.GetStopToken(); - auto stop_token_2 = controller.GetStopToken(); - - ASSERT_TRUE(controller.IsStopped()); - stop_token_1.reset(); - ASSERT_TRUE(controller.IsStopped()); - stop_token_2.reset(); - ASSERT_FALSE(controller.IsStopped()); - - auto delay_token_1 = controller.GetDelayToken(10000000u); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(clock_.get(), 20000000u)); - - clock_->now_micros_ += 1999900u; // sleep debt 1000 - - auto delay_token_2 = controller.GetDelayToken(10000000u); - // Rate reset after changing the token. - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(clock_.get(), 20000000u)); - - clock_->now_micros_ += 1999900u; // sleep debt 1000 - - // One refill: 10240 bytes allowed, 1000 used, 9240 left - ASSERT_EQ(static_cast(1124), - controller.GetDelay(clock_.get(), 1000u)); - clock_->now_micros_ += 1124u; // sleep debt 0 - - delay_token_2.reset(); - // 1000 used, 8240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(clock_.get(), 1000u)); - - clock_->now_micros_ += 100u; // sleep credit 100 - // 1000 used, 7240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(clock_.get(), 1000u)); - - clock_->now_micros_ += 100u; // sleep credit 200 - // One refill: 10240 fileed, sleep credit generates 2000. 8000 used - // 7240 + 10240 + 2000 - 8000 = 11480 left - ASSERT_EQ(static_cast(1024u), - controller.GetDelay(clock_.get(), 8000u)); - - clock_->now_micros_ += 200u; // sleep debt 824 - // 1000 used, 10480 left. - ASSERT_EQ(static_cast(0), controller.GetDelay(clock_.get(), 1000u)); - - clock_->now_micros_ += 200u; // sleep debt 624 - // Out of bound sleep, still 10480 left - ASSERT_EQ(static_cast(3000624u), - controller.GetDelay(clock_.get(), 30000000u)); - - clock_->now_micros_ += 3000724u; // sleep credit 100 - // 6000 used, 4480 left. - ASSERT_EQ(static_cast(0), controller.GetDelay(clock_.get(), 6000u)); - - clock_->now_micros_ += 200u; // sleep credit 300 - // One refill, credit 4480 balance + 3000 credit + 10240 refill - // Use 8000, 9720 left - ASSERT_EQ(static_cast(1024u), - controller.GetDelay(clock_.get(), 8000u)); - - clock_->now_micros_ += 3024u; // sleep credit 2000 - - // 1720 left - ASSERT_EQ(static_cast(0u), - controller.GetDelay(clock_.get(), 8000u)); - - // 1720 balance + 20000 credit = 20170 left - // Use 8000, 12170 left - ASSERT_EQ(static_cast(0u), - controller.GetDelay(clock_.get(), 8000u)); - - // 4170 left - ASSERT_EQ(static_cast(0u), - controller.GetDelay(clock_.get(), 8000u)); - - // Need a refill - ASSERT_EQ(static_cast(1024u), - controller.GetDelay(clock_.get(), 9000u)); - - delay_token_1.reset(); - ASSERT_EQ(static_cast(0), - controller.GetDelay(clock_.get(), 30000000u)); - delay_token_1.reset(); - ASSERT_FALSE(controller.IsStopped()); +// This may or may not be a "good" feature, but it's an old feature +TEST_F(WriteControllerTest, CreditAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Ensure started + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit + uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */; + clock_->now_micros_ += credit; + + // Spend some credit (burst of I/O) + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB)); + // In WriteController, credit is accumulated in bytes, not in time. + // After an "unnecessary" delay, all of our time credit will be + // translated to bytes on the next operation, in this case with + // setting 1 MBPS. So regardless of the rate at delay time, we just + // account for the bytes. + credit -= 63 MB; + } + // Spend remaining credit + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit)); + // Verify + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit, no accounting + clock_->now_micros_ += 1000 SECS; + + // Spend a small amount, releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB)); + tokens[i].reset(); + } + + // All tokens released. + // Verify credit is wiped away on new delay. + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); } } // namespace ROCKSDB_NAMESPACE