rocksdb/db/write_controller.cc

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.

#include "db/write_controller.h"

#include <atomic>
#include <cassert>
#include "rocksdb/env.h"

namespace rocksdb {

std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
  ++total_stopped_;
  return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
}

std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken() {
  if (total_delayed_++ == 0) {
    last_refill_time_ = 0;
    bytes_left_ = 0;
  }
  return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
}

bool WriteController::IsStopped() const { return total_stopped_ > 0; }
// Tihs is inside DB mutex, so we can't sleep and need to minimize
// frequency to get time.
// If it turns out to be a performance issue, we can redesign the thread
// synchronization model here.
// The function trust caller will sleep micros returned.
uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) {
  if (total_stopped_ > 0) {
    return 0;
  }
  if (total_delayed_ == 0) {
    return 0;
  }

  const uint64_t kMicrosPerSecond = 1000000;
  const uint64_t kRefillInterval = 1024U;

  if (bytes_left_ >= num_bytes) {
    bytes_left_ -= num_bytes;
    return 0;
  }
  // The frequency to get time inside DB mutex is less than one per refill
  // interval.
  auto time_now = env->NowMicros();

  uint64_t sleep_debt = 0;
  uint64_t time_since_last_refill = 0;
  if (last_refill_time_ != 0) {
    if (last_refill_time_ > time_now) {
      sleep_debt = last_refill_time_ - time_now;
    } else {
      time_since_last_refill = time_now - last_refill_time_;
      bytes_left_ +=
          static_cast<uint64_t>(static_cast<double>(time_since_last_refill) /
                                kMicrosPerSecond * delayed_write_rate_);
      if (time_since_last_refill >= kRefillInterval &&
          bytes_left_ > num_bytes) {
        // If refill interval already passed and we have enough bytes
        // return without extra sleeping.
        last_refill_time_ = time_now;
        bytes_left_ -= num_bytes;
        return 0;
      }
    }
  }

  uint64_t single_refill_amount =
      delayed_write_rate_ * kRefillInterval / kMicrosPerSecond;
  if (bytes_left_ + single_refill_amount >= num_bytes) {
    // Wait until a refill interval
    // Never trigger expire for less than one refill interval to avoid to get
    // time.
    bytes_left_ = bytes_left_ + single_refill_amount - num_bytes;
    last_refill_time_ = time_now + kRefillInterval;
    return kRefillInterval + sleep_debt;
  }

  // Need to refill more than one interval. Need to sleep longer. Check
  // whether expiration will hit

  // Sleep just until `num_bytes` is allowed.
  uint64_t sleep_amount =
      static_cast<uint64_t>(num_bytes /
                            static_cast<long double>(delayed_write_rate_) *
                            kMicrosPerSecond) +
      sleep_debt;
  last_refill_time_ = time_now + sleep_amount;
  return sleep_amount;
}

StopWriteToken::~StopWriteToken() {
  assert(controller_->total_stopped_ >= 1);
  --controller_->total_stopped_;
}

DelayWriteToken::~DelayWriteToken() {
  controller_->total_delayed_--;
  assert(controller_->total_delayed_ >= 0);
}

}  // namespace rocksdb
Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 10 years ago			`// Copyright (c) 2013, Facebook, Inc. All rights reserved.`
			`// This source code is licensed under the BSD-style license found in the`
			`// LICENSE file in the root directory of this source tree. An additional grant`
			`// of patent rights can be found in the PATENTS file in the same directory.`

			`#include "db/write_controller.h"`

Slow down writes by bytes written Summary: We slow down data into the database to the rate of options.delayed_write_rate (a new option) with this patch. The thread synchronization approach I take is to still synchronize write controller by DB mutex and GetDelay() is inside DB mutex. Try to minimize the frequency of getting time in GetDelay(). I verified it through db_bench and it seems to work hard_rate_limit is deprecated. options.delayed_write_rate is still not dynamically changeable. Need to work on it as a follow-up. Test Plan: Add new unit tests in db_test Reviewers: yhchiang, rven, kradhakrishnan, anthony, MarkCallaghan, igor Reviewed By: igor Subscribers: ikabiljo, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D36351 10 years ago			`#include <atomic>`
Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 10 years ago			`#include <cassert>`
Slow down writes by bytes written Summary: We slow down data into the database to the rate of options.delayed_write_rate (a new option) with this patch. The thread synchronization approach I take is to still synchronize write controller by DB mutex and GetDelay() is inside DB mutex. Try to minimize the frequency of getting time in GetDelay(). I verified it through db_bench and it seems to work hard_rate_limit is deprecated. options.delayed_write_rate is still not dynamically changeable. Need to work on it as a follow-up. Test Plan: Add new unit tests in db_test Reviewers: yhchiang, rven, kradhakrishnan, anthony, MarkCallaghan, igor Reviewed By: igor Subscribers: ikabiljo, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D36351 10 years ago			`#include "rocksdb/env.h"`
Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 10 years ago
			`namespace rocksdb {`

			`std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {`
			`++total_stopped_;`
			`return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));`
			`}`

Slow down writes by bytes written Summary: We slow down data into the database to the rate of options.delayed_write_rate (a new option) with this patch. The thread synchronization approach I take is to still synchronize write controller by DB mutex and GetDelay() is inside DB mutex. Try to minimize the frequency of getting time in GetDelay(). I verified it through db_bench and it seems to work hard_rate_limit is deprecated. options.delayed_write_rate is still not dynamically changeable. Need to work on it as a follow-up. Test Plan: Add new unit tests in db_test Reviewers: yhchiang, rven, kradhakrishnan, anthony, MarkCallaghan, igor Reviewed By: igor Subscribers: ikabiljo, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D36351 10 years ago			`std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken() {`
			`if (total_delayed_++ == 0) {`
			`last_refill_time_ = 0;`
			`bytes_left_ = 0;`
			`}`
			`return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));`
Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 10 years ago			`}`

			`bool WriteController::IsStopped() const { return total_stopped_ > 0; }`
Slow down writes by bytes written Summary: We slow down data into the database to the rate of options.delayed_write_rate (a new option) with this patch. The thread synchronization approach I take is to still synchronize write controller by DB mutex and GetDelay() is inside DB mutex. Try to minimize the frequency of getting time in GetDelay(). I verified it through db_bench and it seems to work hard_rate_limit is deprecated. options.delayed_write_rate is still not dynamically changeable. Need to work on it as a follow-up. Test Plan: Add new unit tests in db_test Reviewers: yhchiang, rven, kradhakrishnan, anthony, MarkCallaghan, igor Reviewed By: igor Subscribers: ikabiljo, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D36351 10 years ago			`// Tihs is inside DB mutex, so we can't sleep and need to minimize`
			`// frequency to get time.`
			`// If it turns out to be a performance issue, we can redesign the thread`
			`// synchronization model here.`
			`// The function trust caller will sleep micros returned.`
			`uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) {`
			`if (total_stopped_ > 0) {`
			`return 0;`
			`}`
			`if (total_delayed_ == 0) {`
			`return 0;`
			`}`

			`const uint64_t kMicrosPerSecond = 1000000;`
			`const uint64_t kRefillInterval = 1024U;`

			`if (bytes_left_ >= num_bytes) {`
			`bytes_left_ -= num_bytes;`
			`return 0;`
			`}`
			`// The frequency to get time inside DB mutex is less than one per refill`
			`// interval.`
			`auto time_now = env->NowMicros();`

			`uint64_t sleep_debt = 0;`
			`uint64_t time_since_last_refill = 0;`
			`if (last_refill_time_ != 0) {`
			`if (last_refill_time_ > time_now) {`
			`sleep_debt = last_refill_time_ - time_now;`
			`} else {`
			`time_since_last_refill = time_now - last_refill_time_;`
			`bytes_left_ +=`
			`static_cast<uint64_t>(static_cast<double>(time_since_last_refill) /`
			`kMicrosPerSecond * delayed_write_rate_);`
			`if (time_since_last_refill >= kRefillInterval &&`
			`bytes_left_ > num_bytes) {`
			`// If refill interval already passed and we have enough bytes`
			`// return without extra sleeping.`
			`last_refill_time_ = time_now;`
			`bytes_left_ -= num_bytes;`
			`return 0;`
			`}`
			`}`
			`}`

			`uint64_t single_refill_amount =`
			`delayed_write_rate_ * kRefillInterval / kMicrosPerSecond;`
			`if (bytes_left_ + single_refill_amount >= num_bytes) {`
			`// Wait until a refill interval`
			`// Never trigger expire for less than one refill interval to avoid to get`
			`// time.`
			`bytes_left_ = bytes_left_ + single_refill_amount - num_bytes;`
			`last_refill_time_ = time_now + kRefillInterval;`
			`return kRefillInterval + sleep_debt;`
			`}`

			`// Need to refill more than one interval. Need to sleep longer. Check`
			`// whether expiration will hit`

			// Sleep just until `num_bytes` is allowed.
			`uint64_t sleep_amount =`
			`static_cast<uint64_t>(num_bytes /`
			`static_cast<long double>(delayed_write_rate_) *`
			`kMicrosPerSecond) +`
			`sleep_debt;`
			`last_refill_time_ = time_now + sleep_amount;`
			`return sleep_amount;`
			`}`
Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 10 years ago
			`StopWriteToken::~StopWriteToken() {`
			`assert(controller_->total_stopped_ >= 1);`
			`--controller_->total_stopped_;`
			`}`

			`DelayWriteToken::~DelayWriteToken() {`
Slow down writes by bytes written Summary: We slow down data into the database to the rate of options.delayed_write_rate (a new option) with this patch. The thread synchronization approach I take is to still synchronize write controller by DB mutex and GetDelay() is inside DB mutex. Try to minimize the frequency of getting time in GetDelay(). I verified it through db_bench and it seems to work hard_rate_limit is deprecated. options.delayed_write_rate is still not dynamically changeable. Need to work on it as a follow-up. Test Plan: Add new unit tests in db_test Reviewers: yhchiang, rven, kradhakrishnan, anthony, MarkCallaghan, igor Reviewed By: igor Subscribers: ikabiljo, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D36351 10 years ago			`controller_->total_delayed_--;`
			`assert(controller_->total_delayed_ >= 0);`
Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 10 years ago			`}`

			`} // namespace rocksdb`