Add an option wal_bytes_per_sync to control sync_file_range for WAL files

Summary:
sync_file_range is not always asynchronous and thus can block writes if we do this for WAL in the foreground thread. See more here: http://yoshinorimatsunobu.blogspot.com/2014/03/how-syncfilerange-really-works.html

Some users don't want us to call sync_file_range on WALs. Some others do.
Thus, I'm adding a separate option wal_bytes_per_sync to control calling
sync_file_range on WAL files. bytes_per_sync will apply only to table
files now.

Test Plan: no more sync_file_range for WAL as evidenced by strace

Reviewers: yhchiang, rven, sdong

Reviewed By: sdong

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D38253
main
Igor Canadi 10 years ago
parent b0fdda4ff0
commit 4a855c0799
  1. 1
      HISTORY.md
  2. 5
      db/db_impl.cc
  3. 3
      include/rocksdb/env.h
  4. 6
      include/rocksdb/options.h
  5. 7
      util/env.cc
  6. 4
      util/env_posix.cc
  7. 4
      util/options.cc
  8. 2
      util/options_helper.cc
  9. 2
      util/options_test.cc

@ -11,6 +11,7 @@
### Public API changes ### Public API changes
* TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users. * TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users.
* DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync
## 3.10.0 (3/24/2015) ## 3.10.0 (3/24/2015)
### New Features ### New Features

@ -3455,7 +3455,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
if (creating_new_log) { if (creating_new_log) {
s = env_->NewWritableFile( s = env_->NewWritableFile(
LogFileName(db_options_.wal_dir, new_log_number), &lfile, LogFileName(db_options_.wal_dir, new_log_number), &lfile,
env_->OptimizeForLogWrite(env_options_)); env_->OptimizeForLogWrite(env_options_, db_options_));
if (s.ok()) { if (s.ok()) {
// Our final size should be less than write_buffer_size // Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution. // (compression, etc) but err on the side of caution.
@ -3965,7 +3965,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
EnvOptions soptions(db_options); EnvOptions soptions(db_options);
s = impl->db_options_.env->NewWritableFile( s = impl->db_options_.env->NewWritableFile(
LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile,
impl->db_options_.env->OptimizeForLogWrite(soptions)); impl->db_options_.env->OptimizeForLogWrite(soptions,
impl->db_options_));
if (s.ok()) { if (s.ok()) {
lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
impl->logfile_number_ = new_log_number; impl->logfile_number_ = new_log_number;

@ -300,7 +300,8 @@ class Env {
// OptimizeForLogWrite will create a new EnvOptions object that is a copy of // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
// the EnvOptions in the parameters, but is optimized for writing log files. // the EnvOptions in the parameters, but is optimized for writing log files.
// Default implementation returns the copy of the same object. // Default implementation returns the copy of the same object.
virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const; virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
const DBOptions& db_options) const;
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
// of the EnvOptions in the parameters, but is optimized for writing manifest // of the EnvOptions in the parameters, but is optimized for writing manifest
// files. Default implementation returns the copy of the same object. // files. Default implementation returns the copy of the same object.

@ -1001,8 +1001,14 @@ struct DBOptions {
// You may consider using rate_limiter to regulate write rate to device. // You may consider using rate_limiter to regulate write rate to device.
// When rate limiter is enabled, it automatically enables bytes_per_sync // When rate limiter is enabled, it automatically enables bytes_per_sync
// to 1MB. // to 1MB.
//
// This option applies to table files
uint64_t bytes_per_sync; uint64_t bytes_per_sync;
// Same as bytes_per_sync, but applies to WAL files
// Default: 0, turned off
uint64_t wal_bytes_per_sync;
// If true, then the status of the threads involved in this DB will // If true, then the status of the threads involved in this DB will
// be tracked and available via GetThreadList() API. // be tracked and available via GetThreadList() API.
// //

@ -249,8 +249,11 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
} }
EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const { EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
return env_options; const DBOptions& db_options) const {
EnvOptions optimized_env_options(env_options);
optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
return optimized_env_options;
} }
EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const { EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {

@ -1510,9 +1510,11 @@ class PosixEnv : public Env {
return dummy; return dummy;
} }
EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const override { EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
const DBOptions& db_options) const override {
EnvOptions optimized = env_options; EnvOptions optimized = env_options;
optimized.use_mmap_writes = false; optimized.use_mmap_writes = false;
optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
// test and make this false // test and make this false

@ -255,6 +255,7 @@ DBOptions::DBOptions()
access_hint_on_compaction_start(NORMAL), access_hint_on_compaction_start(NORMAL),
use_adaptive_mutex(false), use_adaptive_mutex(false),
bytes_per_sync(0), bytes_per_sync(0),
wal_bytes_per_sync(0),
enable_thread_tracking(false) { enable_thread_tracking(false) {
} }
@ -298,6 +299,7 @@ DBOptions::DBOptions(const Options& options)
access_hint_on_compaction_start(options.access_hint_on_compaction_start), access_hint_on_compaction_start(options.access_hint_on_compaction_start),
use_adaptive_mutex(options.use_adaptive_mutex), use_adaptive_mutex(options.use_adaptive_mutex),
bytes_per_sync(options.bytes_per_sync), bytes_per_sync(options.bytes_per_sync),
wal_bytes_per_sync(options.wal_bytes_per_sync),
enable_thread_tracking(options.enable_thread_tracking) {} enable_thread_tracking(options.enable_thread_tracking) {}
static const char* const access_hints[] = { static const char* const access_hints[] = {
@ -364,6 +366,8 @@ void DBOptions::Dump(Logger* log) const {
rate_limiter.get()); rate_limiter.get());
Log(log, " Options.bytes_per_sync: %" PRIu64, Log(log, " Options.bytes_per_sync: %" PRIu64,
bytes_per_sync); bytes_per_sync);
Log(log, " Options.wal_bytes_per_sync: %" PRIu64,
wal_bytes_per_sync);
Log(log, " Options.enable_thread_tracking: %d", Log(log, " Options.enable_thread_tracking: %d",
enable_thread_tracking); enable_thread_tracking);
} // DBOptions::Dump } // DBOptions::Dump

@ -555,6 +555,8 @@ bool ParseDBOption(const std::string& name, const std::string& value,
new_options->use_adaptive_mutex = ParseBoolean(name, value); new_options->use_adaptive_mutex = ParseBoolean(name, value);
} else if (name == "bytes_per_sync") { } else if (name == "bytes_per_sync") {
new_options->bytes_per_sync = ParseUint64(value); new_options->bytes_per_sync = ParseUint64(value);
} else if (name == "wal_bytes_per_sync") {
new_options->wal_bytes_per_sync = ParseUint64(value);
} else { } else {
return false; return false;
} }

@ -172,6 +172,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"advise_random_on_open", "true"}, {"advise_random_on_open", "true"},
{"use_adaptive_mutex", "false"}, {"use_adaptive_mutex", "false"},
{"bytes_per_sync", "47"}, {"bytes_per_sync", "47"},
{"wal_bytes_per_sync", "48"},
}; };
ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions base_cf_opt;
@ -278,6 +279,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.advise_random_on_open, true);
ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47)); ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
} }
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE

Loading…
Cancel
Save