Make writable_file_max_buffer_size dynamic

Summary:
The DBOptions::writable_file_max_buffer_size can be changed dynamically.
Closes https://github.com/facebook/rocksdb/pull/3053

Differential Revision: D6152720

Pulled By: shligit

fbshipit-source-id: aa0c0cfcfae6a54eb17faadb148d904797c68681
main
Shaohua Li 7 years ago committed by Facebook Github Bot
parent c1be8d86c6
commit 33c7d4ccd9
  1. 1
      HISTORY.md
  2. 5
      db/c.cc
  3. 1
      db/db_impl.cc
  4. 52
      db/db_options_test.cc
  5. 28
      db/version_set.cc
  6. 10
      db/version_set.h
  7. 2
      env/env.cc
  8. 2
      env/env_posix.cc
  9. 2
      include/rocksdb/c.h
  10. 9
      options/db_options.cc
  11. 2
      options/db_options.h
  12. 2
      options/options_helper.cc
  13. 7
      options/options_helper.h
  14. 2
      port/win/env_win.cc
  15. 3
      util/file_reader_writer.h

@ -6,6 +6,7 @@
* Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn the user of inconsistent options. Previously we would not write to the WAL and would not respect the sync option in this case.
### New Features
* `DBOptions::writable_file_max_buffer_size` can now be changed dynamically.
* `DBOptions::bytes_per_sync` and `DBOptions::wal_bytes_per_sync` can now be changed dynamically. Changing `DBOptions::wal_bytes_per_sync` will flush all memtables and switch to a new WAL file.
* Support dynamic adjustment of rate limit according to demand for background I/O. It can be enabled by passing `true` to the `auto_tuned` parameter in `NewGenericRateLimiter()`. The value passed as `rate_bytes_per_sec` will still be respected as an upper-bound.
* Support dynamically changing `ColumnFamilyOptions::compaction_options_fifo`.

@ -2276,6 +2276,11 @@ void rocksdb_options_set_bytes_per_sync(
opt->rep.bytes_per_sync = v;
}
// C API setter: stores `v` into the writable_file_max_buffer_size field of
// the options object wrapped by `opt`.
void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
                                                       uint64_t v) {
  auto& wrapped_options = opt->rep;
  wrapped_options.writable_file_max_buffer_size = v;
}
void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.allow_concurrent_memtable_write = v;

@ -574,6 +574,7 @@ Status DBImpl::SetDBOptions(
env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite(
env_options_for_compaction_,
immutable_db_options_);
versions_->ChangeEnvOptions(mutable_db_options_);
write_thread_.EnterUnbatched(&w, &mutex_);
if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
Status purge_wal_status = SwitchWAL(&write_context);

@ -208,6 +208,58 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) {
ASSERT_GT(low_bytes_per_sync, counter);
}
// Checks that writable_file_max_buffer_size can be changed at runtime through
// SetDBOptions(). A sync-point callback inspects the buffer size reported by
// every WritableFileWriter constructed while the test runs: each one must
// match the currently expected size, both before and after the option change.
TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
  Options options;
  options.create_if_missing = true;
  options.writable_file_max_buffer_size = 1024 * 1024;
  // NOTE(review): presumably chosen so that the three flushes below also
  // trigger a compaction, and so that manifest files are rolled frequently,
  // exercising more file writers -- confirm.
  options.level0_file_num_compaction_trigger = 3;
  options.max_manifest_file_size = 1;
  options.env = env_;
  int buffer_size = 1024 * 1024;
  Reopen(options);
  ASSERT_EQ(buffer_size,
            dbfull()->GetDBOptions().writable_file_max_buffer_size);

  std::atomic<int> match_cnt(0);
  std::atomic<int> unmatch_cnt(0);
  // The callback captures the locals above by reference, so it must not stay
  // registered once this test body returns (see the cleanup at the end).
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
        // The sync point passes the writer's max buffer size encoded in the
        // pointer value itself.
        int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
        if (value == buffer_size) {
          match_cnt++;
        } else {
          unmatch_cnt++;
        }
      });
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();

  // Phase 1: every writer created under the initial options uses 1MB.
  for (int i = 0; i < 3; i++) {
    ASSERT_OK(Put("foo", ToString(i)));
    ASSERT_OK(Put("bar", ToString(i)));
    // Surface flush failures directly instead of as a count mismatch later.
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ(unmatch_cnt, 0);
  ASSERT_GE(match_cnt, 11);

  // Phase 2: shrink the buffer dynamically; all new writers must use 512KB.
  buffer_size = 512 * 1024;
  match_cnt = 0;
  unmatch_cnt = 0;
  ASSERT_OK(
      dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
  ASSERT_EQ(buffer_size,
            dbfull()->GetDBOptions().writable_file_max_buffer_size);
  for (int i = 0; i < 3; i++) {
    ASSERT_OK(Put("foo", ToString(i)));
    ASSERT_OK(Put("bar", ToString(i)));
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ(unmatch_cnt, 0);
  ASSERT_GE(match_cnt, 11);

  // Unregister the callback so that writers created after this point (e.g.
  // while the DB is being closed) cannot touch the dead stack locals.
  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBOptionsTest, SetOptionsAndReopen) {
Random rnd(1044);
auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);

@ -576,7 +576,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
auto table_cache = cfd_->table_cache();
auto ioptions = cfd_->ioptions();
Status s = table_cache->GetTableProperties(
vset_->env_options_, cfd_->internal_comparator(), file_meta->fd,
env_options_, cfd_->internal_comparator(), file_meta->fd,
tp, true /* no io */);
if (s.ok()) {
return s;
@ -599,7 +599,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
file_meta->fd.GetPathId());
}
s = ioptions->env->NewRandomAccessFile(file_name, &file, vset_->env_options_);
s = ioptions->env->NewRandomAccessFile(file_name, &file, env_options_);
if (!s.ok()) {
return s;
}
@ -711,7 +711,7 @@ size_t Version::GetMemoryUsageByTableReaders() {
for (auto& file_level : storage_info_.level_files_brief_) {
for (size_t i = 0; i < file_level.num_files; i++) {
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
vset_->env_options_, cfd_->internal_comparator(),
env_options_, cfd_->internal_comparator(),
file_level.files[i].fd);
}
}
@ -936,7 +936,7 @@ VersionStorageInfo::VersionStorageInfo(
}
Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
uint64_t version_number)
const EnvOptions& env_opt, uint64_t version_number)
: env_(vset->env_),
cfd_(column_family_data),
info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
@ -959,6 +959,7 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
next_(this),
prev_(this),
refs_(0),
env_options_(env_opt),
version_number_(version_number) {}
void Version::Get(const ReadOptions& read_options, const LookupKey& k,
@ -2532,7 +2533,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
LogAndApplyCFHelper(w.edit_list.front());
batch_edits.push_back(w.edit_list.front());
} else {
v = new Version(column_family_data, this, current_version_number_++);
v = new Version(column_family_data, this, env_options_,
current_version_number_++);
builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data));
auto* builder = builder_guard->version_builder();
for (const auto& writer : manifest_writers_) {
@ -2577,7 +2579,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
// Unlock during expensive operations. New writes cannot get here
// because &w is ensuring that all new writes get queued.
{
EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_);
mu->Unlock();
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
@ -2599,7 +2601,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
pending_manifest_file_number_);
unique_ptr<WritableFile> descriptor_file;
EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_);
s = NewWritableFile(
env_, DescriptorFileName(dbname_, pending_manifest_file_number_),
&descriptor_file, opt_env_opts);
@ -3064,7 +3065,8 @@ Status VersionSet::Recover(
false /* prefetch_index_and_filter_in_cache */);
}
Version* v = new Version(cfd, this, current_version_number_++);
Version* v =
new Version(cfd, this, env_options_, current_version_number_++);
builder->SaveTo(v->storage_info());
// Install recovered version
@ -3422,7 +3424,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
assert(builders_iter != builders.end());
auto builder = builders_iter->second->version_builder();
Version* v = new Version(cfd, this, current_version_number_++);
Version* v =
new Version(cfd, this, env_options_, current_version_number_++);
builder->SaveTo(v->storage_info());
v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);
@ -3634,7 +3637,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
// approximate offset of "key" within the table.
TableReader* table_reader_ptr;
InternalIterator* iter = v->cfd_->table_cache()->NewIterator(
ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd,
ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), f.fd,
nullptr /* range_del_agg */, &table_reader_ptr);
if (table_reader_ptr != nullptr) {
result = table_reader_ptr->ApproximateOffsetOf(key);
@ -3865,7 +3868,7 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
assert(edit->is_column_family_add_);
Version* dummy_versions = new Version(nullptr, this);
Version* dummy_versions = new Version(nullptr, this, env_options_);
// Ref() dummy version once so that later we can call Unref() to delete it
// by avoiding calling "delete" explicitly (~Version is private)
dummy_versions->Ref();
@ -3873,7 +3876,8 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
edit->column_family_name_, edit->column_family_, dummy_versions,
cf_options);
Version* v = new Version(new_cfd, this, current_version_number_++);
Version* v =
new Version(new_cfd, this, env_options_, current_version_number_++);
// Fill level target base information.
v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(),

@ -663,12 +663,14 @@ class Version {
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
const EnvOptions env_options_;
// A version number that uniquely represents this version. This is
// used for debugging and logging purposes only.
uint64_t version_number_;
Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
Version(ColumnFamilyData* cfd, VersionSet* vset, const EnvOptions& env_opt,
uint64_t version_number = 0);
~Version();
@ -844,6 +846,10 @@ class VersionSet {
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
// Returns a reference to the EnvOptions currently held by this VersionSet.
const EnvOptions& env_options() { return env_options_; }
// Copies the dynamically changeable writable_file_max_buffer_size from
// `new_options` into this VersionSet's env options. No other EnvOptions
// field is touched.
void ChangeEnvOptions(const MutableDBOptions& new_options) {
  const auto updated_buffer_size = new_options.writable_file_max_buffer_size;
  env_options_.writable_file_max_buffer_size = updated_buffer_size;
}
static uint64_t GetNumLiveVersions(Version* dummy_versions);
@ -908,7 +914,7 @@ class VersionSet {
std::vector<std::string> obsolete_manifests_;
// env options for all reads and writes except compactions
const EnvOptions& env_options_;
EnvOptions env_options_;
// env options used for compactions. This is a copy of
// env_options_ but with readaheads set to readahead_compactions_.

2
env/env.cc vendored

@ -333,6 +333,8 @@ EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
const DBOptions& db_options) const {
EnvOptions optimized_env_options(env_options);
optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
optimized_env_options.writable_file_max_buffer_size =
db_options.writable_file_max_buffer_size;
return optimized_env_options;
}

2
env/env_posix.cc vendored

@ -832,6 +832,8 @@ class PosixEnv : public Env {
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
// test and make this false
optimized.fallocate_with_keep_size = true;
optimized.writable_file_max_buffer_size =
db_options.writable_file_max_buffer_size;
return optimized;
}

@ -857,6 +857,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync(
rocksdb_options_t*, uint64_t);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*,
unsigned char);
extern ROCKSDB_LIBRARY_API void

@ -64,7 +64,6 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
options.new_table_reader_for_compaction_inputs),
compaction_readahead_size(options.compaction_readahead_size),
random_access_max_buffer_size(options.random_access_max_buffer_size),
writable_file_max_buffer_size(options.writable_file_max_buffer_size),
use_adaptive_mutex(options.use_adaptive_mutex),
listeners(options.listeners),
enable_thread_tracking(options.enable_thread_tracking),
@ -175,9 +174,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(
log, " Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt,
random_access_max_buffer_size);
ROCKS_LOG_HEADER(
log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
writable_file_max_buffer_size);
ROCKS_LOG_HEADER(log, " Options.use_adaptive_mutex: %d",
use_adaptive_mutex);
ROCKS_LOG_HEADER(log, " Options.rate_limiter: %p",
@ -230,6 +226,7 @@ MutableDBOptions::MutableDBOptions()
base_background_compactions(-1),
max_background_compactions(-1),
avoid_flush_during_shutdown(false),
writable_file_max_buffer_size(1024 * 1024),
delayed_write_rate(2 * 1024U * 1024U),
max_total_wal_size(0),
delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
@ -243,6 +240,7 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
base_background_compactions(options.base_background_compactions),
max_background_compactions(options.max_background_compactions),
avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
writable_file_max_buffer_size(options.writable_file_max_buffer_size),
delayed_write_rate(options.delayed_write_rate),
max_total_wal_size(options.max_total_wal_size),
delete_obsolete_files_period_micros(
@ -259,6 +257,9 @@ void MutableDBOptions::Dump(Logger* log) const {
max_background_compactions);
ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d",
avoid_flush_during_shutdown);
ROCKS_LOG_HEADER(
log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
writable_file_max_buffer_size);
ROCKS_LOG_HEADER(log, " Options.delayed_write_rate : %" PRIu64,
delayed_write_rate);
ROCKS_LOG_HEADER(log, " Options.max_total_wal_size: %" PRIu64,

@ -57,7 +57,6 @@ struct ImmutableDBOptions {
bool new_table_reader_for_compaction_inputs;
size_t compaction_readahead_size;
size_t random_access_max_buffer_size;
size_t writable_file_max_buffer_size;
bool use_adaptive_mutex;
std::vector<std::shared_ptr<EventListener>> listeners;
bool enable_thread_tracking;
@ -93,6 +92,7 @@ struct MutableDBOptions {
int base_background_compactions;
int max_background_compactions;
bool avoid_flush_during_shutdown;
size_t writable_file_max_buffer_size;
uint64_t delayed_write_rate;
uint64_t max_total_wal_size;
uint64_t delete_obsolete_files_period_micros;

@ -91,7 +91,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
options.random_access_max_buffer_size =
immutable_db_options.random_access_max_buffer_size;
options.writable_file_max_buffer_size =
immutable_db_options.writable_file_max_buffer_size;
mutable_db_options.writable_file_max_buffer_size;
options.use_adaptive_mutex = immutable_db_options.use_adaptive_mutex;
options.listeners = immutable_db_options.listeners;
options.enable_thread_tracking = immutable_db_options.enable_thread_tracking;

@ -220,9 +220,6 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"random_access_max_buffer_size",
{offsetof(struct DBOptions, random_access_max_buffer_size),
OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
{"writable_file_max_buffer_size",
{offsetof(struct DBOptions, writable_file_max_buffer_size),
OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
{"use_adaptive_mutex",
{offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean,
OptionVerificationType::kNormal, false, 0}},
@ -351,6 +348,10 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{offsetof(struct DBOptions, avoid_flush_during_shutdown),
OptionType::kBoolean, OptionVerificationType::kNormal, true,
offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}},
{"writable_file_max_buffer_size",
{offsetof(struct DBOptions, writable_file_max_buffer_size),
OptionType::kSizeT, OptionVerificationType::kNormal, true,
offsetof(struct MutableDBOptions, writable_file_max_buffer_size)}},
{"allow_ingest_behind",
{offsetof(struct DBOptions, allow_ingest_behind), OptionType::kBoolean,
OptionVerificationType::kNormal, false,

@ -786,6 +786,8 @@ EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options,
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
// test and make this false
optimized.fallocate_with_keep_size = true;
optimized.writable_file_max_buffer_size =
db_options.writable_file_max_buffer_size;
return optimized;
}

@ -13,6 +13,7 @@
#include "rocksdb/env.h"
#include "rocksdb/rate_limiter.h"
#include "util/aligned_buffer.h"
#include "util/sync_point.h"
namespace rocksdb {
@ -151,6 +152,8 @@ class WritableFileWriter {
bytes_per_sync_(options.bytes_per_sync),
rate_limiter_(options.rate_limiter),
stats_(stats) {
TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
reinterpret_cast<void*>(max_buffer_size_));
buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
}

Loading…
Cancel
Save