Option to make write group size configurable (#5759)

Summary:
The max batch size that we can write to the WAL is controlled by a static manner. So if the leader write is less than 128 KB we will have the batch size as leader write size + 128 KB else the limit will be 1 MB. Both of them are statically defined.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5759

Differential Revision: D17329298

fbshipit-source-id: a3d910629d8d8ca84ea39ad89c2b2d284571ded5
main
Ronak Sisodia 5 years ago committed by Facebook Github Bot
parent 9eb3e1f77d
commit d05c0fe4d1
  1. 16
      db/write_thread.cc
  2. 5
      db/write_thread.h
  3. 7
      include/rocksdb/options.h
  4. 6
      options/db_options.cc
  5. 1
      options/db_options.h
  6. 5
      options/options_helper.cc
  7. 1
      options/options_settable_test.cc

@ -22,6 +22,8 @@ WriteThread::WriteThread(const ImmutableDBOptions& db_options)
allow_concurrent_memtable_write_( allow_concurrent_memtable_write_(
db_options.allow_concurrent_memtable_write), db_options.allow_concurrent_memtable_write),
enable_pipelined_write_(db_options.enable_pipelined_write), enable_pipelined_write_(db_options.enable_pipelined_write),
max_write_batch_group_size_bytes(
db_options.max_write_batch_group_size_bytes),
newest_writer_(nullptr), newest_writer_(nullptr),
newest_memtable_writer_(nullptr), newest_memtable_writer_(nullptr),
last_sequence_(0), last_sequence_(0),
@ -406,9 +408,10 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
// Allow the group to grow up to a maximum size, but if the // Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow // original write is small, limit the growth so we do not slow
// down the small write too much. // down the small write too much.
size_t max_size = 1 << 20; size_t max_size = max_write_batch_group_size_bytes;
if (size <= (128 << 10)) { const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
max_size = size + (128 << 10); if (size <= min_batch_size_bytes) {
max_size = size + min_batch_size_bytes;
} }
leader->write_group = write_group; leader->write_group = write_group;
@ -485,9 +488,10 @@ void WriteThread::EnterAsMemTableWriter(Writer* leader,
// Allow the group to grow up to a maximum size, but if the // Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow // original write is small, limit the growth so we do not slow
// down the small write too much. // down the small write too much.
size_t max_size = 1 << 20; size_t max_size = max_write_batch_group_size_bytes;
if (size <= (128 << 10)) { const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
max_size = size + (128 << 10); if (size <= min_batch_size_bytes) {
max_size = size + min_batch_size_bytes;
} }
leader->write_group = write_group; leader->write_group = write_group;

@ -360,6 +360,11 @@ class WriteThread {
// Enable pipelined write to WAL and memtable. // Enable pipelined write to WAL and memtable.
const bool enable_pipelined_write_; const bool enable_pipelined_write_;
// The maximum limit of number of bytes that are written in a single batch
// of WAL or memtable write. It is followed when the leader write size
// is larger than 1/8 of this limit.
const uint64_t max_write_batch_group_size_bytes;
// Points to the newest pending writer. Only leader can remove // Points to the newest pending writer. Only leader can remove
// elements, adding can be done lock-free by anybody. // elements, adding can be done lock-free by anybody.
std::atomic<Writer*> newest_writer_; std::atomic<Writer*> newest_writer_;

@ -953,6 +953,13 @@ struct DBOptions {
// Default: true // Default: true
bool enable_write_thread_adaptive_yield = true; bool enable_write_thread_adaptive_yield = true;
// The maximum limit of number of bytes that are written in a single batch
// of WAL or memtable write. It is followed when the leader write size
// is larger than 1/8 of this limit.
//
// Default: 1 MB
uint64_t max_write_batch_group_size_bytes = 1 << 20;
// The maximum number of microseconds that a write operation will use // The maximum number of microseconds that a write operation will use
// a yielding spin loop to coordinate with other write threads before // a yielding spin loop to coordinate with other write threads before
// blocking on a mutex. (Assuming write_thread_slow_yield_usec is // blocking on a mutex. (Assuming write_thread_slow_yield_usec is

@ -44,6 +44,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
table_cache_numshardbits(options.table_cache_numshardbits), table_cache_numshardbits(options.table_cache_numshardbits),
wal_ttl_seconds(options.WAL_ttl_seconds), wal_ttl_seconds(options.WAL_ttl_seconds),
wal_size_limit_mb(options.WAL_size_limit_MB), wal_size_limit_mb(options.WAL_size_limit_MB),
max_write_batch_group_size_bytes(
options.max_write_batch_group_size_bytes),
manifest_preallocation_size(options.manifest_preallocation_size), manifest_preallocation_size(options.manifest_preallocation_size),
allow_mmap_reads(options.allow_mmap_reads), allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes), allow_mmap_writes(options.allow_mmap_writes),
@ -153,6 +155,10 @@ void ImmutableDBOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(log, ROCKS_LOG_HEADER(log,
" Options.WAL_size_limit_MB: %" PRIu64, " Options.WAL_size_limit_MB: %" PRIu64,
wal_size_limit_mb); wal_size_limit_mb);
ROCKS_LOG_HEADER(log,
" "
"Options.max_write_batch_group_size_bytes: %" PRIu64,
max_write_batch_group_size_bytes);
ROCKS_LOG_HEADER( ROCKS_LOG_HEADER(
log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
manifest_preallocation_size); manifest_preallocation_size);

@ -43,6 +43,7 @@ struct ImmutableDBOptions {
int table_cache_numshardbits; int table_cache_numshardbits;
uint64_t wal_ttl_seconds; uint64_t wal_ttl_seconds;
uint64_t wal_size_limit_mb; uint64_t wal_size_limit_mb;
uint64_t max_write_batch_group_size_bytes;
size_t manifest_preallocation_size; size_t manifest_preallocation_size;
bool allow_mmap_reads; bool allow_mmap_reads;
bool allow_mmap_writes; bool allow_mmap_writes;

@ -110,6 +110,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
immutable_db_options.allow_concurrent_memtable_write; immutable_db_options.allow_concurrent_memtable_write;
options.enable_write_thread_adaptive_yield = options.enable_write_thread_adaptive_yield =
immutable_db_options.enable_write_thread_adaptive_yield; immutable_db_options.enable_write_thread_adaptive_yield;
options.max_write_batch_group_size_bytes =
immutable_db_options.max_write_batch_group_size_bytes;
options.write_thread_max_yield_usec = options.write_thread_max_yield_usec =
immutable_db_options.write_thread_max_yield_usec; immutable_db_options.write_thread_max_yield_usec;
options.write_thread_slow_yield_usec = options.write_thread_slow_yield_usec =
@ -1611,6 +1613,9 @@ std::unordered_map<std::string, OptionTypeInfo>
{"write_thread_slow_yield_usec", {"write_thread_slow_yield_usec",
{offsetof(struct DBOptions, write_thread_slow_yield_usec), {offsetof(struct DBOptions, write_thread_slow_yield_usec),
OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}},
{"max_write_batch_group_size_bytes",
{offsetof(struct DBOptions, max_write_batch_group_size_bytes),
OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}},
{"write_thread_max_yield_usec", {"write_thread_max_yield_usec",
{offsetof(struct DBOptions, write_thread_max_yield_usec), {offsetof(struct DBOptions, write_thread_max_yield_usec),
OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}},

@ -229,6 +229,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"delete_obsolete_files_period_micros=4294967758;" "delete_obsolete_files_period_micros=4294967758;"
"WAL_ttl_seconds=4295008036;" "WAL_ttl_seconds=4295008036;"
"WAL_size_limit_MB=4295036161;" "WAL_size_limit_MB=4295036161;"
"max_write_batch_group_size_bytes=1048576;"
"wal_dir=path/to/wal_dir;" "wal_dir=path/to/wal_dir;"
"db_write_buffer_size=2587;" "db_write_buffer_size=2587;"
"max_subcompactions=64330;" "max_subcompactions=64330;"

Loading…
Cancel
Save