diff --git a/db/write_thread.cc b/db/write_thread.cc index 5ee943904..1ded68fde 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -22,6 +22,8 @@ WriteThread::WriteThread(const ImmutableDBOptions& db_options) allow_concurrent_memtable_write_( db_options.allow_concurrent_memtable_write), enable_pipelined_write_(db_options.enable_pipelined_write), + max_write_batch_group_size_bytes( + db_options.max_write_batch_group_size_bytes), newest_writer_(nullptr), newest_memtable_writer_(nullptr), last_sequence_(0), @@ -406,9 +408,10 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128 << 10)) { - max_size = size + (128 << 10); + size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; } leader->write_group = write_group; @@ -485,9 +488,10 @@ void WriteThread::EnterAsMemTableWriter(Writer* leader, // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128 << 10)) { - max_size = size + (128 << 10); + size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; } leader->write_group = write_group; diff --git a/db/write_thread.h b/db/write_thread.h index dc9c22ff8..e1db97066 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -360,6 +360,11 @@ class WriteThread { // Enable pipelined write to WAL and memtable. const bool enable_pipelined_write_; + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + const uint64_t max_write_batch_group_size_bytes; + // Points to the newest pending writer. Only leader can remove // elements, adding can be done lock-free by anybody. std::atomic newest_writer_; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index cf1d1f6ec..a7e8af16e 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -953,6 +953,13 @@ struct DBOptions { // Default: true bool enable_write_thread_adaptive_yield = true; + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + // + // Default: 1 MB + uint64_t max_write_batch_group_size_bytes = 1 << 20; + // The maximum number of microseconds that a write operation will use // a yielding spin loop to coordinate with other write threads before // blocking on a mutex. (Assuming write_thread_slow_yield_usec is diff --git a/options/db_options.cc b/options/db_options.cc index a39294211..ca2800d07 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -44,6 +44,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) table_cache_numshardbits(options.table_cache_numshardbits), wal_ttl_seconds(options.WAL_ttl_seconds), wal_size_limit_mb(options.WAL_size_limit_MB), + max_write_batch_group_size_bytes( + options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), allow_mmap_reads(options.allow_mmap_reads), allow_mmap_writes(options.allow_mmap_writes), @@ -153,6 +155,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, wal_size_limit_mb); + ROCKS_LOG_HEADER(log, + " " + "Options.max_write_batch_group_size_bytes: %" PRIu64, + max_write_batch_group_size_bytes); ROCKS_LOG_HEADER( log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, manifest_preallocation_size); diff --git a/options/db_options.h b/options/db_options.h index 98a790705..7c71b12a0 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -43,6 +43,7 @@ struct ImmutableDBOptions { int table_cache_numshardbits; uint64_t wal_ttl_seconds; uint64_t wal_size_limit_mb; + uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; bool allow_mmap_writes; diff --git a/options/options_helper.cc b/options/options_helper.cc index 91ae2f8b5..42695a613 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -110,6 +110,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.allow_concurrent_memtable_write; options.enable_write_thread_adaptive_yield = immutable_db_options.enable_write_thread_adaptive_yield; + options.max_write_batch_group_size_bytes = + immutable_db_options.max_write_batch_group_size_bytes; options.write_thread_max_yield_usec = immutable_db_options.write_thread_max_yield_usec; options.write_thread_slow_yield_usec = @@ -1611,6 +1613,9 @@ std::unordered_map {"write_thread_slow_yield_usec", {offsetof(struct DBOptions, write_thread_slow_yield_usec), OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"max_write_batch_group_size_bytes", + {offsetof(struct DBOptions, max_write_batch_group_size_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, {"write_thread_max_yield_usec", {offsetof(struct DBOptions, write_thread_max_yield_usec), OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index a4d4b9910..2208b4f30 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -229,6 +229,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "delete_obsolete_files_period_micros=4294967758;" "WAL_ttl_seconds=4295008036;" "WAL_size_limit_MB=4295036161;" + "max_write_batch_group_size_bytes=1048576;" "wal_dir=path/to/wal_dir;" "db_write_buffer_size=2587;" "max_subcompactions=64330;"