From c4c1f961e7264c0e1e11f222e5b90c8ef7eb1d86 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Thu, 2 Nov 2017 22:16:23 -0700
Subject: [PATCH] dynamically change current memtable size

Summary:
Previously, setting `write_buffer_size` with `SetOptions()` only applied to
new memtables. An internal user wanted it to take effect immediately, instead
of at an arbitrary future point, to prevent OOM. This PR makes the memtable's
size mutable, and makes `SetOptions()` mutate it. There is one case where we
preserve the old behavior: when the memtable prefix bloom filter is enabled
and the user is increasing the memtable's capacity. That's because the prefix
bloom filter's size is fixed and wouldn't work as well on a larger memtable.
Closes https://github.com/facebook/rocksdb/pull/3119

Differential Revision: D6228304

Pulled By: ajkr

fbshipit-source-id: e44bd9d10a5f8c9d8c464bf7436070bb3eafdfc9
---
 db/column_family.cc |  4 ++++
 db/db_test.cc       | 22 +++++++++++++++++-----
 db/memtable.cc      | 25 +++++++++++++------------
 db/memtable.h       | 29 ++++++++++++++++++++++-------
 db/write_batch.cc   |  4 ++--
 5 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/db/column_family.cc b/db/column_family.cc
index e5a27dc8f..667941f12 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -949,6 +949,10 @@ void ColumnFamilyData::InstallSuperVersion(
   RecalculateWriteStallConditions(mutable_cf_options);
 
   if (old_superversion != nullptr) {
+    if (old_superversion->mutable_cf_options.write_buffer_size !=
+        mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+    }
     if (old_superversion->write_stall_condition !=
         new_superversion->write_stall_condition) {
       sv_context->PushWriteStallNotification(
diff --git a/db/db_test.cc b/db/db_test.cc
index 6a19b7146..0d573631b 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3354,11 +3354,23 @@ TEST_F(DBTest, DynamicMemtableOptions) {
       {"write_buffer_size", "131072"},
   }));
 
-  // The existing memtable is still 64KB in size, after it becomes immutable,
-  // the next memtable will be 128KB in size. Write 256KB total, we should
-  // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
-  gen_l0_kb(256);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 2);  // (A)
+  // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+  // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+  gen_l0_kb(192);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);  // (A)
+  ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+  // Decrease buffer size below current usage
+  ASSERT_OK(dbfull()->SetOptions({
+      {"write_buffer_size", "65536"},
+  }));
+  // The existing memtable became eligible for flush when we reduced its
+  // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+  // memtable to be marked full, second schedules the flush. Then we should have
+  // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+  gen_l0_kb(2);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
   ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
   ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
 
diff --git a/db/memtable.cc b/db/memtable.cc
index 84e9028e7..af3ae8f83 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -39,10 +39,10 @@
 
 namespace rocksdb {
 
-MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions,
-                                 const MutableCFOptions& mutable_cf_options)
-    : write_buffer_size(mutable_cf_options.write_buffer_size),
-      arena_block_size(mutable_cf_options.arena_block_size),
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options)
+    : arena_block_size(mutable_cf_options.arena_block_size),
       memtable_prefix_bloom_bits(
           static_cast<uint32_t>(
               static_cast<double>(mutable_cf_options.write_buffer_size) *
@@ -83,6 +83,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       data_size_(0),
       num_entries_(0),
       num_deletes_(0),
+      write_buffer_size_(mutable_cf_options.write_buffer_size),
       flush_in_progress_(false),
       flush_completed_(false),
       file_number_(0),
@@ -136,6 +137,7 @@ size_t MemTable::ApproximateMemoryUsage() {
 }
 
 bool MemTable::ShouldFlushNow() const {
+  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
   // In a lot of times, we cannot allocate arena blocks that exactly matches the
   // buffer size. Thus we have to decide if we should over-allocate or
   // under-allocate.
@@ -153,16 +155,14 @@
   // if we can still allocate one more block without exceeding the
   // over-allocation ratio, then we should not flush.
   if (allocated_memory + kArenaBlockSize <
-      moptions_.write_buffer_size +
-          kArenaBlockSize * kAllowOverAllocationRatio) {
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
     return false;
   }
 
-  // if user keeps adding entries that exceeds moptions.write_buffer_size,
-  // we need to flush earlier even though we still have much available
-  // memory left.
-  if (allocated_memory > moptions_.write_buffer_size +
-                             kArenaBlockSize * kAllowOverAllocationRatio) {
+  // if user keeps adding entries that exceeds write_buffer_size, we need to
+  // flush earlier even though we still have much available memory left.
+  if (allocated_memory >
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
     return true;
   }
 
@@ -265,7 +265,8 @@ class MemTableIterator : public InternalIterator {
         comparator_(mem.comparator_),
         valid_(false),
         arena_mode_(arena != nullptr),
-        value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) {
+        value_pinned_(
+            !mem.GetImmutableMemTableOptions()->inplace_update_support) {
     if (use_range_del_table) {
       iter_ = mem.range_del_table_->GetIterator(arena);
     } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
diff --git a/db/memtable.h b/db/memtable.h
index e1fe59c4d..76e3cf1bf 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -36,11 +36,9 @@ class MemTableIterator;
 class MergeContext;
 class InternalIterator;
 
-struct MemTableOptions {
-  explicit MemTableOptions(
-      const ImmutableCFOptions& ioptions,
-      const MutableCFOptions& mutable_cf_options);
-  size_t write_buffer_size;
+struct ImmutableMemTableOptions {
+  explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
+                                    const MutableCFOptions& mutable_cf_options);
   size_t arena_block_size;
   uint32_t memtable_prefix_bloom_bits;
   size_t memtable_huge_page_size;
@@ -262,6 +260,18 @@
     return num_deletes_.load(std::memory_order_relaxed);
   }
 
+  // Dynamically change the memtable's capacity. If set below the current usage,
+  // the next key added will trigger a flush. Can only increase size when
+  // memtable prefix bloom is disabled, since we can't easily allocate more
+  // space.
+  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+    if (prefix_bloom_ == nullptr ||
+        new_write_buffer_size < write_buffer_size_) {
+      write_buffer_size_.store(new_write_buffer_size,
+                               std::memory_order_relaxed);
+    }
+  }
+
   // Returns the edits area that is needed for flushing the memtable
   VersionEdit* GetEdits() { return &edit_; }
 
@@ -350,7 +360,9 @@
     return comparator_.comparator;
   }
 
-  const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
+  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+    return &moptions_;
+  }
 
   uint64_t ApproximateOldestKeyTime() const {
     return oldest_key_time_.load(std::memory_order_relaxed);
@@ -364,7 +376,7 @@
   friend class MemTableList;
 
   KeyComparator comparator_;
-  const MemTableOptions moptions_;
+  const ImmutableMemTableOptions moptions_;
   int refs_;
   const size_t kArenaBlockSize;
   AllocTracker mem_tracker_;
@@ -378,6 +390,9 @@
   std::atomic<uint64_t> num_entries_;
   std::atomic<uint64_t> num_deletes_;
 
+  // Dynamically changeable memtable option
+  std::atomic<size_t> write_buffer_size_;
+
   // These are used to manage memtable flushes to storage
   bool flush_in_progress_;  // started the flush
   bool flush_completed_;    // finished the flush
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 76203ea1d..e6921e08d 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -1035,7 +1035,7 @@
     }
 
     MemTable* mem = cf_mems_->GetMemTable();
-    auto* moptions = mem->GetMemTableOptions();
+    auto* moptions = mem->GetImmutableMemTableOptions();
     if (!moptions->inplace_update_support) {
      mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_,
               get_post_process_info(mem));
@@ -1196,7 +1196,7 @@
     }
 
     MemTable* mem = cf_mems_->GetMemTable();
-    auto* moptions = mem->GetMemTableOptions();
+    auto* moptions = mem->GetImmutableMemTableOptions();
     bool perform_merge = false;
 
     // If we pass DB through and options.max_successive_merges is hit
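
Note (not part of the patch): a minimal client-side sketch of the behavior this
change enables. `DB::Open`, `Put`, and `SetOptions` are the stock RocksDB public
APIs; the database path, key count, value sizes, and buffer sizes are arbitrary
example values chosen for illustration.

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DB* db;
  rocksdb::Options options;
  options.create_if_missing = true;
  options.write_buffer_size = 64 << 10;  // 64KB active memtable to start

  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wbs_example", &db);
  assert(s.ok());

  // Partially fill the active memtable (~50KB of values).
  for (int i = 0; i < 100; ++i) {
    s = db->Put(rocksdb::WriteOptions(), "key" + std::to_string(i),
                std::string(512, 'a'));
    assert(s.ok());
  }

  // Shrink write_buffer_size below the current memtable usage. Before this
  // patch, the new size applied only to memtables created later; with it,
  // the active memtable's capacity is reduced immediately.
  s = db->SetOptions({{"write_buffer_size", "32768"}});
  assert(s.ok());

  // Per the test above: the next write marks the memtable full, and the
  // write after that schedules the flush.
  s = db->Put(rocksdb::WriteOptions(), "k-full", "v");
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "k-flush", "v");
  assert(s.ok());

  delete db;
  return 0;
}

As UpdateWriteBufferSize() above shows, when the memtable prefix bloom filter
is enabled only decreases take immediate effect; increases still wait for the
next memtable.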