Summary:
Improve the write buffer manager in several ways:
1. Size is tracked when an arena block is allocated, rather than on every allocation, so it tracks actual memory usage more closely and the tracking overhead is slightly lower.
2. Memtable flush is now triggered when 7/8 of the memory cap is reached, instead of 100%, which makes the 100% mark much harder to hit.
3. A cache object can be passed into the buffer manager, and the memory allocated by memtables can be costed there. This lets users enforce a single memory cap across the block cache and memtables.

Closes https://github.com/facebook/rocksdb/pull/2350

Differential Revision: D5110648

Pulled By: siying

fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017
parent a4d9c02511
commit 95b0e89b5d
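As a usage note for item 3 above: with this change a user can share one memory budget between the block cache and the memtables. A minimal sketch, assuming the DBOptions::write_buffer_manager option and the usual block-based table setup (that wiring is not part of this diff):

#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "rocksdb/write_buffer_manager.h"

// One 1GB budget shared by block cache and memtables. Memtable
// allocations are costed into the cache as dummy entries, so the
// cache capacity effectively caps both consumers together.
rocksdb::Options SharedBudgetOptions() {
  std::shared_ptr<rocksdb::Cache> cache =
      rocksdb::NewLRUCache(1024 * 1024 * 1024);

  rocksdb::Options options;
  // buffer_size = 0: no separate memtable cap; memtable memory is only
  // costed against the shared cache (see the NoCapCacheCost test below).
  options.write_buffer_manager =
      std::make_shared<rocksdb::WriteBufferManager>(0, cache);

  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = cache;  // same cache object on both sides
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}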
@@ -1,52 +0,0 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// This source code is also licensed under the GPLv2 license found in the
// COPYING file in the root directory of this source tree.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// This is used by the MemTable to allocate write buffer memory. It connects
// to WriteBufferManager so we can track and enforce overall write buffer
// limits.

#pragma once

#include <atomic>
#include "rocksdb/write_buffer_manager.h"
#include "util/allocator.h"

namespace rocksdb {

class Logger;

class MemTableAllocator : public Allocator {
 public:
  explicit MemTableAllocator(Allocator* allocator,
                             WriteBufferManager* write_buffer_manager);
  ~MemTableAllocator();

  // Allocator interface
  char* Allocate(size_t bytes) override;
  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
                        Logger* logger = nullptr) override;
  size_t BlockSize() const override;

  // Call when we're finished allocating memory so we can free it from
  // the write buffer's limit.
  void DoneAllocating();

 private:
  Allocator* allocator_;
  WriteBufferManager* write_buffer_manager_;
  std::atomic<size_t> bytes_allocated_;

  // No copying allowed
  MemTableAllocator(const MemTableAllocator&);
  void operator=(const MemTableAllocator&);
};

}  // namespace rocksdb
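The deleted MemTableAllocator above charged the write buffer manager on every allocation; item 1 of the summary replaces that with per-arena-block accounting. A hypothetical sketch of the block-level approach (class and member names here are illustrative assumptions, not this patch's actual arena code):

#include <cstddef>
#include <vector>
#include "rocksdb/write_buffer_manager.h"

// Hypothetical sketch of block-level accounting: the write buffer
// manager is charged once per newly allocated arena block, instead of
// on every Allocate() call as the deleted MemTableAllocator did.
class BlockTrackedArena {
 public:
  explicit BlockTrackedArena(rocksdb::WriteBufferManager* wbm) : wbm_(wbm) {}

  char* AllocateNewBlock(size_t block_bytes) {
    char* block = new char[block_bytes];
    if (wbm_ != nullptr) {
      wbm_->ReserveMem(block_bytes);  // one update per block, not per call
    }
    blocks_.push_back(block);
    return block;
  }

 private:
  rocksdb::WriteBufferManager* wbm_;
  std::vector<char*> blocks_;
};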
@@ -0,0 +1,125 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// This source code is also licensed under the GPLv2 license found in the
// COPYING file in the root directory of this source tree.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "rocksdb/write_buffer_manager.h"

#include <mutex>

#include "util/coding.h"

namespace rocksdb {
#ifndef ROCKSDB_LITE
namespace {
const size_t kSizeDummyEntry = 1024 * 1024;
// The key will be longer than keys for blocks in SST files so they won't
// conflict.
const size_t kCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
}  // namespace

struct WriteBufferManager::CacheRep {
  std::shared_ptr<Cache> cache_;
  std::mutex cache_mutex_;
  std::atomic<size_t> cache_allocated_size_;
  // The non-prefix part will be updated according to the ID to use.
  char cache_key_[kCacheKeyPrefix + kMaxVarint64Length];
  uint64_t next_cache_key_id_ = 0;
  std::vector<Cache::Handle*> dummy_handles_;

  explicit CacheRep(std::shared_ptr<Cache> cache)
      : cache_(cache), cache_allocated_size_(0) {
    memset(cache_key_, 0, kCacheKeyPrefix);
    size_t pointer_size = sizeof(const void*);
    assert(pointer_size <= kCacheKeyPrefix);
    memcpy(cache_key_, static_cast<const void*>(this), pointer_size);
  }

  Slice GetNextCacheKey() {
    memset(cache_key_ + kCacheKeyPrefix, 0, kMaxVarint64Length);
    char* end =
        EncodeVarint64(cache_key_ + kCacheKeyPrefix, next_cache_key_id_++);
    return Slice(cache_key_, static_cast<size_t>(end - cache_key_));
  }
};
#else
struct WriteBufferManager::CacheRep {};
#endif  // ROCKSDB_LITE

WriteBufferManager::WriteBufferManager(size_t _buffer_size,
                                       std::shared_ptr<Cache> cache)
    : buffer_size_(_buffer_size),
      memory_used_(0),
      memory_active_(0),
      cache_rep_(nullptr) {
#ifndef ROCKSDB_LITE
  if (cache) {
    // Construct the cache key using the pointer to this.
    cache_rep_.reset(new CacheRep(cache));
  }
#endif  // ROCKSDB_LITE
}

WriteBufferManager::~WriteBufferManager() {
#ifndef ROCKSDB_LITE
  if (cache_rep_) {
    for (auto* handle : cache_rep_->dummy_handles_) {
      cache_rep_->cache_->Release(handle, true);
    }
  }
#endif  // ROCKSDB_LITE
}
// Should only be called from write thread
void WriteBufferManager::ReserveMemWithCache(size_t mem) {
#ifndef ROCKSDB_LITE
  assert(cache_rep_ != nullptr);
  // Use a mutex to protect various data structures. Can be optimized to a
  // lock-free solution if it ends up being a performance bottleneck.
  std::lock_guard<std::mutex> lock(cache_rep_->cache_mutex_);

  size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem;
  memory_used_.store(new_mem_used, std::memory_order_relaxed);
  while (new_mem_used > cache_rep_->cache_allocated_size_) {
    // Expand size by at least 1MB.
    // Add a dummy record to the cache.
    Cache::Handle* handle;
    cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr,
                               kSizeDummyEntry, nullptr, &handle);
    cache_rep_->dummy_handles_.push_back(handle);
    cache_rep_->cache_allocated_size_ += kSizeDummyEntry;
  }
#endif  // ROCKSDB_LITE
}
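The loop above pins one 1MB dummy entry per megabyte reserved, so the cache charge is the total reservation rounded up to the next multiple of kSizeDummyEntry. A standalone sketch of that arithmetic (the helper name is hypothetical):

#include <cstddef>

// Number of 1MB dummy entries pinned for a given total reservation,
// mirroring the while-loop in ReserveMemWithCache above:
// ceil(total_reserved / kSizeDummyEntry).
size_t DummyEntriesFor(size_t total_reserved_bytes) {
  const size_t kSizeDummyEntry = 1024 * 1024;
  return (total_reserved_bytes + kSizeDummyEntry - 1) / kSizeDummyEntry;
}
// e.g. DummyEntriesFor(1536 * 1024) == 2, i.e. ~2MB pinned for a 1.5MB
// reservation, as the CacheCost test below asserts.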

void WriteBufferManager::FreeMemWithCache(size_t mem) {
#ifndef ROCKSDB_LITE
  assert(cache_rep_ != nullptr);
  // Use a mutex to protect various data structures. Can be optimized to a
  // lock-free solution if it ends up being a performance bottleneck.
  std::lock_guard<std::mutex> lock(cache_rep_->cache_mutex_);
  size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem;
  memory_used_.store(new_mem_used, std::memory_order_relaxed);
  // Gradually shrink the memory costed in the block cache if the actual
  // usage is less than 3/4 of what we reserve from the block cache.
  // We do this because:
  // 1. we don't pay the block cache cost immediately when a memtable is
  //    freed, as block cache insert is expensive;
  // 2. eventually, if we walk away from a temporary memtable size increase,
  //    we make sure to shrink the memory costed in the block cache over time.
  // In this way, we shrink the costed memory only slowly, even when there is
  // enough margin.
  if (new_mem_used < cache_rep_->cache_allocated_size_ / 4 * 3 &&
      cache_rep_->cache_allocated_size_ - kSizeDummyEntry > new_mem_used) {
    assert(!cache_rep_->dummy_handles_.empty());
    cache_rep_->cache_->Release(cache_rep_->dummy_handles_.back(), true);
    cache_rep_->dummy_handles_.pop_back();
    cache_rep_->cache_allocated_size_ -= kSizeDummyEntry;
  }
#endif  // ROCKSDB_LITE
}
}  // namespace rocksdb
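To make the shrink condition concrete, here is that condition factored out, with a worked example using the numbers from the CacheCost test below (the helper name is hypothetical):

#include <cstddef>

// The shrink condition from FreeMemWithCache, restated for illustration.
bool ShouldReleaseDummyEntry(size_t new_mem_used, size_t cache_allocated) {
  const size_t kSizeDummyEntry = 1024 * 1024;
  return new_mem_used < cache_allocated / 4 * 3 &&
         cache_allocated - kSizeDummyEntry > new_mem_used;
}
// With the CacheCost test numbers: after FreeMem(20MB), usage is 31.5MB
// against 52MB costed in the cache; 31.5MB < 39MB (3/4 of 52MB) and
// 51MB > 31.5MB, so exactly one 1MB dummy entry is released per
// FreeMem() call, stepping the pinned usage 52MB -> 51MB -> 50MB -> ...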
@@ -0,0 +1,141 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// This source code is also licensed under the GPLv2 license found in the
// COPYING file in the root directory of this source tree.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "rocksdb/write_buffer_manager.h"
#include "util/testharness.h"

namespace rocksdb {

class WriteBufferManagerTest : public testing::Test {};

#ifndef ROCKSDB_LITE
TEST_F(WriteBufferManagerTest, ShouldFlush) {
  // A write buffer manager with a 10MB size cap
  std::unique_ptr<WriteBufferManager> wbf(
      new WriteBufferManager(10 * 1024 * 1024));

  wbf->ReserveMem(8 * 1024 * 1024);
  ASSERT_FALSE(wbf->ShouldFlush());
  // Reaching 90% of the cap (above the 7/8 trigger) hits the condition
  wbf->ReserveMem(1 * 1024 * 1024);
  ASSERT_TRUE(wbf->ShouldFlush());
  // Scheduling memory for freeing releases the condition
  wbf->ScheduleFreeMem(1 * 1024 * 1024);
  ASSERT_FALSE(wbf->ShouldFlush());

  wbf->ReserveMem(2 * 1024 * 1024);
  ASSERT_TRUE(wbf->ShouldFlush());
  wbf->ScheduleFreeMem(5 * 1024 * 1024);
  // The hard limit is still hit
  ASSERT_TRUE(wbf->ShouldFlush());
  wbf->FreeMem(10 * 1024 * 1024);
  ASSERT_FALSE(wbf->ShouldFlush());
}
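The thresholds this test exercises live in the write_buffer_manager.h header, which is not part of this diff. A rough reconstruction of that trigger under the summary's 7/8 rule, consistent with the test above (member names are assumptions):

#include <atomic>
#include <cstddef>

// Rough reconstruction of the flush trigger (not shown in this diff).
struct FlushTriggerSketch {
  size_t buffer_size_ = 0;
  std::atomic<size_t> memory_used_{0};    // decremented by FreeMem()
  std::atomic<size_t> memory_active_{0};  // decremented by ScheduleFreeMem()

  bool ShouldFlush() const {
    if (buffer_size_ == 0) {
      return false;  // no cap configured
    }
    // Trigger early, at 7/8 of the cap, on memory not yet scheduled for
    // freeing, or when total usage reaches the full cap.
    return memory_active_.load(std::memory_order_relaxed) >
               buffer_size_ / 8 * 7 ||
           memory_used_.load(std::memory_order_relaxed) >= buffer_size_;
  }
};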

TEST_F(WriteBufferManagerTest, CacheCost) {
  // 1GB cache
  std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024 * 1024, 4);
  // A write buffer manager with a 50MB size cap
  std::unique_ptr<WriteBufferManager> wbf(
      new WriteBufferManager(50 * 1024 * 1024, cache));

  // Reserving 1.5MB costs 2MB in the cache
  wbf->ReserveMem(1536 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 2 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 2 * 1024 * 1024 + 10000);

  // Reserve another 2MB
  wbf->ReserveMem(2 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 4 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 4 * 1024 * 1024 + 10000);

  // Reserve another 20MB
  wbf->ReserveMem(20 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 24 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 24 * 1024 * 1024 + 10000);

  // Freeing 2MB does not change the cache cost
  wbf->FreeMem(2 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 24 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 24 * 1024 * 1024 + 10000);

  ASSERT_FALSE(wbf->ShouldFlush());

  // Reserve another 30MB
  wbf->ReserveMem(30 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 52 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 52 * 1024 * 1024 + 10000);
  ASSERT_TRUE(wbf->ShouldFlush());

  ASSERT_TRUE(wbf->ShouldFlush());

  wbf->ScheduleFreeMem(20 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 52 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 52 * 1024 * 1024 + 10000);

  // A flush is still needed since the hard limit is hit
  ASSERT_TRUE(wbf->ShouldFlush());

  // Freeing 20MB releases 1MB from the cache
  wbf->FreeMem(20 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000);

  ASSERT_FALSE(wbf->ShouldFlush());

  // Every free releases 1MB while usage is still below 3/4 of the
  // amount costed in the cache
  wbf->FreeMem(16 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 50 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 50 * 1024 * 1024 + 10000);

  wbf->FreeMem(16 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 49 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 49 * 1024 * 1024 + 10000);

  // Reserving 2MB does not change the cache cost
  wbf->ReserveMem(2 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 49 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 49 * 1024 * 1024 + 10000);

  wbf->FreeMem(16 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 48 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 48 * 1024 * 1024 + 10000);

  // Destroying the write buffer manager should free everything
  wbf.reset();
  ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024);
}

TEST_F(WriteBufferManagerTest, NoCapCacheCost) {
  // 1GB cache
  std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024 * 1024, 4);
  // A write buffer manager with no size cap, only cache costing
  std::unique_ptr<WriteBufferManager> wbf(new WriteBufferManager(0, cache));
  // Reserving 10MB costs 10MB in the cache
  wbf->ReserveMem(10 * 1024 * 1024);
  ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 10 * 1024 * 1024 + 10000);
  ASSERT_FALSE(wbf->ShouldFlush());

  wbf->FreeMem(9 * 1024 * 1024);
  for (int i = 0; i < 10; i++) {
    wbf->FreeMem(16 * 1024);
  }
  ASSERT_GE(cache->GetPinnedUsage(), 1024 * 1024);
  ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024 + 10000);
}
#endif  // ROCKSDB_LITE
}  // namespace rocksdb

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}