rocksdb/memtable/write_buffer_manager.cc

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "rocksdb/write_buffer_manager.h"
#include <mutex>
#include "util/coding.h"

namespace rocksdb {
#ifndef ROCKSDB_LITE
namespace {
// Size, in bytes, accounted for every dummy entry placed in the cache (1MB).
constexpr size_t kSizeDummyEntry = 1024 * 1024;
// Length of the fixed prefix of a dummy entry's cache key. Per the original
// author's note, the resulting keys are longer than the keys used for blocks
// in SST files, so the two kinds of keys won't conflict.
constexpr size_t kCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
}  // namespace

// State used when memtable memory is also accounted for in a Cache. The
// memory is represented there by dummy entries of kSizeDummyEntry bytes each;
// this struct keeps the handles of those entries and generates their keys.
struct WriteBufferManager::CacheRep {
  // Cache that receives the dummy entries.
  std::shared_ptr<Cache> cache_;
  // Held by ReserveMemWithCache() and FreeMemWithCache() while they update
  // the members below.
  std::mutex cache_mutex_;
  // Sum of the sizes of the dummy entries currently tracked, in bytes.
  std::atomic<size_t> cache_allocated_size_;
  // The non-prefix part will be updated according to the ID to use.
  char cache_key_[kCacheKeyPrefix + kMaxVarint64Length];
  // ID encoded into the next key handed out by GetNextCacheKey().
  uint64_t next_cache_key_id_ = 0;
  // Handles obtained from Cache::Insert() for the dummy entries, in insertion
  // order.
  std::vector<Cache::Handle*> dummy_handles_;

  // Stores `cache` and builds the fixed key prefix: the address of this
  // object followed by zero padding up to kCacheKeyPrefix bytes.
  explicit CacheRep(std::shared_ptr<Cache> cache)
      : cache_(cache), cache_allocated_size_(0) {
    memset(cache_key_, 0, kCacheKeyPrefix);
    // The *value* of the pointer has to be copied, hence the address of a
    // local holding it. Copying from `this` directly would read the first
    // bytes of the object itself (its leading member), which says nothing
    // about the object's identity and is presumably the same for every
    // CacheRep built on one Cache.
    const void* const self = this;
    static_assert(sizeof(self) <= kCacheKeyPrefix,
                  "cache key prefix must be able to hold a pointer");
    memcpy(cache_key_, &self, sizeof(self));
  }

  // Returns the key for the next dummy entry: the fixed prefix followed by
  // the varint encoding of a counter that is incremented on every call.
  // The returned Slice points into cache_key_, so it is overwritten by the
  // next call.
  Slice GetNextCacheKey() {
    // Clear what a previous, possibly longer, encoding left behind.
    memset(cache_key_ + kCacheKeyPrefix, 0, kMaxVarint64Length);
    char* end =
        EncodeVarint64(cache_key_ + kCacheKeyPrefix, next_cache_key_id_++);
    return Slice(cache_key_, static_cast<size_t>(end - cache_key_));
  }
};
#else
// Empty definition used when ROCKSDB_LITE is defined; the cache-related
// members of the non-LITE definition are compiled out.
struct WriteBufferManager::CacheRep {};
#endif  // ROCKSDB_LITE

// Records `_buffer_size`, derives mutable_limit_ as 7/8 of it and starts both
// memory counters at zero. Outside of ROCKSDB_LITE builds, a non-null `cache`
// additionally creates the CacheRep used by ReserveMemWithCache() and
// FreeMemWithCache(); otherwise cache_rep_ stays null.
WriteBufferManager::WriteBufferManager(size_t _buffer_size,
                                       std::shared_ptr<Cache> cache)
    : buffer_size_(_buffer_size),
      mutable_limit_(buffer_size_ * 7 / 8),
      memory_used_(0),
      memory_active_(0),
      cache_rep_(nullptr) {
#ifdef ROCKSDB_LITE
  // Nothing uses the cache in LITE builds; silence the unused parameter.
  (void)cache;
#else
  if (cache != nullptr) {
    // CacheRep builds its cache key prefix in its own constructor.
    cache_rep_.reset(new CacheRep(cache));
  }
#endif  // ROCKSDB_LITE
}

// Releases every dummy cache entry that is still tracked in
// cache_rep_->dummy_handles_ (no-op when no cache was supplied or in
// ROCKSDB_LITE builds).
WriteBufferManager::~WriteBufferManager() {
#ifndef ROCKSDB_LITE
  if (cache_rep_) {
    for (auto* handle : cache_rep_->dummy_handles_) {
      // The result of Cache::Insert() is not checked when these handles are
      // recorded, so a slot may hold nullptr if an insert failed (presumably
      // possible with a strict capacity limit -- confirm against the Cache
      // implementation in use). Such a slot has nothing to release.
      if (handle != nullptr) {
        cache_rep_->cache_->Release(handle, true);
      }
    }
  }
#endif  // ROCKSDB_LITE
}

// Adds `mem` bytes to memory_used_ and, while the new total exceeds what is
// already accounted for in the cache, inserts dummy entries of
// kSizeDummyEntry bytes each into the cache.
// Should only be called from write thread
void WriteBufferManager::ReserveMemWithCache(size_t mem) {
#ifndef ROCKSDB_LITE
  assert(cache_rep_ != nullptr);
  // Use a mutex to protect various data structures. Can be optimized to a
  // lock-free solution if it ends up with a performance bottleneck.
  std::lock_guard<std::mutex> lock(cache_rep_->cache_mutex_);

  size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem;
  memory_used_.store(new_mem_used, std::memory_order_relaxed);
  while (new_mem_used > cache_rep_->cache_allocated_size_) {
    // Expand size by at least 1MB.
    // Add a dummy record to the cache
    //
    // Start from nullptr so that the stored handle has a well-defined value
    // even if Insert() fails without writing to it; the result of Insert()
    // is not checked here.
    Cache::Handle* handle = nullptr;
    cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr,
                               kSizeDummyEntry, nullptr, &handle);
    // The handle is recorded even when it is null so that dummy_handles_
    // keeps one slot per kSizeDummyEntry counted in cache_allocated_size_.
    // NOTE(review): code releasing these handles should tolerate nullptr.
    cache_rep_->dummy_handles_.push_back(handle);
    cache_rep_->cache_allocated_size_ += kSizeDummyEntry;
  }
#else
  (void)mem;
#endif  // ROCKSDB_LITE
}

void WriteBufferManager::FreeMemWithCache(size_t mem) {
#ifndef ROCKSDB_LITE
  assert(cache_rep_ != nullptr);
  // Use a mutex to protect various data structures. Can be optimzied to a
  // lock-free solution if it ends up with a performance bottleneck.
  std::lock_guard<std::mutex> lock(cache_rep_->cache_mutex_);
  size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem;
  memory_used_.store(new_mem_used, std::memory_order_relaxed);
  // Gradually shrink memory costed in the block cache if the actual
  // usage is less than 3/4 of what we reserve from the block cache.
  // We do this becausse:
  // 1. we don't pay the cost of the block cache immediately a memtable is
  //    freed, as block cache insert is expensive;
  // 2. eventually, if we walk away from a temporary memtable size increase,
  //    we make sure shrink the memory costed in block cache over time.
  // In this way, we only shrink costed memory showly even there is enough
  // margin.
  if (new_mem_used < cache_rep_->cache_allocated_size_ / 4 * 3 &&
      cache_rep_->cache_allocated_size_ - kSizeDummyEntry > new_mem_used) {
    assert(!cache_rep_->dummy_handles_.empty());
    cache_rep_->cache_->Release(cache_rep_->dummy_handles_.back(), true);
    cache_rep_->dummy_handles_.pop_back();
    cache_rep_->cache_allocated_size_ -= kSizeDummyEntry;
  }
#else
  (void)mem;
#endif  // ROCKSDB_LITE
}
}  // namespace rocksdb
Improve write buffer manager (and allow the size to be tracked in block cache) Summary: Improve write buffer manager in several ways: 1. Size is tracked when arena block is allocated, rather than every allocation, so that it can better track actual memory usage and the tracking overhead is slightly lower. 2. We start to trigger memtable flush when 7/8 of the memory cap hits, instead of 100%, and make 100% much harder to hit. 3. Allow a cache object to be passed into buffer manager and the size allocated by memtable can be costed there. This can help users have one single memory cap across block cache and memtable. Closes https://github.com/facebook/rocksdb/pull/2350 Differential Revision: D5110648 Pulled By: siying fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017 8 years ago			`// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.`
Change RocksDB License Summary: Closes https://github.com/facebook/rocksdb/pull/2589 Differential Revision: D5431502 Pulled By: siying fbshipit-source-id: 8ebf8c87883daa9daa54b2303d11ce01ab1f6f75 7 years ago			`// This source code is licensed under both the GPLv2 (found in the`
			`// COPYING file in the root directory) and Apache 2.0 License`
			`// (found in the LICENSE.Apache file in the root directory).`
Improve write buffer manager (and allow the size to be tracked in block cache) Summary: Improve write buffer manager in several ways: 1. Size is tracked when arena block is allocated, rather than every allocation, so that it can better track actual memory usage and the tracking overhead is slightly lower. 2. We start to trigger memtable flush when 7/8 of the memory cap hits, instead of 100%, and make 100% much harder to hit. 3. Allow a cache object to be passed into buffer manager and the size allocated by memtable can be costed there. This can help users have one single memory cap across block cache and memtable. Closes https://github.com/facebook/rocksdb/pull/2350 Differential Revision: D5110648 Pulled By: siying fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017 8 years ago			`//`
			`// Copyright (c) 2011 The LevelDB Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file. See the AUTHORS file for names of contributors.`

			`#include "rocksdb/write_buffer_manager.h"`
			`#include <mutex>`
			`#include "util/coding.h"`

			`namespace rocksdb {`
			`#ifndef ROCKSDB_LITE`
			`namespace {`
			`const size_t kSizeDummyEntry = 1024 * 1024;`
			`// The key will be longer than keys for blocks in SST files so they won't`
			`// conflict.`
			`const size_t kCacheKeyPrefix = kMaxVarint64Length * 4 + 1;`
			`} // namespace`

			`struct WriteBufferManager::CacheRep {`
			`std::shared_ptr<Cache> cache_;`
			`std::mutex cache_mutex_;`
			`std::atomic<size_t> cache_allocated_size_;`
			`// The non-prefix part will be updated according to the ID to use.`
			`char cache_key_[kCacheKeyPrefix + kMaxVarint64Length];`
			`uint64_t next_cache_key_id_ = 0;`
			`std::vector<Cache::Handle*> dummy_handles_;`

			`explicit CacheRep(std::shared_ptr<Cache> cache)`
			`: cache_(cache), cache_allocated_size_(0) {`
			`memset(cache_key_, 0, kCacheKeyPrefix);`
			`size_t pointer_size = sizeof(const void*);`
			`assert(pointer_size <= kCacheKeyPrefix);`
			`memcpy(cache_key_, static_cast<const void*>(this), pointer_size);`
			`}`

			`Slice GetNextCacheKey() {`
			`memset(cache_key_ + kCacheKeyPrefix, 0, kMaxVarint64Length);`
			`char* end =`
			`EncodeVarint64(cache_key_ + kCacheKeyPrefix, next_cache_key_id_++);`
			`return Slice(cache_key_, static_cast<size_t>(end - cache_key_));`
			`}`
			`};`
			`#else`
			`struct WriteBufferManager::CacheRep {};`
			`#endif // ROCKSDB_LITE`

			`WriteBufferManager::WriteBufferManager(size_t _buffer_size,`
			`std::shared_ptr<Cache> cache)`
			`: buffer_size_(_buffer_size),`
WriteBufferManager will not trigger flush if much data is already being flushed Summary: Even if hard limit hits, flushing more memtable may not help cap the memory usage if already more than half data is scheduled for flush. Not triggering flush instead. Closes https://github.com/facebook/rocksdb/pull/2469 Differential Revision: D5284249 Pulled By: siying fbshipit-source-id: 8ab7ba1aba56a634dbe72b318fcab2093063972e 7 years ago			`mutable_limit_(buffer_size_ * 7 / 8),`
Improve write buffer manager (and allow the size to be tracked in block cache) Summary: Improve write buffer manager in several ways: 1. Size is tracked when arena block is allocated, rather than every allocation, so that it can better track actual memory usage and the tracking overhead is slightly lower. 2. We start to trigger memtable flush when 7/8 of the memory cap hits, instead of 100%, and make 100% much harder to hit. 3. Allow a cache object to be passed into buffer manager and the size allocated by memtable can be costed there. This can help users have one single memory cap across block cache and memtable. Closes https://github.com/facebook/rocksdb/pull/2350 Differential Revision: D5110648 Pulled By: siying fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017 8 years ago			`memory_used_(0),`
			`memory_active_(0),`
			`cache_rep_(nullptr) {`
			`#ifndef ROCKSDB_LITE`
			`if (cache) {`
			`// Construct the cache key using the pointer to this.`
			`cache_rep_.reset(new CacheRep(cache));`
			`}`
comment unused parameters to turn on -Wunused-parameter flag Summary: This PR comments out the rest of the unused arguments which allow us to turn on the -Wunused-parameter flag. This is the second part of a codemod relating to https://github.com/facebook/rocksdb/pull/3557. Closes https://github.com/facebook/rocksdb/pull/3662 Differential Revision: D7426121 Pulled By: Dayvedde fbshipit-source-id: 223994923b42bd4953eb016a0129e47560f7e352 7 years ago			`#else`
			`(void)cache;`
Improve write buffer manager (and allow the size to be tracked in block cache) Summary: Improve write buffer manager in several ways: 1. Size is tracked when arena block is allocated, rather than every allocation, so that it can better track actual memory usage and the tracking overhead is slightly lower. 2. We start to trigger memtable flush when 7/8 of the memory cap hits, instead of 100%, and make 100% much harder to hit. 3. Allow a cache object to be passed into buffer manager and the size allocated by memtable can be costed there. This can help users have one single memory cap across block cache and memtable. Closes https://github.com/facebook/rocksdb/pull/2350 Differential Revision: D5110648 Pulled By: siying fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017 8 years ago			`#endif // ROCKSDB_LITE`
			`}`

			`WriteBufferManager::~WriteBufferManager() {`
			`#ifndef ROCKSDB_LITE`
			`if (cache_rep_) {`
			`for (auto* handle : cache_rep_->dummy_handles_) {`
			`cache_rep_->cache_->Release(handle, true);`
			`}`
			`}`
			`#endif // ROCKSDB_LITE`
			`}`

			`// Should only be called from write thread`
			`void WriteBufferManager::ReserveMemWithCache(size_t mem) {`
			`#ifndef ROCKSDB_LITE`
			`assert(cache_rep_ != nullptr);`
			`// Use a mutex to protect various data structures. Can be optimized to a`
			`// lock-free solution if it ends up with a performance bottleneck.`
			`std::lock_guard<std::mutex> lock(cache_rep_->cache_mutex_);`

			`size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem;`
			`memory_used_.store(new_mem_used, std::memory_order_relaxed);`
			`while (new_mem_used > cache_rep_->cache_allocated_size_) {`
			`// Expand size by at least 1MB.`
			`// Add a dummy record to the cache`
			`Cache::Handle* handle;`
			`cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr,`
			`kSizeDummyEntry, nullptr, &handle);`
			`cache_rep_->dummy_handles_.push_back(handle);`
			`cache_rep_->cache_allocated_size_ += kSizeDummyEntry;`
			`}`
comment unused parameters to turn on -Wunused-parameter flag Summary: This PR comments out the rest of the unused arguments which allow us to turn on the -Wunused-parameter flag. This is the second part of a codemod relating to https://github.com/facebook/rocksdb/pull/3557. Closes https://github.com/facebook/rocksdb/pull/3662 Differential Revision: D7426121 Pulled By: Dayvedde fbshipit-source-id: 223994923b42bd4953eb016a0129e47560f7e352 7 years ago			`#else`
			`(void)mem;`
Improve write buffer manager (and allow the size to be tracked in block cache) Summary: Improve write buffer manager in several ways: 1. Size is tracked when arena block is allocated, rather than every allocation, so that it can better track actual memory usage and the tracking overhead is slightly lower. 2. We start to trigger memtable flush when 7/8 of the memory cap hits, instead of 100%, and make 100% much harder to hit. 3. Allow a cache object to be passed into buffer manager and the size allocated by memtable can be costed there. This can help users have one single memory cap across block cache and memtable. Closes https://github.com/facebook/rocksdb/pull/2350 Differential Revision: D5110648 Pulled By: siying fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017 8 years ago			`#endif // ROCKSDB_LITE`
			`}`

			`void WriteBufferManager::FreeMemWithCache(size_t mem) {`
			`#ifndef ROCKSDB_LITE`
			`assert(cache_rep_ != nullptr);`
			`// Use a mutex to protect various data structures. Can be optimized to a`
			`// lock-free solution if it ends up with a performance bottleneck.`
			`std::lock_guard<std::mutex> lock(cache_rep_->cache_mutex_);`
			`size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem;`
			`memory_used_.store(new_mem_used, std::memory_order_relaxed);`
			`// Gradually shrink memory costed in the block cache if the actual`
			`// usage is less than 3/4 of what we reserve from the block cache.`
			`// We do this because:`
			`// 1. we don't pay the cost of the block cache immediately a memtable is`
			`// freed, as block cache insert is expensive;`
			`// 2. eventually, if we walk away from a temporary memtable size increase,`
			`// we make sure shrink the memory costed in block cache over time.`
			`// In this way, we only shrink costed memory slowly even if there is enough`
			`// margin.`
			`if (new_mem_used < cache_rep_->cache_allocated_size_ / 4 * 3 &&`
			`cache_rep_->cache_allocated_size_ - kSizeDummyEntry > new_mem_used) {`
			`assert(!cache_rep_->dummy_handles_.empty());`
			`cache_rep_->cache_->Release(cache_rep_->dummy_handles_.back(), true);`
			`cache_rep_->dummy_handles_.pop_back();`
			`cache_rep_->cache_allocated_size_ -= kSizeDummyEntry;`
			`}`
comment unused parameters to turn on -Wunused-parameter flag Summary: This PR comments out the rest of the unused arguments which allow us to turn on the -Wunused-parameter flag. This is the second part of a codemod relating to https://github.com/facebook/rocksdb/pull/3557. Closes https://github.com/facebook/rocksdb/pull/3662 Differential Revision: D7426121 Pulled By: Dayvedde fbshipit-source-id: 223994923b42bd4953eb016a0129e47560f7e352 7 years ago			`#else`
			`(void)mem;`
Improve write buffer manager (and allow the size to be tracked in block cache) Summary: Improve write buffer manager in several ways: 1. Size is tracked when arena block is allocated, rather than every allocation, so that it can better track actual memory usage and the tracking overhead is slightly lower. 2. We start to trigger memtable flush when 7/8 of the memory cap hits, instead of 100%, and make 100% much harder to hit. 3. Allow a cache object to be passed into buffer manager and the size allocated by memtable can be costed there. This can help users have one single memory cap across block cache and memtable. Closes https://github.com/facebook/rocksdb/pull/2350 Differential Revision: D5110648 Pulled By: siying fbshipit-source-id: b4238113094bf22574001e446b5d88523ba00017 8 years ago			`#endif // ROCKSDB_LITE`
			`}`
			`} // namespace rocksdb`