Charging block cache more accurately (#4073)

Summary:
Currently the block cache is charged only by the size of the raw data block and excludes the overhead of the c++ objects that contain the raw data block. The patch improves the accuracy of the charge by including the c++ object overhead into it.
Closes https://github.com/facebook/rocksdb/pull/4073

Differential Revision: D8686552

Pulled By: maysamyabandeh

fbshipit-source-id: 8472f7fc163c0644533bc6942e20cdd5725f520f
main
Maysam Yabandeh 7 years ago committed by Facebook Github Bot
parent b3efb1cbe0
commit 29ffbb8a50
  1. 8
      table/block.cc
  2. 17
      table/block.h
  3. 2
      table/block_based_table_builder.cc
  4. 119
      table/block_based_table_reader.cc
  5. 20
      table/format.h
  6. 18
      table/full_filter_block.cc
  7. 1
      table/full_filter_block.h
  8. 16
      table/partitioned_filter_block.cc
  9. 2
      table/table_test.cc

@ -486,9 +486,17 @@ void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
size_t Block::ApproximateMemoryUsage() const {
size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size((void*)this);
#else
usage += sizeof(*this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
if (prefix_index_) {
usage += prefix_index_->ApproximateMemoryUsage();
}
if (read_amp_bitmap_) {
usage += read_amp_bitmap_->ApproximateMemoryUsage();
}
return usage;
}

@ -104,6 +104,13 @@ class BlockReadAmpBitmap {
uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
return malloc_usable_size((void*)this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return sizeof(*this);
}
private:
// Get the current value of bit at `bit_idx` and set it to 1
inline bool GetAndSet(uint32_t bit_idx) {
@ -142,14 +149,8 @@ class Block {
size_t size() const { return size_; }
const char* data() const { return data_; }
bool cachable() const { return contents_.cachable; }
size_t usable_size() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
if (contents_.allocation.get() != nullptr) {
return malloc_usable_size(contents_.allocation.get());
}
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return size_;
}
// The additional memory space taken by the block data.
size_t usable_size() const { return contents_.usable_size(); }
uint32_t NumRestarts() const;
CompressionType compression_type() const {
return contents_.compression_type;

@ -659,7 +659,7 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
(end - r->compressed_cache_key_prefix));
// Insert into compressed block cache.
block_cache_compressed->Insert(key, block, block->usable_size(),
block_cache_compressed->Insert(key, block, block->ApproximateMemoryUsage(),
&DeleteCachedBlock);
// Invalidate OS cache.

@ -346,7 +346,14 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
return index_block_->ApproximateMemoryUsage();
size_t usage = index_block_->ApproximateMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size((void*)this);
#else
usage += sizeof(*this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
// TODO(myabandeh): more accurate estimate of partition_map_ mem usage
return usage;
}
private:
@ -415,7 +422,13 @@ class BinarySearchIndexReader : public IndexReader {
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
return index_block_->ApproximateMemoryUsage();
size_t usage = index_block_->ApproximateMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size((void*)this);
#else
usage += sizeof(*this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return usage;
}
private:
@ -532,8 +545,14 @@ class HashIndexReader : public IndexReader {
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
return index_block_->ApproximateMemoryUsage() +
prefixes_contents_.data.size();
size_t usage = index_block_->ApproximateMemoryUsage();
usage += prefixes_contents_.usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size((void*)this);
#else
usage += sizeof(*this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return usage;
}
private:
@ -1154,40 +1173,34 @@ Status BlockBasedTable::GetDataBlockFromCache(
assert(block->value->compression_type() == kNoCompression);
if (block_cache != nullptr && block->value->cachable() &&
read_options.fill_cache) {
s = block_cache->Insert(
block_cache_key, block->value, block->value->usable_size(),
&DeleteCachedEntry<Block>, &(block->cache_handle));
block_cache->TEST_mark_as_data_block(block_cache_key,
block->value->usable_size());
size_t charge = block->value->ApproximateMemoryUsage();
s = block_cache->Insert(block_cache_key, block->value, charge,
&DeleteCachedEntry<Block>,
&(block->cache_handle));
block_cache->TEST_mark_as_data_block(block_cache_key, charge);
if (s.ok()) {
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE,
block->value->usable_size());
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
block->value->usable_size());
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
}
if (is_index) {
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT,
block->value->usable_size());
get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
block->value->usable_size());
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
}
} else {
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT,
block->value->usable_size());
get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
block->value->usable_size());
RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
}
}
} else {
@ -1243,7 +1256,7 @@ Status BlockBasedTable::PutDataBlockToCache(
if (block_cache_compressed != nullptr && raw_block != nullptr &&
raw_block->cachable()) {
s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block,
raw_block->usable_size(),
raw_block->ApproximateMemoryUsage(),
&DeleteCachedEntry<Block>);
if (s.ok()) {
// Avoid the following code to delete this cached block.
@ -1258,41 +1271,35 @@ Status BlockBasedTable::PutDataBlockToCache(
// insert into uncompressed block cache
assert((block->value->compression_type() == kNoCompression));
if (block_cache != nullptr && block->value->cachable()) {
s = block_cache->Insert(
block_cache_key, block->value, block->value->usable_size(),
&DeleteCachedEntry<Block>, &(block->cache_handle), priority);
block_cache->TEST_mark_as_data_block(block_cache_key,
block->value->usable_size());
size_t charge = block->value->ApproximateMemoryUsage();
s = block_cache->Insert(block_cache_key, block->value, charge,
&DeleteCachedEntry<Block>, &(block->cache_handle),
priority);
block_cache->TEST_mark_as_data_block(block_cache_key, charge);
if (s.ok()) {
assert(block->cache_handle != nullptr);
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE,
block->value->usable_size());
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
block->value->usable_size());
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
}
if (is_index) {
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT,
block->value->usable_size());
get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
block->value->usable_size());
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
}
} else {
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT,
block->value->usable_size());
get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
block->value->usable_size());
RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
}
}
assert(reinterpret_cast<Block*>(
@ -1429,24 +1436,23 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
filter = ReadFilter(prefetch_buffer, filter_blk_handle,
is_a_filter_partition, prefix_extractor);
if (filter != nullptr) {
size_t usage = filter->ApproximateMemoryUsage();
Status s = block_cache->Insert(
key, filter, filter->size(), &DeleteCachedFilterEntry, &cache_handle,
key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
rep_->table_options.cache_index_and_filter_blocks_with_high_priority
? Cache::Priority::HIGH
: Cache::Priority::LOW);
if (s.ok()) {
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, filter->size());
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, usage);
get_context->RecordCounters(BLOCK_CACHE_FILTER_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_FILTER_BYTES_INSERT,
filter->size());
get_context->RecordCounters(BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
} else {
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter->size());
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT,
filter->size());
RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
}
} else {
RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
@ -1512,27 +1518,27 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1");
TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3");
TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4");
size_t charge = 0;
if (s.ok()) {
assert(index_reader != nullptr);
charge = index_reader->ApproximateMemoryUsage();
s = block_cache->Insert(
key, index_reader, index_reader->usable_size(),
&DeleteCachedIndexEntry, &cache_handle,
key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle,
rep_->table_options.cache_index_and_filter_blocks_with_high_priority
? Cache::Priority::HIGH
: Cache::Priority::LOW);
}
if (s.ok()) {
size_t usable_size = index_reader->usable_size();
if (get_context != nullptr) {
get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, usable_size);
get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge);
} else {
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usable_size);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
}
RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usable_size);
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
} else {
if (index_reader != nullptr) {
delete index_reader;
@ -1661,8 +1667,9 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
Slice unique_key =
Slice(cache_key, static_cast<size_t>(end - cache_key));
s = block_cache->Insert(unique_key, nullptr, block.value->usable_size(),
nullptr, &cache_handle);
s = block_cache->Insert(unique_key, nullptr,
block.value->ApproximateMemoryUsage(), nullptr,
&cache_handle);
if (s.ok()) {
if (cache_handle != nullptr) {
iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
@ -2998,7 +3005,7 @@ void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
if (filter->statistics() != nullptr) {
RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
filter->size());
filter->ApproximateMemoryUsage());
}
delete filter;
}
@ -3007,7 +3014,7 @@ void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) {
IndexReader* index_reader = reinterpret_cast<IndexReader*>(value);
if (index_reader->statistics() != nullptr) {
RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT,
index_reader->usable_size());
index_reader->ApproximateMemoryUsage());
}
delete index_reader;
}

@ -10,6 +10,13 @@
#pragma once
#include <stdint.h>
#include <string>
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@ -199,6 +206,19 @@ struct BlockContents {
compression_type(_compression_type),
allocation(std::move(_data)) {}
// The additional memory space taken by the block data.
size_t usable_size() const {
if (allocation.get() != nullptr) {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
return malloc_usable_size(allocation.get());
#else
return sizeof(*allocation.get());
#endif // ROCKSDB_MALLOC_USABLE_SIZE
} else {
return 0; // no extra memory is occupied by the data
}
}
BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT {
*this = std::move(other);
}

@ -5,6 +5,14 @@
#include "table/full_filter_block.h"
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "rocksdb/filter_policy.h"
@ -152,7 +160,15 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) {
}
size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
return contents_.size();
size_t usage = block_contents_.usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size((void*)this);
usage += malloc_usable_size(filter_bits_reader_.get());
#else
usage += sizeof(*this);
usage += sizeof(*filter_bits_reader_.get());
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return usage;
}
bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound,

@ -118,7 +118,6 @@ class FullFilterBlockReader : public FilterBlockReader {
Slice contents_;
std::unique_ptr<FilterBitsReader> filter_bits_reader_;
BlockContents block_contents_;
std::unique_ptr<const char[]> filter_data_;
bool full_length_enabled_;
size_t prefix_extractor_full_length_;

@ -5,6 +5,13 @@
#include "table/partitioned_filter_block.h"
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include <utility>
#include "monitoring/perf_context_imp.h"
@ -265,7 +272,14 @@ PartitionedFilterBlockReader::GetFilterPartition(
}
size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
return idx_on_fltr_blk_->size();
size_t usage = idx_on_fltr_blk_->usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size((void*)this);
#else
usage += sizeof(*this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return usage;
// TODO(myabandeh): better estimation for filter_map_ size
}
// Release the cached entry and decrement its ref count.

@ -1988,7 +1988,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
// Enable the cache for index/filter blocks
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
table_options.block_cache = NewLRUCache(1024, 4);
table_options.block_cache = NewLRUCache(1024, 2);
table_options.cache_index_and_filter_blocks = true;
options.table_factory.reset(new BlockBasedTableFactory(table_options));
std::vector<std::string> keys;

Loading…
Cancel
Save