From 29ffbb8a50b495269b372c107b57b02f683d4059 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Fri, 29 Jun 2018 08:55:33 -0700
Subject: [PATCH] Charging block cache more accurately (#4073)

Summary:
Currently the block cache is charged only by the size of the raw data block and excludes the overhead of the c++ objects that contain the raw data block. The patch improves the accuracy of the charge by including the c++ object overhead into it.
Closes https://github.com/facebook/rocksdb/pull/4073

Differential Revision: D8686552

Pulled By: maysamyabandeh

fbshipit-source-id: 8472f7fc163c0644533bc6942e20cdd5725f520f
---
 table/block.cc                     |   8 ++
 table/block.h                      |  17 +++--
 table/block_based_table_builder.cc |   2 +-
 table/block_based_table_reader.cc  | 119 +++++++++++++++--------------
 table/format.h                     |  20 +++++
 table/full_filter_block.cc         |  18 ++++-
 table/full_filter_block.h          |   1 -
 table/partitioned_filter_block.cc  |  16 +++-
 table/table_test.cc                |   2 +-
 9 files changed, 134 insertions(+), 69 deletions(-)

diff --git a/table/block.cc b/table/block.cc
index fb79d7217..d10397564 100644
--- a/table/block.cc
+++ b/table/block.cc
@@ -486,9 +486,17 @@ void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
 
 size_t Block::ApproximateMemoryUsage() const {
   size_t usage = usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
   if (prefix_index_) {
     usage += prefix_index_->ApproximateMemoryUsage();
   }
+  if (read_amp_bitmap_) {
+    usage += read_amp_bitmap_->ApproximateMemoryUsage();
+  }
   return usage;
 }
 
diff --git a/table/block.h b/table/block.h
index 1fed0e297..ffdc54232 100644
--- a/table/block.h
+++ b/table/block.h
@@ -104,6 +104,13 @@ class BlockReadAmpBitmap {
 
   uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
 
+  size_t ApproximateMemoryUsage() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    return malloc_usable_size((void*)this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return sizeof(*this);
+  }
+
  private:
   // Get the current value of bit at `bit_idx` and set it to 1
   inline bool GetAndSet(uint32_t bit_idx) {
@@ -142,14 +149,8 @@ class Block {
   size_t size() const { return size_; }
   const char* data() const { return data_; }
   bool cachable() const { return contents_.cachable; }
-  size_t usable_size() const {
-#ifdef ROCKSDB_MALLOC_USABLE_SIZE
-    if (contents_.allocation.get() != nullptr) {
-      return malloc_usable_size(contents_.allocation.get());
-    }
-#endif  // ROCKSDB_MALLOC_USABLE_SIZE
-    return size_;
-  }
+  // The additional memory space taken by the block data.
+  size_t usable_size() const { return contents_.usable_size(); }
   uint32_t NumRestarts() const;
   CompressionType compression_type() const {
     return contents_.compression_type;
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index 7d7faf60d..bb45d1fa2 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -659,7 +659,7 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
                             (end - r->compressed_cache_key_prefix));
 
     // Insert into compressed block cache.
-    block_cache_compressed->Insert(key, block, block->usable_size(),
+    block_cache_compressed->Insert(key, block, block->ApproximateMemoryUsage(),
                                    &DeleteCachedBlock);
 
     // Invalidate OS cache.
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index c28024ddb..ba5ed309d 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -346,7 +346,14 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
 
   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
-    return index_block_->ApproximateMemoryUsage();
+    size_t usage = index_block_->ApproximateMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size((void*)this);
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
+    return usage;
   }
 
  private:
@@ -415,7 +422,13 @@ class BinarySearchIndexReader : public IndexReader {
 
   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
-    return index_block_->ApproximateMemoryUsage();
+    size_t usage = index_block_->ApproximateMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size((void*)this);
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return usage;
   }
 
  private:
@@ -532,8 +545,14 @@ class HashIndexReader : public IndexReader {
 
   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
-    return index_block_->ApproximateMemoryUsage() +
-           prefixes_contents_.data.size();
+    size_t usage = index_block_->ApproximateMemoryUsage();
+    usage += prefixes_contents_.usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size((void*)this);
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return usage;
   }
 
  private:
@@ -1154,40 +1173,34 @@ Status BlockBasedTable::GetDataBlockFromCache(
     assert(block->value->compression_type() == kNoCompression);
     if (block_cache != nullptr && block->value->cachable() &&
         read_options.fill_cache) {
-      s = block_cache->Insert(
-          block_cache_key, block->value, block->value->usable_size(),
-          &DeleteCachedEntry<Block>, &(block->cache_handle));
-      block_cache->TEST_mark_as_data_block(block_cache_key,
-                                           block->value->usable_size());
+      size_t charge = block->value->ApproximateMemoryUsage();
+      s = block_cache->Insert(block_cache_key, block->value, charge,
+                              &DeleteCachedEntry<Block>,
+                              &(block->cache_handle));
+      block_cache->TEST_mark_as_data_block(block_cache_key, charge);
       if (s.ok()) {
         if (get_context != nullptr) {
           get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
-          get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE,
-                                      block->value->usable_size());
+          get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge);
         } else {
           RecordTick(statistics, BLOCK_CACHE_ADD);
-          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
-                     block->value->usable_size());
+          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
         }
         if (is_index) {
           if (get_context != nullptr) {
             get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1);
-            get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT,
-                                        block->value->usable_size());
+            get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
          } else {
            RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
-            RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
-                       block->value->usable_size());
+            RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
          }
        } else {
          if (get_context != nullptr) {
            get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1);
-            get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT,
-                                        block->value->usable_size());
+            get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge);
          } else {
            RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
-            RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
-                       block->value->usable_size());
+            RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
          }
        }
      } else {
@@ -1243,7 +1256,7 @@ Status BlockBasedTable::PutDataBlockToCache(
   if (block_cache_compressed != nullptr && raw_block != nullptr &&
       raw_block->cachable()) {
     s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block,
-                                       raw_block->usable_size(),
+                                       raw_block->ApproximateMemoryUsage(),
                                        &DeleteCachedEntry<Block>);
     if (s.ok()) {
       // Avoid the following code to delete this cached block.
@@ -1258,41 +1271,35 @@ Status BlockBasedTable::PutDataBlockToCache(
   // insert into uncompressed block cache
   assert((block->value->compression_type() == kNoCompression));
   if (block_cache != nullptr && block->value->cachable()) {
-    s = block_cache->Insert(
-        block_cache_key, block->value, block->value->usable_size(),
-        &DeleteCachedEntry<Block>, &(block->cache_handle), priority);
-    block_cache->TEST_mark_as_data_block(block_cache_key,
-                                         block->value->usable_size());
+    size_t charge = block->value->ApproximateMemoryUsage();
+    s = block_cache->Insert(block_cache_key, block->value, charge,
+                            &DeleteCachedEntry<Block>, &(block->cache_handle),
+                            priority);
+    block_cache->TEST_mark_as_data_block(block_cache_key, charge);
     if (s.ok()) {
       assert(block->cache_handle != nullptr);
       if (get_context != nullptr) {
         get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
-        get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE,
-                                    block->value->usable_size());
+        get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge);
       } else {
         RecordTick(statistics, BLOCK_CACHE_ADD);
-        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
-                   block->value->usable_size());
+        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
       }
       if (is_index) {
         if (get_context != nullptr) {
           get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1);
-          get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT,
-                                      block->value->usable_size());
+          get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
        } else {
          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
-          RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
-                     block->value->usable_size());
+          RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
        }
      } else {
        if (get_context != nullptr) {
          get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1);
-          get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT,
-                                      block->value->usable_size());
+          get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge);
        } else {
          RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
-          RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
-                     block->value->usable_size());
+          RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
        }
      }
      assert(reinterpret_cast<Block*>(
@@ -1429,24 +1436,23 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                         is_a_filter_partition, prefix_extractor);
     if (filter != nullptr) {
+      size_t usage = filter->ApproximateMemoryUsage();
       Status s = block_cache->Insert(
-          key, filter, filter->size(), &DeleteCachedFilterEntry, &cache_handle,
+          key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
           rep_->table_options.cache_index_and_filter_blocks_with_high_priority
               ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
       if (s.ok()) {
         if (get_context != nullptr) {
           get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
-          get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, filter->size());
+          get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, usage);
           get_context->RecordCounters(BLOCK_CACHE_FILTER_ADD, 1);
-          get_context->RecordCounters(BLOCK_CACHE_FILTER_BYTES_INSERT,
-                                      filter->size());
+          get_context->RecordCounters(BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
        } else {
          RecordTick(statistics, BLOCK_CACHE_ADD);
-          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter->size());
+          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
          RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
-          RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT,
-                     filter->size());
+          RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
        }
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
@@ -1512,27 +1518,27 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
       TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1");
       TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3");
       TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4");
+      size_t charge = 0;
       if (s.ok()) {
         assert(index_reader != nullptr);
+        charge = index_reader->ApproximateMemoryUsage();
         s = block_cache->Insert(
-            key, index_reader, index_reader->usable_size(),
-            &DeleteCachedIndexEntry, &cache_handle,
+            key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle,
             rep_->table_options.cache_index_and_filter_blocks_with_high_priority
                 ? Cache::Priority::HIGH
                 : Cache::Priority::LOW);
       }
       if (s.ok()) {
-        size_t usable_size = index_reader->usable_size();
         if (get_context != nullptr) {
           get_context->RecordCounters(BLOCK_CACHE_ADD, 1);
-          get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, usable_size);
+          get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge);
        } else {
          RecordTick(statistics, BLOCK_CACHE_ADD);
-          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usable_size);
+          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
        }
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
-        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usable_size);
+        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
      } else {
        if (index_reader != nullptr) {
          delete index_reader;
@@ -1661,8 +1667,9 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
                    static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length));
         Slice unique_key = Slice(cache_key, static_cast(end - cache_key));
-        s = block_cache->Insert(unique_key, nullptr, block.value->usable_size(),
-                                nullptr, &cache_handle);
+        s = block_cache->Insert(unique_key, nullptr,
+                                block.value->ApproximateMemoryUsage(), nullptr,
+                                &cache_handle);
         if (s.ok()) {
           if (cache_handle != nullptr) {
             iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
@@ -2998,7 +3005,7 @@ void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
   FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
   if (filter->statistics() != nullptr) {
     RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
-               filter->size());
+               filter->ApproximateMemoryUsage());
   }
   delete filter;
 }
@@ -3007,7 +3014,7 @@ void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) {
   IndexReader* index_reader = reinterpret_cast<IndexReader*>(value);
   if (index_reader->statistics() != nullptr) {
     RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT,
-               index_reader->usable_size());
+               index_reader->ApproximateMemoryUsage());
   }
   delete index_reader;
 }
diff --git a/table/format.h b/table/format.h
index dddd6710f..258d1a9e7 100644
--- a/table/format.h
+++ b/table/format.h
@@ -10,6 +10,13 @@
 #pragma once
 #include <stdint.h>
 #include <string>
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else
+#include <malloc.h>
+#endif
+#endif
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
@@ -199,6 +206,19 @@ struct BlockContents {
         compression_type(_compression_type),
         allocation(std::move(_data)) {}
 
+  // The additional memory space taken by the block data.
+  size_t usable_size() const {
+    if (allocation.get() != nullptr) {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      return malloc_usable_size(allocation.get());
+#else
+      return sizeof(*allocation.get());
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    } else {
+      return 0;  // no extra memory is occupied by the data
+    }
+  }
+
   BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT {
     *this = std::move(other);
   }
diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc
index da9b93ee6..a7491a716 100644
--- a/table/full_filter_block.cc
+++ b/table/full_filter_block.cc
@@ -5,6 +5,14 @@
 
 #include "table/full_filter_block.h"
 
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else
+#include <malloc.h>
+#endif
+#endif
+
 #include "monitoring/perf_context_imp.h"
 #include "port/port.h"
 #include "rocksdb/filter_policy.h"
@@ -152,7 +160,15 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) {
 }
 
 size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
-  return contents_.size();
+  size_t usage = block_contents_.usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+  usage += malloc_usable_size(filter_bits_reader_.get());
+#else
+  usage += sizeof(*this);
+  usage += sizeof(*filter_bits_reader_.get());
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  return usage;
 }
 
 bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound,
diff --git a/table/full_filter_block.h b/table/full_filter_block.h
index aed4cfd7d..e4384c91a 100644
--- a/table/full_filter_block.h
+++ b/table/full_filter_block.h
@@ -118,7 +118,6 @@ class FullFilterBlockReader : public FilterBlockReader {
   Slice contents_;
   std::unique_ptr<FilterBitsReader> filter_bits_reader_;
   BlockContents block_contents_;
-  std::unique_ptr<const char[]> filter_data_;
   bool full_length_enabled_;
   size_t prefix_extractor_full_length_;
diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc
index 0ee738c4a..d81eeed72 100644
--- a/table/partitioned_filter_block.cc
+++ b/table/partitioned_filter_block.cc
@@ -5,6 +5,13 @@
 
 #include "table/partitioned_filter_block.h"
 
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else
+#include <malloc.h>
+#endif
+#endif
 #include <utility>
 
 #include "monitoring/perf_context_imp.h"
@@ -265,7 +272,14 @@ PartitionedFilterBlockReader::GetFilterPartition(
 }
 
 size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
-  return idx_on_fltr_blk_->size();
+  size_t usage = idx_on_fltr_blk_->usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  return usage;
+  // TODO(myabandeh): better estimation for filter_map_ size
 }
 
 // Release the cached entry and decrement its ref count.
diff --git a/table/table_test.cc b/table/table_test.cc
index 3a4378a7f..bf9216668 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1988,7 +1988,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
   // Enable the cache for index/filter blocks
   BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
-  table_options.block_cache = NewLRUCache(1024, 4);
+  table_options.block_cache = NewLRUCache(1024, 2);
   table_options.cache_index_and_filter_blocks = true;
   options.table_factory.reset(new BlockBasedTableFactory(table_options));
   std::vector<std::string> keys;
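
Note on the charging pattern: every ApproximateMemoryUsage() added above follows the same recipe: charge the raw block data plus the C++ wrapper object, and prefer malloc_usable_size() when ROCKSDB_MALLOC_USABLE_SIZE is defined so that allocator padding is counted too. The standalone sketch below illustrates that recipe outside RocksDB; CachedBlock and its fields are hypothetical stand-ins for illustration only, not types from this patch.

// Standalone sketch of the cache-charging idea (hypothetical types, not
// RocksDB code). With ROCKSDB_MALLOC_USABLE_SIZE the allocator reports how
// much memory each allocation really owns, including padding; otherwise we
// fall back to the logical size and sizeof(*this), which undercounts.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif

struct CachedBlock {
  char* data = nullptr;  // heap buffer holding the raw block contents
  size_t size = 0;       // logical size of the raw block contents

  // Cache charge = raw data footprint + overhead of this wrapper object.
  size_t ApproximateMemoryUsage() const {
    size_t usage = 0;
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    if (data != nullptr) {
      usage += malloc_usable_size(data);
    }
    usage += malloc_usable_size((void*)this);
#else
    usage += size;
    usage += sizeof(*this);
#endif
    return usage;
  }
};

int main() {
  CachedBlock* block = new CachedBlock;
  block->size = 100;
  block->data = static_cast<char*>(malloc(block->size));
  memset(block->data, 0, block->size);
  // This is the value the patch now passes to Cache::Insert() as the charge,
  // instead of only the raw block size.
  printf("cache charge: %zu bytes\n", block->ApproximateMemoryUsage());
  free(block->data);
  delete block;
  return 0;
}

Built with -DROCKSDB_MALLOC_USABLE_SIZE on a glibc system, the reported charge is typically larger than size + sizeof(CachedBlock); that difference is the allocator and object overhead the old usable_size()-based charge left out.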