From 0a019d74a0f1b9286f728a20075c04dd9617e8fd Mon Sep 17 00:00:00 2001
From: Igor Canadi
Date: Fri, 26 Jun 2015 11:48:09 -0700
Subject: [PATCH] Use malloc_usable_size() for accounting block cache size

Summary:
Currently, when we insert something into the block cache, we decrease the
block cache's remaining capacity by the size of the block. However, the
size of the block can be less than the memory the allocator actually
reserves for it. For example, a 4.5KB block will actually use 8KB of
memory. So even if we configure the block cache to 10GB, our actual
memory usage of the block cache can be 20GB! This problem showed up a lot
in testing and recently also in MongoRocks production, where we were
using 30GB more memory than expected.

This diff fixes the problem. Instead of counting the block size, we count
the memory used by the block. That way, a block cache configured to be
10GB will actually use only 10GB of memory.

I'm using a non-portable function (malloc_usable_size()) and I couldn't
find reliable information on its portability. However, it seems to work
on Linux, which covers the majority of our use cases.
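
To make the rounding concrete, here is a small standalone sketch (not
part of this diff) that prints the allocator's rounded-up size for a
4.5KB request. The exact numbers depend on the allocator; 8192 is what
jemalloc's size classes typically give for this request:

    // Illustrative sketch only -- not part of this diff. Requires Linux
    // (malloc_usable_size() is non-portable); numbers vary by allocator.
    #include <malloc.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main() {
      size_t requested = 4608;  // a "4.5KB" block
      void* p = malloc(requested);
      // With jemalloc this typically prints usable=8192: accounting
      // based on size() would miss almost half of the real memory.
      printf("requested=%zu usable=%zu\n", requested,
             malloc_usable_size(p));
      free(p);
      return 0;
    }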
Test Plan:
1. fill up mongo instance with 80GB of data
2. restart mongo with block cache size configured to 10GB
3. do a table scan in mongo
4. memory usage before the diff: 12GB; memory usage after the diff: 10.5GB

Reviewers: sdong, MarkCallaghan, rven, yhchiang

Reviewed By: yhchiang

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D40635
---
 HISTORY.md                         |  1 +
 build_tools/build_detect_platform  | 12 ++++++++++++
 build_tools/fbcode_config.sh       |  2 +-
 build_tools/fbcode_config4.8.1.sh  |  2 +-
 table/block.cc                     |  2 +-
 table/block.h                      | 11 +++++++++++
 table/block_based_table_builder.cc |  4 ++--
 table/block_based_table_reader.cc  | 27 ++++++++++++++++++---------
 8 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index f515e11a5..2ef40c4a9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -21,6 +21,7 @@
 * CompactRange() will now skip bottommost level compaction for level based compaction if there is no compaction filter, bottommost_level_compaction is introduced in CompactRangeOptions to control when it's possbile to skip bottommost level compaction. This mean that if you want the compaction to produce a single file you need to set bottommost_level_compaction to BottommostLevelCompaction::kForce.
 * Add Cache.GetPinnedUsage() to get the size of memory occupied by entries that are in use by the system.
 * DB:Open() will fail if the compression specified in Options is not linked with the binary. If you see this failure, recompile RocksDB with compression libraries present on your system. Also, previously our default compression was snappy. This behavior is now changed. Now, the default compression is snappy only if it's available on the system. If it isn't we change the default to kNoCompression.
+* We changed how we account for memory used in block cache. Previously, we only counted the sum of block sizes currently present in block cache. Now, we count the actual memory usage of the blocks. For example, a block of size 4.5KB will use 8KB of memory with jemalloc. This might decrease your memory usage and possibly decrease performance. Increase block cache size if you see this happening after an upgrade.
 
 ## 3.11.0 (5/19/2015)
 ### New Features
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index d0769e338..6edc22a25 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -294,6 +294,18 @@ EOF
             JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc"
         fi
     fi
+
+    # Test whether malloc_usable_size is available
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
+      #include <malloc.h>
+      int main() {
+        size_t res = malloc_usable_size(0);
+        return 0;
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE"
+    fi
 fi

 # TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning.
diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh
index 70c308959..5d7ff53eb 100644
--- a/build_tools/fbcode_config.sh
+++ b/build_tools/fbcode_config.sh
@@ -111,7 +111,7 @@ else
 fi

 CFLAGS+=" $DEPS_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
 CXXFLAGS+=" $CFLAGS"

 EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh
index 46bfe1330..b9b61b745 100644
--- a/build_tools/fbcode_config4.8.1.sh
+++ b/build_tools/fbcode_config4.8.1.sh
@@ -87,7 +87,7 @@ else
 fi

 CFLAGS+=" $DEPS_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
 CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"

 CXXFLAGS+=" $CFLAGS"
diff --git a/table/block.cc b/table/block.cc
index 6a5ede600..ebae8560c 100644
--- a/table/block.cc
+++ b/table/block.cc
@@ -359,7 +359,7 @@ void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
 }

 size_t Block::ApproximateMemoryUsage() const {
-  size_t usage = size();
+  size_t usage = usable_size();
   if (hash_index_) {
     usage += hash_index_->ApproximateMemoryUsage();
   }
diff --git a/table/block.h b/table/block.h
index 0187489bb..2ce48d3fd 100644
--- a/table/block.h
+++ b/table/block.h
@@ -10,6 +10,9 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#include <malloc.h>
+#endif

 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
@@ -37,6 +40,14 @@ class Block {
   size_t size() const { return size_; }
   const char* data() const { return data_; }
   bool cachable() const { return contents_.cachable; }
+  size_t usable_size() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    if (contents_.allocation.get() != nullptr) {
+      return malloc_usable_size(contents_.allocation.get());
+    }
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return size_;
+  }
   uint32_t NumRestarts() const;
   CompressionType compression_type() const {
     return contents_.compression_type;
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index c4b8b0eb3..18d7de9f6 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -703,8 +703,8 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
         (end - r->compressed_cache_key_prefix));

     // Insert into compressed block cache.
-    cache_handle = block_cache_compressed->Insert(key, block, block->size(),
-                                                  &DeleteCachedBlock);
+    cache_handle = block_cache_compressed->Insert(
+        key, block, block->usable_size(), &DeleteCachedBlock);
     block_cache_compressed->Release(cache_handle);

     // Invalidate OS cache.
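
Aside (editor's illustrative sketch, not part of the patch): the
under-counting compounds across every Insert() call site changed above.
A toy loop shows how a budget tracked with logical block sizes admits far
more real memory than configured; exact numbers depend on the allocator:

    // Standalone sketch, not RocksDB code. Fills a 10MB "budget" with
    // 4.5KB blocks, charging logical size, and compares against what the
    // allocator actually reserved (the sum the new code would charge).
    #include <malloc.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main() {
      const size_t kCapacity = 10 << 20;  // 10MB budget
      size_t charged = 0;                 // old accounting: block size
      size_t actual = 0;                  // new accounting: usable size
      while (charged + 4608 <= kCapacity) {
        void* block = malloc(4608);
        charged += 4608;
        actual += malloc_usable_size(block);
        // Blocks intentionally leaked; the demo only measures accounting.
      }
      // Under jemalloc, 'actual' approaches ~2x 'charged' for this size.
      printf("charged=%zu actual=%zu\n", charged, actual);
      return 0;
    }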
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index ed7fb0ba5..47e9a6a30 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -147,6 +147,8 @@ class BlockBasedTable::IndexReader {

   // The size of the index.
   virtual size_t size() const = 0;
+  // Memory usage of the index block
+  virtual size_t usable_size() const = 0;

   // Report an approximation of how much memory has been used other than memory
   // that was allocated in block cache.
@@ -187,6 +189,9 @@ class BinarySearchIndexReader : public IndexReader {
   }

   virtual size_t size() const override { return index_block_->size(); }
+  virtual size_t usable_size() const override {
+    return index_block_->usable_size();
+  }

   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
@@ -295,6 +300,9 @@ class HashIndexReader : public IndexReader {
   }

   virtual size_t size() const override { return index_block_->size(); }
+  virtual size_t usable_size() const override {
+    return index_block_->usable_size();
+  }

   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
@@ -702,9 +710,9 @@ Status BlockBasedTable::GetDataBlockFromCache(
       assert(block->value->compression_type() == kNoCompression);
       if (block_cache != nullptr && block->value->cachable() &&
           read_options.fill_cache) {
-        block->cache_handle =
-            block_cache->Insert(block_cache_key, block->value,
-                                block->value->size(), &DeleteCachedEntry<Block>);
+        block->cache_handle = block_cache->Insert(block_cache_key, block->value,
+                                                  block->value->usable_size(),
+                                                  &DeleteCachedEntry<Block>);
         assert(reinterpret_cast<Block*>(
                    block_cache->Value(block->cache_handle)) == block->value);
       }
@@ -747,7 +755,7 @@ Status BlockBasedTable::PutDataBlockToCache(
   if (block_cache_compressed != nullptr && raw_block != nullptr &&
       raw_block->cachable()) {
     auto cache_handle = block_cache_compressed->Insert(
-        compressed_block_cache_key, raw_block, raw_block->size(),
+        compressed_block_cache_key, raw_block, raw_block->usable_size(),
         &DeleteCachedEntry<Block>);
     block_cache_compressed->Release(cache_handle);
     RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
@@ -759,9 +767,9 @@ Status BlockBasedTable::PutDataBlockToCache(
   // insert into uncompressed block cache
   assert((block->value->compression_type() == kNoCompression));
   if (block_cache != nullptr && block->value->cachable()) {
-    block->cache_handle =
-        block_cache->Insert(block_cache_key, block->value, block->value->size(),
-                            &DeleteCachedEntry<Block>);
+    block->cache_handle = block_cache->Insert(block_cache_key, block->value,
+                                              block->value->usable_size(),
+                                              &DeleteCachedEntry<Block>);
     RecordTick(statistics, BLOCK_CACHE_ADD);
     assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
            block->value);
@@ -913,8 +921,9 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
       }
     }

-    cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
-                                       &DeleteCachedEntry<IndexReader>);
+    cache_handle =
+        block_cache->Insert(key, index_reader, index_reader->usable_size(),
+                            &DeleteCachedEntry<IndexReader>);
     RecordTick(statistics, BLOCK_CACHE_ADD);
   }
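
Note for users upgrading past this change (editor's sketch, not part of
the patch): since blocks are now charged at their usable size, the
configured capacity bounds real memory, so a cache that previously "fit"
more data may need to grow, as the HISTORY.md entry advises. A hedged
configuration example, with API names as of the RocksDB 3.x headers
(verify against your version):

    #include "rocksdb/cache.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      rocksdb::BlockBasedTableOptions table_options;
      // After this diff the capacity tracks malloc_usable_size(), so
      // 10GB here means ~10GB of real memory, not 10GB of block bytes.
      table_options.block_cache = rocksdb::NewLRUCache(10ULL << 30);
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return 0;
    }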