From 638d23950716f3119057d725fbbf1590b807a849 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 16 Sep 2019 15:14:51 -0700 Subject: [PATCH] Charge block cache for cache internal usage (#5797) Summary: For our default block cache, each additional entry has extra memory overhead. It includes LRUHandle (72 bytes currently) and the cache key (two varint64, file id and offset). The usage is not negligible. For example, for block_size=4k, the overhead accounts for an extra 2% memory usage for the cache. The patch charges the cache for the extra usage, reducing untracked memory usage outside block cache. The feature is enabled by default and can be disabled by passing kDontChargeCacheMetadata to the cache constructor. This PR builds on https://github.com/facebook/rocksdb/issues/4258 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5797 Test Plan: - Existing tests are updated either to disable the feature when the test has too much dependency on the old way of accounting the usage or to increase the cache capacity to account for the additional charge of metadata. - The Usage tests in cache_test.cc are augmented to test the cache usage under kFullChargeCacheMetadata. 
Differential Revision: D17396833 Pulled By: maysamyabandeh fbshipit-source-id: 7684ccb9f8a40ca595e4f5efcdb03623afea0c6f --- HISTORY.md | 1 + cache/cache_test.cc | 84 ++++++++++++++++--- cache/clock_cache.cc | 59 ++++++++++--- cache/lru_cache.cc | 66 +++++++++------ cache/lru_cache.h | 24 +++++- cache/lru_cache_test.cc | 3 +- cache/sharded_cache.h | 7 ++ db/db_block_cache_test.cc | 7 +- db/db_impl/db_impl.cc | 7 +- db/db_iterator_test.cc | 3 +- db/db_properties_test.cc | 6 +- db/db_test2.cc | 2 +- env/env_test.cc | 8 +- include/rocksdb/cache.h | 30 +++++-- memory/arena.cc | 8 +- memtable/write_buffer_manager_test.cc | 6 +- port/malloc.h | 17 ++++ table/block_based/block.h | 8 +- table/block_based/full_filter_block.cc | 9 +- table/block_based/partitioned_filter_block.cc | 8 +- table/format.h | 8 +- table/table_test.cc | 6 +- .../simulator_cache/cache_simulator_test.cc | 11 ++- utilities/simulator_cache/sim_cache.cc | 7 +- utilities/simulator_cache/sim_cache_test.cc | 14 +++- 25 files changed, 289 insertions(+), 120 deletions(-) create mode 100644 port/malloc.h diff --git a/HISTORY.md b/HISTORY.md index 0e91e518e..85c65484d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * When user uses options.force_consistency_check in RocksDb, instead of crashing the process, we now pass the error back to the users without killing the process. * Add an option `memtable_insert_hint_per_batch` to WriteOptions. If it is true, each WriteBatch will maintain its own insert hints for each memtable in concurrent write. See include/rocksdb/options.h for more details. * The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0. 
+* The block cache usage now takes into account the per-entry metadata overhead. This results in more accurate management of memory. A side effect of this feature is that fewer items fit into a block cache of the same size, which can result in higher cache miss rates. This can be remedied by increasing the block cache size or by passing kDontChargeCacheMetadata to its constructor to restore the old behavior. ### Public API Change * Added max_write_buffer_size_to_maintain option to better control memory usage of immutable memtables. * Added a lightweight API GetCurrentWalFile() to get last live WAL filename and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine. diff --git a/cache/cache_test.cc b/cache/cache_test.cc index b728c67c7..1c6fc7719 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -86,14 +86,22 @@ class CacheTest : public testing::TestWithParam { return nullptr; } - std::shared_ptr NewCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { auto type = GetParam(); if (type == kLRU) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, 0.0); + LRUCacheOptions co; + co.capacity = capacity; + co.num_shard_bits = num_shard_bits; + co.strict_capacity_limit = strict_capacity_limit; + co.high_pri_pool_ratio = 0; + co.metadata_charge_policy = charge_policy; + return NewLRUCache(co); } if (type == kClock) { - return NewClockCache(capacity, num_shard_bits, strict_capacity_limit); + return NewClockCache(capacity, num_shard_bits, strict_capacity_limit, + charge_policy); } return nullptr; } @@ -143,10 +151,15 @@ class CacheTest : public testing::TestWithParam { }; CacheTest* CacheTest::current_; +class LRUCacheTest : public CacheTest {}; + TEST_P(CacheTest, 
UsageTest) { // cache is std::shared_ptr and will be automatically cleaned up. const uint64_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false); + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); size_t usage = 0; char value[10] = "abcdef"; @@ -155,31 +168,45 @@ TEST_P(CacheTest, UsageTest) { std::string key(i, 'a'); auto kv_size = key.size() + 5; cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); + ASSERT_LT(usage, precise_cache->GetUsage()); } + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); + // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { auto key = ToString(i); cache->Insert(key, reinterpret_cast(value), key.size() + 5, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } // the usage should be close to the capacity ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); } TEST_P(CacheTest, PinnedUsageTest) { // cache is std::shared_ptr and will be automatically cleaned up. - const uint64_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false); + const uint64_t kCapacity = 200000; + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); size_t pinned_usage = 0; char value[10] = "abcdef"; std::forward_list unreleased_handles; + std::forward_list unreleased_handles_in_precise_cache; // Add entries. 
Unpin some of them after insertion. Then, pin some of them // again. Check GetPinnedUsage(). @@ -187,40 +214,72 @@ TEST_P(CacheTest, PinnedUsageTest) { std::string key(i, 'a'); auto kv_size = key.size() + 5; Cache::Handle* handle; + Cache::Handle* handle_in_precise_cache; cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter, &handle); + assert(handle); + precise_cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter, &handle_in_precise_cache); + assert(handle_in_precise_cache); pinned_usage += kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); if (i % 2 == 0) { cache->Release(handle); + precise_cache->Release(handle_in_precise_cache); pinned_usage -= kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); } else { unreleased_handles.push_front(handle); + unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache); } if (i % 3 == 0) { unreleased_handles.push_front(cache->Lookup(key)); + auto x = precise_cache->Lookup(key); + assert(x); + unreleased_handles_in_precise_cache.push_front(x); // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned // usage increased if (i % 2 == 0) { pinned_usage += kv_size; } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); } } + auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage(); + ASSERT_LT(pinned_usage, precise_cache_pinned_usage); // check that overloading the cache does not change the pinned usage for (uint64_t i = 1; i < 2 * kCapacity; ++i) { auto key = ToString(i); cache->Insert(key, reinterpret_cast(value), key.size() + 5, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); + + cache->EraseUnRefEntries(); 
+ precise_cache->EraseUnRefEntries(); + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); // release handles for pinned entries to prevent memory leaks for (auto handle : unreleased_handles) { cache->Release(handle); } + for (auto handle : unreleased_handles_in_precise_cache) { + precise_cache->Release(handle); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + ASSERT_EQ(0, precise_cache->GetPinnedUsage()); + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); } TEST_P(CacheTest, HitAndMiss) { @@ -550,10 +609,10 @@ TEST_P(CacheTest, SetCapacity) { } } -TEST_P(CacheTest, SetStrictCapacityLimit) { +TEST_P(LRUCacheTest, SetStrictCapacityLimit) { // test1: set the flag to false. Insert more keys than capacity. See if they // all go through. - std::shared_ptr cache = NewLRUCache(5, 0, false); + std::shared_ptr cache = NewCache(5, 0, false); std::vector handles(10); Status s; for (size_t i = 0; i < 10; i++) { @@ -579,7 +638,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { } // test3: init with flag being true. 
- std::shared_ptr cache2 = NewLRUCache(5, 0, true); + std::shared_ptr cache2 = NewCache(5, 0, true); for (size_t i = 0; i < 5; i++) { std::string key = ToString(i + 1); s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); @@ -697,13 +756,14 @@ TEST_P(CacheTest, GetCharge) { } #ifdef SUPPORT_CLOCK_CACHE -std::shared_ptr (*new_clock_cache_func)(size_t, int, - bool) = NewClockCache; +std::shared_ptr (*new_clock_cache_func)( + size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache; INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU, kClock)); #else INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU)); #endif // SUPPORT_CLOCK_CACHE +INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU)); } // namespace rocksdb diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 89173834e..9165ad5dd 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -13,8 +13,9 @@ namespace rocksdb { -std::shared_ptr NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/, - bool /*strict_capacity_limit*/) { +std::shared_ptr NewClockCache( + size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy /*metadata_charge_policy*/) { // Clock cache not supported. 
return nullptr; } @@ -35,6 +36,7 @@ std::shared_ptr NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/ #include "tbb/concurrent_hash_map.h" #include "cache/sharded_cache.h" +#include "port/malloc.h" #include "port/port.h" #include "util/autovector.h" #include "util/mutexlock.h" @@ -202,6 +204,27 @@ struct CacheHandle { deleter = a.deleter; return *this; } + + inline static size_t CalcTotalCharge( + Slice key, size_t charge, + CacheMetadataChargePolicy metadata_charge_policy) { + size_t meta_charge = 0; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + meta_charge += sizeof(CacheHandle); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + meta_charge += + malloc_usable_size(static_cast(const_cast(key.data()))); +#else + meta_charge += key.size(); +#endif + } + return charge + meta_charge; + } + + inline size_t CalcTotalCharge( + CacheMetadataChargePolicy metadata_charge_policy) { + return CalcTotalCharge(key, charge, metadata_charge_policy); + } }; // Key of hash map. We store hash value with the key for convenience. @@ -404,11 +427,12 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle, assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); handle->key.clear(); handle->value = nullptr; handle->deleter = nullptr; recycle_.push_back(handle); - usage_.fetch_sub(handle->charge, std::memory_order_relaxed); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); } void ClockCacheShard::Cleanup(const CleanupContext& context) { @@ -434,7 +458,8 @@ bool ClockCacheShard::Ref(Cache::Handle* h) { std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. 
- pinned_usage_.fetch_add(handle->charge, std::memory_order_relaxed); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; } @@ -454,7 +479,8 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - pinned_usage_.fetch_sub(handle->charge, std::memory_order_relaxed); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. if (!InCache(flags)) { MutexLock l(&mutex_); @@ -539,8 +565,10 @@ CacheHandle* ClockCacheShard::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), bool hold_reference, CleanupContext* context) { + size_t total_charge = + CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); MutexLock l(&mutex_); - bool success = EvictFromCache(charge, context); + bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); if (!success && (strict || !hold_reference)) { context->to_delete_key.push_back(key.data()); @@ -575,9 +603,9 @@ CacheHandle* ClockCacheShard::Insert( } table_.insert(HashTable::value_type(CacheKey(key, hash), handle)); if (hold_reference) { - pinned_usage_.fetch_add(charge, std::memory_order_relaxed); + pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } - usage_.fetch_add(charge, std::memory_order_relaxed); + usage_.fetch_add(total_charge, std::memory_order_relaxed); return handle; } @@ -674,10 +702,14 @@ void ClockCacheShard::EraseUnRefEntries() { class ClockCache final : public ShardedCache { public: - ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit) + ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, + 
CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { int num_shards = 1 << num_shard_bits; shards_ = new ClockCacheShard[num_shards]; + for (int i = 0; i < num_shards; i++) { + shards_[i].set_metadata_charge_policy(metadata_charge_policy); + } SetCapacity(capacity); SetStrictCapacityLimit(strict_capacity_limit); } @@ -714,13 +746,14 @@ class ClockCache final : public ShardedCache { } // end anonymous namespace -std::shared_ptr NewClockCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { +std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) { if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit); + return std::make_shared( + capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy); } } // namespace rocksdb diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 7c04cb909..85d2d67ec 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -97,7 +97,8 @@ void LRUHandleTable::Resize() { LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, - bool use_adaptive_mutex) + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), @@ -106,6 +107,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, usage_(0), lru_usage_(0), mutex_(use_adaptive_mutex) { + set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; @@ -124,7 +126,9 @@ void LRUCacheShard::EraseUnRefEntries() { LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - usage_ -= old->charge; + size_t total_charge = 
old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; last_reference_list.push_back(old); } } @@ -180,16 +184,19 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) { e->next->prev = e->prev; e->prev->next = e->next; e->prev = e->next = nullptr; - lru_usage_ -= e->charge; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(lru_usage_ >= total_charge); + lru_usage_ -= total_charge; if (e->InHighPriPool()) { - assert(high_pri_pool_usage_ >= e->charge); - high_pri_pool_usage_ -= e->charge; + assert(high_pri_pool_usage_ >= total_charge); + high_pri_pool_usage_ -= total_charge; } } void LRUCacheShard::LRU_Insert(LRUHandle* e) { assert(e->next == nullptr); assert(e->prev == nullptr); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) { // Inset "e" to head of LRU list. e->next = &lru_; @@ -197,7 +204,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->prev->next = e; e->next->prev = e; e->SetInHighPriPool(true); - high_pri_pool_usage_ += e->charge; + high_pri_pool_usage_ += total_charge; MaintainPoolSize(); } else { // Insert "e" to the head of low-pri pool. 
Note that when @@ -209,7 +216,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->SetInHighPriPool(false); lru_low_pri_ = e; } - lru_usage_ += e->charge; + lru_usage_ += total_charge; } void LRUCacheShard::MaintainPoolSize() { @@ -218,6 +225,7 @@ void LRUCacheShard::MaintainPoolSize() { lru_low_pri_ = lru_low_pri_->next; assert(lru_low_pri_ != &lru_); lru_low_pri_->SetInHighPriPool(false); + assert(high_pri_pool_usage_ >= lru_low_pri_->charge); high_pri_pool_usage_ -= lru_low_pri_->charge; } } @@ -231,7 +239,9 @@ void LRUCacheShard::EvictFromLRU(size_t charge, LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - usage_ -= old->charge; + size_t old_total_charge = old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; deleted->push_back(old); } } @@ -311,7 +321,9 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { } } if (last_reference) { - usage_ -= e->charge; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; } } @@ -345,15 +357,16 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); { MutexLock l(&mutex_); // Free the space following strict LRU policy until enough space // is freed or the lru list is empty - EvictFromLRU(charge, &last_reference_list); + EvictFromLRU(total_charge, &last_reference_list); - if ((usage_ + charge) > capacity_ && + if ((usage_ + total_charge) > capacity_ && (strict_capacity_limit_ || handle == nullptr)) { if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry inserted @@ -369,14 +382,17 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // Insert into the cache. 
Note that the cache might get larger than its // capacity if not enough space was freed up. LRUHandle* old = table_.Insert(e); - usage_ += e->charge; + usage_ += total_charge; if (old != nullptr) { assert(old->InCache()); old->SetInCache(false); if (!old->HasRefs()) { // old is on LRU because it's in cache and its reference count is 0 LRU_Remove(old); - usage_ -= old->charge; + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; last_reference_list.push_back(old); } } @@ -409,7 +425,9 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { if (!e->HasRefs()) { // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); - usage_ -= e->charge; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; last_reference = true; } } @@ -447,7 +465,8 @@ std::string LRUCacheShard::GetPrintableOptions() const { LRUCache::LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, - bool use_adaptive_mutex) + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -457,7 +476,7 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, for (int i = 0; i < num_shards_; i++) { new (&shards_[i]) LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - use_adaptive_mutex); + use_adaptive_mutex, metadata_charge_policy); } } @@ -526,15 +545,15 @@ std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex); + cache_opts.memory_allocator, 
cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy); } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, - std::shared_ptr memory_allocator, - bool use_adaptive_mutex) { + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -545,10 +564,9 @@ std::shared_ptr NewLRUCache( if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), - use_adaptive_mutex); + return std::make_shared( + capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); } } // namespace rocksdb diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 1ff765d15..6313c69db 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -12,6 +12,7 @@ #include "cache/sharded_cache.h" +#include "port/malloc.h" #include "port/port.h" #include "util/autovector.h" @@ -128,6 +129,22 @@ struct LRUHandle { } delete[] reinterpret_cast(this); } + + // Caclculate the memory usage by metadata + inline size_t CalcTotalCharge( + CacheMetadataChargePolicy metadata_charge_policy) { + assert(key_length); + size_t meta_charge = 0; + if (metadata_charge_policy == kFullChargeCacheMetadata) { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + meta_charge += malloc_usable_size(static_cast(this)); +#else + // This is the size that is used when a new handle is created + meta_charge += sizeof(LRUHandle) - 1 + key_length; +#endif + } + return charge + meta_charge; + } }; // We provide our own simple hash table since it removes a whole bunch @@ -176,7 +193,8 @@ class LRUHandleTable { class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { 
public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, bool use_adaptive_mutex); + double high_pri_pool_ratio, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy); virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache @@ -297,7 +315,9 @@ class LRUCache LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex); + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDontChargeCacheMetadata); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } virtual CacheShard* GetShard(int shard) override; diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 575764611..f4f4dee69 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -31,7 +31,8 @@ class LRUCacheTest : public testing::Test { cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex); + high_pri_pool_ratio, use_adaptive_mutex, + kDontChargeCacheMetadata); } void Insert(const std::string& key, diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 0c1499f22..4a396bd47 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -40,6 +40,13 @@ class CacheShard { bool thread_safe) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } + void set_metadata_charge_policy( + CacheMetadataChargePolicy metadata_charge_policy) { + metadata_charge_policy_ = metadata_charge_policy; + } + + protected: + CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata; }; // Generic cache interface which shards cache by hash of keys. 
2^num_shard_bits diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 39bb4de2f..89c2dbd5d 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -380,8 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; + LRUCacheOptions co; // 500 bytes are enough to hold the first two blocks - std::shared_ptr cache = NewLRUCache(500, 0, false); + co.capacity = 500; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 8bc730250..443138908 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -240,8 +240,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, const int table_cache_size = (mutable_db_options_.max_open_files == -1) ? 
TableCache::kInfiniteCapacity : mutable_db_options_.max_open_files - 10; - table_cache_ = NewLRUCache(table_cache_size, - immutable_db_options_.table_cache_numshardbits); + LRUCacheOptions co; + co.capacity = table_cache_size; + co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_cache_ = NewLRUCache(co); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, env_options_, table_cache_.get(), write_buffer_manager_, diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 9ea7ea0d9..ed12a8801 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -1070,7 +1070,8 @@ TEST_P(DBIteratorTest, IndexWithFirstKey) { BlockBasedTableOptions::IndexShorteningMode::kNoShortening; table_options.flush_block_policy_factory = std::make_shared(); - table_options.block_cache = NewLRUCache(1000); // fits all blocks + table_options.block_cache = + NewLRUCache(8000); // fits all blocks and their cache metadata overhead options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 956accef8..be70bcea5 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -1631,7 +1631,11 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { // Test with empty block cache. 
constexpr size_t kCapacity = 100; - auto block_cache = NewLRUCache(kCapacity, 0 /*num_shard_bits*/); + LRUCacheOptions co; + co.capacity = kCapacity; + co.num_shard_bits = 0; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto block_cache = NewLRUCache(co); table_options.block_cache = block_cache; table_options.no_block_cache = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); diff --git a/db/db_test2.cc b/db/db_test2.cc index 2c993580a..cf622973a 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -3780,7 +3780,7 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) { TEST_F(DBTest2, RowCacheSnapshot) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); - options.row_cache = NewLRUCache(8192); + options.row_cache = NewLRUCache(8 * 8192); DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar1")); diff --git a/env/env_test.cc b/env/env_test.cc index 6f225e37f..f9c597823 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -11,13 +11,6 @@ #include #endif -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include #include @@ -39,6 +32,7 @@ #include "env/env_chroot.h" #include "logging/log_buffer.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index d8093c7ea..27b4a6f64 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -36,6 +36,13 @@ class Cache; extern const bool kDefaultToAdaptiveMutex; +enum CacheMetadataChargePolicy { + kDontChargeCacheMetadata, + kFullChargeCacheMetadata +}; +const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = + kFullChargeCacheMetadata; + struct LRUCacheOptions { // Capacity of the cache. size_t capacity = 0; @@ -76,17 +83,23 @@ struct LRUCacheOptions { // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. 
bool use_adaptive_mutex = kDefaultToAdaptiveMutex; + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, std::shared_ptr _memory_allocator = nullptr, - bool _use_adaptive_mutex = kDefaultToAdaptiveMutex) + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) : capacity(_capacity), num_shard_bits(_num_shard_bits), strict_capacity_limit(_strict_capacity_limit), high_pri_pool_ratio(_high_pri_pool_ratio), memory_allocator(std::move(_memory_allocator)), - use_adaptive_mutex(_use_adaptive_mutex) {} + use_adaptive_mutex(_use_adaptive_mutex), + metadata_charge_policy(_metadata_charge_policy) {} }; // Create a new cache with a fixed size capacity. The cache is sharded @@ -101,7 +114,9 @@ extern std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex); + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); @@ -110,10 +125,11 @@ extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); // more detail. // // Return nullptr if it is not supported. 
-extern std::shared_ptr NewClockCache(size_t capacity, - int num_shard_bits = -1, - bool strict_capacity_limit = false); - +extern std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); class Cache { public: // Depending on implementation, cache entries with high priority could be less diff --git a/memory/arena.cc b/memory/arena.cc index 3f113e776..70c803901 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -8,18 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "memory/arena.h" -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #ifndef OS_WIN #include #endif #include #include "logging/logging.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 06514eabd..23de06a62 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -51,8 +51,12 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { } TEST_F(WriteBufferManagerTest, CacheCost) { + LRUCacheOptions co; // 1GB cache - std::shared_ptr cache = NewLRUCache(1024 * 1024 * 1024, 4); + co.capacity = 1024 * 1024 * 1024; + co.num_shard_bits = 4; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); // A write buffer manager of size 50MB std::unique_ptr wbf( new WriteBufferManager(50 * 1024 * 1024, cache)); diff --git a/port/malloc.h b/port/malloc.h new file mode 100644 index 000000000..f973263e2 --- /dev/null +++ b/port/malloc.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE diff --git a/table/block_based/block.h b/table/block_based/block.h index 3e19f9fdc..9568cd69c 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -12,16 +12,10 @@ #include #include #include -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "port/malloc.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index cf1afb5d3..905bbd217 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -6,15 +6,8 @@ #include #include "table/block_based/full_filter_block.h" -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif - #include "monitoring/perf_context_imp.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/filter_policy.h" #include "table/block_based/block_based_table_reader.h" diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 1ba6b3c07..f06150c29 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -5,16 +5,10 @@ #include "table/block_based/partitioned_filter_block.h" -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef 
OS_FREEBSD -#include -#else -#include -#endif -#endif #include #include "monitoring/perf_context_imp.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/filter_policy.h" #include "table/block_based/block.h" diff --git a/table/format.h b/table/format.h index ef323a647..552cd940d 100644 --- a/table/format.h +++ b/table/format.h @@ -10,13 +10,6 @@ #pragma once #include #include -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include "file/file_prefetch_buffer.h" #include "file/random_access_file_reader.h" @@ -27,6 +20,7 @@ #include "memory/memory_allocator.h" #include "options/cf_options.h" +#include "port/malloc.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" #include "util/crc32c.h" diff --git a/table/table_test.cc b/table/table_test.cc index cd7363df0..77b962598 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2599,7 +2599,11 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Enable the cache for index/filter blocks BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.block_cache = NewLRUCache(2048, 2); + LRUCacheOptions co; + co.capacity = 2048; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_options.block_cache = NewLRUCache(co); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index 3d3432e20..085e113ff 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -313,10 +313,13 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { get.sst_fd_number = 0; get.get_from_user_specified_snapshot = Boolean::kFalse; - std::shared_ptr sim_cache = - NewLRUCache(/*capacity=*/16, 
/*num_shard_bits=*/1, - /*strict_capacity_limit=*/false, - /*high_pri_pool_ratio=*/0); + LRUCacheOptions co; + co.capacity = 16; + co.num_shard_bits = 1; + co.strict_capacity_limit = false; + co.high_pri_pool_ratio = 0; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr sim_cache = NewLRUCache(co); std::unique_ptr cache_simulator( new HybridRowBlockCacheSimulator( nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true)); diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index 3e1f821f7..ac57a4230 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -331,8 +331,11 @@ class SimCacheImpl : public SimCache { // For instrumentation purpose, use NewSimCache instead std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits) { - return NewSimCache(NewLRUCache(sim_capacity, num_shard_bits), cache, - num_shard_bits); + LRUCacheOptions co; + co.capacity = sim_capacity; + co.num_shard_bits = num_shard_bits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + return NewSimCache(NewLRUCache(co), cache, num_shard_bits); } std::shared_ptr NewSimCache(std::shared_ptr sim_cache, diff --git a/utilities/simulator_cache/sim_cache_test.cc b/utilities/simulator_cache/sim_cache_test.cc index 7f0f904a7..e66228107 100644 --- a/utilities/simulator_cache/sim_cache_test.cc +++ b/utilities/simulator_cache/sim_cache_test.cc @@ -77,8 +77,12 @@ TEST_F(SimCacheTest, SimCache) { auto table_options = GetTableOptions(); auto options = GetOptions(table_options); InitTable(options); - std::shared_ptr simCache = - NewSimCache(NewLRUCache(0, 0, false), 20000, 0); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr simCache = NewSimCache(NewLRUCache(co), 20000, 0); table_options.block_cache = simCache; 
options.table_factory.reset(new BlockBasedTableFactory(table_options)); Reopen(options); @@ -142,8 +146,10 @@ TEST_F(SimCacheTest, SimCacheLogging) { auto table_options = GetTableOptions(); auto options = GetOptions(table_options); options.disable_auto_compactions = true; - std::shared_ptr sim_cache = - NewSimCache(NewLRUCache(1024 * 1024), 20000, 0); + LRUCacheOptions co; + co.capacity = 1024 * 1024; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr sim_cache = NewSimCache(NewLRUCache(co), 20000, 0); table_options.block_cache = sim_cache; options.table_factory.reset(new BlockBasedTableFactory(table_options)); Reopen(options);