Summary:
This is the initial step in the development of a lock-free clock cache. This PR includes the base hash table design (mostly ported over from FastLRUCache) and the clock eviction algorithm. Importantly, it's still _not_ lock-free---every operation takes the shard lock. Besides removing the locking, the following features are left as future work:
- Remove keys from the handles. Instead, use 128-bit bijective hashes of them for handle comparisons, probing (we need two 32-bit hashes of the key for double hashing; see the probing sketch after this list) and sharding (we need one 6-bit hash).
- Remove the clock_usage_ field, which is updated on every lookup. Even if it were updated atomically, it could cause cache invalidations across cores.
- Middle insertions into the clock list.
- A test that exercises the clock eviction policy.
- Update the Java API of ClockCache and Java calls to C++.
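
For context on the double-hashing item above, here is a minimal sketch of the probe sequence this hash table family uses (it mirrors LRUHandleTable::FindSlot in this PR; the function name ProbeSlot is hypothetical):

#include <cstdint>

// Double hashing over a power-of-2 table: two independent 32-bit hashes of
// the key give the starting slot and the step. Forcing the step to be odd
// makes it coprime with the table size, so the sequence visits every slot.
uint32_t ProbeSlot(uint32_t h1, uint32_t h2, uint32_t probe,
                   uint32_t table_size_mask) {
  uint32_t base = h1 & table_size_mask;
  uint32_t increment = ((h2 << 1) | 1) & table_size_mask;
  return (base + probe * increment) & table_size_mask;
}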

Along the way, we improved the quality of the FastLRUCache code and comments. These changes are relatively minor.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10273

Test Plan: ``make -j24 check``

Reviewed By: pdillinger

Differential Revision: D37522461

Pulled By: guidotag

fbshipit-source-id: 3d70b737dbb70dcf662f00cef8c609750f083943
Branch: main
Author: Guido Tagliavini Ponce (committed by Facebook GitHub Bot)
Parent: c2dc4c0c52
Commit: 57a0e2f304
Changed files (with number of changed lines):
- cache/cache_bench_tool.cc (4)
- cache/cache_test.cc (67)
- cache/clock_cache.cc (1198)
- cache/clock_cache.h (441)
- cache/fast_lru_cache.cc (101)
- cache/fast_lru_cache.h (44)
- cache/lru_cache_test.cc (137)
- db/db_block_cache_test.cc (11)
- db_stress_tool/db_stress_test_base.cc (4)
- include/rocksdb/cache.h (12)
- java/rocksjni/clock_cache.cc (6)
- tools/db_bench_tool.cc (4)
- tools/db_crashtest.py (4)

@ -284,7 +284,9 @@ class CacheBench {
}
if (FLAGS_cache_type == "clock_cache") {
cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits);
cache_ = NewClockCache(
FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits,
false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy);
if (!cache_) {
fprintf(stderr, "Clock cache not supported.\n");
exit(1);

@ -77,7 +77,7 @@ class CacheTest : public testing::TestWithParam<std::string> {
static std::string type_;
static void Deleter(const Slice& key, void* v) {
if (type_ == kFast) {
if (type_ == kFast || type_ == kClock) {
current_->deleted_keys_.push_back(DecodeKey16Bytes(key));
} else {
current_->deleted_keys_.push_back(DecodeKey32Bits(key));
@ -111,7 +111,9 @@ class CacheTest : public testing::TestWithParam<std::string> {
return NewLRUCache(capacity);
}
if (type == kClock) {
return NewClockCache(capacity);
return NewClockCache(
capacity, 1 /*estimated_value_size*/, -1 /*num_shard_bits*/,
false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy);
}
if (type == kFast) {
return NewFastLRUCache(
@ -135,8 +137,8 @@ class CacheTest : public testing::TestWithParam<std::string> {
return NewLRUCache(co);
}
if (type == kClock) {
return NewClockCache(capacity, num_shard_bits, strict_capacity_limit,
charge_policy);
return NewClockCache(capacity, 1 /*estimated_value_size*/, num_shard_bits,
strict_capacity_limit, charge_policy);
}
if (type == kFast) {
return NewFastLRUCache(capacity, 1 /*estimated_value_size*/,
@ -152,7 +154,8 @@ class CacheTest : public testing::TestWithParam<std::string> {
// LRUCache and ClockCache don't, so the encoding depends on
// the cache type.
std::string EncodeKey(int k) {
if (GetParam() == kFast) {
auto type = GetParam();
if (type == kFast || type == kClock) {
return EncodeKey16Bytes(k);
} else {
return EncodeKey32Bits(k);
@ -160,7 +163,8 @@ class CacheTest : public testing::TestWithParam<std::string> {
}
int DecodeKey(const Slice& k) {
if (GetParam() == kFast) {
auto type = GetParam();
if (type == kFast || type == kClock) {
return DecodeKey16Bytes(k);
} else {
return DecodeKey32Bits(k);
@ -217,8 +221,9 @@ std::string CacheTest::type_;
class LRUCacheTest : public CacheTest {};
TEST_P(CacheTest, UsageTest) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys.");
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
return;
}
@ -266,8 +271,9 @@ TEST_P(CacheTest, UsageTest) {
}
TEST_P(CacheTest, PinnedUsageTest) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys.");
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
return;
}
@ -492,8 +498,10 @@ TEST_P(CacheTest, EvictionPolicyRef) {
Insert(302, 103);
Insert(303, 104);
// Insert entries much more than Cache capacity
for (int i = 0; i < kCacheSize * 2; i++) {
// Insert entries much more than cache capacity.
double load_factor =
std::min(fast_lru_cache::kLoadFactor, clock_cache::kLoadFactor);
for (int i = 0; i < 2 * static_cast<int>(kCacheSize / load_factor); i++) {
Insert(1000 + i, 2000 + i);
}
@ -523,8 +531,9 @@ TEST_P(CacheTest, EvictionPolicyRef) {
}
TEST_P(CacheTest, EvictEmptyCache) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys.");
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
return;
}
@ -534,8 +543,9 @@ TEST_P(CacheTest, EvictEmptyCache) {
}
TEST_P(CacheTest, EraseFromDeleter) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache requires 16 byte keys.");
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS("FastLRUCache and ClockCache require 16-byte keys.");
return;
}
@ -650,8 +660,10 @@ TEST_P(CacheTest, ReleaseWithoutErase) {
}
TEST_P(CacheTest, SetCapacity) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments.");
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS(
"FastLRUCache and ClockCache don't support capacity adjustments.");
return;
}
// test1: increase capacity
@ -702,9 +714,11 @@ TEST_P(CacheTest, SetCapacity) {
}
TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
if (GetParam() == kFast) {
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS(
"FastLRUCache doesn't support an unbounded number of inserts beyond "
"FastLRUCache and ClockCache don't support an unbounded number of "
"inserts beyond "
"capacity.");
return;
}
@ -759,8 +773,10 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
}
TEST_P(CacheTest, OverCapacity) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments.");
auto type = GetParam();
if (type == kFast || type == kClock) {
ROCKSDB_GTEST_BYPASS(
"FastLRUCache and ClockCache don't support capacity adjustments.");
return;
}
size_t n = 10;
@ -938,15 +954,10 @@ TEST_P(CacheTest, GetChargeAndDeleter) {
cache_->Release(h1);
}
#ifdef SUPPORT_CLOCK_CACHE
std::shared_ptr<Cache> (*new_clock_cache_func)(
size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache;
size_t, size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache;
INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
testing::Values(kLRU, kClock, kFast));
#else
INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
testing::Values(kLRU, kFast));
#endif // SUPPORT_CLOCK_CACHE
INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest,
testing::Values(kLRU, kFast));

cache/clock_cache.cc: 1198 changed lines (diff suppressed because it is too large).

cache/clock_cache.h: 441 changed lines

@ -9,8 +9,445 @@
#pragma once
#include <array>
#include <memory>
#include <string>
#include "cache/cache_key.h"
#include "cache/sharded_cache.h"
#include "port/lang.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace clock_cache {
// Clock cache implementation. This is based on FastLRUCache's open-addressed
// hash table. Importantly, it stores elements in an array, and resolves
// collision using a probing strategy. Visibility and referenceability of
// elements works as usual. See fast_lru_cache.h for a detailed description.
//
// The main difference with FastLRUCache is, not surprisingly, the eviction
// algorithm---instead of an LRU list, we maintain a circular list with the
// elements available for eviction, which the clock algorithm traverses to
// pick the next
// victim. The clock list is represented using the array of handles, and we
// simply mark those elements that are present in the list. This is done using
// different clock flags, namely NONE, LOW, MEDIUM, HIGH, that represent
// priorities: NONE means that the element is not part of the clock list, and
// LOW to HIGH represent how close an element is from being evictable (LOW being
// immediately evictable). When the clock pointer steps on an element that is
// not immediately evictable, it decreases its priority.
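
To make the eviction description above concrete, here is a rough sketch of the sweep (an editor's illustration, not the code in clock_cache.cc; it assumes the ClockHandle type declared below and at least one handle currently in the clock list):

// Advance the clock pointer over the handle array, decreasing priorities
// until a handle with LOW priority (immediately evictable) is reached.
inline ClockHandle* ExampleClockSweep(ClockHandle* array, uint32_t size,
                                      uint32_t& clock_pointer) {
  while (true) {
    ClockHandle* h = &array[clock_pointer];
    clock_pointer = (clock_pointer + 1) % size;
    if (h->GetPriority() == ClockHandle::ClockPriority::LOW) {
      return h;  // Victim found; the caller would evict and remove it.
    }
    if (h->IsInClockList()) {
      h->DecreasePriority();  // HIGH -> MEDIUM -> LOW.
    }
  }
}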
constexpr double kLoadFactor = 0.35; // See fast_lru_cache.h.
constexpr double kStrictLoadFactor = 0.7; // See fast_lru_cache.h.
// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
// An experimental (under development!) alternative to LRUCache
struct ClockHandle {
void* value;
Cache::DeleterFn deleter;
uint32_t hash;
size_t total_charge; // TODO(opt): Only allow uint32_t?
// The number of external refs to this entry.
uint32_t refs;
static constexpr int kIsVisibleOffset = 0;
static constexpr int kIsElementOffset = 1;
static constexpr int kClockPriorityOffset = 2;
enum Flags : uint8_t {
// Whether the handle is visible to Lookups.
IS_VISIBLE = (1 << kIsVisibleOffset),
// Whether the slot is in use by an element.
IS_ELEMENT = (1 << kIsElementOffset),
// Clock priorities. Represents how close a handle is from
// being evictable.
CLOCK_PRIORITY = (3 << kClockPriorityOffset),
};
uint8_t flags;
enum ClockPriority : uint8_t {
NONE = (0 << kClockPriorityOffset), // Not an element in the eyes of clock.
LOW = (1 << kClockPriorityOffset), // Immediately evictable.
MEDIUM = (2 << kClockPriorityOffset),
HIGH = (3 << kClockPriorityOffset)
// Priority is CLOCK_NONE if and only if
// (i) the handle is not an element, or
// (ii) the handle is an element but it is being referenced.
};
// The number of elements that hash to this slot or a lower one,
// but wind up in a higher slot.
uint32_t displacements;
std::array<char, kCacheKeySize> key_data;
ClockHandle() {
value = nullptr;
deleter = nullptr;
hash = 0;
total_charge = 0;
refs = 0;
flags = 0;
SetIsVisible(false);
SetIsElement(false);
SetPriority(ClockPriority::NONE);
displacements = 0;
key_data.fill(0);
}
Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
// Increase the reference count by 1.
void Ref() { refs++; }
// Just reduce the reference count by 1. Return true if it was last reference.
bool Unref() {
assert(refs > 0);
refs--;
return refs == 0;
}
// Return true if there are external refs, false otherwise.
bool HasRefs() const { return refs > 0; }
bool IsVisible() const { return flags & IS_VISIBLE; }
void SetIsVisible(bool is_visible) {
if (is_visible) {
flags |= IS_VISIBLE;
} else {
flags &= ~IS_VISIBLE;
}
}
bool IsElement() const { return flags & IS_ELEMENT; }
void SetIsElement(bool is_element) {
if (is_element) {
flags |= IS_ELEMENT;
} else {
flags &= ~IS_ELEMENT;
}
}
ClockPriority GetPriority() const {
return static_cast<ClockPriority>(flags & Flags::CLOCK_PRIORITY);
}
bool IsInClockList() const {
return GetPriority() != ClockHandle::ClockPriority::NONE;
}
void SetPriority(ClockPriority priority) {
flags &= ~Flags::CLOCK_PRIORITY;
flags |= priority;
}
void DecreasePriority() {
uint8_t p = static_cast<uint8_t>(flags & Flags::CLOCK_PRIORITY) >>
kClockPriorityOffset;
assert(p > 0);
p--;
flags &= ~Flags::CLOCK_PRIORITY;
ClockPriority new_priority =
static_cast<ClockPriority>(p << kClockPriorityOffset);
flags |= new_priority;
}
void FreeData() {
assert(refs == 0);
if (deleter) {
(*deleter)(key(), value);
}
}
// Calculate the memory usage by metadata.
inline size_t CalcMetaCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
if (metadata_charge_policy != kFullChargeCacheMetadata) {
return 0;
} else {
// #ifdef ROCKSDB_MALLOC_USABLE_SIZE
// return malloc_usable_size(
// const_cast<void*>(static_cast<const void*>(this)));
// #else
// TODO(Guido) malloc_usable_size only works when we call it on
// a pointer allocated with malloc. Because our handles are all
// allocated in a single shot as an array, the user can't call
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
// pointer returned by the cache. Moreover, malloc_usable_size
// expects a heap-allocated handle, but sometimes in our code we
// wish to pass a stack-allocated handle (this is only a performance
// concern).
// What is the right way to compute metadata charges with pre-allocated
// handles?
return sizeof(ClockHandle);
// #endif
}
}
inline void CalcTotalCharge(
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
total_charge = charge + CalcMetaCharge(metadata_charge_policy);
}
inline size_t GetCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
assert(total_charge >= meta_charge);
return total_charge - meta_charge;
}
inline bool IsEmpty() {
return !this->IsElement() && this->displacements == 0;
}
inline bool IsTombstone() {
return !this->IsElement() && this->displacements > 0;
}
inline bool Matches(const Slice& some_key) {
return this->IsElement() && this->key() == some_key;
}
}; // struct ClockHandle
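
For reference, the offsets above give the following flags layout (editor's note; the values follow directly from the constants in the struct):

// bit 0: IS_VISIBLE        (1 << kIsVisibleOffset == 0b0001)
// bit 1: IS_ELEMENT        (1 << kIsElementOffset == 0b0010)
// bits 2-3: CLOCK_PRIORITY (3 << kClockPriorityOffset == 0b1100)
//   NONE == 0b0000, LOW == 0b0100, MEDIUM == 0b1000, HIGH == 0b1100
// For example, a visible element with HIGH priority has flags == 0b1111.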
class ClockHandleTable {
public:
explicit ClockHandleTable(int hash_bits);
~ClockHandleTable();
// Returns a pointer to a visible element matching the key/hash, or
// nullptr if not present.
ClockHandle* Lookup(const Slice& key);
// Inserts a copy of h into the hash table.
// Returns a pointer to the inserted handle, or nullptr if no available
// slot was found. If an existing visible element matching the
// key/hash is already present in the hash table, the argument old
// is set to point to it; otherwise, it's set to nullptr.
ClockHandle* Insert(ClockHandle* h, ClockHandle** old);
// Removes h from the hash table. The handle must already be off
// the clock list.
void Remove(ClockHandle* h);
// Turns a visible element h into a ghost (i.e., not visible).
void Exclude(ClockHandle* h);
// Assigns a copy of h to the given slot.
void Assign(int slot, ClockHandle* h);
template <typename T>
void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) {
for (uint32_t i = index_begin; i < index_end; i++) {
ClockHandle* h = &array_[i];
if (h->IsVisible()) {
func(h);
}
}
}
#if defined(TBB) && !defined(ROCKSDB_LITE)
#define SUPPORT_CLOCK_CACHE
uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; }
int GetLengthBits() const { return length_bits_; }
uint32_t GetOccupancyLimit() const { return occupancy_limit_; }
uint32_t GetOccupancy() const { return occupancy_; }
// Returns x mod 2^{length_bits_}.
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
private:
friend class ClockCacheShard;
int FindVisibleElement(const Slice& key, int& probe, int displacement);
int FindAvailableSlot(const Slice& key, int& probe, int displacement);
int FindVisibleElementOrAvailableSlot(const Slice& key, int& probe,
int displacement);
// Returns the index of the first slot probed (hashing with
// the given key) with a handle e such that cond(e) is true.
// Otherwise, if no match is found, returns -1.
// For every handle e probed except the final slot, updates
// e->displacements += displacement.
// The argument probe is modified such that consecutive calls
// to FindSlot continue probing right after where the previous
// call left off.
int FindSlot(const Slice& key, std::function<bool(ClockHandle*)> cond,
int& probe, int displacement);
// Number of hash bits used for table index.
// The size of the table is 1 << length_bits_.
int length_bits_;
const uint32_t length_bits_mask_;
// Number of elements in the table.
uint32_t occupancy_;
// Maximum number of elements the user can store in the table.
uint32_t occupancy_limit_;
std::unique_ptr<ClockHandle[]> array_;
}; // class ClockHandleTable
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
public:
ClockCacheShard(size_t capacity, size_t estimated_value_size,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy);
~ClockCacheShard() override = default;
// Separate from constructor so caller can easily make an array of
// ClockCacheShard objects. If current usage is more than new capacity,
// the function will attempt to free the needed space.
void SetCapacity(size_t capacity) override;
// Set the flag to reject insertion if the cache is full.
void SetStrictCapacityLimit(bool strict_capacity_limit) override;
// Like Cache methods, but with an extra "hash" parameter.
// Insert an item into the hash table and, if handle is null, insert into
// the clock list. Older items are evicted as necessary. If the cache is full
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
Cache::DeleterFn deleter, Cache::Handle** handle,
Cache::Priority priority) override;
Status Insert(const Slice& key, uint32_t hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
Cache::Handle** handle, Cache::Priority priority) override {
return Insert(key, hash, value, charge, helper->del_cb, handle, priority);
}
Cache::Handle* Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* /*helper*/,
const Cache::CreateCallback& /*create_cb*/,
Cache::Priority /*priority*/, bool /*wait*/,
Statistics* /*stats*/) override {
return Lookup(key, hash);
}
Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
bool Release(Cache::Handle* handle, bool /*useful*/,
bool erase_if_last_ref) override {
return Release(handle, erase_if_last_ref);
}
bool IsReady(Cache::Handle* /*handle*/) override { return true; }
void Wait(Cache::Handle* /*handle*/) override {}
bool Ref(Cache::Handle* handle) override;
bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
void Erase(const Slice& key, uint32_t hash) override;
size_t GetUsage() const override;
size_t GetPinnedUsage() const override;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) override;
void EraseUnRefEntries() override;
std::string GetPrintableOptions() const override;
private:
friend class ClockCache;
void ClockRemove(ClockHandle* e);
void ClockInsert(ClockHandle* e);
// Free some space following strict clock policy until enough space
// to hold (usage_ + charge) is freed or the clock list is empty
// This function is not thread safe - it needs to be executed while
// holding the mutex_.
void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);
// Returns the charge of a single handle.
static size_t CalcEstimatedHandleCharge(
size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy);
// Returns the number of bits used to hash an element in the hash
// table.
static int CalcHashBits(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy);
// Initialized before use.
size_t capacity_;
// Whether to reject insertion if cache reaches its full capacity.
bool strict_capacity_limit_;
uint32_t clock_pointer_;
// ------------^^^^^^^^^^^^^-----------
// Not frequently modified data members
// ------------------------------------
//
// We separate data members that are updated frequently from the ones that
// are not frequently updated so that they don't share the same cache line
// which will lead into false cache sharing
//
// ------------------------------------
// Frequently modified data members
// ------------vvvvvvvvvvvvv-----------
ClockHandleTable table_;
// Memory size for entries residing in the cache.
size_t usage_;
// Memory size for unpinned entries in the clock list.
size_t clock_usage_;
// mutex_ protects the following state.
// We don't count mutex_ as the cache's internal state so semantically we
// don't mind mutex_ invoking the non-const actions.
mutable DMutex mutex_;
}; // class ClockCacheShard
class ClockCache
#ifdef NDEBUG
final
#endif
: public ShardedCache {
public:
ClockCache(size_t capacity, size_t estimated_value_size, int num_shard_bits,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata);
~ClockCache() override;
const char* Name() const override { return "ClockCache"; }
CacheShard* GetShard(uint32_t shard) override;
const CacheShard* GetShard(uint32_t shard) const override;
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
uint32_t GetHash(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void DisownData() override;
private:
ClockCacheShard* shards_ = nullptr;
int num_shards_ = 0;
}; // class ClockCache
} // namespace clock_cache
} // namespace ROCKSDB_NAMESPACE

@ -9,8 +9,6 @@
#include "cache/fast_lru_cache.h"
#include <math.h>
#include <cassert>
#include <cstdint>
#include <cstdio>
@ -21,39 +19,25 @@
#include "port/lang.h"
#include "util/distributed_mutex.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/random.h"
namespace ROCKSDB_NAMESPACE {
namespace fast_lru_cache {
namespace {
// Returns x % 2^{bits}.
inline uint32_t BinaryMod(uint32_t x, uint8_t bits) {
assert(bits <= 32);
return (x << (32 - bits)) >> (32 - bits);
}
} // anonymous namespace
LRUHandleTable::LRUHandleTable(uint8_t hash_bits)
LRUHandleTable::LRUHandleTable(int hash_bits)
: length_bits_(hash_bits),
length_bits_mask_((uint32_t{1} << length_bits_) - 1),
occupancy_(0),
occupancy_limit_(static_cast<uint32_t>((uint32_t{1} << length_bits_) *
kStrictLoadFactor)),
array_(new LRUHandle[size_t{1} << length_bits_]) {
assert(hash_bits <= 32);
}
LRUHandleTable::~LRUHandleTable() {
// TODO(Guido) If users still hold references to handles,
// those will become invalidated. And if we choose not to
// delete the data, it will become leaked.
ApplyToEntriesRange(
[](LRUHandle* h) {
// TODO(Guido) Remove the HasRefs() check?
if (!h->HasRefs()) {
h->FreeData();
}
},
0, uint32_t{1} << length_bits_);
ApplyToEntriesRange([](LRUHandle* h) { h->FreeData(); }, 0, GetTableSize());
}
LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
@ -161,11 +145,10 @@ int LRUHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key,
inline int LRUHandleTable::FindSlot(const Slice& key,
std::function<bool(LRUHandle*)> cond,
int& probe, int displacement) {
uint32_t base =
BinaryMod(Hash(key.data(), key.size(), kProbingSeed1), length_bits_);
uint32_t increment = BinaryMod(
(Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1, length_bits_);
uint32_t current = BinaryMod(base + probe * increment, length_bits_);
uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1));
uint32_t increment =
ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1);
uint32_t current = ModTableSize(base + probe * increment);
while (true) {
LRUHandle* h = &array_[current];
probe++;
@ -182,7 +165,7 @@ inline int LRUHandleTable::FindSlot(const Slice& key,
return -1;
}
h->displacements += displacement;
current = BinaryMod(current + increment, length_bits_);
current = ModTableSize(current + increment);
}
}
@ -233,7 +216,7 @@ void LRUCacheShard::ApplyToSomeEntries(
// hash bits for table indexes.
DMutexLock l(mutex_);
uint32_t length_bits = table_.GetLengthBits();
uint32_t length = uint32_t{1} << length_bits;
uint32_t length = table_.GetTableSize();
assert(average_entries_per_lock > 0);
// Assuming we are called with same average_entries_per_lock repeatedly,
@ -302,22 +285,19 @@ size_t LRUCacheShard::CalcEstimatedHandleCharge(
return h.total_charge;
}
uint8_t LRUCacheShard::CalcHashBits(
int LRUCacheShard::CalcHashBits(
size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
size_t handle_charge =
CalcEstimatedHandleCharge(estimated_value_size, metadata_charge_policy);
size_t num_entries =
static_cast<size_t>(capacity / (kLoadFactor * handle_charge));
// Compute the ceiling of log2(num_entries). If num_entries == 0, return 0.
uint8_t num_hash_bits = 0;
size_t num_entries_copy = num_entries;
while (num_entries_copy >>= 1) {
++num_hash_bits;
uint32_t num_entries =
static_cast<uint32_t>(capacity / (kLoadFactor * handle_charge));
if (num_entries == 0) {
return 0;
}
num_hash_bits += size_t{1} << num_hash_bits < num_entries ? 1 : 0;
return num_hash_bits;
int hash_bits = FloorLog2(num_entries);
return hash_bits + (size_t{1} << hash_bits < num_entries ? 1 : 0);
}
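
A worked example of the ceiling-of-log2 computation above (editor's illustration, numbers hypothetical):

// num_entries == 100:
//   FloorLog2(100) == 6, and (size_t{1} << 6) == 64 < 100,
//   so CalcHashBits returns 6 + 1 == 7, i.e. a table of 1 << 7 == 128 slots.
// num_entries == 64:
//   FloorLog2(64) == 6, and 64 < 64 is false, so it returns 6 (64 slots).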
void LRUCacheShard::SetCapacity(size_t capacity) {
@ -362,33 +342,51 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
autovector<LRUHandle> last_reference_list;
{
DMutexLock l(mutex_);
assert(table_.GetOccupancy() <= table_.GetOccupancyLimit());
// Free the space following strict LRU policy until enough space
// is freed or the lru list is empty.
EvictFromLRU(tmp.total_charge, &last_reference_list);
if ((usage_ + tmp.total_charge > capacity_ &&
(strict_capacity_limit_ || handle == nullptr)) ||
table_.GetOccupancy() == size_t{1} << table_.GetLengthBits()) {
// Originally, when strict_capacity_limit_ == false and handle != nullptr
// (i.e., the user wants to immediately get a reference to the new
// handle), the insertion would proceed even if the total charge already
// exceeds capacity. We can't do this now, because we can't physically
// insert a new handle when the table is at maximum occupancy.
table_.GetOccupancy() == table_.GetOccupancyLimit()) {
// There are two measures of capacity:
// - Space (or charge) capacity: The maximum possible sum of the charges
// of the elements.
// - Table capacity: The number of slots in the hash table.
// These are incomparable, in the sense that one doesn't imply the other.
// Typically we will reach space capacity before table capacity---
// if the user always inserts values with size equal to
// estimated_value_size, then at most a kLoadFactor fraction of slots
// will ever be occupied. But in some cases we may reach table capacity
// before space capacity---if the user initially claims a very large
// estimated_value_size but then inserts tiny values, more elements than
// initially estimated will be inserted.
// TODO(Guido) Some tests (at least two from cache_test, as well as the
// stress tests) currently assume the old behavior.
// stress tests) currently assume the table capacity is unbounded.
if (handle == nullptr) {
// Don't insert the entry but still return ok, as if the entry inserted
// into cache and get evicted immediately.
last_reference_list.push_back(tmp);
} else {
s = Status::Incomplete("Insert failed due to LRU cache being full.");
if (table_.GetOccupancy() == table_.GetOccupancyLimit()) {
s = Status::Incomplete(
"Insert failed because all slots in the hash table are full.");
// TODO(Guido) Use the correct statuses.
} else {
s = Status::Incomplete(
"Insert failed because the total charge has exceeded the "
"capacity.");
}
}
} else {
// Insert into the cache. Note that the cache might get larger than its
// capacity if not enough space was freed up.
LRUHandle* old;
LRUHandle* h = table_.Insert(&tmp, &old);
assert(h != nullptr); // Insertions should never fail.
assert(h != nullptr); // We're below occupancy, so this insertion should
// never fail.
usage_ += h->total_charge;
if (old != nullptr) {
s = Status::OkOverwritten();
@ -431,7 +429,8 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
if (h != nullptr) {
assert(h->IsVisible());
if (!h->HasRefs()) {
// The entry is in LRU since it's in hash and has no external references
// The entry is in LRU since it's in hash and has no external
// references.
LRU_Remove(h);
}
h->Ref();
@ -497,7 +496,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
table_.Exclude(h);
if (!h->HasRefs()) {
// The entry is in LRU since it's in cache and has no external
// references
// references.
LRU_Remove(h);
table_.Remove(h);
assert(usage_ >= h->total_charge);

@ -78,8 +78,9 @@ class FastLRUCacheTest;
// times at most a fraction p of all slots, without counting tombstones,
// are occupied by elements. This means that the probability that a
// random probe hits an empty slot is at most p, and thus at most 1/p probes
// are required on average. We use p = 70%, so between 1 and 2 probes are
// needed on average.
// are required on average. For example, p = 70% implies that between 1 and 2
// probes are needed on average (bear in mind that this reasoning doesn't
// consider the effects of clustering over time).
// Because the size of the hash table is always rounded up to the next
// power of 2, p is really an upper bound on the actual load factor---the
// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
@ -87,7 +88,12 @@ class FastLRUCacheTest;
// Since space cost is dominated by the values (the LSM blocks),
// overprovisioning the table with metadata only increases the total cache space
// usage by a tiny fraction.
constexpr double kLoadFactor = 0.7;
constexpr double kLoadFactor = 0.35;
// The user can exceed kLoadFactor if the sizes of the inserted values don't
// match estimated_value_size, or if strict_capacity_limit == false. To
// keep performance from plunging, we set a strict upper bound on the load factor.
constexpr double kStrictLoadFactor = 0.7;
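
To illustrate the two constants (editor's note, with hypothetical numbers):

// Suppose capacity and the estimated handle charge leave room for ~100
// elements. CalcHashBits then asks for 100 / 0.35 ~= 286 slots, which is
// rounded up to 512, so the realized load factor at full capacity is about
// 100 / 512 ~= 0.2, i.e. between kLoadFactor / 2 and kLoadFactor. Under the
// usual uniform-hashing approximation, a probe sequence at load q touches
// about 1 / (1 - q) slots on average: ~1.25 probes at q = 0.2, and ~3.3
// probes even at the strict bound q = kStrictLoadFactor = 0.7.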
// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
@ -103,7 +109,7 @@ struct LRUHandle {
size_t total_charge; // TODO(opt): Only allow uint32_t?
// The hash of key(). Used for fast sharding and comparisons.
uint32_t hash;
// The number of external refs to this entry. The cache itself is not counted.
// The number of external refs to this entry.
uint32_t refs;
enum Flags : uint8_t {
@ -226,16 +232,10 @@ struct LRUHandle {
}
};
// TODO(Guido) Update the following comment.
// We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations
// we have tested. E.g., readrandom speeds up by ~5% over the g++
// 4.4.3's builtin hashtable.
class LRUHandleTable {
public:
explicit LRUHandleTable(uint8_t hash_bits);
explicit LRUHandleTable(int hash_bits);
~LRUHandleTable();
// Returns a pointer to a visible element matching the key/hash, or
@ -269,10 +269,17 @@ class LRUHandleTable {
}
}
uint8_t GetLengthBits() const { return length_bits_; }
uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; }
int GetLengthBits() const { return length_bits_; }
uint32_t GetOccupancyLimit() const { return occupancy_limit_; }
uint32_t GetOccupancy() const { return occupancy_; }
// Returns x mod 2^{length_bits_}.
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
private:
int FindVisibleElement(const Slice& key, uint32_t hash, int& probe,
int displacement);
@ -295,11 +302,16 @@ class LRUHandleTable {
// Number of hash bits used for table index.
// The size of the table is 1 << length_bits_.
uint8_t length_bits_;
int length_bits_;
const uint32_t length_bits_mask_;
// Number of elements in the table.
uint32_t occupancy_;
// Maximum number of elements the user can store in the table.
uint32_t occupancy_limit_;
std::unique_ptr<LRUHandle[]> array_;
};
@ -374,7 +386,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
void LRU_Insert(LRUHandle* e);
// Free some space following strict LRU policy until enough space
// to hold (usage_ + charge) is freed or the lru list is empty
// to hold (usage_ + charge) is freed or the LRU list is empty
// This function is not thread safe - it needs to be executed while
// holding the mutex_.
void EvictFromLRU(size_t charge, autovector<LRUHandle>* deleted);
@ -386,8 +398,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
// Returns the number of bits used to hash an element in the hash
// table.
static uint8_t CalcHashBits(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy);
static int CalcHashBits(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy);
// Initialized before use.
size_t capacity_;

@ -9,6 +9,7 @@
#include <vector>
#include "cache/cache_key.h"
#include "cache/clock_cache.h"
#include "cache/fast_lru_cache.h"
#include "db/db_test_util.h"
#include "file/sst_file_manager_impl.h"
@ -207,8 +208,8 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
}
namespace fast_lru_cache {
// TODO(guido) Consolidate the following FastLRUCache tests with
// that of LRUCache.
// TODO(guido) Replicate LRU policy tests from LRUCache here.
class FastLRUCacheTest : public testing::Test {
public:
FastLRUCacheTest() {}
@ -246,9 +247,8 @@ class FastLRUCacheTest : public testing::Test {
estimated_value_size, metadata_charge_policy);
}
uint8_t CalcHashBitsWrapper(
size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
return fast_lru_cache::LRUCacheShard::CalcHashBits(
capacity, estimated_value_size, metadata_charge_policy);
}
@ -262,7 +262,7 @@ class FastLRUCacheTest : public testing::Test {
return capacity / (fast_lru_cache::kLoadFactor * handle_charge);
}
bool TableSizeIsAppropriate(uint8_t hash_bits, double max_occupancy) {
bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) {
if (hash_bits == 0) {
return max_occupancy <= 1;
} else {
@ -292,8 +292,8 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata;
double max_occupancy =
CalcMaxOccupancy(capacity, estimated_value_size, metadata_charge_policy);
uint8_t hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
metadata_charge_policy);
int hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
metadata_charge_policy);
EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
capacity = 1024;
@ -342,6 +342,127 @@ TEST_F(FastLRUCacheTest, CalcHashBitsTest) {
} // namespace fast_lru_cache
namespace clock_cache {
class ClockCacheTest : public testing::Test {
public:
ClockCacheTest() {}
~ClockCacheTest() override { DeleteShard(); }
void DeleteShard() {
if (shard_ != nullptr) {
shard_->~ClockCacheShard();
port::cacheline_aligned_free(shard_);
shard_ = nullptr;
}
}
void NewShard(size_t capacity) {
DeleteShard();
shard_ = reinterpret_cast<ClockCacheShard*>(
port::cacheline_aligned_alloc(sizeof(ClockCacheShard)));
new (shard_) ClockCacheShard(capacity, 1, true /*strict_capacity_limit*/,
kDontChargeCacheMetadata);
}
Status Insert(const std::string& key,
Cache::Priority priority = Cache::Priority::LOW) {
return shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
nullptr /*deleter*/, nullptr /*handle*/, priority);
}
Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
return Insert(std::string(kCacheKeySize, key), priority);
}
Status Insert(char key, size_t len) { return Insert(std::string(len, key)); }
bool Lookup(const std::string& key) {
auto handle = shard_->Lookup(key, 0 /*hash*/);
if (handle) {
shard_->Release(handle);
return true;
}
return false;
}
bool Lookup(char key) { return Lookup(std::string(kCacheKeySize, key)); }
void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); }
// void ValidateLRUList(std::vector<std::string> keys,
// size_t num_high_pri_pool_keys = 0) {
// LRUHandle* lru;
// LRUHandle* lru_low_pri;
// cache_->TEST_GetLRUList(&lru, &lru_low_pri);
// LRUHandle* iter = lru;
// bool in_high_pri_pool = false;
// size_t high_pri_pool_keys = 0;
// if (iter == lru_low_pri) {
// in_high_pri_pool = true;
// }
// for (const auto& key : keys) {
// iter = iter->next;
// ASSERT_NE(lru, iter);
// ASSERT_EQ(key, iter->key().ToString());
// ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool());
// if (in_high_pri_pool) {
// high_pri_pool_keys++;
// }
// if (iter == lru_low_pri) {
// ASSERT_FALSE(in_high_pri_pool);
// in_high_pri_pool = true;
// }
// }
// ASSERT_EQ(lru, iter->next);
// ASSERT_TRUE(in_high_pri_pool);
// ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys);
// }
private:
clock_cache::ClockCacheShard* shard_ = nullptr;
};
TEST_F(ClockCacheTest, Validate) {
NewShard(3);
EXPECT_OK(Insert('a', 16));
EXPECT_NOK(Insert('b', 15));
EXPECT_OK(Insert('b', 16));
EXPECT_NOK(Insert('c', 17));
EXPECT_NOK(Insert('d', 1000));
EXPECT_NOK(Insert('e', 11));
EXPECT_NOK(Insert('f', 0));
}
TEST_F(ClockCacheTest, ClockPriorityTest) {
clock_cache::ClockHandle handle;
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::NONE);
handle.SetPriority(clock_cache::ClockHandle::ClockPriority::HIGH);
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::HIGH);
handle.DecreasePriority();
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::MEDIUM);
handle.DecreasePriority();
EXPECT_EQ(handle.GetPriority(), clock_cache::ClockHandle::ClockPriority::LOW);
handle.SetPriority(clock_cache::ClockHandle::ClockPriority::MEDIUM);
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::MEDIUM);
handle.SetPriority(clock_cache::ClockHandle::ClockPriority::NONE);
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::NONE);
handle.SetPriority(clock_cache::ClockHandle::ClockPriority::MEDIUM);
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::MEDIUM);
handle.DecreasePriority();
handle.DecreasePriority();
EXPECT_EQ(handle.GetPriority(),
clock_cache::ClockHandle::ClockPriority::NONE);
}
} // namespace clock_cache
class TestSecondaryCache : public SecondaryCache {
public:
// Specifies what action to take on a lookup for a particular key

@ -932,7 +932,9 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) {
int iterations_tested = 0;
for (std::shared_ptr<Cache> base_cache :
{NewLRUCache(capacity, num_shard_bits),
NewClockCache(capacity, num_shard_bits),
NewClockCache(capacity, 1 /*estimated_value_size*/, num_shard_bits,
false /*strict_capacity_limit*/,
kDefaultCacheMetadataChargePolicy),
NewFastLRUCache(capacity, 1 /*estimated_value_size*/, num_shard_bits,
false /*strict_capacity_limit*/,
kDefaultCacheMetadataChargePolicy)}) {
@ -1288,11 +1290,10 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
const size_t capacity = size_t{1} << 25;
int iterations_tested = 0;
for (bool partition : {false, true}) {
for (std::shared_ptr<Cache> cache :
{NewLRUCache(capacity), NewClockCache(capacity)}) {
// This test doesn't support FastLRUCache because the
for (std::shared_ptr<Cache> cache : {NewLRUCache(capacity)}) {
// This test doesn't support FastLRUCache nor ClockCache because the
// keys used are not 16B long.
// TODO(guido) Add support for FastLRUCache.
// TODO(guido) Add support for FastLRUCache and ClockCache.
if (!cache) {
// Skip clock cache when not supported
continue;

@ -114,7 +114,9 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
}
if (FLAGS_cache_type == "clock_cache") {
auto cache = NewClockCache((size_t)capacity);
auto cache = NewClockCache(static_cast<size_t>(capacity), FLAGS_block_size,
num_shard_bits, false /*strict_capacity_limit*/,
kDefaultCacheMetadataChargePolicy);
if (!cache) {
fprintf(stderr, "Clock cache not supported.");
exit(1);

@ -174,19 +174,15 @@ extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts);
// Similar to NewLRUCache, but create a cache based on CLOCK algorithm with
// Similar to NewLRUCache, but create a cache based on clock algorithm with
// better concurrent performance in some cases. See util/clock_cache.cc for
// more detail.
//
// Return nullptr if it is not supported.
//
// BROKEN: ClockCache is known to have bugs that could lead to crash or
// corruption, so should not be used until fixed. Use NewLRUCache instead.
extern std::shared_ptr<Cache> NewClockCache(
size_t capacity, int num_shard_bits = -1,
bool strict_capacity_limit = false,
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy);
size_t capacity, size_t estimated_value_size, int num_shard_bits,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy);
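
For illustration, a call against the new signature might look like the following (values arbitrary; assumes the usual includes and ROCKSDB_NAMESPACE; note that this PR's ClockCache, like FastLRUCache, expects 16-byte keys):

std::shared_ptr<Cache> cache = NewClockCache(
    64 << 20 /*capacity*/, 4096 /*estimated_value_size*/,
    -1 /*num_shard_bits*/, false /*strict_capacity_limit*/,
    kDefaultCacheMetadataChargePolicy);
if (cache) {  // nullptr if the clock cache is not supported.
  std::string key(16, 'k');
  Status s = cache->Insert(key, nullptr /*value*/, 1 /*charge*/,
                           nullptr /*deleter*/);
  if (s.ok()) {
    Cache::Handle* h = cache->Lookup(key);
    if (h != nullptr) {
      cache->Release(h);
    }
  }
}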
class Cache {
public:

@ -23,8 +23,10 @@ jlong Java_org_rocksdb_ClockCache_newClockCache(
jboolean jstrict_capacity_limit) {
auto* sptr_clock_cache = new std::shared_ptr<ROCKSDB_NAMESPACE::Cache>(
ROCKSDB_NAMESPACE::NewClockCache(
static_cast<size_t>(jcapacity), static_cast<int>(jnum_shard_bits),
static_cast<bool>(jstrict_capacity_limit)));
static_cast<size_t>(jcapacity), 1 /* estimated_value_size */,
static_cast<int>(jnum_shard_bits),
static_cast<bool>(jstrict_capacity_limit),
rocksdb::CacheMetadataChargePolicy::kFullChargeCacheMetadata));
return GET_CPLUSPLUS_POINTER(sptr_clock_cache);
}

@ -2971,7 +2971,9 @@ class Benchmark {
}
if (FLAGS_cache_type == "clock_cache") {
auto cache = NewClockCache(static_cast<size_t>(capacity),
FLAGS_cache_numshardbits);
FLAGS_block_size, FLAGS_cache_numshardbits,
false /*strict_capacity_limit*/,
kDefaultCacheMetadataChargePolicy);
if (!cache) {
fprintf(stderr, "Clock cache not supported.");
exit(1);

@ -115,8 +115,8 @@ default_params = {
"use_direct_reads": lambda: random.randint(0, 1),
"use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
"mock_direct_io": False,
"cache_type": "lru_cache", # clock_cache is broken
# fast_lru_cache is currently incompatible with stress tests, because they use strict_capacity_limit = false
"cache_type": "lru_cache", # fast_lru_cache and clock_cache are currently incompatible
# with stress tests, because they use strict_capacity_limit = false
"use_full_merge_v1": lambda: random.randint(0, 1),
"use_merge": lambda: random.randint(0, 1),
# 999 -> use Bloom API
