Replace per-shard chained hash tables with open-addressing scheme (#10194)

Summary:
In FastLRUCache, we replace the current chained per-shard hash table with an open-addressing hash table. In particular, this allows us to preallocate all handles.

Because all handles are preallocated, this implementation doesn't support strict_capacity_limit = false (i.e., allowing insertions beyond the predefined capacity). This clashes with current assumptions of some tests, namely two tests in cache_test and the crash tests. We have disabled these for now.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10194

Test Plan: ``make -j24 check``

Reviewed By: pdillinger

Differential Revision: D37296770

Pulled By: guidotag

fbshipit-source-id: 232ff1b8260331d868ebf4e3e5d8ad709390b0ad
main
Guido Tagliavini Ponce 2 years ago committed by Facebook GitHub Bot
parent deff48bcef
commit 3afed7408c
  1. 14
      cache/cache_test.cc
  2. 424
      cache/fast_lru_cache.cc
  3. 253
      cache/fast_lru_cache.h
  4. 3
      tools/db_crashtest.py

@ -650,6 +650,10 @@ TEST_P(CacheTest, ReleaseWithoutErase) {
} }
TEST_P(CacheTest, SetCapacity) { TEST_P(CacheTest, SetCapacity) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments.");
return;
}
// test1: increase capacity // test1: increase capacity
// lets create a cache with capacity 5, // lets create a cache with capacity 5,
// then, insert 5 elements, then increase capacity // then, insert 5 elements, then increase capacity
@ -698,6 +702,12 @@ TEST_P(CacheTest, SetCapacity) {
} }
TEST_P(LRUCacheTest, SetStrictCapacityLimit) { TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS(
"FastLRUCache doesn't support an unbounded number of inserts beyond "
"capacity.");
return;
}
// test1: set the flag to false. Insert more keys than capacity. See if they // test1: set the flag to false. Insert more keys than capacity. See if they
// all go through. // all go through.
std::shared_ptr<Cache> cache = NewCache(5, 0, false); std::shared_ptr<Cache> cache = NewCache(5, 0, false);
@ -749,6 +759,10 @@ TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
} }
TEST_P(CacheTest, OverCapacity) { TEST_P(CacheTest, OverCapacity) {
if (GetParam() == kFast) {
ROCKSDB_GTEST_BYPASS("FastLRUCache doesn't support capacity adjustments.");
return;
}
size_t n = 10; size_t n = 10;
// a LRUCache with n entries and one shard only // a LRUCache with n entries and one shard only

@ -9,77 +9,191 @@
#include "cache/fast_lru_cache.h" #include "cache/fast_lru_cache.h"
#include <math.h>
#include <cassert> #include <cassert>
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#include <functional>
#include "monitoring/perf_context_imp.h" #include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h" #include "monitoring/statistics.h"
#include "port/lang.h" #include "port/lang.h"
#include "util/distributed_mutex.h" #include "util/distributed_mutex.h"
#include "util/hash.h"
#define KEY_LENGTH \ #include "util/random.h"
16 // TODO(guido) Make use of this symbol in other parts of the source code
// (e.g., cache_key.h, cache_test.cc, etc.)
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
namespace fast_lru_cache { namespace fast_lru_cache {
LRUHandleTable::LRUHandleTable(int hash_bits) namespace {
// Returns x % 2^{bits}, for any bits in [0, 32].
//
// Implemented with a mask rather than a left/right shift pair: shifting a
// 32-bit value by 32 positions (which the shift form does when bits == 0)
// is undefined behavior in C++.
inline uint32_t BinaryMod(uint32_t x, uint8_t bits) {
  assert(bits <= 32);
  if (bits >= 32) {
    // 2^32 is larger than every uint32_t, so the remainder is x itself.
    // Handled separately because uint32_t{1} << 32 would also be undefined.
    return x;
  }
  // Keep the low `bits` bits; for bits == 0 the mask is 0 and the result is 0.
  return x & ((uint32_t{1} << bits) - 1);
}
} // anonymous namespace
LRUHandleTable::LRUHandleTable(uint8_t hash_bits)
: length_bits_(hash_bits), : length_bits_(hash_bits),
list_(new LRUHandle* [size_t{1} << length_bits_] {}) {} occupancy_(0),
array_(new LRUHandle[size_t{1} << length_bits_]) {
assert(hash_bits <= 32);
}
LRUHandleTable::~LRUHandleTable() { LRUHandleTable::~LRUHandleTable() {
// TODO(Guido) If users still hold references to handles,
// those will become invalidated. And if we choose not to
// delete the data, it will become leaked.
ApplyToEntriesRange( ApplyToEntriesRange(
[](LRUHandle* h) { [](LRUHandle* h) {
// TODO(Guido) Remove the HasRefs() check?
if (!h->HasRefs()) { if (!h->HasRefs()) {
h->Free(); h->FreeData();
} }
}, },
0, uint32_t{1} << length_bits_); 0, uint32_t{1} << length_bits_);
} }
LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
return *FindPointer(key, hash); int probe = 0;
} int slot = FindVisibleElement(key, hash, probe, 0);
return (slot == -1) ? nullptr : &array_[slot];
inline LRUHandle** LRUHandleTable::Head(uint32_t hash) { }
return &list_[hash >> (32 - length_bits_)];
} LRUHandle* LRUHandleTable::Insert(LRUHandle* h, LRUHandle** old) {
int probe = 0;
LRUHandle* LRUHandleTable::Insert(LRUHandle* h) { int slot = FindVisibleElementOrAvailableSlot(h->key(), h->hash, probe,
LRUHandle** ptr = FindPointer(h->key(), h->hash); 1 /*displacement*/);
LRUHandle* old = *ptr; *old = nullptr;
h->next_hash = (old == nullptr ? nullptr : old->next_hash); if (slot == -1) {
*ptr = h; return nullptr;
return old; }
if (array_[slot].IsEmpty() || array_[slot].IsTombstone()) {
bool empty = array_[slot].IsEmpty();
Assign(slot, h);
LRUHandle* new_entry = &array_[slot];
if (empty) {
// This used to be an empty slot.
return new_entry;
}
// It used to be a tombstone, so there may already be a copy of the
// key in the table.
slot = FindVisibleElement(h->key(), h->hash, probe, 0 /*displacement*/);
if (slot == -1) {
// No existing copy of the key.
return new_entry;
}
*old = &array_[slot];
return new_entry;
} else {
// There is an existing copy of the key.
*old = &array_[slot];
// Find an available slot for the new element.
array_[slot].displacements++;
slot = FindAvailableSlot(h->key(), probe, 1 /*displacement*/);
if (slot == -1) {
// No available slots. Roll back displacements.
probe = 0;
slot = FindVisibleElement(h->key(), h->hash, probe, -1);
array_[slot].displacements--;
FindAvailableSlot(h->key(), probe, -1);
return nullptr;
}
Assign(slot, h);
return &array_[slot];
}
}
// Takes h out of the hash table. The handle must already be unlinked from
// the LRU list (both list pointers null).
void LRUHandleTable::Remove(LRUHandle* h) {
  assert(h->next == nullptr &&
         h->prev == nullptr);  // Already off the LRU list.

  // Re-walk the probe sequence of h's key until we reach h itself, passing a
  // displacement of -1 so FindSlot decrements the counter of every slot it
  // steps over on the way.
  const auto is_this_handle = [h](LRUHandle* candidate) {
    return candidate == h;
  };
  int probe_count = 0;
  FindSlot(h->key(), is_this_handle, probe_count, -1 /*displacement*/);

  // The slot no longer holds an element (and hence is not visible either).
  h->SetIsElement(false);
  h->SetIsVisible(false);
  --occupancy_;
}
void LRUHandleTable::Assign(int slot, LRUHandle* h) {
LRUHandle* dst = &array_[slot];
uint32_t disp = dst->displacements;
*dst = *h;
dst->displacements = disp;
dst->SetIsVisible(true);
dst->SetIsElement(true);
occupancy_++;
}
// Clears the visibility flag of h so that lookups no longer match it.
// Nothing else about the handle or the table is modified here.
void LRUHandleTable::Exclude(LRUHandle* h) {
  h->SetIsVisible(false);
}
int LRUHandleTable::FindVisibleElement(const Slice& key, uint32_t hash,
int& probe, int displacement) {
return FindSlot(
key,
[&](LRUHandle* h) { return h->Matches(key, hash) && h->IsVisible(); },
probe, displacement);
}
int LRUHandleTable::FindAvailableSlot(const Slice& key, int& probe,
int displacement) {
return FindSlot(
key, [](LRUHandle* h) { return h->IsEmpty() || h->IsTombstone(); }, probe,
displacement);
}
// Probes along the sequence of `key` and stops at the first slot that is
// either available (empty or tombstone) or holds a visible element matching
// key/hash.
//   key, hash     - identify the element being searched for.
//   probe         - in/out probe counter, forwarded to FindSlot.
//   displacement  - forwarded to FindSlot, which adds it to the displacement
//                   counter of every slot it steps over.
// Returns FindSlot's result: the index of the slot found, or -1.
int LRUHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key,
uint32_t hash, int& probe,
int displacement) {
return FindSlot(
key,
[&](LRUHandle* h) {
// Available slots are accepted first; otherwise the slot must be a
// visible copy of the key.
return h->IsEmpty() || h->IsTombstone() ||
(h->Matches(key, hash) && h->IsVisible());
},
probe, displacement);
} }
LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) { inline int LRUHandleTable::FindSlot(const Slice& key,
LRUHandle** ptr = FindPointer(key, hash); std::function<bool(LRUHandle*)> cond,
LRUHandle* result = *ptr; int& probe, int displacement) {
if (result != nullptr) { uint32_t base =
*ptr = result->next_hash; BinaryMod(Hash(key.data(), key.size(), kProbingSeed1), length_bits_);
uint32_t increment = BinaryMod(
(Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1, length_bits_);
uint32_t current = BinaryMod(base + probe * increment, length_bits_);
while (true) {
LRUHandle* h = &array_[current];
probe++;
if (current == base && probe > 1) {
// We looped back.
return -1;
} }
return result; if (cond(h)) {
} return current;
}
LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { if (h->IsEmpty()) {
LRUHandle** ptr = &list_[hash >> (32 - length_bits_)]; // We check emptyness after the condition, because
while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) { // the condition may be emptyness.
ptr = &(*ptr)->next_hash; return -1;
}
h->displacements += displacement;
current = BinaryMod(current + increment, length_bits_);
} }
return ptr;
} }
LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size, LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size,
bool strict_capacity_limit, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy) CacheMetadataChargePolicy metadata_charge_policy)
: capacity_(0), : capacity_(capacity),
strict_capacity_limit_(strict_capacity_limit), strict_capacity_limit_(strict_capacity_limit),
table_( table_(
GetHashBits(capacity, estimated_value_size, metadata_charge_policy)), CalcHashBits(capacity, estimated_value_size, metadata_charge_policy) +
static_cast<uint8_t>(ceil(log2(1.0 / kLoadFactor)))),
usage_(0), usage_(0),
lru_usage_(0) { lru_usage_(0) {
set_metadata_charge_policy(metadata_charge_policy); set_metadata_charge_policy(metadata_charge_policy);
@ -87,29 +201,27 @@ LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size,
lru_.next = &lru_; lru_.next = &lru_;
lru_.prev = &lru_; lru_.prev = &lru_;
lru_low_pri_ = &lru_; lru_low_pri_ = &lru_;
SetCapacity(capacity);
} }
void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::EraseUnRefEntries() {
autovector<LRUHandle*> last_reference_list; autovector<LRUHandle> last_reference_list;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
while (lru_.next != &lru_) { while (lru_.next != &lru_) {
LRUHandle* old = lru_.next; LRUHandle* old = lru_.next;
// LRU list contains only elements which can be evicted. // LRU list contains only elements which can be evicted.
assert(old->InCache() && !old->HasRefs()); assert(old->IsVisible() && !old->HasRefs());
LRU_Remove(old); LRU_Remove(old);
table_.Remove(old->key(), old->hash); table_.Remove(old);
old->SetInCache(false);
assert(usage_ >= old->total_charge); assert(usage_ >= old->total_charge);
usage_ -= old->total_charge; usage_ -= old->total_charge;
last_reference_list.push_back(old); last_reference_list.push_back(*old);
} }
} }
// Free the entries here outside of mutex for performance reasons. // Free the entries here outside of mutex for performance reasons.
for (auto entry : last_reference_list) { for (auto& h : last_reference_list) {
entry->Free(); h.FreeData();
} }
} }
@ -148,57 +260,48 @@ void LRUCacheShard::ApplyToSomeEntries(
index_begin, index_end); index_begin, index_end);
} }
void LRUCacheShard::LRU_Remove(LRUHandle* e) { void LRUCacheShard::LRU_Remove(LRUHandle* h) {
assert(e->next != nullptr); assert(h->next != nullptr);
assert(e->prev != nullptr); assert(h->prev != nullptr);
e->next->prev = e->prev; h->next->prev = h->prev;
e->prev->next = e->next; h->prev->next = h->next;
e->prev = e->next = nullptr; h->prev = h->next = nullptr;
assert(lru_usage_ >= e->total_charge); assert(lru_usage_ >= h->total_charge);
lru_usage_ -= e->total_charge; lru_usage_ -= h->total_charge;
} }
void LRUCacheShard::LRU_Insert(LRUHandle* e) { void LRUCacheShard::LRU_Insert(LRUHandle* h) {
assert(e->next == nullptr); assert(h->next == nullptr);
assert(e->prev == nullptr); assert(h->prev == nullptr);
// Inset "e" to head of LRU list. // Insert h to head of LRU list.
e->next = &lru_; h->next = &lru_;
e->prev = lru_.prev; h->prev = lru_.prev;
e->prev->next = e; h->prev->next = h;
e->next->prev = e; h->next->prev = h;
lru_usage_ += e->total_charge; lru_usage_ += h->total_charge;
} }
void LRUCacheShard::EvictFromLRU(size_t charge, void LRUCacheShard::EvictFromLRU(size_t charge,
autovector<LRUHandle*>* deleted) { autovector<LRUHandle>* deleted) {
while ((usage_ + charge) > capacity_ && lru_.next != &lru_) { while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
LRUHandle* old = lru_.next; LRUHandle* old = lru_.next;
// LRU list contains only elements which can be evicted. // LRU list contains only elements which can be evicted.
assert(old->InCache() && !old->HasRefs()); assert(old->IsVisible() && !old->HasRefs());
LRU_Remove(old); LRU_Remove(old);
table_.Remove(old->key(), old->hash); table_.Remove(old);
old->SetInCache(false);
assert(usage_ >= old->total_charge); assert(usage_ >= old->total_charge);
usage_ -= old->total_charge; usage_ -= old->total_charge;
deleted->push_back(old); deleted->push_back(*old);
} }
} }
int LRUCacheShard::GetHashBits( uint8_t LRUCacheShard::CalcHashBits(
size_t capacity, size_t estimated_value_size, size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) { CacheMetadataChargePolicy metadata_charge_policy) {
LRUHandle* e = reinterpret_cast<LRUHandle*>( LRUHandle h;
new char[sizeof(LRUHandle) - 1 + KEY_LENGTH]); h.CalcTotalCharge(estimated_value_size, metadata_charge_policy);
e->key_length = KEY_LENGTH; size_t num_entries = capacity / h.total_charge;
e->deleter = nullptr; uint8_t num_hash_bits = 0;
e->refs = 0;
e->flags = 0;
e->refs = 0;
e->CalcTotalCharge(estimated_value_size, metadata_charge_policy);
size_t num_entries = capacity / e->total_charge;
e->Free();
int num_hash_bits = 0;
while (num_entries >>= 1) { while (num_entries >>= 1) {
++num_hash_bits; ++num_hash_bits;
} }
@ -206,7 +309,8 @@ int LRUCacheShard::GetHashBits(
} }
void LRUCacheShard::SetCapacity(size_t capacity) { void LRUCacheShard::SetCapacity(size_t capacity) {
autovector<LRUHandle*> last_reference_list; assert(false); // Not supported. TODO(Guido) Support it?
autovector<LRUHandle> last_reference_list;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
capacity_ = capacity; capacity_ = capacity;
@ -214,8 +318,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
} }
// Free the entries here outside of mutex for performance reasons. // Free the entries here outside of mutex for performance reasons.
for (auto entry : last_reference_list) { for (auto& h : last_reference_list) {
entry->Free(); h.FreeData();
} }
} }
@ -224,83 +328,104 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
strict_capacity_limit_ = strict_capacity_limit; strict_capacity_limit_ = strict_capacity_limit;
} }
Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
bool free_handle_on_fail) { size_t charge, Cache::DeleterFn deleter,
Cache::Handle** handle,
Cache::Priority /*priority*/) {
if (key.size() != kCacheKeySize) {
return Status::NotSupported("FastLRUCache only supports key size " +
std::to_string(kCacheKeySize) + "B");
}
LRUHandle tmp;
tmp.value = value;
tmp.deleter = deleter;
tmp.hash = hash;
tmp.CalcTotalCharge(charge, metadata_charge_policy_);
for (int i = 0; i < kCacheKeySize; i++) {
tmp.key_data[i] = key.data()[i];
}
Status s = Status::OK(); Status s = Status::OK();
autovector<LRUHandle*> last_reference_list; autovector<LRUHandle> last_reference_list;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
// Free the space following strict LRU policy until enough space // Free the space following strict LRU policy until enough space
// is freed or the lru list is empty. // is freed or the lru list is empty.
EvictFromLRU(e->total_charge, &last_reference_list); EvictFromLRU(tmp.total_charge, &last_reference_list);
if ((usage_ + tmp.total_charge > capacity_ &&
if ((usage_ + e->total_charge) > capacity_ && (strict_capacity_limit_ || handle == nullptr)) ||
(strict_capacity_limit_ || handle == nullptr)) { table_.GetOccupancy() == size_t{1} << table_.GetLengthBits()) {
e->SetInCache(false); // Originally, when strict_capacity_limit_ == false and handle != nullptr
// (i.e., the user wants to immediately get a reference to the new
// handle), the insertion would proceed even if the total charge already
// exceeds capacity. We can't do this now, because we can't physically
// insert a new handle when the table is at maximum occupancy.
// TODO(Guido) Some tests (at least two from cache_test, as well as the
// stress tests) currently assume the old behavior.
if (handle == nullptr) { if (handle == nullptr) {
// Don't insert the entry but still return ok, as if the entry inserted // Don't insert the entry but still return ok, as if the entry inserted
// into cache and get evicted immediately. // into cache and get evicted immediately.
last_reference_list.push_back(e); last_reference_list.push_back(tmp);
} else { } else {
if (free_handle_on_fail) {
delete[] reinterpret_cast<char*>(e);
*handle = nullptr;
}
s = Status::Incomplete("Insert failed due to LRU cache being full."); s = Status::Incomplete("Insert failed due to LRU cache being full.");
} }
} else { } else {
// Insert into the cache. Note that the cache might get larger than its // Insert into the cache. Note that the cache might get larger than its
// capacity if not enough space was freed up. // capacity if not enough space was freed up.
LRUHandle* old = table_.Insert(e); LRUHandle* old;
usage_ += e->total_charge; LRUHandle* h = table_.Insert(&tmp, &old);
assert(h != nullptr); // Insertions should never fail.
usage_ += h->total_charge;
if (old != nullptr) { if (old != nullptr) {
s = Status::OkOverwritten(); s = Status::OkOverwritten();
assert(old->InCache()); assert(old->IsVisible());
old->SetInCache(false); table_.Exclude(old);
if (!old->HasRefs()) { if (!old->HasRefs()) {
// old is on LRU because it's in cache and its reference count is 0. // old is on LRU because it's in cache and its reference count is 0.
LRU_Remove(old); LRU_Remove(old);
table_.Remove(old);
assert(usage_ >= old->total_charge); assert(usage_ >= old->total_charge);
usage_ -= old->total_charge; usage_ -= old->total_charge;
last_reference_list.push_back(old); last_reference_list.push_back(*old);
} }
} }
if (handle == nullptr) { if (handle == nullptr) {
LRU_Insert(e); LRU_Insert(h);
} else { } else {
// If caller already holds a ref, no need to take one here. // If caller already holds a ref, no need to take one here.
if (!e->HasRefs()) { if (!h->HasRefs()) {
e->Ref(); h->Ref();
} }
*handle = reinterpret_cast<Cache::Handle*>(e); *handle = reinterpret_cast<Cache::Handle*>(h);
} }
} }
} }
// Free the entries here outside of mutex for performance reasons. // Free the entries here outside of mutex for performance reasons.
for (auto entry : last_reference_list) { for (auto& h : last_reference_list) {
entry->Free(); h.FreeData();
} }
return s; return s;
} }
Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
LRUHandle* e = nullptr; LRUHandle* h = nullptr;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
e = table_.Lookup(key, hash); h = table_.Lookup(key, hash);
if (e != nullptr) { if (h != nullptr) {
assert(e->InCache()); assert(h->IsVisible());
if (!e->HasRefs()) { if (!h->HasRefs()) {
// The entry is in LRU since it's in hash and has no external references // The entry is in LRU since it's in hash and has no external references
LRU_Remove(e); LRU_Remove(h);
} }
e->Ref(); h->Ref();
} }
} }
return reinterpret_cast<Cache::Handle*>(e); return reinterpret_cast<Cache::Handle*>(h);
} }
bool LRUCacheShard::Ref(Cache::Handle* h) { bool LRUCacheShard::Ref(Cache::Handle* h) {
@ -316,91 +441,64 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
if (handle == nullptr) { if (handle == nullptr) {
return false; return false;
} }
LRUHandle* e = reinterpret_cast<LRUHandle*>(handle); LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
LRUHandle copy;
bool last_reference = false; bool last_reference = false;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
last_reference = e->Unref(); last_reference = h->Unref();
if (last_reference && e->InCache()) { if (last_reference && h->IsVisible()) {
// The item is still in cache, and nobody else holds a reference to it. // The item is still in cache, and nobody else holds a reference to it.
if (usage_ > capacity_ || erase_if_last_ref) { if (usage_ > capacity_ || erase_if_last_ref) {
// The LRU list must be empty since the cache is full. // The LRU list must be empty since the cache is full.
assert(lru_.next == &lru_ || erase_if_last_ref); assert(lru_.next == &lru_ || erase_if_last_ref);
// Take this opportunity and remove the item. // Take this opportunity and remove the item.
table_.Remove(e->key(), e->hash); table_.Remove(h);
e->SetInCache(false);
} else { } else {
// Put the item back on the LRU list, and don't free it. // Put the item back on the LRU list, and don't free it.
LRU_Insert(e); LRU_Insert(h);
last_reference = false; last_reference = false;
} }
} }
// If it was the last reference, then decrement the cache usage. // If it was the last reference, then decrement the cache usage.
if (last_reference) { if (last_reference) {
assert(usage_ >= e->total_charge); assert(usage_ >= h->total_charge);
usage_ -= e->total_charge; usage_ -= h->total_charge;
copy = *h;
} }
} }
// Free the entry here outside of mutex for performance reasons. // Free the entry here outside of mutex for performance reasons.
if (last_reference) { if (last_reference) {
e->Free(); copy.FreeData();
} }
return last_reference; return last_reference;
} }
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, Cache::DeleterFn deleter,
Cache::Handle** handle,
Cache::Priority /*priority*/) {
if (key.size() != KEY_LENGTH) {
return Status::NotSupported("FastLRUCache only supports key size " +
std::to_string(KEY_LENGTH) + "B");
}
// Allocate the memory here outside of the mutex.
// If the cache is full, we'll have to release it.
// It shouldn't happen very often though.
LRUHandle* e = reinterpret_cast<LRUHandle*>(
new char[sizeof(LRUHandle) - 1 + key.size()]);
e->value = value;
e->flags = 0;
e->deleter = deleter;
e->key_length = key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
e->SetInCache(true);
e->CalcTotalCharge(charge, metadata_charge_policy_);
memcpy(e->key_data, key.data(), key.size());
return InsertItem(e, handle, /* free_handle_on_fail */ true);
}
void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
LRUHandle* e; LRUHandle copy;
bool last_reference = false; bool last_reference = false;
{ {
DMutexLock l(mutex_); DMutexLock l(mutex_);
e = table_.Remove(key, hash); LRUHandle* h = table_.Lookup(key, hash);
if (e != nullptr) { if (h != nullptr) {
assert(e->InCache()); table_.Exclude(h);
e->SetInCache(false); if (!h->HasRefs()) {
if (!e->HasRefs()) { // The entry is in LRU since it's in cache and has no external
// The entry is in LRU since it's in hash and has no external references // references
LRU_Remove(e); LRU_Remove(h);
assert(usage_ >= e->total_charge); table_.Remove(h);
usage_ -= e->total_charge; assert(usage_ >= h->total_charge);
usage_ -= h->total_charge;
last_reference = true; last_reference = true;
copy = *h;
} }
} }
} }
// Free the entry here outside of mutex for performance reasons. // Free the entry here outside of mutex for performance reasons.
// last_reference will only be true if e != nullptr. // last_reference will only be true if e != nullptr.
if (last_reference) { if (last_reference) {
e->Free(); copy.FreeData();
} }
} }

@ -8,9 +8,11 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once #pragma once
#include <array>
#include <memory> #include <memory>
#include <string> #include <string>
#include "cache/cache_key.h"
#include "cache/sharded_cache.h" #include "cache/sharded_cache.h"
#include "port/lang.h" #include "port/lang.h"
#include "port/malloc.h" #include "port/malloc.h"
@ -22,31 +24,115 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
namespace fast_lru_cache { namespace fast_lru_cache {
// LRU cache implementation using an open-address hash table.
// Every slot in the hash table is an LRUHandle. Because handles can be
// referenced externally, we can't discard them immediately once they are
// deleted (via a delete or an LRU eviction) or replaced by a new version
// (via an insert of the same key). The state of an element is defined by
// the following two properties:
// (R) Referenced: An element can be referenced externally (refs > 0), or not.
// Importantly, an element can be evicted if and only if it's not
// referenced. In particular, when an element becomes referenced, it's
// temporarily taken out of the LRU list until all references to it
// are dropped.
// (V) Visible: An element can be visible for lookups (IS_VISIBLE set), or not.
// Initially, every element is visible. An element that is not visible is
// called a ghost.
// These properties induce 4 different states, with transitions defined as
// follows:
// - V --> not V: When a visible element is deleted or replaced by a new
// version.
// - Not V --> V: This cannot happen. A ghost remains in that state until it's
// not referenced any more, at which point it's ready to be removed from the
// hash table. (A ghost simply waits to transition to the afterlife---it will
// never be visible again.)
// - R --> not R: When all references to an element are dropped.
// - Not R --> R: When an unreferenced element becomes referenced. This can only
// happen if the element is V, since references to an element can only be
// created when it's visible.
// Internally, the cache uses an open-addressed hash table to index the handles.
// We use tombstone counters to keep track of displacements.
// Because of the tombstones and the two possible visibility states of an
// element, the table slots can be in 4 different states:
// 1. Visible element (IS_ELEMENT set and IS_VISIBLE set): The slot contains a
// key-value element.
// 2. Ghost element (IS_ELEMENT set and IS_VISIBLE unset): The slot contains an
// element that has been removed, but it's still referenced. It's invisible
// to lookups.
// 3. Tombstone (IS_ELEMENT unset and displacements > 0): The slot contains a
// tombstone.
// 4. Empty (IS_ELEMENT unset and displacements == 0): The slot is unused.
// A slot that is an element can further have IS_VISIBLE set or not.
// When a ghost is removed from the table, it can either transition to being a
// tombstone or an empty slot, depending on the number of displacements of the
// slot. In any case, the slot becomes available. When a handle is inserted
// into that slot, it becomes a visible element again.
constexpr uint8_t kCacheKeySize =
static_cast<uint8_t>(sizeof(ROCKSDB_NAMESPACE::CacheKey));
// The load factor p is a real number in (0, 1) such that at all
// times at most a fraction p of all slots, without counting tombstones,
// are occupied by elements. This means that the probability that a
// random probe hits an occupied slot is at most p, and thus at most 1/(1-p)
// probes are required on average. We use p = 70%, so at most about 3.3
// probes are needed on average (ignoring the effects of clustering).
// Because the size of the hash table is always rounded up to the next
// power of 2, p is really an upper bound on the actual load factor---the
// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
// but bear in mind that slots only hold metadata, not actual values.
// Since space cost is dominated by the values (the LSM blocks),
// overprovisioning the table with metadata only increases the total cache space
// usage by a tiny fraction.
constexpr double kLoadFactor = 0.7;
// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
// An experimental (under development!) alternative to LRUCache // An experimental (under development!) alternative to LRUCache
struct LRUHandle { struct LRUHandle {
void* value; void* value;
Cache::DeleterFn deleter; Cache::DeleterFn deleter;
LRUHandle* next_hash;
LRUHandle* next; LRUHandle* next;
LRUHandle* prev; LRUHandle* prev;
size_t total_charge; // TODO(opt): Only allow uint32_t? size_t total_charge; // TODO(opt): Only allow uint32_t?
size_t key_length;
// The hash of key(). Used for fast sharding and comparisons. // The hash of key(). Used for fast sharding and comparisons.
uint32_t hash; uint32_t hash;
// The number of external refs to this entry. The cache itself is not counted. // The number of external refs to this entry. The cache itself is not counted.
uint32_t refs; uint32_t refs;
enum Flags : uint8_t { enum Flags : uint8_t {
// Whether this entry is referenced by the hash table. // Whether the handle is visible to Lookups.
IN_CACHE = (1 << 0), IS_VISIBLE = (1 << 0),
// Whether the slot is in use by an element.
IS_ELEMENT = (1 << 1),
}; };
uint8_t flags; uint8_t flags;
// Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) // The number of elements that hash to this slot or a lower one,
char key_data[1]; // but wind up in a higher slot.
uint32_t displacements;
std::array<char, kCacheKeySize> key_data;
LRUHandle() {
value = nullptr;
deleter = nullptr;
next = nullptr;
prev = nullptr;
total_charge = 0;
hash = 0;
refs = 0;
flags = 0;
displacements = 0;
key_data.fill(0);
}
Slice key() const { return Slice(key_data, key_length); } Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
// Increase the reference count by 1. // Increase the reference count by 1.
void Ref() { refs++; } void Ref() { refs++; }
@ -61,22 +147,31 @@ struct LRUHandle {
// Return true if there are external refs, false otherwise. // Return true if there are external refs, false otherwise.
bool HasRefs() const { return refs > 0; } bool HasRefs() const { return refs > 0; }
bool InCache() const { return flags & IN_CACHE; } bool IsVisible() const { return flags & IS_VISIBLE; }
void SetInCache(bool in_cache) { void SetIsVisible(bool is_visible) {
if (in_cache) { if (is_visible) {
flags |= IN_CACHE; flags |= IS_VISIBLE;
} else { } else {
flags &= ~IN_CACHE; flags &= ~IS_VISIBLE;
} }
} }
void Free() { bool IsElement() const { return flags & IS_ELEMENT; }
void SetIsElement(bool is_element) {
if (is_element) {
flags |= IS_ELEMENT;
} else {
flags &= ~IS_ELEMENT;
}
}
void FreeData() {
assert(refs == 0); assert(refs == 0);
if (deleter) { if (deleter) {
(*deleter)(key(), value); (*deleter)(key(), value);
} }
delete[] reinterpret_cast<char*>(this);
} }
// Calculate the memory usage by metadata. // Calculate the memory usage by metadata.
@ -85,13 +180,22 @@ struct LRUHandle {
if (metadata_charge_policy != kFullChargeCacheMetadata) { if (metadata_charge_policy != kFullChargeCacheMetadata) {
return 0; return 0;
} else { } else {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE // #ifdef ROCKSDB_MALLOC_USABLE_SIZE
return malloc_usable_size( // return malloc_usable_size(
const_cast<void*>(static_cast<const void*>(this))); // const_cast<void*>(static_cast<const void*>(this)));
#else // #else
// This is the size that is used when a new handle is created. // TODO(Guido) malloc_usable_size only works when we call it on
return sizeof(LRUHandle) - 1 + key_length; // a pointer allocated with malloc. Because our handles are all
#endif // allocated in a single shot as an array, the user can't call
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
// pointer returned by the cache. Moreover, malloc_usable_size
// expects a heap-allocated handle, but sometimes in our code we
// wish to pass a stack-allocated handle (this is only a performance
// concern).
// What is the right way to compute metadata charges with pre-allocated
// handles?
return sizeof(LRUHandle);
// #endif
} }
} }
@ -106,8 +210,23 @@ struct LRUHandle {
assert(total_charge >= meta_charge); assert(total_charge >= meta_charge);
return total_charge - meta_charge; return total_charge - meta_charge;
} }
inline bool IsEmpty() {
return !this->IsElement() && this->displacements == 0;
}
inline bool IsTombstone() {
return !this->IsElement() && this->displacements > 0;
}
inline bool Matches(const Slice& some_key, uint32_t some_hash) {
return this->IsElement() && this->hash == some_hash &&
this->key() == some_key;
}
}; };
// TODO(Guido) Update the following comment.
// We provide our own simple hash table since it removes a whole bunch // We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash // of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations // table implementations in some of the compiler/runtime combinations
@ -115,45 +234,72 @@ struct LRUHandle {
// 4.4.3's builtin hashtable. // 4.4.3's builtin hashtable.
class LRUHandleTable { class LRUHandleTable {
public: public:
explicit LRUHandleTable(int hash_bits); explicit LRUHandleTable(uint8_t hash_bits);
~LRUHandleTable(); ~LRUHandleTable();
// Returns a pointer to a visible element matching the key/hash, or
// nullptr if not present.
LRUHandle* Lookup(const Slice& key, uint32_t hash); LRUHandle* Lookup(const Slice& key, uint32_t hash);
LRUHandle* Insert(LRUHandle* h);
LRUHandle* Remove(const Slice& key, uint32_t hash); // Inserts a copy of h into the hash table.
// Returns a pointer to the inserted handle, or nullptr if no available
// slot was found. If an existing visible element matching the
// key/hash is already present in the hash table, the argument old
// is set to point to it; otherwise, it's set to nullptr.
LRUHandle* Insert(LRUHandle* h, LRUHandle** old);
// Removes h from the hash table. The handle must already be off
// the LRU list.
void Remove(LRUHandle* h);
// Turns a visible element h into a ghost (i.e., not visible).
void Exclude(LRUHandle* h);
// Assigns a copy of h to the given slot.
void Assign(int slot, LRUHandle* h);
template <typename T> template <typename T>
void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) {
for (uint32_t i = index_begin; i < index_end; i++) { for (uint32_t i = index_begin; i < index_end; i++) {
LRUHandle* h = list_[i]; LRUHandle* h = &array_[i];
while (h != nullptr) { if (h->IsVisible()) {
auto n = h->next_hash;
assert(h->InCache());
func(h); func(h);
h = n;
} }
} }
} }
int GetLengthBits() const { return length_bits_; } uint8_t GetLengthBits() const { return length_bits_; }
// Return the address of the head of the chain in the bucket given uint32_t GetOccupancy() const { return occupancy_; }
// by the hash.
inline LRUHandle** Head(uint32_t hash);
private: private:
// Return a pointer to slot that points to a cache entry that int FindVisibleElement(const Slice& key, uint32_t hash, int& probe,
// matches key/hash. If there is no such cache entry, return a int displacement);
// pointer to the trailing slot in the corresponding linked list.
LRUHandle** FindPointer(const Slice& key, uint32_t hash); int FindAvailableSlot(const Slice& key, int& probe, int displacement);
// Number of hash bits (upper because lower bits used for sharding) int FindVisibleElementOrAvailableSlot(const Slice& key, uint32_t hash,
// used for table index. Length == 1 << length_bits_ int& probe, int displacement);
int length_bits_;
// Returns the index of the first slot probed (hashing with
// The table consists of an array of buckets where each bucket is // the given key) with a handle e such that cond(e) is true.
// a linked list of cache entries that hash into the bucket. // Otherwise, if no match is found, returns -1.
std::unique_ptr<LRUHandle*[]> list_; // For every handle e probed except the final slot, updates
// e->displacements += displacement.
// The argument probe is modified such that consecutive calls
// to FindSlot continue probing right after where the previous
// call left off.
int FindSlot(const Slice& key, std::function<bool(LRUHandle*)> cond,
int& probe, int displacement);
// Number of hash bits used for table index.
// The size of the table is 1 << length_bits_.
uint8_t length_bits_;
// Number of elements in the table.
uint32_t occupancy_;
std::unique_ptr<LRUHandle[]> array_;
}; };
// A single shard of sharded cache. // A single shard of sharded cache.
@ -173,6 +319,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
void SetStrictCapacityLimit(bool strict_capacity_limit) override; void SetStrictCapacityLimit(bool strict_capacity_limit) override;
// Like Cache methods, but with an extra "hash" parameter. // Like Cache methods, but with an extra "hash" parameter.
// Insert an item into the hash table and, if handle is null, insert into
// the LRU list. Older items are evicted as necessary. If the cache is full
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
Cache::DeleterFn deleter, Cache::Handle** handle, Cache::DeleterFn deleter, Cache::Handle** handle,
Cache::Priority priority) override; Cache::Priority priority) override;
@ -217,13 +367,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
private: private:
friend class LRUCache; friend class LRUCache;
// Insert an item into the hash table and, if handle is null, insert into
// the LRU list. Older items are evicted as necessary. If the cache is full
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status InsertItem(LRUHandle* item, Cache::Handle** handle,
bool free_handle_on_fail);
void LRU_Remove(LRUHandle* e); void LRU_Remove(LRUHandle* e);
void LRU_Insert(LRUHandle* e); void LRU_Insert(LRUHandle* e);
@ -231,11 +374,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
// to hold (usage_ + charge) is freed or the lru list is empty // to hold (usage_ + charge) is freed or the lru list is empty
// This function is not thread safe - it needs to be executed while // This function is not thread safe - it needs to be executed while
// holding the mutex_. // holding the mutex_.
void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted); void EvictFromLRU(size_t charge, autovector<LRUHandle>* deleted);
// Returns the number of bits used to hash an element in the per-shard // Returns the number of bits used to hash an element in the hash
// table. // table.
static int GetHashBits(size_t capacity, size_t estimated_value_size, static uint8_t CalcHashBits(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy); CacheMetadataChargePolicy metadata_charge_policy);
// Initialized before use. // Initialized before use.

@ -114,7 +114,8 @@ default_params = {
"use_direct_reads": lambda: random.randint(0, 1), "use_direct_reads": lambda: random.randint(0, 1),
"use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1), "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
"mock_direct_io": False, "mock_direct_io": False,
"cache_type": lambda: random.choice(["fast_lru_cache", "lru_cache"]), # clock_cache is broken "cache_type": "lru_cache", # clock_cache is broken
# fast_lru_cache is currently incompatible with stress tests, because they use strict_capacity_limit = false
"use_full_merge_v1": lambda: random.randint(0, 1), "use_full_merge_v1": lambda: random.randint(0, 1),
"use_merge": lambda: random.randint(0, 1), "use_merge": lambda: random.randint(0, 1),
# 999 -> use Bloom API # 999 -> use Bloom API

Loading…
Cancel
Save