@@ -23,102 +23,137 @@
# include "rocksdb/cache.h"
# include "rocksdb/secondary_cache.h"
# include "util/autovector.h"
# include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace clock_cache {
// An experimental alternative to LRUCache, using a lock-free, open-addressed
// hash table and clock eviction.

// ----------------------------------------------------------------------------
// 1. INTRODUCTION
//
// In RocksDB, a Cache is a concurrent unordered dictionary that supports
// external references (a.k.a. user references). A ClockCache is a type of Cache
// that uses the clock algorithm as its eviction policy. Internally, a
// ClockCache is an open-addressed hash table that stores all KV pairs in a
// large array. Every slot in the hash table is a ClockHandle, which holds a KV
// pair plus some additional metadata that controls the different aspects of the
// cache: external references, the hashing mechanism, concurrent access and the
// clock algorithm.
//
// 2. EXTERNAL REFERENCES
//
// An externally referenced handle can't be deleted (either evicted by the clock
// algorithm, or explicitly deleted) or replaced by a new version (via an insert
// of the same key) until all external references to it have been released by
// the users. ClockHandles have two members to support external references:
// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0,
// the handle is externally referenced. Updates that intend to modify the
// handle will refrain from doing so. Eventually, when all references are
// released, we have EXTERNAL_REFS == 0, and updates can operate normally on
// the handle.
// - WILL_BE_DELETED flag: A handle is marked for deletion when an operation
// decides the handle should be deleted. This happens either when the last
// reference to a handle is released (and the release operation is instructed
// to delete on last reference) or when a delete operation is called on
// the item. This flag is needed because an externally referenced handle
// can't be immediately deleted. In these cases, the flag will be later read
// and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is
// used not only to defer deletions, but also as a barrier for external
// references: once WILL_BE_DELETED is set, lookups (which are the means to
// acquire new external references) will ignore the handle. For this reason,
// when WILL_BE_DELETED is set, we say the handle is invisible (and
// otherwise, that it's visible).
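//
// For illustration, taking an external reference boils down to a single
// atomic increment plus a visibility check. The following is a simplified
// (hypothetical) sketch of ClockHandle::TryExternalRef below, which
// additionally checks the EXCLUSIVE_REF bit introduced in section 4:
//
//    bool TryExternalRef(std::atomic<uint32_t>& refs) {
//      // Optimistically announce the reference.
//      uint32_t after = (refs += kOneExternalRef);
//      if (!(after & WILL_BE_DELETED)) {
//        return true;  // Visible: the reference was taken.
//      }
//      refs -= kOneExternalRef;  // Invisible: roll the reference back.
//      return false;
//    }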
//
//
// 3. HASHING AND COLLISION RESOLUTION
//
// ClockCache uses an open-addressed hash table to store the handles.
// We use a variant of tombstones to manage collisions: every slot keeps a
// count of how many KV pairs currently in the cache probed the slot
// during their insertion. Probes are generated with double-hashing
// (although the code can be easily modified to use other probing schemes,
// like linear probing).
//
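// For example, over a table of size 2^k, double-hashing generates the probe
// sequence probe(i) = (h1 + i * h2) mod 2^k, for i = 0, 1, 2, ..., where h1
// and h2 are independent hashes of the key (seeded below with kProbingSeed1
// and kProbingSeed2). A sketch of the step computation:
//
//    uint32_t Probe(uint32_t h1, uint32_t h2, uint32_t i) {
//      // Forcing the increment odd makes it coprime with the power-of-two
//      // table size, so the sequence visits every slot exactly once.
//      return ModTableSize(h1 + i * (h2 | 1));
//    }
//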
// A slot in the hash table can be in a few different states:
// - Element: The slot contains an element. This is indicated with the
// IS_ELEMENT flag. Elements can be sub-classified depending on the
// value of WILL_BE_DELETED:
// * Visible element.
// * Invisible element.
// - Tombstone: The slot doesn't contain an element, but there is some other
// element that probed this slot during its insertion.
// - Empty: The slot is unused---it's neither an element nor a tombstone.
//
// A slot cycles through the following sequence of states:
// empty or tombstone --> visible element --> invisible element -->
// empty or tombstone. Initially, a slot is available---it's either
// empty or a tombstone. As soon as a KV pair is written into the slot, it
// becomes a visible element. At some point, the handle will be deleted
// by an explicit delete operation, the eviction algorithm, or an overwriting
// insert. In any case, the handle is marked for deletion. When the
// attempt to delete the element finally succeeds, the slot is freed up
// and becomes available again.
//
// 4. CONCURRENCY
//
// ClockCache is lock-free. At a high level, we synchronize the operations
// using a read-prioritized, non-blocking variant of RW locks on every slot of
// the hash table. To do this we generalize the concept of reference:
// - Internal reference: Taken by a thread that is attempting to read a slot
// or do a very precise type of update.
// - Exclusive reference: Taken by a thread that is attempting to write
// a slot extensively.
//
// We defer the precise definitions to the comments in the code below.
// A crucial feature of our references is that attempting to take one never
// blocks the thread. Another important feature is that readers are
// prioritized, as they use extremely fast synchronization primitives---they
// use atomic arithmetic/bit operations, but no compare-and-swaps (which are
// much slower).
//
// Internal references are used by threads to read slots during a probing
// sequence, making them the most common references (probing is performed
// in almost every operation, not just lookups). During a lookup, once
// the target element is found, and just before the handle is handed over
// to the user, an internal reference is converted into an external reference.
// During an update operation, once the target slot is found, an internal
// reference is converted into an exclusive reference. Interestingly, we
// can't atomically upgrade from internal to exclusive, or we may run into a
// deadlock. Releasing the internal reference and then taking an exclusive
// reference avoids the deadlock, but then the handle may change in between.
// One of the key observations we use in our implementation is that we can
// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED.
//
// Distinguishing internal from external references is useful for two reasons:
// - Internal references are short lived, but external references are typically
// not. This is helpful when acquiring an exclusive ref: if there are any
// external references to the item, it's probably not worth waiting until
// they go away.
// - We can precisely determine when there are no more external references to a
// handle, and proceed to mark it for deletion. This is useful when users
// release external references.
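//
// For example, the core of a lookup reads, in sketch form (see
// ClockHandleTable::Lookup and the ref functions in ClockHandle below):
//
//    if (h->TryInternalRef()) {            // pin the slot for reading
//      if (h->IsElement() && h->Matches(key, hash)) {
//        h->InternalToExternalRef();       // upgrade: a single atomic add
//        return h;                         // handed over to the user
//      }
//      h->ReleaseInternalRef();            // not a match: keep probing
//    }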
//
//
// 5. CLOCK ALGORITHM
//
// The clock algorithm circularly sweeps through the hash table to find the next
// victim. Recall that handles that are referenced are not evictable; the clock
// algorithm never picks those. We use different clock priorities: NONE, LOW,
// MEDIUM and HIGH. Priorities LOW, MEDIUM and HIGH represent how close an
// element is from being evicted, LOW being the closest to eviction. NONE means
// the slot is not evictable. NONE priority is used in one of the following
// cases:
// (a) the slot doesn't contain an element, or
// (b) the slot contains an externally referenced element, or
// (c) the slot contains an element that used to be externally referenced,
// and the clock pointer has not swept through the slot since the element
// stopped being externally referenced.
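//
// For illustration, one sweep step can be sketched as follows (the actual
// implementation is ClockHandleTable::ClockRun below, which additionally
// needs an exclusive ref before it can evict):
//
//    ClockHandle* h = &array_[ModTableSize(clock_pointer_++)];
//    switch (h->GetClockPriority()) {
//      case ClockPriority::NONE:
//        break;  // Not evictable; skip.
//      case ClockPriority::LOW:
//        // Evict h and reclaim its charge.
//        break;
//      default:
//        // Decrease the priority one level: HIGH -> MEDIUM -> LOW.
//        break;
//    }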
// ----------------------------------------------------------------------------
// The load factor p is a real number in (0, 1) such that at all
// times at most a fraction p of all slots, without counting tombstones,
@@ -138,15 +173,18 @@ constexpr double kLoadFactor = 0.35;
// The user can exceed kLoadFactor if the sizes of the inserted values don't
// match estimated_value_size, or if strict_capacity_limit == false. To
// avoid a performance drop, we set a strict upper bound on the load factor.
constexpr double kStrictLoadFactor = 0.7;
// Maximum number of spins when trying to acquire a ref.
// TODO(Guido) This value was set arbitrarily. Is it appropriate?
// What's the best way to bound the spinning?
constexpr uint32_t kSpinsPerTry = 100000;
// Arbitrary seeds.
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
// An experimental (under development!) alternative to LRUCache.
struct ClockHandle {
  void* value;
  Cache::DeleterFn deleter;
@@ -154,49 +192,6 @@ struct ClockHandle {
  size_t total_charge;
  std::array<char, kCacheKeySize> key_data;
  static constexpr uint8_t kIsElementOffset = 1;
  static constexpr uint8_t kClockPriorityOffset = 2;
  static constexpr uint8_t kIsHitOffset = 4;
@@ -209,7 +204,7 @@ struct ClockHandle {
    CLOCK_PRIORITY = 3 << kClockPriorityOffset,
    // Whether the handle has been looked up after its insertion.
    HAS_HIT = 1 << kIsHitOffset,
    // The value of Cache::Priority of the handle.
    CACHE_PRIORITY = 1 << kCachePriorityOffset,
  };
@@ -226,30 +221,67 @@ struct ClockHandle {
  // up in this slot or a higher one.
  std::atomic<uint32_t> displacements;
  static constexpr uint8_t kExternalRefsOffset = 0;
  static constexpr uint8_t kSharedRefsOffset = 15;
  static constexpr uint8_t kExclusiveRefOffset = 30;
  static constexpr uint8_t kWillBeDeletedOffset = 31;

  enum Refs : uint32_t {
    // Synchronization model:
    // - An external reference guarantees that hash, value, key_data
    //    and the IS_ELEMENT flag are not modified. Doesn't allow
    //    any writes.
    // - An internal reference has the same guarantees as an
    //    external reference, and additionally allows the following
    //    idempotent updates on the handle:
    //    * set CLOCK_PRIORITY to NONE;
    //    * set the HAS_HIT bit;
    //    * set the WILL_BE_DELETED bit.
    // - A shared reference is either an external reference or an
    //    internal reference.
    // - An exclusive reference guarantees that no other thread has a shared
    //    or exclusive reference to the handle, and allows writes
    //    on the handle.
    // Number of external references to the slot.
    EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
                    << kExternalRefsOffset,  // Bits 0, ..., 14
    // Number of internal references plus external references to the slot.
    SHARED_REFS = ((uint32_t{1} << 15) - 1)
                  << kSharedRefsOffset,  // Bits 15, ..., 29
    // Whether a thread has an exclusive reference to the slot.
    EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset,  // Bit 30
    // Whether the handle will be deleted soon. When this bit is set, new
    // internal or external references to this handle stop being accepted.
    // There is an exception: external references can be created from
    // existing external references, or converting from existing internal
    // references.
    WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset  // Bit 31
    // Having these 4 fields in a single variable allows us to support the
    // following operations efficiently:
    // - Convert an internal reference into an external reference in a single
    //    atomic arithmetic operation.
    // - Attempt to take a shared reference using a single atomic arithmetic
    //    operation. This is because we can increment the internal ref count
    //    as well as check whether the entry is marked for deletion using a
    //    single atomic arithmetic operation (and one non-atomic comparison).
  };
  static constexpr uint32_t kOneInternalRef = 0x8000;
  static constexpr uint32_t kOneExternalRef = 0x8001;
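  // To see the packing at work: an internal ref bumps only the shared count
  // (bit 15), while an external ref bumps both the external count (bit 0)
  // and the shared count:
  //
  //    kOneInternalRef == 0x8000 == 1 << kSharedRefsOffset
  //    kOneExternalRef == 0x8001 == kOneInternalRef + 1
  //
  // Hence converting an internal ref into an external one is the single
  // atomic add refs += kOneExternalRef - kOneInternalRef (i.e., refs += 1),
  // which is exactly what InternalToExternalRef() below does.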
  std::atomic<uint32_t> refs;
  ClockHandle()
      : value(nullptr),
        deleter(nullptr),
        hash(0),
        total_charge(0),
        flags(0),
        displacements(0),
        refs(0) {
    SetWillBeDeleted(false);
    SetIsElement(false);
    SetClockPriority(ClockPriority::NONE);
@@ -257,26 +289,66 @@ struct ClockHandle {
    key_data.fill(0);
  }
  // The copy ctor and assignment operator are only used to copy a handle
  // for immediate deletion. (We need to copy because the slot may become
  // re-used before the deletion is completed.) We only copy the necessary
  // members to carry out the deletion. In particular, we don't need
  // the atomic members.
  ClockHandle(const ClockHandle& other) { *this = other; }

  void operator=(const ClockHandle& other) {
    value = other.value;
    deleter = other.deleter;
    hash = other.hash;
    key_data = other.key_data;
    total_charge = other.total_charge;
  }
  Slice key() const { return Slice(key_data.data(), kCacheKeySize); }

  bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }
  void FreeData() {
    if (deleter) {
      (*deleter)(key(), value);
    }
  }
  // Calculate the memory usage by metadata.
  inline size_t CalcMetaCharge(
      CacheMetadataChargePolicy metadata_charge_policy) const {
    if (metadata_charge_policy != kFullChargeCacheMetadata) {
      return 0;
    } else {
      // #ifdef ROCKSDB_MALLOC_USABLE_SIZE
      //   return malloc_usable_size(
      //       const_cast<void*>(static_cast<const void*>(this)));
      // #else
      // TODO(Guido) malloc_usable_size only works when we call it on
      // a pointer allocated with malloc. Because our handles are all
      // allocated in a single shot as an array, the user can't call
      // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
      // pointer returned by the cache. Moreover, malloc_usable_size
      // expects a heap-allocated handle, but sometimes in our code we
      // wish to pass a stack-allocated handle (this is only a performance
      // concern).
      // What is the right way to compute metadata charges with pre-allocated
      // handles?
      return sizeof(ClockHandle);
      // #endif
    }
  }
  inline void CalcTotalCharge(
      size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
    total_charge = charge + CalcMetaCharge(metadata_charge_policy);
  }

  inline size_t GetCharge(
      CacheMetadataChargePolicy metadata_charge_policy) const {
    size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
    assert(total_charge >= meta_charge);
    return total_charge - meta_charge;
  }
  // flags functions.
  bool IsElement() const { return flags & IS_ELEMENT; }
@@ -292,10 +364,6 @@ struct ClockHandle {
  void SetHit() { flags |= HAS_HIT; }
  Cache::Priority GetCachePriority() const {
    return static_cast<Cache::Priority>(flags & CACHE_PRIORITY);
  }
@@ -308,6 +376,10 @@ struct ClockHandle {
}
}
  bool IsInClock() const {
    return GetClockPriority() != ClockHandle::ClockPriority::NONE;
  }

  ClockPriority GetClockPriority() const {
    return static_cast<ClockPriority>(flags & Flags::CLOCK_PRIORITY);
  }
@@ -328,49 +400,6 @@ struct ClockHandle {
    flags |= new_priority;
  }
  inline bool IsEmpty() const {
    return !this->IsElement() && this->displacements == 0;
  }
@@ -380,11 +409,12 @@ struct ClockHandle {
}
  inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
    return this->hash == some_hash && this->key() == some_key;
  }
  // refs functions.
  inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }
  void SetWillBeDeleted(bool will_be_deleted) {
    if (will_be_deleted) {
@@ -394,28 +424,7 @@ struct ClockHandle {
}
}
  // Tries to take an internal ref. Returns true iff it succeeds.
  inline bool TryInternalRef() {
@@ -426,9 +435,19 @@
    return false;
  }
  // Tries to take an external ref. Returns true iff it succeeds.
  inline bool TryExternalRef() {
    if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
      return true;
    }
    refs -= kOneExternalRef;
    return false;
  }
  // Tries to take an exclusive ref. Returns true iff it succeeds.
  // TODO(Guido) After every TryExclusiveRef call, we always call
  // WillBeDeleted(). We could save an atomic read by having an output parameter
  // with the last value of refs.
  inline bool TryExclusiveRef() {
    uint32_t will_be_deleted = refs & WILL_BE_DELETED;
    uint32_t expected = will_be_deleted;
@@ -436,15 +455,18 @@
                                        EXCLUSIVE_REF | will_be_deleted);
  }
  // Repeatedly tries to take an exclusive reference, but aborts as soon
  // as an external or exclusive reference is detected (since the wait
  // would presumably be too long).
  inline bool SpinTryExclusiveRef() {
    uint32_t expected = 0;
    uint32_t will_be_deleted = 0;
    uint32_t spins = kSpinsPerTry;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted) &&
           spins--) {
      std::this_thread::yield();
      if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) {
        return false;
      }
      will_be_deleted = expected & WILL_BE_DELETED;
@@ -453,75 +475,88 @@
    return true;
  }
  // Take an external ref, assuming there is already one external ref
  // to the handle.
  void Ref() {
    // TODO(Guido) Is it okay to assume that the existing external reference
    // survives until this function returns?
    refs += kOneExternalRef;
  }
  // The following functions are for upgrading and downgrading refs.
  // They guarantee atomicity, i.e., no exclusive refs to the handle
  // can be taken by a different thread during the conversion.
  inline void ReleaseExternalRef() { refs -= kOneExternalRef; }

  inline void ExclusiveToInternalRef() {
    refs += kOneInternalRef;
    ReleaseExclusiveRef();
  }

  inline void ReleaseInternalRef() { refs -= kOneInternalRef; }

  inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }

  // Downgrade an exclusive ref to external.
  inline void ExclusiveToExternalRef() {
    refs += kOneExternalRef;
    ReleaseExclusiveRef();
  }

  // TODO(Guido) Do we want to bound the loop and prepare the
  // algorithms to react to a failure?
  inline void InternalToExclusiveRef() {
    uint32_t expected = kOneInternalRef;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = kOneInternalRef | will_be_deleted;
    }
  }

  // Convert an internal ref into external.
  inline void InternalToExternalRef() {
    refs += kOneExternalRef - kOneInternalRef;
  }

  // TODO(Guido) Same concern.
  inline void ExternalToExclusiveRef() {
    uint32_t expected = kOneExternalRef;
    uint32_t will_be_deleted = 0;
    while (!refs.compare_exchange_strong(expected,
                                         EXCLUSIVE_REF | will_be_deleted)) {
      will_be_deleted = expected & WILL_BE_DELETED;
      expected = kOneExternalRef | will_be_deleted;
    }
  }
};  // struct ClockHandle
class ClockHandleTable {
 public:
  explicit ClockHandleTable(size_t capacity, int hash_bits);
  ~ClockHandleTable();
  // Returns a pointer to a visible handle matching the key/hash, or
  // nullptr if not present. When an actual handle is produced, an
  // internal reference is handed over.
  ClockHandle* Lookup(const Slice& key, uint32_t hash);
  // Inserts a copy of h into the hash table. Returns a pointer to the
  // inserted handle, or nullptr if no available slot was found. Every
  // existing visible handle matching the key that is already present in
  // the hash table is marked as WILL_BE_DELETED. The deletion is also
  // attempted, and, if the attempt is successful, the handle is inserted
  // into the autovector deleted. When take_reference is true, the function
  // hands over an external reference on the handle, and otherwise no
  // reference is produced.
  ClockHandle* Insert(ClockHandle* h, autovector<ClockHandle>* deleted,
                      bool take_reference);
  // Assigns h the appropriate clock priority, making it evictable.
  void ClockOn(ClockHandle* h);

  // Makes h non-evictable.
  void ClockOff(ClockHandle* h);

  // Runs the clock eviction algorithm until there is enough space to
  // insert an element with the given charge.
  void ClockRun(size_t charge);
  // Remove h from the hash table. Requires an exclusive ref to h.
  void Remove(ClockHandle* h, autovector<ClockHandle>* deleted);

  // Remove from the hash table all handles with matching key/hash along a
  // probe sequence, starting from the given probe number. Doesn't
  // require any references.
  void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe,
                 autovector<ClockHandle>* deleted);

  void RemoveAll(const Slice& key, uint32_t hash,
                 autovector<ClockHandle>* deleted) {
    uint32_t probe = 0;
    RemoveAll(key, hash, probe, deleted);
  }

  void Free(autovector<ClockHandle>* deleted);

  // Tries to remove h from the hash table. If the attempt is successful,
  // the function hands over an exclusive ref to h.
  bool TryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);

  // Similar to TryRemove, except that it spins, increasing the chances of
  // success. Requires that the caller thread has no shared ref to h.
  bool SpinTryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
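  // For example, an erase roughly chains these primitives as follows (a
  // sketch; copies of the removed handles are accumulated so their deleters
  // can run after the table operations complete):
  //
  //    autovector<ClockHandle> deleted;
  //    table.RemoveAll(key, hash, &deleted);  // Unlink all matching handles.
  //    table.Free(&deleted);                  // Run the deleters on the copies.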
  template <typename T>
  void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
@@ -531,12 +566,9 @@ class ClockHandleTable {
      if (h->TryExclusiveRef()) {
        if (h->IsElement() &&
            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
          func(h);
        }
        h->ReleaseExclusiveRef();
      }
    }
  }
@@ -565,53 +597,81 @@ class ClockHandleTable {
  uint32_t GetOccupancy() const { return occupancy_; }

  size_t GetUsage() const { return usage_; }

  size_t GetCapacity() const { return capacity_; }

  // Returns x mod 2^{length_bits_}.
  uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
 private:
  friend class ClockCacheShard;
  // Extracts the element information from a handle (src), and assigns it
  // to a hash table slot (dst). Doesn't touch displacements and refs,
  // which are maintained by the hash table algorithm.
  void Assign(ClockHandle* dst, ClockHandle* src);
  // Returns the first slot in the probe sequence, starting from the given
  // probe number, with a handle e such that match(e) is true. At every
  // step, the function first tests whether match(e) holds. If this is false,
  // it evaluates abort(e) to decide whether the search should be aborted,
  // and in the affirmative returns nullptr. For every handle e probed except
  // the last one, the function runs update(e).
  // The probe parameter is modified as follows. We say a probe to a handle
  // e is aborting if match(e) is false and abort(e) is true. Then the final
  // value of probe is one more than the last non-aborting probe during the
  // call. This is so that the variable can be used to keep track of
  // progress across consecutive calls to FindSlot.
  inline ClockHandle* FindSlot(const Slice& key,
                               std::function<bool(ClockHandle*)> match,
                               std::function<bool(ClockHandle*)> abort,
                               std::function<void(ClockHandle*)> update,
                               uint32_t& probe);
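  // For example, a lookup can be phrased in terms of FindSlot roughly as
  // follows (a condensed sketch of the actual implementation):
  //
  //    uint32_t probe = 0;
  //    ClockHandle* e = FindSlot(
  //        key,
  //        /* match */
  //        [&](ClockHandle* h) {
  //          if (h->TryInternalRef()) {
  //            if (h->IsElement() && h->Matches(key, hash)) {
  //              return true;  // Found: the internal ref is handed over.
  //            }
  //            h->ReleaseInternalRef();
  //          }
  //          return false;  // Taken slot, but not a visible match.
  //        },
  //        /* abort */
  //        [&](ClockHandle* h) { return h->displacements == 0; },
  //        /* update */ [&](ClockHandle* /*h*/) {}, probe);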
  // Returns an available slot for the given key. All copies of the
  // key found along the probing sequence until an available slot is
  // found are marked for deletion. On each of them, a deletion is
  // attempted, and when the attempt succeeds the slot is assigned to
  // the new copy of the element.
  ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash,
                                 uint32_t& probe,
                                 autovector<ClockHandle>* deleted);
  // After a failed FindSlot call (i.e., with answer nullptr) in
  // FindAvailableSlot, this function fixes all displacements,
  // starting from the 0-th probe, until the given probe.
  void Rollback(const Slice& key, uint32_t probe);
  // Number of hash bits used for table index.
  // The size of the table is 1 << length_bits_.
  const int length_bits_;

  // For faster computation of ModTableSize.
  const uint32_t length_bits_mask_;
  // Maximum number of elements the user can store in the table.
  const uint32_t occupancy_limit_;

  // Maximum total charge of all elements stored in the table.
  const size_t capacity_;
  // We partition the following members into different cache lines
  // to avoid false sharing among Lookup, Release, Erase and Insert
  // operations in ClockCacheShard.

  ALIGN_AS(CACHE_LINE_SIZE)
  // Array of slots comprising the hash table.
  std::unique_ptr<ClockHandle[]> array_;

  ALIGN_AS(CACHE_LINE_SIZE)
  // Clock algorithm sweep pointer.
  std::atomic<uint32_t> clock_pointer_;

  ALIGN_AS(CACHE_LINE_SIZE)
  // Number of elements in the table.
  std::atomic<uint32_t> occupancy_;

  // Memory size for entries residing in the cache.
  std::atomic<size_t> usage_;
};  // class ClockHandleTable

// A single shard of sharded cache.
@@ -652,20 +712,26 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
                        Statistics* /*stats*/) override {
    return Lookup(key, hash);
  }

  Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;

  bool Release(Cache::Handle* handle, bool /*useful*/,
               bool erase_if_last_ref) override {
    return Release(handle, erase_if_last_ref);
  }

  bool IsReady(Cache::Handle* /*handle*/) override { return true; }
  void Wait(Cache::Handle* /*handle*/) override {}

  bool Ref(Cache::Handle* handle) override;

  bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
  void Erase(const Slice& key, uint32_t hash) override;

  size_t GetUsage() const override;
  size_t GetPinnedUsage() const override;

  void ApplyToSomeEntries(
@@ -675,20 +741,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  void EraseUnRefEntries() override;

  std::string GetPrintableOptions() const override { return std::string{}; }
 private:
  friend class ClockCache;
  // Requires an exclusive ref on h.
  void Evict(ClockHandle* h);

  // Free some space following strict clock policy until enough space
  // to hold (usage_ + charge) is freed or there are no evictable elements.
  void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);
@@ -703,34 +760,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
                          CacheMetadataChargePolicy metadata_charge_policy);
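  // For a rough (hypothetical) instance, ignoring metadata charges: with
  // capacity = 32MB and estimated_value_size = 1KB, the table must fit about
  // 32768 elements, which at kLoadFactor = 0.35 requires ~93K slots, so the
  // next power of two gives 17 hash bits (2^17 = 131072 slots).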
  // Whether to reject insertion if cache reaches its full capacity.
  std::atomic<bool> strict_capacity_limit_;
  ClockHandleTable table_;
};  // class ClockCacheShard
class ClockCache
@@ -743,19 +776,28 @@ class ClockCache
             bool strict_capacity_limit,
             CacheMetadataChargePolicy metadata_charge_policy =
                 kDontChargeCacheMetadata);
  ~ClockCache() override;

  const char* Name() const override { return "ClockCache"; }

  CacheShard* GetShard(uint32_t shard) override;
  const CacheShard* GetShard(uint32_t shard) const override;

  void* Value(Handle* handle) override;
  size_t GetCharge(Handle* handle) const override;
  uint32_t GetHash(Handle* handle) const override;
  DeleterFn GetDeleter(Handle* handle) const override;

  void DisownData() override;
 private:
  ClockCacheShard* shards_ = nullptr;

  int num_shards_;
};  // class ClockCache

}  // namespace clock_cache