Fix bug in HyperClockCache ApplyToEntries; cleanup (#10768)

Summary:
We have seen some rare crash test failures in HyperClockCache, and the source could well be a bug fixed in this change, in ClockHandleTable::ConstApplyToEntriesRange: it was not accounting for the fact that incrementing the acquire counter can be ineffective due to parallel updates, and when the increment is ineffective, it is incorrect to then decrement it.
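The essence of the fix, shown below as a minimal standalone sketch (with simplified, assumed constants and names rather than the real ClockHandle layout), is that the acquire-counter increment only counts as a taken reference when the pre-increment state had the Shareable bit set; only in that case may it be undone with a decrement.

```cpp
// Minimal sketch of the corrected scan pattern; NOT the actual RocksDB
// code. Bit layout, constant values, and names are assumptions made for
// illustration only.
#include <atomic>
#include <cstdint>
#include <functional>

namespace sketch {

constexpr int kStateShift = 61;                 // assumed: state in top bits
constexpr uint64_t kStateShareableBit = 0b010;  // assumed state bit
constexpr uint64_t kAcquireIncrement = 1;       // assumed counter step

struct Handle {
  std::atomic<uint64_t> meta{0};
};

// Visit one handle the way the fixed ConstApplyToEntriesRange does.
inline void ScanOne(Handle& h, uint64_t check_state_mask,
                    const std::function<void(const Handle&)>& func) {
  uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
  if ((old_meta >> kStateShift) & check_state_mask) {
    // Optimistically bump the acquire counter (always safe to do).
    old_meta = h.meta.fetch_add(kAcquireIncrement, std::memory_order_acquire);
    if ((old_meta >> kStateShift) & kStateShareableBit) {
      // The increment was effective: we hold a reference, so the entry is
      // locked in a Shareable state and can be read.
      if ((old_meta >> kStateShift) & check_state_mask) {
        func(h);
      }
      // Give the reference back.
      h.meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
    }
    // else: the increment had no effect, so there is nothing to undo, and
    // without a reference locking the entry it is not safe to decrement
    // (this unconditional decrement was the bug).
  }
}

}  // namespace sketch
```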

This change also includes some minor clean-up in HyperClockCache, and adds stats_dump_period_sec with a much lower period to the crash test. The periodic stats dump should be the primary caller of ApplyToEntries, through its collection of cache entry stats.
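As a rough illustration (stats_dump_period_sec and its 600-second default are real RocksDB options, as seen in the diffs below; the helper function itself is only a sketch), lowering the period in Options looks like this:

```cpp
// Sketch only: build Options with a much lower stats dump period so the
// periodic stats thread, and through it cache ApplyToEntries, runs far
// more often than with the 600-second default.
#include "rocksdb/options.h"

rocksdb::Options MakeOptionsWithFrequentStatsDump() {
  rocksdb::Options options;
  options.stats_dump_period_sec = 10;  // default is 600
  return options;
}
```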

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10768

Test Plan: haven't been able to reproduce the failure, but should be in a better state (bug fix and improved crash test)

Reviewed By: anand1976

Differential Revision: D40034747

Pulled By: anand1976

fbshipit-source-id: a06fcefe146e17ee35001984445cedcf3b63eb68
Branch: main
Author: Peter Dillinger, committed by Facebook GitHub Bot
Parent: f461e064ed
Commit: b205c6d029
Files changed:
1. HISTORY.md (1 line changed)
2. cache/clock_cache.cc (67 lines changed)
3. db_stress_tool/db_stress_common.h (1 line changed)
4. db_stress_tool/db_stress_gflags.cc (4 lines changed)
5. db_stress_tool/db_stress_test_base.cc (2 lines changed)
6. tools/db_crashtest.py (2 lines changed)

@ -11,6 +11,7 @@
* Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
* Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
* Fixed a memory safety bug in experimental HyperClockCache (#10768).
### Performance Improvements
* Try to align compaction output file boundaries to the next level's file boundaries, which can reduce compaction load by more than 10% for the default level compaction. The feature is enabled by default; to disable it, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.

@ -23,6 +23,12 @@ namespace ROCKSDB_NAMESPACE {
namespace hyper_clock_cache {
inline uint64_t GetRefcount(uint64_t meta) {
return ((meta >> ClockHandle::kAcquireCounterShift) -
(meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
}
static_assert(sizeof(ClockHandle) == 64U,
"Expecting size / alignment with common cache line size");
@ -49,6 +55,7 @@ ClockHandleTable::~ClockHandleTable() {
break;
case ClockHandle::kStateInvisible: // rare but possible
case ClockHandle::kStateVisible:
assert(GetRefcount(h.meta) == 0);
h.FreeData();
#ifndef NDEBUG
Rollback(h.hash, &h);
@ -562,10 +569,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
}
// Take ownership if no refs
do {
uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
if (refcount != 0) {
if (GetRefcount(old_meta) != 0) {
// Not last ref at some point in time during this Release call
// Correct for possible (but rare) overflow
CorrectNearOverflow(old_meta, h->meta);
@ -622,6 +626,8 @@ void ClockHandleTable::Ref(ClockHandle& h) {
assert((old_meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit);
// Must have already had a reference
assert(GetRefcount(old_meta) > 0);
(void)old_meta;
}
@ -671,10 +677,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
<< ClockHandle::kStateShift);
for (;;) {
uint64_t refcount =
((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
uint64_t refcount = GetRefcount(old_meta);
assert(refcount > 0);
if (refcount > 1) {
// Not last ref at some point in time during this Erase call
@ -683,8 +686,10 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
std::memory_order_release);
break;
} else if (h->meta.compare_exchange_weak(
old_meta, uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift)) {
old_meta,
uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift,
std::memory_order_acq_rel)) {
// Took ownership
assert(hash == h->hash);
// TODO? Delay freeing?
@ -740,20 +745,32 @@ void ClockHandleTable::ConstApplyToEntriesRange(
for (uint32_t i = index_begin; i < index_end; i++) {
ClockHandle& h = array_[i];
// Note: to avoid using compare_exchange, we have to be extra careful.
uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
// Check if it's an entry visible to lookups
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
// Increment acquire counter
// Increment acquire counter. Note: it's possible that the entry has
// completely changed since we loaded old_meta, but incrementing acquire
// count is always safe. (Similar to optimistic Lookup here.)
old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
std::memory_order_acquire);
// Double-check
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
func(h);
// Check whether we actually acquired a reference.
if ((old_meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit) {
// Apply func if appropriate
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
func(h);
}
// Pretend we never took the reference
h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
std::memory_order_release);
// No net change, so don't need to check for overflow
} else {
// For other states, incrementing the acquire counter has no effect
// so we don't need to undo it. Furthermore, we cannot safely undo
// it because we did not acquire a read reference to lock the
// entry in a Shareable state.
}
// Pretend we never took the reference
h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
std::memory_order_release);
// No net change, so don't need to check for overflow
}
}
}
@ -763,12 +780,9 @@ void ClockHandleTable::EraseUnRefEntries() {
ClockHandle& h = array_[i];
uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
<< ClockHandle::kStateShift) &&
refcount == 0 &&
GetRefcount(old_meta) == 0 &&
h.meta.compare_exchange_strong(old_meta,
uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift,
@ -877,13 +891,12 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
// Only clock update entries with no outstanding refs
continue;
}
if (!(meta >> ClockHandle::kStateShift &
if (!((meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit)) {
// Only clock update Shareable entries
continue;
}
// ModTableSize(old_clock_pointer + i));
if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible &&
if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
acquire_count > 0) {
// Decrement clock
uint64_t new_count = std::min(acquire_count - 1,
@ -1101,9 +1114,7 @@ size_t ClockCacheShard::GetPinnedUsage() const {
table_.ConstApplyToEntriesRange(
[&table_pinned_usage, charge_metadata](const ClockHandle& h) {
uint64_t meta = h.meta.load(std::memory_order_relaxed);
uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) -
(meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
uint64_t refcount = GetRefcount(meta);
// Holding one ref for ConstApplyToEntriesRange
assert(refcount > 0);
if (refcount > 1) {

@ -174,6 +174,7 @@ DECLARE_bool(mock_direct_io);
DECLARE_bool(statistics);
DECLARE_bool(sync);
DECLARE_bool(use_fsync);
DECLARE_uint64(stats_dump_period_sec);
DECLARE_uint64(bytes_per_sync);
DECLARE_uint64(wal_bytes_per_sync);
DECLARE_int32(kill_random_test);

@ -1063,4 +1063,8 @@ DEFINE_bool(
"be preserved similarly under `FLAGS_expected_values_dir/unverified` when "
"`--expected_values_dir` is nonempty.");
DEFINE_uint64(stats_dump_period_sec,
ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
"Gap between printing stats to log in seconds");
#endif // GFLAGS

@ -3095,6 +3095,8 @@ void InitializeOptionsFromFlags(
options.experimental_mempurge_threshold =
FLAGS_experimental_mempurge_threshold;
options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
options.stats_dump_period_sec =
static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
options.ttl = FLAGS_compaction_ttl;
options.enable_pipelined_write = FLAGS_enable_pipelined_write;
options.enable_write_thread_adaptive_yield =

@ -137,6 +137,8 @@ default_params = {
"index_block_restart_interval": lambda: random.choice(range(1, 16)),
"use_multiget": lambda: random.randint(0, 1),
"periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
# 0 = never (used by some), 10 = often (for threading bugs), 600 = default
"stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
"compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
# Test small max_manifest_file_size with a small probability, as most of the
# time we want manifest history to be preserved to help debug
