diff --git a/HISTORY.md b/HISTORY.md
index 4a639a60a..4122a82c3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,6 +11,7 @@
 * Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
 * Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
 * Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
+* Fixed a memory safety bug in experimental HyperClockCache (#10768)
 
 ### Performance Improvements
 * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 0b07542c4..58a7f94bb 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -23,6 +23,12 @@
 namespace ROCKSDB_NAMESPACE {
 
 namespace hyper_clock_cache {
 
+inline uint64_t GetRefcount(uint64_t meta) {
+  return ((meta >> ClockHandle::kAcquireCounterShift) -
+          (meta >> ClockHandle::kReleaseCounterShift)) &
+         ClockHandle::kCounterMask;
+}
+
 static_assert(sizeof(ClockHandle) == 64U,
               "Expecting size / alignment with common cache line size");
 
@@ -49,6 +55,7 @@ ClockHandleTable::~ClockHandleTable() {
         break;
       case ClockHandle::kStateInvisible:  // rare but possible
       case ClockHandle::kStateVisible:
+        assert(GetRefcount(h.meta) == 0);
         h.FreeData();
 #ifndef NDEBUG
         Rollback(h.hash, &h);
@@ -562,10 +569,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
   }
   // Take ownership if no refs
   do {
-    uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
-                         (old_meta >> ClockHandle::kReleaseCounterShift)) &
-                        ClockHandle::kCounterMask;
-    if (refcount != 0) {
+    if (GetRefcount(old_meta) != 0) {
       // Not last ref at some point in time during this Release call
       // Correct for possible (but rare) overflow
       CorrectNearOverflow(old_meta, h->meta);
@@ -622,6 +626,8 @@ void ClockHandleTable::Ref(ClockHandle& h) {
   assert((old_meta >> ClockHandle::kStateShift) &
          ClockHandle::kStateShareableBit);
+  // Must have already had a reference
+  assert(GetRefcount(old_meta) > 0);
   (void)old_meta;
 }
 
@@ -671,10 +677,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
       old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
                     << ClockHandle::kStateShift);
       for (;;) {
-        uint64_t refcount =
-            ((old_meta >> ClockHandle::kAcquireCounterShift) -
-             (old_meta >> ClockHandle::kReleaseCounterShift)) &
-            ClockHandle::kCounterMask;
+        uint64_t refcount = GetRefcount(old_meta);
         assert(refcount > 0);
         if (refcount > 1) {
           // Not last ref at some point in time during this Erase call
@@ -683,8 +686,10 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
                             std::memory_order_release);
           break;
         } else if (h->meta.compare_exchange_weak(
-                       old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                     << ClockHandle::kStateShift)) {
+                       old_meta,
+                       uint64_t{ClockHandle::kStateConstruction}
+                           << ClockHandle::kStateShift,
+                       std::memory_order_acq_rel)) {
           // Took ownership
           assert(hash == h->hash);
           // TODO? Delay freeing?
@@ -740,20 +745,32 @@ void ClockHandleTable::ConstApplyToEntriesRange(
   for (uint32_t i = index_begin; i < index_end; i++) {
     ClockHandle& h = array_[i];
+    // Note: to avoid using compare_exchange, we have to be extra careful.
     uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
     // Check if it's an entry visible to lookups
     if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-      // Increment acquire counter
+      // Increment acquire counter. Note: it's possible that the entry has
+      // completely changed since we loaded old_meta, but incrementing acquire
+      // count is always safe. (Similar to optimistic Lookup here.)
       old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
                                   std::memory_order_acquire);
-      // Double-check
-      if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-        func(h);
+      // Check whether we actually acquired a reference.
+      if ((old_meta >> ClockHandle::kStateShift) &
+          ClockHandle::kStateShareableBit) {
+        // Apply func if appropriate
+        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+          func(h);
+        }
+        // Pretend we never took the reference
+        h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                         std::memory_order_release);
+        // No net change, so don't need to check for overflow
+      } else {
+        // For other states, incrementing the acquire counter has no effect
+        // so we don't need to undo it. Furthermore, we cannot safely undo
+        // it because we did not acquire a read reference to lock the
+        // entry in a Shareable state.
       }
-      // Pretend we never took the reference
-      h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
-                       std::memory_order_release);
-      // No net change, so don't need to check for overflow
     }
   }
 }
@@ -763,12 +780,9 @@ void ClockHandleTable::EraseUnRefEntries() {
     ClockHandle& h = array_[i];
 
     uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
-    uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
-                         (old_meta >> ClockHandle::kReleaseCounterShift)) &
-                        ClockHandle::kCounterMask;
     if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                     << ClockHandle::kStateShift) &&
-        refcount == 0 &&
+        GetRefcount(old_meta) == 0 &&
         h.meta.compare_exchange_strong(old_meta,
                                        uint64_t{ClockHandle::kStateConstruction}
                                            << ClockHandle::kStateShift,
@@ -877,13 +891,12 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
       // Only clock update entries with no outstanding refs
       continue;
     }
-    if (!(meta >> ClockHandle::kStateShift &
+    if (!((meta >> ClockHandle::kStateShift) &
           ClockHandle::kStateShareableBit)) {
       // Only clock update Shareable entries
      continue;
    }
-    // ModTableSize(old_clock_pointer + i));
-    if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible &&
+    if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
        acquire_count > 0) {
      // Decrement clock
      uint64_t new_count = std::min(acquire_count - 1,
@@ -1101,9 +1114,7 @@ size_t ClockCacheShard::GetPinnedUsage() const {
   table_.ConstApplyToEntriesRange(
       [&table_pinned_usage, charge_metadata](const ClockHandle& h) {
         uint64_t meta = h.meta.load(std::memory_order_relaxed);
-        uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) -
-                             (meta >> ClockHandle::kReleaseCounterShift)) &
-                            ClockHandle::kCounterMask;
+        uint64_t refcount = GetRefcount(meta);
         // Holding one ref for ConstApplyToEntriesRange
         assert(refcount > 0);
         if (refcount > 1) {
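
All of the clock_cache.cc hunks above lean on the same idea: a handle's acquire and release counters live in one atomic 64-bit `meta` word, the reference count is the masked difference of the two (which is exactly what the new `GetRefcount()` helper computes), and `ConstApplyToEntriesRange` now takes a real optimistic reference before calling `func` and undoes it with a `fetch_sub`. The standalone sketch below illustrates that arithmetic; it is not RocksDB code, and the shift/mask constants are simplified stand-ins for `ClockHandle`'s actual layout.

```cpp
// Standalone sketch (not RocksDB code): two monotonically increasing counters
// packed into one atomic 64-bit "meta" word, with the refcount derived as
// their masked difference. Constants are illustrative stand-ins only.
#include <atomic>
#include <cassert>
#include <cstdint>
#include <iostream>

namespace {

constexpr int kAcquireCounterShift = 0;
constexpr int kReleaseCounterShift = 21;
constexpr uint64_t kCounterMask = (uint64_t{1} << 21) - 1;
constexpr uint64_t kAcquireIncrement = uint64_t{1} << kAcquireCounterShift;
constexpr uint64_t kReleaseIncrement = uint64_t{1} << kReleaseCounterShift;

// Refcount = acquires - releases, modulo the counter width, so each counter
// can wrap independently without breaking the difference.
uint64_t GetRefcount(uint64_t meta) {
  return ((meta >> kAcquireCounterShift) - (meta >> kReleaseCounterShift)) &
         kCounterMask;
}

}  // namespace

int main() {
  std::atomic<uint64_t> meta{0};

  // Optimistic acquire: bump the acquire counter without a CAS loop.
  uint64_t old_meta =
      meta.fetch_add(kAcquireIncrement, std::memory_order_acquire);
  assert(GetRefcount(old_meta) == 0);     // no refs before we arrived
  assert(GetRefcount(meta.load()) == 1);  // we now hold one reference

  // Matching release: bump the release counter; the difference returns to 0.
  meta.fetch_add(kReleaseIncrement, std::memory_order_release);
  assert(GetRefcount(meta.load()) == 0);

  // ConstApplyToEntriesRange-style undo: subtract the acquire increment
  // instead of adding a release increment; net change to the word is zero.
  meta.fetch_add(kAcquireIncrement, std::memory_order_acquire);
  meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
  assert(GetRefcount(meta.load()) == 0);

  std::cout << "refcount at exit: " << GetRefcount(meta.load()) << std::endl;
  return 0;
}
```

The key property is that each counter only ever moves forward, so the subtraction stays correct modulo the counter width; the fetch_sub "undo" is only safe when the entry was actually locked in a Shareable state, which is precisely the case the patched `ConstApplyToEntriesRange` now checks for.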
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 53e1db42d..e849b770e 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -174,6 +174,7 @@ DECLARE_bool(mock_direct_io);
 DECLARE_bool(statistics);
 DECLARE_bool(sync);
 DECLARE_bool(use_fsync);
+DECLARE_uint64(stats_dump_period_sec);
 DECLARE_uint64(bytes_per_sync);
 DECLARE_uint64(wal_bytes_per_sync);
 DECLARE_int32(kill_random_test);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 067600ee9..abc07beec 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1063,4 +1063,8 @@ DEFINE_bool(
     "be preserved similarly under `FLAGS_expected_values_dir/unverified` when "
     "`--expected_values_dir` is nonempty.");
 
+DEFINE_uint64(stats_dump_period_sec,
+              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
+              "Gap between printing stats to log in seconds");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 020380dc2..d5af0c30a 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3095,6 +3095,8 @@ void InitializeOptionsFromFlags(
   options.experimental_mempurge_threshold =
       FLAGS_experimental_mempurge_threshold;
   options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
+  options.stats_dump_period_sec =
+      static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
   options.ttl = FLAGS_compaction_ttl;
   options.enable_pipelined_write = FLAGS_enable_pipelined_write;
   options.enable_write_thread_adaptive_yield =
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f194a0ddb..f803b060b 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -137,6 +137,8 @@ default_params = {
     "index_block_restart_interval": lambda: random.choice(range(1, 16)),
     "use_multiget": lambda: random.randint(0, 1),
     "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
+    # 0 = never (used by some), 10 = often (for threading bugs), 600 = default
+    "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
     "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
     # Test small max_manifest_file_size in a smaller chance, as most of the
     # time we want manifest history to be preserved to help debug
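
The db_stress changes simply plumb a new `--stats_dump_period_sec` flag through to `Options::stats_dump_period_sec`, which controls how often RocksDB dumps statistics to the info LOG, so the crash test can exercise the periodic stats-dump thread (0 = disabled, 10 = aggressive, 600 = the library default). Below is a minimal sketch of setting the same option directly on a database; the path and chosen values are illustrative only and error handling is elided.

```cpp
// Sketch: enabling periodic stats dumping via Options, the option the new
// db_stress flag feeds. Path and values are illustrative assumptions.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  // 0 disables periodic stats dumping; 10 dumps every 10 seconds, the
  // "often" setting db_crashtest.py uses to shake out threading bugs.
  options.stats_dump_period_sec = 10;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/stats_dump_example", &db);
  assert(s.ok());
  // ... run a workload; stats appear in the info LOG every 10 seconds ...
  delete db;
  return 0;
}
```

With the flag wired up, a run such as `db_stress --stats_dump_period_sec=10` should stress the stats-dump thread alongside normal traffic, and db_crashtest.py now samples the value from {0, 10, 600} as shown in the default_params change above.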