diff --git a/HISTORY.md b/HISTORY.md
index 96170b41b..083c4c27b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,7 @@
 ### New Features
 * Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
 * Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
+* Improved the operational safety of publishing a DB or SST files to many hosts by using different block cache hash seeds on different hosts. The exact behavior is controlled by the new option `ShardedCacheOptions::hash_seed`, which also documents the solved problem in more detail.
 
 ### Public API Changes
 * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists.
diff --git a/cache/cache_test.cc b/cache/cache_test.cc
index febed5b42..4585faacb 100644
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@@ -21,6 +21,7 @@
 #include "test_util/secondary_cache_test_util.h"
 #include "test_util/testharness.h"
 #include "util/coding.h"
+#include "util/hash_containers.h"
 #include "util/string_util.h"
 
 // HyperClockCache only supports 16-byte keys, so some of the tests
@@ -955,6 +956,97 @@ TEST_P(CacheTest, GetChargeAndDeleter) {
   cache_->Release(h1);
 }
 
+namespace {
+bool AreTwoCacheKeysOrdered(Cache* cache) {
+  std::vector<std::string> keys;
+  const auto callback = [&](const Slice& key, Cache::ObjectPtr /*value*/,
+                            size_t /*charge*/,
+                            const Cache::CacheItemHelper* /*helper*/) {
+    keys.push_back(key.ToString());
+  };
+  cache->ApplyToAllEntries(callback, /*opts*/ {});
+  EXPECT_EQ(keys.size(), 2U);
+  EXPECT_NE(keys[0], keys[1]);
+  return keys[0] < keys[1];
+}
+}  // namespace
+
+TEST_P(CacheTest, CacheUniqueSeeds) {
+  // kQuasiRandomHashSeed should generate unique seeds (up to 2 billion before
+  // repeating)
+  UnorderedSet<uint32_t> seeds_seen;
+  // Roughly sqrt(number of possible values) for a decent chance at detecting
+  // a random collision if it's possible (shouldn't be)
+  uint16_t kSamples = 20000;
+  seeds_seen.reserve(kSamples);
+
+  // The hash seed should affect the ordering of entries in the table, so we
+  // should have an extremely high chance of seeing two entries ordered both
+  // ways.
+  bool seen_forward_order = false;
+  bool seen_reverse_order = false;
+
+  for (int i = 0; i < kSamples; ++i) {
+    auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
+      opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
+      opts.num_shard_bits = 0;
+      opts.metadata_charge_policy = kDontChargeCacheMetadata;
+    });
+    auto val = cache->GetHashSeed();
+    ASSERT_TRUE(seeds_seen.insert(val).second);
+
+    ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
+    ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
+
+    if (AreTwoCacheKeysOrdered(cache.get())) {
+      seen_forward_order = true;
+    } else {
+      seen_reverse_order = true;
+    }
+  }
+
+  ASSERT_TRUE(seen_forward_order);
+  ASSERT_TRUE(seen_reverse_order);
+}
+
+TEST_P(CacheTest, CacheHostSeed) {
+  // kHostHashSeed should generate a consistent seed within this process
+  // (and in other processes on the same host, though we don't unit test
+  // that). We should also be able to use that chosen seed as an explicit
+  // option (for debugging), and we should verify consistent ordering of
+  // entries.
+  uint32_t expected_seed = 0;
+  bool expected_order = false;
+  // 10 iterations -> the chance of a random seed falsely appearing consistent
+  // should be low, just 1 in 2^9.
+  for (int i = 0; i < 10; ++i) {
+    auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
+      if (i != 5) {
+        opts.hash_seed = LRUCacheOptions::kHostHashSeed;
+      } else {
+        // Can be used as an explicit seed
+        opts.hash_seed = static_cast<int32_t>(expected_seed);
+        ASSERT_GE(opts.hash_seed, 0);
+      }
+      opts.num_shard_bits = 0;
+      opts.metadata_charge_policy = kDontChargeCacheMetadata;
+    });
+    ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
+    ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
+    uint32_t val = cache->GetHashSeed();
+    bool order = AreTwoCacheKeysOrdered(cache.get());
+    if (i != 0) {
+      ASSERT_EQ(val, expected_seed);
+      ASSERT_EQ(order, expected_order);
+    } else {
+      expected_seed = val;
+      expected_order = order;
+    }
+  }
+  // Printed for reference in case it's needed to reproduce other unit test
+  // failures on another host
+  fprintf(stderr, "kHostHashSeed -> %u\n", (unsigned)expected_seed);
+}
+
 INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
                         secondary_cache_test_util::GetTestingCacheTypes());
 INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest,
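The tests above exercise a pattern applications can use directly: let a cache pick a quasi-random seed, read it back, and pin that exact seed later to reproduce behavior. A minimal sketch against the APIs this PR adds (`ShardedCacheOptions::hash_seed`, `Cache::GetHashSeed()`, `MakeSharedCache()`); the helper name is hypothetical and error handling is omitted:

```cpp
#include <cassert>
#include <cstdint>
#include <memory>

#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical helper: create a cache with a quasi-random seed, then
// re-create one pinned to the exact seed that was chosen (e.g., to
// reproduce a hash-dependent problem observed on one instance).
std::shared_ptr<Cache> MakeReproducibleCache(size_t capacity) {
  LRUCacheOptions opts;
  opts.capacity = capacity;
  opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
  std::shared_ptr<Cache> cache = opts.MakeSharedCache();

  // Chosen seeds always fit in 31 bits, so any seed read back from
  // GetHashSeed() is valid as an explicit, non-negative hash_seed.
  uint32_t chosen = cache->GetHashSeed();
  assert(static_cast<int32_t>(chosen) >= 0);

  LRUCacheOptions repro_opts;
  repro_opts.capacity = capacity;
  repro_opts.hash_seed = static_cast<int32_t>(chosen);  // pin the seed
  return repro_opts.MakeSharedCache();
}
```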
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 80fbbe88f..bb6495990 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -130,7 +130,8 @@ HyperClockTable::HyperClockTable(
     size_t capacity, bool /*strict_capacity_limit*/,
     CacheMetadataChargePolicy metadata_charge_policy,
     MemoryAllocator* allocator,
-    const Cache::EvictionCallback* eviction_callback, const Opts& opts)
+    const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
+    const Opts& opts)
     : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
                                 metadata_charge_policy)),
       length_bits_mask_((size_t{1} << length_bits_) - 1),
@@ -138,7 +139,8 @@ HyperClockTable::HyperClockTable(
                                    kStrictLoadFactor)),
       array_(new HandleImpl[size_t{1} << length_bits_]),
       allocator_(allocator),
-      eviction_callback_(*eviction_callback) {
+      eviction_callback_(*eviction_callback),
+      hash_seed_(*hash_seed) {
   if (metadata_charge_policy ==
       CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
     usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
@@ -1010,7 +1012,7 @@ inline void HyperClockTable::Evict(size_t requested_charge,
       if (eviction_callback_) {
         took_ownership =
             eviction_callback_(ClockCacheShard::ReverseHash(
-                                   h.GetHash(), &unhashed),
+                                   h.GetHash(), &unhashed, hash_seed_),
                                reinterpret_cast<Cache::Handle*>(&h));
       }
       if (!took_ownership) {
@@ -1039,11 +1041,11 @@ ClockCacheShard<Table>::ClockCacheShard(
     size_t capacity, bool strict_capacity_limit,
     CacheMetadataChargePolicy metadata_charge_policy,
     MemoryAllocator* allocator,
-    const Cache::EvictionCallback* eviction_callback,
+    const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const typename Table::Opts& opts)
     : CacheShardBase(metadata_charge_policy),
       table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator,
-             eviction_callback, opts),
+             eviction_callback, hash_seed, opts),
       capacity_(capacity),
       strict_capacity_limit_(strict_capacity_limit) {
   // Initial charge metadata should not exceed capacity
@@ -1082,10 +1084,11 @@ void ClockCacheShard<Table>::ApplyToSomeEntries(
     *state = index_end << (sizeof(size_t) * 8u - length_bits);
   }
 
+  auto hash_seed = table_.GetHashSeed();
   table_.ConstApplyToEntriesRange(
-      [callback](const HandleImpl& h) {
+      [callback, hash_seed](const HandleImpl& h) {
         UniqueId64x2 unhashed;
-        callback(ReverseHash(h.hashed_key, &unhashed), h.value,
+        callback(ReverseHash(h.hashed_key, &unhashed, hash_seed), h.value,
                  h.GetTotalCharge(), h.helper);
       },
       index_begin, index_end, false);
@@ -1295,7 +1298,7 @@ HyperClockCache::HyperClockCache(const HyperClockCacheOptions& opts)
         table_opts.estimated_value_size = opts.estimated_entry_charge;
         new (cs) Shard(per_shard, opts.strict_capacity_limit,
                        opts.metadata_charge_policy, alloc, &eviction_callback_,
-                       table_opts);
+                       &hash_seed_, table_opts);
       });
 }
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index a9515146a..b2578b467 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -407,7 +407,7 @@ class HyperClockTable {
                  CacheMetadataChargePolicy metadata_charge_policy,
                  MemoryAllocator* allocator,
                  const Cache::EvictionCallback* eviction_callback,
-                 const Opts& opts);
+                 const uint32_t* hash_seed, const Opts& opts);
   ~HyperClockTable();
 
   Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
@@ -448,6 +448,8 @@ class HyperClockTable {
     return standalone_usage_.load(std::memory_order_relaxed);
   }
 
+  uint32_t GetHashSeed() const { return hash_seed_; }
+
   // Acquire/release N references
   void TEST_RefN(HandleImpl& handle, size_t n);
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
@@ -546,6 +548,9 @@ class HyperClockTable {
   // A reference to Cache::eviction_callback_
   const Cache::EvictionCallback& eviction_callback_;
 
+  // A reference to ShardedCacheBase::hash_seed_
+  const uint32_t& hash_seed_;
+
   // We partition the following members into different cache lines
   // to avoid false sharing among Lookup, Release, Erase and Insert
   // operations in ClockCacheShard.
@@ -573,7 +578,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
                   CacheMetadataChargePolicy metadata_charge_policy,
                   MemoryAllocator* allocator,
                   const Cache::EvictionCallback* eviction_callback,
-                  const typename Table::Opts& opts);
+                  const uint32_t* hash_seed, const typename Table::Opts& opts);
 
   // For CacheShard concept
   using HandleImpl = typename Table::HandleImpl;
@@ -583,22 +588,23 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
   static inline uint32_t HashPieceForSharding(HashCref hash) {
     return Upper32of64(hash[0]);
   }
 
-  static inline HashVal ComputeHash(const Slice& key) {
+  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
     assert(key.size() == kCacheKeySize);
     HashVal in;
     HashVal out;
     // NOTE: endian dependence
     // TODO: use GetUnaligned?
     std::memcpy(&in, key.data(), kCacheKeySize);
-    BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
+    BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]);
     return out;
   }
 
   // For reconstructing key from hashed_key. Requires the caller to provide
   // backing storage for the Slice in `unhashed`
   static inline Slice ReverseHash(const UniqueId64x2& hashed,
-                                  UniqueId64x2* unhashed) {
+                                  UniqueId64x2* unhashed, uint32_t seed) {
     BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1],
                         &(*unhashed)[0]);
+    (*unhashed)[0] ^= seed;
     // NOTE: endian dependence
     return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
  }
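The key property the pair of functions above preserves: XOR-ing the seed into one input word before a bijective mix keeps the whole key-to-hash mapping invertible, which is what lets eviction callbacks and `ApplyToAllEntries` reconstruct the original key bytes. A self-contained toy illustration (the rotate/XOR mix below is a stand-in for RocksDB's `BijectiveHash2x64`/`BijectiveUnhash2x64` in util/hash.h, not the real functions):

```cpp
#include <cassert>
#include <cstdint>

// Toy bijection with a trivial exact inverse, illustrating the invariant:
// XOR the seed into the input *before* a bijective mix, and the mapping
// stays invertible for any seed.
static inline uint64_t Rotl(uint64_t v, int s) {
  return (v << s) | (v >> (64 - s));
}
static inline uint64_t Rotr(uint64_t v, int s) {
  return (v >> s) | (v << (64 - s));
}
static uint64_t ToyHash(uint64_t in, uint32_t seed) {
  return Rotl(in ^ seed, 23) ^ 0x9E3779B97F4A7C15ULL;  // bijective mix
}
static uint64_t ToyUnhash(uint64_t hashed, uint32_t seed) {
  return Rotr(hashed ^ 0x9E3779B97F4A7C15ULL, 23) ^ seed;  // exact inverse
}

int main() {
  for (uint32_t seed : {0u, 1u, 0x7fffffffu}) {
    uint64_t key = 0x0123456789abcdefULL;
    assert(ToyUnhash(ToyHash(key, seed), seed) == key);  // round trip
  }
  return 0;
}
```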
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index 9e6f15062..1a9ba0442 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -279,8 +279,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
   using HashCref = uint32_t;
 
  public:
   // Function definitions expected as parameter to ShardedCache
-  static inline HashVal ComputeHash(const Slice& key) {
-    return Lower32of64(GetSliceNPHash64(key));
+  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
+    return Lower32of64(GetSliceNPHash64(key, seed));
   }
 
   // Separate from constructor so caller can easily make an array of LRUCache
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index c4f392976..6e5118fb0 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -397,7 +397,7 @@ class ClockCacheTest : public testing::Test {
     opts.estimated_value_size = 1;
     new (shard_)
         Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata,
-              /*allocator*/ nullptr, &eviction_callback_, opts);
+              /*allocator*/ nullptr, &eviction_callback_, &hash_seed_, opts);
   }
 
   Status Insert(const UniqueId64x2& hashed_key,
@@ -455,6 +455,7 @@ class ClockCacheTest : public testing::Test {
 
  private:
   Cache::EvictionCallback eviction_callback_;
+  uint32_t hash_seed_ = 0;
 };
 
 TEST_F(ClockCacheTest, Misc) {
diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc
index f8d518067..cb8555b35 100644
--- a/cache/sharded_cache.cc
+++ b/cache/sharded_cache.cc
@@ -13,16 +13,55 @@
 #include <cstdint>
 #include <memory>
 
+#include "env/unique_id_gen.h"
+#include "rocksdb/env.h"
 #include "util/hash.h"
 #include "util/math.h"
 #include "util/mutexlock.h"
 
 namespace ROCKSDB_NAMESPACE {
 
+namespace {
+// The generated seeds must fit in 31 bits so that any generated seed can be
+// set explicitly in ShardedCacheOptions::hash_seed, for diagnostic/debugging
+// purposes.
+constexpr uint32_t kSeedMask = 0x7fffffff;
+uint32_t DetermineSeed(int32_t hash_seed_option) {
+  if (hash_seed_option >= 0) {
+    // User-specified exact seed
+    return static_cast<uint32_t>(hash_seed_option);
+  }
+  static SemiStructuredUniqueIdGen gen;
+  if (hash_seed_option == ShardedCacheOptions::kHostHashSeed) {
+    std::string hostname;
+    Status s = Env::Default()->GetHostNameString(&hostname);
+    if (s.ok()) {
+      return GetSliceHash(hostname) & kSeedMask;
+    } else {
+      // Fall back on something stable within the process.
+      return static_cast<uint32_t>(gen.GetBaseUpper()) & kSeedMask;
+    }
+  } else {
+    // For kQuasiRandomHashSeed, and as the fallback for unrecognized values
+    uint32_t val = gen.GenerateNext<uint32_t>() & kSeedMask;
+    // Perform some 31-bit bijective transformations so that we get
+    // quasirandom, not just incrementing. (An incrementing seed from a
+    // random starting point would be fine, but hard to describe in a name.)
+    // See https://en.wikipedia.org/wiki/Quasirandom; we use a murmur-like
+    // transformation here for our bijection in the lower 31 bits.
+    // See https://en.wikipedia.org/wiki/MurmurHash
+    val *= /*31-bit prime*/ 1150630961;
+    val ^= (val & kSeedMask) >> 17;
+    val *= /*31-bit prime*/ 1320603883;
+    return val & kSeedMask;
+  }
+}
+}  // namespace
+
 ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts)
     : Cache(opts.memory_allocator),
       last_id_(1),
       shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1),
+      hash_seed_(DetermineSeed(opts.hash_seed)),
       strict_capacity_limit_(opts.strict_capacity_limit),
       capacity_(opts.capacity) {}
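Why can't `kQuasiRandomHashSeed` repeat a seed early? Each step of the mixing in `DetermineSeed()` is a bijection on the low 31 bits (odd multipliers are invertible mod 2^31, and the xor-shift is bijective), so the incrementing counter behind it cannot produce a duplicate until it wraps. A self-contained sketch with the constants copied from the diff; the loop bound is an arbitrary spot check, not a proof:

```cpp
#include <cassert>
#include <cstdint>
#include <unordered_set>

// Standalone copy of the 31-bit mixing in DetermineSeed() above.
constexpr uint32_t kSeedMask = 0x7fffffff;

uint32_t Mix31(uint32_t val) {
  val *= /*31-bit prime*/ 1150630961;  // odd => invertible mod 2^31
  val ^= (val & kSeedMask) >> 17;      // xor-shift => bijective on 31 bits
  val *= /*31-bit prime*/ 1320603883;  // odd => invertible mod 2^31
  return val & kSeedMask;
}

int main() {
  // Spot-check injectivity on a contiguous range of counter values,
  // mimicking the incrementing generator behind kQuasiRandomHashSeed.
  std::unordered_set<uint32_t> seen;
  for (uint32_t i = 0; i < 100000; ++i) {
    bool inserted = seen.insert(Mix31(i)).second;
    assert(inserted);  // no collisions expected
    (void)inserted;
  }
  return 0;
}
```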
diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h
index d689783d3..d78cfc246 100644
--- a/cache/sharded_cache.h
+++ b/cache/sharded_cache.h
@@ -34,8 +34,8 @@ class CacheShardBase {
   std::string GetPrintableOptions() const { return ""; }
 
   using HashVal = uint64_t;
   using HashCref = uint64_t;
-  static inline HashVal ComputeHash(const Slice& key) {
-    return GetSliceNPHash64(key);
+  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
+    return GetSliceNPHash64(key, seed);
   }
   static inline uint32_t HashPieceForSharding(HashCref hash) {
     return Lower32of64(hash);
   }
@@ -104,6 +104,8 @@ class ShardedCacheBase : public Cache {
   size_t GetUsage(Handle* handle) const override;
   std::string GetPrintableOptions() const override;
 
+  uint32_t GetHashSeed() const override { return hash_seed_; }
+
  protected:  // fns
   virtual void AppendPrintableOptions(std::string& str) const = 0;
   size_t GetPerShardCapacity() const;
@@ -112,6 +114,7 @@ class ShardedCacheBase : public Cache {
  protected:  // data
   std::atomic<uint64_t> last_id_;  // For NewId
   const uint32_t shard_mask_;
+  const uint32_t hash_seed_;
 
   // Dynamic configuration parameters, guarded by config_mutex_
   bool strict_capacity_limit_;
@@ -171,7 +174,7 @@ class ShardedCache : public ShardedCacheBase {
                 size_t charge, Handle** handle = nullptr,
                 Priority priority = Priority::LOW) override {
     assert(helper);
-    HashVal hash = CacheShard::ComputeHash(key);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
     auto h_out = reinterpret_cast<HandleImpl**>(handle);
     return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out,
                                  priority);
   }
@@ -181,7 +184,7 @@ class ShardedCache : public ShardedCacheBase {
                            const CacheItemHelper* helper, size_t charge,
                            bool allow_uncharged) override {
     assert(helper);
-    HashVal hash = CacheShard::ComputeHash(key);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
     HandleImpl* result = GetShard(hash).CreateStandalone(
         key, hash, obj, helper, charge, allow_uncharged);
     return reinterpret_cast<Handle*>(result);
   }
 
@@ -191,14 +194,14 @@ class ShardedCache : public ShardedCacheBase {
                  CreateContext* create_context = nullptr,
                  Priority priority = Priority::LOW,
                  Statistics* stats = nullptr) override {
-    HashVal hash = CacheShard::ComputeHash(key);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
     HandleImpl* result = GetShard(hash).Lookup(key, hash, helper,
                                                create_context, priority, stats);
     return reinterpret_cast<Handle*>(result);
   }
 
   void Erase(const Slice& key) override {
-    HashVal hash = CacheShard::ComputeHash(key);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
     GetShard(hash).Erase(key, hash);
   }
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 8fa93d8d7..83a027f5e 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -979,12 +979,14 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
   const size_t capacity = size_t{1} << 25;
   int iterations_tested = 0;
   for (bool partition : {false, true}) {
+    SCOPED_TRACE("Partition? " + std::to_string(partition));
     for (std::shared_ptr<Cache> cache :
          {NewLRUCache(capacity),
           HyperClockCacheOptions(
              capacity,
              BlockBasedTableOptions().block_size /*estimated_value_size*/)
              .MakeSharedCache()}) {
+      SCOPED_TRACE(std::string("Cache: ") + cache->Name());
       ++iterations_tested;
 
       Options options = CurrentOptions();
@@ -1278,6 +1280,7 @@ TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
   HyperClockCacheOptions hcc_opts{capacity, value_size_est};
   hcc_opts.num_shard_bits = 2;  // 4 shards
   hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+  hcc_opts.hash_seed = 0;  // deterministic hashing
   std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
   std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
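The `hash_seed = 0` line above pins hashing so that shard assignment, and any statistics derived from it, are reproducible across runs and hosts. A hypothetical helper showing the same pattern outside the test harness (the name, capacity, and charge values are illustrative only):

```cpp
#include <memory>

#include "rocksdb/cache.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical helper mirroring the test setup above: pin the seed (and
// shard count) so hash-dependent behavior is stable run to run.
std::shared_ptr<Cache> MakeDeterministicCacheForTest(size_t capacity) {
  HyperClockCacheOptions opts{capacity, /*estimated_entry_charge=*/8 * 1024};
  opts.num_shard_bits = 2;  // fixed 4 shards
  opts.hash_seed = 0;       // deterministic hashing
  return opts.MakeSharedCache();
}
```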
" + std::to_string(partition)); for (std::shared_ptr cache : {NewLRUCache(capacity), HyperClockCacheOptions( capacity, BlockBasedTableOptions().block_size /*estimated_value_size*/) .MakeSharedCache()}) { + SCOPED_TRACE(std::string("Cache: ") + cache->Name()); ++iterations_tested; Options options = CurrentOptions(); @@ -1278,6 +1280,7 @@ TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) { HyperClockCacheOptions hcc_opts{capacity, value_size_est}; hcc_opts.num_shard_bits = 2; // 4 shards hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata; + hcc_opts.hash_seed = 0; // deterministic hashing std::shared_ptr cache = hcc_opts.MakeSharedCache(); std::shared_ptr logger = std::make_shared(); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 4e36af1e2..bf4f9232a 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -258,6 +258,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, co.capacity = table_cache_size; co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; co.metadata_charge_policy = kDontChargeCacheMetadata; + // TODO: Consider a non-fixed seed once test fallout (prefetch_test) is + // dealt with + co.hash_seed = 0; table_cache_ = NewLRUCache(co); SetDbSessionId(); assert(!db_session_id_.empty()); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c75668c0d..fdc4fa905 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -700,6 +700,7 @@ void InternalStats::CacheEntryRoleStats::BeginCollection( cache_usage = cache->GetUsage(); table_size = cache->GetTableAddressCount(); occupancy = cache->GetOccupancyCount(); + hash_seed = cache->GetHashSeed(); } void InternalStats::CacheEntryRoleStats::EndCollection( @@ -724,7 +725,7 @@ std::string InternalStats::CacheEntryRoleStats::ToString( std::ostringstream str; str << "Block cache " << cache_id << " capacity: " << BytesToHumanString(cache_capacity) - << " usage: " << BytesToHumanString(cache_usage) + << " seed: " << hash_seed << " usage: " << BytesToHumanString(cache_usage) << " table_size: " << table_size << " occupancy: " << occupancy << " collections: " << collection_count << " last_copies: " << copies_of_last_collection diff --git a/db/internal_stats.h b/db/internal_stats.h index 7a600384a..a65ef629a 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -478,6 +478,7 @@ class InternalStats { uint32_t copies_of_last_collection = 0; uint64_t last_start_time_micros_ = 0; uint64_t last_end_time_micros_ = 0; + uint32_t hash_seed = 0; void Clear() { // Wipe everything except collection_count diff --git a/env/env_test.cc b/env/env_test.cc index 2f748846b..926e2f86f 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -3134,6 +3134,20 @@ TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) { t.Run(); } +TEST_F(EnvTest, SemiStructuredUniqueIdGenTestSmaller) { + // For small generated types, will cycle through all the possible values. 
diff --git a/env/unique_id_gen.h b/env/unique_id_gen.h
index 17e71e622..69033f8d3 100644
--- a/env/unique_id_gen.h
+++ b/env/unique_id_gen.h
@@ -14,6 +14,7 @@
 #include <cstdint>
 #include <cstring>
+#include <type_traits>
 
 #include "rocksdb/rocksdb_namespace.h"
 
@@ -61,6 +62,19 @@ class SemiStructuredUniqueIdGen {
   // to the next (thread safe).
   void GenerateNext(uint64_t* upper, uint64_t* lower);
 
+  // For generating smaller values. Will cycle through all the possibilities
+  // before repeating.
+  template <typename T>
+  T GenerateNext() {
+    static_assert(sizeof(T) <= sizeof(uint64_t));
+    static_assert(std::is_integral_v<T>);
+    uint64_t ignore, val;
+    GenerateNext(&ignore, &val);
+    return static_cast<T>(val);
+  }
+
+  uint64_t GetBaseUpper() const { return base_upper_; }
+
  private:
   uint64_t base_upper_;
   uint64_t base_lower_;
diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h
index bd7d5b09c..997a41499 100644
--- a/include/rocksdb/advanced_cache.h
+++ b/include/rocksdb/advanced_cache.h
@@ -404,6 +404,9 @@ class Cache {
 
   MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }
 
+  // See ShardedCacheOptions::hash_seed
+  virtual uint32_t GetHashSeed() const { return 0; }
+
   // EXPERIMENTAL
   // The following APIs are experimental and might change in the future.
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 9aadca947..3c49bf322 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -138,6 +138,35 @@ struct ShardedCacheOptions {
   // A SecondaryCache instance to use the non-volatile tier.
   std::shared_ptr<SecondaryCache> secondary_cache;
 
+  // See hash_seed comments below
+  static constexpr int32_t kQuasiRandomHashSeed = -1;
+  static constexpr int32_t kHostHashSeed = -2;
+
+  // EXPERT OPTION: Specifies how a hash seed should be determined for the
+  // cache, or specifies a specific seed (only recommended for diagnostics or
+  // testing).
+  //
+  // Background: it could be dangerous to have different cache instances
+  // access the same SST files with the same hash seed, as correlated unlucky
+  // hashing across hosts or restarts could cause a widespread issue, rather
+  // than an isolated one. For example, with smaller block caches, it is
+  // possible for large full Bloom filters in a set of SST files to be randomly
+  // clustered into one cache shard, causing mutex contention or a thrashing
+  // condition as there's little or no space left for other entries assigned to
+  // the shard. If a set of SST files is broadcast and used on many hosts, we
+  // should ensure all have an independent chance of balanced shards.
+  //
+  // Values >= 0 will be treated as fixed hash seeds. Values < 0 are reserved
+  // for methods of dynamically choosing a seed, currently:
+  // * kQuasiRandomHashSeed - Each cache created chooses a seed mostly randomly,
+  //   except that within a process, no seed is repeated until all have been
+  //   issued.
+  // * kHostHashSeed - The seed is determined based on hashing the host name.
+  //   Although this is arguably slightly worse for production reliability, it
+  //   solves the essential problem of cross-host correlation while ensuring
+  //   repeatable behavior on a host, for diagnostic purposes.
+  int32_t hash_seed = kHostHashSeed;
+
   ShardedCacheOptions() {}
   ShardedCacheOptions(
       size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
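A minimal sketch of the documented contract of the two constants above, using only APIs added in this PR (`hash_seed`, `MakeSharedCache()`, `GetHashSeed()`); the function name is hypothetical:

```cpp
#include <cassert>
#include <cstdint>

#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"

using namespace ROCKSDB_NAMESPACE;

void DemoSeedModes() {
  LRUCacheOptions a, b;
  a.capacity = b.capacity = 1 << 20;

  // kHostHashSeed (the default): all caches in this process (and in any
  // process on the same host) derive the same seed from the host name.
  a.hash_seed = LRUCacheOptions::kHostHashSeed;
  b.hash_seed = LRUCacheOptions::kHostHashSeed;
  uint32_t host_seed_a = a.MakeSharedCache()->GetHashSeed();
  uint32_t host_seed_b = b.MakeSharedCache()->GetHashSeed();
  assert(host_seed_a == host_seed_b);

  // kQuasiRandomHashSeed: each cache gets its own seed, with no repeats
  // until ~2^31 caches have been created in the process.
  a.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
  b.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
  uint32_t qr_seed_a = a.MakeSharedCache()->GetHashSeed();
  uint32_t qr_seed_b = b.MakeSharedCache()->GetHashSeed();
  assert(qr_seed_a != qr_seed_b);
}
```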
diff --git a/test_util/secondary_cache_test_util.h b/test_util/secondary_cache_test_util.h
index 3e6135b64..1cfb454b5 100644
--- a/test_util/secondary_cache_test_util.h
+++ b/test_util/secondary_cache_test_util.h
@@ -56,13 +56,15 @@ class WithCacheType : public TestCreateContext {
     if (type == kLRU) {
       LRUCacheOptions lru_opts;
       lru_opts.capacity = capacity;
+      lru_opts.hash_seed = 0;  // deterministic tests
       if (modify_opts_fn) {
         modify_opts_fn(lru_opts);
       }
-      return NewLRUCache(lru_opts);
+      return lru_opts.MakeSharedCache();
     }
     if (type == kHyperClock) {
       HyperClockCacheOptions hc_opts{capacity, estimated_value_size_};
+      hc_opts.hash_seed = 0;  // deterministic tests
       if (modify_opts_fn) {
         modify_opts_fn(hc_opts);
       }
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 19ca1b4c0..f6edcce58 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -33,6 +33,7 @@
 #include <cstddef>
 #include <memory>
 #include <mutex>
+#include <optional>
 #include <queue>
 #include <thread>
 #include <unordered_map>
@@ -287,7 +288,7 @@
 DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
 DEFINE_int64(seed, 0,
              "Seed base for random number generators. "
             "When 0 it is derived from the current time.");
-static int64_t seed_base;
+static std::optional<int64_t> seed_base;
 
 DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
@@ -2606,7 +2607,7 @@ struct ThreadState {
   SharedState* shared;
 
   explicit ThreadState(int index, int my_seed)
-      : tid(index), rand(seed_base + my_seed) {}
+      : tid(index), rand(*seed_base + my_seed) {}
 };
 
 class Duration {
@@ -3002,6 +3003,11 @@ class Benchmark {
     return allocator;
   }
 
+  static int32_t GetCacheHashSeed() {
+    // For a fixed Cache seed, we need a non-negative int32
+    return static_cast<int32_t>(*seed_base) & 0x7fffffff;
+  }
+
   static std::shared_ptr<Cache> NewCache(int64_t capacity) {
     if (capacity <= 0) {
       return nullptr;
     }
@@ -3010,17 +3016,19 @@ class Benchmark {
       fprintf(stderr, "Old clock cache implementation has been removed.\n");
       exit(1);
     } else if (FLAGS_cache_type == "hyper_clock_cache") {
-      return HyperClockCacheOptions(static_cast<size_t>(capacity),
-                                    FLAGS_block_size /*estimated_entry_charge*/,
-                                    FLAGS_cache_numshardbits)
-          .MakeSharedCache();
+      HyperClockCacheOptions hcco{
+          static_cast<size_t>(capacity),
+          static_cast<size_t>(FLAGS_block_size) /*estimated_entry_charge*/,
+          FLAGS_cache_numshardbits};
+      hcco.hash_seed = GetCacheHashSeed();
+      return hcco.MakeSharedCache();
     } else if (FLAGS_cache_type == "lru_cache") {
       LRUCacheOptions opts(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
          GetCacheAllocator(), kDefaultToAdaptiveMutex,
          kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);
-
+      opts.hash_seed = GetCacheHashSeed();
       if (!FLAGS_secondary_cache_uri.empty()) {
         Status s = SecondaryCache::CreateFromString(
             ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@@ -3051,7 +3059,7 @@ class Benchmark {
             NewCompressedSecondaryCache(secondary_cache_opts);
       }
 
-      return NewLRUCache(opts);
+      return opts.MakeSharedCache();
     } else {
       fprintf(stderr, "Cache type not supported.");
       exit(1);
@@ -4891,7 +4899,7 @@ class Benchmark {
         values_[i] = i;
       }
       RandomShuffle(values_.begin(), values_.end(),
-                    static_cast<uint32_t>(seed_base));
+                    static_cast<uint32_t>(*seed_base));
     }
   }
@@ -5003,7 +5011,7 @@ class Benchmark {
     // Default_random_engine provides slightly
     // improved throughput over mt19937.
     std::default_random_engine overwrite_gen{
-        static_cast<unsigned int>(seed_base)};
+        static_cast<unsigned int>(*seed_base)};
     std::bernoulli_distribution overwrite_decider(p);
 
     // Inserted key window is filled with the last N
@@ -5013,7 +5021,7 @@ class Benchmark {
     //  - random access is O(1)
     //  - insertion/removal at beginning/end is also O(1).
     std::deque<int64_t> inserted_key_window;
-    Random64 reservoir_id_gen(seed_base);
+    Random64 reservoir_id_gen(*seed_base);
 
     // --- Variables used in disposable/persistent keys simulation:
     // The following variables are used when
@@ -5050,7 +5058,7 @@ class Benchmark {
         ErrorExit();
       }
     }
-    Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
+    Random rnd_disposable_entry(static_cast<uint32_t>(*seed_base));
     std::string random_value;
     // Queue that stores scheduled timestamp of disposable entries deletes,
     // along with starting index of disposable entry keys to delete.
@@ -8516,7 +8524,7 @@ int db_bench_tool(int argc, char** argv) {
     uint64_t now = FLAGS_env->GetSystemClock()->NowMicros();
     seed_base = static_cast<int64_t>(now);
     fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n",
-            seed_base);
+            *seed_base);
   } else {
     seed_base = FLAGS_seed;
   }
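The `std::optional` change in db_bench appears intended to make any use of `seed_base` before initialization loud (dereferencing an empty `std::optional` is undefined behavior that checked builds catch) rather than silently reading 0. A condensed, self-contained sketch of the pattern, with hypothetical names:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <optional>

// std::optional distinguishes "not yet initialized" from any valid seed,
// including 0 derived legitimately from a flag.
static std::optional<int64_t> seed_base;

void InitSeed(int64_t flag_seed, uint64_t now_micros) {
  if (flag_seed == 0) {
    seed_base = static_cast<int64_t>(now_micros);  // derive from the clock
    printf("Set seed to %" PRId64 " because --seed was 0\n", *seed_base);
  } else {
    seed_base = flag_seed;
  }
}

int main() {
  InitSeed(/*flag_seed=*/0, /*now_micros=*/1234567890);
  // All later readers dereference the optional, e.g. per-thread seeds:
  int64_t per_thread_seed = *seed_base + /*thread index*/ 7;
  (void)per_thread_seed;
  return 0;
}
```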