Ribbon: major re-work of hashing, seeds, and more (#7635)

Summary:
* Fully optimized StandardHasher, in terms of efficiently generating Start, CoeffRow, and ResultRow from a stock hash value, with sufficient independence between them to have no measurably degraded behavior. (Degraded behavior would be an FP rate higher than explainable by 2^-b and, if using a 32-bit stock hash function, expected stock hash collisions.) Details in code comments.
* Our standard 64-bit and 32-bit hash functions do not exhibit sufficient independence on sequential seeds (for one Ribbon construction attempt to have independent probability from the next). I have worked around this in the Ribbon code by "pre-mixing" "ordinal seeds," sequentially tried and appropriate for storage in persisted metadata, into "raw seeds," ready for application and appropriate for in-memory storage. This way the pre-mixing step (though fast) is only applied on loading or configuring the structure, not on each query or banding add.
* Fix a subtle flaw in which backtracking not clearing ResultRow data could lead to elevated FP rate on keys that were backtracked on and should (for generality) exhibit the same FP rate as novel keys.
* Added a basic test for PhsfQuery and construction algorithms (map or "retrieval structure" rather than set or filter), and made a few trivial related fixes.
* Better random configuration generation in unit tests
* Some other minor cleanup / clarification / etc.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7635

Test Plan: unit tests included

Reviewed By: jay-zhuang

Differential Revision: D24738978

Pulled By: pdillinger

fbshipit-source-id: f9d03599d9e2ca3e30e9d3e7d81cd936b56f76f0
main
Peter Dillinger 4 years ago committed by Facebook GitHub Bot
parent 1e40696dd1
commit 8b8a2e9f05
  1. 6
      util/hash.h
  2. 28
      util/ribbon_alg.h
  3. 253
      util/ribbon_impl.h
  4. 487
      util/ribbon_test.cc

@ -29,6 +29,8 @@ namespace ROCKSDB_NAMESPACE {
// Stable/persistent 64-bit hash. Higher quality and generally faster than // Stable/persistent 64-bit hash. Higher quality and generally faster than
// Hash(), especially for inputs > 24 bytes. // Hash(), especially for inputs > 24 bytes.
// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
// results from previous seed. Recommend incrementing by a large odd number.
extern uint64_t Hash64(const char* data, size_t n, uint64_t seed); extern uint64_t Hash64(const char* data, size_t n, uint64_t seed);
// Specific optimization without seed (same as seed = 0) // Specific optimization without seed (same as seed = 0)
@ -37,6 +39,8 @@ extern uint64_t Hash64(const char* data, size_t n);
// Non-persistent hash. Must only be used for in-memory data structures. // Non-persistent hash. Must only be used for in-memory data structures.
// The hash results are thus subject to change. (Thus, it rarely makes // The hash results are thus subject to change. (Thus, it rarely makes
// sense to specify a seed for this function.) // sense to specify a seed for this function.)
// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
// results from previous seed. Recommend incrementing by a large odd number.
inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed) { inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed) {
// Currently same as Hash64 // Currently same as Hash64
return Hash64(data, n, seed); return Hash64(data, n, seed);
@ -51,6 +55,8 @@ inline uint64_t NPHash64(const char* data, size_t n) {
// Stable/persistent 32-bit hash. Moderate quality and high speed on // Stable/persistent 32-bit hash. Moderate quality and high speed on
// small inputs. // small inputs.
// TODO: consider rename to Hash32 // TODO: consider rename to Hash32
// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
// results from previous seed. Recommend pseudorandom or hashed seeds.
extern uint32_t Hash(const char* data, size_t n, uint32_t seed); extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
// TODO: consider rename to LegacyBloomHash32 // TODO: consider rename to LegacyBloomHash32

@ -405,7 +405,10 @@ namespace ribbon {
// // big enough for the largest number of columns allowed. // // big enough for the largest number of columns allowed.
// typename ResultRow; // typename ResultRow;
// // An unsigned integer type sufficient for representing the number of // // An unsigned integer type sufficient for representing the number of
// // rows in the solution structure. (TODO: verify any extra needed?) // // rows in the solution structure, and at least the arithmetic
// // promotion size (usually 32 bits). uint32_t recommended because a
// // single Ribbon construction doesn't really scale to billions of
// // entries.
// typename Index; // typename Index;
// }; // };
@ -554,11 +557,10 @@ bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
int tz = CountTrailingZeroBits(cr); int tz = CountTrailingZeroBits(cr);
i += static_cast<Index>(tz); i += static_cast<Index>(tz);
cr >>= tz; cr >>= tz;
} else {
assert((cr & 1) == 1);
} }
for (;;) { for (;;) {
assert((cr & 1) == 1);
CoeffRow other = *(bs->CoeffRowPtr(i)); CoeffRow other = *(bs->CoeffRowPtr(i));
if (other == 0) { if (other == 0) {
*(bs->CoeffRowPtr(i)) = cr; *(bs->CoeffRowPtr(i)) = cr;
@ -568,16 +570,19 @@ bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
return true; return true;
} }
assert((other & 1) == 1); assert((other & 1) == 1);
// Gaussian row reduction
cr ^= other; cr ^= other;
rr ^= *(bs->ResultRowPtr(i)); rr ^= *(bs->ResultRowPtr(i));
if (cr == 0) { if (cr == 0) {
// Inconsistency or (less likely) redundancy // Inconsistency or (less likely) redundancy
break; break;
} }
// Find relative offset of next non-zero coefficient.
int tz = CountTrailingZeroBits(cr); int tz = CountTrailingZeroBits(cr);
i += static_cast<Index>(tz); i += static_cast<Index>(tz);
cr >>= tz; cr >>= tz;
} }
// Failed, unless result row == 0 because e.g. a duplicate input or a // Failed, unless result row == 0 because e.g. a duplicate input or a
// stock hash collision, with same result row. (For filter, stock hash // stock hash collision, with same result row. (For filter, stock hash
// collision implies same result row.) Or we could have a full equation // collision implies same result row.) Or we could have a full equation
@ -674,7 +679,11 @@ bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
--backtrack_pos; --backtrack_pos;
Index i = bts->BacktrackGet(backtrack_pos); Index i = bts->BacktrackGet(backtrack_pos);
*(bs->CoeffRowPtr(i)) = 0; *(bs->CoeffRowPtr(i)) = 0;
// Not required: *(bs->ResultRowPtr(i)) = 0; // Not strictly required, but is required for good FP rate on
// inputs that might have been backtracked out. (We don't want
// anything we've backtracked on to leak into final result, as
// that might not be "harmless".)
*(bs->ResultRowPtr(i)) = 0;
} }
} }
return false; return false;
@ -1088,8 +1097,8 @@ typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
const Hash hash = hasher.GetHash(key); const Hash hash = hasher.GetHash(key);
const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts()); const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
const Index upper_start_block = iss->GetUpperStartBlock(); const Index upper_start_block = iss.GetUpperStartBlock();
Index num_columns = iss->GetUpperNumColumns(); Index num_columns = iss.GetUpperNumColumns();
Index start_block_num = start_slot / kCoeffBits; Index start_block_num = start_slot / kCoeffBits;
Index segment = start_block_num * num_columns - Index segment = start_block_num * num_columns -
std::min(start_block_num, upper_start_block); std::min(start_block_num, upper_start_block);
@ -1103,14 +1112,14 @@ typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
ResultRow sr = 0; ResultRow sr = 0;
const CoeffRow cr_left = cr << start_bit; const CoeffRow cr_left = cr << start_bit;
for (Index i = 0; i < num_columns; ++i) { for (Index i = 0; i < num_columns; ++i) {
sr ^= BitParity(iss->LoadSegment(segment + i) & cr_left) << i; sr ^= BitParity(iss.LoadSegment(segment + i) & cr_left) << i;
} }
if (start_bit > 0) { if (start_bit > 0) {
segment += num_columns; segment += num_columns;
const CoeffRow cr_right = cr >> (kCoeffBits - start_bit); const CoeffRow cr_right = cr >> (kCoeffBits - start_bit);
for (Index i = 0; i < num_columns; ++i) { for (Index i = 0; i < num_columns; ++i) {
sr ^= BitParity(iss->LoadSegment(segment + i) & cr_right) << i; sr ^= BitParity(iss.LoadSegment(segment + i) & cr_right) << i;
} }
} }
@ -1158,6 +1167,9 @@ bool InterleavedFilterQuery(const typename FilterQueryHasher::Key &key,
const ResultRow expected = hasher.GetResultRowFromHash(hash); const ResultRow expected = hasher.GetResultRowFromHash(hash);
// TODO: consider optimizations such as
// * mask fetched values and shift cr, rather than shifting fetched values
// * get rid of start_bit == 0 condition with careful fetching & shifting
if (start_bit == 0) { if (start_bit == 0) {
for (Index i = 0; i < num_columns; ++i) { for (Index i = 0; i < num_columns; ++i) {
if (BitParity(iss.LoadSegment(segment + i) & cr) != if (BitParity(iss.LoadSegment(segment + i) & cr) !=

@ -39,7 +39,8 @@ namespace ribbon {
// static constexpr bool kFirstCoeffAlwaysOne; // static constexpr bool kFirstCoeffAlwaysOne;
// //
// // An unsigned integer type for identifying a hash seed, typically // // An unsigned integer type for identifying a hash seed, typically
// // uint32_t or uint64_t. // // uint32_t or uint64_t. Importantly, this is the amount of data
// // stored in memory for identifying a raw seed. See StandardHasher.
// typename Seed; // typename Seed;
// //
// // When true, the PHSF implements a static filter, expecting just // // When true, the PHSF implements a static filter, expecting just
@ -65,12 +66,7 @@ namespace ribbon {
// // A seedable stock hash function on Keys. All bits of Hash must // // A seedable stock hash function on Keys. All bits of Hash must
// // be reasonably high quality. XXH functions recommended, but // // be reasonably high quality. XXH functions recommended, but
// // Murmur, City, Farm, etc. also work. // // Murmur, City, Farm, etc. also work.
// // // static Hash HashFn(const Key &, Seed raw_seed);
// // If sequential seeds are not sufficiently independent for your
// // stock hash function, consider multiplying by a large odd constant.
// // If seed 0 is still undesirable, consider adding 1 before the
// // multiplication.
// static Hash HashFn(const Key &, Seed);
// }; // };
// A bit of a hack to automatically construct the type for // A bit of a hack to automatically construct the type for
@ -114,6 +110,12 @@ struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
0, \ 0, \
"avoid unused warnings, semicolon expected after macro call") "avoid unused warnings, semicolon expected after macro call")
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4309) // cast truncating constant
#pragma warning(disable : 4307) // arithmetic constant overflow
#endif
// StandardHasher: A standard implementation of concepts RibbonTypes, // StandardHasher: A standard implementation of concepts RibbonTypes,
// PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h. // PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h.
// //
@ -126,15 +128,31 @@ struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
// can do" with available hash information in terms of FP rate and // can do" with available hash information in terms of FP rate and
// compactness. (64 bits recommended and sufficient for PHSF practical // compactness. (64 bits recommended and sufficient for PHSF practical
// purposes.) // purposes.)
//
// Another feature of this hasher is a minimal "premixing" of seeds before
// they are provided to TypesAndSettings::HashFn in case that function does
// not provide sufficiently independent hashes when iterating merely
// sequentially on seeds. (This for example works around a problem with the
// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXH3p or Hash64, and
// MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step
// translates "ordinal seeds," which we iterate sequentially to find a
// solution, into "raw seeds," with many more bits changing for each
// iteration. The translation is an easily reversible lightweight mixing,
// not suitable for hashing on its own. An advantage of this approach is that
// StandardHasher can store just the raw seed (e.g. 64 bits) for fast query
// times, while from the application perspective, we can limit to a small
// number of ordinal seeds (e.g. 64 in 6 bits) for saving in metadata.
//
// The default constructor initializes the seed to ordinal seed zero, which
// is equal to raw seed zero.
//
template <class TypesAndSettings> template <class TypesAndSettings>
class StandardHasher { class StandardHasher {
public: public:
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
StandardHasher(Seed seed = 0) : seed_(seed) {}
inline Hash GetHash(const Key& key) const { inline Hash GetHash(const Key& key) const {
return TypesAndSettings::HashFn(key, seed_); return TypesAndSettings::HashFn(key, raw_seed_);
}; };
// For when AddInput == pair<Key, ResultRow> (kIsFilter == false) // For when AddInput == pair<Key, ResultRow> (kIsFilter == false)
inline Hash GetHash(const std::pair<Key, ResultRow>& bi) const { inline Hash GetHash(const std::pair<Key, ResultRow>& bi) const {
@ -180,18 +198,59 @@ class StandardHasher {
} }
} }
inline CoeffRow GetCoeffRow(Hash h) const { inline CoeffRow GetCoeffRow(Hash h) const {
// This is a reasonably cheap but empirically effective remix/expansion
// of the hash data to fill CoeffRow. (Large primes)
// This is not so much "critical path" code because it can be done in // This is not so much "critical path" code because it can be done in
// parallel (instruction level) with memory lookup. // parallel (instruction level) with memory lookup.
Unsigned128 a = Multiply64to128(h, 0x85EBCA77C2B2AE63U); //
Unsigned128 b = Multiply64to128(h, 0x27D4EB2F165667C5U); // We do not need exhaustive remixing for CoeffRow, but just enough that
auto cr = static_cast<CoeffRow>(b ^ (a << 64) ^ (a >> 64)); // (a) every bit is reasonably independent from Start.
// (b) every Hash-length bit subsequence of the CoeffRow has full or
// nearly full entropy from h.
// (c) if nontrivial bit subsequences within are correlated, it needs to
// be more complicated than exact copy or bitwise not (at least without
// kFirstCoeffAlwaysOne), or else there seems to be a kind of
// correlated clustering effect.
// (d) the CoeffRow is not zero, so that no one input on its own can
// doom construction success. (Preferably a mix of 1's and 0's if
// satisfying above.)
// First, establish sufficient bitwise independence from Start, with
// multiplication by a large random prime.
// Note that we cast to Hash because if we use product bits beyond
// original input size, that's going to correlate with Start (FastRange)
// even with a (likely) different multiplier here.
Hash a = h * kCoeffAndResultFactor;
// If that's big enough, we're done. If not, we have to expand it,
// maybe up to 4x size.
uint64_t b = a;
static_assert(
sizeof(Hash) == sizeof(uint64_t) || sizeof(Hash) == sizeof(uint32_t),
"Supported sizes");
if (sizeof(Hash) < sizeof(uint64_t)) {
// Almost-trivial hash expansion (OK - see above), favoring roughly
// equal number of 1's and 0's in result
b = (b << 32) ^ b ^ kCoeffXor32;
}
Unsigned128 c = b;
static_assert(sizeof(CoeffRow) == sizeof(uint64_t) ||
sizeof(CoeffRow) == sizeof(Unsigned128),
"Supported sizes");
if (sizeof(uint64_t) < sizeof(CoeffRow)) {
// Almost-trivial hash expansion (OK - see above), favoring roughly
// equal number of 1's and 0's in result
c = (c << 64) ^ c ^ kCoeffXor64;
}
auto cr = static_cast<CoeffRow>(c);
// Now ensure the value is non-zero
if (kFirstCoeffAlwaysOne) { if (kFirstCoeffAlwaysOne) {
cr |= 1; cr |= 1;
} else if (sizeof(CoeffRow) == sizeof(Hash)) {
// Still have to ensure some bit is non-zero
cr |= (cr == 0) ? 1 : 0;
} else { } else {
// Still have to ensure non-zero // (We did trivial expansion with constant xor, which ensures some
cr |= static_cast<unsigned>(cr == 0); // bits are non-zero.)
} }
return cr; return cr;
} }
@ -203,11 +262,19 @@ class StandardHasher {
} }
inline ResultRow GetResultRowFromHash(Hash h) const { inline ResultRow GetResultRowFromHash(Hash h) const {
if (TypesAndSettings::kIsFilter) { if (TypesAndSettings::kIsFilter) {
// In contrast to GetStart, here we draw primarily from lower bits,
// but not literally, which seemed to cause FP rate hit in some cases.
// This is not so much "critical path" code because it can be done in // This is not so much "critical path" code because it can be done in
// parallel (instruction level) with memory lookup. // parallel (instruction level) with memory lookup.
auto rr = static_cast<ResultRow>(h ^ (h >> 13) ^ (h >> 26)); //
// There is no evidence that ResultRow needs to be independent from
// CoeffRow, so we draw from the same bits computed for CoeffRow,
// which are reasonably independent from Start. (Inlining and common
// subexpression elimination with GetCoeffRow should make this
// a single shared multiplication in generated code.)
Hash a = h * kCoeffAndResultFactor;
// The bits here that are *most* independent of Start are the highest
// order bits (as in Knuth multiplicative hash). To make those the
// most preferred for use in the result row, we do a bswap here.
auto rr = static_cast<ResultRow>(EndianSwapValue(a));
return rr & GetResultRowMask(); return rr & GetResultRowMask();
} else { } else {
// Must be zero // Must be zero
@ -226,33 +293,80 @@ class StandardHasher {
return bi.second; return bi.second;
} }
bool NextSeed(Seed max_seed) { // Seed tracking APIs - see class comment
if (seed_ >= max_seed) { void SetRawSeed(Seed seed) { raw_seed_ = seed; }
return false; Seed GetRawSeed() { return raw_seed_; }
} else { void SetOrdinalSeed(Seed count) {
++seed_; // A simple, reversible mixing of any size (whole bytes) up to 64 bits.
return true; // This allows casting the raw seed to any smaller size we use for
} // ordinal seeds without risk of duplicate raw seeds for unique ordinal
// seeds.
// Seed type might be smaller than numerical promotion size, but Hash
// should be at least that size, so we use Hash as intermediate type.
static_assert(sizeof(Seed) <= sizeof(Hash),
"Hash must be at least size of Seed");
// Multiply by a large random prime (one-to-one for any prefix of bits)
Hash tmp = count * kToRawSeedFactor;
// Within-byte one-to-one mixing
static_assert((kSeedMixMask & (kSeedMixMask >> kSeedMixShift)) == 0,
"Illegal mask+shift");
tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift;
raw_seed_ = static_cast<Seed>(tmp);
// dynamic verification
assert(GetOrdinalSeed() == count);
}
Seed GetOrdinalSeed() {
Hash tmp = raw_seed_;
// Within-byte one-to-one mixing (its own inverse)
tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift;
// Multiply by 64-bit multiplicative inverse
static_assert(kToRawSeedFactor * kFromRawSeedFactor == Hash{1},
"Must be inverses");
return static_cast<Seed>(tmp * kFromRawSeedFactor);
} }
Seed GetSeed() const { return seed_; }
void ResetSeed(Seed seed = 0) { seed_ = seed; }
protected: protected:
Seed seed_; // For expanding hash:
// large random prime
static constexpr Hash kCoeffAndResultFactor =
static_cast<Hash>(0xc28f82822b650bedULL);
// random-ish data
static constexpr uint32_t kCoeffXor32 = 0xa6293635U;
static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU;
// For pre-mixing seeds
static constexpr Hash kSeedMixMask = static_cast<Hash>(0xf0f0f0f0f0f0f0f0ULL);
static constexpr unsigned kSeedMixShift = 4U;
static constexpr Hash kToRawSeedFactor =
static_cast<Hash>(0xc78219a23eeadd03ULL);
static constexpr Hash kFromRawSeedFactor =
static_cast<Hash>(0xfe1a137d14b475abULL);
// See class description
Seed raw_seed_ = 0;
}; };
// StandardRehasher (and StandardRehasherAdapter): A variant of // StandardRehasher (and StandardRehasherAdapter): A variant of
// StandardHasher that uses the same type for keys as for hashes. // StandardHasher that uses the same type for keys as for hashes.
// This is primarily intended for building a Ribbon filter/PHSF // This is primarily intended for building a Ribbon filter
// from existing hashes without going back to original inputs in order // from existing hashes without going back to original inputs in
// to apply a different seed. This hasher seeds a 1-to-1 mixing // order to apply a different seed. This hasher seeds a 1-to-1 mixing
// transformation to apply a seed to an existing hash (or hash-sized key). // transformation to apply a seed to an existing hash. (Untested for
// hash-sized keys that are not already uniformly distributed.) This
// transformation builds on the seed pre-mixing done in StandardHasher.
// //
// Testing suggests essentially no degradation of solution success rate // Testing suggests essentially no degradation of solution success rate
// vs. going back to original inputs when changing hash seeds. For example: // vs. going back to original inputs when changing hash seeds. For example:
// Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys // Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys
// is about 1.10 for both StandardHasher and StandardRehasher. // is about 1.10 for both StandardHasher and StandardRehasher.
// //
// StandardRehasher is not really recommended for general PHSFs (not
// filters) because a collision in the original hash could prevent
// construction despite re-seeding the Rehasher. (Such collisions
// do not interfere with filter construction.)
//
// concept RehasherTypesAndSettings: like TypesAndSettings but // concept RehasherTypesAndSettings: like TypesAndSettings but
// does not require Key or HashFn. // does not require Key or HashFn.
template <class RehasherTypesAndSettings> template <class RehasherTypesAndSettings>
@ -262,28 +376,20 @@ class StandardRehasherAdapter : public RehasherTypesAndSettings {
using Key = Hash; using Key = Hash;
using Seed = typename RehasherTypesAndSettings::Seed; using Seed = typename RehasherTypesAndSettings::Seed;
static Hash HashFn(const Hash& input, Seed seed) { static Hash HashFn(const Hash& input, Seed raw_seed) {
static_assert(sizeof(Hash) <= 8, "Hash too big"); // Note: raw_seed is already lightly pre-mixed, and this multiplication
if (sizeof(Hash) > 4) { // by a large prime is sufficient mixing (low-to-high bits) on top of
// XXH3_avalanche / XXH3p_avalanche (64-bit), modified for seed // that for good FastRange results, which depends primarily on highest
uint64_t h = input; // bits. (The hashed CoeffRow and ResultRow are less sensitive to
h ^= h >> 37; // mixing than Start.)
h ^= seed * uint64_t{0xC2B2AE3D27D4EB4F}; // Also note: did consider adding ^ (input >> some) before the
h *= uint64_t{0x165667B19E3779F9}; // multiplication, but doesn't appear to be necessary.
h ^= h >> 32; return (input ^ raw_seed) * kRehashFactor;
return static_cast<Hash>(h);
} else {
// XXH32_avalanche (32-bit), modified for seed
uint32_t h32 = static_cast<uint32_t>(input);
h32 ^= h32 >> 15;
h32 ^= seed * uint32_t{0x27D4EB4F};
h32 *= uint32_t{0x85EBCA77};
h32 ^= h32 >> 13;
h32 *= uint32_t{0xC2B2AE3D};
h32 ^= h32 >> 16;
return static_cast<Hash>(h32);
}
} }
private:
static constexpr Hash kRehashFactor =
static_cast<Hash>(0x6193d459236a3a0dULL);
}; };
// See comment on StandardRehasherAdapter // See comment on StandardRehasherAdapter
@ -291,6 +397,10 @@ template <class RehasherTypesAndSettings>
using StandardRehasher = using StandardRehasher =
StandardHasher<StandardRehasherAdapter<RehasherTypesAndSettings>>; StandardHasher<StandardRehasherAdapter<RehasherTypesAndSettings>>;
#ifdef _MSC_VER
#pragma warning(pop)
#endif
// Especially with smaller hashes (e.g. 32 bit), there can be noticeable // Especially with smaller hashes (e.g. 32 bit), there can be noticeable
// false positives due to collisions in the Hash returned by GetHash. // false positives due to collisions in the Hash returned by GetHash.
// This function returns the expected FP rate due to those collisions, // This function returns the expected FP rate due to those collisions,
@ -442,9 +552,17 @@ class StandardBanding : public StandardHasher<TypesAndSettings> {
// Iteratively (a) resets the structure for `num_slots`, (b) attempts // Iteratively (a) resets the structure for `num_slots`, (b) attempts
// to add the range of inputs, and (c) if unsuccessful, chooses next // to add the range of inputs, and (c) if unsuccessful, chooses next
// hash seed, until either successful or unsuccessful with max_seed // hash seed, until either successful or unsuccessful with all the
// (minimum one seed attempted). Returns true if successful. In that // allowed seeds. Returns true if successful. In that case, use
// case, use GetSeed() to get the successful seed. // GetOrdinalSeed() or GetRawSeed() to get the successful seed.
//
// The allowed sequence of hash seeds is determined by
// `starting_ordinal_seed`, the first ordinal seed to be attempted
// (see StandardHasher), and `ordinal_seed_mask`, a bit mask (power of
// two minus one) for the range of ordinal seeds to consider. The
// max number of seeds considered will be ordinal_seed_mask + 1.
// For filters we suggest `starting_ordinal_seed` be chosen randomly
// or round-robin, to minimize false positive correlations between keys.
// //
// If unsuccessful, how best to continue is going to be application // If unsuccessful, how best to continue is going to be application
// specific. It should be possible to choose parameters such that // specific. It should be possible to choose parameters such that
@ -459,16 +577,27 @@ class StandardBanding : public StandardHasher<TypesAndSettings> {
// significant correlation in success, rather than independence.) // significant correlation in success, rather than independence.)
template <typename InputIterator> template <typename InputIterator>
bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin, bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin,
InputIterator end, Seed max_seed) { InputIterator end,
StandardHasher<TypesAndSettings>::ResetSeed(); Seed starting_ordinal_seed = 0U,
Seed ordinal_seed_mask = 63U) {
// power of 2 minus 1
assert((ordinal_seed_mask & (ordinal_seed_mask + 1)) == 0);
// starting seed is within mask
assert((starting_ordinal_seed & ordinal_seed_mask) ==
starting_ordinal_seed);
starting_ordinal_seed &= ordinal_seed_mask; // if not debug
Seed cur_ordinal_seed = starting_ordinal_seed;
do { do {
StandardHasher<TypesAndSettings>::SetOrdinalSeed(cur_ordinal_seed);
Reset(num_slots); Reset(num_slots);
bool success = AddRange(begin, end); bool success = AddRange(begin, end);
if (success) { if (success) {
return true; return true;
} }
} while (StandardHasher<TypesAndSettings>::NextSeed(max_seed)); cur_ordinal_seed = (cur_ordinal_seed + 1) & ordinal_seed_mask;
// No seed through max_seed worked. } while (cur_ordinal_seed != starting_ordinal_seed);
// Reached limit by circling around
return false; return false;
} }

@ -27,22 +27,119 @@ class RibbonTypeParamTest : public ::testing::Test {};
class RibbonTest : public ::testing::Test {}; class RibbonTest : public ::testing::Test {};
namespace {
// Different ways of generating keys for testing
// Generate semi-sequential keys: a caller-supplied prefix followed by a
// trailing 8-byte field that is rewritten from a running id on each
// dereference. Iterator-like: supports prefix ++, unary *, == and !=.
struct StandardKeyGen {
  // `prefix` becomes the leading bytes of every key; `id` is the starting
  // position in the sequence. PutFixed64 is expected to append the 8
  // placeholder bytes that operator* later overwrites (it writes at
  // size() - 8).
  StandardKeyGen(const std::string& prefix, uint64_t id)
      : id_(id), str_(prefix) {
    ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0);
  }

  // Prefix (only one required)
  StandardKeyGen& operator++() {
    ++id_;
    return *this;
  }

  // Returns the key for the current id. The reference is to an internal
  // buffer whose trailing 8 bytes are re-encoded on every call, so this
  // cannot be const.
  const std::string& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    return str_;
  }

  // Comparisons only read id_, so they are const-qualified; this allows
  // comparing against const generators (e.g. a const "end" marker).
  bool operator==(const StandardKeyGen& other) const {
    // Same prefix is assumed
    return id_ == other.id_;
  }
  bool operator!=(const StandardKeyGen& other) const {
    // Same prefix is assumed
    return id_ != other.id_;
  }

  uint64_t id_;      // current position in the key sequence
  std::string str_;  // prefix + 8-byte scratch field for the encoded id
};
// Generate small sequential keys, that can misbehave with sequential seeds
// as in https://github.com/Cyan4973/xxHash/issues/469.
// These keys are only heuristically unique, but that's OK with 64 bits,
// for testing purposes.
// Iterator-like: supports prefix ++, unary *, == and !=.
struct SmallKeyGen {
  // The prefix is not stored in the key; it only contributes a hashed
  // offset to the starting id. The key itself is just the fixed-width
  // encoding of id_.
  SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) {
    // Hash the prefix for a heuristically unique offset
    id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix);
    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
  }

  // Prefix (only one required)
  SmallKeyGen& operator++() {
    ++id_;
    return *this;
  }

  // Returns the key for the current id. The reference is to an internal
  // buffer that is re-encoded on every call, so this cannot be const.
  const std::string& operator*() {
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_);
    return str_;
  }

  // Comparisons only read id_, so they are const-qualified; this allows
  // comparing against const generators (e.g. a const "end" marker).
  bool operator==(const SmallKeyGen& other) const { return id_ == other.id_; }
  bool operator!=(const SmallKeyGen& other) const { return id_ != other.id_; }

  uint64_t id_;      // current id, already offset by the prefix hash
  std::string str_;  // buffer holding the encoded id
};
// Adapts a key generator so that dereferencing yields a 32-bit hash of the
// generated key instead of the key itself. Increment and comparison are
// inherited unchanged from KeyGen.
template <typename KeyGen>
struct Hash32KeyGenWrapper : public KeyGen {
  Hash32KeyGenWrapper(const std::string& prefix, uint64_t id)
      : KeyGen(prefix, id) {}

  uint32_t operator*() {
    // Produce the underlying key via the base generator's dereference,
    // then hash it (unseeded).
    const auto& base_key = this->KeyGen::operator*();
    return ROCKSDB_NAMESPACE::GetSliceHash(base_key);
  }
};
// Adapts a key generator so that dereferencing yields a 64-bit hash of the
// generated key instead of the key itself. Increment and comparison are
// inherited unchanged from KeyGen.
template <typename KeyGen>
struct Hash64KeyGenWrapper : public KeyGen {
  Hash64KeyGenWrapper(const std::string& prefix, uint64_t id)
      : KeyGen(prefix, id) {}

  uint64_t operator*() {
    // Produce the underlying key via the base generator's dereference,
    // then hash it (unseeded).
    const auto& base_key = this->KeyGen::operator*();
    return ROCKSDB_NAMESPACE::GetSliceHash64(base_key);
  }
};
} // namespace
using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate;
using ROCKSDB_NAMESPACE::ribbon::StandardHasher;
using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter;
struct DefaultTypesAndSettings { struct DefaultTypesAndSettings {
using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
using ResultRow = uint8_t; using ResultRow = uint8_t;
using Index = uint32_t; using Index = uint32_t;
using Hash = uint64_t; using Hash = uint64_t;
using Key = ROCKSDB_NAMESPACE::Slice;
using Seed = uint32_t; using Seed = uint32_t;
using Key = ROCKSDB_NAMESPACE::Slice;
static constexpr bool kIsFilter = true; static constexpr bool kIsFilter = true;
static constexpr bool kFirstCoeffAlwaysOne = true; static constexpr bool kFirstCoeffAlwaysOne = true;
static constexpr bool kUseSmash = false; static constexpr bool kUseSmash = false;
static constexpr bool kAllowZeroStarts = false; static constexpr bool kAllowZeroStarts = false;
static Hash HashFn(const Key& key, Seed seed) { static Hash HashFn(const Key& key, uint64_t raw_seed) {
// TODO/FIXME: is there sufficient independence with sequential keys and // This version 0.7.2 preview of XXH3 (a.k.a. XXH3p) function does
// sequential seeds? // not pass SmallKeyGen tests below without some seed premixing from
return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed); // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
} }
// For testing
using KeyGen = StandardKeyGen;
}; };
using TypesAndSettings_Coeff128 = DefaultTypesAndSettings; using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
@ -62,16 +159,19 @@ struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash1 {
struct TypesAndSettings_Result16 : public DefaultTypesAndSettings { struct TypesAndSettings_Result16 : public DefaultTypesAndSettings {
using ResultRow = uint16_t; using ResultRow = uint16_t;
}; };
struct TypesAndSettings_Result32 : public DefaultTypesAndSettings {
using ResultRow = uint32_t;
};
struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings { struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings {
using Index = size_t; using Index = size_t;
}; };
struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings { struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings {
using Hash = uint32_t; using Hash = uint32_t;
static Hash HashFn(const Key& key, Seed seed) { static Hash HashFn(const Key& key, Hash raw_seed) {
// NOTE: Using RocksDB 32-bit Hash() here fails test below because of // This MurmurHash1 function does not pass tests below without the
// insufficient mixing of seed (or generally insufficient mixing) // seed premixing from StandardHasher. In fact, it needs more than
return ROCKSDB_NAMESPACE::Upper32of64( // just a multiplication mixer on the ordinal seed.
ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed)); return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
} }
}; };
struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 { struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
@ -81,6 +181,9 @@ struct TypesAndSettings_KeyString : public DefaultTypesAndSettings {
using Key = std::string; using Key = std::string;
}; };
struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings { struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings {
// This is not a generally recommended configuration. With the configured
// hash function, it would fail with SmallKeyGen due to insufficient
// independence among the seeds.
using Seed = uint8_t; using Seed = uint8_t;
}; };
struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings { struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
@ -89,78 +192,58 @@ struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
struct TypesAndSettings_AllowZeroStarts : public DefaultTypesAndSettings { struct TypesAndSettings_AllowZeroStarts : public DefaultTypesAndSettings {
static constexpr bool kAllowZeroStarts = true; static constexpr bool kAllowZeroStarts = true;
}; };
struct TypesAndSettings_RehasherWrapped : public DefaultTypesAndSettings { struct TypesAndSettings_Seed64 : public DefaultTypesAndSettings {
// This doesn't directly use StandardRehasher as a whole, but simulates using Seed = uint64_t;
// its behavior with unseeded hash of key, then seeded hash-to-hash
// transform.
static Hash HashFn(const Key& key, Seed seed) {
Hash unseeded = DefaultTypesAndSettings::HashFn(key, /*seed*/ 0);
using Rehasher = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
DefaultTypesAndSettings>;
return Rehasher::HashFn(unseeded, seed);
}
}; };
struct TypesAndSettings_RehasherWrapped_Result16 struct TypesAndSettings_Rehasher
: public TypesAndSettings_RehasherWrapped { : public StandardRehasherAdapter<DefaultTypesAndSettings> {
using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
using ResultRow = uint16_t; using ResultRow = uint16_t;
}; };
struct TypesAndSettings_Rehasher32Wrapped : public TypesAndSettings_Hash32 { struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
// This doesn't directly use StandardRehasher as a whole, but simulates using ResultRow = uint32_t;
// its behavior with unseeded hash of key, then seeded hash-to-hash };
// transform. struct TypesAndSettings_Rehasher_Seed64
static Hash HashFn(const Key& key, Seed seed) { : public StandardRehasherAdapter<TypesAndSettings_Seed64> {
Hash unseeded = TypesAndSettings_Hash32::HashFn(key, /*seed*/ 0); using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
using Rehasher = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter< // Note: 64-bit seed with Rehasher gives slightly better average reseeds
TypesAndSettings_Hash32>; };
return Rehasher::HashFn(unseeded, seed); struct TypesAndSettings_Rehasher32
} : public StandardRehasherAdapter<TypesAndSettings_Hash32> {
using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher32_Coeff64
: public TypesAndSettings_Rehasher32 {
using CoeffRow = uint64_t;
};
// Test configuration: inherits everything from DefaultTypesAndSettings and
// overrides only the KeyGen alias used to produce test keys.
struct TypesAndSettings_SmallKeyGen : public DefaultTypesAndSettings {
// SmallKeyGen stresses the independence of different hash seeds
using KeyGen = SmallKeyGen;
};
// Test configuration: inherits everything from TypesAndSettings_Hash32 and
// overrides only the KeyGen alias used to produce test keys.
struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
// SmallKeyGen stresses the independence of different hash seeds
using KeyGen = SmallKeyGen;
};
using TestTypesAndSettings = ::testing::Types< using TestTypesAndSettings = ::testing::Types<
TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash, TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash0, TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash0,
TypesAndSettings_Coeff64Smash1, TypesAndSettings_Result16, TypesAndSettings_Coeff64Smash1, TypesAndSettings_Result16,
TypesAndSettings_IndexSizeT, TypesAndSettings_Hash32, TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
TypesAndSettings_Hash32_Result16, TypesAndSettings_KeyString, TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
TypesAndSettings_Seed8, TypesAndSettings_NoAlwaysOne, TypesAndSettings_KeyString, TypesAndSettings_Seed8,
TypesAndSettings_AllowZeroStarts, TypesAndSettings_RehasherWrapped, TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
TypesAndSettings_RehasherWrapped_Result16, TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
TypesAndSettings_Rehasher32Wrapped>; TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
TypesAndSettings_Hash32_SmallKeyGen>;
TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings); TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
namespace { namespace {
// Iterator-like generator of test keys. Each key is a caller-supplied prefix
// followed by a trailing 8-byte field derived from a running id. Two
// generators built from the same prefix can serve as a [begin, end) pair:
// advance with ++, read the current key with *, and compare with == / !=.
struct KeyGen {
  // Creates a generator positioned at `id`.
  // NOTE(review): relies on PutFixed64 appending 8 bytes to str_, which
  // operator* later overwrites in place -- confirm against the coding helpers.
  KeyGen(const std::string& prefix, uint64_t id) : id_(id), str_(prefix) {
    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
  }

  // Prefix (only one required)
  KeyGen& operator++() {
    ++id_;
    return *this;
  }

  // Returns the key for the current id. The returned reference points at
  // internal storage that is rewritten on every dereference.
  const std::string& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    return str_;
  }

  // Comparisons only read id_, so they are const-qualified and usable on
  // const generators. Same prefix is assumed.
  bool operator==(const KeyGen& other) const { return id_ == other.id_; }
  bool operator!=(const KeyGen& other) const { return id_ != other.id_; }

  uint64_t id_;
  std::string str_;
};
// For testing Poisson-distributed (or similar) statistics, get value for // For testing Poisson-distributed (or similar) statistics, get value for
// `stddevs_allowed` standard deviations above expected mean // `stddevs_allowed` standard deviations above expected mean
// `expected_count`. // `expected_count`.
@ -199,14 +282,13 @@ uint64_t InfrequentPoissonLowerBound(double expected_count) {
TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) { TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam); IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
IMPORT_RIBBON_IMPL_TYPES(TypeParam); IMPORT_RIBBON_IMPL_TYPES(TypeParam);
using KeyGen = typename TypeParam::KeyGen;
// For testing FP rate etc. // For testing FP rate etc.
constexpr Index kNumToCheck = 100000; constexpr Index kNumToCheck = 100000;
const auto log2_thoroughness = const auto log2_thoroughness =
static_cast<Seed>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness)); static_cast<Hash>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
// FIXME: This upper bound seems excessive
const Seed max_seed = 12 + log2_thoroughness;
// With overhead of just 2%, expect ~50% encoding success per // With overhead of just 2%, expect ~50% encoding success per
// seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon. // seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon.
@ -224,12 +306,15 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
uint64_t isoln_query_nanos = 0; uint64_t isoln_query_nanos = 0;
uint64_t isoln_query_count = 0; uint64_t isoln_query_count = 0;
// Take different samples if you change thoroughness
ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);
for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) { for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
Index num_to_add = uint32_t num_to_add =
sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5500 : 2500; sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5500 : 2500;
// Use different values between that number and 50% of that number // Use different values between that number and 50% of that number
num_to_add -= (i * /* misc prime */ 15485863) % (num_to_add / 2); num_to_add -= rnd.Uniformish(num_to_add / 2);
total_added += num_to_add; total_added += num_to_add;
@ -243,19 +328,21 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
// Round to nearest multiple of kCoeffBits // Round to nearest multiple of kCoeffBits
num_slots = ((num_slots + kCoeffBits / 2) / kCoeffBits) * kCoeffBits; num_slots = ((num_slots + kCoeffBits / 2) / kCoeffBits) * kCoeffBits;
// Re-adjust num_to_add to get as close as possible to kFactor // Re-adjust num_to_add to get as close as possible to kFactor
num_to_add = static_cast<Index>(num_slots / kFactor); num_to_add = static_cast<uint32_t>(num_slots / kFactor);
} }
std::string prefix; std::string prefix;
// Take different samples if you change thoroughness ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());
ROCKSDB_NAMESPACE::PutFixed32(&prefix,
i + (FLAGS_thoroughness * 123456789U));
// Batch that must be added // Batch that must be added
std::string added_str = prefix + "added"; std::string added_str = prefix + "added";
KeyGen keys_begin(added_str, 0); KeyGen keys_begin(added_str, 0);
KeyGen keys_end(added_str, num_to_add); KeyGen keys_end(added_str, num_to_add);
// A couple more that will probably be added
KeyGen one_more(prefix + "more", 1);
KeyGen two_more(prefix + "more", 2);
// Batch that may or may not be added // Batch that may or may not be added
const Index kBatchSize = const Index kBatchSize =
sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10; sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10;
@ -268,11 +355,19 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
KeyGen other_keys_begin(not_str, 0); KeyGen other_keys_begin(not_str, 0);
KeyGen other_keys_end(not_str, kNumToCheck); KeyGen other_keys_end(not_str, kNumToCheck);
// Vary bytes uniformly for InterleavedSoln to use number of solution // Vary bytes for InterleavedSoln to use number of solution columns
// columns varying from 0 to max allowed by ResultRow type (and used by // from 0 to max allowed by ResultRow type (and used by SimpleSoln).
// SimpleSoln). // Specifically include 0 and max, and otherwise skew toward max.
size_t ibytes = uint32_t max_ibytes = static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
(i * /* misc odd */ 67896789) % (sizeof(ResultRow) * num_to_add + 1); size_t ibytes;
if (i == 0) {
ibytes = 0;
} else if (i == 1) {
ibytes = max_ibytes;
} else {
// Skewed
ibytes = std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
}
std::unique_ptr<char[]> idata(new char[ibytes]); std::unique_ptr<char[]> idata(new char[ibytes]);
InterleavedSoln isoln(idata.get(), ibytes); InterleavedSoln isoln(idata.get(), ibytes);
@ -284,20 +379,23 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
{ {
Banding banding; Banding banding;
// Traditional solve for a fixed set. // Traditional solve for a fixed set.
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, keys_begin, ASSERT_TRUE(
keys_end, max_seed)); banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));
// Now to test backtracking, starting with guaranteed fail // Now to test backtracking, starting with guaranteed fail. By using
// the keys that will be used to test FP rate, we are then doing an
// extra check that after backtracking there are no remnants (e.g. in
// result side of banding) of these entries.
Index occupied_count = banding.GetOccupiedCount(); Index occupied_count = banding.GetOccupiedCount();
banding.EnsureBacktrackSize(kNumToCheck); banding.EnsureBacktrackSize(kNumToCheck);
ASSERT_FALSE( EXPECT_FALSE(
banding.AddRangeOrRollBack(other_keys_begin, other_keys_end)); banding.AddRangeOrRollBack(other_keys_begin, other_keys_end));
ASSERT_EQ(occupied_count, banding.GetOccupiedCount()); EXPECT_EQ(occupied_count, banding.GetOccupiedCount());
// Check that we still have a good chance of adding a couple more // Check that we still have a good chance of adding a couple more
// individually // individually
first_single = banding.Add("one_more"); first_single = banding.Add(*one_more);
second_single = banding.Add("two_more"); second_single = banding.Add(*two_more);
Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0); Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0);
total_single_failures += 2U - more_added; total_single_failures += 2U - more_added;
@ -307,12 +405,12 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
more_added += kBatchSize; more_added += kBatchSize;
++total_batch_successes; ++total_batch_successes;
} }
ASSERT_LE(banding.GetOccupiedCount(), occupied_count + more_added); EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
// Also verify that redundant adds are OK (no effect) // Also verify that redundant adds are OK (no effect)
ASSERT_TRUE( ASSERT_TRUE(
banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8))); banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
ASSERT_LE(banding.GetOccupiedCount(), occupied_count + more_added); EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
// Now back-substitution // Now back-substitution
soln.BackSubstFrom(banding); soln.BackSubstFrom(banding);
@ -320,39 +418,42 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
isoln.BackSubstFrom(banding); isoln.BackSubstFrom(banding);
} }
Seed seed = banding.GetSeed(); Seed reseeds = banding.GetOrdinalSeed();
total_reseeds += seed; total_reseeds += reseeds;
if (seed > log2_thoroughness + 1) {
fprintf(stderr, "%s high reseeds at %u, %u/%u: %u\n", EXPECT_LE(reseeds, 8 + log2_thoroughness);
seed > log2_thoroughness + 8 ? "FIXME Extremely" : "Somewhat", if (reseeds > log2_thoroughness + 1) {
static_cast<unsigned>(i), static_cast<unsigned>(num_to_add), fprintf(
static_cast<unsigned>(num_slots), static_cast<unsigned>(seed)); stderr, "%s high reseeds at %u, %u/%u: %u\n",
reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
} }
hasher.ResetSeed(seed); hasher.SetOrdinalSeed(reseeds);
} }
// soln and hasher now independent of Banding object // soln and hasher now independent of Banding object
// Verify keys added // Verify keys added
KeyGen cur = keys_begin; KeyGen cur = keys_begin;
while (cur != keys_end) { while (cur != keys_end) {
EXPECT_TRUE(soln.FilterQuery(*cur, hasher)); ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher)); ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
++cur; ++cur;
} }
// We (maybe) snuck these in! // We (maybe) snuck these in!
if (first_single) { if (first_single) {
EXPECT_TRUE(soln.FilterQuery("one_more", hasher)); ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery("one_more", hasher)); ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
} }
if (second_single) { if (second_single) {
EXPECT_TRUE(soln.FilterQuery("two_more", hasher)); ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery("two_more", hasher)); ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
} }
if (batch_success) { if (batch_success) {
cur = batch_begin; cur = batch_begin;
while (cur != batch_end) { while (cur != batch_end) {
EXPECT_TRUE(soln.FilterQuery(*cur, hasher)); ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher)); ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
++cur; ++cur;
} }
} }
@ -364,7 +465,8 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(), ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
true); true);
while (cur != other_keys_end) { while (cur != other_keys_end) {
fp_count += soln.FilterQuery(*cur, hasher) ? 1 : 0; bool fp = soln.FilterQuery(*cur, hasher);
fp_count += fp ? 1 : 0;
++cur; ++cur;
} }
soln_query_nanos += timer.ElapsedNanos(); soln_query_nanos += timer.ElapsedNanos();
@ -375,8 +477,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
// For expected FP rate, also include false positives due to collisions // For expected FP rate, also include false positives due to collisions
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.) // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
double correction = double correction =
kNumToCheck * ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate( kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
hasher, num_to_add);
EXPECT_LE(fp_count, EXPECT_LE(fp_count,
FrequentPoissonUpperBound(expected_fp_count + correction)); FrequentPoissonUpperBound(expected_fp_count + correction));
EXPECT_GE(fp_count, EXPECT_GE(fp_count,
@ -401,8 +502,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
// For expected FP rate, also include false positives due to collisions // For expected FP rate, also include false positives due to collisions
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.) // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
double correction = double correction =
kNumToCheck * ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate( kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
hasher, num_to_add);
EXPECT_LE(ifp_count, EXPECT_LE(ifp_count,
FrequentPoissonUpperBound(expected_fp_count + correction)); FrequentPoissonUpperBound(expected_fp_count + correction));
EXPECT_GE(ifp_count, EXPECT_GE(ifp_count,
@ -448,12 +548,17 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness; double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
fprintf(stderr, "Average re-seeds: %g\n", average_reseeds); fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
// Values above were chosen to target around 50% chance of encoding success // Values above were chosen to target around 50% chance of encoding success
// rate (average of 1.0 re-seeds) or slightly better. But 1.1 is also close // rate (average of 1.0 re-seeds) or slightly better. But 1.15 is also close
// enough. // enough.
EXPECT_LE(total_reseeds, EXPECT_LE(total_reseeds,
InfrequentPoissonUpperBound(1.1 * FLAGS_thoroughness)); InfrequentPoissonUpperBound(1.15 * FLAGS_thoroughness));
// Would use 0.85 here instead of 0.75, but
// TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
// sequential keys with a small, cheap hash function. We accept that
// there are surely inputs that are somewhat bad for this setup, but
// these somewhat good inputs are probably more likely.
EXPECT_GE(total_reseeds, EXPECT_GE(total_reseeds,
InfrequentPoissonLowerBound(0.9 * FLAGS_thoroughness)); InfrequentPoissonLowerBound(0.75 * FLAGS_thoroughness));
} }
{ {
@ -489,8 +594,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.) // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
double average_added = 1.0 * total_added / FLAGS_thoroughness; double average_added = 1.0 * total_added / FLAGS_thoroughness;
expected_total_fp_count += expected_total_fp_count +=
total_checked * ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate( total_checked * ExpectedCollisionFpRate(Hasher(), average_added);
Hasher(), average_added);
uint64_t upper_bound = InfrequentPoissonUpperBound(expected_total_fp_count); uint64_t upper_bound = InfrequentPoissonUpperBound(expected_total_fp_count);
uint64_t lower_bound = InfrequentPoissonLowerBound(expected_total_fp_count); uint64_t lower_bound = InfrequentPoissonLowerBound(expected_total_fp_count);
@ -499,10 +603,6 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
expected_total_fp_count / total_checked, expected_total_fp_count / total_checked,
1.0 * upper_bound / total_checked, 1.0 * upper_bound / total_checked,
1.0 * lower_bound / total_checked); 1.0 * lower_bound / total_checked);
// FIXME: this can fail for Result16, e.g. --thoroughness=300
// Seems due to inexpensive hashing in StandardHasher::GetCoeffRow and
// GetResultRowFromHash as replacing those with different Hash64 instances
// fixes it, at least mostly.
EXPECT_LE(total_fp_count, upper_bound); EXPECT_LE(total_fp_count, upper_bound);
EXPECT_GE(total_fp_count, lower_bound); EXPECT_GE(total_fp_count, lower_bound);
} }
@ -511,6 +611,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
TYPED_TEST(RibbonTypeParamTest, Extremes) { TYPED_TEST(RibbonTypeParamTest, Extremes) {
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam); IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
IMPORT_RIBBON_IMPL_TYPES(TypeParam); IMPORT_RIBBON_IMPL_TYPES(TypeParam);
using KeyGen = typename TypeParam::KeyGen;
size_t bytes = 128 * 1024; size_t bytes = 128 * 1024;
std::unique_ptr<char[]> buf(new char[bytes]); std::unique_ptr<char[]> buf(new char[bytes]);
@ -523,7 +624,8 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
// Add zero keys to minimal number of slots // Add zero keys to minimal number of slots
KeyGen begin_and_end("foo", 123); KeyGen begin_and_end("foo", 123);
ASSERT_TRUE(banding.ResetAndFindSeedToSolve( ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
/*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*max_seed*/ 0)); /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
/* seed mask*/ 0));
soln.BackSubstFrom(banding); soln.BackSubstFrom(banding);
isoln.BackSubstFrom(banding); isoln.BackSubstFrom(banding);
@ -547,9 +649,10 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
// Solutions are equivalent // Solutions are equivalent
ASSERT_EQ(isoln_query_result, soln_query_result); ASSERT_EQ(isoln_query_result, soln_query_result);
// And in fact we only expect an FP when ResultRow is 0 // And in fact we only expect an FP when ResultRow is 0
ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash( // CHANGE: no longer true because of filling some unused slots
hasher.GetHash(*cur)) == ResultRow{0}); // with pseudorandom values.
// ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
// hasher.GetHash(*cur)) == ResultRow{0});
fp_count += soln_query_result ? 1 : 0; fp_count += soln_query_result ? 1 : 0;
++cur; ++cur;
} }
@ -567,7 +670,8 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
KeyGen key_begin("added", 0); KeyGen key_begin("added", 0);
KeyGen key_end("added", 1); KeyGen key_end("added", 1);
ASSERT_TRUE(banding.ResetAndFindSeedToSolve( ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
/*slots*/ kCoeffBits, key_begin, key_end, /*max_seed*/ 0)); /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
/* seed mask*/ 0));
InterleavedSoln isoln2(nullptr, /*bytes*/ 0); InterleavedSoln isoln2(nullptr, /*bytes*/ 0);
@ -584,6 +688,7 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
TEST(RibbonTest, AllowZeroStarts) { TEST(RibbonTest, AllowZeroStarts) {
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts); IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts); IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
using KeyGen = StandardKeyGen;
InterleavedSoln isoln(nullptr, /*bytes*/ 0); InterleavedSoln isoln(nullptr, /*bytes*/ 0);
SimpleSoln soln; SimpleSoln soln;
@ -593,17 +698,16 @@ TEST(RibbonTest, AllowZeroStarts) {
KeyGen begin("foo", 0); KeyGen begin("foo", 0);
KeyGen end("foo", 1); KeyGen end("foo", 1);
// Can't add 1 entry // Can't add 1 entry
ASSERT_FALSE( ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));
banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end, /*max_seed*/ 5));
KeyGen begin_and_end("foo", 123); KeyGen begin_and_end("foo", 123);
// Can add 0 entries // Can add 0 entries
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end, ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
begin_and_end, /*max_seed*/ 5)); begin_and_end));
Seed seed = banding.GetSeed(); Seed reseeds = banding.GetOrdinalSeed();
ASSERT_EQ(seed, 0U); ASSERT_EQ(reseeds, 0U);
hasher.ResetSeed(seed); hasher.SetOrdinalSeed(reseeds);
// Can construct 0-slot solutions // Can construct 0-slot solutions
isoln.BackSubstFrom(banding); isoln.BackSubstFrom(banding);
@ -618,6 +722,123 @@ TEST(RibbonTest, AllowZeroStarts) {
ASSERT_EQ(soln.ExpectedFpRate(), 0.0); ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
} }
// Exercises the ordinal-seed -> raw-seed translation of StandardHasher under
// several Seed/Hash configurations: ordinal 0 must map to raw 0, the
// configurations must agree on their shared low bits, and within each tested
// bit-width prefix no raw value may repeat.
TEST(RibbonTest, RawAndOrdinalSeeds) {
  StandardHasher<TypesAndSettings_Seed64> seed64_hasher;
  StandardHasher<DefaultTypesAndSettings> default_hasher;
  StandardHasher<TypesAndSettings_Hash32> hash32_hasher;
  StandardHasher<TypesAndSettings_Seed8> seed8_hasher;

  for (uint32_t mask : {0xffU, 0xffffU}) {
    // One flag per possible masked raw value
    std::vector<bool> already_produced(mask + 1);

    for (uint32_t ordinal = 0; ordinal < mask; ++ordinal) {
      // Apply the same ordinal seed everywhere (truncated for the 8-bit one)
      seed64_hasher.SetOrdinalSeed(ordinal);
      hash32_hasher.SetOrdinalSeed(ordinal);
      seed8_hasher.SetOrdinalSeed(static_cast<uint8_t>(ordinal));
      default_hasher.SetOrdinalSeed(ordinal);

      auto raw_from_seed64 = seed64_hasher.GetRawSeed();
      auto raw_from_hash32 = hash32_hasher.GetRawSeed();
      auto raw_from_seed8 = seed8_hasher.GetRawSeed();
      auto raw_from_default = default_hasher.GetRawSeed();

      // Same size seed
      ASSERT_EQ(raw_from_default, raw_from_hash32);

      if (ordinal != 0) {
        // Extremely likely that upper bits are set
        ASSERT_GT(raw_from_seed64, raw_from_hash32);
        ASSERT_GT(raw_from_hash32, raw_from_seed8);
      } else {
        // Documented that ordinal seed 0 == raw seed 0
        ASSERT_EQ(raw_from_seed64, 0U);
        ASSERT_EQ(raw_from_hash32, 0U);
        ASSERT_EQ(raw_from_seed8, 0U);
      }

      // Hashers agree on lower bits
      ASSERT_EQ(static_cast<uint32_t>(raw_from_seed64), raw_from_hash32);
      ASSERT_EQ(static_cast<uint8_t>(raw_from_hash32), raw_from_seed8);

      // The translation is one-to-one for this size prefix
      uint32_t masked = static_cast<uint32_t>(raw_from_hash32 & mask);
      ASSERT_EQ(raw_from_seed64 & mask, masked);
      ASSERT_FALSE(already_produced[masked]);
      already_produced[masked] = true;
    }
  }
}
namespace {
// Iterator-like generator of (key, value) inputs for the PHSF test. The key
// is a caller-supplied prefix followed by a trailing 8-byte field derived
// from a running id; the value is a uint8_t derived from the same id. Two
// generators built from the same prefix can serve as a [begin, end) pair.
struct PhsfInputGen {
  // Creates a generator positioned at `id`.
  // NOTE(review): relies on PutFixed64 appending 8 bytes to the key, which
  // operator* later overwrites in place -- confirm against the coding helpers.
  PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) {
    val_.first = prefix;
    ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0);
  }

  // Prefix (only one required)
  PhsfInputGen& operator++() {
    ++id_;
    return *this;
  }

  // Returns the (key, value) pair for the current id. The returned reference
  // points at internal storage that is rewritten on every dereference.
  const std::pair<std::string, uint8_t>& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    // Occasionally repeat values etc.
    val_.second = static_cast<uint8_t>(id_ * 7 / 8);
    return val_;
  }

  // Member access goes through operator* so the pair is refreshed first.
  const std::pair<std::string, uint8_t>* operator->() { return &**this; }

  // Comparisons only read id_, so they are const-qualified and usable on
  // const generators. Same prefix is assumed.
  bool operator==(const PhsfInputGen& other) const { return id_ == other.id_; }
  bool operator!=(const PhsfInputGen& other) const { return id_ != other.id_; }

  uint64_t id_;
  std::pair<std::string, uint8_t> val_;
};
// Settings used by the PhsfBasic test: identical to DefaultTypesAndSettings
// except that kIsFilter is overridden to false.
// NOTE(review): presumably this selects the map / "retrieval structure" mode
// (PhsfQuery) instead of the set / filter mode -- confirm in ribbon_impl.h.
struct PhsfTypesAndSettings : public DefaultTypesAndSettings {
static constexpr bool kIsFilter = false;
};
} // namespace
// Basic end-to-end check of the non-filter (PHSF) path: band a range of
// (key, value) inputs, back-substitute into both solution layouts, then
// confirm that PhsfQuery returns the stored value for every input key.
TEST(RibbonTest, PhsfBasic) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings);
  IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings);

  Index slot_count = 12800;
  Index input_count = static_cast<Index>(slot_count / 1.02);

  PhsfInputGen first_input("in", 0);
  PhsfInputGen past_last_input("in", input_count);

  // Backing storage for the interleaved solution, one byte per slot
  std::unique_ptr<char[]> interleaved_buf(new char[/*bytes*/ slot_count]);
  InterleavedSoln interleaved(interleaved_buf.get(), /*bytes*/ slot_count);
  SimpleSoln simple;
  Hasher query_hasher;

  {
    // Banding is only needed during construction
    Banding construction;
    ASSERT_TRUE(construction.ResetAndFindSeedToSolve(slot_count, first_input,
                                                     past_last_input));

    simple.BackSubstFrom(construction);
    interleaved.BackSubstFrom(construction);

    query_hasher.SetOrdinalSeed(construction.GetOrdinalSeed());
  }

  PhsfInputGen it = first_input;
  while (it != past_last_input) {
    const std::pair<std::string, uint8_t>& entry = *it;
    ASSERT_EQ(entry.second, simple.PhsfQuery(entry.first, query_hasher));
    ASSERT_EQ(entry.second, interleaved.PhsfQuery(entry.first, query_hasher));
    ++it;
  }
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv); ::testing::InitGoogleTest(&argc, argv);
#ifdef GFLAGS #ifdef GFLAGS

Loading…
Cancel
Save