diff --git a/CMakeLists.txt b/CMakeLists.txt
index 505d400f3..c9f36ada1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1178,6 +1178,7 @@ if(WITH_TESTS)
         util/random_test.cc
         util/rate_limiter_test.cc
         util/repeatable_thread_test.cc
+        util/ribbon_test.cc
         util/slice_test.cc
         util/slice_transform_test.cc
         util/timer_queue_test.cc
diff --git a/Makefile b/Makefile
index c89358db4..29c36d61f 100644
--- a/Makefile
+++ b/Makefile
@@ -631,6 +631,7 @@ ifdef ASSERT_STATUS_CHECKED
 	sst_file_reader_test \
 	range_tombstone_fragmenter_test \
 	repeatable_thread_test \
+	ribbon_test \
 	skiplist_test \
 	slice_test \
 	sst_dump_test \
@@ -708,6 +709,7 @@ TESTS_PLATFORM_DEPENDENT := \
 	io_posix_test \
 	hash_test \
 	random_test \
+	ribbon_test \
 	thread_local_test \
 	work_queue_test \
 	rate_limiter_test \
@@ -1420,6 +1422,9 @@ hash_test: $(OBJ_DIR)/util/hash_test.o $(TEST_LIBRARY) $(LIBRARY)
 random_test: $(OBJ_DIR)/util/random_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+ribbon_test: $(OBJ_DIR)/util/ribbon_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 option_change_migration_test: $(OBJ_DIR)/utilities/option_change_migration/option_change_migration_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/TARGETS b/TARGETS
index 6433dbe06..83260b24a 100644
--- a/TARGETS
+++ b/TARGETS
@@ -1804,6 +1804,13 @@ ROCKS_TESTS = [
         [],
         [],
     ],
+    [
+        "ribbon_test",
+        "util/ribbon_test.cc",
+        "serial",
+        [],
+        [],
+    ],
     [
         "sim_cache_test",
         "utilities/simulator_cache/sim_cache_test.cc",
diff --git a/src.mk b/src.mk
index 251228c89..58e8e841b 100644
--- a/src.mk
+++ b/src.mk
@@ -495,6 +495,7 @@ TEST_MAIN_SOURCES = \
   util/random_test.cc \
   util/rate_limiter_test.cc \
   util/repeatable_thread_test.cc \
+  util/ribbon_test.cc \
   util/slice_test.cc \
   util/slice_transform_test.cc \
   util/timer_queue_test.cc \
diff --git a/util/math128.h b/util/math128.h
index caff7a671..5b4434536 100644
--- a/util/math128.h
+++ b/util/math128.h
@@ -40,6 +40,10 @@ struct Unsigned128 {
     lo = lower;
     hi = upper;
   }
+
+  explicit operator uint64_t() { return lo; }
+
+  explicit operator uint32_t() { return static_cast<uint32_t>(lo); }
 };
 
 inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) {
@@ -210,6 +214,11 @@ inline int BitParity(Unsigned128 v) {
   return BitParity(Lower64of128(v)) ^ BitParity(Upper64of128(v));
 }
 
+template <typename T>
+struct IsUnsignedUpTo128
+    : std::integral_constant<bool, std::is_unsigned<T>::value ||
+                                       std::is_same<T, Unsigned128>::value> {};
+
 inline void EncodeFixed128(char* dst, Unsigned128 value) {
   EncodeFixed64(dst, Lower64of128(value));
   EncodeFixed64(dst + 8, Upper64of128(value));
diff --git a/util/ribbon_alg.h b/util/ribbon_alg.h
new file mode 100644
index 000000000..9f500aa7f
--- /dev/null
+++ b/util/ribbon_alg.h
@@ -0,0 +1,821 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+
+#include "util/math128.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
+//
+// ribbon_alg.h: generic versions of core algorithms.
+//
+// Ribbon is a Perfect Hash Static Function construction useful as a compact
+// static Bloom filter alternative.
+// It combines (a) a boolean (GF(2)) linear
+// system construction that approximates a Band Matrix with hashing,
+// (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
+// remarkably efficient and adaptable at constructing an upper-triangular
+// band matrix from a set of band-approximating inputs from (a), and
+// (c) a storage layout that is fast and adaptable as a filter.
+//
+// Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
+// with One Short Random Block per Row, with Applications" by Stefan
+// Walzer and Martin Dietzfelbinger ("DW paper")
+// (b) developed by Peter C. Dillinger, though not the first on-the-fly
+// GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
+// Bioglio, Grangetto, Gaeta, and Sereno.
+// (c) TODO: not yet implemented here
+//
+// See ribbon_impl.h for high-level behavioral summary. This file focuses
+// on the core design details.
+//
+// ######################################################################
+// ################# PHSF -> static filter reduction ####################
+//
+// A Perfect Hash Static Function is a data structure representing a
+// map from anything hashable (a "key") to values of some fixed size.
+// Crucially, it is allowed to return garbage values for anything not in
+// the original set of map keys, and it is a "static" structure: entries
+// cannot be added or deleted after construction. PHSFs representing n
+// mappings to b-bit values (assume uniformly distributed) require at least
+// n * b bits to represent, or at least b bits per entry. We typically
+// describe the compactness of a PHSF by typical bits per entry as some
+// function of b. For example, the MWHC construction (k=3 "peeling")
+// requires about 1.0222*b and a variant called Xor+ requires about
+// 1.08*b + 0.5 bits per entry.
+//
+// With more hashing, a PHSF can over-approximate a set as a Bloom filter
+// does, with no FN queries and predictable false positive (FP) query
+// rate. Instead of the user providing a value to map each input key to,
+// a hash function provides the value. Keys in the original set will
+// return a positive membership query because the underlying PHSF returns
+// the same value as hashing the key. When a key is not in the original set,
+// the PHSF returns a "garbage" value, which is only equal to the key's
+// hash with (false positive) probability 1 in 2^b.
+//
+// For a matching false positive rate, standard Bloom filters require
+// 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
+// require a bit more, around 1.5*b bits per entry. Thus, a Bloom
+// alternative could save up to or nearly 1/3rd of memory and storage
+// that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
+// is dynamic.)
+//
+// Recommended reading:
+// "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
+// by Graf and Lemire
+// First three sections of "Fast Scalable Construction of (Minimal
+// Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
+//
+// ######################################################################
+// ################## PHSF vs. hash table vs. Bloom #####################
+//
+// You can think of traditional hash tables and related filter variants
+// such as Cuckoo filters as utilizing an "OR" construction: a hash
+// function associates a key with some slots and the data is returned if
+// the data is found in any one of those slots.
+// The collision resolution
+// is visible in the final data structure and requires extra information.
+// For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
+// Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
+// structure associates each input key with data in one slot, the
+// structure implicitly constructs a (near-)minimal (near-)perfect hash
+// (MPH) of the keys, which requires at least 1.44 bits per key to
+// represent. This is why approaches with visible collision resolution
+// have a fixed + 1.5 or more in storage overhead per entry, often in
+// addition to an overhead multiplier on b.
+//
+// By contrast, Bloom filters utilize an "AND" construction: a query only
+// returns true if all bit positions associated with a key are set to 1.
+// There is no collision resolution, so Bloom filters do not suffer a
+// fixed bits per entry overhead like the above structures.
+//
+// PHSFs typically use a bitwise XOR construction: the data you want is
+// not in a single slot, but in a linear combination of several slots.
+// For static data, this gives the best of "AND" and "OR" constructions:
+// avoids the +1.44 or more fixed overhead by not approximating a MPH and
+// can do much better than Bloom's 1.44 factor on b with collision
+// resolution, which here is done ahead of time and invisible at query
+// time.
+//
+// ######################################################################
+// ######################## PHSF construction ###########################
+//
+// For a typical PHSF, construction is solving a linear system of
+// equations, typically in GF(2), which is to say that values are boolean
+// and XOR serves both as addition and subtraction. We can use matrices to
+// represent the problem:
+//
+//    C    *    S    =    R
+// (n x m)   (m x b)   (n x b)
+// where C = coefficients, S = solution, R = results
+// and solving for S given C and R.
+//
+// Note that C and R each have n rows, one for each input entry for the
+// PHSF. A row in C is given by a hash function on the PHSF input key,
+// and the corresponding row in R is the b-bit value to associate with
+// that input key. (In a filter, rows of R are given by another hash
+// function on the input key.)
+//
+// On solving, the matrix S (solution) is the final PHSF data, as it
+// maps any row from the original C to its corresponding desired result
+// in R. We just have to hash our query inputs and compute a linear
+// combination of rows in S.
+//
+// In theory, we could choose m = n and let a hash function associate
+// each input key with random rows in C. A solution exists with high
+// probability, and uses essentially minimum space, b bits per entry
+// (because we set m = n) but this has terrible scaling, something
+// like O(n^2) space and O(n^3) time during construction (Gaussian
+// elimination) and O(n) query time. But computational efficiency is
+// key, and the core of this is avoiding scanning all of S to answer
+// each query.
+//
+// The traditional approach (MWHC, aka Xor filter) starts with setting
+// only some small fixed number of columns (typically k=3) to 1 for each
+// row of C, with remaining entries implicitly 0. This is implemented as
+// three hash functions over [0,m), and S can be implemented as a vector
+// of b-bit values. Now, a query only involves looking up k rows
+// (values) in S and computing their bitwise XOR.
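+//
+// (Editorial sketch of such a query, with hypothetical names: given the
+// k=3 hash functions h1, h2, h3 over [0,m) and the solution vector S
+// described above, an MWHC/Xor query is just
+//   ResultRow XorQuery(const Key& key) {
+//     return S[h1(key)] ^ S[h2(key)] ^ S[h3(key)];
+//   }
+// i.e. k random lookups plus XORs, independent of n.)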
+// Additionally, this construction can use a linear time algorithm called
+// "peeling" for finding a solution in many cases where one exists, but
+// peeling generally requires a larger space overhead factor in the
+// solution (m/n) than is required with Gaussian elimination.
+//
+// Recommended reading:
+// "Peeling Close to the Orientability Threshold – Spatial Coupling in
+// Hashing-Based Data Structures" by Stefan Walzer
+//
+// ######################################################################
+// ##################### Ribbon PHSF construction #######################
+//
+// Ribbon constructs coefficient rows essentially the same as in the
+// Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
+// r (kCoeffBits in code), each key is hashed to a starting column in
+// [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
+// coefficients (GetCoeffRow() in code). If you sort the rows by start,
+// the C matrix would look something like this:
+//
+// [####00000000000000000000]
+// [####00000000000000000000]
+// [000####00000000000000000]
+// [0000####0000000000000000]
+// [0000000####0000000000000]
+// [000000000####00000000000]
+// [000000000####00000000000]
+// [0000000000000####0000000]
+// [0000000000000000####0000]
+// [00000000000000000####000]
+// [00000000000000000000####]
+//
+// where each # could be a 0 or 1, chosen uniformly by a hash function.
+// (Except we typically set the start column value to 1.) This scheme
+// uses hashing to approximate a band matrix, and it has a solution iff
+// it reduces to an upper-triangular boolean r-band matrix, like this:
+//
+// [1###00000000000000000000]
+// [01##00000000000000000000]
+// [000000000000000000000000]
+// [0001###00000000000000000]
+// [000000000000000000000000]
+// [000001##0000000000000000]
+// [000000000000000000000000]
+// [00000001###0000000000000]
+// [000000001###000000000000]
+// [0000000001##000000000000]
+// ...
+// [00000000000000000000001#]
+// [000000000000000000000001]
+//
+// where we have expanded to an m x m matrix by filling with rows of
+// all zeros as needed. As in Gaussian elimination, this form is ready for
+// generating a solution through back-substitution.
+//
+// The awesome thing about the Ribbon construction (from the DW paper) is
+// how row reductions keep each row representable as a start column and
+// r coefficients, because row reductions are only needed when two rows
+// have the same number of leading zero columns. Thus, the combination
+// of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
+// out the leading 1s, so starts (at least) one column later and only
+// needs (at most) r - 1 coefficients.
+//
+// ######################################################################
+// ###################### Ribbon PHSF scalability #######################
+//
+// Although more practical detail is in ribbon_impl.h, it's worth
+// understanding some of the overall benefits and limitations of the
+// Ribbon PHSFs.
+//
+// High-end scalability is a primary issue for Ribbon PHSFs, because in
+// a single Ribbon linear system with fixed r and fixed m/n ratio, the
+// solution probability approaches zero as n approaches infinity.
+// For a given n, solution probability improves with larger r and larger
+// m/n.
+//
+// By contrast, peeling-based PHSFs have somewhat worse storage ratio
+// or solution probability for small n (less than ~1000).
+// This is
+// especially true with spatial-coupling, where benefits are only
+// notable for n on the order of 100k or 1m or more.
+//
+// To make best use of current hardware, r=128 seems to be closest to
+// a "generally good" choice for Ribbon, at least in RocksDB where SST
+// Bloom filters typically hold around 10-100k keys, and almost always
+// less than 10m keys. r=128 ribbon has a high chance of encoding success
+// (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
+// for roughly 10k - 10m keys in a single linear system. r=64 only scales
+// up to about 10k keys with the same storage overhead. Construction and
+// access times for r=128 are similar to r=64. r=128 tracks nearly
+// twice as much data during construction, but in most cases we expect
+// the scalability benefits of r=128 vs. r=64 to make it preferred.
+//
+// A natural approach to scaling Ribbon beyond ~10m keys is splitting
+// (or "sharding") the inputs into multiple linear systems with their
+// own hash seeds. This can also help to control peak memory consumption.
+// TODO: much more to come
+//
+// ######################################################################
+// #################### Ribbon on-the-fly banding #######################
+//
+// "Banding" is what we call the process of reducing the inputs to an
+// upper-triangular r-band matrix ready for finishing a solution with
+// back-substitution. Although the DW paper presents an algorithm for
+// this ("SGauss"), the awesome properties of their construction enable
+// an even simpler, faster, and more backtrackable algorithm. In simplest
+// terms, the SGauss algorithm requires sorting the inputs by start
+// columns, but it's possible to make Gaussian elimination resemble hash
+// table insertion!
+//
+// The enhanced algorithm is based on these observations:
+// - When processing a coefficient row with first 1 in column j,
+//   - If it's the first at column j to be processed, it can be part of
+//     the banding at row j. (And that decision is never overwritten, with
+//     no loss of generality!)
+//   - Else, it can be combined with existing row j and re-processed,
+//     which will look for a later "empty" row or reach "no solution".
+//
+// We call our banding algorithm "incremental" and "on-the-fly" because
+// (like hash table insertion) we are "finished" after each input
+// processed, with respect to all inputs processed so far. Although the
+// band matrix is an intermediate step to the solution structure, we have
+// eliminated intermediate steps and unnecessary data tracking for
+// banding.
+//
+// Building on "incremental" and "on-the-fly", the banding algorithm is
+// easily backtrackable because no (non-empty) rows are overwritten in
+// the banding. Thus, if we want to "try" adding an additional set of
+// inputs to the banding, we only have to record which rows were written
+// in order to efficiently backtrack to our state before considering
+// the additional set. (TODO: how this can mitigate scalability and
+// reach sub-1% overheads)
+//
+// Like in a linear-probed hash table, as the occupancy approaches and
+// surpasses 90-95%, collision resolution dominates the construction
+// time. (Ribbon doesn't usually pay at query time; see solution
+// storage below.) This means that we can speed up construction time
+// by using a higher m/n ratio, up to negative returns around 1.2.
+// At m/n ~= 1.2, which still saves memory substantially vs. Bloom
+// filter's 1.5, construction speed (including back-substitution) is not
+// far from sorting speed, but still a few times slower than cache-local
+// Bloom construction speed.
+//
+// Back-substitution from an upper-triangular boolean band matrix is
+// especially fast and easy. All the memory accesses are sequential or at
+// least local, with no random access needed. If the number of result bits
+// (b) is a compile-time constant, the back-substitution state can even be
+// tracked in CPU registers. Regardless of the solution representation, we
+// prefer column-major representation for tracking back-substitution state,
+// as r (the band width) will typically be much larger than b (result bits
+// or columns), so better to handle r-bit values b times (per solution
+// row) than b-bit values r times.
+//
+// ######################################################################
+// ##################### Ribbon solution storage ########################
+//
+// Row-major layout is typical for boolean (bit) matrices, including for
+// MWHC (Xor) filters where a query combines k b-bit values, and k is
+// typically smaller than b. Even for k=4 and b=2, at least k=4 random
+// lookups are required regardless of layout.
+//
+// Ribbon PHSFs are quite different, however, because
+// (a) all of the solution rows relevant to a query are within a single
+// range of r rows, and
+// (b) the number of solution rows involved (r/2 on average, or r if
+// avoiding conditional accesses) is typically much greater than
+// b, the number of solution columns.
+//
+// Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
+// by processing (up to) r entries of b bits each, where b is typically
+// less than 10 for filter applications.
+//
+// Column-major layout has poor locality because of accessing up to b
+// memory locations in different pages (and obviously cache lines). Note
+// that negative filter queries do not typically need to access all
+// solution columns, as they can return when a mismatch is found in any
+// result/solution column. This optimization doesn't always pay off on
+// recent hardware, where the penalty for unpredictable conditional
+// branching can exceed the penalty for unnecessary work, but the
+// optimization is essentially unavailable with row-major layout.
+//
+// The best compromise seems to be interleaving column-major on the small
+// scale with row-major on the large scale. For example, let a solution
+// "block" be r rows column-major encoded as b r-bit values in sequence.
+// Each query accesses (up to) 2 adjacent blocks, which will typically
+// span 1-3 cache lines in adjacent memory. We get very close to the same
+// locality as row-major, but with much faster reconstruction of each
+// result column, at least for filter applications where b is relatively
+// small and negative queries can return early.
+//
+// ######################################################################
+// ###################### Fractional result bits ########################
+//
+// Bloom filters have great flexibility that alternatives mostly do not
+// have. One of those flexibilities is in utilizing any ratio of data
+// structure bits per key. With a typical memory allocator like jemalloc,
+// this flexibility can save roughly 10% of the filters' footprint in
+// DRAM by rounding up and down filter sizes to minimize memory internal
+// fragmentation (see optimize_filters_for_memory RocksDB option).
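+//
+// (Editorial illustration of the sizing arithmetic, using assumed example
+// values: a Ribbon PHSF stores m slots of b bits each, so space is about
+// b * (m/n) bits per key.
+//   constexpr double kResultBits = 7.0;   // b, for ~1/128 FP rate
+//   constexpr double kOverhead = 1.05;    // m/n
+//   constexpr double kRibbonBitsPerKey = kResultBits * kOverhead;  // 7.35
+//   constexpr double kBloomBitsPerKey = 1.44 * kResultBits;        // ~10.1
+// where 1.44 is the standard Bloom filter factor cited above.)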
+//
+// At first glance, PHSFs only offer a whole number of bits per "slot"
+// (m rather than number of keys n), but coefficient locality in the
+// Ribbon construction makes fractional bits/key quite possible and
+// attractive for filter applications.
+//
+// TODO: more detail
+//
+
+// ######################################################################
+// ################### CODE: Ribbon core algorithms #####################
+// ######################################################################
+//
+// These algorithms are templatized for genericity but near-maximum
+// performance in a given application. The template parameters
+// adhere to class/struct type concepts outlined below.
+
+// Rough architecture for these algorithms:
+//
+//  +-----------+     +---+     +-----------------+
+//  | AddInputs | --> | H | --> | BandingStorage  |
+//  +-----------+     | a |     +-----------------+
+//                    | s |             |
+//                    | h |     Back substitution
+//                    | e |             V
+//  +-----------+     | r |     +-----------------+
+//  | Query Key | --> |   | >+< | SolutionStorage |
+//  +-----------+     +---+  |  +-----------------+
+//                           V
+//                      Query result
+
+// Common to other concepts
+// concept RibbonTypes {
+//   // An unsigned integer type for an r-bit subsequence of coefficients.
+//   // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
+//   // generally only hurt scalability to leave bits of CoeffRow unused.
+//   typename CoeffRow;
+//   // An unsigned integer type big enough to hold a result row (b bits,
+//   // or number of solution/result columns).
+//   // In many applications, especially filters, the number of result
+//   // columns is decided at run time, so ResultRow simply needs to be
+//   // big enough for the largest number of columns allowed.
+//   typename ResultRow;
+//   // An unsigned integer type sufficient for representing the number of
+//   // rows in the solution structure. (TODO: verify any extra needed?)
+//   typename Index;
+// };
+
+// ######################################################################
+// ######################## Hashers and Banding #########################
+
+// Hasher concepts abstract out hashing details.
+
+// concept PhsfQueryHasher extends RibbonTypes {
+//   // Type for a lookup key, which is hashable.
+//   typename Key;
+//
+//   // Type for hashed summary of a Key. uint64_t is recommended.
+//   typename Hash;
+//
+//   // Compute a hash value summarizing a Key
+//   Hash GetHash(const Key &) const;
+//
+//   // Given a hash value and a number of columns that can start an
+//   // r-sequence of coefficients (== m - r + 1), return the start
+//   // column to associate with that hash value. (Starts can be chosen
+//   // uniformly or "smash" extra entries into the beginning and end for
+//   // better utilization at those extremes of the structure. Details in
+//   // ribbon_impl.h)
+//   Index GetStart(Hash, Index num_starts) const;
+//
+//   // Given a hash value, return the r-bit sequence of coefficients to
+//   // associate with it. It's generally OK if
+//   //   sizeof(CoeffRow) > sizeof(Hash)
+//   // as long as the hash itself is not too prone to collisions for the
+//   // applications and the CoeffRow is generated uniformly from
+//   // available hash data, but relatively independent of the start.
+//   //
+//   // Must be non-zero, because that's required for a solution to exist
+//   // when mapping to non-zero result row.
+//   // (Note: BandingAdd could be modified to allow 0 coeff row if that
+//   // only occurs with 0 result row, which really only makes sense for
+//   // filter implementation, where both values are hash-derived. Or
+//   // BandingAdd could reject 0 coeff row, forcing next seed, but that
+//   // has potential problems with generality/scalability.)
+//   CoeffRow GetCoeffRow(Hash) const;
+// };
+
+// concept FilterQueryHasher extends PhsfQueryHasher {
+//   // For building or querying a filter, this returns the expected
+//   // result row associated with a hashed input. For general PHSF,
+//   // this must return 0.
+//   //
+//   // Although not strictly required, there's a slightly better chance of
+//   // solver success if result row is masked down here to only the bits
+//   // actually needed.
+//   ResultRow GetResultRowFromHash(Hash) const;
+// }
+
+// concept BandingHasher extends FilterQueryHasher {
+//   // For a filter, this will generally be the same as Key.
+//   // For a general PHSF, it must either
+//   // (a) include a key and a result it maps to (e.g. in a std::pair), or
+//   // (b) GetResultRowFromInput looks up the result somewhere rather than
+//   // extracting it.
+//   typename AddInput;
+//
+//   // Instead of requiring a way to extract a Key from an
+//   // AddInput, we require getting the hash of the Key part
+//   // of an AddInput, which is trivial if AddInput == Key.
+//   Hash GetHash(const AddInput &) const;
+//
+//   // For building a non-filter PHSF, this extracts or looks up the result
+//   // row to associate with an input. For filter PHSF, this must return 0.
+//   ResultRow GetResultRowFromInput(const AddInput &) const;
+//
+//   // Whether the solver can assume the lowest bit of GetCoeffRow is
+//   // always 1. When true, it should improve solver efficiency slightly.
+//   static bool kFirstCoeffAlwaysOne;
+// }
+
+// Abstract storage for the result of "banding" the inputs (Gaussian
+// elimination to an upper-triangular boolean band matrix). Because the
+// banding is an incremental / on-the-fly algorithm, this also represents
+// all the intermediate state between input entries.
+//
+// concept BandingStorage extends RibbonTypes {
+//   // Tells the banding algorithm to prefetch memory associated with
+//   // the next input before processing the current input. Generally
+//   // recommended iff the BandingStorage doesn't easily fit in CPU
+//   // cache.
+//   bool UsePrefetch() const;
+//
+//   // Prefetches (e.g. __builtin_prefetch) memory associated with a
+//   // slot index i.
+//   void Prefetch(Index i) const;
+//
+//   // Returns a pointer to CoeffRow for slot index i.
+//   CoeffRow* CoeffRowPtr(Index i);
+//
+//   // Returns a pointer to ResultRow for slot index i. (Gaussian row
+//   // operations involve both sides of the equation.)
+//   ResultRow* ResultRowPtr(Index i);
+//
+//   // Returns the number of columns that can start an r-sequence of
+//   // coefficients, which is the number of slots minus r (kCoeffBits)
+//   // plus one. (m - r + 1)
+//   Index GetNumStarts() const;
+// };
+
+// Optional storage for backtracking data in banding a set of input
+// entries. It exposes an array structure which will generally be
+// used as a stack. It must be able to accommodate as many entries
+// as are passed in as inputs to `BandingAddRange`.
+//
+// concept BacktrackStorage extends RibbonTypes {
+//   // If false, backtracking support will be disabled in the algorithm.
+//   // This should preferably be an inline compile-time constant function.
+//   bool UseBacktrack() const;
+//
+//   // Records `to_save` as the `i`th backtrack entry
+//   void BacktrackPut(Index i, Index to_save);
+//
+//   // Recalls the `i`th backtrack entry
+//   Index BacktrackGet(Index i) const;
+// }
+
+// Adds a single entry to BandingStorage (and optionally, BacktrackStorage),
+// returning true if successful or false if solution is impossible with
+// current hasher (and presumably its seed) and number of "slots" (solution
+// or banding rows). (A solution is impossible when there is a linear
+// dependence among the inputs that doesn't "cancel out".)
+//
+// Pre- and post-condition: the BandingStorage represents a band matrix
+// ready for back substitution (row echelon form except for zero rows),
+// augmented with result values such that back substitution would give a
+// solution satisfying all the cr@start -> rr entries added.
+template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
+          typename BacktrackStorage>
+bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
+                typename BandingStorage::ResultRow rr,
+                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
+                typename BandingStorage::Index *backtrack_pos) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+
+  Index i = start;
+
+  if (!kFirstCoeffAlwaysOne) {
+    // Requires/asserts that cr != 0
+    int tz = CountTrailingZeroBits(cr);
+    i += static_cast<Index>(tz);
+    cr >>= tz;
+  } else {
+    assert((cr & 1) == 1);
+  }
+
+  for (;;) {
+    CoeffRow other = *(bs->CoeffRowPtr(i));
+    if (other == 0) {
+      *(bs->CoeffRowPtr(i)) = cr;
+      *(bs->ResultRowPtr(i)) = rr;
+      bts->BacktrackPut(*backtrack_pos, i);
+      ++*backtrack_pos;
+      return true;
+    }
+    assert((other & 1) == 1);
+    cr ^= other;
+    rr ^= *(bs->ResultRowPtr(i));
+    if (cr == 0) {
+      // Inconsistency or (less likely) redundancy
+      break;
+    }
+    int tz = CountTrailingZeroBits(cr);
+    i += static_cast<Index>(tz);
+    cr >>= tz;
+  }
+  // Failed, unless result row == 0 because e.g. a duplicate input or a
+  // stock hash collision, with same result row. (For filter, stock hash
+  // collision implies same result row.) Or we could have a full equation
+  // equal to sum of other equations, which is very possible with
+  // small range of values for result row.
+  return rr == 0;
+}
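+
+// (Editorial worked example of BandingAdd, with made-up values: suppose
+// r = 4, kFirstCoeffAlwaysOne, and slot 2 already holds cr = 0b1011 (low
+// bit = first coefficient). Adding an entry with start = 2, cr = 0b1001:
+// slot 2 is occupied, so cr becomes 0b1001 ^ 0b1011 = 0b0010 and rr is
+// XORed with slot 2's result row; CountTrailingZeroBits gives 1, so we
+// advance to slot 3 with cr = 0b1. If slot 3 is empty, we store there and
+// succeed; otherwise we repeat. Each XOR cancels the leading 1, so rows
+// stay within the r-wide band, exactly the DW property noted above.)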
+
+// Adds a range of entries to BandingStorage returning true if successful
+// or false if solution is impossible with current hasher (and presumably
+// its seed) and number of "slots" (solution or banding rows). (A solution
+// is impossible when there is a linear dependence among the inputs that
+// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
+//
+// If UseBacktrack in the BacktrackStorage, this function call rolls back
+// to prior state on failure. If !UseBacktrack, some subset of the entries
+// will have been added to the BandingStorage, so best considered to be in
+// an indeterminate state.
+//
+template <typename BandingStorage, typename BacktrackStorage,
+          typename BandingHasher, typename InputIterator>
+bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
+                     const BandingHasher &bh, InputIterator begin,
+                     InputIterator end) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+  using Hash = typename BandingHasher::Hash;
+
+  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
+  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
+  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");
+
+  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;
+
+  if (begin == end) {
+    // trivial
+    return true;
+  }
+
+  const Index num_starts = bs->GetNumStarts();
+
+  InputIterator cur = begin;
+  Index backtrack_pos = 0;
+  if (!bs->UsePrefetch()) {
+    // Simple version, no prefetch
+    for (;;) {
+      Hash h = bh.GetHash(*cur);
+      Index start = bh.GetStart(h, num_starts);
+      ResultRow rr =
+          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
+      CoeffRow cr = bh.GetCoeffRow(h);
+
+      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+        break;
+      }
+      if ((++cur) == end) {
+        return true;
+      }
+    }
+  } else {
+    // Pipelined w/prefetch
+    // Prime the pipeline
+    Hash h = bh.GetHash(*cur);
+    Index start = bh.GetStart(h, num_starts);
+    ResultRow rr = bh.GetResultRowFromInput(*cur);
+    bs->Prefetch(start);
+
+    // Pipeline
+    for (;;) {
+      rr |= bh.GetResultRowFromHash(h);
+      CoeffRow cr = bh.GetCoeffRow(h);
+      if ((++cur) == end) {
+        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+          break;
+        }
+        return true;
+      }
+      Hash next_h = bh.GetHash(*cur);
+      Index next_start = bh.GetStart(next_h, num_starts);
+      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
+      bs->Prefetch(next_start);
+      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+        break;
+      }
+      h = next_h;
+      start = next_start;
+      rr = next_rr;
+    }
+  }
+  // failed; backtrack (if implemented)
+  if (bts->UseBacktrack()) {
+    while (backtrack_pos > 0) {
+      --backtrack_pos;
+      Index i = bts->BacktrackGet(backtrack_pos);
+      *(bs->CoeffRowPtr(i)) = 0;
+      // Not required: *(bs->ResultRowPtr(i)) = 0;
+    }
+  }
+  return false;
+}
+
+// Adds a range of entries to BandingStorage returning true if successful
+// or false if solution is impossible with current hasher (and presumably
+// its seed) and number of "slots" (solution or banding rows). (A solution
+// is impossible when there is a linear dependence among the inputs that
+// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
+//
+// On failure, some subset of the entries will have been added to the
+// BandingStorage, so best considered to be in an indeterminate state.
+//
+template <typename BandingStorage, typename BandingHasher,
+          typename InputIterator>
+bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
+                     InputIterator begin, InputIterator end) {
+  using Index = typename BandingStorage::Index;
+  struct NoopBacktrackStorage {
+    bool UseBacktrack() { return false; }
+    void BacktrackPut(Index, Index) {}
+    Index BacktrackGet(Index) {
+      assert(false);
+      return 0;
+    }
+  } nbts;
+  return BandingAddRange(bs, &nbts, bh, begin, end);
+}
+
+// ######################################################################
+// ######################### Solution Storage ###########################
+
+// Back-substitution and query algorithms unfortunately depend on some
+// details of data layout in the final data structure ("solution"). Thus,
+// there is no common SolutionStorage covering all the reasonable
+// possibilities.
+
+// ###################### SimpleSolutionStorage #########################
+
+// SimpleSolutionStorage is for a row-major storage, typically with no
+// unused bits in each ResultRow. This is mostly for demonstration
+// purposes as the simplest solution storage scheme. It is relatively slow
+// for filter queries.
+
+// concept SimpleSolutionStorage extends RibbonTypes {
+//   void PrepareForNumStarts(Index num_starts) const;
+//   Index GetNumStarts() const;
+//   ResultRow Load(Index slot_num) const;
+//   void Store(Index slot_num, ResultRow data);
+// };
+
+// Back-substitution for generating a solution from BandingStorage to
+// SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename BandingStorage>
+void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &ss) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);
+
+  // A column-major buffer of the solution matrix, containing enough
+  // recently-computed solution data to compute the next solution row
+  // (based also on banding data).
+  std::array<CoeffRow, kResultBits> state;
+  state.fill(0);
+
+  const Index num_starts = ss.GetNumStarts();
+  sss->PrepareForNumStarts(num_starts);
+  const Index num_slots = num_starts + kCoeffBits - 1;
+
+  for (Index i = num_slots; i > 0;) {
+    --i;
+    CoeffRow cr = *const_cast<BandingStorage &>(ss).CoeffRowPtr(i);
+    ResultRow rr = *const_cast<BandingStorage &>(ss).ResultRowPtr(i);
+    // solution row
+    ResultRow sr = 0;
+    for (Index j = 0; j < kResultBits; ++j) {
+      // Compute next solution bit at row i, column j (see derivation below)
+      CoeffRow tmp = state[j] << 1;
+      bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
+      tmp |= bit ? CoeffRow{1} : CoeffRow{0};
+
+      // Now tmp is solution at column j from row i for next kCoeffBits
+      // more rows. Thus, for valid solution, the dot product of the
+      // solution column with the coefficient row has to equal the result
+      // at that column,
+      //   BitParity(tmp & cr) == ((rr >> j) & 1)
+
+      // Update state.
+      state[j] = tmp;
+      // add to solution row
+      sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
+    }
+    sss->Store(i, sr);
+  }
+}
+
+// Common functionality for querying a key (already hashed) in
+// SimpleSolutionStorage.
+template <typename SimpleSolutionStorage>
+typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
+    typename SimpleSolutionStorage::Index start_slot,
+    typename SimpleSolutionStorage::CoeffRow cr,
+    const SimpleSolutionStorage &sss) {
+  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
+  using ResultRow = typename SimpleSolutionStorage::ResultRow;
+
+  constexpr unsigned kCoeffBits =
+      static_cast<unsigned>(sizeof(CoeffRow) * 8U);
+
+  ResultRow result = 0;
+  for (unsigned i = 0; i < kCoeffBits; ++i) {
+    if (static_cast<unsigned>(cr >> i) & 1U) {
+      result ^= sss.Load(start_slot + i);
+    }
+  }
+  return result;
+}
+
+// General PHSF query a key from SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
+typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
+    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
+    const SimpleSolutionStorage &sss) {
+  const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
+
+  return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
+                           hasher.GetCoeffRow(hash), sss);
+}
+
+// Filter query a key from SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename FilterQueryHasher>
+bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
+                       const FilterQueryHasher &hasher,
+                       const SimpleSolutionStorage &sss) {
+  const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
+  const typename SimpleSolutionStorage::ResultRow expected =
+      hasher.GetResultRowFromHash(hash);
+
+  return expected ==
+         SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
+                           hasher.GetCoeffRow(hash), sss);
+}
+
+// #################### InterleavedSolutionStorage ######################
+
+// InterleavedSolutionStorage is row-major at a high level, for good
+// locality, and column-major at a low level, for CPU efficiency
+// especially in filter queries or with a relatively small number of
+// result bits (== solution columns). The storage is a sequence of
+// "blocks" where a block has one CoeffRow for each solution column.
+
+// concept InterleavedSolutionStorage extends RibbonTypes {
+//   Index GetNumColumns() const;
+//   Index GetNumStarts() const;
+//   CoeffRow Load(Index block_num, Index column) const;
+//   void Store(Index block_num, Index column, CoeffRow data);
+// };
+
+// TODO: not yet implemented here (only in prototype code elsewhere)
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/ribbon_impl.h b/util/ribbon_impl.h
new file mode 100644
index 000000000..cde1c5898
--- /dev/null
+++ b/util/ribbon_impl.h
@@ -0,0 +1,503 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "port/port.h"  // for PREFETCH
+#include "util/ribbon_alg.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
+//
+// ribbon_impl.h: templated (parameterized) standard implementations
+//
+// Ribbon is a Perfect Hash Static Function construction useful as a compact
+// static Bloom filter alternative. See ribbon_alg.h for core algorithms
+// and core design details.
+//
+// TODO: more details on trade-offs and practical issues.
+
+// Ribbon implementations in this file take these parameters, which must be
+// provided in a class/struct type with members expressed in this concept:
+
+// concept TypesAndSettings {
+//   // See RibbonTypes and *Hasher in ribbon_alg.h, except here we have
+//   // the added constraint that Hash be equivalent to either uint32_t or
+//   // uint64_t.
+//   typename Hash;
+//   typename CoeffRow;
+//   typename ResultRow;
+//   typename Index;
+//   typename Key;
+//   static constexpr bool kFirstCoeffAlwaysOne;
+//
+//   // An unsigned integer type for identifying a hash seed, typically
+//   // uint32_t or uint64_t.
+//   typename Seed;
+//
+//   // When true, the PHSF implements a static filter, expecting just
+//   // keys as inputs for construction. When false, implements a general
+//   // PHSF and expects std::pair<Key, ResultRow> as inputs for
+//   // construction.
+//   static constexpr bool kIsFilter;
+//
+//   // When true, adds a tiny bit more hashing logic on queries and
+//   // construction to improve utilization at the beginning and end of
+//   // the structure. Recommended when CoeffRow is only 64 bits (or
+//   // less), so typical num_starts < 10k.
+//   static constexpr bool kUseSmash;
+//
+//   // A seedable stock hash function on Keys. All bits of Hash must
+//   // be reasonably high quality. XXH functions recommended, but
+//   // Murmur, City, Farm, etc. also work.
+//   //
+//   // If sequential seeds are not sufficiently independent for your
+//   // stock hash function, consider multiplying by a large odd constant.
+//   // If seed 0 is still undesirable, consider adding 1 before the
+//   // multiplication.
+//   static Hash HashFn(const Key &, Seed);
+// };
+
+// A bit of a hack to automatically construct the type for
+// AddInput based on a constexpr bool.
+template <typename Key, typename ResultRow, bool IsFilter>
+struct AddInputSelector {
+  // For general PHSF, not filter
+  using T = std::pair<Key, ResultRow>;
+};
+
+template <typename Key, typename ResultRow>
+struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
+  // For Filter
+  using T = Key;
+};
+
+// To avoid writing 'typename' everywhere that we use types like 'Index'
+#define IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings)                   \
+  using CoeffRow = typename TypesAndSettings::CoeffRow;                      \
+  using ResultRow = typename TypesAndSettings::ResultRow;                    \
+  using Index = typename TypesAndSettings::Index;                           \
+  using Hash = typename TypesAndSettings::Hash;                             \
+  using Key = typename TypesAndSettings::Key;                               \
+  using Seed = typename TypesAndSettings::Seed;                             \
+                                                                             \
+  /* Some more additions */                                                  \
+  using QueryInput = Key;                                                    \
+  using AddInput = typename ROCKSDB_NAMESPACE::ribbon::AddInputSelector<     \
+      Key, ResultRow, TypesAndSettings::kIsFilter>::T;                       \
+  static constexpr auto kCoeffBits =                                         \
+      static_cast<Index>(sizeof(CoeffRow) * 8U);                             \
+                                                                             \
+  /* Export to algorithm */                                                  \
+  static constexpr bool kFirstCoeffAlwaysOne =                               \
+      TypesAndSettings::kFirstCoeffAlwaysOne;                                \
+                                                                             \
+  static_assert(sizeof(CoeffRow) + sizeof(ResultRow) + sizeof(Index) +       \
+                        sizeof(Hash) + sizeof(Key) + sizeof(Seed) +          \
+                        sizeof(QueryInput) + sizeof(AddInput) + kCoeffBits + \
+                        kFirstCoeffAlwaysOne >                               \
+                    0,                                                       \
+                "avoid unused warnings, semicolon expected after macro call")
+
+// StandardHasher: A standard implementation of concepts RibbonTypes,
+// PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h.
+//
+// This implementation should be suitable for most all practical purposes
+// as it "behaves" across a wide range of settings, with little room left
+// for improvement. The key functionality in this hasher is generating
+// CoeffRows, starts, and (for filters) ResultRows, which could be ~150
+// bits of data or more, from a modest hash of 64 or even just 32 bits, with
+// enough uniformity and bitwise independence to be close to "the best you
+// can do" with available hash information in terms of FP rate and
+// compactness. (64 bits recommended and sufficient for PHSF practical
+// purposes.)
+template <class TypesAndSettings>
+class StandardHasher {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  StandardHasher(Seed seed = 0) : seed_(seed) {}
+
+  inline Hash GetHash(const Key& key) const {
+    return TypesAndSettings::HashFn(key, seed_);
+  };
+  // For when AddInput == pair<Key, ResultRow> (kIsFilter == false)
+  inline Hash GetHash(const std::pair<Key, ResultRow>& bi) const {
+    return GetHash(bi.first);
+  };
+  inline Index GetStart(Hash h, Index num_starts) const {
+    // This is "critical path" code because it's required before memory
+    // lookup.
+    //
+    // FastRange gives us a fast and effective mapping from h to the
+    // appropriate range. This depends most, sometimes exclusively, on
+    // upper bits of h.
+    //
+    if (TypesAndSettings::kUseSmash) {
+      // Extra logic to "smash" entries at beginning and end, for
+      // better utilization. For example, without smash and with
+      // kFirstCoeffAlwaysOne, there's about a 30% chance that the
+      // first slot in the banding will be unused, and worse without
+      // kFirstCoeffAlwaysOne. The ending slots are even less utilized
+      // without smash.
+      //
+      // But since this only affects roughly kCoeffBits of the slots,
+      // it's usually small enough to be ignorable (less computation in
+      // this function) when number of slots is roughly 10k or larger.
+      //
+      // TODO: re-check these degrees of smash, esp with kFirstCoeffAlwaysOne
+      //
+      constexpr auto kFrontSmash = kCoeffBits / 2 - 1;
+      constexpr auto kBackSmash = kCoeffBits / 2;
+      Index start =
+          FastRangeGeneric(h, num_starts + kFrontSmash + kBackSmash);
+      start = std::max(start, kFrontSmash);
+      start -= kFrontSmash;
+      start = std::min(start, num_starts - 1);
+      return start;
+    } else {
+      // For query speed, we allow small number of initial and final
+      // entries to be under-utilized.
+      // NOTE: This call statically enforces that Hash is equivalent to
+      // either uint32_t or uint64_t.
+      return FastRangeGeneric(h, num_starts);
+    }
+  }
+  inline CoeffRow GetCoeffRow(Hash h) const {
+    // This is a reasonably cheap but empirically effective remix/expansion
+    // of the hash data to fill CoeffRow. (Large primes)
+    // This is not so much "critical path" code because it can be done in
+    // parallel (instruction level) with memory lookup.
+    Unsigned128 a = Multiply64to128(h, 0x85EBCA77C2B2AE63U);
+    Unsigned128 b = Multiply64to128(h, 0x27D4EB2F165667C5U);
+    auto cr = static_cast<CoeffRow>(b ^ (a << 64) ^ (a >> 64));
+    if (kFirstCoeffAlwaysOne) {
+      cr |= 1;
+    } else {
+      // Still have to ensure non-zero
+      cr |= static_cast<CoeffRow>(cr == 0);
+    }
+    return cr;
+  }
+  inline ResultRow GetResultRowMask() const {
+    // TODO: will be used with InterleavedSolutionStorage
+    // For now, all bits set (note: might be a small type so might need to
+    // narrow after promotion)
+    return static_cast<ResultRow>(~ResultRow{0});
+  }
+  inline ResultRow GetResultRowFromHash(Hash h) const {
+    if (TypesAndSettings::kIsFilter) {
+      // In contrast to GetStart, here we draw primarily from lower bits,
+      // but not literally, which seemed to cause FP rate hit in some cases.
+      // This is not so much "critical path" code because it can be done in
+      // parallel (instruction level) with memory lookup.
+      auto rr = static_cast<ResultRow>(h ^ (h >> 13) ^ (h >> 26));
+      return rr & GetResultRowMask();
+    } else {
+      // Must be zero
+      return 0;
+    }
+  }
+  // For when AddInput == Key (kIsFilter == true)
+  inline ResultRow GetResultRowFromInput(const Key&) const {
+    // Must be zero
+    return 0;
+  }
+  // For when AddInput == pair<Key, ResultRow> (kIsFilter == false)
+  inline ResultRow GetResultRowFromInput(
+      const std::pair<Key, ResultRow>& bi) const {
+    // Simple extraction
+    return bi.second;
+  }
+
+  bool NextSeed(Seed max_seed) {
+    if (seed_ >= max_seed) {
+      return false;
+    } else {
+      ++seed_;
+      return true;
+    }
+  }
+  Seed GetSeed() const { return seed_; }
+  void ResetSeed(Seed seed = 0) { seed_ = seed; }
+
+ protected:
+  Seed seed_;
+};
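+
+// (Editorial example: a minimal TypesAndSettings for instantiating
+// StandardHasher as a 128-bit-band filter hasher over Slice keys. This
+// mirrors DefaultTypesAndSettings in ribbon_test.cc below; the struct
+// name here is hypothetical.
+//
+//   struct MyTypesAndSettings {
+//     using CoeffRow = Unsigned128;  // r = 128
+//     using ResultRow = uint8_t;     // up to b = 8 result bits
+//     using Index = uint32_t;
+//     using Hash = uint64_t;
+//     using Key = Slice;
+//     using Seed = uint32_t;
+//     static constexpr bool kIsFilter = true;
+//     static constexpr bool kFirstCoeffAlwaysOne = true;
+//     static constexpr bool kUseSmash = false;
+//     static Hash HashFn(const Key& key, Seed seed) {
+//       return Hash64(key.data(), key.size(), seed);
+//     }
+//   };
+//   using MyHasher = StandardHasher<MyTypesAndSettings>; )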
+
+// StandardRehasher (and StandardRehasherAdapter): A variant of
+// StandardHasher that uses the same type for keys as for hashes.
+// This is primarily intended for building a Ribbon filter/PHSF
+// from existing hashes without going back to original inputs in order
+// to apply a different seed. This hasher seeds a 1-to-1 mixing
+// transformation to apply a seed to an existing hash (or hash-sized key).
+//
+// Testing suggests essentially no degradation of solution success rate
+// vs. going back to original inputs when changing hash seeds. For example:
+// Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys
+// is about 1.10 for both StandardHasher and StandardRehasher.
+//
+// concept RehasherTypesAndSettings: like TypesAndSettings but
+// does not require Key or HashFn.
+template <class RehasherTypesAndSettings>
+class StandardRehasherAdapter : public RehasherTypesAndSettings {
+ public:
+  using Hash = typename RehasherTypesAndSettings::Hash;
+  using Key = Hash;
+  using Seed = typename RehasherTypesAndSettings::Seed;
+
+  static Hash HashFn(const Hash& input, Seed seed) {
+    static_assert(sizeof(Hash) <= 8, "Hash too big");
+    if (sizeof(Hash) > 4) {
+      // XXH3_avalanche / XXH3p_avalanche (64-bit), modified for seed
+      uint64_t h = input;
+      h ^= h >> 37;
+      h ^= seed * uint64_t{0xC2B2AE3D27D4EB4F};
+      h *= uint64_t{0x165667B19E3779F9};
+      h ^= h >> 32;
+      return static_cast<Hash>(h);
+    } else {
+      // XXH32_avalanche (32-bit), modified for seed
+      uint32_t h32 = static_cast<uint32_t>(input);
+      h32 ^= h32 >> 15;
+      h32 ^= seed * uint32_t{0x27D4EB4F};
+      h32 *= uint32_t{0x85EBCA77};
+      h32 ^= h32 >> 13;
+      h32 *= uint32_t{0xC2B2AE3D};
+      h32 ^= h32 >> 16;
+      return static_cast<Hash>(h32);
+    }
+  }
+};
+
+// See comment on StandardRehasherAdapter
+template <class RehasherTypesAndSettings>
+using StandardRehasher =
+    StandardHasher<StandardRehasherAdapter<RehasherTypesAndSettings>>;
+
+// StandardBanding: a canonical implementation of BandingStorage and
+// BacktrackStorage, with convenience API for banding (solving with
+// on-the-fly Gaussian elimination) with and without backtracking.
+template <class TypesAndSettings>
+class StandardBanding : public StandardHasher<TypesAndSettings> {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  StandardBanding(Index num_slots = 0, Index backtrack_size = 0) {
+    if (num_slots > 0) {
+      Reset(num_slots, backtrack_size);
+    } else {
+      EnsureBacktrackSize(backtrack_size);
+    }
+  }
+  void Reset(Index num_slots, Index backtrack_size = 0) {
+    assert(num_slots >= kCoeffBits);
+    if (num_slots > num_slots_allocated_) {
+      coeff_rows_.reset(new CoeffRow[num_slots]());
+      // Note: don't strictly have to zero-init result_rows,
+      // except possible information leakage ;)
+      result_rows_.reset(new ResultRow[num_slots]());
+      num_slots_allocated_ = num_slots;
+    } else {
+      for (Index i = 0; i < num_slots; ++i) {
+        coeff_rows_[i] = 0;
+        // Note: don't strictly have to zero-init result_rows
+        result_rows_[i] = 0;
+      }
+    }
+    num_starts_ = num_slots - kCoeffBits + 1;
+    EnsureBacktrackSize(backtrack_size);
+  }
+  void EnsureBacktrackSize(Index backtrack_size) {
+    if (backtrack_size > backtrack_size_) {
+      backtrack_.reset(new Index[backtrack_size]);
+      backtrack_size_ = backtrack_size;
+    }
+  }
+
+  // ********************************************************************
+  // From concept BandingStorage
+
+  inline bool UsePrefetch() const {
+    // A rough guesstimate of when prefetching during construction pays off.
+    // TODO: verify/validate
+    return num_starts_ > 1500;
+  }
+  inline void Prefetch(Index i) const {
+    PREFETCH(&coeff_rows_[i], 1 /* rw */, 1 /* locality */);
+    PREFETCH(&result_rows_[i], 1 /* rw */, 1 /* locality */);
+  }
+  inline CoeffRow* CoeffRowPtr(Index i) { return &coeff_rows_[i]; }
+  inline ResultRow* ResultRowPtr(Index i) { return &result_rows_[i]; }
+  inline Index GetNumStarts() const { return num_starts_; }
+
+  // from concept BacktrackStorage, for when backtracking is used
+  inline bool UseBacktrack() const { return true; }
+  inline void BacktrackPut(Index i, Index to_save) { backtrack_[i] = to_save; }
+  inline Index BacktrackGet(Index i) const { return backtrack_[i]; }
+
+  // ********************************************************************
+  // Some useful API, still somewhat low level. Here an input is
+  // a Key for filters, or std::pair<Key, ResultRow> for general PHSF.
+
+  // Adds a range of inputs to the banding, returning true if successful.
+  // False means none or some may have been successfully added, so it's
+  // best to Reset this banding before any further use.
+  //
+  // Adding can fail even before all the "slots" are completely "full".
+  //
+  template <typename InputIterator>
+  bool AddRange(InputIterator begin, InputIterator end) {
+    return BandingAddRange(this, *this, begin, end);
+  }
+
+  // Adds a range of inputs to the banding, returning true if successful,
+  // or if unsuccessful, rolls back to state before this call and returns
+  // false. Caller guarantees that the number of inputs in this batch
+  // does not exceed `backtrack_size` provided to Reset.
+  //
+  // Adding can fail even before all the "slots" are completely "full".
+  //
+  template <typename InputIterator>
+  bool AddRangeOrRollBack(InputIterator begin, InputIterator end) {
+    return BandingAddRange(this, this, *this, begin, end);
+  }
+
+  // Adds a single input to the banding, returning true if successful.
+  // If unsuccessful, returns false and banding state is unchanged.
+  //
+  // Adding can fail even before all the "slots" are completely "full".
+  //
+  bool Add(const AddInput& input) { return AddRange(&input, &input + 1); }
+
+  // Return the number of "occupied" rows (with non-zero coefficients stored).
+  Index GetOccupiedCount() const {
+    Index count = 0;
+    const Index num_slots = num_starts_ + kCoeffBits - 1;
+    for (Index i = 0; i < num_slots; ++i) {
+      if (coeff_rows_[i] != 0) {
+        ++count;
+      }
+    }
+    return count;
+  }
+
+  // ********************************************************************
+  // High-level API
+
+  // Iteratively (a) resets the structure for `num_slots`, (b) attempts
+  // to add the range of inputs, and (c) if unsuccessful, chooses next
+  // hash seed, until either successful or unsuccessful with max_seed
+  // (minimum one seed attempted). Returns true if successful. In that
+  // case, use GetSeed() to get the successful seed.
+  //
+  // If unsuccessful, how best to continue is going to be application
+  // specific. It should be possible to choose parameters such that
+  // failure is extremely unlikely, using max_seed around 32 to 64.
+  // (TODO: APIs to help choose parameters) One option for fallback in
+  // constructing a filter is to construct a Bloom filter instead.
+  // Increasing num_slots is an option, but should not be used often
+  // unless construction maximum latency is a concern (rather than
+  // average running time of construction). Instead, choose parameters
+  // appropriately and trust that seeds are independent. (Also,
+  // increasing num_slots without changing hash seed would have a
+  // significant correlation in success, rather than independence.)
+  template <typename InputIterator>
+  bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin,
+                               InputIterator end, Seed max_seed) {
+    StandardHasher<TypesAndSettings>::ResetSeed();
+    do {
+      Reset(num_slots);
+      bool success = AddRange(begin, end);
+      if (success) {
+        return true;
+      }
+    } while (StandardHasher<TypesAndSettings>::NextSeed(max_seed));
+    // No seed through max_seed worked.
+    return false;
+  }
+
+ protected:
+  // TODO: explore combining in a struct
+  std::unique_ptr<CoeffRow[]> coeff_rows_;
+  std::unique_ptr<ResultRow[]> result_rows_;
+  // We generally store "starts" instead of slots for speed of GetStart(),
+  // as in StandardHasher.
+  Index num_starts_ = 0;
+  Index num_slots_allocated_ = 0;
+  std::unique_ptr<Index[]> backtrack_;
+  Index backtrack_size_ = 0;
+};
+
+// Implements concept SimpleSolutionStorage, mostly for demonstration
+// purposes.
+// This is "in memory" only because it does not handle byte
+// ordering issues for serialization.
+template <class TypesAndSettings>
+class InMemSimpleSolution {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  void PrepareForNumStarts(Index num_starts) {
+    const Index num_slots = num_starts + kCoeffBits - 1;
+    assert(num_slots >= kCoeffBits);
+    if (num_slots > num_slots_allocated_) {
+      // Do not need to init the memory
+      solution_rows_.reset(new ResultRow[num_slots]);
+      num_slots_allocated_ = num_slots;
+    }
+    num_starts_ = num_starts;
+  }
+
+  Index GetNumStarts() const { return num_starts_; }
+
+  ResultRow Load(Index slot_num) const { return solution_rows_[slot_num]; }
+
+  void Store(Index slot_num, ResultRow solution_row) {
+    solution_rows_[slot_num] = solution_row;
+  }
+
+  // ********************************************************************
+  // High-level API
+
+  template <typename BandingStorage>
+  void BackSubstFrom(const BandingStorage& ss) {
+    SimpleBackSubst(this, ss);
+  }
+
+  template <typename PhsfQueryHasher>
+  ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) {
+    assert(!TypesAndSettings::kIsFilter);
+    return SimplePhsfQuery(input, hasher, *this);
+  }
+
+  template <typename FilterQueryHasher>
+  bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) {
+    assert(TypesAndSettings::kIsFilter);
+    return SimpleFilterQuery(input, hasher, *this);
+  }
+
+ protected:
+  // We generally store "starts" instead of slots for speed of GetStart(),
+  // as in StandardHasher.
+  Index num_starts_ = 0;
+  Index num_slots_allocated_ = 0;
+  std::unique_ptr<ResultRow[]> solution_rows_;
+};
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
+
+// For convenience working with templates
+#define IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings)                            \
+  using Hasher = ROCKSDB_NAMESPACE::ribbon::StandardHasher<TypesAndSettings>; \
+  using Banding =                                                             \
+      ROCKSDB_NAMESPACE::ribbon::StandardBanding<TypesAndSettings>;           \
+  using SimpleSoln =                                                          \
+      ROCKSDB_NAMESPACE::ribbon::InMemSimpleSolution<TypesAndSettings>;       \
+  static_assert(sizeof(Hasher) + sizeof(Banding) + sizeof(SimpleSoln) > 0,    \
+                "avoid unused warnings, semicolon expected after macro call")
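+
+// (Editorial usage sketch, assuming a TypesAndSettings like the one in
+// ribbon_test.cc; names and the 5% overhead are illustrative only:
+//
+//   IMPORT_RIBBON_IMPL_TYPES(MyTypesAndSettings);
+//   std::vector<std::string> keys = ...;
+//   Banding banding;
+//   Index num_slots = static_cast<Index>(keys.size() * 1.05);  // m/n ~ 1.05
+//   if (banding.ResetAndFindSeedToSolve(num_slots, keys.begin(), keys.end(),
+//                                       /*max_seed=*/31)) {
+//     SimpleSoln soln;
+//     soln.BackSubstFrom(banding);
+//     Hasher query_hasher(banding.GetSeed());  // must reuse winning seed
+//     bool may_contain = soln.FilterQuery(keys[0], query_hasher);  // true
+//   }
+// On failure after all seeds, a caller might fall back to a Bloom filter,
+// as suggested in the ResetAndFindSeedToSolve comment above.)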
diff --git a/util/ribbon_test.cc b/util/ribbon_test.cc
new file mode 100644
index 000000000..9c47f7aa0
--- /dev/null
+++ b/util/ribbon_test.cc
@@ -0,0 +1,408 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cmath>
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/ribbon_impl.h"
+
+#ifndef GFLAGS
+uint32_t FLAGS_thoroughness = 5;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+// Using 500 is a good test when you have time to be thorough.
+// Default is for general RocksDB regression test runs.
+DEFINE_uint32(thoroughness, 5, "iterations per configuration");
+#endif  // GFLAGS
+
+template <typename TypesAndSettings>
+class RibbonTypeParamTest : public ::testing::Test {};
+
+class RibbonTest : public ::testing::Test {};
+
+struct DefaultTypesAndSettings {
+  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
+  using ResultRow = uint8_t;
+  using Index = uint32_t;
+  using Hash = uint64_t;
+  using Key = ROCKSDB_NAMESPACE::Slice;
+  using Seed = uint32_t;
+  static constexpr bool kIsFilter = true;
+  static constexpr bool kFirstCoeffAlwaysOne = true;
+  static constexpr bool kUseSmash = false;
+  static Hash HashFn(const Key& key, Seed seed) {
+    return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed);
+  }
+};
+
+using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
+struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
+  static constexpr bool kUseSmash = true;
+};
+struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
+  using CoeffRow = uint64_t;
+};
+struct TypesAndSettings_Coeff64Smash : public DefaultTypesAndSettings {
+  using CoeffRow = uint64_t;
+  static constexpr bool kUseSmash = true;
+};
+struct TypesAndSettings_Result16 : public DefaultTypesAndSettings {
+  using ResultRow = uint16_t;
+};
+struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings {
+  using Index = size_t;
+};
+struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings {
+  using Hash = uint32_t;
+  static Hash HashFn(const Key& key, Seed seed) {
+    // NOTE: Using RocksDB's 32-bit Hash() here fails the test below because
+    // of insufficient mixing of the seed (or generally insufficient mixing)
+    return ROCKSDB_NAMESPACE::Upper32of64(
+        ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed));
+  }
+};
+struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
+  using ResultRow = uint16_t;
+};
+struct TypesAndSettings_KeyString : public DefaultTypesAndSettings {
+  using Key = std::string;
+};
+struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings {
+  using Seed = uint8_t;
+};
+struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
+  static constexpr bool kFirstCoeffAlwaysOne = false;
+};
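+
+// A further hypothetical variant (illustrative only, not exercised by the
+// tests below) would override just the pieces it needs in the same way,
+// e.g.:
+//
+//   struct TypesAndSettings_Coeff32 : public DefaultTypesAndSettings {
+//     using CoeffRow = uint32_t;
+//   };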
+struct TypesAndSettings_RehasherWrapped : public DefaultTypesAndSettings {
+  // This doesn't directly use StandardRehasher as a whole, but simulates
+  // its behavior with an unseeded hash of the key, followed by a seeded
+  // hash-to-hash transform.
+  static Hash HashFn(const Key& key, Seed seed) {
+    Hash unseeded = DefaultTypesAndSettings::HashFn(key, /*seed*/ 0);
+    using Rehasher = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
+        DefaultTypesAndSettings>;
+    return Rehasher::HashFn(unseeded, seed);
+  }
+};
+struct TypesAndSettings_Rehasher32Wrapped : public TypesAndSettings_Hash32 {
+  // This doesn't directly use StandardRehasher as a whole, but simulates
+  // its behavior with an unseeded hash of the key, followed by a seeded
+  // hash-to-hash transform.
+  static Hash HashFn(const Key& key, Seed seed) {
+    Hash unseeded = TypesAndSettings_Hash32::HashFn(key, /*seed*/ 0);
+    using Rehasher = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
+        TypesAndSettings_Hash32>;
+    return Rehasher::HashFn(unseeded, seed);
+  }
+};
+
+using TestTypesAndSettings = ::testing::Types<
+    TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
+    TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash,
+    TypesAndSettings_Result16, TypesAndSettings_IndexSizeT,
+    TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
+    TypesAndSettings_KeyString, TypesAndSettings_Seed8,
+    TypesAndSettings_NoAlwaysOne, TypesAndSettings_RehasherWrapped,
+    TypesAndSettings_Rehasher32Wrapped>;
+TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
+
+namespace {
+
+struct KeyGen {
+  KeyGen(const std::string& prefix, uint64_t id) : id_(id), str_(prefix) {
+    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
+  }
+
+  // Prefix increment (the only increment operator required)
+  KeyGen& operator++() {
+    ++id_;
+    return *this;
+  }
+
+  KeyGen& operator+=(uint64_t incr) {
+    id_ += incr;
+    return *this;
+  }
+
+  const std::string& operator*() {
+    // Use multiplication to mix things up a little in the key
+    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
+                                     id_ * uint64_t{0x1500000001});
+    return str_;
+  }
+
+  bool operator==(const KeyGen& other) {
+    // Same prefix is assumed
+    return id_ == other.id_;
+  }
+  bool operator!=(const KeyGen& other) {
+    // Same prefix is assumed
+    return id_ != other.id_;
+  }
+
+  uint64_t id_;
+  std::string str_;
+};
+
+// For testing Poisson-distributed (or similar) statistics, get the value
+// `stddevs_allowed` standard deviations above the expected mean
+// `expected_count`.
+// (Poisson approximates Binomial only if the probability of a trial being
+// in the count is low.)
+uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
+  return static_cast<uint64_t>(
+      expected_count + stddevs_allowed * std::sqrt(expected_count) + 1.0);
+}
+
+uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
+  return static_cast<uint64_t>(std::max(
+      0.0, expected_count - stddevs_allowed * std::sqrt(expected_count)));
+}
+
+uint64_t FrequentPoissonUpperBound(double expected_count) {
+  // Allow up to 5.0 standard deviations for frequently checked statistics
+  return PoissonUpperBound(expected_count, 5.0);
+}
+
+uint64_t FrequentPoissonLowerBound(double expected_count) {
+  return PoissonLowerBound(expected_count, 5.0);
+}
+
+uint64_t InfrequentPoissonUpperBound(double expected_count) {
+  // Allow up to 3 standard deviations for infrequently checked statistics
+  return PoissonUpperBound(expected_count, 3.0);
+}
+
+uint64_t InfrequentPoissonLowerBound(double expected_count) {
+  return PoissonLowerBound(expected_count, 3.0);
+}
+
+}  // namespace
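+
+// To make the bounds above concrete (illustrative arithmetic): with
+// expected_count = 100, sqrt(expected_count) = 10, so
+// FrequentPoissonUpperBound(100) = 100 + 5.0 * 10 + 1 = 151 and
+// FrequentPoissonLowerBound(100) = 100 - 5.0 * 10 = 50.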
+
+TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
+  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
+
+  // For testing FP rate etc.
+  constexpr Index kNumToCheck = 100000;
+  constexpr size_t kNumSolutionColumns = 8U * sizeof(ResultRow);
+  const double expected_fp_count =
+      kNumToCheck * std::pow(0.5, kNumSolutionColumns);
+
+  const auto log2_thoroughness =
+      static_cast<Seed>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
+  // FIXME: This upper bound seems excessive
+  const Seed max_seed = 12 + log2_thoroughness;
+
+  // With overhead of just 2%, expect ~50% encoding success per
+  // seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon.
+  const double kFactor = 1.02;
+
+  uint64_t total_reseeds = 0;
+  uint64_t total_single_failures = 0;
+  uint64_t total_batch_successes = 0;
+  uint64_t total_fp_count = 0;
+  uint64_t total_added = 0;
+
+  for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
+    Index numToAdd =
+        sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5000 : 2500;
+
+    // Use different values between that number and 50% of that number
+    numToAdd -= (i * 15485863) % (numToAdd / 2);
+
+    total_added += numToAdd;
+
+    const Index kNumSlots = static_cast<Index>(numToAdd * kFactor);
+
+    std::string prefix;
+    // Take different samples if you change thoroughness
+    ROCKSDB_NAMESPACE::PutFixed32(&prefix,
+                                  i + (FLAGS_thoroughness * 123456789U));
+
+    // Batch that must be added
+    std::string added_str = prefix + "added";
+    KeyGen keys_begin(added_str, 0);
+    KeyGen keys_end(added_str, numToAdd);
+
+    // Batch that may or may not be added
+    const Index kBatchSize =
+        sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10;
+    std::string batch_str = prefix + "batch";
+    KeyGen batch_begin(batch_str, 0);
+    KeyGen batch_end(batch_str, kBatchSize);
+
+    // Batch never (successfully) added, but used for querying FP rate
+    std::string not_str = prefix + "not";
+    KeyGen other_keys_begin(not_str, 0);
+    KeyGen other_keys_end(not_str, kNumToCheck);
+
+    SimpleSoln soln;
+    Hasher hasher;
+    bool first_single;
+    bool second_single;
+    bool batch_success;
+    {
+      Banding banding;
+      // Traditional solve for a fixed set.
+      ASSERT_TRUE(banding.ResetAndFindSeedToSolve(kNumSlots, keys_begin,
+                                                  keys_end, max_seed));
+
+      // Now to test backtracking, starting with guaranteed fail
+      Index occupied_count = banding.GetOccupiedCount();
+      banding.EnsureBacktrackSize(kNumToCheck);
+      ASSERT_FALSE(
+          banding.AddRangeOrRollBack(other_keys_begin, other_keys_end));
+      ASSERT_EQ(occupied_count, banding.GetOccupiedCount());
+
+      // Check that we still have a good chance of adding a couple more
+      // individually
+      first_single = banding.Add("one_more");
+      second_single = banding.Add("two_more");
+      Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0);
+      total_single_failures += 2U - more_added;
+
+      // Or as a batch
+      batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
+      if (batch_success) {
+        more_added += kBatchSize;
+        ++total_batch_successes;
+      }
+      ASSERT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
+
+      // Now back-substitution
+      soln.BackSubstFrom(banding);
+      Seed seed = banding.GetSeed();
+      total_reseeds += seed;
+      if (seed > log2_thoroughness + 1) {
+        fprintf(stderr, "%s high reseeds at %u, %u: %u\n",
+                seed > log2_thoroughness + 8 ? "FIXME Extremely" : "Somewhat",
+                static_cast<unsigned>(i), static_cast<unsigned>(numToAdd),
+                static_cast<unsigned>(seed));
+      }
+      hasher.ResetSeed(seed);
+    }
+    // soln and hasher are now independent of the Banding object
+
+    // Verify the added keys
+    KeyGen cur = keys_begin;
+    while (cur != keys_end) {
+      EXPECT_TRUE(soln.FilterQuery(*cur, hasher));
+      ++cur;
+    }
+    // We (maybe) snuck these in!
+    if (first_single) {
+      EXPECT_TRUE(soln.FilterQuery("one_more", hasher));
+    }
+    if (second_single) {
+      EXPECT_TRUE(soln.FilterQuery("two_more", hasher));
+    }
+    if (batch_success) {
+      cur = batch_begin;
+      while (cur != batch_end) {
+        EXPECT_TRUE(soln.FilterQuery(*cur, hasher));
+        ++cur;
+      }
+    }
+
+    // Check FP rate (depends only on number of result bits == solution
+    // columns)
+    Index fp_count = 0;
+    cur = other_keys_begin;
+    while (cur != other_keys_end) {
+      fp_count += soln.FilterQuery(*cur, hasher) ? 1 : 0;
+      ++cur;
+    }
+    // For the expected FP rate, also include false positives due to
+    // collisions in the Hash value. (Negligible for 64-bit, can matter for
+    // 32-bit.)
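+    // (For scale: with kNumToCheck = 100000 and numToAdd around 100000, a
+    // 32-bit Hash gives a correction of about 1e10 / 2^32, i.e. ~2.3
+    // expected extra FPs, while a 64-bit Hash gives around 5e-10.)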
+    double correction =
+        1.0 * kNumToCheck * numToAdd / std::pow(256.0, sizeof(Hash));
+    EXPECT_LE(fp_count,
+              FrequentPoissonUpperBound(expected_fp_count + correction));
+    EXPECT_GE(fp_count,
+              FrequentPoissonLowerBound(expected_fp_count + correction));
+
+    total_fp_count += fp_count;
+  }
+
+  {
+    double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
+    fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
+    // The values above were chosen to target around a 50% encoding success
+    // rate per seed (average of 1.0 re-seeds) or slightly better. But 1.1 is
+    // also close enough.
+    EXPECT_LE(total_reseeds,
+              InfrequentPoissonUpperBound(1.1 * FLAGS_thoroughness));
+    EXPECT_GE(total_reseeds,
+              InfrequentPoissonLowerBound(0.9 * FLAGS_thoroughness));
+  }
+
+  {
+    uint64_t total_singles = 2 * FLAGS_thoroughness;
+    double single_failure_rate = 1.0 * total_single_failures / total_singles;
+    fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
+    // A rough bound (one-sided) based on nothing in particular
+    double expected_single_failures =
+        1.0 * total_singles /
+        (sizeof(CoeffRow) == 16 ? 128 : TypeParam::kUseSmash ? 64 : 32);
+    EXPECT_LE(total_single_failures,
+              InfrequentPoissonUpperBound(expected_single_failures));
+  }
+
+  {
+    // Counting successes here for Poisson to approximate the Binomial
+    // distribution.
+    // A rough bound (one-sided) based on nothing in particular.
+    double expected_batch_successes = 1.0 * FLAGS_thoroughness / 2;
+    uint64_t lower_bound =
+        InfrequentPoissonLowerBound(expected_batch_successes);
+    fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
+            1.0 * total_batch_successes / FLAGS_thoroughness,
+            1.0 * lower_bound / FLAGS_thoroughness);
+    EXPECT_GE(total_batch_successes, lower_bound);
+  }
+
+  {
+    uint64_t total_checked = uint64_t{kNumToCheck} * FLAGS_thoroughness;
+    double expected_total_fp_count =
+        total_checked * std::pow(0.5, kNumSolutionColumns);
+    // For the expected FP rate, also include false positives due to
+    // collisions in the Hash value. (Negligible for 64-bit, can matter for
+    // 32-bit.)
+    expected_total_fp_count += 1.0 * total_checked * total_added /
+                               FLAGS_thoroughness /
+                               std::pow(256.0, sizeof(Hash));
+    uint64_t upper_bound =
+        InfrequentPoissonUpperBound(expected_total_fp_count);
+    uint64_t lower_bound =
+        InfrequentPoissonLowerBound(expected_total_fp_count);
+    fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
+            1.0 * total_fp_count / total_checked,
+            expected_total_fp_count / total_checked,
+            1.0 * upper_bound / total_checked,
+            1.0 * lower_bound / total_checked);
+    // FIXME: this can fail for Result16, e.g. --thoroughness=100.
+    // It seems due to inexpensive hashing in StandardHasher::GetCoeffRow and
+    // GetResultRowFromHash, as replacing those with different Hash64
+    // instances fixes it, at least mostly.
+    EXPECT_LE(total_fp_count, upper_bound);
+    EXPECT_GE(total_fp_count, lower_bound);
+  }
+}
+
+TEST(RibbonTest, Another) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(DefaultTypesAndSettings);
+  IMPORT_RIBBON_IMPL_TYPES(DefaultTypesAndSettings);
+
+  // TODO
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
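+
+// (Illustrative invocation, assuming a gflags-enabled build:
+//   ./ribbon_test --thoroughness=500
+// per the note on the `thoroughness` flag above.)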