You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
rocksdb/util/ribbon_test.cc

408 lines
15 KiB

// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include <cmath>
#include "test_util/testharness.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/ribbon_impl.h"
// Number of iterations each test configuration runs, available to the tests
// as FLAGS_thoroughness. Without GFLAGS it is a plain global fixed at 5; with
// GFLAGS it is defined as the `thoroughness` command-line flag with the same
// default.
#ifndef GFLAGS
uint32_t FLAGS_thoroughness = 5;
#else
#include "util/gflags_compat.h"
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
// Using 500 is a good test when you have time to be thorough.
// Default is for general RocksDB regression test runs.
DEFINE_uint32(thoroughness, 5, "iterations per configuration");
#endif // GFLAGS
// Empty fixture for the typed tests. It is instantiated once for every
// TypesAndSettings struct in the TestTypesAndSettings list; the tests read
// the configuration through TypeParam.
template <typename TypesAndSettings>
class RibbonTypeParamTest : public ::testing::Test {};
// Empty fixture named after the non-typed RibbonTest test suite.
// NOTE(review): the RibbonTest test visible in this file is declared with
// TEST rather than TEST_F, so it does not appear to use this class - confirm
// whether it is still needed.
class RibbonTest : public ::testing::Test {};
// Baseline configuration for the typed tests. The other TypesAndSettings_*
// structs derive from it (directly or indirectly) and shadow individual
// members so that each one varies a single aspect of the configuration.
struct DefaultTypesAndSettings {
  // Type choices
  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
  using ResultRow = uint8_t;
  using Index = uint32_t;
  using Hash = uint64_t;
  using Key = ROCKSDB_NAMESPACE::Slice;
  using Seed = uint32_t;

  // Boolean settings
  static constexpr bool kIsFilter = true;
  static constexpr bool kFirstCoeffAlwaysOne = true;
  static constexpr bool kUseSmash = false;

  // Seeded 64-bit hash over the bytes of `key`.
  static Hash HashFn(const Key& key, Seed seed) {
    const auto* const bytes = key.data();
    const auto length = key.size();
    return ROCKSDB_NAMESPACE::Hash64(bytes, length, seed);
  }
};
// The default configuration already uses Unsigned128 coefficient rows, so it
// doubles as the "Coeff128" entry in the list of tested configurations.
using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
// Default configuration (Unsigned128 coefficient rows) with kUseSmash
// switched on.
struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
static constexpr bool kUseSmash = true;
};
// Default configuration, except that coefficient rows are 64 bits wide.
struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
using CoeffRow = uint64_t;
};
// 64-bit coefficient rows combined with kUseSmash switched on.
struct TypesAndSettings_Coeff64Smash : public DefaultTypesAndSettings {
using CoeffRow = uint64_t;
static constexpr bool kUseSmash = true;
};
// Default configuration, except that result rows are 16 bits wide instead
// of 8.
struct TypesAndSettings_Result16 : public DefaultTypesAndSettings {
using ResultRow = uint16_t;
};
// Default configuration, except that Index is size_t instead of uint32_t.
struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings {
using Index = size_t;
};
// Default configuration, except that hash values are 32 bits wide.
struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings {
  using Hash = uint32_t;

  // Computes the seeded 64-bit hash of `key` and keeps its upper 32 bits.
  static Hash HashFn(const Key& key, Seed seed) {
    // NOTE: Using RocksDB's 32-bit Hash() here fails the test below because
    // of insufficient mixing of seed (or generally insufficient mixing)
    const auto wide_hash =
        ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed);
    return ROCKSDB_NAMESPACE::Upper32of64(wide_hash);
  }
};
// Combines the 32-bit hash of TypesAndSettings_Hash32 with 16-bit result
// rows.
struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
using ResultRow = uint16_t;
};
// Default configuration, except that Key is std::string. HashFn is inherited
// unchanged, so its parameter is still the base class's Key type.
struct TypesAndSettings_KeyString : public DefaultTypesAndSettings {
using Key = std::string;
};
// Default configuration, except that Seed is only 8 bits wide.
struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings {
using Seed = uint8_t;
};
// Default configuration with kFirstCoeffAlwaysOne switched off.
struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
static constexpr bool kFirstCoeffAlwaysOne = false;
};
// Default configuration whose hash is computed in two stages.
struct TypesAndSettings_RehasherWrapped : public DefaultTypesAndSettings {
  // Rather than plugging in StandardRehasher as a whole, this simulates its
  // behavior: hash the key without a seed, then apply a seeded hash-to-hash
  // transform.
  static Hash HashFn(const Key& key, Seed seed) {
    using SeededRehash = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
        DefaultTypesAndSettings>;
    const Hash key_hash = DefaultTypesAndSettings::HashFn(key, /*seed*/ 0);
    return SeededRehash::HashFn(key_hash, seed);
  }
};
// 32-bit-hash configuration whose hash is computed in two stages.
struct TypesAndSettings_Rehasher32Wrapped : public TypesAndSettings_Hash32 {
  // Rather than plugging in StandardRehasher as a whole, this simulates its
  // behavior: hash the key without a seed, then apply a seeded hash-to-hash
  // transform.
  static Hash HashFn(const Key& key, Seed seed) {
    using SeededRehash = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
        TypesAndSettings_Hash32>;
    const Hash key_hash = TypesAndSettings_Hash32::HashFn(key, /*seed*/ 0);
    return SeededRehash::HashFn(key_hash, seed);
  }
};
// Every configuration the typed test suite is run against; the following
// TYPED_TEST_CASE line registers this list for RibbonTypeParamTest.
using TestTypesAndSettings =
::testing::Types<TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash,
TypesAndSettings_Result16, TypesAndSettings_IndexSizeT,
TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
TypesAndSettings_KeyString, TypesAndSettings_Seed8,
TypesAndSettings_NoAlwaysOne,
TypesAndSettings_RehasherWrapped,
TypesAndSettings_Rehasher32Wrapped>;
TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
namespace {
// Generator of distinct test keys that is used like a forward iterator: a
// position is identified by a 64-bit id and dereferences to `prefix`
// followed by 8 bytes derived from that id. Two generators are only
// meaningfully comparable when they were built from the same prefix.
struct KeyGen {
  // Starts at position `id`. The fixed-width encoding of the id is added to
  // the key buffer up front (presumably appending 8 bytes - confirm against
  // PutFixed64); operator* overwrites the buffer's last 8 bytes on each call.
  KeyGen(const std::string& prefix, uint64_t id) : id_(id), str_(prefix) {
    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
  }

  // Pre-increment (the only increment form required)
  KeyGen& operator++() {
    ++id_;
    return *this;
  }

  // Advances by `incr` positions.
  KeyGen& operator+=(uint64_t incr) {
    id_ += incr;
    return *this;
  }

  // Returns the key for the current position. Deliberately non-const: the
  // last 8 bytes of the internal buffer are re-encoded on every call, so the
  // returned reference reflects the current id only until the generator is
  // advanced and dereferenced again.
  const std::string& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    return str_;
  }

  // Comparisons only read state, so they are const; this lets them be used
  // on const generators and temporaries and keeps them symmetric in their
  // operands.
  bool operator==(const KeyGen& other) const {
    // Same prefix is assumed
    return id_ == other.id_;
  }
  bool operator!=(const KeyGen& other) const {
    // Same prefix is assumed
    return id_ != other.id_;
  }

  uint64_t id_;      // current position
  std::string str_;  // prefix + 8-byte suffix, rewritten by operator*
};
// Upper limit for a Poisson-distributed (or similar) count in tests: the
// expected mean `expected_count` plus `stddevs_allowed` standard deviations
// (the standard deviation being the square root of the mean), plus one,
// truncated to an integer.
// (Poisson approximates Binomial only if probability of a trial being
// in the count is low.)
uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
  const double stddev = std::sqrt(expected_count);
  const double limit = expected_count + stddevs_allowed * stddev + 1.0;
  return static_cast<uint64_t>(limit);
}
// Lower limit matching PoissonUpperBound: the expected mean minus
// `stddevs_allowed` standard deviations (square root of the mean), clamped
// at zero and truncated to an integer.
uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
  const double margin = stddevs_allowed * std::sqrt(expected_count);
  const double clamped = std::max(0.0, expected_count - margin);
  return static_cast<uint64_t>(clamped);
}
// Upper bound for statistics that are checked frequently: the tolerance is
// widened to 5.0 standard deviations.
uint64_t FrequentPoissonUpperBound(double expected_count) {
  constexpr double kStddevsAllowed = 5.0;
  return PoissonUpperBound(expected_count, kStddevsAllowed);
}
// Lower bound for statistics that are checked frequently: 5.0 standard
// deviations below the expected count.
uint64_t FrequentPoissonLowerBound(double expected_count) {
  constexpr double kStddevsAllowed = 5.0;
  return PoissonLowerBound(expected_count, kStddevsAllowed);
}
// Upper bound for statistics that are checked infrequently: only 3 standard
// deviations above the expected count are tolerated.
uint64_t InfrequentPoissonUpperBound(double expected_count) {
  constexpr double kStddevsAllowed = 3.0;
  return PoissonUpperBound(expected_count, kStddevsAllowed);
}
// Lower bound for statistics that are checked infrequently: 3 standard
// deviations below the expected count.
uint64_t InfrequentPoissonLowerBound(double expected_count) {
  constexpr double kStddevsAllowed = 3.0;
  return PoissonLowerBound(expected_count, kStddevsAllowed);
}
} // namespace
// Statistical end-to-end test of one Ribbon configuration (TypeParam).
// Each of the FLAGS_thoroughness iterations:
//   1. builds a banding over a pseudo-randomly sized key set using 2% space
//      overhead, re-seeding as needed up to max_seed;
//   2. exercises backtracking: a large batch that must fail and roll back,
//      two single adds, and a small batch that may or may not succeed;
//   3. back-substitutes into a solution and verifies that every added key is
//      found (no false negatives);
//   4. queries keys that were never added and checks the false-positive count
//      against Poisson bounds.
// After the loop, the accumulated re-seed, single-add failure, batch success
// and false-positive totals are checked against (looser) Poisson bounds.
TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
IMPORT_RIBBON_IMPL_TYPES(TypeParam);
// For testing FP rate etc.
constexpr Index kNumToCheck = 100000;
// One solution column per bit of ResultRow
constexpr size_t kNumSolutionColumns = 8U * sizeof(ResultRow);
// Each solution column is expected to halve the number of false positives
const double expected_fp_count =
kNumToCheck * std::pow(0.5, kNumSolutionColumns);
// The number of seeds allowed grows with log2 of the iteration count
const auto log2_thoroughness =
static_cast<Seed>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
// FIXME: This upper bound seems excessive
const Seed max_seed = 12 + log2_thoroughness;
// With overhead of just 2%, expect ~50% encoding success per
// seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon.
const double kFactor = 1.02;
// Totals accumulated across iterations and checked after the loop
uint64_t total_reseeds = 0;
uint64_t total_single_failures = 0;
uint64_t total_batch_successes = 0;
uint64_t total_fp_count = 0;
uint64_t total_added = 0;
for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
// Base key count depends on the coefficient row width and on kUseSmash
Index numToAdd =
sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5000 : 2500;
// Use different values between that number and 50% of that number
numToAdd -= (i * 15485863) % (numToAdd / 2);
total_added += numToAdd;
// Slot count = key count plus the 2% overhead
const Index kNumSlots = static_cast<Index>(numToAdd * kFactor);
std::string prefix;
// Take different samples if you change thoroughness
ROCKSDB_NAMESPACE::PutFixed32(&prefix,
i + (FLAGS_thoroughness * 123456789U));
// Batch that must be added
std::string added_str = prefix + "added";
KeyGen keys_begin(added_str, 0);
KeyGen keys_end(added_str, numToAdd);
// Batch that may or may not be added
const Index kBatchSize =
sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10;
std::string batch_str = prefix + "batch";
KeyGen batch_begin(batch_str, 0);
KeyGen batch_end(batch_str, kBatchSize);
// Batch never (successfully) added, but used for querying FP rate
std::string not_str = prefix + "not";
KeyGen other_keys_begin(not_str, 0);
KeyGen other_keys_end(not_str, kNumToCheck);
// These outlive the Banding object constructed in the scope below
SimpleSoln soln;
Hasher hasher;
bool first_single;
bool second_single;
bool batch_success;
{
Banding banding;
// Traditional solve for a fixed set.
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(kNumSlots, keys_begin,
keys_end, max_seed));
// Now to test backtracking, starting with guaranteed fail
Index occupied_count = banding.GetOccupiedCount();
banding.EnsureBacktrackSize(kNumToCheck);
ASSERT_FALSE(
banding.AddRangeOrRollBack(other_keys_begin, other_keys_end));
// The failed batch must leave the occupied count unchanged
ASSERT_EQ(occupied_count, banding.GetOccupiedCount());
// Check that we still have a good chance of adding a couple more
// individually
first_single = banding.Add("one_more");
second_single = banding.Add("two_more");
Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0);
total_single_failures += 2U - more_added;
// Or as a batch
batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
if (batch_success) {
more_added += kBatchSize;
++total_batch_successes;
}
// Occupancy may grow by at most the number of keys accepted above
ASSERT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
// Now back-substitution
soln.BackSubstFrom(banding);
// The final seed value is what gets counted as the number of re-seeds
Seed seed = banding.GetSeed();
total_reseeds += seed;
if (seed > log2_thoroughness + 1) {
fprintf(stderr, "%s high reseeds at %u, %u: %u\n",
seed > log2_thoroughness + 8 ? "FIXME Extremely" : "Somewhat",
static_cast<unsigned>(i), static_cast<unsigned>(numToAdd),
static_cast<unsigned>(seed));
}
// Queries below must hash with the same seed the banding ended up using
hasher.ResetSeed(seed);
}
// soln and hasher now independent of Banding object
// Verify keys added
KeyGen cur = keys_begin;
while (cur != keys_end) {
EXPECT_TRUE(soln.FilterQuery(*cur, hasher));
++cur;
}
// We (maybe) snuck these in!
if (first_single) {
EXPECT_TRUE(soln.FilterQuery("one_more", hasher));
}
if (second_single) {
EXPECT_TRUE(soln.FilterQuery("two_more", hasher));
}
if (batch_success) {
cur = batch_begin;
while (cur != batch_end) {
EXPECT_TRUE(soln.FilterQuery(*cur, hasher));
++cur;
}
}
// Check FP rate (depends only on number of result bits == solution columns)
Index fp_count = 0;
cur = other_keys_begin;
while (cur != other_keys_end) {
fp_count += soln.FilterQuery(*cur, hasher) ? 1 : 0;
++cur;
}
// For expected FP rate, also include false positives due to collisions
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
// pow(256, sizeof(Hash)) is the number of distinct Hash values, so the
// correction is (checked keys * added keys) / (number of hash values).
double correction =
1.0 * kNumToCheck * numToAdd / std::pow(256.0, sizeof(Hash));
EXPECT_LE(fp_count,
FrequentPoissonUpperBound(expected_fp_count + correction));
EXPECT_GE(fp_count,
FrequentPoissonLowerBound(expected_fp_count + correction));
total_fp_count += fp_count;
}
// Aggregate check: re-seeds per iteration
{
double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
// Values above were chosen to target around 50% chance of encoding success
// rate (average of 1.0 re-seeds) or slightly better. But 1.1 is also close
// enough.
EXPECT_LE(total_reseeds,
InfrequentPoissonUpperBound(1.1 * FLAGS_thoroughness));
EXPECT_GE(total_reseeds,
InfrequentPoissonLowerBound(0.9 * FLAGS_thoroughness));
}
// Aggregate check: failures of the two extra single adds per iteration
{
uint64_t total_singles = 2 * FLAGS_thoroughness;
double single_failure_rate = 1.0 * total_single_failures / total_singles;
fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
// A rough bound (one sided) based on nothing in particular
double expected_single_failures =
1.0 * total_singles /
(sizeof(CoeffRow) == 16 ? 128 : TypeParam::kUseSmash ? 64 : 32);
EXPECT_LE(total_single_failures,
InfrequentPoissonUpperBound(expected_single_failures));
}
// Aggregate check: successes of the optional small batch
{
// Counting successes here for Poisson to approximate the Binomial
// distribution.
// A rough bound (one sided) based on nothing in particular.
double expected_batch_successes = 1.0 * FLAGS_thoroughness / 2;
uint64_t lower_bound =
InfrequentPoissonLowerBound(expected_batch_successes);
fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
1.0 * total_batch_successes / FLAGS_thoroughness,
1.0 * lower_bound / FLAGS_thoroughness);
EXPECT_GE(total_batch_successes, lower_bound);
}
// Aggregate check: overall false-positive rate
{
uint64_t total_checked = uint64_t{kNumToCheck} * FLAGS_thoroughness;
double expected_total_fp_count =
total_checked * std::pow(0.5, kNumSolutionColumns);
// For expected FP rate, also include false positives due to collisions
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
// total_added / FLAGS_thoroughness is the average number of keys added
// per iteration.
expected_total_fp_count += 1.0 * total_checked * total_added /
FLAGS_thoroughness /
std::pow(256.0, sizeof(Hash));
uint64_t upper_bound = InfrequentPoissonUpperBound(expected_total_fp_count);
uint64_t lower_bound = InfrequentPoissonLowerBound(expected_total_fp_count);
fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
1.0 * total_fp_count / total_checked,
expected_total_fp_count / total_checked,
1.0 * upper_bound / total_checked,
1.0 * lower_bound / total_checked);
// FIXME: this can fail for Result16, e.g. --thoroughness=100
// Seems due to inexpensive hashing in StandardHasher::GetCoeffRow and
// GetResultRowFromHash as replacing those with different Hash64 instances
// fixes it, at least mostly.
EXPECT_LE(total_fp_count, upper_bound);
EXPECT_GE(total_fp_count, lower_bound);
}
}
// Placeholder test: it only imports the Ribbon type aliases for the default
// configuration and has no assertions yet.
TEST(RibbonTest, Another) {
IMPORT_RIBBON_TYPES_AND_SETTINGS(DefaultTypesAndSettings);
IMPORT_RIBBON_IMPL_TYPES(DefaultTypesAndSettings);
// TODO
}
// Test entry point: hands the command line to googletest first, then (only
// when built with GFLAGS) parses the remaining flags such as the
// thoroughness setting, and finally runs every registered test. The return
// value of RUN_ALL_TESTS() becomes the process exit code.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
#ifdef GFLAGS
ParseCommandLineFlags(&argc, &argv, true);
#endif // GFLAGS
return RUN_ALL_TESTS();
}