Upgrade xxhash, add Hash128 (#8634)

Summary:
With an expected use for a 128-bit hash, the xxhash library is
upgraded to current dev (2c611a76f914828bed675f0f342d6c4199ffee1e,
as of Aug 6) so that we can use the production version of XXH3_128bits
as the new Hash128 function (added in hash128.h).
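
For illustration, a minimal usage sketch of the new API (names taken from
hash128.h and math128.h as changed below; the example key and the
free-standing function are just for demonstration):

    #include <cassert>
    #include <cstdint>

    #include "rocksdb/slice.h"
    #include "util/hash128.h"
    #include "util/math128.h"

    using namespace ROCKSDB_NAMESPACE;

    void Hash128UsageSketch() {
      // 128-bit hash of a key; the unseeded overload is the same as seed = 0.
      Unsigned128 h = Hash128("example-key", 11, /*seed=*/0);
      assert(h == GetSliceHash128(Slice("example-key")));

      // Split the result into two 64-bit halves when needed.
      uint64_t hi = Upper64of128(h);
      uint64_t lo = Lower64of128(h);
      assert(h == ((Unsigned128{hi} << 64) | lo));
    }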

To make this work, however, we have to carve out the "preview" version
of XXH3 that is used in the new SST Bloom and Ribbon filters, since that
version will not get maintenance in xxhash releases. I have consolidated
all the relevant code into xxph3.h and made it "inline only" (no .cc file).
The working name for this hash function is changed from XXH3p to XXPH3
(XX Preview Hash) because the latter is easier to keep free of symbol
name conflicts between the two headers.
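
In effect, util/hash.cc ends up with the following split (condensed from
the hunks below): the persisted 64-bit schema stays on the frozen preview
hash from xxph3.h, while the new 128-bit hash uses the upstream xxhash.h.

    #include "util/hash.h"
    #include "util/hash128.h"
    #include "util/xxhash.h"  // upstream dev xxhash: XXH3_64bits*, XXH3_128bits*
    #include "util/xxph3.h"   // frozen preview hash, renamed XXPH3_*, inline only

    namespace ROCKSDB_NAMESPACE {

    uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
      // Schema is baked into existing SST Bloom and Ribbon filters, so this
      // stays on the preview hash permanently.
      return XXPH3_64bits_withSeed(data, n, seed);
    }

    Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
      // New functionality can use the maintained, production XXH3_128bits.
      auto h = XXH3_128bits_withSeed(data, n, seed);
      return (Unsigned128{h.high64} << 64) | (h.low64);
    }

    }  // namespace ROCKSDB_NAMESPACE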

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8634

Test Plan:
No expected change in existing functionality. For Hash128, added
unit tests based on those for Hash64 to verify basic properties and
that the values do not change accidentally.
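
Roughly, the added checks look like this (a condensed, hypothetical
variant of the full Hash128Misc test in util/hash_test.cc below):

    TEST(HashTest, Hash128MiscSketch) {
      constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128
      const std::string str(1000, 'a');
      for (size_t size = 0; size <= str.size(); ++size) {
        Unsigned128 here = Hash128(str.data(), size, kSeed);
        // Unseeded overload and GetSliceHash128 must agree with seed = 0.
        EXPECT_EQ(here, Hash128(str.data(), size));
        EXPECT_EQ(here, GetSliceHash128(Slice(str.data(), size)));
        // Upper/Lower 64-bit halves must reconstruct the value.
        EXPECT_EQ(here,
                  (Unsigned128{Upper64of128(here)} << 64) | Lower64of128(here));
      }
    }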

Reviewed By: zhichao-cao

Differential Revision: D30173490

Pulled By: pdillinger

fbshipit-source-id: 06aa542a7a28b353bc2c865b9b2f8bdfe44158e4
Branch: main
Author: Peter Dillinger (committed by Facebook GitHub Bot)
parent 2a383f21f4
commit 22161b7547
Changed files (number of changed lines):

 1. table/block_based/filter_policy.cc (18)
 2. util/hash.cc (20)
 3. util/hash.h (2)
 4. util/hash128.h (26)
 5. util/hash_test.cc (121)
 6. util/ribbon_impl.h (2)
 7. util/ribbon_test.cc (2)
 8. util/xxh3p.h (1392)
 9. util/xxhash.cc (1127)
10. util/xxhash.h (5464)
11. util/xxph3.h (1761)

--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -44,13 +44,13 @@ Slice FinishAlwaysFalse(std::unique_ptr<const char[]>* /*buf*/) {
 // Base class for filter builders using the XXH3 preview hash,
 // also known as Hash64 or GetSliceHash64.
-class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder {
+class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
  public:
-  explicit XXH3pFilterBitsBuilder(
+  explicit XXPH3FilterBitsBuilder(
       std::atomic<int64_t>* aggregate_rounding_balance)
       : aggregate_rounding_balance_(aggregate_rounding_balance) {}
-  ~XXH3pFilterBitsBuilder() override {}
+  ~XXPH3FilterBitsBuilder() override {}
   virtual void AddKey(const Slice& key) override {
     uint64_t hash = GetSliceHash64(key);
@@ -70,8 +70,8 @@ class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder {
  protected:
   static constexpr uint32_t kMetadataLen = 5;
-  // For delegating between XXH3pFilterBitsBuilders
-  void SwapEntriesWith(XXH3pFilterBitsBuilder* other) {
+  // For delegating between XXPH3FilterBitsBuilders
+  void SwapEntriesWith(XXPH3FilterBitsBuilder* other) {
     std::swap(hash_entries_, other->hash_entries_);
   }
@@ -188,13 +188,13 @@ class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder {
 // ############## also known as format_version=5 Bloom filter ########## //
 // See description in FastLocalBloomImpl
-class FastLocalBloomBitsBuilder : public XXH3pFilterBitsBuilder {
+class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder {
  public:
   // Non-null aggregate_rounding_balance implies optimize_filters_for_memory
   explicit FastLocalBloomBitsBuilder(
       const int millibits_per_key,
       std::atomic<int64_t>* aggregate_rounding_balance)
-      : XXH3pFilterBitsBuilder(aggregate_rounding_balance),
+      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
         millibits_per_key_(millibits_per_key) {
     assert(millibits_per_key >= 1000);
   }
@@ -421,12 +421,12 @@ struct Standard128RibbonRehasherTypesAndSettings {
 using Standard128RibbonTypesAndSettings =
     ribbon::StandardRehasherAdapter<Standard128RibbonRehasherTypesAndSettings>;
-class Standard128RibbonBitsBuilder : public XXH3pFilterBitsBuilder {
+class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
  public:
   explicit Standard128RibbonBitsBuilder(
       double desired_one_in_fp_rate, int bloom_millibits_per_key,
       std::atomic<int64_t>* aggregate_rounding_balance, Logger* info_log)
-      : XXH3pFilterBitsBuilder(aggregate_rounding_balance),
+      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
         desired_one_in_fp_rate_(desired_one_in_fp_rate),
         info_log_(info_log),
         bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) {

--- a/util/hash.cc
+++ b/util/hash.cc
@@ -8,10 +8,15 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "util/hash.h"
 #include <string.h>
 #include "port/lang.h"
 #include "util/coding.h"
+#include "util/hash128.h"
+#include "util/math128.h"
 #include "util/xxhash.h"
+#include "util/xxph3.h"
 namespace ROCKSDB_NAMESPACE {
@@ -74,12 +79,12 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
 // bundling hash functions specialized for particular lengths with
 // the prefix extractors.
 uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
-  return XXH3p_64bits_withSeed(data, n, seed);
+  return XXPH3_64bits_withSeed(data, n, seed);
 }
 uint64_t Hash64(const char* data, size_t n) {
   // Same as seed = 0
-  return XXH3p_64bits(data, n);
+  return XXPH3_64bits(data, n);
 }
 uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
@@ -97,4 +102,15 @@ uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
   return NPHash64(concat_data.data(), concat_len, seed);
 }
+Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
+  auto h = XXH3_128bits_withSeed(data, n, seed);
+  return (Unsigned128{h.high64} << 64) | (h.low64);
+}
+Unsigned128 Hash128(const char* data, size_t n) {
+  // Same as seed = 0
+  auto h = XXH3_128bits(data, n);
+  return (Unsigned128{h.high64} << 64) | (h.low64);
+}
 }  // namespace ROCKSDB_NAMESPACE

--- a/util/hash.h
+++ b/util/hash.h
@@ -10,7 +10,7 @@
 // Common hash functions with convenient interfaces. If hashing a
 // statically-sized input in a performance-critical context, consider
 // calling a specific hash implementation directly, such as
-// XXH3p_64bits from xxhash.h.
+// XXH3_64bits from xxhash.h.
 //
 // Since this is a very common header, implementation details are kept
 // out-of-line. Out-of-lining also aids in tracking the time spent in

--- /dev/null
+++ b/util/hash128.h
@@ -0,0 +1,26 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+// 128-bit hash gets it own header so that more popular hash.h doesn't
+// depend on math128.h
+#include "rocksdb/slice.h"
+#include "util/math128.h"
+namespace ROCKSDB_NAMESPACE {
+// Stable/persistent 128-bit hash for non-cryptographic applications.
+Unsigned128 Hash128(const char* data, size_t n, uint64_t seed);
+// Specific optimization without seed (same as seed = 0)
+Unsigned128 Hash128(const char* data, size_t n);
+inline Unsigned128 GetSliceHash128(const Slice& key) {
+  return Hash128(key.data(), key.size());
+}
+}  // namespace ROCKSDB_NAMESPACE

--- a/util/hash_test.cc
+++ b/util/hash_test.cc
@@ -14,15 +14,20 @@
 #include "test_util/testharness.h"
 #include "util/coding.h"
+#include "util/hash128.h"
 #include "util/math128.h"
 using ROCKSDB_NAMESPACE::EncodeFixed32;
 using ROCKSDB_NAMESPACE::GetSliceHash64;
 using ROCKSDB_NAMESPACE::Hash;
+using ROCKSDB_NAMESPACE::Hash128;
 using ROCKSDB_NAMESPACE::Hash64;
 using ROCKSDB_NAMESPACE::Lower32of64;
+using ROCKSDB_NAMESPACE::Lower64of128;
 using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Unsigned128;
 using ROCKSDB_NAMESPACE::Upper32of64;
+using ROCKSDB_NAMESPACE::Upper64of128;
 // The hash algorithm is part of the file format, for example for the Bloom
 // filters. Test that the hash values are stable for a set of random strings of
@@ -93,7 +98,8 @@ TEST(HashTest, Hash64Misc) {
     for (size_t size = 0; size <= max_size; ++size) {
       uint64_t here = Hash64(str.data(), size, kSeed);
-      // Must be same as GetSliceHash64
+      // Must be same as unseeded Hash64 and GetSliceHash64
+      EXPECT_EQ(here, Hash64(str.data(), size));
       EXPECT_EQ(here, GetSliceHash64(Slice(str.data(), size)));
       // Upper and Lower must reconstruct hash
@@ -234,7 +240,7 @@ std::string Hash64TestDescriptor(const char *repeat, size_t limit) {
   return rv;
 }
-// XXH3p changes its algorithm for various sizes up through 250 bytes, so
+// XXPH3 changes its algorithm for various sizes up through 250 bytes, so
 // we need to check the stability of larger sizes also.
 TEST(HashTest, Hash64LargeValueSchema) {
   // Each of these derives a "descriptor" from the hash values for all
@@ -267,6 +273,117 @@ TEST(HashTest, Hash64LargeValueSchema) {
       "eMFlxCIYUpTCsal2qsmnGOWa8WCcefrohMjDj1fjzSvSaQwlpyR1GZHF2uPOoQagiCpHpm");
 }
+TEST(HashTest, Hash128Misc) {
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128
+  for (char fill : {'\0', 'a', '1', '\xff'}) {
+    const size_t max_size = 1000;
+    const std::string str(max_size, fill);
+    for (size_t size = 0; size <= max_size; ++size) {
+      Unsigned128 here = Hash128(str.data(), size, kSeed);
+      // Must be same as unseeded Hash128 and GetSliceHash128
+      EXPECT_EQ(here, Hash128(str.data(), size));
+      EXPECT_EQ(here, GetSliceHash128(Slice(str.data(), size)));
+      // Upper and Lower must reconstruct hash
+      EXPECT_EQ(here,
+                (Unsigned128{Upper64of128(here)} << 64) | Lower64of128(here));
+      EXPECT_EQ(here,
+                (Unsigned128{Upper64of128(here)} << 64) ^ Lower64of128(here));
+      // Seed changes hash value (with high probability)
+      for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) {
+        EXPECT_NE(here, Hash128(str.data(), size, var_seed));
+      }
+      // Size changes hash value (with high probability)
+      size_t max_smaller_by = std::min(size_t{30}, size);
+      for (size_t smaller_by = 1; smaller_by <= max_smaller_by; ++smaller_by) {
+        EXPECT_NE(here, Hash128(str.data(), size - smaller_by, kSeed));
+      }
+    }
+  }
+}
+// Test that hash values are "non-trivial" for "trivial" inputs
+TEST(HashTest, Hash128Trivial) {
+  // Thorough test too slow for regression testing
+  constexpr bool thorough = false;
+  // For various seeds, make sure hash of empty string is not zero.
+  constexpr uint64_t max_seed = thorough ? 0x1000000 : 0x10000;
+  for (uint64_t seed = 0; seed < max_seed; ++seed) {
+    Unsigned128 here = Hash128("", 0, seed);
+    EXPECT_NE(Lower64of128(here), 0u);
+    EXPECT_NE(Upper64of128(here), 0u);
+  }
+  // For standard seed, make sure hash of small strings are not zero
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128
+  char input[4];
+  constexpr int max_len = thorough ? 3 : 2;
+  for (int len = 1; len <= max_len; ++len) {
+    for (uint32_t i = 0; (i >> (len * 8)) == 0; ++i) {
+      EncodeFixed32(input, i);
+      Unsigned128 here = Hash128(input, len, kSeed);
+      EXPECT_NE(Lower64of128(here), 0u);
+      EXPECT_NE(Upper64of128(here), 0u);
+    }
+  }
+}
+std::string Hash128TestDescriptor(const char *repeat, size_t limit) {
+  const char *mod61_encode =
+      "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  std::string input;
+  while (input.size() < limit) {
+    input.append(repeat);
+  }
+  std::string rv;
+  for (size_t i = 0; i < limit; ++i) {
+    auto h = GetSliceHash128(Slice(input.data(), i));
+    uint64_t h2 = Upper64of128(h) + Lower64of128(h);
+    rv.append(1, mod61_encode[static_cast<size_t>(h2 % 61)]);
+  }
+  return rv;
+}
+// XXH3 changes its algorithm for various sizes up through 250 bytes, so
+// we need to check the stability of larger sizes also.
+TEST(HashTest, Hash128ValueSchema) {
+  // Each of these derives a "descriptor" from the hash values for all
+  // lengths up to 430.
+  // Note that "b" is common for the zero-length string.
+  EXPECT_EQ(
+      Hash128TestDescriptor("foo", 430),
+      "bUMA3As8n9I4vNGhThXlEevxZlyMcbb6TYAlIKJ2f5ponsv99q962rYclQ7u3gfnRdCDQ5JI"
+      "2LrGUaCycbXrvLFe4SjgRb9RQwCfrnmNQ7VSEwSKMnkGCK3bDbXSrnIh5qLXdtvIZklbJpGH"
+      "Dqr93BlqF9ubTnOSYkSdx89XvQqflMIW8bjfQp9BPjQejWOeEQspnN1D3sfgVdFhpaQdHYA5"
+      "pI2XcPlCMFPxvrFuRr7joaDvjNe9IUZaunLPMewuXmC3EL95h52Ju3D7y9RNKhgYxMTrA84B"
+      "yJrMvyjdm3vlBxet4EN7v2GEyjbGuaZW9UL6lrX6PghJDg7ACfLGdxNbH3qXM4zaiG2RKnL5"
+      "S3WXKR78RBB5fRFQ8KDIEQjHFvSNsc3GrAEi6W8P2lv8JMTzjBODO2uN4wadVQFT9wpGfV");
+  // Note that "35D2v" is common for "Rocks"
+  EXPECT_EQ(
+      Hash128TestDescriptor("Rocks", 430),
+      "b35D2vzvklFVDqJmyLRXyApwGGO3EAT3swhe8XJAN3mY2UVPglzdmydxcba6JI2tSvwO6zSu"
+      "ANpjSM7tc9G5iMhsa7R8GfyCXRO1TnLg7HvdWNdgGGBirxZR68BgT7TQsYJt6zyEyISeXI1n"
+      "MXA48Xo7dWfJeYN6Z4KWlqZY7TgFXGbks9AX4ehZNSGtIhdO5i58qlgVX1bEejeOVaCcjC79"
+      "67DrMfOKds7rUQzjBa77sMPcoPW1vu6ljGJPZH3XkRyDMZ1twxXKkNxN3tE8nR7JHwyqBAxE"
+      "fTcjbOWrLZ1irWxRSombD8sGDEmclgF11IxqEhe3Rt7gyofO3nExGckKkS9KfRqsCHbiUyva"
+      "JGkJwUHRXaZnh58b4i1Ei9aQKZjXlvIVDixoZrjcNaH5XJIJlRZce9Z9t82wYapTpckYSg");
+  EXPECT_EQ(
+      Hash128TestDescriptor("RocksDB", 430),
+      "b35D2vFUst3XDZCRlSrhmYYakmqImV97LbBsV6EZlOEQpUPH1d1sD3xMKAPlA5UErHehg5O7"
+      "n966fZqhAf3hRc24kGCLfNAWjyUa7vSNOx3IcPoTyVRFZeFlcCtfl7t1QJumHOCpS33EBmBF"
+      "hvK13QjBbDWYWeHQhJhgV9Mqbx17TIcvUkEnYZxb8IzWNmjVsJG44Z7v52DjGj1ZzS62S2Vv"
+      "qWcDO7apvH5VHg68E9Wl6nXP21vlmUqEH9GeWRehfWVvY7mUpsAg5drHHQyDSdiMceiUuUxJ"
+      "XJqHFcDdzbbPk7xDvbLgWCKvH8k3MpQNWOmbSSRDdAP6nGlDjoTToYkcqVREHJzztSWAAq5h"
+      "GHSUNJ6OxsMHhf8EhXfHtKyUzRmPtjYyeckQcGmrQfFFLidc6cjMDKCdBG6c6HVBrS7H2R");
+}
 TEST(FastRange32Test, Values) {
   using ROCKSDB_NAMESPACE::FastRange32;
   // Zero range

--- a/util/ribbon_impl.h
+++ b/util/ribbon_impl.h
@@ -148,7 +148,7 @@ struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
 // they are provided to TypesAndSettings::HashFn in case that function does
 // not provide sufficiently independent hashes when iterating merely
 // sequentially on seeds. (This for example works around a problem with the
-// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXH3p or Hash64, and
+// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXPH3 or Hash64, and
 // MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step
 // translates "ordinal seeds," which we iterate sequentially to find a
 // solution, into "raw seeds," with many more bits changing for each

--- a/util/ribbon_test.cc
+++ b/util/ribbon_test.cc
@@ -204,7 +204,7 @@ struct DefaultTypesAndSettings {
   static constexpr bool kUseSmash = false;
   static constexpr bool kAllowZeroStarts = false;
   static Hash HashFn(const Key& key, uint64_t raw_seed) {
-    // This version 0.7.2 preview of XXH3 (a.k.a. XXH3p) function does
+    // This version 0.7.2 preview of XXH3 (a.k.a. XXPH3) function does
     // not pass SmallKeyGen tests below without some seed premixing from
     // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
     return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);

util/xxh3p.h: file diff suppressed because it is too large
util/xxhash.cc: file diff suppressed because it is too large
util/xxhash.h: file diff suppressed because it is too large
util/xxph3.h: file diff suppressed because it is too large