Fix undefined behavior in Hash

Summary:
Instead of ignoring UBSan checks, fix the negative shifts in
Hash(). Also add test to make sure the hash values are stable over
time. The values were computed before this change, so the test also
verifies the correctness of the change.
Closes https://github.com/facebook/rocksdb/pull/2546

Differential Revision: D5386369

Pulled By: yiwu-arbug

fbshipit-source-id: 6de4b44461a544d6222cc5d72d8cda2c0373d17e
main
Giuseppe Ottaviano 7 years ago committed by Facebook Github Bot
parent 643b787c7b
commit 8f927e5f75
  1. 1
      CMakeLists.txt
  2. 4
      Makefile
  3. 3
      TARGETS
  4. 32
      util/hash.cc
  5. 75
      util/hash_test.cc

@ -697,6 +697,7 @@ set(TESTS
util/event_logger_test.cc
util/file_reader_writer_test.cc
util/filelock_test.cc
util/hash_test.cc
util/heap_test.cc
util/rate_limiter_test.cc
util/slice_transform_test.cc

@ -336,6 +336,7 @@ TESTS = \
inlineskiplist_test \
env_basic_test \
env_test \
hash_test \
thread_local_test \
rate_limiter_test \
perf_context_test \
@ -990,6 +991,9 @@ cache_test: cache/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

@ -11,7 +11,7 @@ rocksdb_compiler_flags = [
"-DROCKSDB_MALLOC_USABLE_SIZE",
"-DROCKSDB_RANGESYNC_PRESENT",
"-DROCKSDB_SCHED_GETCPU_PRESENT",
"-DROCKSDB_SUPPORT_THREAD_LOCAL",
"-DROCKSDB_SUPPORT_THREAD_LOCAL",
"-DHAVE_SSE42",
"-DOS_LINUX",
# Flags to enable libs we include
@ -407,6 +407,7 @@ ROCKS_TESTS = [['arena_test', 'util/arena_test.cc', 'serial'],
['hash_table_test',
'utilities/persistent_cache/hash_table_test.cc',
'serial'],
['hash_test', 'util/hash_test.cc', 'serial'],
['heap_test', 'util/heap_test.cc', 'serial'],
['histogram_test', 'monitoring/histogram_test.cc', 'serial'],
['inlineskiplist_test', 'memtable/inlineskiplist_test.cc', 'parallel'],

@ -15,13 +15,6 @@
namespace rocksdb {
#ifdef ROCKSDB_UBSAN_RUN
#if defined(__clang__)
__attribute__((__no_sanitize__("shift")))
#elif defined(__GNUC__)
__attribute__((__no_sanitize_undefined__))
#endif
#endif
uint32_t Hash(const char* data, size_t n, uint32_t seed) {
// Similar to murmur hash
const uint32_t m = 0xc6a4a793;
@ -40,26 +33,21 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
// Pick up remaining bytes
switch (limit - data) {
// Note: It would be better if this was cast to unsigned char, but that
// would be a disk format change since we previously didn't have any cast
// at all (so gcc used signed char).
// To understand the difference between shifting unsigned and signed chars,
// let's use 250 as an example. unsigned char will be 250, while signed char
// will be -6. Bit-wise, they are equivalent: 11111010. However, when
// converting negative number (signed char) to int, it will be converted
// into negative int (of equivalent value, which is -6), while converting
// positive number (unsigned char) will be converted to 250. Bitwise,
// this looks like this:
// signed char 11111010 -> int 11111111111111111111111111111010
// unsigned char 11111010 -> int 00000000000000000000000011111010
// Note: The original hash implementation used data[i] << shift, which
// promotes the char to int and then performs the shift. If the char is
// negative, the shift is undefined behavior in C++. The hash algorithm is
// part of the format definition, so we cannot change it; to obtain the same
// behavior in a legal way we just cast to uint32_t, which will do
// sign-extension. To guarantee compatibility with architectures where chars
// are unsigned we first cast the char to int8_t.
case 3:
h += static_cast<uint32_t>(static_cast<signed char>(data[2]) << 16);
h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
// fall through
case 2:
h += static_cast<uint32_t>(static_cast<signed char>(data[1]) << 8);
h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
// fall through
case 1:
h += static_cast<uint32_t>(static_cast<signed char>(data[0]));
h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
h *= m;
h ^= (h >> r);
break;

@ -0,0 +1,75 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// This source code is also licensed under the GPLv2 license found in the
// COPYING file in the root directory of this source tree.
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <vector>
#include "util/hash.h"
#include "util/testharness.h"
// The hash algorithm is part of the file format, for example for the Bloom
// filters. Test that the hash values are stable for a set of random strings of
// varying lengths.
TEST(HashTest, Values) {
using rocksdb::Hash;
constexpr uint32_t kSeed = 0xbc9f1d34; // Same as BloomHash.
EXPECT_EQ(Hash("", 0, kSeed), 3164544308);
EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524);
EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998);
EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349);
EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383);
EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956);
EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105);
EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776);
EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603);
EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797);
EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265);
EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661);
EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222);
EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265);
EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611);
EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572);
EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355);
EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370);
EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402);
EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743);
EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500);
EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008);
EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482);
EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422);
EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029);
EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748);
EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698);
EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227);
EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252);
EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402);
EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698);
EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809);
EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120);
EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed), 2706087434);
EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed), 1534654151);
EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed), 2355554696);
EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed), 1400800912);
EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed),
3420325137);
EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed),
3427803584);
EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed),
1152407945);
EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed),
3382479516);
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
Loading…
Cancel
Save