Port 3 way SSE4.2 crc32c implementation from Folly

Summary:
**# Summary**

RocksDB uses SSE crc32 intrinsics to calculate crc32 values, but it does so in a single-way fashion (not pipelined on a single CPU core). Intel's whitepaper (https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) published an algorithm that uses 3-way pipelining for the crc32 intrinsics and then uses the pclmulqdq intrinsic to combine the values. Because pclmulqdq has overhead of its own, this algorithm shows perf gains only on buffers larger than 216 bytes, which makes RocksDB a perfect user, since most of the buffers RocksDB calls crc32c on are over 4KB. Initial db_bench runs show a tremendous CPU gain.

This change uses the 3-way SSE algorithm by default. The old SSE algorithm is now behind the compile-time flag NO_THREEWAY_CRC32C: if the user builds with NO_THREEWAY_CRC32C=1, the old SSE crc32c algorithm is used. If the server does not have SSE4.2 at run time, the slow (non-SSE) implementation is used.

**# Performance Test Results**
We ran the FillRandom and ReadRandom benchmarks in db_bench. ReadRandom is the point of interest here since it calculates the CRC32 for the in-mem buffers. We did 3 runs for each algorithm.

Before this change the CRC32 value computation took about 11.5% of total CPU cost; with the new 3-way algorithm it is reduced to around 4.5%. The overall throughput also improved from 25.53MB/s to 27.63MB/s.

1) ReadRandom in db_bench overall metrics

    PER RUN
    Algorithm | run | micros/op | ops/sec |Throughput (MB/s)
    3-way      |  1   | 4.143   | 241387 | 26.7
    3-way      |  2   | 3.775   | 264872 | 29.3
    3-way      | 3    | 4.116   | 242929 | 26.9
    FastCrc32c|1  | 4.037   | 247727 | 27.4
    FastCrc32c|2  | 4.648   | 215166 | 23.8
    FastCrc32c|3  | 4.352   | 229799 | 25.4

     AVG
    Algorithm     |    Average of micros/op |   Average of ops/sec |    Average of Throughput (MB/s)
    3-way           |     4.01                               |      249,729                 |      27.63
    FastCrc32c  |     4.35                              |     230,897                  |      25.53

 2)   Crc32c computation CPU cost (inclusive samples percentage)
    PER RUN
    Implementation | run |  TotalSamples   | Crc32c percentage
    3-way                 |  1    |  4,572,250,000 | 4.37%
    3-way                 |  2    |  3,779,250,000 | 4.62%
    3-way                 |  3    |  4,129,500,000 | 4.48%
    FastCrc32c       |  1    |  4,663,500,000 | 11.24%
    FastCrc32c       |  2    |  4,047,500,000 | 12.34%
    FastCrc32c       |  3    |  4,366,750,000 | 11.68%

 **# Test Plan**
     make -j64 corruption_test && ./corruption_test
      By default it uses 3-way SSE algorithm

     NO_THREEWAY_CRC32C=1 make -j64 corruption_test && ./corruption_test

    make clean && DEBUG_LEVEL=0 make -j64 db_bench
    make clean && DEBUG_LEVEL=0 NO_THREEWAY_CRC32C=1 make -j64 db_bench
Closes https://github.com/facebook/rocksdb/pull/3173

Differential Revision: D6330882

Pulled By: yingsu00

fbshipit-source-id: 8ec3d89719533b63b536a736663ca6f0dd4482e9
main
yingsu00 7 years ago committed by Facebook Github Bot
parent e763e1b623
commit f54d7f5fea
  1. 11
      CMakeLists.txt
  2. 1
      HISTORY.md
  3. 3
      Makefile
  4. 22
      build_tools/build_detect_platform
  5. 6
      tools/db_bench_tool.cc
  6. 596
      util/crc32c.cc
  7. 98
      util/crc32c_test.cc

@ -169,7 +169,7 @@ if(PORTABLE)
# MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h
# is available, it is available by default. # is available, it is available by default.
if(FORCE_SSE42 AND NOT MSVC) if(FORCE_SSE42 AND NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul")
endif() endif()
else() else()
if(MSVC) if(MSVC)
@ -181,13 +181,18 @@ endif()
include(CheckCXXSourceCompiles) include(CheckCXXSourceCompiles)
if(NOT MSVC) if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-msse4.2") set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
endif() endif()
CHECK_CXX_SOURCE_COMPILES(" CHECK_CXX_SOURCE_COMPILES("
#include <cstdint> #include <cstdint>
#include <nmmintrin.h> #include <nmmintrin.h>
#include <wmmintrin.h>
int main() { int main() {
volatile uint32_t x = _mm_crc32_u32(0, 0); volatile uint32_t x = _mm_crc32_u32(0, 0);
const auto a = _mm_set_epi64x(0, 0);
const auto b = _mm_set_epi64x(0, 0);
const auto c = _mm_clmulepi64_si128(a, b, 0x00);
auto d = _mm_cvtsi128_si64(c);
} }
" HAVE_SSE42) " HAVE_SSE42)
unset(CMAKE_REQUIRED_FLAGS) unset(CMAKE_REQUIRED_FLAGS)
@ -609,7 +614,7 @@ if(HAVE_SSE42 AND NOT FORCE_SSE42)
if(NOT MSVC) if(NOT MSVC)
set_source_files_properties( set_source_files_properties(
util/crc32c.cc util/crc32c.cc
PROPERTIES COMPILE_FLAGS "-msse4.2") PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
endif() endif()
endif() endif()

@ -27,6 +27,7 @@
* Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn user of inconsistent options. Previously we will not write to WAL and not respecting the sync options in this case. * Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn user of inconsistent options. Previously we will not write to WAL and not respecting the sync options in this case.
### New Features ### New Features
* CRC32C is now using the 3-way pipelined SSE algorithm `crc32c_3way` on supported platforms to improve performance. The system will choose to use this algorithm on supported platforms automatically whenever possible. If PCLMULQDQ is not supported it will fall back to the old Fast_CRC32 algorithm.
* `DBOptions::writable_file_max_buffer_size` can now be changed dynamically. * `DBOptions::writable_file_max_buffer_size` can now be changed dynamically.
* `DBOptions::bytes_per_sync`, `DBOptions::compaction_readahead_size`, and `DBOptions::wal_bytes_per_sync` can now be changed dynamically, `DBOptions::wal_bytes_per_sync` will flush all memtables and switch to a new WAL file. * `DBOptions::bytes_per_sync`, `DBOptions::compaction_readahead_size`, and `DBOptions::wal_bytes_per_sync` can now be changed dynamically, `DBOptions::wal_bytes_per_sync` will flush all memtables and switch to a new WAL file.
* Support dynamic adjustment of rate limit according to demand for background I/O. It can be enabled by passing `true` to the `auto_tuned` parameter in `NewGenericRateLimiter()`. The value passed as `rate_bytes_per_sec` will still be respected as an upper-bound. * Support dynamic adjustment of rate limit according to demand for background I/O. It can be enabled by passing `true` to the `auto_tuned` parameter in `NewGenericRateLimiter()`. The value passed as `rate_bytes_per_sec` will still be respected as an upper-bound.

@ -305,6 +305,9 @@ LDFLAGS += $(LUA_LIB)
endif endif
ifeq ($(NO_THREEWAY_CRC32C), 1)
CXXFLAGS += -DNO_THREEWAY_CRC32C
endif
CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers

@ -484,10 +484,10 @@ if test -z "$PORTABLE"; then
elif [ "$TARGET_OS" != AIX ] && [ "$TARGET_OS" != SunOS ]; then elif [ "$TARGET_OS" != AIX ] && [ "$TARGET_OS" != SunOS ]; then
COMMON_FLAGS="$COMMON_FLAGS -march=native " COMMON_FLAGS="$COMMON_FLAGS -march=native "
elif test "$USE_SSE"; then elif test "$USE_SSE"; then
COMMON_FLAGS="$COMMON_FLAGS -msse4.2" COMMON_FLAGS="$COMMON_FLAGS -msse4.2 -mpclmul"
fi fi
elif test "$USE_SSE"; then elif test "$USE_SSE"; then
COMMON_FLAGS="$COMMON_FLAGS -msse4.2" COMMON_FLAGS="$COMMON_FLAGS -msse4.2 -mpclmul"
fi fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
@ -501,6 +501,24 @@ if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_SSE42" COMMON_FLAGS="$COMMON_FLAGS -DHAVE_SSE42"
elif test "$USE_SSE"; then elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling"
exit 1
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <cstdint>
#include <wmmintrin.h>
int main() {
const auto a = _mm_set_epi64x(0, 0);
const auto b = _mm_set_epi64x(0, 0);
const auto c = _mm_clmulepi64_si128(a, b, 0x00);
auto d = _mm_cvtsi128_si64(c);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_PCLMUL"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling"
exit 1
fi fi
# iOS doesn't support thread-local storage, but this check would erroneously # iOS doesn't support thread-local storage, but this check would erroneously

@ -2752,8 +2752,10 @@ void VerifyDBFromDB(std::string& truth_db_name) {
void Crc32c(ThreadState* thread) { void Crc32c(ThreadState* thread) {
// Checksum about 500MB of data total // Checksum about 500MB of data total
const int size = 4096; const int size = FLAGS_block_size; // use --block_size option for db_bench
const char* label = "(4K per op)"; std::string labels = "(" + ToString(FLAGS_block_size) + " per op)";
const char* label = labels.c_str();
std::string data(size, 'x'); std::string data(size, 'x');
int64_t bytes = 0; int64_t bytes = 0;
uint32_t crc = 0; uint32_t crc = 0;

@ -9,12 +9,11 @@
// //
// A portable implementation of crc32c, optimized to handle // A portable implementation of crc32c, optimized to handle
// four bytes at a time. // four bytes at a time.
#include "util/crc32c.h" #include "util/crc32c.h"
#include <stdint.h> #include <stdint.h>
#ifdef HAVE_SSE42 #ifdef HAVE_SSE42
#include <nmmintrin.h> #include <nmmintrin.h>
#include <wmmintrin.h>
#endif #endif
#include "util/coding.h" #include "util/coding.h"
@ -352,6 +351,7 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
template<void (*CRC32)(uint64_t*, uint8_t const**)> template<void (*CRC32)(uint64_t*, uint8_t const**)>
uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size; const uint8_t *e = p + size;
uint64_t l = crc ^ 0xffffffffu; uint64_t l = crc ^ 0xffffffffu;
@ -395,13 +395,14 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
// Detect if SS42 or not. // Detect if SS42 or not.
#ifndef HAVE_POWER8 #ifndef HAVE_POWER8
static bool isSSE42() { static bool isSSE42() {
#ifndef HAVE_SSE42 #ifndef HAVE_SSE42
return false; return false;
#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) #elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
uint32_t c_; uint32_t c_;
__asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx"); __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
return c_ & (1U << 20); // copied from CpuId.h in Folly. return c_ & (1U << 20); // copied from CpuId.h in Folly. Test SSE42
#elif defined(_WIN64) #elif defined(_WIN64)
int info[4]; int info[4];
__cpuidex(info, 0x00000001, 0); __cpuidex(info, 0x00000001, 0);
@ -410,7 +411,26 @@ static bool isSSE42() {
return false; return false;
#endif #endif
} }
// Runtime probe for the PCLMULQDQ (carry-less multiply) instruction.
//
// On GCC-compatible x86-64 builds and on 64-bit Windows this executes CPUID
// with EAX = 1 and tests bit 1 of ECX (info[2] in the __cpuidex output).
// Returns false when HAVE_SSE42 is not defined and on any other
// toolchain/target combination.
static bool isPCLMULQDQ() {
#ifndef HAVE_SSE42
// Without SSE4.2 compiler support the hardware path is compiled out entirely.
// NOTE(review): this guard tests HAVE_SSE42, not HAVE_PCLMUL. In
// build_detect_platform the two macros come from separate compiler probes
// (only the CMake probe checks both under HAVE_SSE42), so this guard alone
// does not establish that the compiler supports the PCLMUL intrinsics.
return false;
#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
uint32_t c_;
__asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
return c_ & (1U << 1); // PCLMULQDQ is in bit 1 (not bit 0)
#elif defined(_WIN64)
int info[4];
__cpuidex(info, 0x00000001, 0);
return (info[2] & ((int)1 << 1)) != 0;
#else
return false;
#endif #endif
}
#endif // HAVE_POWER8
typedef uint32_t (*Function)(uint32_t, const char*, size_t); typedef uint32_t (*Function)(uint32_t, const char*, size_t);
@ -440,13 +460,6 @@ static bool isAltiVec() {
} }
#endif #endif
static inline Function Choose_Extend() {
#ifndef HAVE_POWER8
return isSSE42() ? ExtendImpl<Fast_CRC32> : ExtendImpl<Slow_CRC32>;
#else
return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
#endif
}
std::string IsFastCrc32Supported() { std::string IsFastCrc32Supported() {
bool has_fast_crc = false; bool has_fast_crc = false;
@ -475,11 +488,572 @@ std::string IsFastCrc32Supported() {
return fast_zero_msg; return fast_zero_msg;
} }
static Function ChosenExtend = Choose_Extend();
/*
* Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
* Ferry Toth
* ftoth@exalondelft.nl
*
* https://github.com/htot/crc32c
*
* Modified by Facebook
*
* Original intel whitepaper:
* "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
*
* This version is from the folly library, created by Dave Watson <davejwatson@fb.com>
*
*/
#if defined HAVE_SSE42 && defined HAVE_PCLMUL
// Helper macros for crc32c_3way. Each one expands to complete statements
// (trailing semicolons included) and relies on token pasting: passing
// `crc`/`next` as the first two arguments makes the expansion operate on the
// caller's locals crc0/crc1/crc2 and next0/next1/next2.
//
// CRCtriplet: advance all three CRC streams by one 64-bit word, read at
// index `offset` relative to each stream's pointer.
#define CRCtriplet(crc, buf, offset) \
crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset));
// CRCduplet: same as CRCtriplet but only for streams 0 and 1; stream 2 is
// left untouched.
#define CRCduplet(crc, buf, offset) \
crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
// CRCsinglet: advance a single CRC by one 64-bit word. Unlike the macros
// above, `crc` and `buf` are used as given (no suffix is pasted) and `offset`
// is added to `buf` before the cast to uint64_t*, so it is scaled by the
// pointee size of `buf` rather than by 8.
#define CRCsinglet(crc, buf, offset) \
crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset));
// Numbers taken directly from intel whitepaper.
//
// Layout as consumed by CombineCRC: the 256 values form 128 consecutive
// 128-bit entries (two uint64_t each, i.e. two entries per source line).
// Entry (block_size - 1) is loaded for a block of `block_size` words per
// stream; its first (low) 64-bit value is carry-less multiplied with crc0 and
// its second (high) 64-bit value with crc1.
// clang-format off
const uint64_t clmul_constants[] = {
0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
};
// Fold (len & 7) bytes starting at `next` into the running CRC and advance
// `next` past them.
//
// Only bits 0..2 of `len` are examined: bit 2 consumes four bytes, bit 1 two
// bytes and bit 0 one byte, in that order. crc32c_3way calls this both for
// the unaligned head in front of the first 8-byte boundary and for the
// sub-8-byte tail of the buffer.
//
// len   number of bytes to consume; only its low three bits are used
// crc0  crc so far, updated on return. Only the low 32 bits are read; the
//       result is stored zero-extended.
// next  next data pointer, updated on return
//
// The 4- and 2-byte pieces are assembled byte by byte in little-endian order
// instead of being read through a casted uint32_t*/uint16_t*: `next` carries
// no alignment guarantee here (the head call is made exactly when it is not
// 8-byte aligned), and a casted load would also drop the const qualifier.
// On x86, where these intrinsics exist, the assembled value equals what a
// direct load would return.
inline void align_to_8(
    size_t len,
    uint64_t& crc0,                // crc so far, updated on return
    const unsigned char*& next) {  // next data pointer, updated on return
  uint32_t crc32bit = static_cast<uint32_t>(crc0);
  if (len & 0x04) {
    const uint32_t word = static_cast<uint32_t>(next[0]) |
                          (static_cast<uint32_t>(next[1]) << 8) |
                          (static_cast<uint32_t>(next[2]) << 16) |
                          (static_cast<uint32_t>(next[3]) << 24);
    crc32bit = _mm_crc32_u32(crc32bit, word);
    next += sizeof(uint32_t);
  }
  if (len & 0x02) {
    const uint16_t half = static_cast<uint16_t>(next[0] | (next[1] << 8));
    crc32bit = _mm_crc32_u16(crc32bit, half);
    next += sizeof(uint16_t);
  }
  if (len & 0x01) {
    crc32bit = _mm_crc32_u8(crc32bit, *next);
    next++;
  }
  crc0 = crc32bit;
}
//
// CombineCRC merges the three partial CRCs of one block into a single CRC.
// It performs pclmulqdq multiplication of 2 partial CRC's (crc0, crc1) and a
// well chosen constant and xor's these with the remaining data word of the
// third stream before folding that into crc2.
//
// block_size  number of 64-bit words per stream in this block; selects entry
//             (block_size - 1) of clmul_constants, so it must lie in
//             [1, 128] (the table holds 128 entries)
// crc0, crc1  finished partial CRCs of the first and second stream
// crc2        partial CRC of the third stream, NOT yet including its last
//             word (the caller ends the block with CRCduplet, which skips
//             stream 2)
// next2       one past the last word of the third stream; next2[-1] is read
//
// Returns the combined CRC of the whole block.
//
inline uint64_t CombineCRC(
    size_t block_size,
    uint64_t crc0,
    uint64_t crc1,
    uint64_t crc2,
    const uint64_t* next2) {
  // Unaligned load on purpose: clmul_constants is a plain uint64_t array, so
  // nothing in its declaration promises the 16-byte alignment that a direct
  // dereference of a __m128i pointer requires.
  const auto multiplier = _mm_loadu_si128(
      reinterpret_cast<const __m128i*>(clmul_constants) + (block_size - 1));
  // imm 0x00: low half of crc0_xmm times low half of the table entry.
  const auto crc0_xmm = _mm_set_epi64x(0, crc0);
  const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
  // imm 0x10: low half of crc1_xmm times high half of the table entry.
  const auto crc1_xmm = _mm_set_epi64x(0, crc1);
  const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
  const auto res = _mm_xor_si128(res0, res1);
  crc0 = static_cast<uint64_t>(_mm_cvtsi128_si64(res));
  // Mix in the one data word of stream 2 that has not been consumed yet.
  crc0 = crc0 ^ *(next2 - 1);
  crc2 = _mm_crc32_u64(crc2, crc0);
  return crc2;
}
// Compute CRC-32C using the Intel hardware instruction.
//
// Extends the running checksum `crc` over buf[0, len) and returns the new
// checksum. The value is XORed with 0xffffffff on entry and again on return.
//
// Strategy, by input length:
//  * len < 8: only the trailing align_to_8() call does any work.
//  * 8 <= len <= 216: up to 27 sequential 8-byte steps (CRCsinglet, no
//    alignment step is performed first), then the sub-8-byte tail.
//  * len > 216: consume the bytes in front of the next 8-byte boundary, then
//    process the data in blocks. Each block is three adjacent streams of
//    `block_size` 64-bit words whose CRCs are advanced together (CRCtriplet)
//    and merged by CombineCRC. The first block uses count % 128 words per
//    stream (128 when that is 0), every later block uses 128. What is left
//    (fewer than 24 bytes) goes through the singlet switch and the tail.
uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
const unsigned char* next = (const unsigned char*)buf;
uint64_t count;
uint64_t crc0, crc1, crc2;
crc0 = crc ^ 0xffffffffu;
if (len >= 8) {
// if len > 216 then align and use triplets
if (len > 216) {
{
// Work on the bytes (< 8) before the first 8-byte alignment addr starts
uint64_t align_bytes = (8 - (uintptr_t)next) & 7;
len -= align_bytes;
align_to_8(align_bytes, crc0, next);
}
// Now work on the remaining blocks
count = len / 24; // number of triplets
len %= 24; // bytes remaining
uint64_t n = count >> 7; // #blocks = first block + full blocks
uint64_t block_size = count & 127;
if (block_size == 0) {
block_size = 128;
} else {
n++;
}
// points to the first byte of the next block
const uint64_t* next0 = (uint64_t*)next + block_size;
const uint64_t* next1 = next0 + block_size;
const uint64_t* next2 = next1 + block_size;
crc1 = crc2 = 0;
// Use Duff's device: a do/while loop interleaved with the case labels of
// a switch() statement. The switch jumps into the middle of the unrolled
// loop body for the first (possibly shorter) block; every later iteration
// starts at the top with a full block of 128.
switch (block_size) {
case 128:
do {
// jumps here for a full block of len 128
CRCtriplet(crc, next, -128);
case 127:
// jumps here or below for the first block smaller than 128
CRCtriplet(crc, next, -127);
case 126:
CRCtriplet(crc, next, -126);
case 125:
CRCtriplet(crc, next, -125);
case 124:
CRCtriplet(crc, next, -124);
case 123:
CRCtriplet(crc, next, -123);
case 122:
CRCtriplet(crc, next, -122);
case 121:
CRCtriplet(crc, next, -121);
case 120:
CRCtriplet(crc, next, -120);
case 119:
CRCtriplet(crc, next, -119);
case 118:
CRCtriplet(crc, next, -118);
case 117:
CRCtriplet(crc, next, -117);
case 116:
CRCtriplet(crc, next, -116);
case 115:
CRCtriplet(crc, next, -115);
case 114:
CRCtriplet(crc, next, -114);
case 113:
CRCtriplet(crc, next, -113);
case 112:
CRCtriplet(crc, next, -112);
case 111:
CRCtriplet(crc, next, -111);
case 110:
CRCtriplet(crc, next, -110);
case 109:
CRCtriplet(crc, next, -109);
case 108:
CRCtriplet(crc, next, -108);
case 107:
CRCtriplet(crc, next, -107);
case 106:
CRCtriplet(crc, next, -106);
case 105:
CRCtriplet(crc, next, -105);
case 104:
CRCtriplet(crc, next, -104);
case 103:
CRCtriplet(crc, next, -103);
case 102:
CRCtriplet(crc, next, -102);
case 101:
CRCtriplet(crc, next, -101);
case 100:
CRCtriplet(crc, next, -100);
case 99:
CRCtriplet(crc, next, -99);
case 98:
CRCtriplet(crc, next, -98);
case 97:
CRCtriplet(crc, next, -97);
case 96:
CRCtriplet(crc, next, -96);
case 95:
CRCtriplet(crc, next, -95);
case 94:
CRCtriplet(crc, next, -94);
case 93:
CRCtriplet(crc, next, -93);
case 92:
CRCtriplet(crc, next, -92);
case 91:
CRCtriplet(crc, next, -91);
case 90:
CRCtriplet(crc, next, -90);
case 89:
CRCtriplet(crc, next, -89);
case 88:
CRCtriplet(crc, next, -88);
case 87:
CRCtriplet(crc, next, -87);
case 86:
CRCtriplet(crc, next, -86);
case 85:
CRCtriplet(crc, next, -85);
case 84:
CRCtriplet(crc, next, -84);
case 83:
CRCtriplet(crc, next, -83);
case 82:
CRCtriplet(crc, next, -82);
case 81:
CRCtriplet(crc, next, -81);
case 80:
CRCtriplet(crc, next, -80);
case 79:
CRCtriplet(crc, next, -79);
case 78:
CRCtriplet(crc, next, -78);
case 77:
CRCtriplet(crc, next, -77);
case 76:
CRCtriplet(crc, next, -76);
case 75:
CRCtriplet(crc, next, -75);
case 74:
CRCtriplet(crc, next, -74);
case 73:
CRCtriplet(crc, next, -73);
case 72:
CRCtriplet(crc, next, -72);
case 71:
CRCtriplet(crc, next, -71);
case 70:
CRCtriplet(crc, next, -70);
case 69:
CRCtriplet(crc, next, -69);
case 68:
CRCtriplet(crc, next, -68);
case 67:
CRCtriplet(crc, next, -67);
case 66:
CRCtriplet(crc, next, -66);
case 65:
CRCtriplet(crc, next, -65);
case 64:
CRCtriplet(crc, next, -64);
case 63:
CRCtriplet(crc, next, -63);
case 62:
CRCtriplet(crc, next, -62);
case 61:
CRCtriplet(crc, next, -61);
case 60:
CRCtriplet(crc, next, -60);
case 59:
CRCtriplet(crc, next, -59);
case 58:
CRCtriplet(crc, next, -58);
case 57:
CRCtriplet(crc, next, -57);
case 56:
CRCtriplet(crc, next, -56);
case 55:
CRCtriplet(crc, next, -55);
case 54:
CRCtriplet(crc, next, -54);
case 53:
CRCtriplet(crc, next, -53);
case 52:
CRCtriplet(crc, next, -52);
case 51:
CRCtriplet(crc, next, -51);
case 50:
CRCtriplet(crc, next, -50);
case 49:
CRCtriplet(crc, next, -49);
case 48:
CRCtriplet(crc, next, -48);
case 47:
CRCtriplet(crc, next, -47);
case 46:
CRCtriplet(crc, next, -46);
case 45:
CRCtriplet(crc, next, -45);
case 44:
CRCtriplet(crc, next, -44);
case 43:
CRCtriplet(crc, next, -43);
case 42:
CRCtriplet(crc, next, -42);
case 41:
CRCtriplet(crc, next, -41);
case 40:
CRCtriplet(crc, next, -40);
case 39:
CRCtriplet(crc, next, -39);
case 38:
CRCtriplet(crc, next, -38);
case 37:
CRCtriplet(crc, next, -37);
case 36:
CRCtriplet(crc, next, -36);
case 35:
CRCtriplet(crc, next, -35);
case 34:
CRCtriplet(crc, next, -34);
case 33:
CRCtriplet(crc, next, -33);
case 32:
CRCtriplet(crc, next, -32);
case 31:
CRCtriplet(crc, next, -31);
case 30:
CRCtriplet(crc, next, -30);
case 29:
CRCtriplet(crc, next, -29);
case 28:
CRCtriplet(crc, next, -28);
case 27:
CRCtriplet(crc, next, -27);
case 26:
CRCtriplet(crc, next, -26);
case 25:
CRCtriplet(crc, next, -25);
case 24:
CRCtriplet(crc, next, -24);
case 23:
CRCtriplet(crc, next, -23);
case 22:
CRCtriplet(crc, next, -22);
case 21:
CRCtriplet(crc, next, -21);
case 20:
CRCtriplet(crc, next, -20);
case 19:
CRCtriplet(crc, next, -19);
case 18:
CRCtriplet(crc, next, -18);
case 17:
CRCtriplet(crc, next, -17);
case 16:
CRCtriplet(crc, next, -16);
case 15:
CRCtriplet(crc, next, -15);
case 14:
CRCtriplet(crc, next, -14);
case 13:
CRCtriplet(crc, next, -13);
case 12:
CRCtriplet(crc, next, -12);
case 11:
CRCtriplet(crc, next, -11);
case 10:
CRCtriplet(crc, next, -10);
case 9:
CRCtriplet(crc, next, -9);
case 8:
CRCtriplet(crc, next, -8);
case 7:
CRCtriplet(crc, next, -7);
case 6:
CRCtriplet(crc, next, -6);
case 5:
CRCtriplet(crc, next, -5);
case 4:
CRCtriplet(crc, next, -4);
case 3:
CRCtriplet(crc, next, -3);
case 2:
CRCtriplet(crc, next, -2);
case 1:
CRCduplet(crc, next, -1); // the final triplet is actually only 2
// The last word of the third stream (next2[-1]) is not fed to crc2 here;
// CombineCRC reads it and folds it in while merging the three CRCs.
crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
if (--n > 0) {
crc1 = crc2 = 0;
block_size = 128;
// points to the first byte of the next block
next0 = next2 + 128;
next1 = next0 + 128; // from here on all blocks are 128 long
next2 = next1 + 128;
}
case 0:; // never selected by the switch: block_size is always in [1, 128]
} while (n > 0);
}
// next2 was not advanced after the last block, so it marks the end of the
// data consumed by the triplet loop.
next = (const unsigned char*)next2;
}
uint64_t count2 = len >> 3; // 216 or fewer bytes is 27 or fewer singlets
len = len & 7;
// Move `next` past all the singlets first; each case below then reads at a
// negative byte offset and falls through to the following one.
next += (count2 * 8);
switch (count2) {
case 27:
CRCsinglet(crc0, next, -27 * 8);
case 26:
CRCsinglet(crc0, next, -26 * 8);
case 25:
CRCsinglet(crc0, next, -25 * 8);
case 24:
CRCsinglet(crc0, next, -24 * 8);
case 23:
CRCsinglet(crc0, next, -23 * 8);
case 22:
CRCsinglet(crc0, next, -22 * 8);
case 21:
CRCsinglet(crc0, next, -21 * 8);
case 20:
CRCsinglet(crc0, next, -20 * 8);
case 19:
CRCsinglet(crc0, next, -19 * 8);
case 18:
CRCsinglet(crc0, next, -18 * 8);
case 17:
CRCsinglet(crc0, next, -17 * 8);
case 16:
CRCsinglet(crc0, next, -16 * 8);
case 15:
CRCsinglet(crc0, next, -15 * 8);
case 14:
CRCsinglet(crc0, next, -14 * 8);
case 13:
CRCsinglet(crc0, next, -13 * 8);
case 12:
CRCsinglet(crc0, next, -12 * 8);
case 11:
CRCsinglet(crc0, next, -11 * 8);
case 10:
CRCsinglet(crc0, next, -10 * 8);
case 9:
CRCsinglet(crc0, next, -9 * 8);
case 8:
CRCsinglet(crc0, next, -8 * 8);
case 7:
CRCsinglet(crc0, next, -7 * 8);
case 6:
CRCsinglet(crc0, next, -6 * 8);
case 5:
CRCsinglet(crc0, next, -5 * 8);
case 4:
CRCsinglet(crc0, next, -4 * 8);
case 3:
CRCsinglet(crc0, next, -3 * 8);
case 2:
CRCsinglet(crc0, next, -2 * 8);
case 1:
CRCsinglet(crc0, next, -1 * 8);
case 0:;
}
}
{
// Remaining tail of fewer than 8 bytes (this is the whole input when
// len < 8 on entry).
align_to_8(len, crc0, next);
return (uint32_t)crc0 ^ 0xffffffffu;
}
}
#endif //HAVE_SSE42 && HAVE_PCLMUL
// Pick the CRC32C implementation for this process.
//
// Non-POWER8 builds:
//   - CPU reports no SSE4.2                     -> ExtendImpl<Slow_CRC32>
//   - SSE4.2 and PCLMULQDQ reported, and the 3-way code is compiled in
//     (HAVE_SSE42 && HAVE_PCLMUL && !NO_THREEWAY_CRC32C) -> crc32c_3way
//   - anything else with SSE4.2                 -> ExtendImpl<Fast_CRC32>
// POWER8 builds:
//   - AltiVec available -> ExtendPPCImpl, otherwise ExtendImpl<Slow_CRC32>
static inline Function Choose_Extend() {
#ifdef HAVE_POWER8
  if (isAltiVec()) {
    return ExtendPPCImpl;
  }
  return ExtendImpl<Slow_CRC32>;
#else
  if (!isSSE42()) {
    return ExtendImpl<Slow_CRC32>;
  }
  // The PCLMULQDQ probe runs in every build configuration, whether or not the
  // 3-way implementation is available to act on its answer.
  const bool has_pclmulqdq = isPCLMULQDQ();
#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C
  if (has_pclmulqdq) {
    return crc32c_3way;
  }
#else
  (void)has_pclmulqdq;
#endif
  // SSE4.2 without a usable 3-way path. Fast_CRC32 will check HAVE_SSE42
  // itself.
  return ExtendImpl<Fast_CRC32>;
#endif  // HAVE_POWER8
}
static Function ChosenExtend = Choose_Extend();
uint32_t Extend(uint32_t crc, const char* buf, size_t size) { uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
return ChosenExtend(crc, buf, size); return ChosenExtend(crc, buf, size);
} }
} // namespace crc32c } // namespace crc32c
} // namespace rocksdb } // namespace rocksdb

@ -6,7 +6,6 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/crc32c.h" #include "util/crc32c.h"
#include "util/testharness.h" #include "util/testharness.h"
@ -15,7 +14,57 @@ namespace crc32c {
class CRC { }; class CRC { };
// Tests for 3-way crc32c algorithm. We need these tests because it uses
// different lookup tables than the original Fast_CRC32
//
// Size in bytes of the shared test buffer: 512K 64-bit words.
const unsigned int BUFFER_SIZE = 512 * 1024 * sizeof(uint64_t);
// Input data for the table-driven checks below; main() fills it with a
// deterministic pattern before the tests run.
char buffer[BUFFER_SIZE];
// One test vector: checksum the `length` bytes starting at buffer + offset.
struct ExpectedResult {
// byte offset into `buffer` where the input starts
size_t offset;
// number of input bytes
size_t length;
// Bitwise complement of the checksum the code under test must produce: the
// test compares ~crc32c against the result of Value()/Extend().
uint32_t crc32c;
};
ExpectedResult expectedResults[] = {
// Zero-byte input
{ 0, 0, ~0U },
// Small aligned inputs to test special cases in SIMD implementations
{ 8, 1, 1543413366 },
{ 8, 2, 523493126 },
{ 8, 3, 1560427360 },
{ 8, 4, 3422504776 },
{ 8, 5, 447841138 },
{ 8, 6, 3910050499 },
{ 8, 7, 3346241981 },
// Small unaligned inputs
{ 9, 1, 3855826643 },
{ 10, 2, 560880875 },
{ 11, 3, 1479707779 },
{ 12, 4, 2237687071 },
{ 13, 5, 4063855784 },
{ 14, 6, 2553454047 },
{ 15, 7, 1349220140 },
// Larger inputs to test leftover chunks at the end of aligned blocks
{ 8, 8, 627613930 },
{ 8, 9, 2105929409 },
{ 8, 10, 2447068514 },
{ 8, 11, 863807079 },
{ 8, 12, 292050879 },
{ 8, 13, 1411837737 },
{ 8, 14, 2614515001 },
{ 8, 15, 3579076296 },
{ 8, 16, 2897079161 },
{ 8, 17, 675168386 },
// Much larger inputs
{ 0, BUFFER_SIZE, 2096790750 },
{ 1, BUFFER_SIZE / 2, 3854797577 },
};
TEST(CRC, StandardResults) { TEST(CRC, StandardResults) {
// Original Fast_CRC32 tests.
// From rfc3720 section B.4. // From rfc3720 section B.4.
char buf[32]; char buf[32];
@ -50,6 +99,24 @@ TEST(CRC, StandardResults) {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}; };
ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data))); ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
// 3-Way Crc32c tests ported from folly.
// Test 1: single computation
for (auto expected : expectedResults) {
uint32_t result = Value(buffer + expected.offset, expected.length);
EXPECT_EQ(~expected.crc32c, result);
}
// Test 2: stitching two computations
for (auto expected : expectedResults) {
size_t partialLength = expected.length / 2;
uint32_t partialChecksum = Value(buffer + expected.offset, partialLength);
uint32_t result = Extend(partialChecksum,
buffer + expected.offset + partialLength,
expected.length - partialLength);
EXPECT_EQ(~expected.crc32c, result);
}
} }
TEST(CRC, Values) { TEST(CRC, Values) {
@ -72,7 +139,36 @@ TEST(CRC, Mask) {
} // namespace crc32c } // namespace crc32c
} // namespace rocksdb } // namespace rocksdb
// copied from folly
// Default starting value for fnv64_buf().
const uint64_t FNV_64_HASH_START = 14695981039346656037ULL;

// Hash the `n` bytes at `buf`, starting from `hash`, and return the result.
//
// For every byte the running value is first multiplied by 0x100000001b3
// (modulo 2^64) and then XORed with the byte. The byte is read as a *signed*
// char and sign-extended to 64 bits, so bytes >= 0x80 flip all upper bits;
// this is done on purpose to get the same result on platforms where plain
// char is unsigned.
//
// Passing the result of one call as `hash` of the next continues the hash
// over consecutive buffers. With n == 0 the input `hash` is returned as is.
inline uint64_t fnv64_buf(const void* buf,
                          size_t n,
                          uint64_t hash = FNV_64_HASH_START) {
  // 1 + 2^1 + 2^4 + 2^5 + 2^7 + 2^8 + 2^40
  const uint64_t kMultiplier = 0x100000001b3ULL;
  const signed char* cursor = static_cast<const signed char*>(buf);
  const signed char* const stop = cursor + n;
  for (; cursor != stop; ++cursor) {
    hash *= kMultiplier;
    hash ^= static_cast<uint64_t>(static_cast<int64_t>(*cursor));
  }
  return hash;
}
// Test entry point: fills the shared crc32c test buffer, then runs all tests.
int main(int argc, char** argv) { int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv); ::testing::InitGoogleTest(&argc, argv);
// Populate a buffer with a deterministic pattern
// on which to compute checksums
// `src` starts at the first 64-bit word and `dst` ends up one word ahead of
// it: word 0 is set to zero and every following word is the fnv64_buf hash of
// the 8 bytes of the word right before it.
const uint8_t* src = (uint8_t*)rocksdb::crc32c::buffer;
uint64_t* dst = (uint64_t*)rocksdb::crc32c::buffer;
const uint64_t* end = (const uint64_t*)(rocksdb::crc32c::buffer + rocksdb::crc32c::BUFFER_SIZE);
*dst++ = 0;
while (dst < end) {
*dst++ = fnv64_buf((const char*)src, sizeof(uint64_t));
src += sizeof(uint64_t);
}
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }

Loading…
Cancel
Save