From a3c1832e862ab4b76ccf1299d6e95b15eb50730e Mon Sep 17 00:00:00 2001
From: Yuqi Gu
Date: Wed, 17 Jul 2019 11:19:06 -0700
Subject: [PATCH] Arm64 CRC32 parallel computation optimization for RocksDB (#5494)

Summary:
Crc32c parallel computation optimization. The algorithm comes from the Intel whitepaper:
[crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf)

Input data is divided into three equal-sized blocks and processed as three parallel streams (crc0, crc1, crc2), 1024 bytes per iteration. Each block is 42 (BLK_LENGTH) steps * 8 bytes (the step length of crc32c_u64). The three partial CRCs are then merged back into a single value with two carry-less multiplications (vmull_p64) against precomputed constants, as illustrated by the sketch below.
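The merge works because CRC32C without the pre/post inversion is linear over GF(2): each partial CRC can be extended over the bytes it never saw (all zeros), and the three results XOR-ed together. The following standalone C sketch is an illustration, not part of the patch; `crc32c_raw` and `crc32c_zeros` are hypothetical helper names. It demonstrates the split-and-merge identity with a bitwise reference implementation:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32C update (reflected polynomial 0x82F63B78), without the
 * pre/post inversion, matching the inner loop of crc32c_arm64. */
static uint32_t crc32c_raw(uint32_t crc, const uint8_t *p, size_t n) {
  while (n--) {
    crc ^= *p++;
    for (int k = 0; k < 8; k++)
      crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
  }
  return crc;
}

/* Extend a raw CRC over n zero bytes, byte at a time. */
static uint32_t crc32c_zeros(uint32_t crc, size_t n) {
  const uint8_t z = 0;
  while (n--) crc = crc32c_raw(crc, &z, 1);
  return crc;
}

int main(void) {
  uint8_t buf[1024];
  for (size_t i = 0; i < sizeof(buf); i++) buf[i] = (uint8_t)(i * 131 + 7);

  /* Linear pass over the whole 1024-byte chunk. */
  uint32_t seed = 0xdeadbeef;
  uint32_t linear = crc32c_raw(seed, buf, 1024);

  /* Same split as the patch: crc0 covers the 8-byte head plus block 0
   * (bytes [0, 344)), crc1 covers [344, 680), and crc2 covers block 2
   * plus the 8-byte tail ([680, 1024)). */
  uint32_t crc0 = crc32c_raw(seed, buf, 344);
  uint32_t crc1 = crc32c_raw(0, buf + 344, 336);
  uint32_t crc2 = crc32c_raw(0, buf + 680, 344);

  /* Merge: extend crc0 and crc1 over the bytes they did not see, XOR. */
  uint32_t merged = crc32c_zeros(crc0, 680) ^ crc32c_zeros(crc1, 344) ^ crc2;

  printf("linear = %08x, merged = %08x\n", linear, merged);
  assert(linear == merged);
  return 0;
}
```

The byte-at-a-time `crc32c_zeros` is deliberately naive; the point of the precomputed constants k0 and k1 in the patch is to collapse that whole zero-extension into one vmull_p64 plus one crc32c_u64 reduction.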
1. crc32c_test:
```
[==========] Running 4 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 4 tests from CRC
[ RUN      ] CRC.StandardResults
[       OK ] CRC.StandardResults (1 ms)
[ RUN      ] CRC.Values
[       OK ] CRC.Values (0 ms)
[ RUN      ] CRC.Extend
[       OK ] CRC.Extend (0 ms)
[ RUN      ] CRC.Mask
[       OK ] CRC.Mask (0 ms)
[----------] 4 tests from CRC (1 ms total)

[----------] Global test environment tear-down
[==========] 4 tests from 1 test case ran. (1 ms total)
[  PASSED  ] 4 tests.
```

2. RocksDB benchmark: db_bench --benchmarks="crc32c"
```
Linear Arm crc32c:
crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op)
```
```
Parallel optimization with Armv8 crypto extension:
crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op)
```
This is a ~2.4x speedup over the linear Arm crc32c instruction sequence.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494

Differential Revision: D16340806

fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945
---
 Makefile             |  4 ++--
 util/crc32c_arm64.cc | 83 ++++++++++++++++++++++++++++++++++----------
 util/crc32c_arm64.h  | 15 ++++++--
 3 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 1828b833b..100f160ca 100644
--- a/Makefile
+++ b/Makefile
@@ -144,8 +144,8 @@ HAVE_POWER8=1
 endif
 
 ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
-CXXFLAGS += -march=armv8-a+crc
-CFLAGS += -march=armv8-a+crc
+CXXFLAGS += -march=armv8-a+crc+crypto
+CFLAGS += -march=armv8-a+crc+crypto
 ARMCRC_SOURCE=1
 endif
 
diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 62fabe99e..8743f8c72 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -19,35 +19,82 @@ uint32_t crc32c_runtime_check(void) {
 
 uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                       unsigned len) {
-  const uint8_t *buf1;
-  const uint16_t *buf2;
-  const uint32_t *buf4;
-  const uint64_t *buf8;
+  const uint8_t *buf8;
+  const uint64_t *buf64 = (uint64_t *)data;
+  int length = (int)len;
+  crc ^= 0xffffffff;
 
-  int64_t length = (int64_t)len;
+#ifdef HAVE_ARM64_CRYPTO
+  /* Crc32c parallel computation
+   *   Algorithm comes from Intel whitepaper:
+   *   crc-iscsi-polynomial-crc32-instruction-paper
+   *
+   * Input data is divided into three equal-sized blocks
+   * Three parallel blocks (crc0, crc1, crc2) for 1024 bytes
+   * One block: 42 (BLK_LENGTH) * 8 (step length: crc32c_u64) bytes
+   */
+#define BLK_LENGTH 42
+  while (length >= 1024) {
+    uint64_t t0, t1;
+    uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
 
-  crc ^= 0xffffffff;
-  buf8 = (const uint64_t *)data;
-  while ((length -= sizeof(uint64_t)) >= 0) {
-    crc = __crc32cd(crc, *buf8++);
+    /* Parallel Param:
+     *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+     *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+     */
+    uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+    /* First 8 bytes for better pipelining */
+    crc0 = crc32c_u64(crc, *buf64++);
+
+    /* 3 blocks crc32c parallel computation
+     *
+     * 42 * 8 * 3 = 1008 (bytes)
+     */
+    for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
+      crc0 = crc32c_u64(crc0, *buf64);
+      crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
+      crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
+    }
+    buf64 += (BLK_LENGTH * 2);
+
+    /* Last 8 bytes */
+    crc = crc32c_u64(crc2, *buf64++);
+
+    t0 = (uint64_t)vmull_p64(crc0, k0);
+    t1 = (uint64_t)vmull_p64(crc1, k1);
+
+    /* Merge (crc0, crc1, crc2) -> crc */
+    crc1 = crc32c_u64(0, t1);
+    crc ^= crc1;
+    crc0 = crc32c_u64(0, t0);
+    crc ^= crc0;
+
+    length -= 1024;
+  }
+#endif
+  buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    buf8 += 8;
+    length -= 8;
   }
 
   /* The following is more efficient than the straight loop */
-  buf4 = (const uint32_t *)buf8;
-  if (length & sizeof(uint32_t)) {
-    crc = __crc32cw(crc, *buf4++);
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    buf8 += 4;
     length -= 4;
   }
-  buf2 = (const uint16_t *)buf4;
-  if (length & sizeof(uint16_t)) {
-    crc = __crc32ch(crc, *buf2++);
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    buf8 += 2;
     length -= 2;
   }
-  buf1 = (const uint8_t *)buf2;
-  if (length & sizeof(uint8_t))
-    crc = __crc32cb(crc, *buf1);
+  if (length >= 1)
+    crc = crc32c_u8(crc, *buf8);
 
   crc ^= 0xffffffff;
   return crc;
diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
index 80b3aca36..fb727ce40 100644
--- a/util/crc32c_arm64.h
+++ b/util/crc32c_arm64.h
@@ -9,13 +9,24 @@
 #include <inttypes.h>
 
 #if defined(__aarch64__) || defined(__AARCH64__)
+
 #ifdef __ARM_FEATURE_CRC32
 #define HAVE_ARM64_CRC
 #include <arm_acle.h>
+#define crc32c_u8(crc, v) __crc32cb(crc, v)
+#define crc32c_u16(crc, v) __crc32ch(crc, v)
+#define crc32c_u32(crc, v) __crc32cw(crc, v)
+#define crc32c_u64(crc, v) __crc32cd(crc, v)
+
 extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                              unsigned len);
 extern uint32_t crc32c_runtime_check(void);
-#endif
-#endif
+#ifdef __ARM_FEATURE_CRYPTO
+#define HAVE_ARM64_CRYPTO
+#include <arm_neon.h>
+#endif  // __ARM_FEATURE_CRYPTO
+#endif  // __ARM_FEATURE_CRC32
+
+#endif  // defined(__aarch64__) || defined(__AARCH64__)
 
 #endif
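A closing note on the two magic numbers: they can be reproduced in plain C. The sketch below is illustrative, not part of the patch. It assumes the usual reflected-register convention, where bit 31 of the CRC register holds the coefficient of x^0 and processing a zero bit multiplies the register polynomial by x mod P; under that convention, CRC32(x^e) in the patch comment equals x^(e+32) mod P, because a raw CRC of a message m computes m * x^32 mod P.

```c
#include <stdint.h>
#include <stdio.h>

/* Compute x^n mod P(x) for the Castagnoli polynomial, in the reflected
 * encoding used by the CRC register (bit 31 = coefficient of x^0).
 * Each step multiplies the current polynomial by x and reduces. */
static uint32_t x_pow_mod_p(uint64_t n) {
  uint32_t v = 0x80000000u; /* the polynomial 1 */
  while (n--) v = (v >> 1) ^ (0x82F63B78u & (0u - (v & 1u)));
  return v;
}

int main(void) {
  /* Exponents from the patch comment, plus 32 for the x^32 factor of a
   * raw CRC. Under the stated convention this should print
   * k0 = 0xe417f38a and k1 = 0x8f158014, matching the patch. */
  printf("k0 = 0x%08x\n", x_pow_mod_p(42 * 8 * 8 * 2 - 1 + 32));
  printf("k1 = 0x%08x\n", x_pow_mod_p(42 * 8 * 8 - 1 + 32));
  return 0;
}
```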