From a3c1832e862ab4b76ccf1299d6e95b15eb50730e Mon Sep 17 00:00:00 2001
From: Yuqi Gu
Date: Wed, 17 Jul 2019 11:19:06 -0700
Subject: [PATCH] Arm64 CRC32 parallel computation optimization for RocksDB (#5494)

Summary:
Crc32c parallel computation optimization. The algorithm comes from the Intel whitepaper:
[crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf)

Input data is divided into three equal-sized blocks and processed as three parallel streams (crc0, crc1, crc2), 1024 bytes per iteration. Each block is 42 (BLK_LENGTH) steps * 8 bytes (the step length of crc32c_u64). The three partial CRCs are then merged back into a single value with two carry-less multiplications (vmull_p64) against precomputed constants, as illustrated by the sketch below.
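The merge works because CRC32C without the pre/post inversion is linear over GF(2): each partial CRC can be extended over the bytes it never saw (all zeros), and the three results XOR-ed together. The following standalone C sketch is an illustration, not part of the patch; `crc32c_raw` and `crc32c_zeros` are hypothetical helper names. It demonstrates the split-and-merge identity with a bitwise reference implementation:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32C update (reflected polynomial 0x82F63B78), without the
 * pre/post inversion, matching the inner loop of crc32c_arm64. */
static uint32_t crc32c_raw(uint32_t crc, const uint8_t *p, size_t n) {
  while (n--) {
    crc ^= *p++;
    for (int k = 0; k < 8; k++)
      crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
  }
  return crc;
}

/* Extend a raw CRC over n zero bytes, byte at a time. */
static uint32_t crc32c_zeros(uint32_t crc, size_t n) {
  const uint8_t z = 0;
  while (n--) crc = crc32c_raw(crc, &z, 1);
  return crc;
}

int main(void) {
  uint8_t buf[1024];
  for (size_t i = 0; i < sizeof(buf); i++) buf[i] = (uint8_t)(i * 131 + 7);

  /* Linear pass over the whole 1024-byte chunk. */
  uint32_t seed = 0xdeadbeef;
  uint32_t linear = crc32c_raw(seed, buf, 1024);

  /* Same split as the patch: crc0 covers the 8-byte head plus block 0
   * (bytes [0, 344)), crc1 covers [344, 680), and crc2 covers block 2
   * plus the 8-byte tail ([680, 1024)). */
  uint32_t crc0 = crc32c_raw(seed, buf, 344);
  uint32_t crc1 = crc32c_raw(0, buf + 344, 336);
  uint32_t crc2 = crc32c_raw(0, buf + 680, 344);

  /* Merge: extend crc0 and crc1 over the bytes they did not see, XOR. */
  uint32_t merged = crc32c_zeros(crc0, 680) ^ crc32c_zeros(crc1, 344) ^ crc2;

  printf("linear = %08x, merged = %08x\n", linear, merged);
  assert(linear == merged);
  return 0;
}
```

The byte-at-a-time `crc32c_zeros` is deliberately naive; the point of the precomputed constants k0 and k1 in the patch is to collapse that whole zero-extension into one vmull_p64 plus one crc32c_u64 reduction.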
1. crc32c_test:
```
[==========] Running 4 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 4 tests from CRC
[ RUN      ] CRC.StandardResults
[       OK ] CRC.StandardResults (1 ms)
[ RUN      ] CRC.Values
[       OK ] CRC.Values (0 ms)
[ RUN      ] CRC.Extend
[       OK ] CRC.Extend (0 ms)
[ RUN      ] CRC.Mask
[       OK ] CRC.Mask (0 ms)
[----------] 4 tests from CRC (1 ms total)

[----------] Global test environment tear-down
[==========] 4 tests from 1 test case ran. (1 ms total)
[  PASSED  ] 4 tests.
```

2. RocksDB benchmark: db_bench --benchmarks="crc32c"
```
Linear Arm crc32c:
crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op)
```
```
Parallel optimization with Armv8 crypto extension:
crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op)
```
This is a ~2.4x speedup over the linear Arm crc32c instruction sequence.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494

Differential Revision: D16340806

fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945
---
 Makefile             |  4 ++--
 util/crc32c_arm64.cc | 83 ++++++++++++++++++++++++++++++++++----------
 util/crc32c_arm64.h  | 15 ++++++--
 3 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 1828b833b..100f160ca 100644
--- a/Makefile
+++ b/Makefile
@@ -144,8 +144,8 @@ HAVE_POWER8=1
 endif
 
 ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
-CXXFLAGS += -march=armv8-a+crc
-CFLAGS += -march=armv8-a+crc
+CXXFLAGS += -march=armv8-a+crc+crypto
+CFLAGS += -march=armv8-a+crc+crypto
 ARMCRC_SOURCE=1
 endif
 
diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 62fabe99e..8743f8c72 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -19,35 +19,82 @@ uint32_t crc32c_runtime_check(void) {
 
 uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                       unsigned len) {
-  const uint8_t *buf1;
-  const uint16_t *buf2;
-  const uint32_t *buf4;
-  const uint64_t *buf8;
+  const uint8_t *buf8;
+  const uint64_t *buf64 = (uint64_t *)data;
+  int length = (int)len;
+  crc ^= 0xffffffff;
 
-  int64_t length = (int64_t)len;
+#ifdef HAVE_ARM64_CRYPTO
+  /* Crc32c parallel computation
+   *   Algorithm comes from Intel whitepaper:
+   *   crc-iscsi-polynomial-crc32-instruction-paper
+   *
+   * Input data is divided into three equal-sized blocks
+   * Three parallel blocks (crc0, crc1, crc2) for 1024 bytes
+   * One block: 42 (BLK_LENGTH) * 8 (step length: crc32c_u64) bytes
+   */
+#define BLK_LENGTH 42
+  while (length >= 1024) {
+    uint64_t t0, t1;
+    uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
 
-  crc ^= 0xffffffff;
-  buf8 = (const uint64_t *)data;
-  while ((length -= sizeof(uint64_t)) >= 0) {
-    crc = __crc32cd(crc, *buf8++);
+    /* Parallel Param:
+     *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+     *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+     */
+    uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+    /* First 8 bytes for better pipelining */
+    crc0 = crc32c_u64(crc, *buf64++);
+
+    /* 3 blocks crc32c parallel computation
+     *
+     * 42 * 8 * 3 = 1008 (bytes)
+     */
+    for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
+      crc0 = crc32c_u64(crc0, *buf64);
+      crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
+      crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
+    }
+    buf64 += (BLK_LENGTH * 2);
+
+    /* Last 8 bytes */
+    crc = crc32c_u64(crc2, *buf64++);
+
+    t0 = (uint64_t)vmull_p64(crc0, k0);
+    t1 = (uint64_t)vmull_p64(crc1, k1);
+
+    /* Merge (crc0, crc1, crc2) -> crc */
+    crc1 = crc32c_u64(0, t1);
+    crc ^= crc1;
+    crc0 = crc32c_u64(0, t0);
+    crc ^= crc0;
+
+    length -= 1024;
+  }
+#endif
+  buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    buf8 += 8;
+    length -= 8;
   }
 
   /* The following is more efficient than the straight loop */
-  buf4 = (const uint32_t *)buf8;
-  if (length & sizeof(uint32_t)) {
-    crc = __crc32cw(crc, *buf4++);
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    buf8 += 4;
     length -= 4;
   }
-  buf2 = (const uint16_t *)buf4;
-  if (length & sizeof(uint16_t)) {
-    crc = __crc32ch(crc, *buf2++);
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    buf8 += 2;
     length -= 2;
   }
-  buf1 = (const uint8_t *)buf2;
-  if (length & sizeof(uint8_t))
-    crc = __crc32cb(crc, *buf1);
+  if (length >= 1)
+    crc = crc32c_u8(crc, *buf8);
 
   crc ^= 0xffffffff;
   return crc;
diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
index 80b3aca36..fb727ce40 100644
--- a/util/crc32c_arm64.h
+++ b/util/crc32c_arm64.h
@@ -9,13 +9,24 @@
 #include <inttypes.h>
 
 #if defined(__aarch64__) || defined(__AARCH64__)
+
 #ifdef __ARM_FEATURE_CRC32
 #define HAVE_ARM64_CRC
 #include <arm_acle.h>
+#define crc32c_u8(crc, v) __crc32cb(crc, v)
+#define crc32c_u16(crc, v) __crc32ch(crc, v)
+#define crc32c_u32(crc, v) __crc32cw(crc, v)
+#define crc32c_u64(crc, v) __crc32cd(crc, v)
+
 extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                              unsigned len);
 extern uint32_t crc32c_runtime_check(void);
-#endif
-#endif
+#ifdef __ARM_FEATURE_CRYPTO
+#define HAVE_ARM64_CRYPTO
+#include <arm_neon.h>
+#endif  // __ARM_FEATURE_CRYPTO
+#endif  // __ARM_FEATURE_CRC32
+
+#endif  // defined(__aarch64__) || defined(__AARCH64__)
 
 #endif
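A closing note on the two magic numbers: they can be reproduced in plain C. The sketch below is illustrative, not part of the patch. It assumes the usual reflected-register convention, where bit 31 of the CRC register holds the coefficient of x^0 and processing a zero bit multiplies the register polynomial by x mod P; under that convention, CRC32(x^e) in the patch comment equals x^(e+32) mod P, because a raw CRC of a message m computes m * x^32 mod P.

```c
#include <stdint.h>
#include <stdio.h>

/* Compute x^n mod P(x) for the Castagnoli polynomial, in the reflected
 * encoding used by the CRC register (bit 31 = coefficient of x^0).
 * Each step multiplies the current polynomial by x and reduces. */
static uint32_t x_pow_mod_p(uint64_t n) {
  uint32_t v = 0x80000000u; /* the polynomial 1 */
  while (n--) v = (v >> 1) ^ (0x82F63B78u & (0u - (v & 1u)));
  return v;
}

int main(void) {
  /* Exponents from the patch comment, plus 32 for the x^32 factor of a
   * raw CRC. Under the stated convention this should print
   * k0 = 0xe417f38a and k1 = 0x8f158014, matching the patch. */
  printf("k0 = 0x%08x\n", x_pow_mod_p(42 * 8 * 8 * 2 - 1 + 32));
  printf("k1 = 0x%08x\n", x_pow_mod_p(42 * 8 * 8 - 1 + 32));
  return 0;
}
```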