From 26293c89a65625c34f362385779358cb16905e38 Mon Sep 17 00:00:00 2001
From: DaiZhiwei <53242408+zhiwei-dai@users.noreply.github.com>
Date: Fri, 23 Aug 2019 11:02:06 -0700
Subject: [PATCH] crc32c_arm64 performance optimization (#5675)

Summary:
CRC32C parallel computation optimization:
Unrolling the "for" loop with macros removes the loop branch and reduces branch misses on the arm64 microarchitecture.
Each 1024-byte chunk is divided into three parts: 8 (head) + 1008 (6 * 7 * 3 * 8) + 8 (tail).
The 42 loop iterations are unrolled into 6 CRC32C7X24BYTES invocations.
Each CRC32C7X24BYTES contains 7 CRC32C24BYTES invocations.

1, crc32c_test
[==========] Running 4 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 4 tests from CRC
[ RUN      ] CRC.StandardResults
[       OK ] CRC.StandardResults (1 ms)
[ RUN      ] CRC.Values
[       OK ] CRC.Values (0 ms)
[ RUN      ] CRC.Extend
[       OK ] CRC.Extend (0 ms)
[ RUN      ] CRC.Mask
[       OK ] CRC.Mask (0 ms)
[----------] 4 tests from CRC (1 ms total)

[----------] Global test environment tear-down
[==========] 4 tests from 1 test case ran. (1 ms total)
[  PASSED  ] 4 tests.

2, db_bench --benchmarks="crc32c"
crc32c : 0.218 micros/op 4595390 ops/sec; 17950.7 MB/s (4096 per op)

3, repeated the crc32c_test case 60000 times
perf stat -e branch-miss -- ./crc32c_test
before optimization:
739,426,504      branch-miss
after optimization:
1,128,572      branch-miss
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5675

Differential Revision: D16989210

fbshipit-source-id: 7204e6069bb6ed066d49c2d1b3ac385065a98557
---
 util/crc32c_arm64.cc | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 8743f8c72..d346c2612 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -12,6 +12,26 @@
 #ifndef HWCAP_CRC32
 #define HWCAP_CRC32 (1 << 7)
 #endif
+
+#ifdef HAVE_ARM64_CRYPTO
+/* One unrolled step: feed the 64-bit word at offset ITR of each of the three blocks to crc0/crc1/crc2 (3 * 8 = 24 bytes); expects crc0, crc1, crc2, buf64 and BLK_LENGTH in scope at the expansion site */
+#define CRC32C24BYTES(ITR) \
+  crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR)));\
+  crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH*2 + (ITR)));\
+  crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));
+
+/* Seven consecutive CRC32C24BYTES steps at word offsets (ITR)*7+0 .. (ITR)*7+6, i.e. 24 * 7 = 168 bytes per invocation */
+#define CRC32C7X24BYTES(ITR) do {\
+  CRC32C24BYTES((ITR)*7+0) \
+  CRC32C24BYTES((ITR)*7+1) \
+  CRC32C24BYTES((ITR)*7+2) \
+  CRC32C24BYTES((ITR)*7+3) \
+  CRC32C24BYTES((ITR)*7+4) \
+  CRC32C24BYTES((ITR)*7+5) \
+  CRC32C24BYTES((ITR)*7+6) \
+} while(0)
+#endif
+
 uint32_t crc32c_runtime_check(void) {
   uint64_t auxv = getauxval(AT_HWCAP);
   return (auxv & HWCAP_CRC32) != 0;
@@ -48,15 +68,16 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
     crc0 = crc32c_u64(crc, *buf64++);
 
     /* 3 blocks crc32c parallel computation
-     *
-     * 42 * 8 * 3 = 1008 (bytes)
+     * Unrolled with macros to compute the three blocks in parallel
+     * 168 * 6 = 1008 (bytes)
      */
-    for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
-      crc0 = crc32c_u64(crc0, *buf64);
-      crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
-      crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
-    }
-    buf64 += (BLK_LENGTH * 2);
+    CRC32C7X24BYTES(0);
+    CRC32C7X24BYTES(1);
+    CRC32C7X24BYTES(2);
+    CRC32C7X24BYTES(3);
+    CRC32C7X24BYTES(4);
+    CRC32C7X24BYTES(5);
+    buf64 += (BLK_LENGTH * 3);
 
     /* Last 8 bytes */
     crc = crc32c_u64(crc2, *buf64++);
@@ -72,6 +93,9 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
 
     length -= 1024;
   }
+
+  if (length == 0)
+    return crc ^ (0xffffffffU);
 #endif
   buf8 = (const uint8_t *)buf64;
   while (length >= 8) {