|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "util/crc32c.h"
|
|
|
|
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/random.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace crc32c {
|
|
|
|
|
|
|
|
// Empty type with no members or methods.
// NOTE(review): none of the tests visible in this file instantiate it;
// presumably it is a leftover placeholder matching the `CRC` suite name used
// by the TEST(CRC, ...) cases -- confirm before removing.
class CRC { };
|
|
|
|
|
Port 3 way SSE4.2 crc32c implementation from Folly
Summary:
**# Summary**
RocksDB uses SSE crc32 intrinsics to calculate the crc32 values, but it does so in a single-way fashion (not pipelined on a single CPU core). Intel's whitepaper published an algorithm that uses 3-way pipelining for the crc32 intrinsics and then uses the pclmulqdq intrinsic to combine the values. Because pclmulqdq has overhead of its own, this algorithm will show perf gains on buffers larger than 216 bytes, which makes RocksDB a perfect user, since most of the buffers RocksDB calls crc32c on are over 4KB. Initial db_bench runs show a tremendous CPU gain.
This change uses the 3-way SSE algorithm by default. The old SSE algorithm is now behind a compiler tag NO_THREEWAY_CRC32C. If user compiles the code with NO_THREEWAY_CRC32C=1 then the old SSE Crc32c algorithm would be used. If the server does not have SSE4.2 at the run time the slow way (Non SSE) will be used.
**# Performance Test Results**
We ran the FillRandom and ReadRandom benchmarks in db_bench. ReadRandom is the point of interest here since it calculates the CRC32 for the in-mem buffers. We did 3 runs for each algorithm.
Before this change the CRC32 value computation takes about 11.5% of total CPU cost, and with the new 3-way algorithm it reduced to around 4.5%. The overall throughput also improved from 25.53MB/s to 27.63MB/s.
1) ReadRandom in db_bench overall metrics
PER RUN
Algorithm | run | micros/op | ops/sec |Throughput (MB/s)
3-way | 1 | 4.143 | 241387 | 26.7
3-way | 2 | 3.775 | 264872 | 29.3
3-way | 3 | 4.116 | 242929 | 26.9
FastCrc32c|1 | 4.037 | 247727 | 27.4
FastCrc32c|2 | 4.648 | 215166 | 23.8
FastCrc32c|3 | 4.352 | 229799 | 25.4
AVG
Algorithm | Average of micros/op | Average of ops/sec | Average of Throughput (MB/s)
3-way | 4.01 | 249,729 | 27.63
FastCrc32c | 4.35 | 230,897 | 25.53
2) Crc32c computation CPU cost (inclusive samples percentage)
PER RUN
Implementation | run | TotalSamples | Crc32c percentage
3-way | 1 | 4,572,250,000 | 4.37%
3-way | 2 | 3,779,250,000 | 4.62%
3-way | 3 | 4,129,500,000 | 4.48%
FastCrc32c | 1 | 4,663,500,000 | 11.24%
FastCrc32c | 2 | 4,047,500,000 | 12.34%
FastCrc32c | 3 | 4,366,750,000 | 11.68%
**# Test Plan**
make -j64 corruption_test && ./corruption_test
By default it uses 3-way SSE algorithm
NO_THREEWAY_CRC32C=1 make -j64 corruption_test && ./corruption_test
make clean && DEBUG_LEVEL=0 make -j64 db_bench
make clean && DEBUG_LEVEL=0 NO_THREEWAY_CRC32C=1 make -j64 db_bench
Closes https://github.com/facebook/rocksdb/pull/3173
Differential Revision: D6330882
Pulled By: yingsu00
fbshipit-source-id: 8ec3d89719533b63b536a736663ca6f0dd4482e9
7 years ago
|
|
|
|
|
|
|
// Tests for 3-way crc32c algorithm. We need these tests because it uses
// different lookup tables than the original Fast_CRC32

// Size of the shared test input in bytes: 512 * 1024 words of
// sizeof(uint64_t) bytes each.
const unsigned int BUFFER_SIZE = sizeof(uint64_t) * 512 * 1024;

// Shared input that the table-driven checksum cases read from, addressed by
// (offset, length) pairs.
char buffer[BUFFER_SIZE];
|
|
|
|
|
|
|
|
// One row of the table-driven checksum test: a slice of the shared input
// buffer together with the reference value for that slice.
struct ExpectedResult {
  size_t offset;    // index into the buffer where the slice begins
  size_t length;    // number of bytes in the slice
  uint32_t crc32c;  // reference value; the tests compare its bitwise
                    // complement against the computed checksum
};
|
|
|
|
|
|
|
|
// Reference table for the 3-way crc32c checks. Each row is
// {offset, length, crc32c} describing a slice of `buffer`.
ExpectedResult expectedResults[] = {
    // Zero-byte input
    {0, 0, ~0U},

    // Small aligned inputs to test special cases in SIMD implementations
    {8, 1, 1543413366},
    {8, 2, 523493126},
    {8, 3, 1560427360},
    {8, 4, 3422504776},
    {8, 5, 447841138},
    {8, 6, 3910050499},
    {8, 7, 3346241981},

    // Small unaligned inputs
    {9, 1, 3855826643},
    {10, 2, 560880875},
    {11, 3, 1479707779},
    {12, 4, 2237687071},
    {13, 5, 4063855784},
    {14, 6, 2553454047},
    {15, 7, 1349220140},

    // Larger inputs to test leftover chunks at the end of aligned blocks
    {8, 8, 627613930},
    {8, 9, 2105929409},
    {8, 10, 2447068514},
    {8, 11, 863807079},
    {8, 12, 292050879},
    {8, 13, 1411837737},
    {8, 14, 2614515001},
    {8, 15, 3579076296},
    {8, 16, 2897079161},
    {8, 17, 675168386},

    // Much larger inputs
    {0, BUFFER_SIZE, 2096790750},
    {1, BUFFER_SIZE / 2, 3854797577},
};
|
|
|
|
|
|
|
|
// Checks Value()/Extend() against fixed reference checksums: first the
// rfc3720 vectors, then every row of expectedResults.
TEST(CRC, StandardResults) {
  // Original Fast_CRC32 tests.
  // From rfc3720 section B.4.
  const int kBufLen = 32;
  char buf[kBufLen];

  // 32 bytes of zeros.
  memset(buf, 0, sizeof(buf));
  ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));

  // 32 bytes of ones.
  memset(buf, 0xff, sizeof(buf));
  ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));

  // 32 incrementing bytes: 0, 1, ..., 31.
  for (int pos = 0; pos < kBufLen; ++pos) {
    buf[pos] = static_cast<char>(pos);
  }
  ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));

  // 32 decrementing bytes: 31, 30, ..., 0.
  for (int pos = 0; pos < kBufLen; ++pos) {
    buf[pos] = static_cast<char>(kBufLen - 1 - pos);
  }
  ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));

  // Fixed 48-byte sample.
  unsigned char data[48] = {
      0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
      0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18,
      0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  };
  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));

  // 3-Way Crc32c tests ported from folly.
  // Test 1: single computation
  for (const auto& expected : expectedResults) {
    const char* slice = buffer + expected.offset;
    const uint32_t whole = Value(slice, expected.length);
    EXPECT_EQ(~expected.crc32c, whole);
  }

  // Test 2: stitching two computations
  for (const auto& expected : expectedResults) {
    const char* slice = buffer + expected.offset;
    const size_t head_len = expected.length / 2;
    const size_t tail_len = expected.length - head_len;
    // Checksum the first half, then continue over the remainder.
    const uint32_t head_crc = Value(slice, head_len);
    const uint32_t stitched = Extend(head_crc, slice + head_len, tail_len);
    EXPECT_EQ(~expected.crc32c, stitched);
  }
}
|
|
|
|
|
|
|
|
// Two different inputs must not produce the same checksum.
TEST(CRC, Values) {
  const uint32_t crc_a = Value("a", 1);
  const uint32_t crc_foo = Value("foo", 3);
  ASSERT_NE(crc_a, crc_foo);
}
|
|
|
|
|
|
|
|
// Extending the checksum of a prefix with the remaining bytes must match the
// checksum computed over the whole string in one call.
TEST(CRC, Extend) {
  const uint32_t whole = Value("hello world", 11);
  const uint32_t prefix = Value("hello ", 6);
  ASSERT_EQ(whole, Extend(prefix, "world", 5));
}
|
|
|
|
|
|
|
|
// Mask() must change the value (also when applied twice), and Unmask() must
// undo it one level at a time.
TEST(CRC, Mask) {
  const uint32_t crc = Value("foo", 3);
  const auto masked_once = Mask(crc);
  const auto masked_twice = Mask(masked_once);

  ASSERT_NE(crc, masked_once);
  ASSERT_NE(crc, masked_twice);
  ASSERT_EQ(crc, Unmask(masked_once));
  ASSERT_EQ(crc, Unmask(Unmask(masked_twice)));
}
|
|
|
|
|
|
|
|
// Combining the checksums of "hello " and "world" (second length = 5) must
// equal the checksum of "hello world".
TEST(CRC, Crc32cCombineBasicTest) {
  const uint32_t head_crc = Value("hello ", 6);
  const uint32_t tail_crc = Value("world", 5);
  const uint32_t whole_crc = Value("hello world", 11);

  const uint32_t combined = Crc32cCombine(head_crc, tail_crc, 5);
  ASSERT_EQ(whole_crc, combined);
}
|
|
|
|
|
|
|
|
// Combining the two partial checksums in the wrong order ("world" first,
// then "hello ") must not reproduce the checksum of "hello world".
TEST(CRC, Crc32cCombineOrderMattersTest) {
  const uint32_t head_crc = Value("hello ", 6);
  const uint32_t tail_crc = Value("world", 5);
  const uint32_t whole_crc = Value("hello world", 11);

  const uint32_t swapped = Crc32cCombine(tail_crc, head_crc, 6);
  ASSERT_NE(whole_crc, swapped);
}
|
|
|
|
|
|
|
|
// For every suffix length in [0, 4 * 1024), checks that Crc32cCombine() on
// the two separate checksums agrees with Extend() over the suffix bytes.
TEST(CRC, Crc32cCombineFullCoverTest) {
  const int kSuffixLenLimit = 4 * 1024;
  Random rnd(test::RandomSeed());

  // Fixed random prefix shared by all iterations.
  const int kPrefixLen = 1024 * 1024;
  std::string prefix = rnd.RandomBinaryString(kPrefixLen);
  const uint32_t prefix_crc = Value(prefix.data(), kPrefixLen);

  for (int suffix_len = 0; suffix_len < kSuffixLenLimit; ++suffix_len) {
    std::string suffix = rnd.RandomBinaryString(suffix_len);
    const uint32_t suffix_crc = Value(suffix.data(), suffix.size());
    const uint32_t extended =
        Extend(prefix_crc, suffix.data(), suffix.size());
    const uint32_t combined =
        Crc32cCombine(prefix_crc, suffix_crc, suffix_len);
    ASSERT_EQ(extended, combined);
  }
}
|
|
|
|
|
|
|
|
// Same agreement check between Crc32cCombine() and Extend(), but with a
// single large suffix of 16 * 1024 * 1024 - 1 bytes.
TEST(CRC, Crc32cCombineBigSizeTest) {
  Random rnd(test::RandomSeed());

  const int kPrefixLen = 1024 * 1024;
  std::string prefix = rnd.RandomBinaryString(kPrefixLen);
  const uint32_t prefix_crc = Value(prefix.data(), kPrefixLen);

  const int kSuffixLen = 16 * 1024 * 1024 - 1;
  std::string suffix = rnd.RandomBinaryString(kSuffixLen);
  const uint32_t suffix_crc = Value(suffix.data(), suffix.size());

  const uint32_t extended = Extend(prefix_crc, suffix.data(), suffix.size());
  const uint32_t combined = Crc32cCombine(prefix_crc, suffix_crc, kSuffixLen);
  ASSERT_EQ(extended, combined);
}
|
|
|
|
|
|
|
|
} // namespace crc32c
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
Port 3 way SSE4.2 crc32c implementation from Folly
Summary:
**# Summary**
RocksDB uses SSE crc32 intrinsics to calculate the crc32 values, but it does so in a single-way fashion (not pipelined on a single CPU core). Intel's whitepaper published an algorithm that uses 3-way pipelining for the crc32 intrinsics and then uses the pclmulqdq intrinsic to combine the values. Because pclmulqdq has overhead of its own, this algorithm will show perf gains on buffers larger than 216 bytes, which makes RocksDB a perfect user, since most of the buffers RocksDB calls crc32c on are over 4KB. Initial db_bench runs show a tremendous CPU gain.
This change uses the 3-way SSE algorithm by default. The old SSE algorithm is now behind a compiler tag NO_THREEWAY_CRC32C. If user compiles the code with NO_THREEWAY_CRC32C=1 then the old SSE Crc32c algorithm would be used. If the server does not have SSE4.2 at the run time the slow way (Non SSE) will be used.
**# Performance Test Results**
We ran the FillRandom and ReadRandom benchmarks in db_bench. ReadRandom is the point of interest here since it calculates the CRC32 for the in-mem buffers. We did 3 runs for each algorithm.
Before this change the CRC32 value computation takes about 11.5% of total CPU cost, and with the new 3-way algorithm it reduced to around 4.5%. The overall throughput also improved from 25.53MB/s to 27.63MB/s.
1) ReadRandom in db_bench overall metrics
PER RUN
Algorithm | run | micros/op | ops/sec |Throughput (MB/s)
3-way | 1 | 4.143 | 241387 | 26.7
3-way | 2 | 3.775 | 264872 | 29.3
3-way | 3 | 4.116 | 242929 | 26.9
FastCrc32c|1 | 4.037 | 247727 | 27.4
FastCrc32c|2 | 4.648 | 215166 | 23.8
FastCrc32c|3 | 4.352 | 229799 | 25.4
AVG
Algorithm | Average of micros/op | Average of ops/sec | Average of Throughput (MB/s)
3-way | 4.01 | 249,729 | 27.63
FastCrc32c | 4.35 | 230,897 | 25.53
2) Crc32c computation CPU cost (inclusive samples percentage)
PER RUN
Implementation | run | TotalSamples | Crc32c percentage
3-way | 1 | 4,572,250,000 | 4.37%
3-way | 2 | 3,779,250,000 | 4.62%
3-way | 3 | 4,129,500,000 | 4.48%
FastCrc32c | 1 | 4,663,500,000 | 11.24%
FastCrc32c | 2 | 4,047,500,000 | 12.34%
FastCrc32c | 3 | 4,366,750,000 | 11.68%
**# Test Plan**
make -j64 corruption_test && ./corruption_test
By default it uses 3-way SSE algorithm
NO_THREEWAY_CRC32C=1 make -j64 corruption_test && ./corruption_test
make clean && DEBUG_LEVEL=0 make -j64 db_bench
make clean && DEBUG_LEVEL=0 NO_THREEWAY_CRC32C=1 make -j64 db_bench
Closes https://github.com/facebook/rocksdb/pull/3173
Differential Revision: D6330882
Pulled By: yingsu00
fbshipit-source-id: 8ec3d89719533b63b536a736663ca6f0dd4482e9
7 years ago
|
|
|
// copied from folly

// Default starting value for fnv64_buf().
const uint64_t FNV_64_HASH_START = 14695981039346656037ULL;

// Folds the `n` bytes starting at `buf` into `hash` and returns the result.
// For each byte the running hash is first multiplied by a fixed constant
// (modulo 2^64) and then XORed with the byte. Passing the result of one call
// as `hash` to the next continues the same hash over more data. With n == 0
// the incoming `hash` is returned unchanged.
inline uint64_t fnv64_buf(const void* buf,
                          size_t n,
                          uint64_t hash = FNV_64_HASH_START) {
  // 1 + 2^1 + 2^4 + 2^5 + 2^7 + 2^8 + 2^40: multiplying by this is the same
  // as adding the correspondingly shifted copies of the hash to itself.
  const uint64_t kMultiplier = 1099511628211ULL;

  // forcing signed char, since other platforms can use unsigned
  const signed char* cursor = static_cast<const signed char*>(buf);
  const signed char* const stop = cursor + n;

  while (cursor != stop) {
    hash *= kMultiplier;
    // The signed byte is sign-extended before the XOR, so bytes >= 0x80
    // flip all of the upper bits as well.
    hash ^= *cursor;
    ++cursor;
  }
  return hash;
}
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
Port 3 way SSE4.2 crc32c implementation from Folly
Summary:
**# Summary**
RocksDB uses SSE crc32 intrinsics to calculate the crc32 values but it does it in single way fashion (not pipelined on single CPU core). Intel's whitepaper () published an algorithm that uses 3-way pipelining for the crc32 intrinsics, then use pclmulqdq intrinsic to combine the values. Because pclmulqdq has overhead on its own, this algorithm will show perf gains on buffers larger than 216 bytes, which makes RocksDB a perfect user, since most of the buffers RocksDB call crc32c on is over 4KB. Initial db_bench show tremendous CPU gain.
This change uses the 3-way SSE algorithm by default. The old SSE algorithm is now behind a compiler tag NO_THREEWAY_CRC32C. If user compiles the code with NO_THREEWAY_CRC32C=1 then the old SSE Crc32c algorithm would be used. If the server does not have SSE4.2 at the run time the slow way (Non SSE) will be used.
**# Performance Test Results**
We ran the FillRandom and ReadRandom benchmarks in db_bench. ReadRandom is the point of interest here since it calculates the CRC32 for the in-mem buffers. We did 3 runs for each algorithm.
Before this change the CRC32 value computation takes about 11.5% of total CPU cost, and with the new 3-way algorithm it reduced to around 4.5%. The overall throughput also improved from 25.53MB/s to 27.63MB/s.
1) ReadRandom in db_bench overall metrics
PER RUN
Algorithm | run | micros/op | ops/sec |Throughput (MB/s)
3-way | 1 | 4.143 | 241387 | 26.7
3-way | 2 | 3.775 | 264872 | 29.3
3-way | 3 | 4.116 | 242929 | 26.9
FastCrc32c|1 | 4.037 | 247727 | 27.4
FastCrc32c|2 | 4.648 | 215166 | 23.8
FastCrc32c|3 | 4.352 | 229799 | 25.4
AVG
Algorithm | Average of micros/op | Average of ops/sec | Average of Throughput (MB/s)
3-way | 4.01 | 249,729 | 27.63
FastCrc32c | 4.35 | 230,897 | 25.53
2) Crc32c computation CPU cost (inclusive samples percentage)
PER RUN
Implementation | run | TotalSamples | Crc32c percentage
3-way | 1 | 4,572,250,000 | 4.37%
3-way | 2 | 3,779,250,000 | 4.62%
3-way | 3 | 4,129,500,000 | 4.48%
FastCrc32c | 1 | 4,663,500,000 | 11.24%
FastCrc32c | 2 | 4,047,500,000 | 12.34%
FastCrc32c | 3 | 4,366,750,000 | 11.68%
**# Test Plan**
make -j64 corruption_test && ./corruption_test
By default it uses 3-way SSE algorithm
NO_THREEWAY_CRC32C=1 make -j64 corruption_test && ./corruption_test
make clean && DEBUG_LEVEL=0 make -j64 db_bench
make clean && DEBUG_LEVEL=0 NO_THREEWAY_CRC32C=1 make -j64 db_bench
Closes https://github.com/facebook/rocksdb/pull/3173
Differential Revision: D6330882
Pulled By: yingsu00
fbshipit-source-id: 8ec3d89719533b63b536a736663ca6f0dd4482e9
7 years ago
|
|
|
|
|
|
|
// Populate a buffer with a deterministic pattern
|
|
|
|
// on which to compute checksums
|
|
|
|
|
|
|
|
const uint8_t* src = (uint8_t*)ROCKSDB_NAMESPACE::crc32c::buffer;
|
|
|
|
uint64_t* dst = (uint64_t*)ROCKSDB_NAMESPACE::crc32c::buffer;
|
|
|
|
const uint64_t* end =
|
|
|
|
(const uint64_t*)(ROCKSDB_NAMESPACE::crc32c::buffer +
|
|
|
|
ROCKSDB_NAMESPACE::crc32c::BUFFER_SIZE);
|
Port 3 way SSE4.2 crc32c implementation from Folly
Summary:
**# Summary**
RocksDB uses SSE crc32 intrinsics to calculate the crc32 values but it does it in single way fashion (not pipelined on single CPU core). Intel's whitepaper () published an algorithm that uses 3-way pipelining for the crc32 intrinsics, then use pclmulqdq intrinsic to combine the values. Because pclmulqdq has overhead on its own, this algorithm will show perf gains on buffers larger than 216 bytes, which makes RocksDB a perfect user, since most of the buffers RocksDB call crc32c on is over 4KB. Initial db_bench show tremendous CPU gain.
This change uses the 3-way SSE algorithm by default. The old SSE algorithm is now behind a compiler tag NO_THREEWAY_CRC32C. If user compiles the code with NO_THREEWAY_CRC32C=1 then the old SSE Crc32c algorithm would be used. If the server does not have SSE4.2 at the run time the slow way (Non SSE) will be used.
**# Performance Test Results**
We ran the FillRandom and ReadRandom benchmarks in db_bench. ReadRandom is the point of interest here since it calculates the CRC32 for the in-mem buffers. We did 3 runs for each algorithm.
Before this change the CRC32 value computation takes about 11.5% of total CPU cost, and with the new 3-way algorithm it reduced to around 4.5%. The overall throughput also improved from 25.53MB/s to 27.63MB/s.
1) ReadRandom in db_bench overall metrics
PER RUN
Algorithm | run | micros/op | ops/sec |Throughput (MB/s)
3-way | 1 | 4.143 | 241387 | 26.7
3-way | 2 | 3.775 | 264872 | 29.3
3-way | 3 | 4.116 | 242929 | 26.9
FastCrc32c|1 | 4.037 | 247727 | 27.4
FastCrc32c|2 | 4.648 | 215166 | 23.8
FastCrc32c|3 | 4.352 | 229799 | 25.4
AVG
Algorithm | Average of micros/op | Average of ops/sec | Average of Throughput (MB/s)
3-way | 4.01 | 249,729 | 27.63
FastCrc32c | 4.35 | 230,897 | 25.53
2) Crc32c computation CPU cost (inclusive samples percentage)
PER RUN
Implementation | run | TotalSamples | Crc32c percentage
3-way | 1 | 4,572,250,000 | 4.37%
3-way | 2 | 3,779,250,000 | 4.62%
3-way | 3 | 4,129,500,000 | 4.48%
FastCrc32c | 1 | 4,663,500,000 | 11.24%
FastCrc32c | 2 | 4,047,500,000 | 12.34%
FastCrc32c | 3 | 4,366,750,000 | 11.68%
**# Test Plan**
make -j64 corruption_test && ./corruption_test
By default it uses 3-way SSE algorithm
NO_THREEWAY_CRC32C=1 make -j64 corruption_test && ./corruption_test
make clean && DEBUG_LEVEL=0 make -j64 db_bench
make clean && DEBUG_LEVEL=0 NO_THREEWAY_CRC32C=1 make -j64 db_bench
Closes https://github.com/facebook/rocksdb/pull/3173
Differential Revision: D6330882
Pulled By: yingsu00
fbshipit-source-id: 8ec3d89719533b63b536a736663ca6f0dd4482e9
7 years ago
|
|
|
*dst++ = 0;
|
|
|
|
while (dst < end) {
|
|
|
|
ROCKSDB_NAMESPACE::EncodeFixed64(
|
|
|
|
reinterpret_cast<char*>(dst),
|
|
|
|
fnv64_buf((const char*)src, sizeof(uint64_t)));
|
|
|
|
dst++;
|
Port 3 way SSE4.2 crc32c implementation from Folly
Summary:
**# Summary**
RocksDB uses SSE crc32 intrinsics to calculate the crc32 values but it does it in single way fashion (not pipelined on single CPU core). Intel's whitepaper () published an algorithm that uses 3-way pipelining for the crc32 intrinsics, then use pclmulqdq intrinsic to combine the values. Because pclmulqdq has overhead on its own, this algorithm will show perf gains on buffers larger than 216 bytes, which makes RocksDB a perfect user, since most of the buffers RocksDB call crc32c on is over 4KB. Initial db_bench show tremendous CPU gain.
This change uses the 3-way SSE algorithm by default. The old SSE algorithm is now behind a compiler tag NO_THREEWAY_CRC32C. If user compiles the code with NO_THREEWAY_CRC32C=1 then the old SSE Crc32c algorithm would be used. If the server does not have SSE4.2 at the run time the slow way (Non SSE) will be used.
**# Performance Test Results**
We ran the FillRandom and ReadRandom benchmarks in db_bench. ReadRandom is the point of interest here since it calculates the CRC32 for the in-mem buffers. We did 3 runs for each algorithm.
Before this change the CRC32 value computation takes about 11.5% of total CPU cost, and with the new 3-way algorithm it reduced to around 4.5%. The overall throughput also improved from 25.53MB/s to 27.63MB/s.
1) ReadRandom in db_bench overall metrics
PER RUN
Algorithm | run | micros/op | ops/sec |Throughput (MB/s)
3-way | 1 | 4.143 | 241387 | 26.7
3-way | 2 | 3.775 | 264872 | 29.3
3-way | 3 | 4.116 | 242929 | 26.9
FastCrc32c|1 | 4.037 | 247727 | 27.4
FastCrc32c|2 | 4.648 | 215166 | 23.8
FastCrc32c|3 | 4.352 | 229799 | 25.4
AVG
Algorithm | Average of micros/op | Average of ops/sec | Average of Throughput (MB/s)
3-way | 4.01 | 249,729 | 27.63
FastCrc32c | 4.35 | 230,897 | 25.53
2) Crc32c computation CPU cost (inclusive samples percentage)
PER RUN
Implementation | run | TotalSamples | Crc32c percentage
3-way | 1 | 4,572,250,000 | 4.37%
3-way | 2 | 3,779,250,000 | 4.62%
3-way | 3 | 4,129,500,000 | 4.48%
FastCrc32c | 1 | 4,663,500,000 | 11.24%
FastCrc32c | 2 | 4,047,500,000 | 12.34%
FastCrc32c | 3 | 4,366,750,000 | 11.68%
**# Test Plan**
make -j64 corruption_test && ./corruption_test
By default it uses 3-way SSE algorithm
NO_THREEWAY_CRC32C=1 make -j64 corruption_test && ./corruption_test
make clean && DEBUG_LEVEL=0 make -j64 db_bench
make clean && DEBUG_LEVEL=0 NO_THREEWAY_CRC32C=1 make -j64 db_bench
Closes https://github.com/facebook/rocksdb/pull/3173
Differential Revision: D6330882
Pulled By: yingsu00
fbshipit-source-id: 8ec3d89719533b63b536a736663ca6f0dd4482e9
7 years ago
|
|
|
src += sizeof(uint64_t);
|
|
|
|
}
|
|
|
|
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|