diff --git a/CMakeLists.txt b/CMakeLists.txt index 598c72815..109981c1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,33 +253,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64") endif(HAS_LOONGARCH64) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64") -option(PORTABLE "build a portable binary" OFF) -option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) -option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF) -option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF) -if(PORTABLE) - add_definitions(-DROCKSDB_PORTABLE) - - # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h - # is available, it is available by default. - if(FORCE_SSE42 AND NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul") - endif() - if(MSVC) - if(FORCE_AVX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") - endif() - # MSVC automatically enables BMI / lzcnt with AVX2. - if(FORCE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") - endif() - else() - if(FORCE_AVX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") - endif() - if(FORCE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt") - endif() +set(PORTABLE 0 CACHE STRING "Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU") +if(PORTABLE STREQUAL 1) + # Usually nothing to do; compiler default is typically the most general + if(NOT MSVC) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") endif() @@ -287,10 +264,21 @@ if(PORTABLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=loongarch64") endif() endif() +elseif(PORTABLE MATCHES [^0]+) + # Name of a CPU arch spec or feature set to require + if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:${PORTABLE}") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${PORTABLE}") + endif() else() if(MSVC) + # NOTE: No auto-detection of current CPU, but instead assume some useful + # level of optimization is supported set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() + # Require instruction set from current CPU (with some legacy or opt-out + # exceptions) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC) @@ -305,25 +293,6 @@ if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() -CHECK_CXX_SOURCE_COMPILES(" -#include -#include -#include -int main() { - volatile uint32_t x = _mm_crc32_u32(0, 0); - const auto a = _mm_set_epi64x(0, 0); - const auto b = _mm_set_epi64x(0, 0); - const auto c = _mm_clmulepi64_si128(a, b, 0x00); - auto d = _mm_cvtsi128_si64(c); -} -" HAVE_SSE42) -if(HAVE_SSE42) - add_definitions(-DHAVE_SSE42) - add_definitions(-DHAVE_PCLMUL) -elseif(FORCE_SSE42) - message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") -endif() - # Check if -latomic is required or not if (NOT MSVC) set(CMAKE_REQUIRED_FLAGS "--std=c++17") @@ -1010,12 +979,6 @@ if ( ROCKSDB_PLUGINS ) endforeach() endif() -if(HAVE_SSE42 AND NOT MSVC) - set_source_files_properties( - util/crc32c.cc - PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") -endif() - if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") list(APPEND SOURCES util/crc32c_ppc.c diff --git a/HISTORY.md b/HISTORY.md index 083c4c27b..ba85890e6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,9 @@ ### Public API Changes * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. +### Behavior changes +* For x86, CPU features are no longer detected at runtime nor in build scripts, but in source code using common preprocessor defines. This will likely unlock some small performance improvements on some newer hardware, but could hurt performance of the kCRC32c checksum, which is no longer the default, on some "portable" builds. See PR #11419 for details. + ### Bug Fixes * Delete an empty WAL file on DB open if the log number is less than the min log number to keep diff --git a/INSTALL.md b/INSTALL.md index eb1e4933f..f4bb7e62a 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -20,12 +20,15 @@ There are few options when compiling RocksDB: depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't use binaries compiled by `make all` in production. -* By default the binary we produce is optimized for the platform you're compiling on -(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your -CPU supports it. To print a warning if your CPU does not support SSE4.2, build with -`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want -to build a portable binary, add `PORTABLE=1` before your make commands, like this: -`PORTABLE=1 make static_lib`. +* By default the binary we produce is optimized for the CPU you're compiling on +(`-march=native` or the equivalent). To build a binary compatible with the most +general architecture supported by your CPU and compiler, set `PORTABLE=1` for +the build, but performance will suffer as many operations benefit from newer +and wider instructions. In addition to `PORTABLE=0` (default) and `PORTABLE=1`, +it can be set to an architecture name recognized by your compiler. For example, +on 64-bit x86, a reasonable compromise is `PORTABLE=haswell` which supports +many or most of the available optimizations while still being compatible with +most processors made since roughly 2013. ## Dependencies diff --git a/Makefile b/Makefile index 169f641a9..75a5d7359 100644 --- a/Makefile +++ b/Makefile @@ -337,8 +337,8 @@ ifneq ($(MACHINE), arm64) # linking with jemalloc (as it won't be arm64-compatible) and remove some other options # set during platform detection DISABLE_JEMALLOC=1 -PLATFORM_CCFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CCFLAGS)) -PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS)) +PLATFORM_CCFLAGS := $(filter-out -march=native, $(PLATFORM_CCFLAGS)) +PLATFORM_CXXFLAGS := $(filter-out -march=native, $(PLATFORM_CXXFLAGS)) endif endif endif @@ -2435,6 +2435,8 @@ checkout_folly: @# NOTE: this hack is required for gcc in some cases perl -pi -e 's/(__has_include..)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h +CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS)) + build_folly: FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \ if [ "$$FOLLY_INST_PATH" ]; then \ @@ -2445,8 +2447,8 @@ build_folly: fi # Restore the original version of Invoke.h with boost dependency cd third-party/folly && ${GIT_COMMAND} checkout folly/functional/Invoke.h - cd third-party/folly && MAYBE_AVX2=`echo $(CXXFLAGS) | grep -o -- -DHAVE_AVX2 | sed 's/-DHAVE_AVX2/-mavx2/g' || true` && \ - CXXFLAGS=" $$MAYBE_AVX2 -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests + cd third-party/folly && \ + CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests # --------------------------------------------------------------------------- # Build size testing diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index c03d9ae41..aa290f8b3 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -621,7 +621,7 @@ EOF fi fi -if test "0$PORTABLE" -eq 0; then +if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then # Tune for this POWER processor, treating '+' models as base models POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+` @@ -644,27 +644,26 @@ if test "0$PORTABLE" -eq 0; then COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" - elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then - # TODO: Not sure why we don't use -march=native on these OSes - if test "$USE_SSE"; then - TRY_SSE_ETC="1" - fi else COMMON_FLAGS="$COMMON_FLAGS -march=native " fi else - # PORTABLE=1 - if test "$USE_SSE"; then - TRY_SSE_ETC="1" - fi - - if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then - COMMON_FLAGS="$COMMON_FLAGS -march=z196 " - fi - - if test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then - RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-) - COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}" + # PORTABLE specified + if [ "$PORTABLE" == 1 ]; then + if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then + COMMON_FLAGS="$COMMON_FLAGS -march=z196 " + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then + RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-) + COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}" + elif test "$USE_SSE"; then + # USE_SSE is DEPRECATED + # This is a rough approximation of the old USE_SSE behavior + COMMON_FLAGS="$COMMON_FLAGS -march=haswell" + fi + # Other than those cases, not setting -march= here. + else + # Assume PORTABLE is a minimum assumed cpu type, e.g. PORTABLE=haswell + COMMON_FLAGS="$COMMON_FLAGS -march=${PORTABLE}" fi if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then @@ -698,101 +697,6 @@ EOF fi fi -if test "$TRY_SSE_ETC"; then - # The USE_SSE flag now means "attempt to compile with widely-available - # Intel architecture extensions utilized by specific optimizations in the - # source code." It's a qualifier on PORTABLE=1 that means "mostly portable." - # It doesn't even really check that your current CPU is compatible. - # - # SSE4.2 available since nehalem, ca. 2008-2010 - # Includes POPCNT for BitsSetToOne, BitParity - TRY_SSE42="-msse4.2" - # PCLMUL available since westmere, ca. 2010-2011 - TRY_PCLMUL="-mpclmul" - # AVX2 available since haswell, ca. 2013-2015 - TRY_AVX2="-mavx2" - # BMI available since haswell, ca. 2013-2015 - # Primarily for TZCNT for CountTrailingZeroBits - TRY_BMI="-mbmi" - # LZCNT available since haswell, ca. 2013-2015 - # For FloorLog2 - TRY_LZCNT="-mlzcnt" -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o test.o 2>/dev/null < - #include - int main() { - volatile uint32_t x = _mm_crc32_u32(0, 0); - (void)x; - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o test.o 2>/dev/null < - #include - int main() { - const auto a = _mm_set_epi64x(0, 0); - const auto b = _mm_set_epi64x(0, 0); - const auto c = _mm_clmulepi64_si128(a, b, 0x00); - auto d = _mm_cvtsi128_si64(c); - (void)d; - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_PCLMUL -DHAVE_PCLMUL" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o test.o 2>/dev/null < - #include - int main() { - const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5); - const auto b = _mm256_permutevar8x32_epi32(a, a); - (void)b; - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o test.o 2>/dev/null < - #include - int main(int argc, char *argv[]) { - (void)argv; - return (int)_tzcnt_u64((uint64_t)argc); - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null < - #include - int main(int argc, char *argv[]) { - (void)argv; - return (int)_lzcnt_u64((uint64_t)argc); - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2 -fi - $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null < int main() { diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index cf3c355b1..fa629af97 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -147,7 +147,7 @@ else fi CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS" diff --git a/build_tools/fbcode_config_platform010.sh b/build_tools/fbcode_config_platform010.sh index babe92c41..25835d091 100644 --- a/build_tools/fbcode_config_platform010.sh +++ b/build_tools/fbcode_config_platform010.sh @@ -154,7 +154,7 @@ CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_IOURING_PRESENT" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS" diff --git a/port/lang.h b/port/lang.h index 52c597acd..a4201ca3b 100644 --- a/port/lang.h +++ b/port/lang.h @@ -68,3 +68,30 @@ constexpr bool kMustFreeHeapAllocations = false; #else #define TSAN_SUPPRESSION #endif // TSAN_SUPPRESSION + +// Compile-time CPU feature testing compatibility +// +// A way to be extra sure these defines have been included. +#define ASSERT_FEATURE_COMPAT_HEADER() /* empty */ + +// MSVC doesn't support the same defines that gcc and clang provide +// but does some like __AVX__. Here we can infer some features from others. +#ifdef __AVX__ +#define __SSE4_2__ 1 +#define __PCLMUL__ 1 +#endif // __AVX__ + +// A way to disable PCLMUL +#ifdef NO_PCLMUL +#undef __PCLMUL__ +#endif + +// popcnt is generally implied by SSE4.2 +#if defined(__SSE4_2__) +#define __POPCNT__ 1 +#endif + +// A way to disable POPCNT +#ifdef NO_POPCNT +#undef __POPCNT__ +#endif diff --git a/util/bloom_impl.h b/util/bloom_impl.h index fadd012d3..53b70aa68 100644 --- a/util/bloom_impl.h +++ b/util/bloom_impl.h @@ -17,7 +17,7 @@ #include "rocksdb/slice.h" #include "util/hash.h" -#ifdef HAVE_AVX2 +#ifdef __AVX2__ #include #endif @@ -231,7 +231,7 @@ class FastLocalBloomImpl { static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, const char *data_at_cache_line) { uint32_t h = h2; -#ifdef HAVE_AVX2 +#ifdef __AVX2__ int rem_probes = num_probes; // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18, diff --git a/util/crc32c.cc b/util/crc32c.cc index d71c71c2e..d4cd78b52 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -15,10 +15,6 @@ #include #include -#ifdef HAVE_SSE42 -#include -#include -#endif #include "port/lang.h" #include "util/coding.h" @@ -50,6 +46,13 @@ #endif +ASSERT_FEATURE_COMPAT_HEADER(); + +#ifdef __SSE4_2__ +#include +#include +#endif + #if defined(HAVE_ARM64_CRC) bool pmull_runtime_flag = false; #endif @@ -107,6 +110,7 @@ static const uint32_t table0_[256] = { 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; +#ifndef __SSE4_2__ static const uint32_t table1_[256] = { 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, @@ -244,14 +248,10 @@ static const uint32_t table3_[256] = { static inline uint32_t LE_LOAD32(const uint8_t* p) { return DecodeFixed32(reinterpret_cast(p)); } +#endif // !__SSE4_2__ -#if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64)) -static inline uint64_t LE_LOAD64(const uint8_t* p) { - return DecodeFixed64(reinterpret_cast(p)); -} -#endif - -static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) { +static inline void DefaultCRC32(uint64_t* l, uint8_t const** p) { +#ifndef __SSE4_2__ uint32_t c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ @@ -261,16 +261,8 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) { *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ table1_[(c >> 16) & 0xff] ^ table0_[c >> 24]; -} - -#if (!(defined(HAVE_POWER8) && defined(HAS_ALTIVEC))) && \ - (!defined(HAVE_ARM64_CRC)) || \ - defined(NO_THREEWAY_CRC32C) -static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) { -#ifndef HAVE_SSE42 - Slow_CRC32(l, p); #elif defined(__LP64__) || defined(_WIN64) - *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *l = _mm_crc32_u64(*l, DecodeFixed64(reinterpret_cast(*p))); *p += 8; #else *l = _mm_crc32_u32(static_cast(*l), LE_LOAD32(*p)); @@ -279,7 +271,6 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) { *p += 4; #endif } -#endif template uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { @@ -324,48 +315,6 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { return static_cast(l ^ 0xffffffffu); } -// Detect if ARM64 CRC or not. -#ifndef HAVE_ARM64_CRC -// Detect if SS42 or not. -#ifndef HAVE_POWER8 - -static bool isSSE42() { -#ifndef HAVE_SSE42 - return false; -#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) - uint32_t c_; - __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx"); - return c_ & (1U << 20); // copied from CpuId.h in Folly. Test SSE42 -#elif defined(_WIN64) - int info[4]; - __cpuidex(info, 0x00000001, 0); - return (info[2] & ((int)1 << 20)) != 0; -#else - return false; -#endif -} - -static bool isPCLMULQDQ() { -#ifndef HAVE_SSE42 - // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ - // are supported by compiler - return false; -#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) - uint32_t c_; - __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx"); - return c_ & (1U << 1); // PCLMULQDQ is in bit 1 (not bit 0) -#elif defined(_WIN64) - int info[4]; - __cpuidex(info, 0x00000001, 0); - return (info[2] & ((int)1 << 1)) != 0; -#else - return false; -#endif -} - -#endif // HAVE_POWER8 -#endif // HAVE_ARM64_CRC - using Function = uint32_t (*)(uint32_t, const char*, size_t); #if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) @@ -436,7 +385,9 @@ std::string IsFastCrc32Supported() { arch = "Arm64"; } #else - has_fast_crc = isSSE42(); +#ifdef __SSE4_2__ + has_fast_crc = true; +#endif // __SSE4_2__ arch = "x86"; #endif if (has_fast_crc) { @@ -477,7 +428,7 @@ std::string IsFastCrc32Supported() { * * */ -#if defined HAVE_SSE42 && defined HAVE_PCLMUL +#if defined(__SSE4_2__) && defined(__PCLMUL__) #define CRCtriplet(crc, buf, offset) \ crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ @@ -1152,34 +1103,24 @@ uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { } } -#endif //HAVE_SSE42 && HAVE_PCLMUL +#endif //__SSE4_2__ && __PCLMUL__ static inline Function Choose_Extend() { #ifdef HAVE_POWER8 - return isAltiVec() ? ExtendPPCImpl : ExtendImpl; + return isAltiVec() ? ExtendPPCImpl : ExtendImpl; #elif defined(HAVE_ARM64_CRC) if(crc32c_runtime_check()) { pmull_runtime_flag = crc32c_pmull_runtime_check(); return ExtendARMImpl; } else { - return ExtendImpl; + return ExtendImpl; } +#elif defined(__SSE4_2__) && defined(__PCLMUL__) && !defined NO_THREEWAY_CRC32C + // NOTE: runtime detection no longer supported on x86 + (void)ExtendImpl; // suppress unused warning + return crc32c_3way; #else - if (isSSE42()) { - if (isPCLMULQDQ()) { -#if (defined HAVE_SSE42 && defined HAVE_PCLMUL) && !defined NO_THREEWAY_CRC32C - return crc32c_3way; -#else - return ExtendImpl; // Fast_CRC32 will check HAVE_SSE42 itself -#endif - } - else { // no runtime PCLMULQDQ support but has SSE42 support - return ExtendImpl; - } - } // end of isSSE42() - else { - return ExtendImpl; - } + return ExtendImpl; #endif } diff --git a/util/math.h b/util/math.h index da31b43ec..39f308328 100644 --- a/util/math.h +++ b/util/math.h @@ -13,8 +13,11 @@ #include #include +#include "port/lang.h" #include "rocksdb/rocksdb_namespace.h" +ASSERT_FEATURE_COMPAT_HEADER(); + namespace ROCKSDB_NAMESPACE { // Fast implementation of floor(log2(v)). Undefined for 0 or negative @@ -145,27 +148,29 @@ inline int BitsSetToOne(T v) { constexpr auto mm = 8 * sizeof(uint32_t) - 1; // The bit mask is to neutralize sign extension on small signed types constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1; -#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) +#if __POPCNT__ return static_cast(__popcnt(static_cast(v) & m)); #else return static_cast(detail::BitsSetToOneFallback(v) & m); -#endif +#endif // __POPCNT__ } else if (sizeof(T) == sizeof(uint32_t)) { -#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) +#if __POPCNT__ return static_cast(__popcnt(static_cast(v))); #else return detail::BitsSetToOneFallback(static_cast(v)); -#endif +#endif // __POPCNT__ } else { -#if defined(HAVE_SSE42) && defined(_M_X64) +#if __POPCNT__ +#ifdef _M_X64 return static_cast(__popcnt64(static_cast(v))); -#elif defined(HAVE_SSE42) && defined(_M_IX86) +#else return static_cast( __popcnt(static_cast(static_cast(v) >> 32) + __popcnt(static_cast(v)))); +#endif // _M_X64 #else return detail::BitsSetToOneFallback(static_cast(v)); -#endif +#endif // __POPCNT__ } #else static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");