From 062396af15ea9b295ea72d98cac32206f3dcf299 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 9 Mar 2022 21:07:31 -0800 Subject: [PATCH] Avoid popcnt on Windows when unavailable and in portable builds (#9680) Summary: Fixes https://github.com/facebook/rocksdb/issues/9560. Only use popcnt intrinsic when HAVE_SSE42 is set. Also avoid setting it based on compiler test in portable builds because such test will pass on MSVC even without proper arch flags (ref: https://devblogs.microsoft.com/oldnewthing/20201026-00/?p=104397). Pull Request resolved: https://github.com/facebook/rocksdb/pull/9680 Test Plan: verified the combinations of -DPORTABLE and -DFORCE_SSE42 produce expected compiler flags on Linux. Verified MSVC build using PORTABLE=1 (in CircleCI) does not set HAVE_SSE42. Reviewed By: pdillinger Differential Revision: D34739033 Pulled By: ajkr fbshipit-source-id: d10456f3392945fc3e59430a1777840f7b60b276 --- CMakeLists.txt | 14 ++++++++------ util/math.h | 22 +++++++++++++++------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b4f73050..1f7b67b31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,7 +321,8 @@ if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() -CHECK_CXX_SOURCE_COMPILES(" +if (NOT PORTABLE OR FORCE_SSE42) + CHECK_CXX_SOURCE_COMPILES(" #include #include #include @@ -333,11 +334,12 @@ int main() { auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) -if(HAVE_SSE42) - add_definitions(-DHAVE_SSE42) - add_definitions(-DHAVE_PCLMUL) -elseif(FORCE_SSE42) - message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") + if(HAVE_SSE42) + add_definitions(-DHAVE_SSE42) + add_definitions(-DHAVE_PCLMUL) + elseif(FORCE_SSE42) + message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") + endif() endif() # Check if -latomic is required or not diff --git a/util/math.h b/util/math.h index c6dc2a29f..2254b4aab 100644 --- a/util/math.h +++ b/util/math.h @@ -92,18 +92,25 @@ inline int CountTrailingZeroBits(T v) { #endif } -#if defined(_MSC_VER) && !defined(_M_X64) +// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include +// the following code at coarse granularity for simpler macros. It's important +// to exclude at least so our non-MSVC unit test coverage tool doesn't see it. +#ifdef _MSC_VER + namespace detail { + template int BitsSetToOneFallback(T v) { const int kBits = static_cast(sizeof(T)) * 8; static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits"); // we static_cast these bit patterns in order to truncate them to the correct - // size + // size. Warning C4309 dislikes this technique, so disable it here. +#pragma warning(disable : 4309) v = static_cast(v - ((v >> 1) & static_cast(0x5555555555555555ull))); v = static_cast((v & static_cast(0x3333333333333333ull)) + ((v >> 2) & static_cast(0x3333333333333333ull))); v = static_cast((v + (v >> 4)) & static_cast(0x0F0F0F0F0F0F0F0Full)); +#pragma warning(default : 4309) for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) { v += static_cast(v >> shift_bits); } @@ -113,7 +120,8 @@ int BitsSetToOneFallback(T v) { } } // namespace detail -#endif + +#endif // _MSC_VER // Number of bits set to 1. Also known as "population count". template @@ -126,21 +134,21 @@ inline int BitsSetToOne(T v) { constexpr auto mm = 8 * sizeof(uint32_t) - 1; // The bit mask is to neutralize sign extension on small signed types constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1; -#if defined(_M_X64) || defined(_M_IX86) +#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) return static_cast(__popcnt(static_cast(v) & m)); #else return static_cast(detail::BitsSetToOneFallback(v) & m); #endif } else if (sizeof(T) == sizeof(uint32_t)) { -#if defined(_M_X64) || defined(_M_IX86) +#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) return static_cast(__popcnt(static_cast(v))); #else return detail::BitsSetToOneFallback(static_cast(v)); #endif } else { -#ifdef _M_X64 +#if defined(HAVE_SSE42) && defined(_M_X64) return static_cast(__popcnt64(static_cast(v))); -#elif defined(_M_IX86) +#elif defined(HAVE_SSE42) && defined(_M_IX86) return static_cast( __popcnt(static_cast(static_cast(v) >> 32) + __popcnt(static_cast(v))));