From 99536a82b00dae120155ee07526910fe523f3e09 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 11 Nov 2024 15:23:18 +0100 Subject: [PATCH] Add AVX512 and ARM SVE pre-sieving (#154) * Add AVX512 pre-sieving * Fix typo * Update ChangeLog * Fix typo * Add AVX512 BW info * Fix macro name * Add ARM SVE support * Fix preprocessor logic * Remove comments * Add ARM SVE pre-sieving * Fix undefined behavior * Update ChangeLog --- CMakeLists.txt | 7 +- ChangeLog | 3 +- cmake/multiarch_avx512_bw.cmake | 80 +++++ cmake/multiarch_sve_arm.cmake | 78 +++++ include/primesieve/cpu_supports_arm_sve.hpp | 27 ++ include/primesieve/cpu_supports_avx512_bw.hpp | 27 ++ src/PreSieve.cpp | 306 ++++++++++++++---- src/app/main.cpp | 34 +- src/x86/cpuid.cpp | 34 +- 9 files changed, 526 insertions(+), 70 deletions(-) create mode 100644 cmake/multiarch_avx512_bw.cmake create mode 100644 cmake/multiarch_sve_arm.cmake create mode 100644 include/primesieve/cpu_supports_arm_sve.hpp create mode 100644 include/primesieve/cpu_supports_avx512_bw.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 508102a6..5e7df285 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,9 +86,14 @@ set(LIB_SRC src/api-c.cpp if(WITH_MULTIARCH) include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_bw.cmake") include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") - if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2) + if(NOT multiarch_avx512_bw) + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_sve_arm.cmake") + endif() + + if(multiarch_x86_popcnt OR multiarch_avx512_bw OR multiarch_avx512_vbmi2) set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp) endif() endif() diff --git a/ChangeLog b/ChangeLog index 46ccbc2c..a0ae4391 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -Changes in version 12.6, 10/11/2024 +Changes in version 12.6, 11/11/2024 =================================== * CpuInfo.cpp: Correctly detect Intel Arrow Lake CPU cache @@ -6,6 +6,7 @@ Changes in version 12.6, 10/11/2024 * PreSieve.cpp: Increased pre-sieving from primes <= 100 to primes <= 163. Memory usage of pre-sieve lookup tables has been reduced from 210 kilobytes to 123 kilobytes. +* PreSieve.cpp: Added AVX512 and ARM SVE multiarch support. Changes in version 12.5, 22/10/2024 =================================== diff --git a/cmake/multiarch_avx512_bw.cmake b/cmake/multiarch_avx512_bw.cmake new file mode 100644 index 00000000..171c4d47 --- /dev/null +++ b/cmake/multiarch_avx512_bw.cmake @@ -0,0 +1,80 @@ +# We use GCC/Clang's function multi-versioning for AVX512 +# support. This code will automatically dispatch to the +# AVX512 BW algorithm if the CPU supports it and use the +# default (portable) algorithm otherwise. + +include(CheckCXXSourceCompiles) +include(CMakePushCheckState) + +cmake_push_check_state() +set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") + +check_cxx_source_compiles(" + // GCC/Clang function multiversioning for AVX512 is not needed if + // the user compiles with -mavx512f -mavx512bw. + // GCC/Clang function multiversioning generally causes a minor + // overhead, hence we disable it if it is not needed. + #if defined(__AVX512F__) && \ + defined(__AVX512BW__) + Error: AVX512BW multiarch not needed! + #endif + + #include + #include + #include + #include + + __attribute__ ((target (\"avx512f,avx512bw\"))) + void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0, + const uint8_t* __restrict preSieve1, + uint8_t* __restrict sieve, + std::size_t bytes) + { + std::size_t i = 0; + + for (; i + 64 <= bytes; i += sizeof(__m512i)) + { + _mm512_storeu_epi8((__m512i*) &sieve[i], + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]), + _mm512_loadu_epi8((const __m512i*) &preSieve1[i]))); + } + + if (i < bytes) + { + __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); + + _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]), + _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i]))); + } + } + + void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + uint8_t* __restrict sieve, + std::size_t bytes) + { + for (std::size_t i = 0; i < bytes; i++) + sieve[i] = preSieved0[i] & preSieved1[i]; + } + + int main() + { + uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + + if (primesieve::has_cpuid_avx512_bw()) + AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10); + else + AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10); + + return (sieve[0] == 0) ? 0 : 1; + } +" multiarch_avx512_bw) + +if(multiarch_avx512_bw) + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BW") +endif() + +cmake_pop_check_state() diff --git a/cmake/multiarch_sve_arm.cmake b/cmake/multiarch_sve_arm.cmake new file mode 100644 index 00000000..f7efde33 --- /dev/null +++ b/cmake/multiarch_sve_arm.cmake @@ -0,0 +1,78 @@ +# We use GCC/Clang's function multi-versioning for ARM SVE +# support. This code will automatically dispatch to the +# ARM SVE algorithm if the CPU supports it and use the default +# (portable) algorithm otherwise. + +include(CheckCXXSourceCompiles) +include(CMakePushCheckState) + +cmake_push_check_state() +set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include") + +check_cxx_source_compiles(" + // GCC/Clang function multiversioning for ARM SVE is not needed + // if the user compiles with -march=armv8-a+sve. GCC/Clang + // function multiversioning generally causes a minor overhead, + // hence we disable it if it is not needed. + #if defined(__ARM_FEATURE_SVE) && \ + __has_include() + Error: ARM SVE multiarch not needed! + #endif + + #include + #include + #include + #include + + __attribute__ ((target (\"arch=armv8-a+sve\"))) + void AND_PreSieveTables_arm_sve(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) + { + for (std::size_t i = 0; i < bytes; i += svcntb()) + { + svbool_t pg = svwhilelt_b8(i, bytes); + + svst1_u8(pg, &sieve[i], + svand_u8_x(svptrue_b64(), + svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])), + svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i])))); + } + } + + void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) + { + for (std::size_t i = 0; i < bytes; i++) + sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; + } + + int main() + { + uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + uint8_t PreSieveTable3[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + uint8_t PreSieveTable4[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + + if (cpu_supports_sve) + AND_PreSieveTables_arm_sve(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10); + else + AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10); + + return (sieve[0] == 0) ? 0 : 1; + } +" multiarch_sve_arm) + +if(multiarch_sve_arm) + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_ARM_SVE") +endif() + +cmake_pop_check_state() diff --git a/include/primesieve/cpu_supports_arm_sve.hpp b/include/primesieve/cpu_supports_arm_sve.hpp new file mode 100644 index 00000000..74bb6fea --- /dev/null +++ b/include/primesieve/cpu_supports_arm_sve.hpp @@ -0,0 +1,27 @@ +/// +/// @file cpu_supports_arm_sve.hpp +/// Check if the CPU supports the ARM SVE instruction set. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#ifndef CPU_SUPPORTS_ARM_SVE_HPP +#define CPU_SUPPORTS_ARM_SVE_HPP + +#include "macros.hpp" + +#if __has_builtin(__builtin_cpu_supports) + +namespace { + +/// Initialized at startup +const bool cpu_supports_sve = __builtin_cpu_supports("sve"); + +} // namespace + +#endif // __builtin_cpu_supports + +#endif diff --git a/include/primesieve/cpu_supports_avx512_bw.hpp b/include/primesieve/cpu_supports_avx512_bw.hpp new file mode 100644 index 00000000..e0a866e1 --- /dev/null +++ b/include/primesieve/cpu_supports_avx512_bw.hpp @@ -0,0 +1,27 @@ +/// +/// @file cpu_supports_avx512_bw.hpp +/// @brief Detect if the x86 CPU supports AVX512 BW. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#ifndef CPU_SUPPORTS_AVX512_BW_HPP +#define CPU_SUPPORTS_AVX512_BW_HPP + +namespace primesieve { + +bool has_cpuid_avx512_bw(); + +} // namespace + +namespace { + +/// Initialized at startup +const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw(); + +} // namespace + +#endif diff --git a/src/PreSieve.cpp b/src/PreSieve.cpp index e84cbe39..886f4795 100644 --- a/src/PreSieve.cpp +++ b/src/PreSieve.cpp @@ -45,46 +45,175 @@ #include #include -/// All x64 CPUs support the SSE2 vector instruction set -#if defined(__SSE2__) && \ - __has_include() - #include - #define HAS_SSE2 +#if defined(__ARM_FEATURE_SVE) && \ + __has_include() + #include + #define ENABLE_ARM_SVE + +#elif defined(__AVX512F__) && \ + defined(__AVX512BW__) && \ + __has_include() + #include + #define ENABLE_AVX512_BW + +#elif defined(ENABLE_MULTIARCH_ARM_SVE) + #include + #include + #define ENABLE_DEFAULT + +#elif defined(ENABLE_MULTIARCH_AVX512_BW) && \ + __has_include() + #include + #include + #define ENABLE_DEFAULT +#else + #define ENABLE_DEFAULT #endif -// All ARM64 CPUs support the NEON vector instruction set -#if (defined(__ARM_NEON) || defined(__aarch64__)) && \ - __has_include() - #include - #define HAS_ARM_NEON +#if defined(ENABLE_DEFAULT) + /// All x64 CPUs support the SSE2 vector instruction set + #if defined(__SSE2__) && \ + __has_include() + #include + #define HAS_SSE2 + #endif + // All ARM64 CPUs support the NEON vector instruction set + #if (defined(__ARM_NEON) || defined(__aarch64__)) && \ + __has_include() + #include + #define HAS_ARM_NEON + #endif #endif namespace { +#if defined(ENABLE_ARM_SVE) || \ + defined(ENABLE_MULTIARCH_ARM_SVE) + +#if defined(ENABLE_MULTIARCH_ARM_SVE) + __attribute__ ((target ("arch=armv8-a+sve"))) +#endif +void AND_PreSieveTables_arm_sve(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + for (std::size_t i = 0; i < bytes; i += svcntb()) + { + svbool_t pg = svwhilelt_b8(i, bytes); + + svst1_u8(pg, &sieve[i], + svand_u8_x(svptrue_b64(), + svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])), + svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i])))); + } +} + +#if defined(ENABLE_MULTIARCH_ARM_SVE) + __attribute__ ((target ("arch=armv8-a+sve"))) +#endif +void AND_PreSieveTables_Sieve_arm_sve(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + for (std::size_t i = 0; i < bytes; i += svcntb()) + { + svbool_t pg = svwhilelt_b8(i, bytes); + + svst1_u8(pg, &sieve[i], + svand_u8_z(pg, svld1_u8(pg, &sieve[i]), svand_u8_x(svptrue_b64(), + svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])), + svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i]))))); + } +} + +#elif defined(ENABLE_AVX512_BW) || \ + defined(ENABLE_MULTIARCH_AVX512_BW) + +#if defined(ENABLE_MULTIARCH_AVX512_BW) + __attribute__ ((target ("avx512f,avx512bw"))) +#endif +void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + std::size_t i = 0; + + for (; i + 64 <= bytes; i += sizeof(__m512i)) + { + _mm512_storeu_epi8((__m512i*) &sieve[i], + _mm512_and_si512( + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved0[i]), _mm512_loadu_epi8((const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved2[i]), _mm512_loadu_epi8((const __m512i*) &preSieved3[i])))); + } + + if (i < bytes) + { + __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); + + _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, + _mm512_and_si512( + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved0[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved2[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved3[i])))); + } +} + +#if defined(ENABLE_MULTIARCH_AVX512_BW) + __attribute__ ((target ("avx512f,avx512bw"))) +#endif +void AND_PreSieveTables_Sieve_avx512(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + std::size_t i = 0; + + for (; i + 64 <= bytes; i += sizeof(__m512i)) + { + _mm512_storeu_epi8((__m512i*) &sieve[i], + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &sieve[i]), _mm512_and_si512( + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved0[i]), _mm512_loadu_epi8((const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved2[i]), _mm512_loadu_epi8((const __m512i*) &preSieved3[i]))))); + } + + if (i < bytes) + { + __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); + + _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &sieve[i]), _mm512_and_si512( + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved0[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved2[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved3[i]))))); + } +} + +#endif + +/// This section contains portable SIMD algorithms that don't need +/// any runtime CPU support checks. +#if defined(ENABLE_DEFAULT) #if defined(HAS_SSE2) -/// Since compiler auto-vectorization is not 100% reliable, we have -/// manually vectorized the AND_PreSieveTables() function for x64 CPUs. -/// This algorithm is portable since all x64 CPUs support the SSE2 -/// instruction set. -/// -void AND_PreSieveTables(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(__m128i); - // Note that I also tried vectorizing this algorithm using AVX2 - // which has twice the vector width compared to SSE2, but this did - // not provide any speedup. On average, this loop processes only - // 956 bytes, hence there aren't many vector loop iterations and - // by increasing the vector width this also increases the number of - // scalar loop iterations after the vector loop finishes which - // could potentially even become a bottleneck. for (; i < limit; i += sizeof(__m128i)) { _mm_storeu_si128((__m128i*) &sieve[i], @@ -97,12 +226,12 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0, sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; } -void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(__m128i); @@ -121,19 +250,12 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, #elif defined(HAS_ARM_NEON) -/// Homebrew compiles its C/C++ packages on macOS using Clang -Os -/// (instead of -O2 or -O3) which does not auto-vectorize our simple -/// loop with Bitwise AND. If this loop is not vectorized this -/// deteriorates the performance of primesieve by up to 40%. As a -/// workaround for this Homebrew issue we have manually vectorized -/// the Bitwise AND loop using ARM NEON. -/// -void AND_PreSieveTables(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(uint8x16_t); @@ -150,12 +272,12 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0, sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; } -void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(uint8x16_t); @@ -174,6 +296,31 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, #else +void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + for (std::size_t i = 0; i < bytes; i++) + sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; +} + +void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + for (std::size_t i = 0; i < bytes; i++) + sieve[i] &= preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; +} + +#endif +#endif + void AND_PreSieveTables(const uint8_t* __restrict preSieved0, const uint8_t* __restrict preSieved1, const uint8_t* __restrict preSieved2, @@ -181,13 +328,28 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0, uint8_t* __restrict sieve, std::size_t bytes) { - // This loop will get auto-vectorized if compiled with GCC/Clang - // using -O3. Using GCC -O2 does not auto-vectorize this loop - // because -O2 uses the "very-cheap" vector cost model. To fix - // this issue we enable -ftree-vectorize -fvect-cost-model=dynamic - // if the compiler supports it in auto_vectorization.cmake. - for (std::size_t i = 0; i < bytes; i++) - sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; +#if defined(ENABLE_ARM_SVE) + AND_PreSieveTables_arm_sve(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#elif defined(ENABLE_AVX512_BW) + AND_PreSieveTables_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + +#elif defined(ENABLE_MULTIARCH_ARM_SVE) + + if (cpu_supports_sve) + AND_PreSieveTables_arm_sve(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + else + AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + +#elif defined(ENABLE_MULTIARCH_AVX512_BW) + + if (cpu_supports_avx512_bw) + AND_PreSieveTables_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + else + AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + +#else + AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#endif } void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, @@ -197,11 +359,29 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, uint8_t* __restrict sieve, std::size_t bytes) { - for (std::size_t i = 0; i < bytes; i++) - sieve[i] &= preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; -} +#if defined(ENABLE_ARM_SVE) + AND_PreSieveTables_Sieve_arm_sve(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#elif defined(ENABLE_AVX512_BW) + AND_PreSieveTables_Sieve_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#elif defined(ENABLE_MULTIARCH_ARM_SVE) + + if (cpu_supports_sve) + AND_PreSieveTables_Sieve_arm_sve(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + else + AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + +#elif defined(ENABLE_MULTIARCH_AVX512_BW) + + if (cpu_supports_avx512_bw) + AND_PreSieveTables_Sieve_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + else + AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + +#else + AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); #endif +} } // namespace diff --git a/src/app/main.cpp b/src/app/main.cpp index ce052562..fe3ca05e 100644 --- a/src/app/main.cpp +++ b/src/app/main.cpp @@ -35,6 +35,20 @@ #include #include +#if defined(ENABLE_MULTIARCH_ARM_SVE) + #include +#endif + +#if defined(ENABLE_MULTIARCH_AVX512_BW) + +namespace primesieve { + +bool has_cpuid_avx512_bw(); + +} // namespace + +#endif + #if defined(ENABLE_MULTIARCH_AVX512_VBMI2) namespace primesieve { @@ -233,13 +247,25 @@ void cpuInfo() else std::cout << "Logical CPU cores: unknown" << std::endl; - #if defined(ENABLE_MULTIARCH_AVX512_VBMI2) + #if defined(ENABLE_MULTIARCH_ARM_SVE) + if (cpu_supports_sve) + std::cout << "Has ARM SVE: yes" << std::endl; + else + std::cout << "Has ARM SVE: no" << std::endl; + #endif - if (primesieve::has_cpuid_avx512_vbmi2()) - std::cout << "Has AVX512: yes" << std::endl; + #if defined(ENABLE_MULTIARCH_AVX512_BW) + if (primesieve::has_cpuid_avx512_bw()) + std::cout << "Has AVX512 BW: yes" << std::endl; else - std::cout << "Has AVX512: no" << std::endl; + std::cout << "Has AVX512 BW: no" << std::endl; + #endif + #if defined(ENABLE_MULTIARCH_AVX512_VBMI2) + if (primesieve::has_cpuid_avx512_vbmi2()) + std::cout << "Has AVX512 VBMI2: yes" << std::endl; + else + std::cout << "Has AVX512 VBMI2: no" << std::endl; #endif if (cpu.hasL1Cache()) diff --git a/src/x86/cpuid.cpp b/src/x86/cpuid.cpp index d51e7908..d15c74ff 100644 --- a/src/x86/cpuid.cpp +++ b/src/x86/cpuid.cpp @@ -19,7 +19,8 @@ // https://en.wikipedia.org/wiki/CPUID // %ebx bit flags -#define bit_AVX512F (1 << 16) +#define bit_AVX512F (1 << 16) +#define bit_AVX512BW (1 << 30) // %ecx bit flags #define bit_AVX512VBMI (1 << 1) @@ -91,6 +92,37 @@ bool has_cpuid_popcnt() return (abcd[2] & bit_POPCNT) == bit_POPCNT; } +bool has_cpuid_avx512_bw() +{ + int abcd[4]; + + run_cpuid(1, 0, abcd); + + int osxsave_mask = (1 << 27); + + // Ensure OS supports extended processor state management + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return false; + + uint64_t ymm_mask = XSTATE_SSE | XSTATE_YMM; + uint64_t zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + uint64_t xcr0 = get_xcr0(); + + // Check AVX OS support + if ((xcr0 & ymm_mask) != ymm_mask) + return false; + + // Check AVX512 OS support + if ((xcr0 & zmm_mask) != zmm_mask) + return false; + + run_cpuid(7, 0, abcd); + + // AND_PreSieveTables_avx512 requires AVX512F, AVX512BW + return ((abcd[1] & bit_AVX512F) == bit_AVX512F && + (abcd[1] & bit_AVX512BW) == bit_AVX512BW); +} + bool has_cpuid_avx512_vbmi2() { int abcd[4];