From d2812e5e43a71b8fbb84cc4c0c24c6d27b8e9fcd Mon Sep 17 00:00:00 2001 From: sven Date: Mon, 10 Sep 2018 10:17:33 -0700 Subject: [PATCH 1/3] add intel specific DEPRECATED macros --- common/include/pcl/pcl_macros.h | 52 +++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/common/include/pcl/pcl_macros.h b/common/include/pcl/pcl_macros.h index 4f45aac49e3..26fb0230c5f 100644 --- a/common/include/pcl/pcl_macros.h +++ b/common/include/pcl/pcl_macros.h @@ -321,18 +321,26 @@ log2f (float x) #define __has_extension(x) 0 // Compatibility with pre-3.0 compilers. #endif -#if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) < PCL_LINEAR_VERSION(4,5,0) && ! defined(__clang__)) || defined(__INTEL_COMPILER) -#define PCL_DEPRECATED(message) __attribute__ ((deprecated)) +// check Intel compiler first since it usually also defines __GNUC__, __clang__, etc. +#if defined(__INTEL_COMPILER) + #define PCL_DEPRECATED(message) __attribute((deprecated)) #endif -// gcc supports this starting from 4.5 : http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43666 -#if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) >= PCL_LINEAR_VERSION(4,5,0)) || (defined(__clang__) && __has_extension(attribute_deprecated_with_message)) -#define PCL_DEPRECATED(message) __attribute__ ((deprecated(message))) -#endif +// wrapper: skip remaining checks if defined by __INTEL_COMPILER branch +#if !defined(PCL_DEPRECATED) + #if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) < PCL_LINEAR_VERSION(4,5,0) && ! 
defined(__clang__)) || defined(__INTEL_COMPILER) + #define PCL_DEPRECATED(message) __attribute__ ((deprecated)) + #endif -#ifdef _MSC_VER -#define PCL_DEPRECATED(message) __declspec(deprecated(message)) -#endif + // gcc supports this starting from 4.5 : http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43666 + #if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) >= PCL_LINEAR_VERSION(4,5,0)) || (defined(__clang__) && __has_extension(attribute_deprecated_with_message)) + #define PCL_DEPRECATED(message) __attribute__ ((deprecated(message))) + #endif + + #ifdef _MSC_VER + #define PCL_DEPRECATED(message) __declspec(deprecated(message)) + #endif +#endif // PCL_DEPRECATED (via __INTEL_COMPILER) #ifndef PCL_DEPRECATED #pragma message("WARNING: You need to implement PCL_DEPRECATED for this compiler") @@ -356,18 +364,26 @@ log2f (float x) // NewClass() {} // }; -#if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) < PCL_LINEAR_VERSION(4,5,0) && ! defined(__clang__)) || defined(__INTEL_COMPILER) -#define PCL_DEPRECATED_CLASS(func, message) __attribute__ ((deprecated)) func +// check Intel compiler first since it usually also defines __GNUC__, __clang__, etc. +#if defined(__INTEL_COMPILER) + #define PCL_DEPRECATED_CLASS(func, message) __attribute((deprecated)) func #endif -// gcc supports this starting from 4.5 : http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43666 -#if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) >= PCL_LINEAR_VERSION(4,5,0)) || (defined(__clang__) && __has_extension(attribute_deprecated_with_message)) -#define PCL_DEPRECATED_CLASS(func, message) __attribute__ ((deprecated(message))) func -#endif +// wrapper: skip remaining checks if defined by __INTEL_COMPILER branch +#if !defined(PCL_DEPRECATED_CLASS) + #if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) < PCL_LINEAR_VERSION(4,5,0) && ! 
defined(__clang__)) || defined(__INTEL_COMPILER) + #define PCL_DEPRECATED_CLASS(func, message) __attribute__ ((deprecated)) func + #endif -#ifdef _MSC_VER -#define PCL_DEPRECATED_CLASS(func, message) __declspec(deprecated(message)) func -#endif + // gcc supports this starting from 4.5 : http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43666 + #if (defined(__GNUC__) && PCL_LINEAR_VERSION(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) >= PCL_LINEAR_VERSION(4,5,0)) || (defined(__clang__) && __has_extension(attribute_deprecated_with_message)) + #define PCL_DEPRECATED_CLASS(func, message) __attribute__ ((deprecated(message))) func + #endif + + #ifdef _MSC_VER + #define PCL_DEPRECATED_CLASS(func, message) __declspec(deprecated(message)) func + #endif +#endif // PCL_DEPRECATED_CLASS (via __INTEL_COMPILER) #ifndef PCL_DEPRECATED_CLASS #pragma message("WARNING: You need to implement PCL_DEPRECATED_CLASS for this compiler") From 8b61be94e9aab021a8d19a4b9a019cecce872880 Mon Sep 17 00:00:00 2001 From: sven Date: Mon, 10 Sep 2018 10:23:09 -0700 Subject: [PATCH 2/3] fix intel _mm_malloc definition conflicts with GCC --- cmake/pcl_find_sse.cmake | 13 +++++++++---- common/include/pcl/pcl_macros.h | 7 ++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/cmake/pcl_find_sse.cmake b/cmake/pcl_find_sse.cmake index a4c8d3a9a73..ba36eda02c6 100644 --- a/cmake/pcl_find_sse.cmake +++ b/cmake/pcl_find_sse.cmake @@ -21,18 +21,23 @@ macro(PCL_CHECK_FOR_SSE) endif() endif() - # Unfortunately we need to check for SSE to enable "-mfpmath=sse" alongside + # Unfortunately we need to check for SSE to enable "-mfpmath=sse" alongside # "-march=native". The reason for this is that by default, 32bit architectures # tend to use the x87 FPU (which has 80 bit internal precision), thus leading # to different results than 64bit architectures which are using SSE2 (64 bit internal - # precision). 
One solution would be to use "-ffloat-store" on 32bit (see # http://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html), but that slows code down, # so the preferred solution is to try "-mpfmath=sse" first. include(CheckCXXSourceRuns) set(CMAKE_REQUIRED_FLAGS) check_cxx_source_runs(" - #include + // Intel compiler defines an incompatible _mm_malloc signature + #if defined(__INTEL_COMPILER) + #include + #else + #include + #endif int main() { void* mem = _mm_malloc (100, 16); @@ -131,7 +136,7 @@ macro(PCL_CHECK_FOR_SSE) elseif(MSVC AND NOT CMAKE_CL_64) set(CMAKE_REQUIRED_FLAGS "/arch:SSE2") endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - + check_cxx_source_runs(" #include int main () diff --git a/common/include/pcl/pcl_macros.h b/common/include/pcl/pcl_macros.h index 26fb0230c5f..be22bc38a04 100644 --- a/common/include/pcl/pcl_macros.h +++ b/common/include/pcl/pcl_macros.h @@ -415,7 +415,12 @@ log2f (float x) #endif #if defined (HAVE_MM_MALLOC) - #include + // Intel compiler defines an incompatible _mm_malloc signature + #if defined(__INTEL_COMPILER) + #include + #else + #include + #endif #endif inline void* From 6c188e641f5aa66dcf7ef31f16ecda4dce86fe51 Mon Sep 17 00:00:00 2001 From: sven Date: Mon, 10 Sep 2018 10:43:55 -0700 Subject: [PATCH 3/3] add AVX,AVX2 support for Intel and MSVC --- CMakeLists.txt | 6 ++ cmake/pcl_find_sse.cmake | 216 ++++++++++++++++++++++++++++----------- 2 files changed, 163 insertions(+), 59 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ffebbda3523..54f5a7f6f83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,12 @@ if(CMAKE_COMPILER_IS_CLANG) SET(CLANG_LIBRARIES "stdc++") endif() +if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + if("${CMAKE_CXX_FLAGS}" STREQUAL "") + SET(CMAKE_CXX_FLAGS "${SSE_FLAGS_STR}") + endif() +endif() + include("${PCL_SOURCE_DIR}/cmake/pcl_utils.cmake") set(PCL_VERSION "1.8.1-dev" CACHE STRING "PCL version") DISSECT_VERSION() diff --git 
a/cmake/pcl_find_sse.cmake b/cmake/pcl_find_sse.cmake
index ba36eda02c6..03764b99019 100644
--- a/cmake/pcl_find_sse.cmake
+++ b/cmake/pcl_find_sse.cmake
@@ -1,21 +1,92 @@
+# Host CPU vendor probe. It is needed for the Intel compiler only, to decide
+# whether the -xCODE or /QxCODE options can be used, or whether -mCODE or
+# /arch:CODE must be used instead. Using -xCODE on a non-Intel CPU is not allowed.
+cmake_host_system_information(
+  RESULT PCL_HOST_PROCESSOR_DESCRIPTION
+  QUERY PROCESSOR_DESCRIPTION
+)
+if(PCL_HOST_PROCESSOR_DESCRIPTION MATCHES "Intel")
+  set(PCL_HOST_PROCESSOR_IS_INTEL TRUE)
+else()
+  set(PCL_HOST_PROCESSOR_IS_INTEL FALSE)
+endif()
+
+# Helper function for pcl_check_for_sse, *NOT* intended to be called anywhere
+# else. No error checking is performed!
+#
+# output: name of the caller-scope variable that receives the compiler flag.
+# arch: the architecture to translate. Inputs _must_ be lower-case; the values
+#       passed by pcl_check_for_sse are:
+#
+#         sse, sse2, sse3, ssse3, sse4.1, sse4.2, avx, avx2
+#
+# NOTE: ${output} is set to the empty string when the compiler has no flag for
+# this ${arch} (e.g., Visual Studio and sse4.2) or is not handled below.
+#
+# Example: pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS sse4.2)
+function(PCL_SSE_COMPILER_ARCH_FLAG output arch)
+  if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
+    # sse -> -msse, sse4.2 -> -msse4.2
+    set(${output} "-m${arch}" PARENT_SCOPE)
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+    # Intel Compiler rules:
+    #   Intel CPU (https://software.intel.com/en-us/node/522845):
+    #     Windows: /QxCODE
+    #     Unix:    -xCODE
+    #   otherwise:
+    #     Windows: /arch:CODE (https://software.intel.com/en-us/node/522822)
+    #     Unix:    -mCODE (https://software.intel.com/en-us/node/522834)
+    #
+    # -xHost and /QxHost (Intel's equivalent of -march=native) are avoided: they
+    # are only for Intel CPUs, and all of the tests are run anyway. The -x or /Qx
+    # forms are still used when possible because they include additional
+    # optimizations that -m or /arch do not.
+    #
+    # NOTE(review): Intel docs seem to call the AVX2 level CORE-AVX2 -- confirm.
+    if(arch STREQUAL "sse")
+      set(${output} "" PARENT_SCOPE)
+    else()
+      # Only the Unix -m form keeps the lower-case name (cf. -msse4.2 above);
+      # the -x, /Qx and /arch: codes are written in upper case.
+      string(TOUPPER "${arch}" arch_upper)
+      if(WIN32)
+        if(PCL_HOST_PROCESSOR_IS_INTEL)
+          set(${output} "/Qx${arch_upper}" PARENT_SCOPE)
+        else()
+          set(${output} "/arch:${arch_upper}" PARENT_SCOPE)
+        endif()
+      else()
+        if(PCL_HOST_PROCESSOR_IS_INTEL)
+          set(${output} "-x${arch_upper}" PARENT_SCOPE)
+        else()
+          set(${output} "-m${arch}" PARENT_SCOPE)
+        endif()
+      endif()
+    endif()
+  elseif(MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8) # 64 bit Visual Studio
+    # MSVC only supports SSE, SSE2, AVX, and AVX2
+    string(TOUPPER "${arch}" arch_upper)
+    set(msvc_archs SSE SSE2 AVX AVX2)
+    # list(FIND) works without policy CMP0057, unlike if(... IN_LIST ...).
+    list(FIND msvc_archs "${arch_upper}" msvc_arch_index)
+    if(msvc_arch_index EQUAL -1)
+      set(${output} "" PARENT_SCOPE)
+    else()
+      set(${output} "/arch:${arch_upper}" PARENT_SCOPE)
+    endif()
+  else()
+    set(${output} "" PARENT_SCOPE)
+  endif()
+endfunction()
+
 ###############################################################################
 # Check for the presence of SSE and figure out the flags to use for it.
macro(PCL_CHECK_FOR_SSE) set(SSE_FLAGS) set(SSE_DEFINITIONS) - # Test CLANG - #if(CMAKE_COMPILER_IS_CLANG) - # if(APPLE) - # SET(SSE_FLAGS "${SSE_FLAGS} -march=native") - # endif(APPLE) - #endif(CMAKE_COMPILER_IS_CLANG) - - # Test GCC/G++ - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} "-dumpversion" - OUTPUT_VARIABLE GCC_VERSION_STRING) - if(GCC_VERSION_STRING VERSION_GREATER 4.2 AND NOT APPLE AND NOT CMAKE_CROSSCOMPILING) + if (NOT CMAKE_CROSSCOMPILING) + if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) list(APPEND SSE_FLAGS "-march=native") message(STATUS "Using CPU native flags for SSE optimization: ${SSE_FLAGS}") endif() @@ -54,10 +125,41 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_POSIX_MEMALIGN) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - set(CMAKE_REQUIRED_FLAGS "-msse4.2") - endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "avx2") + check_cxx_source_runs(" + #include + int main () + { + /* simple test: subtract result = left - right */ + /* using _mm256_sub_epi32 is the 'test' (new in AVX2) */ + __m256i left = _mm256_set_epi32 (0, 1, 2, 3, 4, 5, 6, 7); + __m256i right = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0); + __m256i result = _mm256_sub_epi32 (right, left); + + // result: {-7, -5, -3, -1, 1, 3, 5, 7} + // could check using int *i = (int *)&result; + return (0); + }" + HAVE_AVX2_EXTENSIONS) + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "avx") + check_cxx_source_runs(" + #include + int main () + { + /* simple test: subtract result = left - right */ + /* using _mm256_sub_ps is the 'test' (new in AVX) */ + __m256 left = _mm256_set_ps (0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + __m256 right = _mm256_set_ps (7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0); + __m256 result = _mm256_sub_ps (right, left); + + // result: {-7, -5, -3, -1, 1, 3, 5, 7} + // 
could check using float *f = (float *)&result; + return (0); + }" + HAVE_AVX_EXTENSIONS) + + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse4.2") check_cxx_source_runs(" #include #include @@ -78,10 +180,7 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_SSE4_2_EXTENSIONS) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - set(CMAKE_REQUIRED_FLAGS "-msse4.1") - endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse4.1") check_cxx_source_runs(" #include int main () @@ -97,10 +196,7 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_SSE4_1_EXTENSIONS) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - set(CMAKE_REQUIRED_FLAGS "-mssse3") - endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "ssse3") check_cxx_source_runs(" #include int main () @@ -114,10 +210,7 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_SSSE3_EXTENSIONS) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - set(CMAKE_REQUIRED_FLAGS "-msse3") - endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse3") check_cxx_source_runs(" #include int main () @@ -131,12 +224,7 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_SSE3_EXTENSIONS) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - set(CMAKE_REQUIRED_FLAGS "-msse2") - elseif(MSVC AND NOT CMAKE_CL_64) - set(CMAKE_REQUIRED_FLAGS "/arch:SSE2") - endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse2") check_cxx_source_runs(" #include int main () @@ -150,12 +238,7 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_SSE2_EXTENSIONS) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - 
set(CMAKE_REQUIRED_FLAGS "-msse") - elseif(MSVC AND NOT CMAKE_CL_64) - set(CMAKE_REQUIRED_FLAGS "/arch:SSE") - endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - + pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse") check_cxx_source_runs(" #include int main () @@ -170,35 +253,50 @@ macro(PCL_CHECK_FOR_SSE) }" HAVE_SSE_EXTENSIONS) + # Make sure to un-set this variable so later code is not affected. set(CMAKE_REQUIRED_FLAGS) - if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) - if(HAVE_SSE4_2_EXTENSIONS) - list(APPEND SSE_FLAGS "-msse4.2" "-mfpmath=sse") - elseif(HAVE_SSE4_1_EXTENSIONS) - list(APPEND SSE_FLAGS "-msse4.1" "-mfpmath=sse") - elseif(HAVE_SSSE3_EXTENSIONS) - list(APPEND SSE_FLAGS "-mssse3" "-mfpmath=sse") - elseif(HAVE_SSE3_EXTENSIONS) - list(APPEND SSE_FLAGS "-msse3" "-mfpmath=sse") - elseif(HAVE_SSE2_EXTENSIONS) - list(APPEND SSE_FLAGS "-msse2" "-mfpmath=sse") - elseif(HAVE_SSE_EXTENSIONS) - list(APPEND SSE_FLAGS "-msse" "-mfpmath=sse") - else() + # Start: empty, if empty after all if-elseif signals no SSE/AVX support. + # Order matters: make sure to do "highest" vectorization checks first. + # NOTE: helper function sets to empty string on 32-bit MSVC build always. 
+ set(architecture_flag) + if(HAVE_AVX2_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "avx2") + elseif(HAVE_AVX_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "avx") + elseif(HAVE_SSE4_2_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "sse4.2") + elseif(HAVE_SSE4_1_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "sse4.1") + elseif(HAVE_SSSE3_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "ssse3") + elseif(HAVE_SSE3_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "sse3") + elseif(HAVE_SSE2_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "sse2") + elseif(HAVE_SSE_EXTENSIONS) + pcl_sse_compiler_arch_flag(architecture_flag "sse") + endif() + + if(architecture_flag) + list(APPEND SSE_FLAGS "${architecture_flag}") + # GCC, Clang, or Intel on Non-Windows (AKA Intel backed by GCC or Clang) + if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) + list(APPEND SSE_FLAGS "-mfpmath=sse") + endif() + else() + if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) # Setting -ffloat-store to alleviate 32bit vs 64bit discrepancies on non-SSE # platforms. list(APPEND SSE_FLAGS "-ffloat-store") endif() - elseif(MSVC AND NOT CMAKE_CL_64) - if(HAVE_SSE2_EXTENSIONS) - list(APPEND SSE_FLAGS "/arch:SSE2") - elseif(HAVE_SSE_EXTENSIONS) - list(APPEND SSE_FLAGS "/arch:SSE") - endif(HAVE_SSE2_EXTENSIONS) endif() + # Erase architecture_flag + set(architecture_flag) + if(MSVC) + # TODO: why are these definitions here and what are the SSE4, AVX, and AVX2 ones? if(HAVE_SSSE3_EXTENSIONS) SET(SSE_DEFINITIONS "${SSE_DEFINITIONS} -D__SSSE3__") endif()