Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Fix Vectorization Flags (long-term / playground) #2426

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,12 @@ if(CMAKE_COMPILER_IS_CLANG)
SET(CLANG_LIBRARIES "stdc++")
endif()

if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
if("${CMAKE_CXX_FLAGS}" STREQUAL "")
SET(CMAKE_CXX_FLAGS "${SSE_FLAGS_STR}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we be populating the compiler options instead? I'm just talking semantics here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, long-term these should be public compile options assuming you want consuming projects to inherit them (which we should). I just mirrored what the other compilers do here, but since this one is going to be a longer-term effort definitely yes.

I think a more explicit option PCL_SSE_FLAGS or PCL_VECTORIZATION_FLAGS or PCL_PLATFORM_FLAGS or something should be added (cached empty string variable, not an option call). Usage would be something like cmake .. -DPCL_SSE_FLAGS="-march=native" or cmake .. -DPCL_SSE_FLAGS="-xavx512" or whatever, probably allow it to be a list. Then instead of guarding against "${CMAKE_CXX_FLAGS}" STREQUAL "", we would do something like

if (PCL_SSE_FLAGS)
  target_compile_options(pcl PUBLIC ${PCL_SSE_FLAGS})
else()
  include(pcl_sse_flags) # I'll fix up the CMAKE_MODULE_PATH to allow this include
  pcl_check_for_sse()
  if (PCL_SSE_FLAGS)
    target_compile_options(pcl PUBLIC ${PCL_SSE_FLAGS})
  endif()
endif()

Realistically, creating a pcl-interface target is the right thing to do here -- all libs link against that, so we actually would do target_compile_options(pcl-interface INTERFACE ${PCL_SSE_FLAGS}). No need to special-case on compilers. Working with lists is preferred, no need to convert to strings. When you do target_link_libraries(pcl PUBLIC pcl-interface), the INTERFACE_* properties propagate from pcl-interface to pcl :)

endif()
endif()

include("${PCL_SOURCE_DIR}/cmake/pcl_utils.cmake")
set(PCL_VERSION "1.8.1-dev" CACHE STRING "PCL version")
DISSECT_VERSION()
Expand Down
227 changes: 165 additions & 62 deletions cmake/pcl_find_sse.cmake
Original file line number Diff line number Diff line change
@@ -1,38 +1,114 @@
# Record whether the machine running CMake has an Intel CPU. The Intel compiler
# branch of pcl_sse_compiler_arch_flag reads PCL_HOST_PROCESSOR_IS_INTEL to
# choose between the -x<code> / /Qx<code> flags and the -m<code> /
# /arch:<code> flags; using -x<code> on a non-Intel CPU is not allowed.
cmake_host_system_information(RESULT PCL_HOST_PROCESSOR_DESCRIPTION QUERY PROCESSOR_DESCRIPTION)

# Start from "not Intel" and flip the switch only when the processor
# description mentions Intel.
set(PCL_HOST_PROCESSOR_IS_INTEL FALSE)
if(PCL_HOST_PROCESSOR_DESCRIPTION MATCHES "Intel")
  set(PCL_HOST_PROCESSOR_IS_INTEL TRUE)
endif()

# Helper function for pcl_check_for_sse, *NOT* intended to be called anywhere
# else. No error checking is performed!
#
# Translates an instruction-set name into the flag spelling used by the active
# compiler and stores the result in the caller's scope.
#
# output: Name of the variable (in the calling scope) that receives the
#         compiler specific flag.
# arch:   The architecture flag to translate. Inputs _must_ be lower-case, the
#         values passed by pcl_check_for_sse are:
#
#             sse, sse2, sse3, ssse3, sse4.1, sse4.2, avx, avx2
#
# NOTE: ${output} is set to the empty string in cases where this function has
#       no flag for ${arch} with the active compiler (e.g., Visual Studio and
#       sse4.2, or any compiler that is not handled below).
#
# Example: pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS sse4.2)
function(PCL_SSE_COMPILER_ARCH_FLAG output arch)
  if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
    # sse -> -msse, sse4.2 -> -msse4.2
    set(${output} "-m${arch}" PARENT_SCOPE)
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    # Intel Compiler rules:
    #
    # If compiling for Intel CPU (https://software.intel.com/en-us/node/522845):
    #     Windows: /Qx<code>
    #     Unix:    -x<code>
    #
    # Otherwise:
    #     Windows: /arch:<code>
    #         https://software.intel.com/en-us/node/522822
    #     Unix:    -m<code>
    #         https://software.intel.com/en-us/node/522834
    #
    # As such, since we are doing all of the tests anyway, we will avoid -xHost
    # and /QxHost because those are only for Intel CPUs (their equivalent of
    # -march=native).
    #
    # The logic is included because the -x or /Qx versions include additional
    # optimizations that -m or /arch do not.
    if("${arch}" STREQUAL "sse")
      # No flag is emitted for plain SSE with the Intel compiler.
      set(${output} "" PARENT_SCOPE)
    else()
      # The -x, /Qx and /arch: spellings take upper-case codes (SSE4.2, AVX);
      # the GCC-compatible -m spelling keeps the lower-case code (-msse4.2).
      # NOTE(review): Intel documents the AVX2 code as CORE-AVX2; confirm that
      # the plain AVX2 spelling produced here is accepted.
      string(TOUPPER "${arch}" arch_upper)
      if(WIN32)
        if(PCL_HOST_PROCESSOR_IS_INTEL)
          set(${output} "/Qx${arch_upper}" PARENT_SCOPE)
        else()
          set(${output} "/arch:${arch_upper}" PARENT_SCOPE)
        endif()
      else()
        if(PCL_HOST_PROCESSOR_IS_INTEL)
          set(${output} "-x${arch_upper}" PARENT_SCOPE)
        else()
          set(${output} "-m${arch}" PARENT_SCOPE)
        endif()
      endif()
    endif()
  elseif(MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8) # 64 bit Visual Studio
    # Only SSE, SSE2, AVX, and AVX2 are mapped to /arch: for MSVC.
    # NOTE(review): confirm that every targeted MSVC version accepts /arch:SSE
    # and /arch:SSE2 when building for x64.
    string(TOUPPER "${arch}" arch_upper)
    set(msvc_archs SSE SSE2 AVX AVX2)
    # list(FIND) instead of if(... IN_LIST ...): the IN_LIST operator is only
    # recognised when policy CMP0057 is NEW, which is not guaranteed here.
    list(FIND msvc_archs "${arch_upper}" msvc_arch_index)
    if(msvc_arch_index EQUAL -1)
      set(${output} "" PARENT_SCOPE)
    else()
      set(${output} "/arch:${arch_upper}" PARENT_SCOPE)
    endif()
  else()
    # Unknown compiler (or 32-bit MSVC): no flag.
    set(${output} "" PARENT_SCOPE)
  endif()
endfunction()

###############################################################################
# Check for the presence of SSE and figure out the flags to use for it.
macro(PCL_CHECK_FOR_SSE)
set(SSE_FLAGS)
set(SSE_DEFINITIONS)

# Test CLANG
#if(CMAKE_COMPILER_IS_CLANG)
# if(APPLE)
# SET(SSE_FLAGS "${SSE_FLAGS} -march=native")
# endif(APPLE)
#endif(CMAKE_COMPILER_IS_CLANG)

# Test GCC/G++
if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} "-dumpversion"
OUTPUT_VARIABLE GCC_VERSION_STRING)
if(GCC_VERSION_STRING VERSION_GREATER 4.2 AND NOT APPLE AND NOT CMAKE_CROSSCOMPILING)
if (NOT CMAKE_CROSSCOMPILING)
if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
list(APPEND SSE_FLAGS "-march=native")
message(STATUS "Using CPU native flags for SSE optimization: ${SSE_FLAGS}")
endif()
endif()

# Unfortunately we need to check for SSE to enable "-mfpmath=sse" alongside
# Unfortunately we need to check for SSE to enable "-mfpmath=sse" alongside
# "-march=native". The reason for this is that by default, 32bit architectures
# tend to use the x87 FPU (which has 80 bit internal precision), thus leading
# to different results than 64bit architectures which are using SSE2 (64 bit internal
# precision). One solution would be to use "-ffloat-store" on 32bit (see
# precision). One solution would be to use "-ffloat-store" on 32bit (see
# http://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html), but that slows code down,
# so the preferred solution is to try "-mfpmath=sse" first.
include(CheckCXXSourceRuns)
set(CMAKE_REQUIRED_FLAGS)

check_cxx_source_runs("
#include <mm_malloc.h>
// Intel compiler defines an incompatible _mm_malloc signature
#if defined(__INTEL_COMPILER)
#include <malloc.h>
#else
#include <mm_malloc.h>
#endif
int main()
{
void* mem = _mm_malloc (100, 16);
Expand All @@ -49,10 +125,41 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_POSIX_MEMALIGN)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
set(CMAKE_REQUIRED_FLAGS "-msse4.2")
endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "avx2")
check_cxx_source_runs("
#include <immintrin.h>
int main ()
{
/* simple test: subtract result = left - right */
/* using _mm256_sub_epi32 is the 'test' (new in AVX2) */
__m256i left = _mm256_set_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
__m256i right = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
__m256i result = _mm256_sub_epi32 (right, left);

// result: {-7, -5, -3, -1, 1, 3, 5, 7}
// could check using int *i = (int *)&result;
return (0);
}"
HAVE_AVX2_EXTENSIONS)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "avx")
check_cxx_source_runs("
#include <immintrin.h>
int main ()
{
/* simple test: subtract result = left - right */
/* using _mm256_sub_ps is the 'test' (new in AVX) */
__m256 left = _mm256_set_ps (0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
__m256 right = _mm256_set_ps (7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
__m256 result = _mm256_sub_ps (right, left);

// result: {-7, -5, -3, -1, 1, 3, 5, 7}
// could check using float *f = (float *)&result;
return (0);
}"
HAVE_AVX_EXTENSIONS)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse4.2")
check_cxx_source_runs("
#include <emmintrin.h>
#include <nmmintrin.h>
Expand All @@ -73,10 +180,7 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_SSE4_2_EXTENSIONS)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
set(CMAKE_REQUIRED_FLAGS "-msse4.1")
endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse4.1")
check_cxx_source_runs("
#include <smmintrin.h>
int main ()
Expand All @@ -92,10 +196,7 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_SSE4_1_EXTENSIONS)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
set(CMAKE_REQUIRED_FLAGS "-mssse3")
endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "ssse3")
check_cxx_source_runs("
#include <tmmintrin.h>
int main ()
Expand All @@ -109,10 +210,7 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_SSSE3_EXTENSIONS)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
set(CMAKE_REQUIRED_FLAGS "-msse3")
endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse3")
check_cxx_source_runs("
#include <pmmintrin.h>
int main ()
Expand All @@ -126,12 +224,7 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_SSE3_EXTENSIONS)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
set(CMAKE_REQUIRED_FLAGS "-msse2")
elseif(MSVC AND NOT CMAKE_CL_64)
set(CMAKE_REQUIRED_FLAGS "/arch:SSE2")
endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse2")
check_cxx_source_runs("
#include <emmintrin.h>
int main ()
Expand All @@ -145,12 +238,7 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_SSE2_EXTENSIONS)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
set(CMAKE_REQUIRED_FLAGS "-msse")
elseif(MSVC AND NOT CMAKE_CL_64)
set(CMAKE_REQUIRED_FLAGS "/arch:SSE")
endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)

pcl_sse_compiler_arch_flag(CMAKE_REQUIRED_FLAGS "sse")
check_cxx_source_runs("
#include <xmmintrin.h>
int main ()
Expand All @@ -165,35 +253,50 @@ macro(PCL_CHECK_FOR_SSE)
}"
HAVE_SSE_EXTENSIONS)

# Make sure to un-set this variable so later code is not affected.
set(CMAKE_REQUIRED_FLAGS)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
if(HAVE_SSE4_2_EXTENSIONS)
list(APPEND SSE_FLAGS "-msse4.2" "-mfpmath=sse")
elseif(HAVE_SSE4_1_EXTENSIONS)
list(APPEND SSE_FLAGS "-msse4.1" "-mfpmath=sse")
elseif(HAVE_SSSE3_EXTENSIONS)
list(APPEND SSE_FLAGS "-mssse3" "-mfpmath=sse")
elseif(HAVE_SSE3_EXTENSIONS)
list(APPEND SSE_FLAGS "-msse3" "-mfpmath=sse")
elseif(HAVE_SSE2_EXTENSIONS)
list(APPEND SSE_FLAGS "-msse2" "-mfpmath=sse")
elseif(HAVE_SSE_EXTENSIONS)
list(APPEND SSE_FLAGS "-msse" "-mfpmath=sse")
else()
# Start: empty, if empty after all if-elseif signals no SSE/AVX support.
# Order matters: make sure to do "highest" vectorization checks first.
# NOTE: helper function sets to empty string on 32-bit MSVC build always.
set(architecture_flag)
if(HAVE_AVX2_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "avx2")
elseif(HAVE_AVX_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "avx")
elseif(HAVE_SSE4_2_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "sse4.2")
elseif(HAVE_SSE4_1_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "sse4.1")
elseif(HAVE_SSSE3_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "ssse3")
elseif(HAVE_SSE3_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "sse3")
elseif(HAVE_SSE2_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "sse2")
elseif(HAVE_SSE_EXTENSIONS)
pcl_sse_compiler_arch_flag(architecture_flag "sse")
endif()

if(architecture_flag)
list(APPEND SSE_FLAGS "${architecture_flag}")
# GCC, Clang, or Intel on Non-Windows (AKA Intel backed by GCC or Clang)
if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
list(APPEND SSE_FLAGS "-mfpmath=sse")
endif()
else()
if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG)
# Setting -ffloat-store to alleviate 32bit vs 64bit discrepancies on non-SSE
# platforms.
list(APPEND SSE_FLAGS "-ffloat-store")
endif()
elseif(MSVC AND NOT CMAKE_CL_64)
if(HAVE_SSE2_EXTENSIONS)
list(APPEND SSE_FLAGS "/arch:SSE2")
elseif(HAVE_SSE_EXTENSIONS)
list(APPEND SSE_FLAGS "/arch:SSE")
endif(HAVE_SSE2_EXTENSIONS)
endif()

# Erase architecture_flag
set(architecture_flag)

if(MSVC)
# TODO: why are these definitions here and what are the SSE4, AVX, and AVX2 ones?
if(HAVE_SSSE3_EXTENSIONS)
SET(SSE_DEFINITIONS "${SSE_DEFINITIONS} -D__SSSE3__")
endif()
Expand Down
Loading