From fd21d7c1ac458388afc077a0a021d08a3c3def05 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 30 Dec 2019 17:37:43 +0800 Subject: [PATCH] Switch to modern CMake CUDA handling (#17031) Introduce unified MXNET_CUDA_ARCH option to specify cuda architectures. Previously cuda architecture setting was partially broken and different options were applied to different parts of the build (CUDA_ARCH_NAME CUDA_ARCH_BIN CUDA_ARCH_PTX and CUDA_ARCH_LIST). Include FindCUDAToolkit from CMake 3.17, which replaces the deprecated FindCUDA functionality for finding the cuda toolkit include directories and libraries. Workaround for DLL size limitation on Windows (#16980) * change windows build system. add gen_warp cpp version add add_custom_command to run warp_gen add download cmake add option change option add dynamic read mxnet dll --- 3rdparty/mshadow/cmake/Cuda.cmake | 324 --------- 3rdparty/mshadow/cmake/Utils.cmake | 398 ----------- 3rdparty/mshadow/cmake/mshadow.cmake | 91 --- 3rdparty/mshadow/cmake/mshadowUtils.cmake | 2 - CMakeLists.txt | 422 +++++------ ci/build_windows.py | 8 +- ci/docker/install/ubuntu_core.sh | 2 +- ci/docker/runtime_functions.sh | 17 +- cmake/BuildTVM.cmake | 26 +- cmake/FirstClassLangCuda.cmake | 277 ------- cmake/Modules/FindCUDAToolkit.cmake | 833 ++++++++++++++++++++++ contrib/tvmop/compile.py | 5 + tools/windowsbuild/README.md | 19 + tools/windowsbuild/gen_warp.cpp | 209 ++++++ tools/windowsbuild/warp_dll.cpp | 151 ++++ 15 files changed, 1463 insertions(+), 1321 deletions(-) delete mode 100644 3rdparty/mshadow/cmake/Cuda.cmake delete mode 100644 3rdparty/mshadow/cmake/Utils.cmake delete mode 100644 3rdparty/mshadow/cmake/mshadow.cmake delete mode 100644 3rdparty/mshadow/cmake/mshadowUtils.cmake delete mode 100644 cmake/FirstClassLangCuda.cmake create mode 100644 cmake/Modules/FindCUDAToolkit.cmake create mode 100644 tools/windowsbuild/README.md create mode 100644 tools/windowsbuild/gen_warp.cpp create mode 100644 tools/windowsbuild/warp_dll.cpp diff --git a/3rdparty/mshadow/cmake/Cuda.cmake b/3rdparty/mshadow/cmake/Cuda.cmake deleted file mode 100644 index bc09a3905076..000000000000 --- a/3rdparty/mshadow/cmake/Cuda.cmake +++ /dev/null @@ -1,324 +0,0 @@ -if(NOT USE_CUDA) - return() -endif() - -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) - -################################################################################################ -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# mshadow_detect_installed_gpus(out_variable) -function(mshadow_detect_installed_gpus out_variable) -set(CUDA_gpu_detect_output "") - if(NOT CUDA_gpu_detect_output) - message(STATUS "Running GPU architecture autodetection") - set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${__cufile} "" - "#include \n" - "#include \n" - "using namespace std;\n" - "int main()\n" - "{\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) { return -1; }\n" - " if (count == 0) { cerr << \"No cuda devices detected\" << endl; return -1; }\n" - " for (int device = 0; device < count; ++device)\n" - " {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - if(MSVC) - #find vcvarsall.bat and run it building msvc environment - get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) - find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..") - execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out - OUTPUT_STRIP_TRAILING_WHITESPACE) - else() - if(CUDA_LIBRARY_PATH) - set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}") - endif() - execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH} - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - if(__nvcc_res EQUAL 0) - # nvcc outputs text containing line breaks when building with MSVC. - # The line below prevents CMake from inserting a variable with line - # breaks in the cache - message(STATUS "Found CUDA arch ${__nvcc_out}") - string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") - string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") - set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from mshadow_detect_gpus tool" FORCE) - else() - message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}") - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(WARNING "Automatic GPU detection failed. Building for all known architectures (${mshadow_known_gpu_archs}).") - set(${out_variable} ${mshadow_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -################################################################################################ -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# mshadow_select_nvcc_arch_flags(out_variable) -function(mshadow_select_nvcc_arch_flags out_variable) - # List of arch names - set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") - set(__archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND __archs_names "Auto") - set(__archs_name_default "Auto") - endif() - - # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) - set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") - set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} ) - mark_as_advanced(CUDA_ARCH_NAME) - - # verify CUDA_ARCH_NAME value - if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") - string(REPLACE ";" ", " __archs_names "${__archs_names}") - message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.") - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN ${mshadow_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) - else() - unset(CUDA_ARCH_BIN CACHE) - unset(CUDA_ARCH_PTX CACHE) - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Fermi") - set(__cuda_arch_bin "20 21(20)") - elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler") - set(__cuda_arch_bin "30 35") - elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - set(__cuda_arch_bin "50") - elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - set(__cuda_arch_bin "60 61") - elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - set(__cuda_arch_bin "70") - elseif(${CUDA_ARCH_NAME} STREQUAL "All") - set(__cuda_arch_bin ${mshadow_known_gpu_archs}) - elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") - mshadow_detect_installed_gpus(__cuda_arch_bin) - else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") - set(__cuda_arch_bin ${CUDA_ARCH_BIN}) - endif() - - # remove dots and convert to lists - string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}") - string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") - mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx) - - set(__nvcc_flags "") - set(__nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(__arch ${__cuda_arch_bin}) - if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified PTX for the concrete BIN - list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) - list(APPEND __nvcc_archs_readable sm_${__arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(__arch ${__cuda_arch_ptx}) - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) - list(APPEND __nvcc_archs_readable compute_${__arch}) - endforeach() - - string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") - set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Short command for cuda comnpilation -# Usage: -# mshadow_cuda_compile( ) -macro(mshadow_cuda_compile objlist_variable) - foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) - set(${var}_backup_in_cuda_compile_ "${${var}}") - - # we remove /EHa as it generates warnings under windows - string(REPLACE "/EHa" "" ${var} "${${var}}") - - endforeach() - if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC) - endif() - - if(APPLE) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) - endif() - - set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G") - - if(MSVC) - # disable noisy warnings: - # 4819: The file contains a character that cannot be represented in the current code page (number). - list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819") - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) - endif() - - # If the build system is a container, make sure the nvcc intermediate files - # go into the build output area rather than in /tmp, which may run out of space - if(IS_CONTAINER_BUILD) - set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}") - message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}") - list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}") - endif() - - cuda_compile(cuda_objcs ${ARGN}) - - foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) - set(${var} "${${var}_backup_in_cuda_compile_}") - unset(${var}_backup_in_cuda_compile_) - endforeach() - - set(${objlist_variable} ${cuda_objcs}) -endmacro() - -################################################################################################ -# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution. -# That's why not FindcuDNN.cmake file, but just the macro -# Usage: -# detect_cuDNN() -function(detect_cuDNN) - set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder") - - find_path(CUDNN_INCLUDE cudnn.h - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDA_TOOLKIT_INCLUDE} - DOC "Path to cuDNN include directory." ) - - get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist} - DOC "Path to cuDNN library.") - - if(CUDNN_INCLUDE AND CUDNN_LIBRARY) - set(HAVE_CUDNN TRUE PARENT_SCOPE) - set(CUDNN_FOUND TRUE PARENT_SCOPE) - - mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT) - message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})") - endif() -endfunction() - - -################################################################################################ -### Non macro section -################################################################################################ - -# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so -if(NOT CUDA_TOOLKIT_ROOT_DIR) - find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64) - if(CUDA_LIBRARY_PATH) - get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY) - set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..") - endif() -endif() - -find_package(CUDA 5.5 QUIET REQUIRED) -find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand - -if(NOT CUDA_FOUND) - return() -endif() - -set(HAVE_CUDA TRUE) -message(STATUS "CUDA detected: " ${CUDA_VERSION}) -include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) -list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY} - ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) - -# Known NVIDIA GPU achitectures mshadow can be compiled for. -# This list will be used for CUDA_ARCH_NAME = All option -if(CUDA_ARCH_ALL) - set(mshadow_known_gpu_archs "${CUDA_ARCH_ALL}") -else() - if(${CUDA_VERSION} EQUAL 9.0 OR ${CUDA_VERSION} GREATER 9.0) - set(mshadow_known_gpu_archs "30 35 50 52 60 61 70") - elseif(${CUDA_VERSION} EQUAL 8.0 OR ${CUDA_VERSION} GREATER 8.0) - set(mshadow_known_gpu_archs "30 35 50 52 60 61") - else() - set(mshadow_known_gpu_archs "30 35 50 52") - endif() -endif() - -# cudnn detection -if(USE_CUDNN) - detect_cuDNN() - if(HAVE_CUDNN) - add_definitions(-DUSE_CUDNN) - include_directories(SYSTEM ${CUDNN_INCLUDE}) - list(APPEND mshadow_LINKER_LIBS ${CUDNN_LIBRARY}) - endif() -endif() - -# setting nvcc arch flags -mshadow_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) -list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") - -# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or -# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt -if(Boost_VERSION EQUAL 105500) - message(STATUS "Cuda + Boost 1.55: Applying noinline work around") - # avoid warning for CMake >= 2.8.12 - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ") -endif() - -# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. -foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) - list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag}) -endforeach() - -# setting default testing device -if(NOT CUDA_TEST_DEVICE) - set(CUDA_TEST_DEVICE -1) -endif() - -mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) -mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) - -# Handle clang/libc++ issue -if(APPLE) - mshadow_detect_darwin_version(OSX_VERSION) - - # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits - if(OSX_VERSION VERSION_GREATER 10.8) - # enabled by default if and only if CUDA version is less than 7.0 - mshadow_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0)) - endif() -endif() diff --git a/3rdparty/mshadow/cmake/Utils.cmake b/3rdparty/mshadow/cmake/Utils.cmake deleted file mode 100644 index dc464f0092f5..000000000000 --- a/3rdparty/mshadow/cmake/Utils.cmake +++ /dev/null @@ -1,398 +0,0 @@ -################################################################################################ -# Command alias for debugging messages -# Usage: -# dmsg() -function(dmsg) - message(STATUS ${ARGN}) -endfunction() - -################################################################################################ -# Removes duplicates from list(s) -# Usage: -# mshadow_list_unique( [] [...]) -macro(mshadow_list_unique) - foreach(__lst ${ARGN}) - if(${__lst}) - list(REMOVE_DUPLICATES ${__lst}) - endif() - endforeach() -endmacro() - -################################################################################################ -# Clears variables from list -# Usage: -# mshadow_clear_vars() -macro(mshadow_clear_vars) - foreach(_var ${ARGN}) - unset(${_var}) - endforeach() -endmacro() - -################################################################################################ -# Removes duplicates from string -# Usage: -# mshadow_string_unique() -function(mshadow_string_unique __string) - if(${__string}) - set(__list ${${__string}}) - separate_arguments(__list) - list(REMOVE_DUPLICATES __list) - foreach(__e ${__list}) - set(__str "${__str} ${__e}") - endforeach() - set(${__string} ${__str} PARENT_SCOPE) - endif() -endfunction() - -################################################################################################ -# Prints list element per line -# Usage: -# mshadow_print_list() -function(mshadow_print_list) - foreach(e ${ARGN}) - message(STATUS ${e}) - endforeach() -endfunction() - -################################################################################################ -# Function merging lists of compiler flags to single string. -# Usage: -# mshadow_merge_flag_lists(out_variable [] [] ...) -function(mshadow_merge_flag_lists out_var) - set(__result "") - foreach(__list ${ARGN}) - foreach(__flag ${${__list}}) - string(STRIP ${__flag} __flag) - set(__result "${__result} ${__flag}") - endforeach() - endforeach() - string(STRIP ${__result} __result) - set(${out_var} ${__result} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Converts all paths in list to absolute -# Usage: -# mshadow_convert_absolute_paths() -function(mshadow_convert_absolute_paths variable) - set(__dlist "") - foreach(__s ${${variable}}) - get_filename_component(__abspath ${__s} ABSOLUTE) - list(APPEND __list ${__abspath}) - endforeach() - set(${variable} ${__list} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Reads set of version defines from the header file -# Usage: -# mshadow_parse_header( ..) -macro(mshadow_parse_header FILENAME FILE_VAR) - set(vars_regex "") - set(__parnet_scope OFF) - set(__add_cache OFF) - foreach(name ${ARGN}) - if("${name}" STREQUAL "PARENT_SCOPE") - set(__parnet_scope ON) - elseif("${name}" STREQUAL "CACHE") - set(__add_cache ON) - elseif(vars_regex) - set(vars_regex "${vars_regex}|${name}") - else() - set(vars_regex "${name}") - endif() - endforeach() - if(EXISTS "${FILENAME}") - file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" ) - else() - unset(${FILE_VAR}) - endif() - foreach(name ${ARGN}) - if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE") - if(${FILE_VAR}) - if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*") - string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}") - else() - set(${name} "") - endif() - if(__add_cache) - set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE) - elseif(__parnet_scope) - set(${name} "${${name}}" PARENT_SCOPE) - endif() - else() - unset(${name} CACHE) - endif() - endif() - endforeach() -endmacro() - -################################################################################################ -# Reads single version define from the header file and parses it -# Usage: -# mshadow_parse_header_single_define( ) -function(mshadow_parse_header_single_define LIBNAME HDR_PATH VARNAME) - set(${LIBNAME}_H "") - if(EXISTS "${HDR_PATH}") - file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1) - endif() - - if(${LIBNAME}_H) - string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}") - string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}") - string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}") - set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) - set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) - set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) - set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) - - # append a TWEAK version if it exists: - set(${LIBNAME}_VERSION_TWEAK "") - if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") - set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) - endif() - if(${LIBNAME}_VERSION_TWEAK) - set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) - else() - set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) - endif() - endif() -endfunction() - -######################################################################################################## -# An option that the user can select. Can accept condition to control when option is available for user. -# Usage: -# mshadow_option( "doc string" [IF ]) -function(mshadow_option variable description value) - set(__value ${value}) - set(__condition "") - set(__varname "__value") - foreach(arg ${ARGN}) - if(arg STREQUAL "IF" OR arg STREQUAL "if") - set(__varname "__condition") - else() - list(APPEND ${__varname} ${arg}) - endif() - endforeach() - unset(__varname) - if("${__condition}" STREQUAL "") - set(__condition 2 GREATER 1) - endif() - - if(${__condition}) - if("${__value}" MATCHES ";") - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - elseif(DEFINED ${__value}) - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - else() - option(${variable} "${description}" ${__value}) - endif() - else() - unset(${variable} CACHE) - endif() -endfunction() - -################################################################################################ -# Utility macro for comparing two lists. Used for CMake debugging purposes -# Usage: -# mshadow_compare_lists( [description]) -function(mshadow_compare_lists list1 list2 desc) - set(__list1 ${${list1}}) - set(__list2 ${${list2}}) - list(SORT __list1) - list(SORT __list2) - list(LENGTH __list1 __len1) - list(LENGTH __list2 __len2) - - if(NOT ${__len1} EQUAL ${__len2}) - message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") - endif() - - foreach(__i RANGE 1 ${__len1}) - math(EXPR __index "${__i}- 1") - list(GET __list1 ${__index} __item1) - list(GET __list2 ${__index} __item2) - if(NOT ${__item1} STREQUAL ${__item2}) - message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. ${desc}") - endif() - endforeach() -endfunction() - -################################################################################################ -# Command for disabling warnings for different platforms (see below for gcc and VisualStudio) -# Usage: -# mshadow_warnings_disable( -Wshadow /wd4996 ..,) -macro(mshadow_warnings_disable) - set(_flag_vars "") - set(_msvc_warnings "") - set(_gxx_warnings "") - - foreach(arg ${ARGN}) - if(arg MATCHES "^CMAKE_") - list(APPEND _flag_vars ${arg}) - elseif(arg MATCHES "^/wd") - list(APPEND _msvc_warnings ${arg}) - elseif(arg MATCHES "^-W") - list(APPEND _gxx_warnings ${arg}) - endif() - endforeach() - - if(NOT _flag_vars) - set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) - endif() - - if(MSVC AND _msvc_warnings) - foreach(var ${_flag_vars}) - foreach(warning ${_msvc_warnings}) - set(${var} "${${var}} ${warning}") - endforeach() - endforeach() - elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) - foreach(var ${_flag_vars}) - foreach(warning ${_gxx_warnings}) - if(NOT warning MATCHES "^-Wno-") - string(REPLACE "${warning}" "" ${var} "${${var}}") - string(REPLACE "-W" "-Wno-" warning "${warning}") - endif() - set(${var} "${${var}} ${warning}") - endforeach() - endforeach() - endif() - mshadow_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) -endmacro() - -################################################################################################ -# Helper function get current definitions -# Usage: -# mshadow_get_current_definitions() -function(mshadow_get_current_definitions definitions_var) - get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) - set(result "") - - foreach(d ${current_definitions}) - list(APPEND result -D${d}) - endforeach() - - mshadow_list_unique(result) - set(${definitions_var} ${result} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Helper function get current includes/definitions -# Usage: -# mshadow_get_current_cflags() -function(mshadow_get_current_cflags cflags_var) - get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) - mshadow_convert_absolute_paths(current_includes) - mshadow_get_current_definitions(cflags) - - foreach(i ${current_includes}) - list(APPEND cflags "-I${i}") - endforeach() - - mshadow_list_unique(cflags) - set(${cflags_var} ${cflags} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Helper function to parse current linker libs into link directories, libflags and osx frameworks -# Usage: -# mshadow_parse_linker_libs( ) -function(mshadow_parse_linker_libs mshadow_LINKER_LIBS_variable folders_var flags_var frameworks_var) - - set(__unspec "") - set(__debug "") - set(__optimized "") - set(__framework "") - set(__varname "__unspec") - - # split libs into debug, optimized, unspecified and frameworks - foreach(list_elem ${${mshadow_LINKER_LIBS_variable}}) - if(list_elem STREQUAL "debug") - set(__varname "__debug") - elseif(list_elem STREQUAL "optimized") - set(__varname "__optimized") - elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") - list(APPEND __framework -framework ${CMAKE_MATCH_1}) - else() - list(APPEND ${__varname} ${list_elem}) - set(__varname "__unspec") - endif() - endforeach() - - # attach debug or optimized libs to unspecified according to current configuration - if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(__libs ${__unspec} ${__debug}) - else() - set(__libs ${__unspec} ${__optimized}) - endif() - - set(libflags "") - set(folders "") - - # convert linker libraries list to link flags - foreach(lib ${__libs}) - if(TARGET ${lib}) - list(APPEND folders $) - list(APPEND libflags -l${lib}) - elseif(lib MATCHES "^-l.*") - list(APPEND libflags ${lib}) - elseif(IS_ABSOLUTE ${lib}) - get_filename_component(name_we ${lib} NAME_WE) - get_filename_component(folder ${lib} PATH) - - string(REGEX MATCH "^lib(.*)" __match ${name_we}) - list(APPEND libflags -l${CMAKE_MATCH_1}) - list(APPEND folders ${folder}) - else() - message(FATAL_ERROR "Logic error. Need to update cmake script") - endif() - endforeach() - - mshadow_list_unique(libflags folders) - - set(${folders_var} ${folders} PARENT_SCOPE) - set(${flags_var} ${libflags} PARENT_SCOPE) - set(${frameworks_var} ${__framework} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... -# Usage: -# mshadow_detect_darwin_version() -function(mshadow_detect_darwin_version output_var) - if(APPLE) - execute_process(COMMAND /usr/bin/sw_vers -productVersion - RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - set(${output_var} ${__sw_vers_out} PARENT_SCOPE) - else() - set(${output_var} "" PARENT_SCOPE) - endif() -endfunction() - -################################################################################################ -# Convenient command to setup source group for IDEs that support this feature (VS, XCode) -# Usage: -# caffe_source_group( GLOB[_RECURSE] ) -function(mshadow_source_group group) - cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN}) - if(CAFFE_SOURCE_GROUP_GLOB) - file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB}) - source_group(${group} FILES ${srcs1}) - endif() - - if(CAFFE_SOURCE_GROUP_GLOB_RECURSE) - file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE}) - source_group(${group} FILES ${srcs2}) - endif() -endfunction() \ No newline at end of file diff --git a/3rdparty/mshadow/cmake/mshadow.cmake b/3rdparty/mshadow/cmake/mshadow.cmake deleted file mode 100644 index 1ef76988d8d0..000000000000 --- a/3rdparty/mshadow/cmake/mshadow.cmake +++ /dev/null @@ -1,91 +0,0 @@ -set(mshadow_LINKER_LIBS "") - -set(BLAS "Open" CACHE STRING "Selected BLAS library") -set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") - -if(DEFINED USE_BLAS) - set(BLAS "${USE_BLAS}") -else() - if(USE_MKL_IF_AVAILABLE) - if(NOT MKL_FOUND) - find_package(MKL) - endif() - if(MKL_FOUND) - set(BLAS "MKL") - endif() - endif() -endif() - -if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") - find_package(Atlas REQUIRED) - include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${Atlas_LIBRARIES}) - add_definitions(-DMSHADOW_USE_CBLAS=1) - add_definitions(-DMSHADOW_USE_MKL=0) -elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") - find_package(OpenBLAS REQUIRED) - include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${OpenBLAS_LIB}) - add_definitions(-DMSHADOW_USE_CBLAS=1) - add_definitions(-DMSHADOW_USE_MKL=0) -elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") - find_package(MKL REQUIRED) - include_directories(SYSTEM ${MKL_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${MKL_LIBRARIES}) - add_definitions(-DMSHADOW_USE_CBLAS=0) - add_definitions(-DMSHADOW_USE_MKL=1) -elseif(BLAS STREQUAL "apple") - find_package(Accelerate REQUIRED) - include_directories(SYSTEM ${Accelerate_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) - add_definitions(-DMSHADOW_USE_MKL=0) - add_definitions(-DMSHADOW_USE_CBLAS=1) -endif() - -if(SUPPORT_MSSE2) - add_definitions(-DMSHADOW_USE_SSE=1) -else() - add_definitions(-DMSHADOW_USE_SSE=0) -endif() - -if(NOT DEFINED SUPPORT_F16C AND NOT MSVC) - check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - execute_process(COMMAND cat /proc/cpuinfo - COMMAND grep flags - COMMAND grep f16c - OUTPUT_VARIABLE CPU_SUPPORT_F16C) - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - execute_process(COMMAND sysctl -a - COMMAND grep machdep.cpu.features - COMMAND grep F16C - OUTPUT_VARIABLE CPU_SUPPORT_F16C) - endif() - if(NOT CPU_SUPPORT_F16C) - message("CPU does not support F16C instructions") - endif() - if(CPU_SUPPORT_F16C AND COMPILER_SUPPORT_MF16C) - set(SUPPORT_F16C TRUE) - endif() -endif() - -if(SUPPORT_F16C) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") -else() - add_definitions(-DMSHADOW_USE_F16C=0) -endif() - -if(USE_CUDA) - find_package(CUDA 5.5 QUIET) - find_cuda_helper_libs(curand) - if(NOT CUDA_FOUND) - message(FATAL_ERROR "-- CUDA is disabled.") - endif() - add_definitions(-DMSHADOW_USE_CUDA=1) - add_definitions(-DMSHADOW_FORCE_STREAM) - include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) - list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY} - ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) -else() - add_definitions(-DMSHADOW_USE_CUDA=0) -endif() diff --git a/3rdparty/mshadow/cmake/mshadowUtils.cmake b/3rdparty/mshadow/cmake/mshadowUtils.cmake deleted file mode 100644 index d4b8bfc89b7a..000000000000 --- a/3rdparty/mshadow/cmake/mshadowUtils.cmake +++ /dev/null @@ -1,2 +0,0 @@ -include("${CMAKE_CURRENT_LIST_DIR}/Utils.cmake") - diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d329f5f1079..9e4ef25e5af4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.0.2) +cmake_minimum_required(VERSION 3.13) # workaround to store CMAKE_CROSSCOMPILING because is getting reset by the project command if(CMAKE_CROSSCOMPILING) @@ -18,39 +18,57 @@ endif() include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake) +include(CMakeDependentOption) #Some things have order. This must be put in front alone -mxnet_option(USE_CUDA "Build with CUDA support" ON) -mxnet_option(USE_OLDCMAKECUDA "Build with old cmake cuda" OFF) -mxnet_option(USE_NCCL "Use NVidia NCCL with CUDA" OFF) -mxnet_option(USE_OPENCV "Build with OpenCV support" ON) -mxnet_option(USE_OPENMP "Build with Openmp support" ON) -mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path -mxnet_option(USE_SSE "Build with x86 SSE instruction support" ON IF NOT ARM) -mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON -mxnet_option(USE_LAPACK "Build with lapack support" ON) -mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) -mxnet_option(USE_MKLDNN "Build with MKL-DNN support" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) -mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) -mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) -mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) -mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) -mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) -mxnet_option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) -mxnet_option(USE_CPP_PACKAGE "Build C++ Package" OFF) -mxnet_option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) -mxnet_option(USE_GPROF "Compile with gprof (profiling) flag" OFF) -mxnet_option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) -mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path -mxnet_option(USE_TVM_OP "Enable use of TVM operator build system." OFF) -mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) -mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) -mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) -mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) -mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) -mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) -mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) -mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) -mxnet_option(BUILD_CYTHON_MODULES "Build cython modules." OFF) +option(USE_CUDA "Build with CUDA support" ON) +set(MXNET_CUDA_ARCH "Auto" CACHE STRING "Target NVIDIA GPU achitecture. +Format: Auto | Common | All | LIST(ARCH_AND_PTX ...) +- \"Auto\" detects local machine GPU compute arch at runtime. +- \"Common\" and \"All\" cover common and entire subsets of architectures +- ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX +- NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing +- NUM: Any number. Only those pairs are currently accepted by NVCC though: + 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5") +option(USE_NCCL "Use NVidia NCCL with CUDA" OFF) +option(USE_OPENCV "Build with OpenCV support" ON) +option(USE_OPENMP "Build with Openmp support" ON) +cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path +cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT ARM" OFF) +option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON +option(USE_LAPACK "Build with lapack support" ON) +option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) +if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) + option(USE_MKLDNN "Build with MKL-DNN support" ON) +else() + option(USE_MKLDNN "Build with MKL-DNN support" OFF) +endif() +if(NOT MSVC) + option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON) +else() + option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" OFF) +endif() +option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) +option(USE_JEMALLOC "Build with Jemalloc support" ON) +option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) +option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) +option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) +option(USE_CPP_PACKAGE "Build C++ Package" OFF) +option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) +option(USE_GPROF "Compile with gprof (profiling) flag" OFF) +option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) +option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path +option(USE_TVM_OP "Enable use of TVM operator build system." OFF) +option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) +option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) +option(INSTALL_EXAMPLES "Install the example source files." OFF) +option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) +option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) +option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) +option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) +option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) +option(BUILD_CYTHON_MODULES "Build cython modules." OFF) +cmake_dependent_option(USE_SPLIT_ARCH_DLL "Build a separate DLL for each Cuda arch (Windows only)." ON "MSVC" OFF) + message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -62,31 +80,29 @@ if(USE_TVM_OP) add_definitions(-DMXNET_USE_TVM_OP=1) endif() -if(USE_CUDA AND NOT USE_OLDCMAKECUDA) - message(STATUS "CMake version '${CMAKE_VERSION}' using generator '${CMAKE_GENERATOR}'") - if( - ( - (${CMAKE_GENERATOR} MATCHES "Visual Studio.*") - OR (${CMAKE_GENERATOR} MATCHES "Xcode.*") - OR (${CMAKE_GENERATOR} STREQUAL "Unix Makefiles") - ) AND ( - (${CMAKE_VERSION} VERSION_GREATER "3.9.0") OR (${CMAKE_VERSION} VERSION_EQUAL "3.9.0") - ) - ) - set(FIRST_CUDA TRUE) - project(mxnet C CXX CUDA) - else() - set(FIRST_CUDA FALSE) - set(USE_OLDCMAKECUDA TRUE) - project(mxnet C CXX) +message(STATUS "CMake version '${CMAKE_VERSION}' using generator '${CMAKE_GENERATOR}'") +project(mxnet C CXX) +if(USE_CUDA) + cmake_minimum_required(VERSION 3.13.2) # CUDA 10 (Turing) detection available starting 3.13.2 + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 11) + include(CheckCXXCompilerFlag) + if(USE_CXX14_IF_AVAILABLE) + check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) + if (SUPPORT_CXX14) + set(CMAKE_CUDA_STANDARD 14) + endif() endif() -else() - project(mxnet C CXX) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() +if(UNIX) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) +endif() if(MSVC) set(SYSTEM_ARCHITECTURE x86_64) + enable_language(ASM_MASM) else() execute_process(COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE SYSTEM_ARCHITECTURE) endif() @@ -102,7 +118,8 @@ endif() #Switch off modern thread local for dmlc-core, please see: https://github.com/dmlc/dmlc-core/issues/571#issuecomment-543467484 add_definitions(-DDMLC_MODERN_THREAD_LOCAL=0) - +# disable stack trace in exception by default. +add_definitions(-DDMLC_LOG_STACK_TRACE_SIZE=0) if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) @@ -119,7 +136,7 @@ if(MSVC) endif() set(CMAKE_C_FLAGS "/MP") set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /bigobj") -else(MSVC) +else() include(CheckCXXCompilerFlag) if(USE_CXX14_IF_AVAILABLE) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) @@ -132,6 +149,7 @@ else(MSVC) check_cxx_compiler_flag("-msse3" SUPPORT_MSSE3) check_cxx_compiler_flag("-msse2" SUPPORT_MSSE2) else() + set(SUPPORT_MSSE3 FALSE) set(SUPPORT_MSSE2 FALSE) endif() # For cross complication, turn off flag if target device does not support it @@ -148,7 +166,6 @@ else(MSVC) else() add_definitions(-DMSHADOW_USE_F16C=0) endif() - set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas -Wno-sign-compare") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-braced-scalar-init") @@ -166,8 +183,12 @@ else(MSVC) endif() if(SUPPORT_MSSE3) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + add_definitions(-DMSHADOW_USE_SSE=1) elseif(SUPPORT_MSSE2) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2") + add_definitions(-DMSHADOW_USE_SSE=1) + else() + add_definitions(-DMSHADOW_USE_SSE=0) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") if(SUPPORT_CXX14) @@ -240,7 +261,7 @@ if(USE_TENSORRT) endif() # please note that when you enable this, you might run into an linker not being able to work properly due to large code injection. -# you can find more information here https://github.com/apache/incubator-mxnet/issues/15971 +# you can find more information here https://github.com/apache/incubator-mxnet/issues/15971 if(ENABLE_TESTCOVERAGE) message(STATUS "Compiling with test coverage support enabled. This will result in additional files being written to your source directory!") find_program( GCOV_PATH gcov ) @@ -258,67 +279,28 @@ endif() if(USE_MKLDNN) # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3). if(MSVC) - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc /MT") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy /MT") endif() set(MKLDNN_BUILD_TESTS OFF CACHE INTERNAL "" FORCE) set(MKLDNN_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE) set(MKLDNN_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE) - set(MKLDNN_USE_MKL NONE CACHE INTERNAL "" FORCE) set(MKLDNN_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE) + set(MKLDNN_LIBRARY_TYPE STATIC CACHE INTERNAL "" FORCE) add_subdirectory(3rdparty/mkldnn) include_directories(3rdparty/mkldnn/include) include_directories(${PROJECT_BINARY_DIR}/3rdparty/mkldnn/include) add_definitions(-DMXNET_USE_MKLDNN=1) - list(APPEND mxnet_LINKER_LIBS mkldnn) + list(APPEND mxnet_LINKER_LIBS dnnl) endif() # Allow Cuda compiles outside of src tree to find things in 'src' and 'include' include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) -if(USE_CUDA) - find_package(CUDA REQUIRED) - add_definitions(-DMSHADOW_USE_CUDA=1) - if(FIRST_CUDA AND (NOT USE_OLDCMAKECUDA)) - if(NOT CUDA_TOOLSET) - set(CUDA_TOOLSET "${CUDA_VERSION_STRING}") - endif() - else() - set(FIRST_CUDA FALSE) - endif() - if(USE_NCCL) - find_package(NCCL) - if(NCCL_FOUND) - include_directories(${NCCL_INCLUDE_DIRS}) - list(APPEND mxnet_LINKER_LIBS ${NCCL_LIBRARIES}) - else() - message(WARNING "Could not find NCCL libraries") - endif() - endif() - if(UNIX) - find_package(NVTX) - if(NVTX_FOUND) - include_directories(${NVTX_INCLUDE_DIRS}) - list(APPEND mxnet_LINKER_LIBS ${NVTX_LIBRARIES}) - add_definitions(-DMXNET_USE_NVTX=1) - else() - message(WARNING "Could not find NVTX libraries") - endif() - endif() -else() - add_definitions(-DMSHADOW_USE_CUDA=0) -endif() - -if(NCCL_FOUND) - add_definitions(-DMXNET_USE_NCCL=1) -else() - add_definitions(-DMXNET_USE_NCCL=0) -endif() - if (USE_INT64_TENSOR_SIZE) message(STATUS "Using 64-bit integer for tensor size") add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1) @@ -327,21 +309,6 @@ else() endif() include(cmake/ChooseBlas.cmake) -if(USE_CUDA AND FIRST_CUDA) - include(3rdparty/mshadow/cmake/Utils.cmake) - include(cmake/FirstClassLangCuda.cmake) - include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -else() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mshadow/cmake) - include(3rdparty/mshadow/cmake/mshadow.cmake) - include(3rdparty/mshadow/cmake/Utils.cmake) - include(3rdparty/mshadow/cmake/Cuda.cmake) - else() - include(mshadowUtils) - include(Cuda) - include(mshadow) - endif() -endif() if(USE_ASAN) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer -fsanitize=address") @@ -436,6 +403,16 @@ endif() # ---[ OpenMP if(USE_OPENMP) + + function(load_omp) + # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp + set(OPENMP_STANDALONE_BUILD TRUE) + set(LIBOMP_ENABLE_SHARED TRUE) + set(CMAKE_BUILD_TYPE Release) + set(OPENMP_ENABLE_LIBOMPTARGET OFF CACHE BOOL "LLVM OpenMP offloading support") # Requires CMP0077 CMake 3.13 + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp) + endfunction() + find_package(OpenMP REQUIRED) # This should build on Windows, but there's some problem and I don't have a Windows box, so # could a Windows user please fix? @@ -443,11 +420,7 @@ if(USE_OPENMP) AND SYSTEM_ARCHITECTURE STREQUAL "x86_64" AND NOT MSVC AND NOT CMAKE_CROSSCOMPILING) - - # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp - set(OPENMP_STANDALONE_BUILD TRUE) - set(LIBOMP_ENABLE_SHARED TRUE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp) + load_omp() list(REMOVE_ITEM mxnet_LINKER_LIBS iomp5) list(APPEND mxnet_LINKER_LIBS omp) if(UNIX) @@ -502,13 +475,15 @@ add_subdirectory(${GTEST_ROOT}) find_package(GTest REQUIRED) # cudnn detection -if(USE_CUDNN AND USE_CUDA) - detect_cuDNN() - if(HAVE_CUDNN) +if(USE_CUDNN) + find_package(CUDNN) + if(CUDNN_FOUND) add_definitions(-DUSE_CUDNN) include_directories(SYSTEM ${CUDNN_INCLUDE}) list(APPEND mxnet_LINKER_LIBS ${CUDNN_LIBRARY}) - add_definitions(-DMSHADOW_USE_CUDNN=1) + add_definitions(-DMSHADOW_USE_CUDNN=1) + else() + set(USE_CUDNN OFF) endif() endif() @@ -516,9 +491,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/cmake) add_subdirectory("3rdparty/dmlc-core") endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mshadow/cmake) - add_subdirectory("3rdparty/mshadow") -endif() +add_subdirectory("3rdparty/mshadow") FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") @@ -618,61 +591,63 @@ if(MSVC) endif() if(USE_CUDA) - if(FIRST_CUDA) - mshadow_select_nvcc_arch_flags(NVCC_FLAGS_ARCH) - string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}") - set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS_ARCH}") - list(APPEND mxnet_LINKER_LIBS cublas cufft cusolver curand) - if(ENABLE_CUDA_RTC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + # CUDA_SELECT_NVCC_ARCH_FLAGS is not deprecated, though part of deprecated + # FindCUDA https://gitlab.kitware.com/cmake/cmake/issues/19199 + include(${CMAKE_ROOT}/Modules/FindCUDA/select_compute_arch.cmake) + CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${MXNET_CUDA_ARCH}) + message("-- CUDA: Using the following NVCC architecture flags ${CUDA_ARCH_FLAGS}") + set(arch_code_list) + foreach(arch_str ${CUDA_ARCH_FLAGS}) + if((arch_str MATCHES ".*sm_[0-9]+")) + string( REGEX REPLACE ".*sm_([0-9]+)" "\\1" arch_code ${arch_str} ) + list(APPEND arch_code_list ${arch_code}) endif() - list(APPEND SOURCE ${CUDA}) - add_definitions(-DMXNET_USE_CUDA=1) - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) - else() - list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES}) - # define preprocessor macro so that we will not include the generated forcelink header - if(ENABLE_CUDA_RTC) + endforeach() + + string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}") + + + find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand + OPTIONAL_COMPONENTS nvToolsExt nvrtc) + + list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand) + if(ENABLE_CUDA_RTC) + if(CUDA_nvrtc_LIBRARY) + list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda) add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + else() + message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." ) endif() - # Create '.cmake' files for cuda compiles given definitions added thus far - mshadow_cuda_compile(cuda_objs ${CUDA}) - if(MSVC) - if(ENABLE_CUDA_RTC) - FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) - set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") - list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) - endif() - FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator - FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/win32) - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) - else(MSVC) - list(APPEND mxnet_LINKER_LIBS cufft cusolver) - if(ENABLE_CUDA_RTC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - endif() - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") + endif() + list(APPEND SOURCE ${CUDA}) + add_definitions(-DMXNET_USE_CUDA=1) + add_definitions(-DMSHADOW_USE_CUDA=1) + add_definitions(-DMSHADOW_FORCE_STREAM) + + if(USE_NCCL) + find_package(NCCL) + if(NCCL_FOUND) + include_directories(${NCCL_INCLUDE_DIRS}) + list(APPEND mxnet_LINKER_LIBS ${NCCL_LIBRARIES}) + add_definitions(-DMXNET_USE_NCCL=1) + else() + add_definitions(-DMXNET_USE_NCCL=0) + message(WARNING "Could not find NCCL libraries") endif() - list(APPEND SOURCE ${cuda_objs} ${CUDA}) - add_definitions(-DMXNET_USE_CUDA=1) - if(CUDA_LIBRARY_PATH) - if(IS_CONTAINER_BUILD) - # In case of building on a production-like build container which may not have Cuda installed - if(NOT CMAKE_SYSTEM_HAS_CUDA) - # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine) - # so use the stub cuda driver shared library - if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so) - link_directories(${CUDA_LIBRARY_PATH}/stubs) - endif() - endif() - endif() + endif() + if(UNIX) + if(CUDA_nvToolsExt_LIBRARY) + list(APPEND mxnet_LINKER_LIBS CUDA::nvToolsExt) + add_definitions(-DMXNET_USE_NVTX=1) + else() + message("Building without NVTX support.") endif() - endif() + endif() + + include_directories(${CUDAToolkit_INCLUDE_DIRS}) + link_directories(${CUDAToolkit_LIBRARY_DIR}) +else() + add_definitions(-DMSHADOW_USE_CUDA=0) endif() # unsupported: if caffe is a subdirectory of mxnet, load its CMakeLists.txt as well @@ -699,10 +674,11 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_api/mylib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc) target_include_directories(sample_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) + string(APPEND CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACES}") # Create dummy file since we want an empty shared library before linking set(DUMMY_SOURCE ${CMAKE_BINARY_DIR}/dummy.c) file(WRITE ${DUMMY_SOURCE} "") @@ -714,32 +690,66 @@ if(UNIX) target_link_libraries(mxnet_static PUBLIC ${CMAKE_DL_LIBS}) target_compile_options(sample_lib PUBLIC -shared) set_target_properties(mxnet_static PROPERTIES OUTPUT_NAME mxnet) -else() - add_library(mxnet SHARED ${SOURCE}) +elseif(MSVC) target_compile_options(sample_lib PUBLIC /LD) set_target_properties(sample_lib PROPERTIES PREFIX "lib") -endif() -if(USE_CUDA) - if(FIRST_CUDA AND MSVC) - target_compile_options(mxnet PUBLIC "$<$:-Xcompiler=-MTd -Gy>") - target_compile_options(mxnet PUBLIC "$<$:-Xcompiler=-MT -Gy>") + if(USE_CUDA) + if(MSVC) + if(USE_SPLIT_ARCH_DLL) + add_executable(gen_warp tools/windowsbuild/gen_warp.cpp) + add_library(mxnet SHARED tools/windowsbuild/warp_dll.cpp ${CMAKE_BINARY_DIR}/warp_gen_cpp.cpp + ${CMAKE_BINARY_DIR}/warp_gen.asm) + target_link_libraries(mxnet PRIVATE cudart Shlwapi) + list(GET arch_code_list 0 mxnet_first_arch) + foreach(arch ${arch_code_list}) + add_library(mxnet_${arch} SHARED ${SOURCE}) + target_compile_options( + mxnet_${arch} + PRIVATE + "$<$:--gpu-architecture=compute_${arch}>" + ) + target_compile_options( + mxnet_${arch} + PRIVATE + "$<$:--gpu-code=sm_${arch},compute_${arch}>" + ) + target_compile_options( + mxnet_${arch} + PRIVATE "$<$,$>:-Xcompiler=-MTd -Gy /bigobj>") + target_compile_options( + mxnet_${arch} + PRIVATE "$<$,$>:-Xcompiler=-MT -Gy /bigobj>") + endforeach() + + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/warp_gen_cpp.cpp ${CMAKE_BINARY_DIR}/warp_gen.asm + COMMAND gen_warp $ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/ DEPENDS $) + else(USE_SPLIT_ARCH_DLL) + string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}") + set(CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACES}") + add_library(mxnet SHARED ${SOURCE}) + target_compile_options( + mxnet + PRIVATE "$<$,$>:-Xcompiler=-MTd -Gy /bigobj>") + target_compile_options( + mxnet + PRIVATE "$<$,$>:-Xcompiler=-MT -Gy /bigobj>") + + endif(USE_SPLIT_ARCH_DLL) + else() + add_library(mxnet SHARED ${SOURCE}) + endif() + else() + add_library(mxnet SHARED ${SOURCE}) endif() + endif() + if(USE_DIST_KVSTORE) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/ps-lite/CMakeLists.txt) add_subdirectory("3rdparty/ps-lite") list(APPEND pslite_LINKER_LIBS pslite protobuf) - target_link_libraries(mxnet PUBLIC debug ${pslite_LINKER_LIBS_DEBUG}) - target_link_libraries(mxnet PUBLIC optimized ${pslite_LINKER_LIBS_RELEASE}) - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_DEBUG}) - else() - list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_RELEASE}) - endif() - target_link_libraries(mxnet PUBLIC debug ${pslite_LINKER_LIBS_DEBUG}) - target_link_libraries(mxnet PUBLIC optimized ${pslite_LINKER_LIBS_RELEASE}) - else() set(pslite_LINKER_LIBS protobuf zmq-static) endif() @@ -762,8 +772,8 @@ if(USE_TVM_OP) endif() set(TVM_OP_COMPILE_OPTIONS "-o${CMAKE_CURRENT_BINARY_DIR}/libtvmop.so" "--config" "${CMAKE_CURRENT_BINARY_DIR}/tvmop.conf") - if(CUDA_ARCH_BIN) - set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS}" "--cuda-arch" "${CUDA_ARCH_BIN}") + if(USE_CUDA) + set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS}" "--cuda-arch" "\"${CUDA_ARCH_FLAGS}\"") endif() add_custom_command(TARGET mxnet POST_BUILD COMMAND ${CMAKE_COMMAND} -E env @@ -773,13 +783,24 @@ if(USE_TVM_OP) ) endif() -target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) - if(USE_PLUGINS_WARPCTC) - target_link_libraries(mxnet PUBLIC debug ${WARPCTC_LIB_DEBUG}) - target_link_libraries(mxnet PUBLIC optimized ${WARPCTC_LIB_RELEASE}) + list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB}) endif() +if(MSVC) + if(USE_SPLIT_ARCH_DLL AND USE_CUDA) + foreach(arch ${arch_code_list}) + target_link_libraries(mxnet_${arch} PUBLIC ${mxnet_LINKER_LIBS}) + target_link_libraries(mxnet_${arch} PUBLIC dmlc) + endforeach() + else() + target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) + target_link_libraries(mxnet PUBLIC dmlc) + endif() +else() + target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) + target_link_libraries(mxnet PUBLIC dmlc) +endif() if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) add_executable(im2rec "tools/im2rec.cc") @@ -799,7 +820,6 @@ else() is required for im2rec, im2rec will not be available") endif() -target_link_libraries(mxnet PUBLIC dmlc) if(MSVC AND USE_MXNET_LIB_NAMING) set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") diff --git a/ci/build_windows.py b/ci/build_windows.py index ce77c316ab20..9af616d2331a 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -112,9 +112,7 @@ class BuildFlavour(Enum): '-DUSE_BLAS=open ' '-DUSE_LAPACK=ON ' '-DUSE_DIST_KVSTORE=OFF ' - '-DCUDA_ARCH_NAME=Manual ' - '-DCUDA_ARCH_BIN=52 ' - '-DCUDA_ARCH_PTX=52 ' + '-DMXNET_CUDA_ARCH="5.2" ' '-DCMAKE_CXX_FLAGS="/FS /MD /O2 /Ob2" ' '-DUSE_MKL_IF_AVAILABLE=OFF ' '-DCMAKE_BUILD_TYPE=Release') @@ -128,9 +126,7 @@ class BuildFlavour(Enum): '-DUSE_BLAS=open ' '-DUSE_LAPACK=ON ' '-DUSE_DIST_KVSTORE=OFF ' - '-DCUDA_ARCH_NAME=Manual ' - '-DCUDA_ARCH_BIN=52 ' - '-DCUDA_ARCH_PTX=52 ' + '-DMXNET_CUDA_ARCH="5.2" ' '-DUSE_MKLDNN=ON ' '-DCMAKE_CXX_FLAGS="/FS /MD /O2 /Ob2" ' '-DCMAKE_BUILD_TYPE=Release') diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh index 3cb806e0aadd..77c1fe2fb59d 100755 --- a/ci/docker/install/ubuntu_core.sh +++ b/ci/docker/install/ubuntu_core.sh @@ -49,7 +49,7 @@ apt-get install -y \ wget # Use libturbojpeg package as it is correctly compiled with -fPIC flag -# https://github.com/HaxeFoundation/hashlink/issues/147 +# https://github.com/HaxeFoundation/hashlink/issues/147 ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 581bb2fd5280..745214af2eea 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -25,7 +25,7 @@ set -ex NOSE_COVERAGE_ARGUMENTS="--with-coverage --cover-inclusive --cover-xml --cover-branches --cover-package=mxnet" NOSE_TIMER_ARGUMENTS="--with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error" CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70" -CI_CMAKE_CUDA_ARCH_BIN="52,70" +CI_CMAKE_CUDA_ARCH="5.2 7.0" clean_repo() { set -ex @@ -753,8 +753,7 @@ build_ubuntu_gpu_tensorrt() { -DUSE_OPENMP=0 \ -DUSE_MKLDNN=0 \ -DUSE_MKL_IF_AVAILABLE=OFF \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -G Ninja \ /work/mxnet @@ -872,8 +871,7 @@ build_ubuntu_gpu_cmake_mkldnn() { -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKLML_MKL=1 \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -G Ninja \ /work/mxnet @@ -901,8 +899,7 @@ build_ubuntu_gpu_cmake() { -DUSE_MKLDNN=OFF \ -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DBUILD_CYTHON_MODULES=1 \ -G Ninja \ /work/mxnet @@ -928,8 +925,7 @@ build_ubuntu_gpu_cmake_no_tvm_op() { -DUSE_MKLDNN=OFF \ -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DBUILD_CYTHON_MODULES=1 \ -G Ninja \ /work/mxnet @@ -975,8 +971,7 @@ build_ubuntu_gpu_large_tensor() { -DUSE_MKLDNN=OFF \ -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DUSE_INT64_TENSOR_SIZE=ON \ -G Ninja \ /work/mxnet diff --git a/cmake/BuildTVM.cmake b/cmake/BuildTVM.cmake index db8b33b84596..2c2f573cddbd 100644 --- a/cmake/BuildTVM.cmake +++ b/cmake/BuildTVM.cmake @@ -98,16 +98,19 @@ set(USE_RANDOM OFF) # Whether use NNPack set(USE_NNPACK OFF) -# Whether use CuDNN -if(USE_CUDNN AND USE_CUDA) - detect_cuDNN() - if(HAVE_CUDNN) - set(USE_CUDNN ON) - else() - set(USE_CUDNN OFF) - endif() -else() - set(USE_CUDNN OFF) +# First-class Cuda in modern CMake provides us with CMAKE_CUDA_COMPILER But TVM +# uses the deprecated findCUDA functionality which requires +# CUDA_TOOLKIT_ROOT_DIR We follow the FindCUDAToolkit.cmake logic to compute +# CUDA_TOOLKIT_ROOT_DIR for TVM https://gitlab.kitware.com/cmake/cmake/merge_requests/4093/ +if(USE_CUDA) + get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDA_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + unset(cuda_dir) + get_filename_component(CUDA_TOOLKIT_ROOT_DIR ${CUDA_BIN_DIR} DIRECTORY ABSOLUTE) + + message("CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") + message("Inferred CUDA_TOOLKIT_ROOT_DIR for TVM as: ${CUDA_TOOLKIT_ROOT_DIR}") + set(USE_CUDA ${CUDA_TOOLKIT_ROOT_DIR}) endif() # Whether use cuBLAS @@ -133,3 +136,6 @@ set(USE_VTA_TSIM OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) + +# Use OPENMP thread pool to be compatible with MXNet +set(USE_OPENMP ON) diff --git a/cmake/FirstClassLangCuda.cmake b/cmake/FirstClassLangCuda.cmake deleted file mode 100644 index 8d79c2b63ad9..000000000000 --- a/cmake/FirstClassLangCuda.cmake +++ /dev/null @@ -1,277 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#this file is CUDA help function with CMAKE first class CUDA - -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) -if(USE_CXX14_IF_AVAILABLE) - check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) -endif() - -################################################################################################ -# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution. -# That's why not FindcuDNN.cmake file, but just the macro -# Usage: -# detect_cuDNN() -function(detect_cuDNN) - set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder") - - find_path(CUDNN_INCLUDE cudnn.h - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} - DOC "Path to cuDNN include directory." ) - - - find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} - PATH_SUFFIXES lib lib/x64 - DOC "Path to cuDNN library.") - - if(CUDNN_INCLUDE AND CUDNN_LIBRARY) - set(HAVE_CUDNN TRUE PARENT_SCOPE) - set(CUDNN_FOUND TRUE PARENT_SCOPE) - - mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT) - message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})") - endif() -endfunction() - - - -################################################################################################ -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# mshadow_detect_installed_gpus(out_variable) -function(mshadow_detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${__cufile} "" - "#include \n" - "int main()\n" - "{\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device)\n" - " {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - enable_language(CUDA) - - try_run(__nvcc_res __compile_result ${PROJECT_BINARY_DIR} ${__cufile} - COMPILE_OUTPUT_VARIABLE __compile_out - RUN_OUTPUT_VARIABLE __nvcc_out) - - if(__nvcc_res EQUAL 0 AND __compile_result) - # nvcc outputs text containing line breaks when building with MSVC. - # The line below prevents CMake from inserting a variable with line - # breaks in the cache - string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") - string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") - set(CUDA_gpu_detect_output ${__nvcc_out}) - else() - message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out} ${__compile_out}") - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(WARNING "Automatic GPU detection failed. Building for all known architectures (${mxnet_known_gpu_archs}).") - set(${out_variable} ${mxnet_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -# This list will be used for CUDA_ARCH_NAME = All option -set(CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell") - -# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default) -set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0") - -if (CUDA_TOOLSET VERSION_GREATER "6.5") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2" "3.7") -endif () - -if (CUDA_TOOLSET VERSION_GREATER "7.5") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX") -else() - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX") -endif () - -if (CUDA_TOOLSET VERSION_GREATER "9.0") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0") -endif() - -if (CUDA_TOOLSET VERSION_GREATER "10.0") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5") -endif() - -################################################################################################ -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# mshadow_select_nvcc_arch_flags(out_variable) -function(mshadow_select_nvcc_arch_flags out_variable) - - set(CUDA_ARCH_LIST "Auto" CACHE STRING "Select target NVIDIA GPU achitecture.") - set_property( CACHE CUDA_ARCH_LIST PROPERTY STRINGS "" "Auto" "All" "Common" ${CUDA_KNOWN_GPU_ARCHITECTURES} ) - mark_as_advanced(CUDA_ARCH_NAME) - - - if("X${CUDA_ARCH_LIST}" STREQUAL "X" ) - set(CUDA_ARCH_LIST "All") - endif() - - set(cuda_arch_bin) - set(cuda_arch_ptx) - - message(STATUS " CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}") - if("${CUDA_ARCH_LIST}" STREQUAL "All") - set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES}) - elseif("${CUDA_ARCH_LIST}" STREQUAL "Common") - set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES}) - elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto" OR "${CUDA_ARCH_LIST}" STREQUAL "") - set(mxnet_known_gpu_archs ${CUDA_COMMON_GPU_ARCHITECTURES}) - mshadow_detect_installed_gpus(CUDA_ARCH_LIST) - message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}") - endif() - - # Now process the list and look for names - string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}") - list(REMOVE_DUPLICATES CUDA_ARCH_LIST) - foreach(arch_name ${CUDA_ARCH_LIST}) - set(arch_bin) - set(arch_ptx) - set(add_ptx FALSE) - # Check to see if we are compiling PTX - if(arch_name MATCHES "(.*)\\+PTX$") - set(add_ptx TRUE) - set(arch_name ${CMAKE_MATCH_1}) - endif() - if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") - set(arch_bin ${CMAKE_MATCH_1}) - set(arch_ptx ${arch_bin}) - else() - # Look for it in our list of known architectures - if(${arch_name} STREQUAL "Fermi") - if (CUDA_TOOLSET VERSION_LESS "8.0") - set(arch_bin 2.0 "2.1(2.0)") - endif() - elseif(${arch_name} STREQUAL "Kepler+Tegra") - set(arch_bin 3.2) - elseif(${arch_name} STREQUAL "Kepler+Tesla") - set(arch_bin 3.7) - elseif(${arch_name} STREQUAL "Kepler") - set(arch_bin 3.0 3.5) - set(arch_ptx 3.5) - elseif(${arch_name} STREQUAL "Maxwell+Tegra") - set(arch_bin 5.3) - elseif(${arch_name} STREQUAL "Maxwell") - set(arch_bin 5.0 5.2) - set(arch_ptx 5.2) - elseif(${arch_name} STREQUAL "Pascal") - set(arch_bin 6.0 6.1) - set(arch_ptx 6.1) - elseif(${arch_name} STREQUAL "Volta") - set(arch_bin 7.0) - set(arch_ptx 7.0) - elseif(${arch_name} STREQUAL "Turing") - set(arch_bin 7.5) - set(arch_ptx 7.5) - else() - message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") - endif() - endif() - list(APPEND cuda_arch_bin ${arch_bin}) - if(add_ptx) - if (NOT arch_ptx) - set(arch_ptx ${arch_bin}) - endif() - list(APPEND cuda_arch_ptx ${arch_ptx}) - endif() - endforeach() - - # remove dots and convert to lists - string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}") - string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") - - if(cuda_arch_bin) - list(REMOVE_DUPLICATES cuda_arch_bin) - endif() - if(cuda_arch_ptx) - list(REMOVE_DUPLICATES cuda_arch_ptx) - endif() - - message(STATUS "cuda arch bin: ${cuda_arch_bin}") - message(STATUS "cuda arch ptx: ${cuda_arch_ptx}") - set(nvcc_flags "") - set(nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(arch ${cuda_arch_bin}) - if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified ARCH for the concrete CODE - list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) - list(APPEND nvcc_archs_readable sm_${arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(arch ${cuda_arch_ptx}) - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) - list(APPEND nvcc_archs_readable compute_${arch}) - endforeach() - - if(NOT MSVC) - if(SUPPORT_CXX14) - list(APPEND nvcc_flags "-std=c++14") - elseif(SUPPORT_CXX11) - list(APPEND nvcc_flags "-std=c++11") - endif() - endif() - - string (REPLACE " " ";" CMAKE_CXX_FLAGS_STR "${CMAKE_CXX_FLAGS}") - foreach(_flag ${CMAKE_CXX_FLAGS_STR}) - # Remove -std=c++XX flags - if(NOT "${_flag}" MATCHES "-std=.+") - # Remove link flags - if(NOT "${_flag}" MATCHES "-Wl,.+") - list(APPEND nvcc_flags "-Xcompiler ${_flag}") - endif() - endif() - endforeach() - - string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") - set(${out_variable} ${nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) -endfunction() - diff --git a/cmake/Modules/FindCUDAToolkit.cmake b/cmake/Modules/FindCUDAToolkit.cmake new file mode 100644 index 000000000000..1d9af2f548d0 --- /dev/null +++ b/cmake/Modules/FindCUDAToolkit.cmake @@ -0,0 +1,833 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Original license notice, prior to modification by MXNet Contributors: +# +# Copyright 2000-2019 Kitware, Inc. and Contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Kitware, Inc. nor the names of Contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +Search Behavior +^^^^^^^^^^^^^^^ + +Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is +searched for in the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` can be + found underneath the directory specified by ``CUDAToolkit_ROOT``. If + ``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this + package is marked as **not** found. No subsequent search attempts are + performed. + +3. If the CUDA_PATH environment variable is defined, it will be searched. + +4. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +6. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Options +^^^^^^^ + +``VERSION`` + If specified, describes the version of the CUDA Toolkit to search for. + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module defines :prop_tgt:`IMPORTED` targets for each +of the following libraries that are part of the CUDAToolkit: + +- :ref:`CUDA Runtime Library` +- :ref:`CUDA Driver Library` +- :ref:`cuBLAS` +- :ref:`cuFFT` +- :ref:`cuRAND` +- :ref:`cuSOLVER` +- :ref:`cuSPARSE` +- :ref:`NPP` +- :ref:`nvBLAS` +- :ref:`nvGRAPH` +- :ref:`nvJPEG` +- :ref:`nvidia-ML` +- :ref:`nvRTC` +- :ref:`nvToolsExt` +- :ref:`OpenCL` +- :ref:`cuLIBOS` + +.. _`cuda_toolkit_rt_lib`: + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. +They are an explicit dependency of almost every library. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +.. _`cuda_toolkit_driver_lib`: + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. This is generally used by advanced + + +Targets Created: + +- ``CUDA::cuda_driver`` +- ``CUDA::cuda_driver`` + +.. _`cuda_toolkit_cuBLAS`: + +cuBLAS +"""""" + +The `cuBLAS `_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` + +.. _`cuda_toolkit_cuFFT`: + +cuFFT +""""" + +The `cuFFT `_ library. + +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufftw_static`` + +cuRAND +"""""" + +The `cuRAND `_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER `_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE `_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP `_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS `_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH `_ library. + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. _`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG `_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC `_ (Runtime Compilation) library. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvml`` + +.. _`cuda_toolkit_opencl`: + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +The `NVIDIA Tools Extension `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +OpenCL +"""""" + +The `NVIDIA OpenCL Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`cuda_toolkit_cuLIBOS`: + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`cuda_toolkit_cuRAND`: + + + +Result variables +^^^^^^^^^^^^^^^^ + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MAJOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + The path to the CUDA Toolkit ``include`` folder containing the header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. + + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR) + get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) + # use the already detected cuda compiler + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + unset(cuda_dir) +endif() + +# Try language- or user-provided path first. +if(CUDAToolkit_BIN_DIR) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${CUDAToolkit_BIN_DIR} + NO_DEFAULT_PATH + ) +endif() + +# Search using CUDAToolkit_ROOT +find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ENV CUDA_PATH + PATH_SUFFIXES bin +) + +# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. +if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. + set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if (CUDAToolkit_FIND_REQUIRED) + if (DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if (NOT CUDAToolkit_FIND_QUIETLY) + if (DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() +endif() + +# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. +# +# - Linux: /usr/local/cuda-X.Y +# - macOS: /Developer/NVIDIA/CUDA-X.Y +# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y +# +# We will also search the default symlink location /usr/local/cuda first since +# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked +# directory is the desired location. +if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (UNIX) + if (NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(possible_versions) + foreach (p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if (IS_DIRECTORY ${p} AND p_version) + list(APPEND possible_versions ${p_version}) + endif() + endforeach() + + # Cannot use list(SORT) because that is alphabetical, we need numerical. + # NOTE: this is not an efficient sorting strategy. But even if a user had + # every possible version of CUDA installed, this wouldn't create any + # significant overhead. + set(versions) + foreach (v ${possible_versions}) + list(LENGTH versions num_versions) + # First version, nothing to compare with so just append. + if (num_versions EQUAL 0) + list(APPEND versions ${v}) + else() + # Loop through list. Insert at an index when comparison is + # VERSION_GREATER since we want a descending list. Duplicates will not + # happen since this came from a glob list of directories. + set(i 0) + set(early_terminate FALSE) + while (i LESS num_versions) + list(GET versions ${i} curr) + if (v VERSION_GREATER curr) + list(INSERT versions ${i} ${v}) + set(early_terminate TRUE) + break() + endif() + math(EXPR i "${i} + 1") + endwhile() + # If it did not get inserted, place it at the end. + if (NOT early_terminate) + list(APPEND versions ${v}) + endif() + endif() + endforeach() + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach (v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if (UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for nvcc again using the platform default search paths. + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${search_paths} + PATH_SUFFIXES bin + ) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(possible_versions) + unset(versions) + unset(i) + unset(early_terminate) + unset(search_paths) + + if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() +endif() + +if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) + get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + unset(cuda_dir) +endif() + +if(CUDAToolkit_NVCC_EXECUTABLE AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() +else() + # Compute the version by invoking nvcc + execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) +endif() + + +get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + +# Now that we have the real ROOT_DIR, find components inside it. +list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + +# Find the include/ directory +find_path(CUDAToolkit_INCLUDE_DIR + NAMES cuda_runtime.h +) + +# And find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64 lib/x64 +) +if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +unset(CUDAToolkit_ROOT_DIR) +list(REMOVE_AT CMAKE_PREFIX_PATH -1) + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. +# MXNET NOTE: This differs from CMake source by ${CMAKE_CURRENT_LIST_DIR} +# replaced with ${CMAKE_ROOT}/Modules +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIR + CUDA_CUDART + CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR + CUDAToolkit_VERSION +) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) +endif() + +#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(find_and_add_cuda_import_lib lib_name) + + if(ARGC GREATER 1) + set(search_names ${ARGN}) + else() + set(search_names ${lib_name}) + endif() + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + PATHS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ) + + if (NOT CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} IMPORTED INTERFACE) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + endif() + endfunction() + + function(add_cuda_link_dependency lib_name) + foreach(dependency IN LISTS ${ARGN}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency}) + endforeach() + endfunction() + + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + + + find_and_add_cuda_import_lib(cuda_driver cuda) + + find_and_add_cuda_import_lib(cudart) + find_and_add_cuda_import_lib(cudart_static) + + foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg) + find_and_add_cuda_import_lib(${cuda_lib}) + add_cuda_link_dependency(${cuda_lib} cudart) + + find_and_add_cuda_import_lib(${cuda_lib}_static) + add_cuda_link_dependency(${cuda_lib}_static cudart_static) + endforeach() + + # cuSOLVER depends on cuBLAS, and cuSPARSE + add_cuda_link_dependency(cusolver cublas cusparse) + add_cuda_link_dependency(cusolver_static cublas_static cusparse) + + # nvGRAPH depends on cuRAND, and cuSOLVER. + add_cuda_link_dependency(nvgraph curand cusolver) + add_cuda_link_dependency(nvgraph_static curand_static cusolver_static) + + find_and_add_cuda_import_lib(nppc) + find_and_add_cuda_import_lib(nppc_static) + + add_cuda_link_dependency(nppc cudart) + add_cuda_link_dependency(nppc_static cudart_static culibos) + + # Process the majority of the NPP libraries. + foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + find_and_add_cuda_import_lib(${cuda_lib}) + find_and_add_cuda_import_lib(${cuda_lib}_static) + add_cuda_link_dependency(${cuda_lib} nppc) + add_cuda_link_dependency(${cuda_lib}_static nppc_static) + endforeach() + + find_and_add_cuda_import_lib(nvrtc) + add_cuda_link_dependency(nvrtc cuda_driver) + + find_and_add_cuda_import_lib(nvml nvidia-ml nvml) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64) + + add_cuda_link_dependency(nvToolsExt cudart) + + find_and_add_cuda_import_lib(OpenCL) + + find_and_add_cuda_import_lib(culibos) + if(TARGET CUDA::culibos) + foreach (cuda_lib cublas cufft cusparse curand nvjpeg) + add_cuda_link_dependency(${cuda_lib}_static culibos) + endforeach() + endif() + +endif() diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py index b0254218077a..43657f274348 100644 --- a/contrib/tvmop/compile.py +++ b/contrib/tvmop/compile.py @@ -50,6 +50,11 @@ def get_cuda_arch(arch): if len(arch) == 0: return None + # the arch string is of format '-gencode;arch=compute_XX,code=sm_XX' + # this format is computed by CMake CUDA_SELECT_NVCC_ARCH_FLAGS + if arch.startswith('-gencode;'): + return arch.split(';') + # the arch string contains '-arch=sm_xx' flags = arch.split() for flag in flags: diff --git a/tools/windowsbuild/README.md b/tools/windowsbuild/README.md new file mode 100644 index 000000000000..7d8e7cf331cf --- /dev/null +++ b/tools/windowsbuild/README.md @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + +Due to dll size limitation under windows. Split dll into different dlls according to arch +Reference https://github.com/apache/incubator-mxnet/pull/16980 \ No newline at end of file diff --git a/tools/windowsbuild/gen_warp.cpp b/tools/windowsbuild/gen_warp.cpp new file mode 100644 index 000000000000..2d90eaf364f3 --- /dev/null +++ b/tools/windowsbuild/gen_warp.cpp @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IMAGE_SIZEOF_SIGNATURE 4 + + +DWORD rva_to_foa(IN DWORD RVA, IN PIMAGE_SECTION_HEADER section_header) +{ + + size_t count = 0; + for (count = 1; RVA > (section_header->VirtualAddress + section_header->Misc.VirtualSize); count++, section_header++); + + DWORD FOA = RVA - section_header->VirtualAddress + section_header->PointerToRawData; + + return FOA; +} + +std::string format(const char* format, ...) +{ + va_list args; + va_start(args, format); +#ifndef _MSC_VER + size_t size = std::snprintf(nullptr, 0, format, args) + 1; // Extra space for '\0' + std::unique_ptr buf(new char[size]); + std::vsnprintf(buf.get(), size, format, args); + return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside +#else + int size = _vscprintf(format, args) +1; + std::unique_ptr buf(new char[size]); + vsnprintf_s(buf.get(), size, _TRUNCATE, format, args); + return std::string(buf.get()); +#endif + va_end(args); +} + +int main(int argc, char* argv[]) +{ + + if (argc != 2) + { + return 0; + } + + //open file + const HANDLE h_file = CreateFile( + argv[1], + GENERIC_READ , + FILE_SHARE_READ , + nullptr, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + nullptr); + + + DWORD size_high; + const DWORD size_low = GetFileSize(h_file, &size_high); + + uint64_t dll_size = ((uint64_t(size_high)) << 32) + (uint64_t)size_low; + + // Create File Mapping + const HANDLE h_map_file = CreateFileMapping( + h_file, + nullptr, + PAGE_READONLY, + size_high, + size_low, + nullptr); + if (h_map_file == INVALID_HANDLE_VALUE || h_map_file == nullptr) + { + std::cout << "error"; + CloseHandle(h_file); + return 0; + } + + //Map File to memory + void* pv_file = MapViewOfFile( + h_map_file, + FILE_MAP_READ, + 0, + 0, + 0); + + if (pv_file == nullptr) + { + std::cout << "error"; + CloseHandle(h_file); + return 0; + } + + uint8_t* p = static_cast(pv_file); + + + PIMAGE_DOS_HEADER dos_header = reinterpret_cast(p); + + const PIMAGE_NT_HEADERS nt_headers = reinterpret_cast(p + dos_header->e_lfanew); + + const PIMAGE_FILE_HEADER file_header = &nt_headers->FileHeader; + + PIMAGE_OPTIONAL_HEADER optional_header = (PIMAGE_OPTIONAL_HEADER)(&nt_headers->OptionalHeader); + + const DWORD file_alignment = optional_header->FileAlignment; + + + PIMAGE_SECTION_HEADER section_table = + reinterpret_cast(p + dos_header->e_lfanew + + IMAGE_SIZEOF_SIGNATURE + + IMAGE_SIZEOF_FILE_HEADER + + file_header->SizeOfOptionalHeader); + + DWORD export_foa = rva_to_foa(optional_header->DataDirectory[0].VirtualAddress, section_table); + + PIMAGE_EXPORT_DIRECTORY export_directory = (PIMAGE_EXPORT_DIRECTORY)(p + export_foa); + + + DWORD name_list_foa = rva_to_foa(export_directory->AddressOfNames, section_table); + + PDWORD name_list = (PDWORD)(p + name_list_foa); + + + + + std::vector func_list; + + for (size_t i = 0; i < export_directory->NumberOfNames; i++, name_list++) + { + + DWORD name_foa = rva_to_foa(* name_list, section_table); + char* name = (char*)(p + name_foa); + func_list.emplace_back(name); + + } + + + UnmapViewOfFile(pv_file); + CloseHandle(h_map_file); + CloseHandle(h_file); + + + std::ofstream gen_cpp_obj; + gen_cpp_obj.open("warp_gen_cpp.cpp", std::ios::out | std::ios::trunc); + gen_cpp_obj << "#include \n"; + gen_cpp_obj << "extern \"C\" \n{\n"; + + + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_cpp_obj << format("void * warp_point_%d;\n", i); + gen_cpp_obj << format("#pragma comment(linker, \"/export:%s=warp_func_%d\")\n", fun.c_str(), i); + gen_cpp_obj << format("void warp_func_%d();\n", i); + gen_cpp_obj << ("\n"); + } + gen_cpp_obj << ("}\n"); + + + gen_cpp_obj << ("void load_function(HMODULE hm)\n{\n"); + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_cpp_obj << format("warp_point_%d = (void*)GetProcAddress(hm, \"%s\");\n", i, fun.c_str()); + } + gen_cpp_obj << ("}\n"); + + gen_cpp_obj.close(); + + + + std::ofstream gen_asm_obj; + gen_asm_obj.open("warp_gen.asm", std::ios::out | std::ios::trunc); + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_asm_obj << format("EXTERN warp_point_%d:QWORD;\n", i); + } + gen_asm_obj << ".CODE\n"; + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_asm_obj << format("warp_func_%d PROC\njmp warp_point_%d;\nwarp_func_%d ENDP\n", i,i,i); + } + gen_asm_obj << "END\n"; + gen_asm_obj.close(); +} diff --git a/tools/windowsbuild/warp_dll.cpp b/tools/windowsbuild/warp_dll.cpp new file mode 100644 index 000000000000..6a89a4e189de --- /dev/null +++ b/tools/windowsbuild/warp_dll.cpp @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +extern "C" IMAGE_DOS_HEADER __ImageBase; + + +std::vector find_mxnet_dll() +{ + std::vector version; + intptr_t handle; + + _wfinddata_t findData{}; + std::wregex reg(L".*?mxnet_([0-9]+)\\.dll"); + + HMODULE hModule = reinterpret_cast(&__ImageBase); + WCHAR szPathBuffer[MAX_PATH] = { 0 }; + GetModuleFileNameW(hModule, szPathBuffer, MAX_PATH); + + PathRemoveFileSpecW(szPathBuffer); + wcscat_s(szPathBuffer, L"\\mxnet_*.dll"); + + handle = _wfindfirst(szPathBuffer, &findData); + if (handle == -1) + { + return version; + } + + do + { + if (!(findData.attrib & _A_SUBDIR) || wcscmp(findData.name, L".") != 0 || wcscmp(findData.name, L"..") != 0) + { + std::wstring str(findData.name); + std::wsmatch base_match; + if(std::regex_match(str, base_match, reg)) + { + if (base_match.size() == 2) { + std::wssub_match base_sub_match = base_match[1]; + std::wstring base = base_sub_match.str(); + version.push_back(std::stoi(base)) ; + } + } + } + } while (_wfindnext(handle, &findData) == 0); + + _findclose(handle); + std::sort(version.begin(), version.end()); + return version; +} + +int find_version() +{ + std::vector known_sm = find_mxnet_dll(); + int count = 0; + int version = 75; + if (cudaSuccess != cudaGetDeviceCount(&count)) + { + return 30; + } + if (count == 0) + { + return 30; + } + + + for (int device = 0; device < count; ++device) + { + cudaDeviceProp prop{}; + if (cudaSuccess == cudaGetDeviceProperties(&prop, device)) + { + version = std::min(version, prop.major * 10 + prop.minor); + } + } + + for (int i = known_sm.size() -1 ; i >=0; --i) + { + if(known_sm[i]<= version) + { + return known_sm[i]; + } + } + + return version; +} + +void load_function(HMODULE hm); + +void mxnet_init() +{ + int version = find_version(); + WCHAR dll_name[MAX_PATH]; + wsprintfW(dll_name, L"mxnet_%d.dll", version); + HMODULE hm = LoadLibraryW(dll_name); + load_function(hm); +} + + +extern "C" BOOL WINAPI DllMain( + HINSTANCE const instance, // handle to DLL module + DWORD const reason, // reason for calling function + LPVOID const reserved) // reserved +{ + // Perform actions based on the reason for calling. + switch (reason) + { + case DLL_PROCESS_ATTACH: + mxnet_init(); + // Initialize once for each new process. + // Return FALSE to fail DLL load. + break; + + case DLL_THREAD_ATTACH: + // Do thread-specific initialization. + break; + + case DLL_THREAD_DETACH: + // Do thread-specific cleanup. + break; + + case DLL_PROCESS_DETACH: + // Perform any necessary cleanup. + break; + } + return TRUE; // Successful DLL_PROCESS_ATTACH. +} \ No newline at end of file