Skip to content

Commit

Permalink
3.6.0 update (#2005)
Browse files Browse the repository at this point in the history
* 3.6.0 update

* doc and swap stuff

---------

Co-authored-by: yuzhai <[email protected]>
Co-authored-by: Haicheng Wu <[email protected]>
  • Loading branch information
3 people authored Dec 25, 2024
1 parent e1cd8c7 commit 3d261a5
Show file tree
Hide file tree
Showing 258 changed files with 10,810 additions and 3,830 deletions.
58 changes: 41 additions & 17 deletions CHANGELOG.md

Large diffs are not rendered by default.

168 changes: 75 additions & 93 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,18 @@ project(CUTLASS VERSION ${_CUTLASS_VERSION_MAJOR}.${_CUTLASS_VERSION_MINOR}.${_C

################################################################################

# Classify the host C++ compiler once, up front. Each check below publishes a
# cache flag that later logic tests instead of re-matching the compiler ID.
# The three checks are independent of one another; a flag is only created when
# its pattern matches CMAKE_CXX_COMPILER_ID.

# GNU toolchain (g++).
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
  set(CUTLASS_GNU_HOST_COMPILE
      ON
      CACHE BOOL "Using GNU tools for host code compilation")
endif()

# Any compiler whose ID contains "Clang" or "clang".
if (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang")
  set(CUTLASS_CLANG_HOST_COMPILE
      ON
      CACHE BOOL "Using Clang tools for host code compilation")
endif()

# Microsoft Visual C++.
if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
  set(CUTLASS_MSVC_HOST_COMPILE
      ON
      CACHE BOOL "Using MSVC tools for host code compilation")
endif()

################################################################################

include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

if (CUDA_VERSION VERSION_LESS 11.3)
Expand All @@ -67,11 +79,11 @@ elseif (CUDA_VERSION VERSION_LESS 11.4)
message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.8 or higher.")
endif()

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.3)
if(CUTLASS_GNU_HOST_COMPILE AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.3)
message(FATAL_ERROR "GCC version must be at least 7.3!")
endif()

if (CUDA_COMPILER MATCHES "[Cc]lang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
if (CUTLASS_CLANG_DEVICE_COMPILE AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
endif()
find_package(Doxygen QUIET)
Expand All @@ -85,13 +97,10 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

if(CUTLASS_NATIVE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
else()
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17)
endif()
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr)

if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE)
Expand Down Expand Up @@ -146,13 +155,13 @@ endif()
################################################################################

set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87)
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 89 90)
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90a)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
Expand Down Expand Up @@ -246,7 +255,7 @@ set(KERNEL_FILTER_FILE "" CACHE STRING "KERNEL FILTER FILE FULL PATH")
if (KERNEL_FILTER_FILE AND NOT CUTLASS_LIBRARY_KERNELS)
# If a kernel filter file is specified, we want to generate and then
# filter on the entire kernel set, not the default kernel
# (sub)set. The user may have overridden CUTLASS_LIBRRARY_KERNELS, in which
# (sub)set. The user may have overridden CUTLASS_LIBRARY_KERNELS, in which
# case the resulting kernel set will be the intersection of the two
# options differenced against CUTLASS_LIBRARY_IGNORE_KERNELS.
set(CUTLASS_LIBRARY_KERNELS_INIT "*")
Expand Down Expand Up @@ -375,15 +384,22 @@ endif()


# Warnings-as-error exceptions and warning suppressions for Clang builds
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=implicit-int-conversion ")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=implicit-int-conversion" )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pass-failed ")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=pass-failed" )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=inconsistent-missing-override ")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=inconsistent-missing-override" )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-conversion ")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-sign-conversion" )
if (CUTLASS_CLANG_HOST_COMPILE)

  # Warning demotions/suppressions applied whenever the host compiler is Clang.
  # The same set is mirrored into the host C++ flags and into both device
  # compiler flag lists.
  set(FLAGS_TO_ADD
    "-Wno-error=implicit-int-conversion"
    "-Wno-error=pass-failed"
    "-Wno-error=inconsistent-missing-override"
    "-Wno-sign-conversion"
    "-Wno-unused-parameter"
    )

  # The device flag variables are CMake lists, so the whole set can be appended at once.
  list(APPEND CUTLASS_CUDA_NVCC_FLAGS ${FLAGS_TO_ADD})
  list(APPEND CUTLASS_CUDA_CLANG_FLAGS ${FLAGS_TO_ADD})

  # CMAKE_CXX_FLAGS is a space-separated string, so each flag is appended
  # with a leading space.
  foreach(__CLANG_WARNING_FLAG IN LISTS FLAGS_TO_ADD)
    string(APPEND CMAKE_CXX_FLAGS " ${__CLANG_WARNING_FLAG}")
  endforeach()

endif()

if (NOT MSVC AND CUTLASS_NVCC_KEEP)
Expand All @@ -396,9 +412,9 @@ endif()

if (CUTLASS_ENABLE_F16C AND NOT CMAKE_CROSSCOMPILING)
list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_F16C=1)
if ((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
if (CUTLASS_GNU_HOST_COMPILE OR CUTLASS_CLANG_HOST_COMPILE)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=-mf16c)
elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC"))
elseif(CUTLASS_MSVC_HOST_COMPILE)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=/arch:AVX2)
endif()
endif()
Expand All @@ -423,19 +439,8 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo)
endif()

#Report CUDA build flags
if (CUDA_COMPILER MATCHES "[Cc]lang")
if(CUTLASS_CUDA_CLANG_FLAGS)
message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}")
endif()
else()
if(CUTLASS_CUDA_NVCC_FLAGS)
message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}")
endif()
endif()

if(CUDA_COMPILER MATCHES "[Cc]lang")
if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
if (CUTLASS_CLANG_DEVICE_COMPILE)
if (NOT CUTLASS_CLANG_HOST_COMPILE)
message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" )
endif()

Expand All @@ -451,24 +456,35 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument)

string(REPLACE "." ";" CUDA_VERSION_PARTS ${CMAKE_CUDA_COMPILER_VERSION})
list(GET CUDA_VERSION_PARTS 0 CUDA_VERSION_MAJOR)
list(GET CUDA_VERSION_PARTS 1 CUDA_VERSION_MINOR)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -D__CUDACC_VER_MAJOR__=${CUDA_VERSION_MAJOR} -D__CUDACC_VER_MINOR__=${CUDA_VERSION_MINOR})


# Needed for libcublasLt.so in case it is installed in the same location as libcudart.so:
# the dynamic linker can find it if the linker sets RPATH (forced by --disable-new-dtags).
# Otherwise the linker uses RUNPATH, which does not propagate to loaded libraries.
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wl,--disable-new-dtags)

link_libraries(nvidia::cudart)
link_libraries(nvidia::cuda_driver)

endif()

# Report the CUDA build flags for the active device compiler.
#
# CUTLASS_CLANG_DEVICE_COMPILE selects which flag list is reported: the Clang
# list when it is set, the NVCC list otherwise. Note that CMake ignores any
# arguments given to else(), so a condition must not be written there; the
# "is there anything to print" check is done explicitly further down.
#
# __FLAG_LIST stores the *name* of the list variable (not its contents) so it
# can be passed straight to list(JOIN) and dereferenced for the emptiness test.
if (CUTLASS_CLANG_DEVICE_COMPILE)
  set(__FLAG_GROUP Clang)
  set(__FLAG_LIST CUTLASS_CUDA_CLANG_FLAGS)
else()
  set(__FLAG_GROUP NVCC)
  set(__FLAG_LIST CUTLASS_CUDA_NVCC_FLAGS)
endif()

set(__FLAG_DISPLAY_STRING "")

# Only emit the status message when the selected list actually holds flags.
if (NOT "${${__FLAG_LIST}}" STREQUAL "")
  # One flag per line in the configure log.
  list(JOIN ${__FLAG_LIST} "\n " __FLAG_DISPLAY_STRING)
  message(STATUS "Using the following ${__FLAG_GROUP} flags: \n ${__FLAG_DISPLAY_STRING}")
endif()

# Known gcc 8.1-8.3 SFINAE issue (fixed in gcc 8.4), check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87748
# Also see https://github.com/NVIDIA/nccl/issues/835 for nvtx3.hpp
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1 AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 8.3)
if (CUTLASS_GNU_HOST_COMPILE AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1 AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 8.3)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNVTX3_USE_CHECKED_OVERLOADS_FOR_GET=0")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DNVTX3_USE_CHECKED_OVERLOADS_FOR_GET=0")
endif()
Expand All @@ -478,12 +494,10 @@ if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Mint128 ")
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
# CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)
endif()
# CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)

if (MSVC)

Expand Down Expand Up @@ -519,55 +533,21 @@ function(cutlass_apply_cuda_gencode_flags TARGET)
set(ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS_ENABLED})
endif()

set(NVCC_FLAGS)
set(CLANG_FLAGS)
set(__CMAKE_CUDA_ARCHS)
foreach(ARCH ${ARCHS_ENABLED})
list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH})
set(CODES)
if(CUTLASS_NVCC_EMBED_CUBIN)
list(APPEND CODES sm_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-real)
endif()
if(CUTLASS_NVCC_EMBED_PTX)
list(APPEND CODES compute_${ARCH})
if(CUTLASS_NVCC_EMBED_PTX AND NOT CUTLASS_CLANG_DEVICE_COMPILE)
# If we're using clang for device compilation, the ptx is inserted
# via another command line option and the `-virtual` flags will cause an error.
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-virtual)
endif()
list(JOIN CODES "," CODES_STR)
list(APPEND NVCC_FLAGS -gencode=arch=compute_${ARCH},code=[${CODES_STR}])
endforeach()

if (NOT __SM_ARCHS)
if (CUDA_COMPILER MATCHES "[Cc]lang")
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${CLANG_FLAGS}>
)
elseif(CMAKE_VERSION GREATER_EQUAL 3.18)
set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS})
else()
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:${NVCC_FLAGS}>
)
endif()
else()
list(JOIN CLANG_FLAGS " " CLANG_FLAGS_STR)
list(JOIN NVCC_FLAGS " " STR_NVCC_FLAGS)
if (CUDA_COMPILER MATCHES "[Cc]lang")
if(${TARGET} MATCHES ".*\.cpp")
set_source_files_properties(${TARGET} PROPERTIES COMPILE_FLAGS ${CLANG_FLAGS_STR})
endif()
elseif(CMAKE_VERSION GREATER_EQUAL 3.18)
set_source_files_properties(${TARGET} PROPERTIES CUDA_ARCHITECTURES ${STR_NVCC_FLAGS})
else()
if(${TARGET} MATCHES ".*\.cu")
set_source_files_properties(${TARGET} PROPERTIES COMPILE_FLAGS ${STR_NVCC_FLAGS})
endif()
endif()
endif()
set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS})

endfunction()

Expand All @@ -588,8 +568,8 @@ set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTER

function(cutlass_apply_standard_compile_options TARGET)

if(CUDA_COMPILER MATCHES "[Cc]lang")
set(CUDA_COMPILE_LANGUAGE CXX)
if(CUTLASS_CLANG_DEVICE_COMPILE)
set(CUDA_COMPILE_LANGUAGE CUDA)
set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS})
set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
Expand Down Expand Up @@ -682,8 +662,6 @@ target_include_directories(
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
)

# Mark CTK headers as system to suppress warnings from them
Expand Down Expand Up @@ -825,7 +803,7 @@ function(cutlass_add_executable_tests NAME TARGET)
# TEST_SETS_SUPPORTED: A list of test set names these tests support.
#

set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(options DISABLE_EXECUTABLE_INSTALL_RULE DO_NOT_LOWERCASE_TEST_NAME)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS TEST_SETS_SUPPORTED)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
Expand Down Expand Up @@ -915,11 +893,15 @@ function(cutlass_add_executable_tests NAME TARGET)
foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS)

if (CMD_COUNT GREATER 1)
string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TESTCASE_NAME)
set(TESTCASE_NAME "${NAME}_${CMD_OPTIONS_VAR}")
else()
string(TOLOWER "${NAME}" TESTCASE_NAME)
set(TESTCASE_NAME "${NAME}")
endif()

if (NOT __DO_NOT_LOWERCASE_TEST_NAME)
string(TOLOWER "${TESTCASE_NAME}" TESTCASE_NAME)
endif()

# The following rigmarole is needed to deal with spaces and possible quotes in
# command line arguments. The options are passed "by reference" as the actual
# variable names holding the real options. We then expand these in a way that
Expand Down
Loading

0 comments on commit 3d261a5

Please sign in to comment.