CUTLASS 2.3 initial commit (#134)
CUTLASS 2.3 adds GEMMs targeting Sparse Tensor Cores on the NVIDIA Ampere Architecture, fast SGEMM, small matrix classes, bug fixes, and performance enhancements.
kerrmudgeon authored Sep 23, 2020
1 parent 4dac749 commit c53f333
Showing 209 changed files with 46,919 additions and 1,674 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

# CUTLASS 2.x

## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23)
* [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
* [Sparse Tensor Core GEMM kernels](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu):
* Direct access to Sparse Tensor Cores and maximum performance via [`mma.sp.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends)
* Fast SGEMM targeting GeForce RTX 30-series CUDA Cores
* Minor Features:
* [Activation functions](/include/cutlass/epilogue/thread/activation.h) such as [GeLU](/include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](/include/cutlass/epilogue/thread/linear_combination_sigmoid.h)
* Small [matrix](/include/cutlass/matrix.h) and [quaternion](/include/cutlass/quaternion.h) template classes in device code
* [Floating-point constants](/include/cutlass/constants.h)
* NVIDIA Ampere GPU Architecture examples and documentation:
* [Tensor Float 32](/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and
* [Sparse Tensor Cores](/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu)
* Documentation added on CUTLASS [efficient row-major epilogue](/media/docs/gemm_api.md#efficient-epilogue)
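
The new activation-function epilogues listed above plug into device-level GEMMs as the epilogue output operator. Below is a minimal sketch, not part of this diff; the functor name and template-argument order are assumed to mirror `LinearCombination` (output element, per-access vector width, accumulator type, compute type), so verify against `include/cutlass/epilogue/thread/linear_combination_gelu.h`.

```
// Illustrative sketch only (not part of this commit). The functor name and
// signature are assumptions; check linear_combination_gelu.h.
#include "cutlass/half.h"
#include "cutlass/epilogue/thread/linear_combination_gelu.h"

// Epilogue computing D = GELU(alpha * accumulator + beta * C).
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationGELU<
    cutlass::half_t,   // output element type
    8,                 // elements per vectorized memory access
    float,             // accumulator element type
    float>;            // compute type for alpha and beta

// EpilogueOp would be supplied as the EpilogueOutputOp template argument of a
// device-level GEMM such as cutlass::gemm::device::Gemm.
```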

## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08)
* [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
* Fast Tensor Core operations:
16 changes: 14 additions & 2 deletions CMakeLists.txt
@@ -32,7 +32,7 @@ endif()

message(STATUS "CMake Version: ${CMAKE_VERSION}")

project(CUTLASS VERSION 2.2.0 LANGUAGES CXX)
project(CUTLASS VERSION 2.3.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

find_package(Doxygen QUIET)
@@ -69,6 +69,8 @@ endif()

set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples")
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Library")
set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Profiler")

if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_TOOLS_INIT})
@@ -101,6 +103,9 @@ endif()
if (NOT CUDA_VERSION VERSION_LESS 11.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
endif()
if (NOT CUDA_VERSION VERSION_LESS 11.1)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.")
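
As an illustrative aside (not part of this diff): `CUTLASS_NVCC_ARCHS` controls which SM targets NVCC compiles for, while device code selects architecture-specific paths at compile time through the standard `__CUDA_ARCH__` macro, as in the sketch below.

```
// Illustrative only: device code guards Ampere-specific paths on the compute
// capability being compiled, which is determined by CUTLASS_NVCC_ARCHS.
__device__ float scale(float x) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
  // Compiled only for SM 8.0 / 8.6 targets.
  return 2.0f * x;
#else
  // Fallback for older architectures.
  return x;
#endif
}
```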

@@ -164,12 +169,14 @@ set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.
#
set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.")
set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")


# Test Levels L0, L1, L2
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
set_property(CACHE CUTLASS_TEST_LEVEL PROPERTY STRINGS 0 1 2)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_TEST_LEVEL=${CUTLASS_TEST_LEVEL})
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -DCUTLASS_TEST_LEVEL=${CUTLASS_TEST_LEVEL})

#
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
@@ -181,6 +188,11 @@ else()
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
endif()

# Trace levels for debugging
set(CUTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_TRACE_LEVEL})
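
`CUTLASS_DEBUG_TRACE_LEVEL` is forwarded to the compiler as a preprocessor define, so host and device code can gate diagnostic output on it. A hypothetical example of such a guard (the helper below is illustrative and not part of CUTLASS):

```
// Hypothetical helper gated on the CUTLASS_DEBUG_TRACE_LEVEL define set above.
#include <cstdio>

#if !defined(CUTLASS_DEBUG_TRACE_LEVEL)
#define CUTLASS_DEBUG_TRACE_LEVEL 0
#endif

inline void trace_problem_size(int m, int n, int k) {
#if CUTLASS_DEBUG_TRACE_LEVEL >= 1
  std::printf("GEMM problem size: %d x %d x %d\n", m, n, k);
#endif
}
```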


set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
"Enable PTX mma instruction for collective matrix multiply operations.")

@@ -352,7 +364,7 @@ set_target_properties(CUTLASS PROPERTIES EXPORT_NAME cutlass)

set(CUTLASS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE PATH "CUTLASS Header Library")

set(CUTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library/)
set(CUTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library CACHE INTERNAL "Location of generator scripts")

# The following utility directory is needed even if the tools build is disabled, so it exists here.
set(CUTLASS_TOOLS_UTIL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/util/include CACHE INTERNAL "")
5 changes: 5 additions & 0 deletions CONTRIBUTORS.md
@@ -16,6 +16,9 @@ Naila Farooqui
Piotr Majcher
Paul Springer
Jin Wang
Aniket Shivam
Chinmay Talegaonkar
Shang Zhang
Scott Yokim
Markus Hohnerbach
Aditya Atluri
@@ -52,6 +55,8 @@ Olivier Giroux
Stephen Jones
Rishkul Kulkarni
Bryce Lelbach
Matthew Nicely
Joel McCormack
Kyrylo Perelygin


9 changes: 8 additions & 1 deletion CUDA.cmake
@@ -213,7 +213,14 @@ function(cutlass_correct_source_file_language_property)
endif()
endfunction()

set(CUTLASS_UNITY_BUILD_ENABLED OFF CACHE BOOL "Enable combined source compilation")
# If building with all kernels, set UNITY build on by default.
if (CUTLASS_LIBRARY_KERNELS MATCHES "all")
set(CUTLASS_UNITY_BUILD_ENABLED_INIT ON)
else()
set(CUTLASS_UNITY_BUILD_ENABLED_INIT OFF)
endif()

set(CUTLASS_UNITY_BUILD_ENABLED ${CUTLASS_UNITY_BUILD_ENABLED_INIT} CACHE BOOL "Enable combined source compilation")
set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files")

function(cutlass_unify_source_files TARGET_ARGS_VAR)
41 changes: 30 additions & 11 deletions README.md
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 2.2
# CUTLASS 2.3

_CUTLASS 2.2 - June 2020_
_CUTLASS 2.3 - September 2020_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -30,6 +30,14 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [functionality listing](media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.

# What's New in CUTLASS 2.3

CUTLASS 2.3 is a minor update to CUTLASS adding:
- GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs
- Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- See the [CHANGELOG](CHANGELOG.md) for more details.
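
A minimal sketch of driving an SGEMM through the device-level API (not part of this diff; it mirrors the pattern of examples/00_basic_gemm). The default template arguments select SIMT (CUDA core) kernels, so on GeForce RTX 30-series parts this exercises the fast SGEMM path noted above; the pointers are assumed to be valid device allocations in column-major layout.

```
// Illustrative sketch: single-precision GEMM via cutlass::gemm::device::Gemm,
// following the 00_basic_gemm example. A, B, C are device pointers,
// column-major, with leading dimensions lda, ldb, ldc.
#include "cutlass/gemm/device/gemm.h"

cutlass::Status run_sgemm(int M, int N, int K,
                          float alpha, float const *A, int lda,
                          float const *B, int ldb,
                          float beta, float *C, int ldc) {
  using ColumnMajor = cutlass::layout::ColumnMajor;
  using Gemm = cutlass::gemm::device::Gemm<float, ColumnMajor,   // A
                                           float, ColumnMajor,   // B
                                           float, ColumnMajor>;  // C and D
  Gemm gemm_op;
  Gemm::Arguments args({M, N, K},       // GEMM problem dimensions
                       {A, lda},        // TensorRef to A
                       {B, ldb},        // TensorRef to B
                       {C, ldc},        // TensorRef to C (source)
                       {C, ldc},        // TensorRef to D (destination)
                       {alpha, beta});  // linear scaling in the epilogue
  return gemm_op(args);                 // launches the kernel
}
```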

# What's New in CUTLASS 2.2

CUTLASS 2.2 is a significant update to CUTLASS adding:
@@ -42,7 +50,7 @@ CUTLASS 2.2 is a significant update to CUTLASS adding:

# What's New in CUTLASS 2.1

CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
CUTLASS 2.1 is a minor update to CUTLASS adding:

- [Planar complex GEMM kernels](/examples/10_planar_complex/planar_complex.cu) targeting Volta and Turing Tensor Cores
- BLAS-style API to launch kernels compiled into the [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
@@ -71,8 +79,8 @@ using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's
# Compatibility

CUTLASS requires a C++11 host compiler and
performs best when built with the [CUDA 11.0 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, and CUDA 10.2.
performs best when built with the [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, CUDA 10.2, and CUDA 11.0.

We have tested the following environments.

@@ -99,10 +107,11 @@ any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GP
|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2|
|NVIDIA Tesla T4|7.5|10.0|10.2|
|NVIDIA A100|8.0|11.0|11.0|
|NVIDIA GeForce 3090|8.6|11.1|11.1|

# Documentation

CUTLASS 2.2 is described in the following documents and the accompanying
CUTLASS is described in the following documents and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).

- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
@@ -136,14 +145,14 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
```

Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, and 8.0. To reduce compile time you can specify
for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce compile time you can specify
the architectures to build CUTLASS for by changing the CMake configuration setting
`CUTLASS_NVCC_ARCHS`.

```
$ mkdir build && cd build
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 # compiles for NVIDIA's Turing GPU architecture
$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture
```

From the `build/` directory, compile and run the CUTLASS unit tests by building the target `test_unit` with make.
@@ -258,15 +267,25 @@ The `tools/profiler/` directory contains a command-line utility for launching ea
It can be built as follows:

```
$ make cutlass_profiler -j
$ make cutlass_profiler -j16
```

To limit compilation time, only one tile size is instantiated for each data type, math instruction, and layout.
By default, only one tile size is instantiated for each data type, math instruction, and layout.
To instantiate all, set the following environment variable when running CMake from an empty `build/` directory.
Beware, this results in *thousands* of kernels and long build times.
```
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all
...
$ make cutlass_profiler -j
$ make cutlass_profiler -j16
```

To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with
wildcard characters may be used to reduce the set of kernels. The following builds exactly one kernel:

```
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
...
$ make cutlass_profiler -j16
```

Example command line for profiling SGEMM kernels is as follows:
7 changes: 5 additions & 2 deletions examples/02_dump_reg_shmem/dump_reg_shmem.cu
@@ -69,7 +69,7 @@
template <typename Element, typename GmemIterator, typename SmemIterator>
__global__ void kernel_dump(typename GmemIterator::Params params,
typename GmemIterator::TensorRef ref) {
__shared__ Element shared_storage[EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL];
extern __shared__ Element shared_storage[];

// Construct the global iterator and load the data to the fragments.
int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;
@@ -164,8 +164,11 @@ int main() {
dim3 grid(1, 1);
dim3 block(32, 1, 1);

int smem_size =
int(sizeof(Element) * EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL);

kernel_dump<Element, GmemIterator, SmemIterator>
<<<grid, block>>>(params, matrix.device_ref());
<<<grid, block, smem_size, 0>>>(params, matrix.device_ref());

cudaError_t result = cudaDeviceSynchronize();

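The change above replaces a statically sized `__shared__` array with dynamically sized shared memory whose byte count is passed as the third `<<<>>>` launch parameter. A generic sketch of that pattern (illustrative names, not from this commit):

```
// Illustrative sketch of the dynamic shared memory pattern used above.
// Assumes n <= 256 (one thread per element in a single block).
#include <cuda_runtime.h>

__global__ void copy_through_shared(float *out, int n) {
  extern __shared__ float smem[];   // size is fixed at launch time, not compile time
  int i = threadIdx.x;
  if (i < n) {
    smem[i] = static_cast<float>(i);
  }
  __syncthreads();
  if (i < n) {
    out[i] = smem[i];
  }
}

void launch_copy_through_shared(float *out, int n, cudaStream_t stream) {
  size_t smem_size = sizeof(float) * n;              // bytes of dynamic shared memory
  copy_through_shared<<<1, 256, smem_size, stream>>>(out, n);
}
```
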
9 changes: 7 additions & 2 deletions examples/10_planar_complex/planar_complex.cu
@@ -50,7 +50,7 @@
To build strictly the planar complex kernels needed for general application, execute the following
CMake command in an empty build directory.
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
This builds all planar complex GEMM variants for Volta and Turing architectures.
@@ -59,7 +59,7 @@
specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn
$ make 10_planar_complex
@@ -526,6 +526,11 @@ int main(int argc, char const **args) {
return 0;
}
}
else {
// NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
//
// fall through
}

//
// Parse options
9 changes: 7 additions & 2 deletions examples/11_planar_complex_array/planar_complex_array.cu
@@ -48,7 +48,7 @@
To build strictly the planar complex kernels needed for general application, execute the following
CMake command in an empty build directory.
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
This builds all planar complex GEMM variants for Volta and Turing architectures.
@@ -57,7 +57,7 @@
specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
$ make 11_planar_complex_array
@@ -586,6 +586,11 @@ int main(int argc, char const **args) {
return 0;
}
}
else {
// NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
//
// fall through
}

//
// Parse options