CUTLASS 2.3 initial commit (#134)
CUTLASS 2.3 adds GEMMs targeting Sparse Tensor Cores on the NVIDIA Ampere Architecture, fast SGEMM, small matrix classes, bug fixes, and performance enhancements.
kerrmudgeon authored Sep 23, 2020
1 parent 4dac749 commit c53f333
Showing 209 changed files with 46,919 additions and 1,674 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

# CUTLASS 2.x

## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23)
* [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
* [Sparse Tensor Core GEMM kernels](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu):
* Direct access to Sparse Tensor Cores and maximum performance via [`mma.sp.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends)
* Fast SGEMM targeting GeForce RTX 30-series CUDA Cores
* Minor Features:
* [Activation functions](/include/cutlass/epilogue/thread/activation.h) such as [GeLU](/include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](/include/cutlass/epilogue/thread/linear_combination_sigmoid.h)
* Small [matrix](/include/cutlass/matrix.h) and [quaternion](/include/cutlass/quaternion.h) template classes in device code
* [Floating-point constants](/include/cutlass/constants.h)
* NVIDIA Ampere GPU Architecture examples and documentation:
* [Tensor Float 32](/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and
* [Sparse Tensor Cores](/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu)
* Documentation added on CUTLASS [efficient row-major epilogue](/media/docs/gemm_api.md#efficient-epilogue)
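
The new activation-function epilogues listed above plug into device-level GEMMs as the epilogue output operator. Below is a minimal sketch, not part of this diff; the functor name and template-argument order are assumed to mirror `LinearCombination` (output element, per-access vector width, accumulator type, compute type), so verify against `include/cutlass/epilogue/thread/linear_combination_gelu.h`.

```
// Illustrative sketch only (not part of this commit). The functor name and
// signature are assumptions; check linear_combination_gelu.h.
#include "cutlass/half.h"
#include "cutlass/epilogue/thread/linear_combination_gelu.h"

// Epilogue computing D = GELU(alpha * accumulator + beta * C).
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationGELU<
    cutlass::half_t,   // output element type
    8,                 // elements per vectorized memory access
    float,             // accumulator element type
    float>;            // compute type for alpha and beta

// EpilogueOp would be supplied as the EpilogueOutputOp template argument of a
// device-level GEMM such as cutlass::gemm::device::Gemm.
```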

## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08)
* [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
* Fast Tensor Core operations:
16 changes: 14 additions & 2 deletions CMakeLists.txt
@@ -32,7 +32,7 @@ endif()

message(STATUS "CMake Version: ${CMAKE_VERSION}")

project(CUTLASS VERSION 2.2.0 LANGUAGES CXX)
project(CUTLASS VERSION 2.3.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

find_package(Doxygen QUIET)
@@ -69,6 +69,8 @@ endif()

set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples")
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Library")
set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Profiler")

if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_TOOLS_INIT})
@@ -101,6 +103,9 @@ endif()
if (NOT CUDA_VERSION VERSION_LESS 11.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
endif()
if (NOT CUDA_VERSION VERSION_LESS 11.1)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.")
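
As an illustrative aside (not part of this diff): `CUTLASS_NVCC_ARCHS` controls which SM targets NVCC compiles for, while device code selects architecture-specific paths at compile time through the standard `__CUDA_ARCH__` macro, as in the sketch below.

```
// Illustrative only: device code guards Ampere-specific paths on the compute
// capability being compiled, which is determined by CUTLASS_NVCC_ARCHS.
__device__ float scale(float x) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
  // Compiled only for SM 8.0 / 8.6 targets.
  return 2.0f * x;
#else
  // Fallback for older architectures.
  return x;
#endif
}
```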

@@ -164,12 +169,14 @@ set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.
#
set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.")
set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")


# Test Levels L0, L1, L2
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
set_property(CACHE CUTLASS_TEST_LEVEL PROPERTY STRINGS 0 1 2)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_TEST_LEVEL=${CUTLASS_TEST_LEVEL})
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -DCUTLASS_TEST_LEVEL=${CUTLASS_TEST_LEVEL})

#
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
@@ -181,6 +188,11 @@ else()
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
endif()

# Trace levels for debugging
set(CUTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_TRACE_LEVEL})
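
`CUTLASS_DEBUG_TRACE_LEVEL` is forwarded to the compiler as a preprocessor define, so host and device code can gate diagnostic output on it. A hypothetical example of such a guard (the helper below is illustrative and not part of CUTLASS):

```
// Hypothetical helper gated on the CUTLASS_DEBUG_TRACE_LEVEL define set above.
#include <cstdio>

#if !defined(CUTLASS_DEBUG_TRACE_LEVEL)
#define CUTLASS_DEBUG_TRACE_LEVEL 0
#endif

inline void trace_problem_size(int m, int n, int k) {
#if CUTLASS_DEBUG_TRACE_LEVEL >= 1
  std::printf("GEMM problem size: %d x %d x %d\n", m, n, k);
#endif
}
```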


set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
"Enable PTX mma instruction for collective matrix multiply operations.")

@@ -352,7 +364,7 @@ set_target_properties(CUTLASS PROPERTIES EXPORT_NAME cutlass)

set(CUTLASS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE PATH "CUTLASS Header Library")

set(CUTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library/)
set(CUTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library CACHE INTERNAL "Location of generator scripts")

# The following utility directory is needed even if the tools build is disabled, so it exists here.
set(CUTLASS_TOOLS_UTIL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/util/include CACHE INTERNAL "")
5 changes: 5 additions & 0 deletions CONTRIBUTORS.md
@@ -16,6 +16,9 @@ Naila Farooqui
Piotr Majcher
Paul Springer
Jin Wang
Aniket Shivam
Chinmay Talegaonkar
Shang Zhang
Scott Yokim
Markus Hohnerbach
Aditya Atluri
@@ -52,6 +55,8 @@ Olivier Giroux
Stephen Jones
Rishkul Kulkarni
Bryce Lelbach
Matthew Nicely
Joel McCormack
Kyrylo Perelygin


9 changes: 8 additions & 1 deletion CUDA.cmake
@@ -213,7 +213,14 @@ function(cutlass_correct_source_file_language_property)
endif()
endfunction()

set(CUTLASS_UNITY_BUILD_ENABLED OFF CACHE BOOL "Enable combined source compilation")
# If building with all kernels, set UNITY build on by default.
if (CUTLASS_LIBRARY_KERNELS MATCHES "all")
set(CUTLASS_UNITY_BUILD_ENABLED_INIT ON)
else()
set(CUTLASS_UNITY_BUILD_ENABLED_INIT OFF)
endif()

set(CUTLASS_UNITY_BUILD_ENABLED ${CUTLASS_UNITY_BUILD_ENABLED_INIT} CACHE BOOL "Enable combined source compilation")
set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files")

function(cutlass_unify_source_files TARGET_ARGS_VAR)
41 changes: 30 additions & 11 deletions README.md
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 2.2
# CUTLASS 2.3

_CUTLASS 2.2 - June 2020_
_CUTLASS 2.3 - September 2020_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -30,6 +30,14 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [functionality listing](media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.

# What's New in CUTLASS 2.3

CUTLASS 2.3 is a minor update to CUTLASS adding:
- GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs
- Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- See the [CHANGELOG](CHANGELOG.md) for more details.
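
A minimal sketch of driving an SGEMM through the device-level API (not part of this diff; it mirrors the pattern of examples/00_basic_gemm). The default template arguments select SIMT (CUDA core) kernels, so on GeForce RTX 30-series parts this exercises the fast SGEMM path noted above; the pointers are assumed to be valid device allocations in column-major layout.

```
// Illustrative sketch: single-precision GEMM via cutlass::gemm::device::Gemm,
// following the 00_basic_gemm example. A, B, C are device pointers,
// column-major, with leading dimensions lda, ldb, ldc.
#include "cutlass/gemm/device/gemm.h"

cutlass::Status run_sgemm(int M, int N, int K,
                          float alpha, float const *A, int lda,
                          float const *B, int ldb,
                          float beta, float *C, int ldc) {
  using ColumnMajor = cutlass::layout::ColumnMajor;
  using Gemm = cutlass::gemm::device::Gemm<float, ColumnMajor,   // A
                                           float, ColumnMajor,   // B
                                           float, ColumnMajor>;  // C and D
  Gemm gemm_op;
  Gemm::Arguments args({M, N, K},       // GEMM problem dimensions
                       {A, lda},        // TensorRef to A
                       {B, ldb},        // TensorRef to B
                       {C, ldc},        // TensorRef to C (source)
                       {C, ldc},        // TensorRef to D (destination)
                       {alpha, beta});  // linear scaling in the epilogue
  return gemm_op(args);                 // launches the kernel
}
```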

# What's New in CUTLASS 2.2

CUTLASS 2.2 is a significant update to CUTLASS adding:
@@ -42,7 +50,7 @@ CUTLASS 2.2 is a significant update to CUTLASS adding:

# What's New in CUTLASS 2.1

CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
CUTLASS 2.1 is a minor update to CUTLASS adding:

- [Planar complex GEMM kernels](/examples/10_planar_complex/planar_complex.cu) targeting Volta and Turing Tensor Cores
- BLAS-style API to launch kernels compiled into the [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
@@ -71,8 +79,8 @@ using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's
# Compatibility

CUTLASS requires a C++11 host compiler and
performs best when built with the [CUDA 11.0 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, and CUDA 10.2.
performs best when built with the [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, CUDA 10.2, and CUDA 11.0.

We have tested the following environments.

@@ -99,10 +107,11 @@ any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GP
|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2|
|NVIDIA Tesla T4|7.5|10.0|10.2|
|NVIDIA A100|8.0|11.0|11.0|
|NVIDIA GeForce 3090|8.6|11.1|11.1|

# Documentation

CUTLASS 2.2 is described in the following documents and the accompanying
CUTLASS is described in the following documents and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).

- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
@@ -136,14 +145,14 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
```

Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, and 8.0. To reduce compile time you can specify
for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce compile time you can specify
the architectures to build CUTLASS for by changing the CMake configuration setting
`CUTLASS_NVCC_ARCHS`.

```
$ mkdir build && cd build
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 # compiles for NVIDIA's Turing GPU architecture
$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture
```

From the `build/` directory, compile and run the CUTLASS unit tests by building the target `test_unit` with make.
@@ -258,15 +267,25 @@ The `tools/profiler/` directory contains a command-line utility for launching ea
It can be built as follows:

```
$ make cutlass_profiler -j
$ make cutlass_profiler -j16
```

To limit compilation time, only one tile size is instantiated for each data type, math instruction, and layout.
By default, only one tile size is instantiated for each data type, math instruction, and layout.
To instantiate all, set the following environment variable when running CMake from an empty `build/` directory.
Beware, this results in *thousands* of kernels and long build times.
```
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all
...
$ make cutlass_profiler -j
$ make cutlass_profiler -j16
```

To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with
wildcard characters may be used to reduce the set of kernels. The following builds exactly one kernel:

```
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
...
$ make cutlass_profiler -j16
```

Example command line for profiling SGEMM kernels is as follows:
7 changes: 5 additions & 2 deletions examples/02_dump_reg_shmem/dump_reg_shmem.cu
@@ -69,7 +69,7 @@
template <typename Element, typename GmemIterator, typename SmemIterator>
__global__ void kernel_dump(typename GmemIterator::Params params,
typename GmemIterator::TensorRef ref) {
__shared__ Element shared_storage[EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL];
extern __shared__ Element shared_storage[];

// Construct the global iterator and load the data to the fragments.
int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;
@@ -164,8 +164,11 @@ int main() {
dim3 grid(1, 1);
dim3 block(32, 1, 1);

int smem_size =
int(sizeof(Element) * EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL);

kernel_dump<Element, GmemIterator, SmemIterator>
<<<grid, block>>>(params, matrix.device_ref());
<<<grid, block, smem_size, 0>>>(params, matrix.device_ref());

cudaError_t result = cudaDeviceSynchronize();

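The change above replaces a statically sized `__shared__` array with dynamically sized shared memory whose byte count is passed as the third `<<<>>>` launch parameter. A generic sketch of that pattern (illustrative names, not from this commit):

```
// Illustrative sketch of the dynamic shared memory pattern used above.
// Assumes n <= 256 (one thread per element in a single block).
#include <cuda_runtime.h>

__global__ void copy_through_shared(float *out, int n) {
  extern __shared__ float smem[];   // size is fixed at launch time, not compile time
  int i = threadIdx.x;
  if (i < n) {
    smem[i] = static_cast<float>(i);
  }
  __syncthreads();
  if (i < n) {
    out[i] = smem[i];
  }
}

void launch_copy_through_shared(float *out, int n, cudaStream_t stream) {
  size_t smem_size = sizeof(float) * n;              // bytes of dynamic shared memory
  copy_through_shared<<<1, 256, smem_size, stream>>>(out, n);
}
```
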
9 changes: 7 additions & 2 deletions examples/10_planar_complex/planar_complex.cu
@@ -50,7 +50,7 @@
To build strictly the planar complex kernels needed for general application, execute the following
CMake command in an empty build directory.
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
This builds all planar complex GEMM variants for Volta and Turing architectures.
@@ -59,7 +59,7 @@
specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn
$ make 10_planar_complex
@@ -526,6 +526,11 @@ int main(int argc, char const **args) {
return 0;
}
}
else {
// NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
//
// fall through
}

//
// Parse options
9 changes: 7 additions & 2 deletions examples/11_planar_complex_array/planar_complex_array.cu
@@ -48,7 +48,7 @@
To build strictly the planar complex kernels needed for general application, execute the following
CMake command in an empty build directory.
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
This builds all planar complex GEMM variants for Volta and Turing architectures.
@@ -57,7 +57,7 @@
specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
$ make 11_planar_complex_array
@@ -586,6 +586,11 @@ int main(int argc, char const **args) {
return 0;
}
}
else {
// NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
//
// fall through
}

//
// Parse options