Initial commit from internal repo

NVIDIA · Oct 25, 2021 · d8505c9 · d8505c9
1 parent b59498f
commit d8505c9
Show file tree

Hide file tree

Showing 140 changed files with 33,009 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,213 @@
+cmake_minimum_required(VERSION 3.18)
+
+# Record whether MatX is being configured on its own or from inside another project.
+# PROJECT_NAME is already defined at this point only if an enclosing project() call has run.
+# The result is used later to decide whether config files are generated.
+if(DEFINED PROJECT_NAME)
+  set(NOT_SUBPROJECT OFF)
+else()
+  set(NOT_SUBPROJECT ON)
+endif()
+
+# Command line options
+option(BUILD_EXAMPLES "Build examples" OFF)
+option(BUILD_TESTS "Build unit tests" OFF)
+option(BUILD_BENCHMARKS "Build benchmarks" OFF)
+option(BUILD_DOCS "Build documentation. Mutually exclusive with all other options" OFF)
+option(BUILD_32_BIT "Build with 32-bit indexing support" OFF)
+option(MULTI_GPU "Multi-GPU support" OFF)
+option(EN_VISUALIZATION "Enable visualization support" OFF)
+# option() takes (<variable> "<help text>" [value]); without a help string the literal "OFF"
+# would be stored as the description.
+option(EN_CUTLASS "Enable CUTLASS support" OFF)
+# GPU_ARCH carries a semicolon-separated list rather than a boolean, so it is declared as a
+# STRING cache entry instead of with option(). The empty default evaluates as false in if(),
+# just like the previous OFF default did.
+set(GPU_ARCH "" CACHE STRING "List of GPU architectures to build for, separated by semicolon")
+
+# Building documentation is mutually exclusive with everything else, and doesn't require CUDA
+if (BUILD_DOCS)
+    project(MATX_DOCS)
+    add_subdirectory(docs)
+    return()
+endif()
+
+# Everything from here on runs after the BUILD_DOCS check, so a docs-only build never downloads
+# rapids-cmake or looks for CUDA.
+#
+# In an upcoming CMake it will have the capability to auto-detect GPU architectures. For now, rapids-cmake has a utility
+# function to do it, so we grab that as a dependency. The user can optionally override GPU_ARCH to specify
+# their own.
+#
+# RAPIDS.cmake is only downloaded when it is not already in the build tree, and a failed download stops
+# the configure step with the reported status instead of including an empty or partial file.
+set(matx_rapids_file "${CMAKE_CURRENT_BINARY_DIR}/RAPIDS.cmake")
+if(NOT EXISTS "${matx_rapids_file}")
+    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake
+         "${matx_rapids_file}"
+         STATUS matx_rapids_download_status)
+    list(GET matx_rapids_download_status 0 matx_rapids_download_code)
+    if(NOT matx_rapids_download_code EQUAL 0)
+        # Remove whatever was written so the next configure retries the download
+        file(REMOVE "${matx_rapids_file}")
+        message(FATAL_ERROR "Failed to download RAPIDS.cmake: ${matx_rapids_download_status}")
+    endif()
+endif()
+include("${matx_rapids_file}")
+
+include(rapids-cmake)
+include(rapids-cpm)
+include(rapids-cuda)
+include(rapids-export)
+include(rapids-find)
+
+# Pick the GPU architectures before project() enables the CUDA language. The rapids-cmake
+# documentation requires rapids_cuda_init_architectures() to be called ahead of the project() call
+# it is named after; calling it afterwards leaves the detection step without any effect.
+if(GPU_ARCH)
+    message(STATUS "Using GPU architectures ${GPU_ARCH}")
+    set(CMAKE_CUDA_ARCHITECTURES ${GPU_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES "NATIVE")
+    rapids_cuda_init_architectures(MATX)
+endif()
+
+project(MATX
+        LANGUAGES CUDA CXX
+        DESCRIPTION "A modern and efficient header-only C++ library for numerical computing on GPU"
+        VERSION 0.0.13
+        HOMEPAGE_URL "https://github.com/NVIDIA/MatX")
+
+# Figure out what GPU arch they're on if it's not specified. If we can't figure it out and it's not specified, fall back to 70;80
+if(NOT GPU_ARCH)
+    if (NOT CMAKE_CUDA_ARCHITECTURES)
+        message(STATUS "Tried to detect GPU architecture of current machine, but failed. Falling back to 70;80")
+        set(CMAKE_CUDA_ARCHITECTURES "70;80")
+    else()
+        message(STATUS "GPU_ARCH not specified. Using GPU architectures of this machine for building (${CMAKE_CUDA_ARCHITECTURES})")
+    endif()
+endif()
+
+# Needs the project version, so this has to come after project()
+rapids_cmake_write_version_file(include/version_config.h)
+
+
+# MatX requires C++17 to build. Enforce on all libraries pulled in as well.
+# CMAKE_CXX_STANDARD / CMAKE_CUDA_STANDARD are the variables CMake uses to initialize the
+# per-target language standard; the *_REQUIRED switches turn an unsupported standard into a
+# configure error instead of a silent fallback to an older one.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+# CUDA_CXX_STANDARD is not a variable CMake itself reads. It is kept only in case another
+# script in the project refers to it.
+set(CUDA_CXX_STANDARD 17)
+
+# CPM is required for all package management
+include(cmake/GetCPM.cmake)
+# Helper for selecting build type
+include(cmake/BuildType.cmake)
+
+rapids_find_package(
+  CUDAToolkit 11.5 REQUIRED
+  BUILD_EXPORT_SET matx-exports
+  INSTALL_EXPORT_SET matx-exports)
+
+rapids_cpm_init()
+
+# The matx INTERFACE target carries every usage requirement (include paths, language level,
+# compiler switches) to both our own build and to external consumers, who link matx::matx.
+add_library(matx INTERFACE)
+add_library(matx::matx ALIAS matx)
+
+# Header locations: source tree while building, install prefix once installed
+target_include_directories(matx
+    INTERFACE
+        "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+        "$<INSTALL_INTERFACE:include>"
+        "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/kernels>"
+        "$<INSTALL_INTERFACE:include/kernels>")
+
+# C++17 for every consumer; the CUDA standard requirement is only attached in the build tree
+target_compile_features(matx
+    INTERFACE
+        cxx_std_17
+        "$<BUILD_INTERFACE:cuda_std_17>")
+
+# Passed to CUDA compilations only
+target_compile_options(matx
+    INTERFACE
+        "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")
+
+# Set flags for compiling tests faster.
+# CMAKE_CUDA_FLAGS is a single space-separated string; split it so that each flag becomes its
+# own list element instead of one combined element.
+separate_arguments(matx_user_cuda_flags NATIVE_COMMAND "${CMAKE_CUDA_FLAGS}")
+set(MATX_CUDA_FLAGS ${matx_user_cuda_flags} --threads 0)
+
+# Add debug information when no build type is selected or for Debug builds. The expansion is
+# quoted so that an empty CMAKE_BUILD_TYPE still leaves a well-formed condition.
+if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    list(APPEND MATX_CUDA_FLAGS -g -lineinfo)
+endif()
+
+# Set preferred compiler warning flags
+set(WARN_FLAGS  -Wall
+                -Wextra
+                -Werror all-warnings
+                -Wcast-align
+                -Wunused
+                -Wconversion
+                -Wno-unknown-pragmas
+                -Wnon-virtual-dtor
+                -Wshadow)
+
+# Extra warnings that are only added when the host C++ compiler identifies as GNU
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    list(APPEND WARN_FLAGS
+        -Wmisleading-indentation
+        -Wduplicated-cond
+        -Wduplicated-branches
+        -Wlogical-op
+        -Wnull-dereference
+        -Wuseless-cast)
+endif()
+
+# CUTLASS is opt-in because it slows down compile times when used. Either way consumers get an
+# ENABLE_CUTLASS definition (1 or 0), and CUTLASS_INC holds the CUTLASS include paths or is empty.
+set (CUTLASS_INC "")
+set (matx_cutlass_enabled 0)
+if (EN_CUTLASS)
+    include(cmake/GetCUTLASS.cmake)
+    set (CUTLASS_INC ${cutlass_SOURCE_DIR}/include/ ${cutlass_SOURCE_DIR}/tools/util/include/)
+    set (matx_cutlass_enabled 1)
+endif()
+target_compile_definitions(matx INTERFACE ENABLE_CUTLASS=${matx_cutlass_enabled})
+
+# Multi-GPU support needs NVSHMEM
+if (MULTI_GPU)
+    include(cmake/FindNvshmem.cmake)
+    find_package(Nvshmem REQUIRED)
+endif()
+
+# Python3 and pybind11 are needed as soon as any of the benchmark, test, visualization or
+# example options is switched on.
+set(matx_needs_python OFF)
+foreach(matx_python_consumer IN ITEMS BUILD_BENCHMARKS BUILD_TESTS EN_VISUALIZATION BUILD_EXAMPLES)
+    if (${matx_python_consumer})
+        set(matx_needs_python ON)
+    endif()
+endforeach()
+
+if (matx_needs_python)
+    include(cmake/GetPyBind11.cmake)
+    find_package(Python3  REQUIRED COMPONENTS Interpreter Development)
+    find_package(pybind11 REQUIRED)
+
+    # numpy has to be available to the Python installation
+    include(cmake/CheckPythonLibs.cmake)
+    check_python_libs("numpy")
+
+    # pybind11 asks for hidden symbol visibility, see
+    # https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-
+    # visibility-than-the-type-of-its-field-someclass-member-wattributes
+    target_compile_options(matx INTERFACE -fvisibility=hidden)
+    target_link_libraries(matx INTERFACE pybind11::embed)
+
+    # Visualization additionally relies on these Python libraries
+    if (EN_VISUALIZATION)
+        check_python_libs("plotly" "pandas")
+    endif()
+endif()
+
+# Install rules and package config files are only produced when MatX is the top-level project,
+# not when it was added as a subdirectory. By this point the matx target should carry every
+# build property implied by the options passed in.
+if (NOT_SUBPROJECT)
+    include(GNUInstallDirs)
+    include(CMakePackageConfigHelpers)
+
+    # The target itself, the headers from the source tree, and the generated version header
+    install(TARGETS matx EXPORT matx-exports)
+    install(DIRECTORY include/ DESTINATION include)
+    install(FILES ${CMAKE_BINARY_DIR}/include/version_config.h DESTINATION include)
+
+    set(doc_string
+    [=[
+    Provide targets for MatX.
+
+    [MatX](https://github.com/NVIDIA/MatX) provides a Python-like syntax for near-native speed
+    numerical computing on NVIDIA GPUs.
+    ]=])
+
+    # Same export settings twice: first for the install tree, then for the build tree
+    foreach(matx_export_mode IN ITEMS INSTALL BUILD)
+        rapids_export(
+            ${matx_export_mode} matx
+            EXPORT_SET matx-exports
+            GLOBAL_TARGETS matx
+            NAMESPACE matx::
+            DOCUMENTATION doc_string)
+    endforeach()
+endif()
+
+
+
+# Select the index-width macro, then define it both for this directory (and the subdirectories
+# added below) and for anything that links against the matx target.
+if (BUILD_32_BIT)
+    set(matx_index_definition INDEX_32_BIT)
+else()
+    set(matx_index_definition INDEX_64_BIT)
+endif()
+add_definitions(-D${matx_index_definition})
+target_compile_definitions(matx INTERFACE ${matx_index_definition})
+
+# Optional components, each behind its own option
+if (BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif()
+
+# Benchmarks pull in NVBench first
+if (BUILD_BENCHMARKS)
+    include(cmake/GetNVBench.cmake)
+    add_subdirectory(bench)
+endif()
+
+# Unit tests pull in GoogleTest first
+if (BUILD_TESTS)
+    include(cmake/GetGTest.cmake)
+    add_subdirectory(test)
+endif()
+
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,31 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2021, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
diff --git a/README.md b/README.md
@@ -0,0 +1,143 @@
+# MatX - Matrix Primitives Library
+
+MatX is a modern C++ library for numerical computing on NVIDIA GPUs. Near-native performance can be achieved while using a simple syntax common in higher-level languages such as Python or MATLAB.
+
+![FFT resampler](docs/img/fft_resamp.PNG)
+
+The above image shows the Python (NumPy) version of an FFT resampler next to the MatX version. The total runtimes of the NumPy version, CuPy version,
+and MatX version are shown below:
+
+* Python/Numpy: **4500ms** (Xeon(R) CPU E5-2698 v4 @ 2.20GHz)
+* CuPy: **10.6ms**  (A100)
+* MatX: **2.54ms** (A100)
+
+While the code complexity and length are roughly the same, the MatX version shows a **1771x** speedup over the NumPy version, and is over **4x** faster than
+the CuPy version on the same GPU.
+
+Key features include:
+
+* :zap: MatX is fast. By using existing, optimized libraries as a backend, and efficient kernel generation when needed, no hand-optimizations
+are necessary
+
+* :open_hands: MatX is easy to learn. Users familiar with high-level languages will pick up the syntax quickly
+
+* :bookmark_tabs: MatX easily integrates with existing libraries and code
+
+* :sparkler: Visualize data from the GPU right on a web browser
+
+* :arrow_up_down: IO capabilities for reading/writing files
+
+# Requirements
+MatX is using bleeding edge features in the CUDA compilers and libraries. For this reason, a minimum of CUDA 11.4 and g++9 is required currently.
+
+# Documentation
+Documentation for MatX can be built locally by enabling the ``BUILD_DOCS`` option described in the Building section below.
+
+# Supported Data Types
+MatX supports all types that use standard C++ operators for math (+, -, etc). Unit tests are run against all common types shown below. 
+
+* Integer: int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t
+* Floating Point: matxFp16 (fp16), matxBf16 (bfloat16), float, double
+* Complex: matxFp16Complex, matxBf16Complex, cuda::std::complex<float>, cuda::std::complex<double>
+
+Since CUDA half precision types (``__half`` and ``__nv_bfloat16``) do not support all C++ operators on the host side, MatX provides the ``matxFp16`` and
+``matxBf16`` types for scalars, and ``matxFp16Complex`` and ``matxBf16Complex`` for complex types. These wrappers are needed so that tensor
+views can be evaluated on both the host and device, regardless of CUDA or hardware support. When possible, the half types will use hardware-
+accelerated intrinsics automatically. Existing code using ``__half`` and ``__nv_bfloat16`` may be converted to the ``matx`` equivalent types directly
+and leverage all operators.
+
+# Building
+MatX is a header-only library that does not need to be compiled to be used in your applications. However, the unit tests, benchmarks,
+and examples must be compiled. CPM is used as a package manager for CMake to download and configure any dependencies. If MatX is to
+be used in an air-gapped environment, CPM [can be configured](https://github.com/cpm-cmake/CPM.cmake#cpm_source_cache) to search locally for files.
+Depending on what options are enabled, compiling could take very long without parallelism enabled. Using the ``-j`` flag on ``make`` is
+suggested with the highest number your system will accommodate. 
+
+To build all components, issue the standard cmake build commands in a cloned repo:
+
+```
+mkdir build && cd build
+cmake -DBUILD_TESTS=ON -DBUILD_BENCHMARKS=ON -DBUILD_EXAMPLES=ON ..
+make -j
+```
+
+By default CMake will target the GPU architecture(s) of the system you're compiling on. If you wish to target other architectures, pass the
+GPU_ARCH flag with a semicolon-separated list of architectures to build for (for example, ``-DGPU_ARCH="70;80"``).
+
+By default nothing is compiled. If you wish to compile certain options, use the CMake flags below with ON or OFF values:
+
+```
+BUILD_TESTS
+BUILD_BENCHMARKS
+BUILD_EXAMPLES
+BUILD_DOCS
+```
+
+For example, to disable unit test building:
+```
+mkdir build && cd build
+cmake -DBUILD_TESTS=OFF ..
+make -j
+```
+
+Note that if documentation is selected all other build options are off. This eases the dependencies needed to build documentation
+so large libraries such as CUDA don't need to be installed.
+
+## CMake
+MatX uses CMake as a first-class build generator, and therefore provides the proper config files to include into your own project. There are
+typically two ways to do this: adding a subdirectory and finding the package. 
+
+### MatX As A Subdirectory
+Adding the subdirectory is useful if you include the MatX
+source into the directory structure of your project. Using this method, you can simply add the MatX directory:
+
+```
+add_subdirectory(path/to/matx)
+```
+
+### MatX Installed To The System
+The other option is to install MatX and use the configuration file provided after building. This is typically done in a way similar to what is
+shown below:
+
+```
+cd /path/to/matx
+mkdir build && cd build
+cmake ..
+make && make install
+```
+
+If you have the correct permissions, the headers and cmake packages will be installed on your system in the expected paths for your operating
+system. With the package installed you can use ``find_package`` as follows:
+
+```
+find_package(matx CONFIG REQUIRED)
+```
+
+An example of using this method can be found in the examples/cmake_sample_project directory
+
+### MatX CMake Targets
+Once either of the two methods above is done, you can use the transitive target ``matx::matx`` in your library inside of ``target_link_libraries``.
+MatX may add other optional targets in the future inside the matx:: namespace as well.
+
+# Unit Tests
+MatX contains a suite of unit tests to test functionality of the primitive functions, plus end-to-end tests of example code.
+MatX uses [pybind11](https://github.com/pybind/pybind11) to generate some of the unit test inputs and outputs. This avoids
+the need to store large test vector files in git; instead, they can be generated as needed.
+
+To run the unit tests, from the cmake build directory run:
+```
+make test
+```
+
+This will execute all unit tests defined. If you wish to execute a subset of tests, or run with different options, you
+may run test/matx_test directly with parameters defined by [Google Test](https://github.com/google/googletest). To run matx_test
+directly, you must be inside the build/test directory for the correct paths to be set. For example,
+to run only tests with the name FFT:
+
+```
+cd build/test
+./matx_test --gtest_filter="*FFT*"
+```
+
+# Quick Start
+A [quick start guide](docs/quickstart.rst) can be found in the docs directory