Improve: Scaling CUDA kernels
Adds support for larger input arrays everywhere except the CUB variant.
Also adds a new interleaving Thrust variant that combines additions with
FMA, but it doesn't yield any performance improvement on the RTX 4060
(a rough sketch of the idea follows below).
ashvardanian committed Jan 19, 2025
1 parent aeabdce commit e57eff0
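For illustration only, since the kernel diff itself isn't shown here: an "interleaving" Thrust reduction that folds half of the additions into FMAs might look roughly like the sketch below. Everything in it (the interleaved_fma_sum functor, the halves-pairing scheme, the array size) is an assumption for this sketch and is not taken from the commit.

// Illustrative sketch: pair element i with element i + n/2 and fold each pair
// into a single fmaf(), so the two halves of the array are consumed in an
// interleaved fashion. Names and layout are hypothetical.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>

#include <cstddef>
#include <cstdio>

struct interleaved_fma_sum {
    float const *data;
    std::size_t half;
    __device__ float operator()(std::size_t i) const {
        // data[i] * 1.0f + data[i + half], computed as one FMA
        return fmaf(data[i], 1.0f, data[i + half]);
    }
};

int main() {
    std::size_t const n = std::size_t(1) << 24; // even length assumed
    thrust::device_vector<float> values(n, 1.0f);

    interleaved_fma_sum op{thrust::raw_pointer_cast(values.data()), n / 2};
    float const sum = thrust::transform_reduce( //
        thrust::device,                         //
        thrust::counting_iterator<std::size_t>(0),
        thrust::counting_iterator<std::size_t>(n / 2), //
        op, 0.0f, thrust::plus<float>());

    std::printf("sum = %.1f\n", sum); // expect 16777216.0
    return 0;
}

On paper this halves the number of standalone additions, but on hardware where FP32 add and FMA share the same throughput it is unlikely to help, which matches the observation in the commit message.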
Showing 2 changed files with 282 additions and 145 deletions.
142 changes: 107 additions & 35 deletions CMakeLists.txt
@@ -1,24 +1,78 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

# Define the CUDA architectures.
# Empty value is forbidden.
# Let's use a recent CMake version:
# 3.16+ for native sanitizers support
# 3.17+ for `FindCUDAToolkit`
# 3.25.2 for CUDA20 support
# The good news is that Ubuntu 24.04 comes with 3.28!
cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR)

# ------------------------------------------------------------------------------
# Project Setup
# ------------------------------------------------------------------------------
project(
ParallelReductionsBenchmark
LANGUAGES CXX
VERSION 0.1.0
)
LANGUAGES CXX
DESCRIPTION "Parallel Reductions Benchmark for CPUs & GPUs"
HOMEPAGE_URL "https://github.com/ashvardanian/ParallelReductionsBenchmark")

set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
set(CMAKE_CXX_EXTENSIONS NO)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED YES)

# Make Release by default
# Some extra logging for the user:
message(STATUS "----------------------------------------")
message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
message(STATUS "----------------------------------------")

# Default to Release if no build type is set:
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

# Get external content: Google Benchmark, cccl
# ------------------------------------------------------------------------------
# Detect CUDA Support
# ------------------------------------------------------------------------------
set(ENABLE_CUDA OFF)
include(CheckLanguage)
check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
set(ENABLE_CUDA ON)
message(STATUS "CUDA detected! Using compiler: ${CMAKE_CUDA_COMPILER}")
else()
message(STATUS "CUDA not detected. Skipping CUDA-specific builds.")
endif()

# ------------------------------------------------------------------------------
# Options
# ------------------------------------------------------------------------------
option(USE_INTEL_TBB "Use Intel TBB for parallel STL algorithms" ON)
option(USE_NVIDIA_CCCL "Use Nvidia CCCL for CUDA acceleration" ON)

# Enable or disable options based on system and CUDA support
if(ENABLE_CUDA)
set(USE_NVIDIA_CCCL ON)
set(USE_INTEL_TBB OFF) # Prioritize CUDA acceleration
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(USE_INTEL_TBB ON) # Default to TBB on Linux without CUDA
endif()

message(STATUS "USE_INTEL_TBB: ${USE_INTEL_TBB}")
message(STATUS "USE_NVIDIA_CCCL: ${USE_NVIDIA_CCCL}")

# ------------------------------------------------------------------------------
# Dependencies
# ------------------------------------------------------------------------------
find_package(Threads REQUIRED)
find_package(OpenMP QUIET)
find_package(OpenCL QUIET)

set(FETCHCONTENT_QUIET OFF)
include(FetchContent)

FetchContent_Declare(
@@ -44,26 +98,43 @@ set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_USE_BUNDLED_GTEST ON)
FetchContent_MakeAvailable(benchmark)

# Nvidia CCCL can be configured with Intel TBB.
# https://github.com/rapidsai/cuml/issues/3540
FetchContent_Declare(
CCCL
GIT_REPOSITORY https://github.com/nvidia/cccl.git
GIT_TAG v2.7.0
GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(CCCL)
# Intel TBB for "Parallel STL" algorithms
# https://github.com/oneapi-src/oneTBB/tree/onetbb_2021
if(USE_INTEL_TBB)
FetchContent_Declare(
IntelTBB
GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
GIT_TAG master
)

# We need TBB for Parallel CPU Algorithms in GCC.
# https://github.com/oneapi-src/oneTBB/blob/onetbb_2021/cmake/README.md
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
GIT_TAG v2022.0.0
GIT_SHALLOW TRUE
)
set(TBB_TEST OFF CACHE BOOL "Do not build TBB tests" FORCE)
FetchContent_MakeAvailable(TBB)
# Suppress TBB's own tests:
set(TBB_TEST OFF CACHE BOOL "Do not build TBB tests" FORCE)
FetchContent_MakeAvailable(IntelTBB)

# ------------------------------------------------------------------------------
# TBB fix for -Wstringop-overflow warnings treated as errors
# ------------------------------------------------------------------------------
# The TBB library target is typically called "tbb". We can explicitly disable
# the `stringop-overflow` warning for TBB only:
if(TARGET tbb)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options(tbb PRIVATE -Wno-stringop-overflow)
endif()
endif()
endif()

# Nvidia's CUDA Core Compute Libraries for GPU acceleration
if(USE_NVIDIA_CCCL)
# CUB, Thrust, and other libraries of interest are now included into the
# CUDA Toolkit, so we don't need this anymore:
#
# FetchContent_Declare(NvidiaCCCL GIT_REPOSITORY https://github.com/nvidia/cccl.git)
# FetchContent_MakeAvailable(NvidiaCCCL)
find_package(CUDAToolkit REQUIRED)
message(STATUS "CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
message(STATUS "CUDA Toolkit Include Path: ${CUDAToolkit_INCLUDE_DIRS}")
message(STATUS "CUDA Toolkit Libraries Path: ${CUDAToolkit_LIBRARY_DIR}")
endif()

set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
@@ -72,20 +143,21 @@ set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")

set(CMAKE_GCC_FLAGS "${CMAKE_GCC_FLAGS} -march=native -fopenmp")

# Add CUDA and CCCL dependencies
find_package(Threads REQUIRED)

add_executable(reduce_bench reduce_bench.cpp)
target_link_libraries(reduce_bench benchmark::benchmark fmt::fmt Threads::Threads TBB::tbb)
target_link_libraries(reduce_bench benchmark::benchmark fmt::fmt Threads::Threads)

find_package(OpenMP)
if(USE_INTEL_TBB)
target_link_libraries(reduce_bench tbb)
endif()

if(USE_NVIDIA_CCCL)
target_link_libraries(reduce_bench CUDA::cudart CUDA::cublas)
endif()

if(OpenMP_FOUND)
target_link_libraries(reduce_bench OpenMP::OpenMP_CXX)
endif()

find_package(OpenCL)

if(OpenCL_FOUND)
target_link_libraries(reduce_bench OpenCL::OpenCL)
endif()
