Skip to content

Commit

Permalink
CUDA variant optimised beyond k-caching, e.g., pipelined global-to-sh…
Browse files Browse the repository at this point in the history
…ared mem copies overlapped with compute
  • Loading branch information
MichaelSt98 committed Dec 11, 2024
1 parent 82fdf4b commit 99afc47
Show file tree
Hide file tree
Showing 5 changed files with 3,559 additions and 3 deletions.
65 changes: 62 additions & 3 deletions src/cloudsc_cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ if( HAVE_CLOUDSC_C_CUDA )
target_compile_options(dwarf-cloudsc-c-cuda-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>>)
else()
target_compile_options(dwarf-cloudsc-c-cuda-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--ptxas-options=-O3 -use_fast_math -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
--ptxas-options=-O3 -use_fast_math -lineinfo -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
endif()

set_target_properties( dwarf-cloudsc-c-cuda-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
Expand Down Expand Up @@ -112,7 +112,7 @@ if( HAVE_CLOUDSC_C_CUDA )
target_compile_options(dwarf-cloudsc-c-cuda-hoist-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>>)
else()
target_compile_options(dwarf-cloudsc-c-cuda-hoist-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--ptxas-options=-O3 -use_fast_math -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
--ptxas-options=-O3 -use_fast_math -lineinfo -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
endif()

set_target_properties( dwarf-cloudsc-c-cuda-hoist-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
Expand Down Expand Up @@ -170,7 +170,8 @@ if( HAVE_CLOUDSC_C_CUDA )
target_compile_options(dwarf-cloudsc-c-cuda-k-caching-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>>)
else()
target_compile_options(dwarf-cloudsc-c-cuda-k-caching-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--ptxas-options=-O3 -use_fast_math -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
-O3 -use_fast_math -lineinfo -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
# -O0 -g -G -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
endif()
set_target_properties( dwarf-cloudsc-c-cuda-k-caching-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

Expand All @@ -191,6 +192,64 @@ if( HAVE_CLOUDSC_C_CUDA )
)
###

###### SCC-CUDA-OPT ####
# Library target for the "opt" CUDA variant. All sources live in the
# cloudsc/ sub-directory and are listed explicitly as header/implementation
# pairs (yoecldp_c.h is header-only here).
ecbuild_add_library(
  TARGET            dwarf-cloudsc-c-cuda-opt-lib
  INSTALL_HEADERS   LISTED

  SOURCES
    cloudsc/yoecldp_c.h
    cloudsc/load_state.h
    cloudsc/load_state.cu
    cloudsc/cloudsc_c_opt.h
    cloudsc/cloudsc_c_opt.cu
    cloudsc/cloudsc_driver_opt.h
    cloudsc/cloudsc_driver_opt.cu
    cloudsc/cloudsc_validate.h
    cloudsc/cloudsc_validate.cu
    cloudsc/mycpu.h
    cloudsc/mycpu.cu

  # Consumers see the source-tree headers while building and the installed
  # "include" directory after installation.
  PUBLIC_INCLUDES
    $<INSTALL_INTERFACE:include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cloudsc>

  # Each optional dependency is only linked when its HAVE_* variable
  # expands to 1; the expressions rely on these variables holding 0 or 1.
  PUBLIC_LIBS
    $<${HAVE_HDF5}:hdf5::hdf5>
    $<${HAVE_SERIALBOX}:Serialbox::Serialbox_C>
    $<${HAVE_OMP}:OpenMP::OpenMP_C>

  DEFINITIONS
    ${CLOUDSC_DEFINITIONS}
)

# Expose the CUDA toolkit headers to the library and to everything linking it.
target_include_directories( dwarf-cloudsc-c-cuda-opt-lib
  PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)
# Extra nvcc flags for the CUDA sources of the "opt" library.
#
# The flags are only added when CMAKE_CUDA_ARCHITECTURES is defined, because
# the -gencode argument is built from it. Without it no extra options are
# added (the previous code passed a condition-only generator expression in
# that case, which carries no flags).
#
# The whole generator expression is passed as ONE quoted argument with
# ';'-separated options so that it cannot be split at whitespace.
#
# NOTE(review): the -gencode argument assumes CMAKE_CUDA_ARCHITECTURES holds a
# single plain architecture number (e.g. 80); a list or a suffixed value such
# as "80-real" would produce an invalid flag - confirm against the toolchains.
#
# For device-side debugging, replace "-O3;-use_fast_math;-lineinfo" with
# "-O0;-g;-G".
if( DEFINED CMAKE_CUDA_ARCHITECTURES )
  target_compile_options( dwarf-cloudsc-c-cuda-opt-lib PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-use_fast_math;-lineinfo;-maxrregcount=128;-gencode;arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>"
  )
endif()

# Compile device code as relocatable objects (separable compilation).
set_target_properties( dwarf-cloudsc-c-cuda-opt-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON )

# Driver executable for the "opt" CUDA variant, built from the shared
# dwarf_cloudsc.cpp entry point and linked against the library above.
ecbuild_add_executable(
  TARGET  dwarf-cloudsc-c-cuda-opt
  SOURCES dwarf_cloudsc.cpp
  LIBS    dwarf-cloudsc-c-cuda-opt-lib
)

# NOTE(review): this appears to repeat the LIBS entry above - confirm whether
# it is still required. It is deliberately kept in the plain (keyword-less)
# signature; presumably the ecbuild macro links LIBS the same way, and CMake
# does not allow mixing plain and keyword signatures on one target.
target_link_libraries( dwarf-cloudsc-c-cuda-opt dwarf-cloudsc-c-cuda-opt-lib )

# Serial smoke test: runs the installed-layout binary from three levels
# above this directory's build folder, so that the relative path
# bin/dwarf-cloudsc-c-cuda-opt resolves.
# NOTE(review): ARGS are presumably <threads> <columns> <block size> -
# confirm against the argument parsing in dwarf_cloudsc.cpp.
ecbuild_add_test(
  TARGET            dwarf-cloudsc-c-cuda-opt-serial
  COMMAND           bin/dwarf-cloudsc-c-cuda-opt
  ARGS              1 1000 128
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
  OMP               1
)
###

# Create symlink for the input data
if( HAVE_SERIALBOX )
execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink
Expand Down
Loading

0 comments on commit 99afc47

Please sign in to comment.