Update CMake and use native CUDA language support (pytorch#62445)
Summary:
Pull Request resolved: pytorch#62445

PyTorch currently uses the old style of compiling CUDA in CMake, which is just a
bunch of scripts in `FindCUDA.cmake`. Newer CMake versions support CUDA natively
as a language, just like C or C++.
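
For context, the migration looks roughly like this. The sketch below uses made-up target and file names and is not code from this PR; it only contrasts the FindCUDA.cmake macros with enabling CUDA as a first-class project language (available since CMake 3.8; the launcher and response-file features used later in this PR need newer releases):

    # Old style: FindCUDA.cmake macros drive nvcc through CMake script wrappers.
    #   find_package(CUDA REQUIRED)
    #   list(APPEND CUDA_NVCC_FLAGS "-Xfatbin" "-compress-all")
    #   cuda_add_library(my_kernels kernels.cu)

    # Native style: CUDA is enabled as a language, .cu files go through the
    # same add_library/add_executable machinery as C++ sources, and nvcc
    # options live in CMAKE_CUDA_FLAGS.
    cmake_minimum_required(VERSION 3.18)
    project(my_kernels LANGUAGES CXX CUDA)
    string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
    add_library(my_kernels kernels.cu)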

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D31503350

fbshipit-source-id: 2ee817edc9698531ae1b87eda3ad271ee459fd55
malfet authored and facebook-github-bot committed Oct 11, 2021
1 parent d3b29af commit c373387
Showing 23 changed files with 264 additions and 161 deletions.
2 changes: 1 addition & 1 deletion .azure_pipelines/job_templates/prepare-build-template.yml
@@ -46,7 +46,7 @@ steps:
curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output .\tmp_bin\sccache.exe
curl -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output .\tmp_bin\sccache-cl.exe
copy .\tmp_bin\sccache.exe .\tmp_bin\nvcc.exe
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.3/randomtemp.exe --output .\tmp_bin\randomtemp.exe
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output .\tmp_bin\randomtemp.exe
displayName: Install sccache and randomtemp
condition: not(eq(variables.CUDA_VERSION, ''))
4 changes: 1 addition & 3 deletions .azure_pipelines/job_templates/set-environment-variables.yml
@@ -120,9 +120,7 @@ steps:
Write-Host "##vso[task.setvariable variable=CMAKE_LIBRARY_PATH;]$(Build.SourcesDirectory)\mkl\lib;$env:CMAKE_LIBRARY_PATH"
Write-Host "##vso[task.setvariable variable=ADDITIONAL_PATH;]$(Build.SourcesDirectory)\tmp_bin"
Write-Host "##vso[task.setvariable variable=SCCACHE_IDLE_TIMEOUT;]1500"
Write-Host "##vso[task.setvariable variable=RANDOMTEMP_EXECUTABLE;]$(Build.SourcesDirectory)\tmp_bin\nvcc.exe"
Write-Host "##vso[task.setvariable variable=CUDA_NVCC_EXECUTABLE;]$(Build.SourcesDirectory)\tmp_bin\randomtemp.exe"
Write-Host "##vso[task.setvariable variable=RANDOMTEMP_BASEDIR;]$(Build.SourcesDirectory)\tmp_bin"
Write-Host "##vso[task.setvariable variable=CMAKE_CUDA_COMPILER_LAUNCHER;]$(Build.SourcesDirectory)/tmp_bin/randomtemp.exe;$(Build.SourcesDirectory)/tmp_bin/sccache.exe"
displayName: Set MKL, sccache and randomtemp environment variables
# View current environment variables
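The variable that replaces the three removed ones, CMAKE_CUDA_COMPILER_LAUNCHER, is a semicolon-separated list of wrapper programs that CMake runs in front of the CUDA compiler for every object compilation, so the effective command becomes randomtemp.exe sccache.exe nvcc.exe <args>. Newer CMake releases also initialize it from an environment variable of the same name, which is what this CI step relies on. A minimal sketch of the same setting inside a CMakeLists.txt, with placeholder paths:

    # Sketch only; paths are placeholders. Each launcher is prepended, in
    # order, to every CUDA compile command for targets created afterwards.
    set(CMAKE_CUDA_COMPILER_LAUNCHER
        "C:/tmp_bin/randomtemp.exe;C:/tmp_bin/sccache.exe")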
3 changes: 2 additions & 1 deletion .circleci/docker/ubuntu-cuda/Dockerfile
@@ -75,7 +75,7 @@ RUN rm install_cmake.sh
ADD ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
ENV CUDA_NVCC_EXECUTABLE=/opt/cache/lib/nvcc
ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache

# Add jni.h for java host build
ADD ./common/install_jni.sh install_jni.sh
@@ -94,6 +94,7 @@ ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
ENV CUDA_PATH /usr/local/cuda

# Install LLVM dev version (Defined in the pytorch/builder github repository)
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
2 changes: 1 addition & 1 deletion .github/templates/windows_ci_workflow.yml.j2
@@ -55,8 +55,8 @@ env:
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
{%- if cuda_version != "cpu" %}
TORCH_CUDA_ARCH_LIST: "7.0"
USE_CUDA: 1
{%- endif %}
USE_CUDA: !{{ 1 if cuda_version != "cpu" else 0 }}

!{{ common.concurrency(build_environment) }}

1 change: 1 addition & 0 deletions .github/workflows/generated-win-vs2019-cpu-py3.yml

Diff not shown: generated files are not rendered by default.

6 changes: 4 additions & 2 deletions .jenkins/caffe2/build.sh
@@ -29,7 +29,8 @@ if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then
ln -sf "$(which ccache)" ./ccache/g++
ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc
if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then
ln -sf "$(which ccache)" ./ccache/nvcc
mkdir -p ./ccache/cuda
ln -sf "$(which ccache)" ./ccache/cuda/nvcc
fi
export CACHE_WRAPPER_DIR="$PWD/ccache"
export PATH="$CACHE_WRAPPER_DIR:$PATH"
@@ -93,7 +94,8 @@ if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then

# Explicitly set path to NVCC such that the symlink to ccache or sccache is used
if [ -n "${CACHE_WRAPPER_DIR}" ]; then
build_args+=("CUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc")
build_args+=("CUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/cuda/nvcc")
build_args+=("CMAKE_CUDA_COMPILER_LAUNCHER=${CACHE_WRAPPER_DIR}/ccache")
fi

# Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.
19 changes: 8 additions & 11 deletions .jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -97,23 +97,20 @@ set CXX=sccache-cl
set CMAKE_GENERATOR=Ninja

if "%USE_CUDA%"=="1" (
copy %TMP_DIR_WIN%\bin\sccache.exe %TMP_DIR_WIN%\bin\nvcc.exe

:: randomtemp is used to resolve the intermittent build error related to CUDA.
:: code: https://github.com/peterjc123/randomtemp-rust
:: issue: https://github.com/pytorch/pytorch/issues/25393
::
:: Previously, CMake uses CUDA_NVCC_EXECUTABLE for finding nvcc and then
:: the calls are redirected to sccache. sccache looks for the actual nvcc
:: in PATH, and then pass the arguments to it.
:: Currently, randomtemp is placed before sccache (%TMP_DIR_WIN%\bin\nvcc)
:: so we are actually pretending sccache instead of nvcc itself.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.3/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
:: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
:: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
set RANDOMTEMP_EXECUTABLE=%TMP_DIR_WIN%\bin\nvcc.exe
set CUDA_NVCC_EXECUTABLE=%TMP_DIR_WIN%\bin\randomtemp.exe
set RANDOMTEMP_BASEDIR=%TMP_DIR_WIN%\bin
echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
cat %TMP_DIR%/bin/nvcc.bat
set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
for /F "usebackq delims=" %%n in (`cygpath -m "%CUDA_PATH%\bin\nvcc.exe"`) do set CMAKE_CUDA_COMPILER=%%n
set CMAKE_CUDA_COMPILER_LAUNCHER=%TMP_DIR%/bin/randomtemp.exe;%TMP_DIR%\bin\sccache.exe
)

@echo off
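Note how the rewritten block splits responsibilities: CMAKE_CUDA_COMPILER points at the real nvcc.exe so that CMake's native compiler detection still probes a genuine compiler, CMAKE_CUDA_COMPILER_LAUNCHER carries the randomtemp and sccache wrappers, and the generated nvcc.bat is kept only for the legacy CUDA_NVCC_EXECUTABLE/FindCUDA path. A minimal configure-time sketch with placeholder paths (in practice these are supplied before the CUDA language is enabled, e.g. via -D options or, as above, environment variables):

    # Placeholder paths; not the actual CI values.
    set(CMAKE_CUDA_COMPILER "C:/CUDA/v11.1/bin/nvcc.exe"
        CACHE FILEPATH "Real nvcc, used for compiler detection and compilation")
    set(CMAKE_CUDA_COMPILER_LAUNCHER
        "C:/tmp_bin/randomtemp.exe;C:/tmp_bin/sccache.exe"
        CACHE STRING "Wrappers run in front of nvcc")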
20 changes: 14 additions & 6 deletions CMakeLists.txt
@@ -517,16 +517,14 @@ if(MSVC)
endforeach(flag_var)

# Try harder
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "/w" "-w")
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /w -w")
endif(MSVC)

list(APPEND CUDA_NVCC_FLAGS "-Xfatbin" "-compress-all")
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-Xfatbin" "-compress-all")
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO "-Xfatbin" "-compress-all")
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

if(NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g" "-lineinfo" "--source-in-ptx")
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO "-g" "-lineinfo" "--source-in-ptx")
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -lineinfo --source-in-ptx")
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -g -lineinfo --source-in-ptx")
endif(NOT MSVC)

# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
@@ -667,6 +665,16 @@ endif()

include(cmake/Dependencies.cmake)

if((CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.2) AND (CMAKE_HOST_SYSTEM_NAME MATCHES "Windows"))
# CUDA < 10.2 doesn't support compiling and extracting header dependencies in
# one call, so instead CMake calls nvcc twice with && in between.
# However, on Windows cmd.exe has an 8191 character limit for commands, which we
# start hitting. This moves most arguments into a response file to avoid going over the limit.

set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS ON)
set(CMAKE_NINJA_FORCE_RESPONSE_FILE ON CACHE INTERNAL "")
endif()

if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()
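Two patterns from this file recur across the PR: nvcc options move from the FindCUDA list variables into the native per-language flag strings (including the per-configuration variants), and overly long Windows command lines are pushed into response files. A condensed sketch of the flag migration, restating the hunks above:

    # Before (FindCUDA): flags are CMake lists consumed by the cuda_* macros.
    #   list(APPEND CUDA_NVCC_FLAGS "-Xfatbin" "-compress-all")
    #   list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g" "-lineinfo" "--source-in-ptx")
    # After (native CUDA): flags are space-separated strings, mirroring
    # CMAKE_CXX_FLAGS and its per-configuration variants.
    string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
    string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -lineinfo --source-in-ptx")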
6 changes: 0 additions & 6 deletions aten/CMakeLists.txt
@@ -69,12 +69,6 @@ if(USE_CUDA AND USE_ROCM)
message(FATAL_ERROR "Both CUDA and ROCm are enabled and found. PyTorch can only be built with either of them. Please turn one off by using either USE_CUDA=OFF or USE_ROCM=OFF.")
endif()

if(MSVC)
# we want to respect the standard, and we are bored of those **** .
add_definitions(-D_CRT_SECURE_NO_DEPRECATE=1)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "/wd4819" "-Xcompiler" "/wd4503" "-Xcompiler" "/wd4190" "-Xcompiler" "/wd4244" "-Xcompiler" "/wd4251" "-Xcompiler" "/wd4275" "-Xcompiler" "/wd4522")
endif(MSVC)

if(USE_ROCM)
# TODO: AT_HIP_ENABLED (change this once we represent HIP as HIP in
# ATen proper)
4 changes: 1 addition & 3 deletions c10/cuda/CMakeLists.txt
@@ -49,9 +49,7 @@ if(${COMPILER_SUPPORTS_HIDDEN_VISIBILITY})
endif()

# ---[ Dependency of c10_cuda
target_link_libraries(c10_cuda PUBLIC c10)

target_link_libraries(c10_cuda INTERFACE torch::cudart)
target_link_libraries(c10_cuda PUBLIC c10 torch::cudart)

target_include_directories(
c10_cuda PUBLIC
23 changes: 11 additions & 12 deletions caffe2/CMakeLists.txt
@@ -895,19 +895,18 @@ elseif(USE_CUDA)
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
if(CUDA_SEPARABLE_COMPILATION)
# Separate compilation fails when kernels using `thrust::sort_by_key`
# are linked with the rest of CUDA code. Workaround by linking them separately
set(_generated_name "torch_cuda_w_sort_by_key_intermediate_link${CMAKE_C_OUTPUT_EXTENSION}")
set(torch_cuda_w_sort_by_key_link_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/torch_cuda.dir/${CMAKE_CFG_INTDIR}/${_generated_name}")
cuda_wrap_srcs(torch_cuda OBJ Caffe2_GPU_W_SORT_BY_KEY_OBJ ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${torch_cuda_w_sort_by_key_link_file}" torch_cpu "${_options}" "${torch_cuda_SEPARABLE_COMPILATION_OBJECTS}")
set( torch_cuda_SEPARABLE_COMPILATION_OBJECTS )
# Pass compiled sort-by-key object + device-linked fatbin as extra dependencies of torch_cuda
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${torch_cuda_w_sort_by_key_link_file} ${Caffe2_GPU_W_SORT_BY_KEY_OBJ})
# are linked with the rest of CUDA code. Workaround by linking them separately.
add_library(torch_cuda ${Caffe2_GPU_SRCS})
set_property(TARGET torch_cuda PROPERTY CUDA_SEPARABLE_COMPILATION ON)

add_library(torch_cuda_w_sort_by_key OBJECT ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key)
elseif(BUILD_SPLIT_CUDA)
cuda_add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS_CPP} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP})
cuda_add_library(torch_cuda_cu ${Caffe2_GPU_SRCS_CU} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU})
add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS_CPP} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP})
add_library(torch_cuda_cu ${Caffe2_GPU_SRCS_CU} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU})
else()
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
endif()
set(CUDA_LINK_LIBRARIES_KEYWORD)
if(BUILD_SPLIT_CUDA)
@@ -1803,7 +1802,7 @@ if(BUILD_TEST)
if(USE_CUDA)
foreach(test_src ${Caffe2_GPU_TEST_SRCS})
get_filename_component(test_name ${test_src} NAME_WE)
cuda_add_executable(${test_name} "${test_src}")
add_executable(${test_name} "${test_src}")
target_link_libraries(${test_name} torch_library gtest_main)
target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})
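With the native language enabled, the cuda_add_library/cuda_add_executable macros are no longer needed: plain add_library/add_executable compile .cu sources directly, and separable compilation becomes a per-target property. A minimal sketch of the sort-by-key workaround above, using placeholder target and variable names:

    # Placeholder names; the real targets are torch_cuda and the sort-by-key
    # object library from the hunk above.
    add_library(gpu_lib ${GPU_SRCS})
    set_property(TARGET gpu_lib PROPERTY CUDA_SEPARABLE_COMPILATION ON)

    # Sources that must not be device-linked with the rest are built as an
    # OBJECT library with separable compilation off, then linked in.
    add_library(gpu_lib_no_rdc OBJECT ${GPU_SRCS_W_SORT_BY_KEY})
    set_property(TARGET gpu_lib_no_rdc PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
    target_link_libraries(gpu_lib PRIVATE gpu_lib_no_rdc)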
(Diffs for the remaining changed files are not shown.)
