Skip to content

Commit

Permalink
Build System: Only build one shared library for the user (#1243)
Browse files Browse the repository at this point in the history
This is the second phase of aligning the binary distribution with CUDA.
Currently, only libtorch_xpu.so is built on both Windows and Linux, but
there are still some differences between the Windows and Linux builds,
which we will fix after the SYCL compiler issue is resolved.

---------

Signed-off-by: Feng Yuan <[email protected]>
Co-authored-by: Feng Yuan <[email protected]>
  • Loading branch information
chunhuanMeng and fengyuan14 authored Feb 14, 2025
1 parent 3510f91 commit ac1466c
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 58 deletions.
58 changes: 35 additions & 23 deletions src/BuildOnLinux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@
set(TORCH_XPU_OPS_LIBRARIES)
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)

add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

if(USE_C10D_XCCL)
target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
endif()
macro(setup_common_libraries)
add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

if(USE_C10D_XCCL)
target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
endif()
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
endmacro()

if(BUILD_SEPARATE_OPS)
setup_common_libraries()
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
get_filename_component(name ${sycl_src} NAME_WLE REALPATH)
set(sycl_lib torch-xpu-ops-sycl-${name})
Expand All @@ -31,7 +35,10 @@ if(BUILD_SEPARATE_OPS)
# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endforeach()
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE LESS 20241211)
# When working with compilers that do not support device code compression, we have to split
# the kernels into multiple libraries to stay within the binary size limitation.
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205)
setup_common_libraries()
# Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others.
set(ATen_XPU_SYCL_UNARY_BINARY_SRCS)
set(ATen_XPU_SYCL_REDUCE_SRCS)
Expand Down Expand Up @@ -111,18 +118,23 @@ elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
else()
sycl_add_library(
torch_xpu_ops_sycl_kernels
SHARED
xpu_sycl
STATIC
CXX_SOURCES ${ATen_XPU_CPP_SRCS} ${ATen_XPU_MKL_SRCS} ${ATen_XPU_NATIVE_CPP_SRCS} ${ATen_XPU_GEN_SRCS} ${ATen_XPU_XCCL_SRCS}
SYCL_SOURCES ${ATen_XPU_SYCL_SRCS})
target_link_libraries(torch_xpu_ops PUBLIC torch_xpu_ops_sycl_kernels)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_sycl_kernels)

install(TARGETS torch_xpu_ops_sycl_kernels DESTINATION "${TORCH_INSTALL_LIB_DIR}")
add_library(torch_xpu_ops ALIAS xpu_sycl)
set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops)
set(SYCL_TARGET xpu_sycl)
if(USE_C10D_XCCL)
target_compile_definitions(xpu_sycl PRIVATE USE_C10D_XCCL)
target_link_libraries(xpu_sycl PUBLIC torch::xccl)
endif()

install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}")
list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl)
endif()
set(SYCL_LINK_LIBRARIES_KEYWORD)

list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)

foreach(lib ${TORCH_XPU_OPS_LIBRARIES})
# Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake
torch_compile_options(${lib})
Expand All @@ -140,4 +152,4 @@ if(USE_ONEMKL)
target_compile_options(torch_xpu_ops PRIVATE "-DUSE_ONEMKL")
target_include_directories(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR})
target_link_libraries(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_LIBRARIES})
endif()
endif()
136 changes: 102 additions & 34 deletions src/BuildOnWindows.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,29 @@
set(TORCH_XPU_OPS_LIBRARIES)
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)

add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS})
set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})

add_library(
torch_xpu_ops_aten
SHARED
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
macro(setup_common_libraries)
add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS})
set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})

add_library(
torch_xpu_ops_aten
SHARED
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
endmacro()

if(BUILD_SEPARATE_OPS)
setup_common_libraries()
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
get_filename_component(name ${sycl_src} NAME_WLE REALPATH)
set(sycl_lib torch-xpu-ops-sycl-${name})
Expand All @@ -36,7 +39,12 @@ if(BUILD_SEPARATE_OPS)
# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endforeach()
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE LESS 20241211)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
# When working with compilers that do not support device code compression, we have to split
# the kernels into multiple libraries to stay within the binary size limitation.
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205)
setup_common_libraries()
# Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others.
set(ATen_XPU_SYCL_BINARY_SRCS)
set(ATen_XPU_SYCL_UNARY_SRCS)
Expand Down Expand Up @@ -230,27 +238,87 @@ elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
else()
# Internal file name is decided by the target name. On windows, torch_xpu_ops_sycl_kernels
# is too long in device code linkage command.
# On Windows, it is not possible to combine all obj files into one library
# because the obj files of kernels compiled on Windows are much larger than
# those on Linux. If they are combined into one, the library size will exceed
# 4GB, which conflicts with the size limit of a single library on Windows.
# We will combine the libraries on Windows into one after the compiler is fixed.
add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_compile_definitions(torch_xpu_ops PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
# Split SYCL kernels into 2 libraries as categories 1) Common (Unary+Binary+Reduce+Pow+Copy+Activation+Foreach) 2) Others.
set(ATen_XPU_SYCL_COMMON_SRCS)
set(ATen_XPU_SYCL_OTHERS_SRCS)
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
string(REGEX MATCH "Pow" IS_POW ${sycl_src})
string(REGEX MATCH "Copy" IS_COPY ${sycl_src})
string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})

if(NOT IS_FOREACH STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_REDUCE STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_ACTIVATION STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
else()
list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
endif()
endforeach()
# Common kernel lib
set(sycl_common_lib torch_xpu_ops_sycl_common_kernels)
sycl_add_library(
xpu_sycl
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_SRCS})
target_compile_definitions(xpu_sycl PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC xpu_sycl)
target_link_libraries(xpu_sycl PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl)
${sycl_common_lib}
STATIC
SYCL_SOURCES ${ATen_XPU_SYCL_COMMON_SRCS})
target_compile_definitions(${sycl_common_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_common_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_common_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Other kernel lib
set(sycl_lib torch_xpu_ops_sycl_kernels)
sycl_add_library(
${sycl_lib}
STATIC
SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS})
target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib})

set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops_sycl_kernels)
# Decouple with PyTorch cmake definition.
install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}")
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

target_link_libraries(torch_xpu_ops
PUBLIC
${sycl_common_lib}
${sycl_lib}
)
target_link_options(torch_xpu_ops PUBLIC
"-WHOLEARCHIVE:$<TARGET_FILE:${sycl_common_lib}>"
"-WHOLEARCHIVE:$<TARGET_FILE:${sycl_lib}>"
)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
endif()
set(SYCL_LINK_LIBRARIES_KEYWORD)

list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)

foreach(lib ${TORCH_XPU_OPS_LIBRARIES})
# Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake
torch_compile_options(${lib})
Expand All @@ -270,4 +338,4 @@ if(USE_ONEMKL)
target_compile_options(torch_xpu_ops PRIVATE "-DUSE_ONEMKL")
target_include_directories(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR})
target_link_libraries(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_LIBRARIES})
endif()
endif()
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(ATen_XPU_MKL_SRCS)
set(ATen_XPU_NATIVE_CPP_SRCS)
set(ATen_XPU_SYCL_SRCS)
set(ATen_XPU_XCCL_SRCS)
set(SYCL_TARGET torch_xpu_ops)

set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory")

Expand All @@ -29,5 +30,5 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake)
if(CLANG_FORMAT)
file(GLOB_RECURSE ALL_CSRCS ${TORCH_XPU_OPS_ROOT}/**.[ch] ${TORCH_XPU_OPS_ROOT}/**.[ch]pp)
add_custom_target(CL_FORMAT_CSRCS COMMAND ${CLANG_FORMAT_EXEC} -i -style=file ${ALL_CSRCS})
add_dependencies(torch_xpu_ops CL_FORMAT_CSRCS)
add_dependencies(${SYCL_TARGET} CL_FORMAT_CSRCS)
endif()

0 comments on commit ac1466c

Please sign in to comment.