diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake index 1bd595219..5488771db 100644 --- a/src/BuildOnLinux.cmake +++ b/src/BuildOnLinux.cmake @@ -3,21 +3,25 @@ set(TORCH_XPU_OPS_LIBRARIES) set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) -add_library( - torch_xpu_ops - STATIC - ${ATen_XPU_CPP_SRCS} - ${ATen_XPU_MKL_SRCS} - ${ATen_XPU_NATIVE_CPP_SRCS} - ${ATen_XPU_GEN_SRCS} - ${ATen_XPU_XCCL_SRCS}) - -if(USE_C10D_XCCL) - target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL) - target_link_libraries(torch_xpu_ops PUBLIC torch::xccl) -endif() +macro(setup_common_libraries) + add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + ${ATen_XPU_MKL_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS} + ${ATen_XPU_XCCL_SRCS}) + + if(USE_C10D_XCCL) + target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL) + target_link_libraries(torch_xpu_ops PUBLIC torch::xccl) + endif() + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) +endmacro() if(BUILD_SEPARATE_OPS) + setup_common_libraries() foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) get_filename_component(name ${sycl_src} NAME_WLE REALPATH) set(sycl_lib torch-xpu-ops-sycl-${name}) @@ -31,7 +35,10 @@ if(BUILD_SEPARATE_OPS) # Decouple with PyTorch cmake definition. install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") endforeach() -elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE LESS 20241211) +# Working with the compilers which don't support device code compression, we have to split kernels +# into multiple libraries to meet the bin size limitation. +elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205) + setup_common_libraries() # Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others. 
set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) set(ATen_XPU_SYCL_REDUCE_SRCS) @@ -111,18 +118,23 @@ elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") else() sycl_add_library( - torch_xpu_ops_sycl_kernels - SHARED + xpu_sycl + STATIC + CXX_SOURCES ${ATen_XPU_CPP_SRCS} ${ATen_XPU_MKL_SRCS} ${ATen_XPU_NATIVE_CPP_SRCS} ${ATen_XPU_GEN_SRCS} ${ATen_XPU_XCCL_SRCS} SYCL_SOURCES ${ATen_XPU_SYCL_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC torch_xpu_ops_sycl_kernels) - list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_sycl_kernels) - - install(TARGETS torch_xpu_ops_sycl_kernels DESTINATION "${TORCH_INSTALL_LIB_DIR}") + add_library(torch_xpu_ops ALIAS xpu_sycl) + set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops) + set(SYCL_TARGET xpu_sycl) + if(USE_C10D_XCCL) + target_compile_definitions(xpu_sycl PRIVATE USE_C10D_XCCL) + target_link_libraries(xpu_sycl PUBLIC torch::xccl) + endif() + + install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}") + list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl) endif() set(SYCL_LINK_LIBRARIES_KEYWORD) -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) - foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake torch_compile_options(${lib}) @@ -140,4 +152,4 @@ if(USE_ONEMKL) target_compile_options(torch_xpu_ops PRIVATE "-DUSE_ONEMKL") target_include_directories(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR}) target_link_libraries(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_LIBRARIES}) -endif() +endif() \ No newline at end of file diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake index 5bfd47e63..08ed30e84 100644 --- a/src/BuildOnWindows.cmake +++ b/src/BuildOnWindows.cmake @@ -3,26 +3,29 @@ set(TORCH_XPU_OPS_LIBRARIES) set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) -add_library( - torch_xpu_ops - STATIC - ${ATen_XPU_CPP_SRCS} - ${ATen_XPU_MKL_SRCS}) 
-set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\") -target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB}) - -add_library( - torch_xpu_ops_aten - SHARED - ${ATen_XPU_NATIVE_CPP_SRCS} - ${ATen_XPU_GEN_SRCS}) -install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}") -target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB) -target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu) -target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu) -target_link_libraries(torch_xpu_ops_aten PUBLIC c10) +macro(setup_common_libraries) + add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS}) + set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\") + target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB}) + + add_library( + torch_xpu_ops_aten + SHARED + ${ATen_XPU_MKL_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) + install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}") + target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu) + target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu) + target_link_libraries(torch_xpu_ops_aten PUBLIC c10) +endmacro() if(BUILD_SEPARATE_OPS) + setup_common_libraries() foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) get_filename_component(name ${sycl_src} NAME_WLE REALPATH) set(sycl_lib torch-xpu-ops-sycl-${name}) @@ -36,7 +39,12 @@ if(BUILD_SEPARATE_OPS) # Decouple with PyTorch cmake definition. 
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") endforeach() -elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE LESS 20241211) + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) +# Working with the compilers which don't support device code compression, we have to split kernels +# into multiple libraries to meet the bin size limitation. +elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205) + setup_common_libraries() # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. set(ATen_XPU_SYCL_BINARY_SRCS) set(ATen_XPU_SYCL_UNARY_SRCS) @@ -230,27 +238,87 @@ elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE # Decouple with PyTorch cmake definition. install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) else() - # Internal file name is decided by the target name. On windows, torch_xpu_ops_sycl_kernels - # is too long in device code linkage command. + # On Windows, it is not possible to combine all obj files into one library + # because the obj files of kernels compiled on Windows are much larger than + # those on Linux. If they are combined into one, the library size will exceed + # 4GB, which conflicts with the size limit of a single library on Windows. + # We will combine the libraries on Windows into one after the compiler is fixed. + add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + ${ATen_XPU_MKL_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) + install(TARGETS torch_xpu_ops DESTINATION "${TORCH_INSTALL_LIB_DIR}") + target_compile_definitions(torch_xpu_ops PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + # Split SYCL kernels into 2 libraries as categories 1) Common (Unary+Binary+Reduce+Pow+Copy+Activation+Foreach) 2) Others. 
+ set(ATen_XPU_SYCL_COMMON_SRCS) + set(ATen_XPU_SYCL_OTHERS_SRCS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src}) + string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src}) + string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src}) + + if(NOT IS_FOREACH STREQUAL "") + list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src}) + elseif(NOT IS_REDUCE STREQUAL "") + list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src}) + elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src}) + elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "") + list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src}) + elseif(NOT IS_ACTIVATION STREQUAL "") + list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src}) + endif() + endforeach() + # Common kernel lib + set(sycl_common_lib torch_xpu_ops_sycl_common_kernels) sycl_add_library( - xpu_sycl - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_SRCS}) - target_compile_definitions(xpu_sycl PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC xpu_sycl) - target_link_libraries(xpu_sycl PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl) + ${sycl_common_lib} + STATIC + SYCL_SOURCES ${ATen_XPU_SYCL_COMMON_SRCS}) + target_compile_definitions(${sycl_common_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_common_lib}) + + # Decouple with PyTorch cmake definition. 
+ install(TARGETS ${sycl_common_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Other kernel lib + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + STATIC + SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS}) + target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops_sycl_kernels) # Decouple with PyTorch cmake definition. - install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}") + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + target_link_libraries(torch_xpu_ops + PUBLIC + ${sycl_common_lib} + ${sycl_lib} + ) + target_link_options(torch_xpu_ops PUBLIC + "-WHOLEARCHIVE:$<TARGET_FILE:${sycl_common_lib}>" + "-WHOLEARCHIVE:$<TARGET_FILE:${sycl_lib}>" + ) + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) endif() set(SYCL_LINK_LIBRARIES_KEYWORD) -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) - foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake torch_compile_options(${lib}) @@ -270,4 +338,4 @@ if(USE_ONEMKL) target_compile_options(torch_xpu_ops PRIVATE "-DUSE_ONEMKL") target_include_directories(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR}) target_link_libraries(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_LIBRARIES}) -endif() +endif() \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af7e3f38e..f1767f1cd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ set(ATen_XPU_MKL_SRCS) set(ATen_XPU_NATIVE_CPP_SRCS) set(ATen_XPU_SYCL_SRCS) set(ATen_XPU_XCCL_SRCS) +set(SYCL_TARGET torch_xpu_ops) set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory") @@ -29,5 +30,5 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) if(CLANG_FORMAT) file(GLOB_RECURSE ALL_CSRCS ${TORCH_XPU_OPS_ROOT}/**.[ch]
${TORCH_XPU_OPS_ROOT}/**.[ch]pp) add_custom_target(CL_FORMAT_CSRCS COMMAND ${CLANG_FORMAT_EXEC} -i -style=file ${ALL_CSRCS}) - add_dependencies(torch_xpu_ops CL_FORMAT_CSRCS) + add_dependencies(${SYCL_TARGET} CL_FORMAT_CSRCS) endif()