Skip to content

Commit

Permalink
Build System: Only build one shared library for the user (#1243)
Browse files Browse the repository at this point in the history
This is the second phase of aligning the binary distribution with CUDA.
Currently, only libtorch_xpu.so is built on both Windows and Linux, but
there are still some differences between the Windows and Linux builds,
which we will fix after the SYCL compiler issue is resolved.

---------

Signed-off-by: Feng Yuan <[email protected]>
Co-authored-by: Feng Yuan <[email protected]>
  • Loading branch information
chunhuanMeng and fengyuan14 authored Feb 14, 2025
1 parent 3510f91 commit ac1466c
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 58 deletions.
58 changes: 35 additions & 23 deletions src/BuildOnLinux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@
set(TORCH_XPU_OPS_LIBRARIES)
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)

add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

if(USE_C10D_XCCL)
target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
endif()
macro(setup_common_libraries)
add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

if(USE_C10D_XCCL)
target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
endif()
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
endmacro()

if(BUILD_SEPARATE_OPS)
setup_common_libraries()
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
get_filename_component(name ${sycl_src} NAME_WLE REALPATH)
set(sycl_lib torch-xpu-ops-sycl-${name})
Expand All @@ -31,7 +35,10 @@ if(BUILD_SEPARATE_OPS)
# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endforeach()
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE LESS 20241211)
# When working with compilers that do not support device code compression, we have to split
# the kernels into multiple libraries to stay within the binary size limitation.
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205)
setup_common_libraries()
# Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others.
set(ATen_XPU_SYCL_UNARY_BINARY_SRCS)
set(ATen_XPU_SYCL_REDUCE_SRCS)
Expand Down Expand Up @@ -111,18 +118,23 @@ elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
else()
sycl_add_library(
torch_xpu_ops_sycl_kernels
SHARED
xpu_sycl
STATIC
CXX_SOURCES ${ATen_XPU_CPP_SRCS} ${ATen_XPU_MKL_SRCS} ${ATen_XPU_NATIVE_CPP_SRCS} ${ATen_XPU_GEN_SRCS} ${ATen_XPU_XCCL_SRCS}
SYCL_SOURCES ${ATen_XPU_SYCL_SRCS})
target_link_libraries(torch_xpu_ops PUBLIC torch_xpu_ops_sycl_kernels)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_sycl_kernels)

install(TARGETS torch_xpu_ops_sycl_kernels DESTINATION "${TORCH_INSTALL_LIB_DIR}")
add_library(torch_xpu_ops ALIAS xpu_sycl)
set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops)
set(SYCL_TARGET xpu_sycl)
if(USE_C10D_XCCL)
target_compile_definitions(xpu_sycl PRIVATE USE_C10D_XCCL)
target_link_libraries(xpu_sycl PUBLIC torch::xccl)
endif()

install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}")
list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl)
endif()
set(SYCL_LINK_LIBRARIES_KEYWORD)

list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)

foreach(lib ${TORCH_XPU_OPS_LIBRARIES})
# Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake
torch_compile_options(${lib})
Expand All @@ -140,4 +152,4 @@ if(USE_ONEMKL)
target_compile_options(torch_xpu_ops PRIVATE "-DUSE_ONEMKL")
target_include_directories(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR})
target_link_libraries(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_LIBRARIES})
endif()
endif()
136 changes: 102 additions & 34 deletions src/BuildOnWindows.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,29 @@
set(TORCH_XPU_OPS_LIBRARIES)
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)

add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS})
set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})

add_library(
torch_xpu_ops_aten
SHARED
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
macro(setup_common_libraries)
add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS})
set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})

add_library(
torch_xpu_ops_aten
SHARED
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
endmacro()

if(BUILD_SEPARATE_OPS)
setup_common_libraries()
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
get_filename_component(name ${sycl_src} NAME_WLE REALPATH)
set(sycl_lib torch-xpu-ops-sycl-${name})
Expand All @@ -36,7 +39,12 @@ if(BUILD_SEPARATE_OPS)
# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endforeach()
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE LESS 20241211)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
# When working with compilers that do not support device code compression, we have to split
# the kernels into multiple libraries to stay within the binary size limitation.
elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205)
setup_common_libraries()
# Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others.
set(ATen_XPU_SYCL_BINARY_SRCS)
set(ATen_XPU_SYCL_UNARY_SRCS)
Expand Down Expand Up @@ -230,27 +238,87 @@ elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250001 OR ICX_DATE

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
else()
# Internal file name is decided by the target name. On windows, torch_xpu_ops_sycl_kernels
# is too long in device code linkage command.
# On Windows, it is not possible to combine all obj files into one library
# because the obj files of kernels compiled on Windows are much larger than
# those on Linux. If they are combined into one, the library size will exceed
# 4GB, which conflicts with the size limit of a single library on Windows.
# We will combine the libraries on Windows into one after the compiler is fixed.
add_library(
torch_xpu_ops
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_MKL_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_compile_definitions(torch_xpu_ops PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
# Split SYCL kernels into 2 libraries as categories 1) Common (Unary+Binary+Reduce+Pow+Copy+Activation+Foreach) 2) Others.
set(ATen_XPU_SYCL_COMMON_SRCS)
set(ATen_XPU_SYCL_OTHERS_SRCS)
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
string(REGEX MATCH "Pow" IS_POW ${sycl_src})
string(REGEX MATCH "Copy" IS_COPY ${sycl_src})
string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})

if(NOT IS_FOREACH STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_REDUCE STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
elseif(NOT IS_ACTIVATION STREQUAL "")
list(APPEND ATen_XPU_SYCL_COMMON_SRCS ${sycl_src})
else()
list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
endif()
endforeach()
# Common kernel lib
set(sycl_common_lib torch_xpu_ops_sycl_common_kernels)
sycl_add_library(
xpu_sycl
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_SRCS})
target_compile_definitions(xpu_sycl PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC xpu_sycl)
target_link_libraries(xpu_sycl PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl)
${sycl_common_lib}
STATIC
SYCL_SOURCES ${ATen_XPU_SYCL_COMMON_SRCS})
target_compile_definitions(${sycl_common_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_common_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_common_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Other kernel lib
set(sycl_lib torch_xpu_ops_sycl_kernels)
sycl_add_library(
${sycl_lib}
STATIC
SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS})
target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib})

set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops_sycl_kernels)
# Decouple with PyTorch cmake definition.
install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}")
install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

target_link_libraries(torch_xpu_ops
PUBLIC
${sycl_common_lib}
${sycl_lib}
)
target_link_options(torch_xpu_ops PUBLIC
"-WHOLEARCHIVE:$<TARGET_FILE:${sycl_common_lib}>"
"-WHOLEARCHIVE:$<TARGET_FILE:${sycl_lib}>"
)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
endif()
set(SYCL_LINK_LIBRARIES_KEYWORD)

list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)

foreach(lib ${TORCH_XPU_OPS_LIBRARIES})
# Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake
torch_compile_options(${lib})
Expand All @@ -270,4 +338,4 @@ if(USE_ONEMKL)
target_compile_options(torch_xpu_ops PRIVATE "-DUSE_ONEMKL")
target_include_directories(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR})
target_link_libraries(torch_xpu_ops PUBLIC ${TORCH_XPU_OPS_ONEMKL_LIBRARIES})
endif()
endif()
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(ATen_XPU_MKL_SRCS)
set(ATen_XPU_NATIVE_CPP_SRCS)
set(ATen_XPU_SYCL_SRCS)
set(ATen_XPU_XCCL_SRCS)
set(SYCL_TARGET torch_xpu_ops)

set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory")

Expand All @@ -29,5 +30,5 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake)
if(CLANG_FORMAT)
file(GLOB_RECURSE ALL_CSRCS ${TORCH_XPU_OPS_ROOT}/**.[ch] ${TORCH_XPU_OPS_ROOT}/**.[ch]pp)
add_custom_target(CL_FORMAT_CSRCS COMMAND ${CLANG_FORMAT_EXEC} -i -style=file ${ALL_CSRCS})
add_dependencies(torch_xpu_ops CL_FORMAT_CSRCS)
add_dependencies(${SYCL_TARGET} CL_FORMAT_CSRCS)
endif()

0 comments on commit ac1466c

Please sign in to comment.