Commit

Merge branch 'main' into mengfeil/triton
mengfei25 committed Feb 19, 2025
2 parents ac79abb + e4ce4df commit 90d6998
Showing 34 changed files with 3,351 additions and 407 deletions.
@@ -27,8 +27,7 @@ hrnet_w18,pass,pass,pass,pass,pass
 inception_v3,pass,pass,pass,pass,pass
 jx_nest_base,pass,pass,pass,pass,pass
 lcnet_050,pass,pass,pass,pass,pass
-# https://github.com/pytorch/pytorch/pull/145112
-levit_128,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+levit_128,pass,pass,pass,pass,pass
 mixer_b16_224,pass,pass,pass,pass,pass
 mixnet_l,pass,pass,pass,pass,pass
 mnasnet_100,pass,pass,pass,pass,pass
@@ -30,8 +30,7 @@ hrnet_w18,pass,pass,pass,pass,pass
 inception_v3,pass,pass,pass,pass,pass
 jx_nest_base,pass,pass,pass,pass,pass
 lcnet_050,pass,pass,pass,pass,pass
-# https://github.com/pytorch/pytorch/pull/145112
-levit_128,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+levit_128,pass,pass,pass,pass,pass
 mixer_b16_224,pass,pass,pass,pass,pass
 mixnet_l,pass,pass,pass,pass,pass
 mnasnet_100,pass,pass,pass,pass,pass
12 changes: 4 additions & 8 deletions .github/ci_expected_accuracy/inductor_torchbench_inference.csv
@@ -4,8 +4,7 @@ torchrec_dlrm,pass,eager_fail_to_run,eager_fail_to_run,fail_to_run,fail_to_run
 BERT_pytorch,pass,pass,pass,pass,pass
 Background_Matting,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 DALLE2_pytorch,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
-# https://github.com/intel/torch-xpu-ops/issues/1263
-LearningToPaint,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+LearningToPaint,pass,pass,pass,pass,pass
 Super_SloMo,pass,pass,pass,pass,pass
 alexnet,pass,pass,pass,pass,pass
 basic_gnn_edgecnn,pass,pass,pass,pass,pass
@@ -45,8 +44,7 @@ hf_DistilBert,pass,pass,pass,pass,pass
 hf_GPT2,pass,pass,pass,pass,pass
 hf_GPT2_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 hf_Longformer,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1262
-hf_Reformer,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ
+hf_Reformer,pass,pass,pass,pass,pass
 hf_T5,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/1276
 hf_T5_base,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
@@ -70,8 +68,7 @@ mobilenet_v3_large,pass,pass,pass,pass,pass
 moco,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 moondream,pass,pass,pass,pass,pass
 nanogpt,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1260
-nvidia_deeprecommender,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+nvidia_deeprecommender,pass,pass,pass,pass,pass
 opacus_cifar10,pass,pass,pass,pass,pass
 phlippe_densenet,pass,pass,pass,pass,pass
 phlippe_resnet,pass,pass,pass,pass,pass
@@ -91,8 +88,7 @@ sam_fast,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 shufflenet_v2_x1_0,pass,pass,pass,pass,pass
 simple_gpt,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load
 simple_gpt_tp_manual,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load
-# https://github.com/intel/torch-xpu-ops/issues/1273
-soft_actor_critic,pass,fail_accuracy,pass,pass,pass
+soft_actor_critic,pass,pass,pass,pass,pass
 speech_transformer,pass,pass,pass,pass,pass
 squeezenet1_1,pass,fail_accuracy,pass,pass,pass
 stable_diffusion_text_encoder,pass,pass,pass,pass,pass
13 changes: 5 additions & 8 deletions .github/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -3,10 +3,9 @@ torchrec_dlrm,pass,eager_fail_to_run,eager_fail_to_run,pass,pass
 BERT_pytorch,pass,pass,pass,pass,pass
 Background_Matting,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 DALLE2_pytorch,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
-# https://github.com/intel/torch-xpu-ops/issues/1263
-LearningToPaint,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+LearningToPaint,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/1256
-Super_SloMo,eager_two_runs_differ,pass,pass,eager_two_runs_differ,pass
+Super_SloMo,eager_two_runs_differ,pass,pass,pass,pass
 alexnet,pass,pass,pass,pass,pass
 basic_gnn_edgecnn,pass,pass,pass,pass,pass
 basic_gnn_gcn,pass,pass,pass,pass,pass
@@ -46,8 +45,7 @@ hf_DistilBert,pass,pass,pass,pass,pass
 hf_GPT2,pass,pass,pass,pass,pass
 hf_GPT2_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 hf_Longformer,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1262
-hf_Reformer,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ
+hf_Reformer,pass,pass,pass,pass,pass
 hf_T5,pass,pass,pass,pass,pass
 hf_T5_base,pass,pass,pass,pass,pass
 hf_T5_generate,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
@@ -69,8 +67,7 @@ mobilenet_v3_large,pass,pass,pass,pass,pass
 moco,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 moondream,pass,pass,pass,pass,pass
 nanogpt,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1260
-nvidia_deeprecommender,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+nvidia_deeprecommender,pass,pass,pass,pass,pass
 opacus_cifar10,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 phlippe_densenet,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/509
@@ -112,5 +109,5 @@ tts_angular,pass,pass,pass,pass,pass
 vgg16,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/1264
 vision_maskrcnn,eager_fail_to_run,pass,pass,eager_fail_to_run,eager_fail_to_run
-yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+yolov3,pass,pass,pass,pass,pass
 hf_Roberta_base,pass,pass,pass,pass,pass
2 changes: 2 additions & 0 deletions .github/scripts/apply_torch_pr.py
@@ -13,6 +13,8 @@
     "https://github.com/pytorch/pytorch/pull/126516",
     # Modify the tolerance level in TIMM benchmark
     "https://github.com/pytorch/pytorch/pull/143739",
+    # Allow XPU device for validating the arguments to sparse compressed tensor factory functions
+    "https://github.com/pytorch/pytorch/pull/147306",
 ]
 )
 parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
12 changes: 5 additions & 7 deletions .github/scripts/lintrunner.sh
@@ -28,9 +28,11 @@ lintrunner init 2> /dev/null
 
 # Do build steps necessary for linters
 if [[ "${CLANG}" == "1" ]]; then
-  pushd ../../
-  cp -rf ./torchgen/packaged/ATen/templates third_party/torch-xpu-ops/yaml/templates
-  python3 third_party/torch-xpu-ops/tools/linter/clang_tidy/generate_build_files.py
+  if [[ -e "third_party/torch-xpu-ops/tools/linter/clang_tidy/generate_build_files.py" ]];then
+    python3 third_party/torch-xpu-ops/tools/linter/clang_tidy/generate_build_files.py
+  else
+    echo "Please run the checker under pytorch source code folder"
+  fi
 fi
 #python3 -m tools.generate_torch_version --is_debug=false
 #python3 -m tools.pyi.gen_pyi \
@@ -49,10 +51,6 @@ if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS}
   RC=1
 fi
 
-if [[ "${CLANG}" == "1" ]]; then
-  popd
-fi
-
 # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
 jq --raw-output \
   '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
11 changes: 10 additions & 1 deletion .github/workflows/pull.yml
@@ -24,14 +24,23 @@ jobs:
     # Don't run on forked repos and draft PRs
     if: ${{ github.repository_owner == 'intel' }}
     name: preci-lint-check
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
       - name: Run lint check
         run: |
           export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MERGE_CONFLICTLESS_CSV --all-files"
           bash .github/scripts/lintrunner.sh
+      - name: Run lint check with Clang
+        run: |
+          sudo apt update -y && sudo apt install -y libomp-dev
+          cd ../ && rm -rf pytorch
+          git clone https://github.com/pytorch/pytorch pytorch
+          cd pytorch && cp -r ../torch-xpu-ops third_party/
+          export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT build/xpu/**/*.* build/xpu/*.* third_party/torch-xpu-ops/src/*.* third_party/torch-xpu-ops/src/**/*.* third_party/torch-xpu-ops/src/**/**/*.* third_party/torch-xpu-ops/src/**/**/**/*.*"
+          export CLANG=1
+          bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh
 
   preci-linux-build:
     # Don't run on forked repos and draft PRs
17 changes: 14 additions & 3 deletions cmake/BuildFlags.cmake
@@ -122,12 +122,23 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-fp32-correctly-rounded-divide-sqrt")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
+  # AOT default targets.
+  # The motivation of AOT default targets here is to provide a minimized set and
+  # enable an efficient building for developers in build from source scenarios.
+  # It doesn't tell what all we support, but targets are the most common
+  # platforms for PyTorch XPU development, which ensures we can meet the
+  # requirement of development at the current stage. TODO: We will support
+  # auto-check on the native platform and Intel SYCL SDK to decide a proper AOT
+  # target for developers automatically.
+  # PyTorch CI/CD and special development requirements are treated as custom
+  # scenarios, where customers are recommended to config custom AOT targets with
+  # `TORCH_XPU_ARCH_LIST`.
   if(WIN32)
-    set(AOT_TARGETS "dg2-g10,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
+    set(AOT_TARGETS "dg2,bmg")
   else()
-    set(AOT_TARGETS "pvc,xe-lpg,dg2-g10")
+    set(AOT_TARGETS "pvc")
   endif()
-  if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
+  if(DEFINED ENV{TORCH_XPU_ARCH_LIST})
     set(AOT_TARGETS "$ENV{TORCH_XPU_ARCH_LIST}")
   endif()
   set(TORCH_XPU_ARCH_LIST ${AOT_TARGETS} PARENT_SCOPE)
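As a usage note: the block above reads TORCH_XPU_ARCH_LIST from the environment to override the per-platform defaults ("dg2,bmg" on Windows, "pvc" elsewhere). A minimal sketch of driving that from a build script follows; the target list and build command are illustrative assumptions, not part of this commit.

# Sketch: build PyTorch XPU from source with a custom AOT target list.
# TORCH_XPU_ARCH_LIST is consumed by cmake/BuildFlags.cmake above; the
# "pvc,dg2" value and the setup.py invocation are hypothetical examples.
import os
import subprocess

env = dict(os.environ, TORCH_XPU_ARCH_LIST="pvc,dg2")
subprocess.run(["python", "setup.py", "develop"], env=env, check=True)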
8 changes: 6 additions & 2 deletions cmake/Codegen.cmake
@@ -8,6 +8,7 @@ file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
+set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
 set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 
@@ -48,6 +49,7 @@ endfunction(GEN_BACKEND)
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
+set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
 set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
@@ -79,7 +81,7 @@ function(GEN_XPU file_yaml)
     --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
     --per-operator-headers
     --static-dispatch-backend
-    --backend-whitelist XPU SparseXPU NestedTensorXPU
+    --backend-whitelist XPU SparseXPU SparseCsrXPU NestedTensorXPU
     # --xpu: generate in-tree RegisterXPU_0.cpp for in-tree OPs
     --xpu
     # --update-aoti-c-shim: generate extend/c_shim_xpu.h
@@ -95,6 +97,7 @@
     # Codegen post-process
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
     ${SIMPLE_TRACE}
     WORKING_DIRECTORY ${TORCH_ROOT}
@@ -125,6 +128,7 @@ GEN_XPU(
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions.h
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp
+  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
@@ -137,7 +141,7 @@
 # $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
 list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
 
-list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH})
+list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterSparseCsrXPU_PATH} ${RegisterNestedTensorXPU_PATH})
 list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
 add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
 set(ATen_XPU_GEN_SRCS ${xpu_generated_src})
23 changes: 0 additions & 23 deletions src/ATen/native/sparse/SparseCsrTensor.cpp

This file was deleted.

46 changes: 46 additions & 0 deletions src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
@@ -0,0 +1,46 @@
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>
#include <xpu/ATen/ops/_convert_indices_from_coo_to_csr_native.h>
#include <xpu/ATen/ops/_convert_indices_from_csr_to_coo_native.h>

namespace at::native {

using namespace at::sparse;

TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_xpu)
(const Tensor& input,
const int64_t size,
const bool out_int32,
const Tensor& result) {
xpu::convert_indices_from_coo_to_csr_structured_kernel(
input, size, out_int32, result);
};

TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_xpu)
(const Tensor& crow_indices,
const Tensor& col_indices,
const bool out_int32,
const bool transpose,
const Tensor& result) {
xpu::convert_indices_from_csr_to_coo_structured_kernel(
crow_indices, col_indices, out_int32, transpose, result);
};

Tensor _sparse_csr_sum_xpu(
const Tensor& input,
IntArrayRef dims_to_sum,
bool keepdim,
std::optional<ScalarType> dtype) {
return xpu::_sparse_csr_sum_xpu_kernel(input, dims_to_sum, keepdim, dtype);
}

Tensor _sparse_csr_prod_xpu(
const Tensor& input,
IntArrayRef dims_to_reduce,
bool keepdim,
std::optional<ScalarType> dtype) {
return xpu::_sparse_csr_prod_xpu_kernel(
input, dims_to_reduce, keepdim, dtype);
}

} // namespace at::native
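For context, a rough sketch of how these registrations surface through the public API; it assumes an XPU-enabled PyTorch build with a visible XPU device, and the exact dispatch path may vary by version.

# Sketch: exercising the CSR index-conversion and reduction kernels above.
# Assumes torch.xpu.is_available() is True; illustrative, not from the commit.
import torch

dense = torch.tensor([[0.0, 2.0, 0.0],
                      [3.0, 0.0, 4.0]], device="xpu")
csr = dense.to_sparse_csr()        # CSR construction exercises the COO->CSR index path
row_sums = torch.sum(csr, dim=1)   # dim reductions are backed by _sparse_csr_sum_xpu
print(row_sums)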
39 changes: 39 additions & 0 deletions src/ATen/native/sparse/xpu/SparseSoftmax.cpp
@@ -0,0 +1,39 @@
#include <ATen/native/sparse/ParamUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/xpu/sycl/SparseSoftmaxKernels.h>

namespace at::native {

using namespace at::sparse;

Tensor softmax_sparse_xpu(
const Tensor& input_,
const int64_t dim_,
const bool half_to_float) {
return xpu::softmax_sparse_xpu_kernel(input_, dim_, half_to_float);
}

Tensor log_softmax_sparse_xpu(
const Tensor& input_,
const int64_t dim_,
const bool half_to_float) {
return xpu::log_softmax_sparse_xpu_kernel(input_, dim_, half_to_float);
}

Tensor softmax_backward_sparse_xpu(
const Tensor& grad_,
const Tensor& output_,
int64_t dim_,
const Tensor& input_) {
return xpu::softmax_backward_sparse_xpu_kernel(grad_, output_, dim_, input_);
}

Tensor log_softmax_backward_sparse_xpu(
const Tensor& grad_,
const Tensor& output_,
int64_t dim_,
const Tensor& input_) {
return xpu::log_softmax_backward_sparse_xpu_kernel(
grad_, output_, dim_, input_);
}
} // namespace at::native
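A brief usage sketch (XPU-enabled build and device assumed; illustrative only): these entry points implement torch.sparse.softmax and torch.sparse.log_softmax for sparse COO inputs on XPU.

# Sketch: sparse softmax over a COO tensor on an XPU device.
import torch

indices = torch.tensor([[0, 0, 1], [0, 2, 1]])
values = torch.tensor([1.0, 2.0, 3.0])
s = torch.sparse_coo_tensor(indices, values, (2, 3), device="xpu").coalesce()

out = torch.sparse.softmax(s, dim=1)          # -> softmax_sparse_xpu
log_out = torch.sparse.log_softmax(s, dim=1)  # -> log_softmax_sparse_xpu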
7 changes: 7 additions & 0 deletions src/ATen/native/sparse/xpu/SparseTensorMath.cpp
@@ -19,4 +19,11 @@ SparseTensor& mul_out_sparse_xpu(
   return xpu::mul_sparse_kernel(t_, src_, r_);
 }
 
+Tensor _sparse_sum_backward_xpu(
+    const Tensor& grad_,
+    const SparseTensor& input_,
+    IntArrayRef dims_to_sum) {
+  return xpu::_sparse_sum_backward_kernel(grad_, input_, dims_to_sum);
+}
+
 } // namespace at::native
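A short autograd sketch (XPU device assumed; illustrative, not from the commit): torch.sparse.sum over given dims has a backward pass that lands in the new _sparse_sum_backward_xpu entry point.

# Sketch: forward torch.sparse.sum and the backward pass that
# _sparse_sum_backward_xpu routes to the XPU kernel.
import torch

indices = torch.tensor([[0, 1], [1, 0]])
values = torch.tensor([2.0, 5.0], requires_grad=True)
s = torch.sparse_coo_tensor(indices, values, (2, 2), device="xpu")

out = torch.sparse.sum(s, dim=[0])   # sparse reduction over dim 0
out.to_dense().sum().backward()      # backward -> _sparse_sum_backward_kernel
print(values.grad)                   # gradients flow to the leaf values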