Commit

Merge branch 'main' into mengfeil/triton
mengfei25 committed Feb 19, 2025
2 parents ac79abb + e4ce4df commit 90d6998
Showing 34 changed files with 3,351 additions and 407 deletions.
@@ -27,8 +27,7 @@ hrnet_w18,pass,pass,pass,pass,pass
 inception_v3,pass,pass,pass,pass,pass
 jx_nest_base,pass,pass,pass,pass,pass
 lcnet_050,pass,pass,pass,pass,pass
-# https://github.com/pytorch/pytorch/pull/145112
-levit_128,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+levit_128,pass,pass,pass,pass,pass
 mixer_b16_224,pass,pass,pass,pass,pass
 mixnet_l,pass,pass,pass,pass,pass
 mnasnet_100,pass,pass,pass,pass,pass
@@ -30,8 +30,7 @@ hrnet_w18,pass,pass,pass,pass,pass
 inception_v3,pass,pass,pass,pass,pass
 jx_nest_base,pass,pass,pass,pass,pass
 lcnet_050,pass,pass,pass,pass,pass
-# https://github.com/pytorch/pytorch/pull/145112
-levit_128,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+levit_128,pass,pass,pass,pass,pass
 mixer_b16_224,pass,pass,pass,pass,pass
 mixnet_l,pass,pass,pass,pass,pass
 mnasnet_100,pass,pass,pass,pass,pass
12 changes: 4 additions & 8 deletions .github/ci_expected_accuracy/inductor_torchbench_inference.csv
@@ -4,8 +4,7 @@ torchrec_dlrm,pass,eager_fail_to_run,eager_fail_to_run,fail_to_run,fail_to_run
 BERT_pytorch,pass,pass,pass,pass,pass
 Background_Matting,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 DALLE2_pytorch,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
-# https://github.com/intel/torch-xpu-ops/issues/1263
-LearningToPaint,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+LearningToPaint,pass,pass,pass,pass,pass
 Super_SloMo,pass,pass,pass,pass,pass
 alexnet,pass,pass,pass,pass,pass
 basic_gnn_edgecnn,pass,pass,pass,pass,pass
@@ -45,8 +44,7 @@ hf_DistilBert,pass,pass,pass,pass,pass
 hf_GPT2,pass,pass,pass,pass,pass
 hf_GPT2_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 hf_Longformer,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1262
-hf_Reformer,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ
+hf_Reformer,pass,pass,pass,pass,pass
 hf_T5,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/1276
 hf_T5_base,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
@@ -70,8 +68,7 @@ mobilenet_v3_large,pass,pass,pass,pass,pass
 moco,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 moondream,pass,pass,pass,pass,pass
 nanogpt,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1260
-nvidia_deeprecommender,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+nvidia_deeprecommender,pass,pass,pass,pass,pass
 opacus_cifar10,pass,pass,pass,pass,pass
 phlippe_densenet,pass,pass,pass,pass,pass
 phlippe_resnet,pass,pass,pass,pass,pass
@@ -91,8 +88,7 @@ sam_fast,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 shufflenet_v2_x1_0,pass,pass,pass,pass,pass
 simple_gpt,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load
 simple_gpt_tp_manual,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load
-# https://github.com/intel/torch-xpu-ops/issues/1273
-soft_actor_critic,pass,fail_accuracy,pass,pass,pass
+soft_actor_critic,pass,pass,pass,pass,pass
 speech_transformer,pass,pass,pass,pass,pass
 squeezenet1_1,pass,fail_accuracy,pass,pass,pass
 stable_diffusion_text_encoder,pass,pass,pass,pass,pass
13 changes: 5 additions & 8 deletions .github/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -3,10 +3,9 @@ torchrec_dlrm,pass,eager_fail_to_run,eager_fail_to_run,pass,pass
 BERT_pytorch,pass,pass,pass,pass,pass
 Background_Matting,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 DALLE2_pytorch,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
-# https://github.com/intel/torch-xpu-ops/issues/1263
-LearningToPaint,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+LearningToPaint,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/1256
-Super_SloMo,eager_two_runs_differ,pass,pass,eager_two_runs_differ,pass
+Super_SloMo,eager_two_runs_differ,pass,pass,pass,pass
 alexnet,pass,pass,pass,pass,pass
 basic_gnn_edgecnn,pass,pass,pass,pass,pass
 basic_gnn_gcn,pass,pass,pass,pass,pass
@@ -46,8 +45,7 @@ hf_DistilBert,pass,pass,pass,pass,pass
 hf_GPT2,pass,pass,pass,pass,pass
 hf_GPT2_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
 hf_Longformer,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1262
-hf_Reformer,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ,eager_two_runs_differ
+hf_Reformer,pass,pass,pass,pass,pass
 hf_T5,pass,pass,pass,pass,pass
 hf_T5_base,pass,pass,pass,pass,pass
 hf_T5_generate,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
@@ -69,8 +67,7 @@ mobilenet_v3_large,pass,pass,pass,pass,pass
 moco,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 moondream,pass,pass,pass,pass,pass
 nanogpt,pass,pass,pass,pass,pass
-# https://github.com/intel/torch-xpu-ops/issues/1260
-nvidia_deeprecommender,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+nvidia_deeprecommender,pass,pass,pass,pass,pass
 opacus_cifar10,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
 phlippe_densenet,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/509
@@ -112,5 +109,5 @@ tts_angular,pass,pass,pass,pass,pass
 vgg16,pass,pass,pass,pass,pass
 # https://github.com/intel/torch-xpu-ops/issues/1264
 vision_maskrcnn,eager_fail_to_run,pass,pass,eager_fail_to_run,eager_fail_to_run
-yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
+yolov3,pass,pass,pass,pass,pass
 hf_Roberta_base,pass,pass,pass,pass,pass
2 changes: 2 additions & 0 deletions .github/scripts/apply_torch_pr.py
@@ -13,6 +13,8 @@
     "https://github.com/pytorch/pytorch/pull/126516",
     # Modify the tolerance level in TIMM benchmark
     "https://github.com/pytorch/pytorch/pull/143739",
+    # Allow XPU device for validating the arguments to sparse compressed tensor factory functions
+    "https://github.com/pytorch/pytorch/pull/147306",
 ]
 )
 parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
12 changes: 5 additions & 7 deletions .github/scripts/lintrunner.sh
@@ -28,9 +28,11 @@ lintrunner init 2> /dev/null
 
 # Do build steps necessary for linters
 if [[ "${CLANG}" == "1" ]]; then
-  pushd ../../
-  cp -rf ./torchgen/packaged/ATen/templates third_party/torch-xpu-ops/yaml/templates
-  python3 third_party/torch-xpu-ops/tools/linter/clang_tidy/generate_build_files.py
+  if [[ -e "third_party/torch-xpu-ops/tools/linter/clang_tidy/generate_build_files.py" ]];then
+    python3 third_party/torch-xpu-ops/tools/linter/clang_tidy/generate_build_files.py
+  else
+    echo "Please run the checker under pytorch source code folder"
+  fi
 fi
 #python3 -m tools.generate_torch_version --is_debug=false
 #python3 -m tools.pyi.gen_pyi \
@@ -49,10 +51,6 @@ if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS}
   RC=1
 fi
 
-if [[ "${CLANG}" == "1" ]]; then
-  popd
-fi
-
 # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
 jq --raw-output \
   '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
11 changes: 10 additions & 1 deletion .github/workflows/pull.yml
@@ -24,14 +24,23 @@ jobs:
     # Don't run on forked repos and draft PRs
     if: ${{ github.repository_owner == 'intel' }}
     name: preci-lint-check
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
       - name: Run lint check
         run: |
           export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MERGE_CONFLICTLESS_CSV --all-files"
           bash .github/scripts/lintrunner.sh
+      - name: Run lint check with Clang
+        run: |
+          sudo apt update -y && sudo apt install -y libomp-dev
+          cd ../ && rm -rf pytorch
+          git clone https://github.com/pytorch/pytorch pytorch
+          cd pytorch && cp -r ../torch-xpu-ops third_party/
+          export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT build/xpu/**/*.* build/xpu/*.* third_party/torch-xpu-ops/src/*.* third_party/torch-xpu-ops/src/**/*.* third_party/torch-xpu-ops/src/**/**/*.* third_party/torch-xpu-ops/src/**/**/**/*.*"
+          export CLANG=1
+          bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh
 
   preci-linux-build:
     # Don't run on forked repos and draft PRs
17 changes: 14 additions & 3 deletions cmake/BuildFlags.cmake
@@ -122,12 +122,23 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-fp32-correctly-rounded-divide-sqrt")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
+  # AOT default targets.
+  # The motivation of AOT default targets here is to provide a minimized set and
+  # enable an efficient building for developers in build from source scenarios.
+  # It doesn't tell what all we support, but targets are the most common
+  # platforms for PyTorch XPU development, which ensures we can meet the
+  # requirement of development at the current stage. TODO: We will support
+  # auto-check on the native platform and Intel SYCL SDK to decide a proper AOT
+  # target for developers automatically.
+  # PyTorch CI/CD and special development requirements are treated as custom
+  # scenarios, where customers are recommended to config custom AOT targets with
+  # `TORCH_XPU_ARCH_LIST`.
   if(WIN32)
-    set(AOT_TARGETS "dg2-g10,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
+    set(AOT_TARGETS "dg2,bmg")
   else()
-    set(AOT_TARGETS "pvc,xe-lpg,dg2-g10")
+    set(AOT_TARGETS "pvc")
   endif()
-  if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
+  if(DEFINED ENV{TORCH_XPU_ARCH_LIST})
     set(AOT_TARGETS "$ENV{TORCH_XPU_ARCH_LIST}")
   endif()
   set(TORCH_XPU_ARCH_LIST ${AOT_TARGETS} PARENT_SCOPE)
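As a usage note: the block above reads TORCH_XPU_ARCH_LIST from the environment to override the per-platform defaults ("dg2,bmg" on Windows, "pvc" elsewhere). A minimal sketch of driving that from a build script follows; the target list and build command are illustrative assumptions, not part of this commit.

# Sketch: build PyTorch XPU from source with a custom AOT target list.
# TORCH_XPU_ARCH_LIST is consumed by cmake/BuildFlags.cmake above; the
# "pvc,dg2" value and the setup.py invocation are hypothetical examples.
import os
import subprocess

env = dict(os.environ, TORCH_XPU_ARCH_LIST="pvc,dg2")
subprocess.run(["python", "setup.py", "develop"], env=env, check=True)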
8 changes: 6 additions & 2 deletions cmake/Codegen.cmake
@@ -8,6 +8,7 @@ file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
+set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
 set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 
@@ -48,6 +49,7 @@ endfunction(GEN_BACKEND)
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
+set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
 set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
@@ -79,7 +81,7 @@ function(GEN_XPU file_yaml)
     --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
     --per-operator-headers
     --static-dispatch-backend
-    --backend-whitelist XPU SparseXPU NestedTensorXPU
+    --backend-whitelist XPU SparseXPU SparseCsrXPU NestedTensorXPU
     # --xpu: generate in-tree RegisterXPU_0.cpp for in-tree OPs
     --xpu
     # --update-aoti-c-shim: generate extend/c_shim_xpu.h
@@ -95,6 +97,7 @@
     # Codegen post-process
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
     ${SIMPLE_TRACE}
     WORKING_DIRECTORY ${TORCH_ROOT}
@@ -125,6 +128,7 @@ GEN_XPU(
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions.h
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp
+  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
@@ -137,7 +141,7 @@
 # $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
 list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
 
-list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH})
+list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterSparseCsrXPU_PATH} ${RegisterNestedTensorXPU_PATH})
 list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
 add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
 set(ATen_XPU_GEN_SRCS ${xpu_generated_src})
23 changes: 0 additions & 23 deletions src/ATen/native/sparse/SparseCsrTensor.cpp

This file was deleted.

46 changes: 46 additions & 0 deletions src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
@@ -0,0 +1,46 @@
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>
#include <xpu/ATen/ops/_convert_indices_from_coo_to_csr_native.h>
#include <xpu/ATen/ops/_convert_indices_from_csr_to_coo_native.h>

namespace at::native {

using namespace at::sparse;

TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_xpu)
(const Tensor& input,
const int64_t size,
const bool out_int32,
const Tensor& result) {
xpu::convert_indices_from_coo_to_csr_structured_kernel(
input, size, out_int32, result);
};

TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_xpu)
(const Tensor& crow_indices,
const Tensor& col_indices,
const bool out_int32,
const bool transpose,
const Tensor& result) {
xpu::convert_indices_from_csr_to_coo_structured_kernel(
crow_indices, col_indices, out_int32, transpose, result);
};

Tensor _sparse_csr_sum_xpu(
const Tensor& input,
IntArrayRef dims_to_sum,
bool keepdim,
std::optional<ScalarType> dtype) {
return xpu::_sparse_csr_sum_xpu_kernel(input, dims_to_sum, keepdim, dtype);
}

Tensor _sparse_csr_prod_xpu(
const Tensor& input,
IntArrayRef dims_to_reduce,
bool keepdim,
std::optional<ScalarType> dtype) {
return xpu::_sparse_csr_prod_xpu_kernel(
input, dims_to_reduce, keepdim, dtype);
}

} // namespace at::native
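For context, a rough sketch of how these registrations surface through the public API; it assumes an XPU-enabled PyTorch build with a visible XPU device, and the exact dispatch path may vary by version.

# Sketch: exercising the CSR index-conversion and reduction kernels above.
# Assumes torch.xpu.is_available() is True; illustrative, not from the commit.
import torch

dense = torch.tensor([[0.0, 2.0, 0.0],
                      [3.0, 0.0, 4.0]], device="xpu")
csr = dense.to_sparse_csr()        # CSR construction exercises the COO->CSR index path
row_sums = torch.sum(csr, dim=1)   # dim reductions are backed by _sparse_csr_sum_xpu
print(row_sums)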
39 changes: 39 additions & 0 deletions src/ATen/native/sparse/xpu/SparseSoftmax.cpp
@@ -0,0 +1,39 @@
#include <ATen/native/sparse/ParamUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/xpu/sycl/SparseSoftmaxKernels.h>

namespace at::native {

using namespace at::sparse;

Tensor softmax_sparse_xpu(
const Tensor& input_,
const int64_t dim_,
const bool half_to_float) {
return xpu::softmax_sparse_xpu_kernel(input_, dim_, half_to_float);
}

Tensor log_softmax_sparse_xpu(
const Tensor& input_,
const int64_t dim_,
const bool half_to_float) {
return xpu::log_softmax_sparse_xpu_kernel(input_, dim_, half_to_float);
}

Tensor softmax_backward_sparse_xpu(
const Tensor& grad_,
const Tensor& output_,
int64_t dim_,
const Tensor& input_) {
return xpu::softmax_backward_sparse_xpu_kernel(grad_, output_, dim_, input_);
}

Tensor log_softmax_backward_sparse_xpu(
const Tensor& grad_,
const Tensor& output_,
int64_t dim_,
const Tensor& input_) {
return xpu::log_softmax_backward_sparse_xpu_kernel(
grad_, output_, dim_, input_);
}
} // namespace at::native
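A brief usage sketch (XPU-enabled build and device assumed; illustrative only): these entry points implement torch.sparse.softmax and torch.sparse.log_softmax for sparse COO inputs on XPU.

# Sketch: sparse softmax over a COO tensor on an XPU device.
import torch

indices = torch.tensor([[0, 0, 1], [0, 2, 1]])
values = torch.tensor([1.0, 2.0, 3.0])
s = torch.sparse_coo_tensor(indices, values, (2, 3), device="xpu").coalesce()

out = torch.sparse.softmax(s, dim=1)          # -> softmax_sparse_xpu
log_out = torch.sparse.log_softmax(s, dim=1)  # -> log_softmax_sparse_xpu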
7 changes: 7 additions & 0 deletions src/ATen/native/sparse/xpu/SparseTensorMath.cpp
@@ -19,4 +19,11 @@ SparseTensor& mul_out_sparse_xpu(
   return xpu::mul_sparse_kernel(t_, src_, r_);
 }
 
+Tensor _sparse_sum_backward_xpu(
+    const Tensor& grad_,
+    const SparseTensor& input_,
+    IntArrayRef dims_to_sum) {
+  return xpu::_sparse_sum_backward_kernel(grad_, input_, dims_to_sum);
+}
+
 } // namespace at::native
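A short autograd sketch (XPU device assumed; illustrative, not from the commit): torch.sparse.sum over given dims has a backward pass that lands in the new _sparse_sum_backward_xpu entry point.

# Sketch: forward torch.sparse.sum and the backward pass that
# _sparse_sum_backward_xpu routes to the XPU kernel.
import torch

indices = torch.tensor([[0, 1], [1, 0]])
values = torch.tensor([2.0, 5.0], requires_grad=True)
s = torch.sparse_coo_tensor(indices, values, (2, 2), device="xpu")

out = torch.sparse.sum(s, dim=[0])   # sparse reduction over dim 0
out.to_dense().sum().backward()      # backward -> _sparse_sum_backward_kernel
print(values.grad)                   # gradients flow to the leaf values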