# Review notes (preamble ahead of the first `diff --git` header).
#
# Purpose: patch for two GitHub Actions workflow files that moves the
# `xpu_distributed` unit-test suite into a dedicated job.
#
# .github/workflows/_linux_ut.yml
#   * `ut_test` gains `if: ${{ inputs.ut != 'xpu_distributed' }}`. This is an
#     exact string comparison, so the job is skipped only when `ut` is that
#     single suite name and nothing else.
#   * The hunk lists two steps under `ut_test` as added lines:
#     "UT Test Results Check" (runs ut_result_check.sh per suite for
#     op_regression, op_regression_dev1, op_extended, op_ut, torch_xpu) and
#     "Upload Inductor XPU UT Log" (always runs, uploads ut_log).
#   * New job `distributed_ut_test`: runs on `pvc_e2e`, gated by
#     `contains(inputs.ut, 'xpu_distributed')`, 900-minute timeout. Steps
#     visible here: Checkout torch-xpu-ops, Prepare Stock Pytorch,
#     Triton Installation, Download Pytorch wheel, Install Pytorch XPU.
#   * A later hunk replaces the per-suite result loop (whose allow-list
#     included `xpu_distributed`) with one fixed call,
#     `bash ut_result_check.sh 'xpu_distributed'`, run from
#     ut_log/xpu_distributed. Presumably this belongs to
#     `distributed_ut_test`; the hunk header is not visible, so confirm.
#
# .github/workflows/pull.yml
#   * Removes the `preci-ut-distributed` job (runner `pvc_e2e`) and appends
#     `xpu_distributed` to the `ut` list of the remaining `_linux_ut.yml`
#     call, which keeps `runner: linux.idc.xpu`.
#
# NOTE(review): with the combined `ut` list from pull.yml the `ut_test` guard
# evaluates to true, so `ut_test` still runs. Its result loop then reaches
# `xpu_distributed`, which is not in the list passed to `contains`, so that
# iteration prints the warning and executes `continue`.
# NOTE(review): the warning string spells "suppotted" (sic); it is runtime
# text and is left unchanged here.
# NOTE(review): this copy of the patch has its line breaks collapsed and is
# missing text between `TORCH_XPU_OPS_COMMIT=$(` and `> "${GITHUB_ENV}"`.
# It does not look applicable as-is; regenerate it from the commit before use.
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a4be06964..b3a46ba5e 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -49,6 +49,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} + if: ${{ inputs.ut != 'xpu_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} @@ -234,8 +235,134 @@ jobs: test_cmd="${test_cmd} test_xpu.py" fi eval $test_cmd 2>${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test_error.log | tee ${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test.log + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g') + do + contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu" $ut_suite + $contains_status + cd ${{ github.workspace }}/ut_log/${ut_suite} + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh ${ut_suite} + done + - name: Upload Inductor XPU UT Log + if: always() + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-${{ env.UT_NAME }} + path: ${{ github.workspace }}/ut_log + + distributed_ut_test: + runs-on: pvc_e2e + if: contains(inputs.ut, 'xpu_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y 
-n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/pytorch/pytorch pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export 
_GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" - for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g') - do - contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite - $contains_status - cd ${{ github.workspace }}/ut_log/${ut_suite} - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh ${ut_suite} - done + cd ${{ github.workspace }}/ut_log/xpu_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'xpu_distributed' - name: Upload Inductor XPU UT Log if: always() uses: actions/upload-artifact@v4 diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 09d09af5e..bc88bd875 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -64,21 +64,9 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} - ut: op_regression,op_regression_dev1,op_extended,op_ut + ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu - preci-ut-distributed: - # Don't run on forked repos and draft PRs - secrets: inherit - if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} - name: preci-linux - needs: 
preci-linux-build - uses: ./.github/workflows/_linux_ut.yml - with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} - ut: xpu_distributed - runner: pvc_e2e - Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test needs: preci-linux-build