From be0efc090c533c765282f9514938882131e0d8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 6 Nov 2024 09:02:10 +0100 Subject: [PATCH] [CI] remove unused inductor workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The torch inductor tests were completely offloaded to Meta a few months ago. They are currently disabled on GitHub. Signed-off-by: Sébastien Han --- .github/workflows/torch-inductor-tests.yml | 45 ----------- .../torch-inductor/scripts/check_acc.py | 11 --- .../torch-inductor/scripts/check_perf.py | 70 ------------------ .../torch-inductor/scripts/common.sh | 9 --- .../scripts/install_torchinductor.sh | 74 ------------------- .../torch-inductor/scripts/install_triton.sh | 25 ------- .../scripts/run_torchinductor_acc.sh | 55 -------------- .../scripts/run_torchinductor_perf.sh | 71 ------------------ 8 files changed, 360 deletions(-) delete mode 100644 .github/workflows/torch-inductor-tests.yml delete mode 100644 .github/workflows/torch-inductor/scripts/check_acc.py delete mode 100644 .github/workflows/torch-inductor/scripts/check_perf.py delete mode 100755 .github/workflows/torch-inductor/scripts/common.sh delete mode 100755 .github/workflows/torch-inductor/scripts/install_torchinductor.sh delete mode 100755 .github/workflows/torch-inductor/scripts/install_triton.sh delete mode 100755 .github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh delete mode 100755 .github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh diff --git a/.github/workflows/torch-inductor-tests.yml b/.github/workflows/torch-inductor-tests.yml deleted file mode 100644 index 3d8f98095291..000000000000 --- a/.github/workflows/torch-inductor-tests.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Torchinductor - -on: - workflow_run: - workflows: ["Wheels"] - types: [completed] - workflow_dispatch: - -permissions: read-all - -jobs: - Runner-Preparation: - runs-on: ubuntu-latest - outputs: - 
matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - name: Prepare runner matrix - id: set-matrix - run: | - echo '::set-output name=matrix::[["self-hosted", "A100"]]' - - Torch-Inductor-Tests: - needs: Runner-Preparation - timeout-minutes: 240 # 4 hours - runs-on: ${{ matrix.runner }} - strategy: - matrix: - runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix)}} - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Packages - run: | - ./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench - - name: Environment - run: | - source /tmp/torchinductor_venv/bin/activate - ./.github/workflows/torch-inductor/scripts/install_triton.sh - - name: Performance - run: | - ./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench - # Runs too long time - #- name: Accuracy - # run: | - # ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh torchbench diff --git a/.github/workflows/torch-inductor/scripts/check_acc.py b/.github/workflows/torch-inductor/scripts/check_acc.py deleted file mode 100644 index c89976acab11..000000000000 --- a/.github/workflows/torch-inductor/scripts/check_acc.py +++ /dev/null @@ -1,11 +0,0 @@ -import csv -import sys - -file_path = sys.argv[1] -with open(file_path) as f: - reader = csv.reader(f) - for i, row in enumerate(reader): - if i == 0: - continue - if row[3] != "pass": - print(f"{row[1]} failed on device {row[0]} with batch size {row[2]}") diff --git a/.github/workflows/torch-inductor/scripts/check_perf.py b/.github/workflows/torch-inductor/scripts/check_perf.py deleted file mode 100644 index 212eadad55ae..000000000000 --- a/.github/workflows/torch-inductor/scripts/check_perf.py +++ /dev/null @@ -1,70 +0,0 @@ -import argparse -import csv -from collections import namedtuple - -# Create a named tuple for the output of the benchmark -BenchmarkOutput = namedtuple('BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency']) - - -def parse_output(file_path: 
str) -> dict: - entries = {} - with open(file_path) as f: - reader = csv.reader(f) - for i, row in enumerate(reader): - if i == 0 or len(row) < 5: - continue - dev = row[0] - name = row[1] - batch_size = row[2] - speedup = float(row[3]) - latency = float(row[4]) - entries[name] = BenchmarkOutput(dev, name, batch_size, speedup, latency) - return entries - - -def compare(baseline: dict, new: dict, threshold: float, geomean_threshold: float) -> bool: - baseline_geomean = 1.0 - new_geomean = 1.0 - for key in new: - if key not in baseline: - print(f"New benchmark {key} not found in baseline") - baseline_latency = baseline[key].latency - new_latency = new[key].latency - if baseline_latency == 0: - print(f"Baseline latency for {key} is 0") - continue - elif new_latency == 0: - print(f"New latency for {key} is 0") - continue - - if new_latency < baseline_latency * (1 - threshold): - print(f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}") - elif new_latency > baseline_latency * (1 + threshold): - print(f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}") - else: - print(f"New benchmark {key} is within threshold: {new_latency} vs {baseline_latency}") - baseline_geomean *= baseline[key].speedup - new_geomean *= new[key].speedup - - baseline_geomean = baseline_geomean**(1 / len(baseline)) - new_geomean = new_geomean**(1 / len(new)) - print(f"Baseline geomean: {baseline_geomean}") - print(f"New geomean: {new_geomean}") - assert new_geomean >= baseline_geomean * (1 - geomean_threshold), \ - f"New geomean is slower than baseline: {new_geomean} vs {baseline_geomean}" - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--baseline', required=True) - parser.add_argument('--new', required=True) - parser.add_argument('--threshold', type=float, default=0.1) - parser.add_argument('--geomean-threshold', type=float, default=0.02) - args = parser.parse_args() - baseline = parse_output(args.baseline) 
- new = parse_output(args.new) - compare(baseline, new, args.threshold, args.geomean_threshold) - - -if __name__ == "__main__": - main() diff --git a/.github/workflows/torch-inductor/scripts/common.sh b/.github/workflows/torch-inductor/scripts/common.sh deleted file mode 100755 index 7e212a06a1ba..000000000000 --- a/.github/workflows/torch-inductor/scripts/common.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -TEST_REPORTS_DIR=/tmp/torchinductor_reports -PYTORCH_DIR=/tmp/pytorch -MODELS=(timm_models huggingface torchbench) - -echo "$TEST_REPORTS_DIR" -echo "$PYTORCH_DIR" -echo "${MODELS[@]}" diff --git a/.github/workflows/torch-inductor/scripts/install_torchinductor.sh b/.github/workflows/torch-inductor/scripts/install_torchinductor.sh deleted file mode 100755 index 18bea1f1716f..000000000000 --- a/.github/workflows/torch-inductor/scripts/install_torchinductor.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# remember where we started -ROOT="$(pwd)" -MODEL_SPEC=$1 - -# torchinductor venv -whoami - -sudo apt-get update && sudo apt-get install -y python3-venv libgl1 - -# clean up old venv -rm -rf /tmp/torchinductor_venv -python3 -m venv /tmp/torchinductor_venv -# shellcheck source=/dev/null -source /tmp/torchinductor_venv/bin/activate -# shellcheck source=/dev/null -source ./.github/workflows/torch-inductor/scripts/common.sh - -pip3 install --upgrade pip wheel setuptools - -# Install torchtext stable first. Bundling it in the same install as torch -# nightly forces torch stable release to be installed instead. -# From https://github.com/pytorch/text?tab=readme-ov-file#torchtext, -# "WARNING: TorchText development is stopped and the 0.18 release (April 2024) -# will be the last stable release of the library." 
-pip3 install --force-reinstall torchtext - -# pytorch nightly -pip3 install --force-reinstall --pre torch torchvision torchaudio torchrec --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -# pytorch source to get torchbench for dynamo -cd /tmp || exit -# cleanup old pytorch -rm -rf pytorch -git clone --recursive https://github.com/pytorch/pytorch -cd pytorch || exit -# if you are updating an existing checkout -git submodule sync -git submodule update --init --recursive -cd .. - -# required packages -# https://github.com/pytorch/benchmark/blob/main/docker/gcp-a100-runner-dind.dockerfile#L17 -sudo apt-get install --yes libpango-1.0-0 libpangoft2-1.0-0 -pip3 install expecttest psutil lightning-utilities pyre_extensions - -# torchbench -if [ "$MODEL_SPEC" == "torchbench" ] || [ "$MODEL_SPEC" != "all" ]; then - # clean up old torchbench - rm -rf benchmark - pip3 install pyyaml - git clone https://github.com/pytorch/benchmark.git - cd benchmark || exit - python3 install.py - cd .. -fi - -# timm -if [ "$MODEL_SPEC" == "timm_models" ] || [ "$MODEL_SPEC" != "all" ]; then - # clean up old timm - rm -rf pytorch-image-models - git clone https://github.com/huggingface/pytorch-image-models.git - cd pytorch-image-models || exit - pip3 install -e . - cd .. 
-fi - -# clean up cache -rm -rf /tmp/torchinductor_"$(whoami)"/ -rm -rf ~/.triton/cache -rm -rf "$TEST_REPORTS_DIR" - -# go back to where we started -cd "$ROOT" || exit diff --git a/.github/workflows/torch-inductor/scripts/install_triton.sh b/.github/workflows/torch-inductor/scripts/install_triton.sh deleted file mode 100755 index 43367a02f527..000000000000 --- a/.github/workflows/torch-inductor/scripts/install_triton.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# remember where we started -ROOT="$(pwd)" - -# shellcheck source=/dev/null -source /tmp/torchinductor_venv/bin/activate -# shellcheck source=/dev/null -source ./.github/workflows/torch-inductor/scripts/common.sh - -# Triton build-time dependencies -pip3 install --upgrade cmake ninja lit - -# build our own triton and preserve the wheel build for later re-use in this test run. -cd python || exit -pip3 uninstall pytorch-triton -y -rm -rf build dist -python3 setup.py bdist_wheel -pip3 install dist/triton*.whl - -# clean up cache -rm -rf ~/.triton/cache - -# go back to where we started -cd "$ROOT" || exit diff --git a/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh b/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh deleted file mode 100755 index aefd798f39ff..000000000000 --- a/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -# remember where we started -ROOT="$(pwd)" -INDUCTOR="$ROOT"/.github/workflows/torch-inductor -MODEL_SPEC=$1 - -# shellcheck source=/dev/null -source /tmp/torchinductor_venv/bin/activate -# shellcheck source=/dev/null -source "$INDUCTOR"/scripts/common.sh - -# Dependency of 'torch/fx/experimental/validator.py'. -pip3 install --upgrade z3-solver - -# Install our own triton. -pip3 uninstall pytorch-triton -y -cd $ROOT/python || exit -if [ -d "./dist" ]; then - pip3 install dist/triton*.whl -else - rm -rf build - pip3 install -e . 
-fi - -cd "$PYTORCH_DIR" || exit -TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc -mkdir -p "$TEST_REPORTS_DIR" - -for model in "${MODELS[@]}"; do - if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then - continue - fi - echo "Running accuracy test for $model" - python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --inference --device cuda \ - --output "$TEST_REPORTS_DIR"/inference_"$model".csv - python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --training --amp --device cuda \ - --output "$TEST_REPORTS_DIR"/training_"$model".csv - python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --training --dynamic-shapes --device cuda \ - --output "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv -done - -cd "$ROOT" || exit -for model in "${MODELS[@]}"; do - if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then - continue - fi - echo "Checking accuracy test for $model" - python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv - python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv - python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv -done - -# go back to where we started -cd "$ROOT" || exit diff --git a/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh b/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh deleted file mode 100755 index 35853d97c8fe..000000000000 --- a/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# remember where we started -ROOT="$(pwd)" -INDUCTOR="$ROOT"/.github/workflows/torch-inductor -MODEL_SPEC=$1 - -# shellcheck source=/dev/null -source /tmp/torchinductor_venv/bin/activate -# shellcheck source=/dev/null -source "$INDUCTOR"/scripts/common.sh - -# lock GPU clocks to 1350 MHz -sudo nvidia-smi -i 0 -pm 1 -sudo nvidia-smi -i 0 
--lock-gpu-clocks=1350,1350 - -cd "$PYTORCH_DIR" || exit -TRITON_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf -BASE_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc -mkdir -p "$TRITON_TEST_REPORTS_DIR" -mkdir -p "$BASE_TEST_REPORTS_DIR" - -# Dependency of 'pytorch/benchmarks/dynamo/common.py'. -pip3 install pandas scipy - -echo "Running with Triton Nightly" -for model in "${MODELS[@]}"; do - if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then - continue - fi - echo "Running performance test for $model" - python3 benchmarks/dynamo/"$model".py --ci --float32 --training --inductor --performance --device cuda \ - --output "$TRITON_TEST_REPORTS_DIR"/"$model".csv -done - -# install pytorch-triton -pip3 uninstall triton -y -pip3 install --pre pytorch-triton --extra-index-url https://download.pytorch.org/whl/nightly/cu121 - -echo "Running with pytorch-triton" -for model in "${MODELS[@]}"; do - if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then - continue - fi - echo "Running performance test for $model" - python3 benchmarks/dynamo/"$model".py --ci --float32 --training --inductor --performance --device cuda \ - --output "$BASE_TEST_REPORTS_DIR"/"$model".csv -done - -# uninstall pytorch-triton -pip3 uninstall pytorch-triton -y - -cd "$ROOT" || exit -for model in "${MODELS[@]}"; do - if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then - continue - fi - echo "Checking performance test for $model" - python3 "$INDUCTOR"/scripts/check_perf.py --new "$TRITON_TEST_REPORTS_DIR"/"$model".csv --baseline "$BASE_TEST_REPORTS_DIR"/"$model".csv - EXIT_STATUS=$? - if [ "$EXIT_STATUS" -ne 0 ]; then - echo "Performance test for $model failed" - exit "$EXIT_STATUS" - fi -done - -# unlock GPU clocks -sudo nvidia-smi -i 0 -rgc - -# go back to where we started -cd "$ROOT" || exit