periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck #3848
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck

# Triggers: ciflow tag pushes, a schedule (minute 0 of every 4th hour), and
# manual dispatch.
on:
  push:
    tags:
      - 'ciflow/all/*'
      - 'ciflow/cuda/*'
      - 'ciflow/linux/*'
      - 'ciflow/scheduled/*'
      - 'ciflow/slow/*'
      - 'ciflow/slow-gradcheck/*'
  schedule:
    # Quoted so the asterisks are always read as plain string content.
    - cron: '0 */4 * * *'
  workflow_dispatch:

# Workflow-wide environment shared by every job below.
env:
  BUILD_ENVIRONMENT: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
  # Quoted: this is a version-like string, not a float.
  TORCH_CUDA_ARCH_LIST: '5.2'
  IN_CI: 1
  IS_GHA: 1
  # This is used for the phase of adding wheel tests only, will be removed once completed
  IN_WHEEL_TEST: 1
  # Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  AWS_DEFAULT_REGION: us-east-1
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  PYTORCH_RETRY_TEST_CASES: 1

# One active run per PR number (or commit SHA when there is no PR); manually
# dispatched runs get their own group via the trailing true/false suffix.
concurrency:
  group: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
# Build job: prepares the runner, resolves (or rebuilds) the CI docker image,
# runs .jenkins/pytorch/build.sh inside it and uploads artifacts.zip to S3.
build:
  runs-on: linux.2xlarge
  timeout-minutes: 240
  env:
    JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-build
  outputs:
    # Full image reference (base:tag) computed by the calculate-tag step.
    docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
  steps:
    - name: print labels
      run: echo "${PR_LABELS}"
    - name: Display EC2 information
      shell: bash
      run: |
        set -euo pipefail
        function get_ec2_metadata() {
          # Pulled from instance metadata endpoint for EC2
          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
          category=$1
          curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
        }
        echo "ami-id: $(get_ec2_metadata ami-id)"
        echo "instance-id: $(get_ec2_metadata instance-id)"
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"
    - name: Start docker if docker deamon is not running
      run: |
        if systemctl is-active --quiet docker; then
          echo "Docker daemon is running...";
        else
          echo "Starting docker deamon..." && sudo systemctl start docker;
        fi
    - name: Log in to ECR
      env:
        AWS_RETRY_MODE: standard
        AWS_MAX_ATTEMPTS: 5
      run: |
        # Ask the CLI for the account id directly instead of scraping its JSON
        # output, so the result does not depend on the configured output format.
        AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
          --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
    - name: Chown workspace
      run: |
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry docker pull "${ALPINE_IMAGE}"
        # Ensure the working directory gets chowned back to the current user
        docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Clean workspace
      run: |
        rm -rf "${GITHUB_WORKSPACE}"
        mkdir "${GITHUB_WORKSPACE}"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    - name: Preserve github env variables for use in docker
      run: |
        env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
    - name: Checkout PyTorch
      uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
      with:
        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
        # deep clone, to allow use of git merge-base
        fetch-depth: 0
        submodules: recursive
    - name: Clean PyTorch checkout
      run: |
        # Remove any artifacts from the previous checkouts
        git clean -fxd
    # The image tag is the git tree hash of .circleci/docker at HEAD. It is
    # exported both to later steps (GITHUB_ENV) and as step outputs.
    - name: Calculate docker image tag
      id: calculate-tag
      run: |
        DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
        echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
        echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
        echo "::set-output name=docker_tag::${DOCKER_TAG}"
        echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
    # Sets the `rebuild` output only when the image is missing from the
    # registry and .circleci/docker differs from its state at the merge base.
    - name: Check if image should be built
      id: check
      env:
        BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
      run: |
        set -x
        # Check if image already exists, if it does then skip building it
        if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
          exit 0
        fi
        if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
          # if we're on the base branch then use the parent commit
          MERGE_BASE=$(git rev-parse HEAD~)
        else
          # otherwise we're on a PR, so use the most recent base commit
          MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
        fi
        # Covers the case where a previous tag doesn't exist for the tree
        # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
        if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
          echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
          exit 1
        fi
        PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
        # If no image exists but the hash is the same as the previous hash then we should error out here
        if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
          echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
          echo " contact the PyTorch team to restore the original images"
          exit 1
        fi
        echo ::set-output name=rebuild::yes
    - name: Build and push docker image
      if: ${{ steps.check.outputs.rebuild }}
      env:
        DOCKER_SKIP_S3_UPLOAD: 1
      working-directory: .circleci/docker
      run: |
        export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
        ./build_docker.sh
    - name: Pull Docker image
      run: |
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry docker pull "${DOCKER_IMAGE}"
    - name: Parse ref
      shell: bash
      id: parse-ref
      run: ./.github/scripts/parse_ref.py
    - name: Build
      env:
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
      run: |
        # detached container should get cleaned up by teardown_ec2_linux
        # `docker run --detach` prints the new container's ID, captured here.
        container_name=$(docker run \
          -e BUILD_ENVIRONMENT \
          -e JOB_BASE_NAME \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e AWS_DEFAULT_REGION \
          -e IS_GHA \
          -e PR_NUMBER \
          -e SHA1 \
          -e BRANCH \
          -e GITHUB_RUN_ID \
          -e SCCACHE_BUCKET \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e TORCH_CUDA_ARCH_LIST \
          -e PR_LABELS \
          -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" \
          -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" \
          -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --tty \
          --detach \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
    - name: Display and upload binary build size statistics (Click Me)
      # temporary hack: set CIRCLE_* vars, until we update
      # tools/stats/print_test_stats.py to natively support GitHub Actions
      env:
        SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        TAG: ${{ steps.parse-ref.outputs.tag }}
        WORKFLOW_ID: '${{ github.run_id }}'
      run: |
        COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
        export COMMIT_TIME
        pip3 install requests==2.26 boto3==1.16.34
        # Best effort: a failed upload must not fail the build.
        python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
    - name: Chown workspace
      run: |
        # Ensure the working directory gets chowned back to the current user
        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Archive artifacts into zip
      run: |
        zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
    - uses: seemethere/upload-artifact-s3@v3
      name: Store PyTorch Build Artifacts on S3
      with:
        name: ${{ env.BUILD_ENVIRONMENT }}
        retention-days: 14
        if-no-files-found: error
        path: artifacts.zip
    # Teardown: every step below runs even when an earlier step failed.
    - name: Hold runner for 2 hours or until ssh sessions have drained
      # Always hold for active ssh sessions
      if: always()
      run: .github/scripts/wait_for_ssh_to_drain.sh
    - name: Chown workspace
      if: always()
      run: |
        # Ensure the working directory gets chowned back to the current user
        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Kill containers, clean up images
      if: always()
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Prune all of the docker images
        docker system prune -af
    # NOTE(review): the two steps below repeat the hold and prune steps above;
    # presumably a template artifact - confirm before removing.
    - name: Hold runner for 2 hours or until ssh sessions have drained
      # Always hold for active ssh sessions
      if: always()
      run: .github/scripts/wait_for_ssh_to_drain.sh
    - name: Clean up docker images
      if: always()
      run: |
        # Prune all of the docker images
        docker system prune -af
# Test job, shard 1 of 2 of the "default" config: downloads the artifacts
# uploaded by the build job, runs the test script inside the CI docker image
# on a GPU runner, then uploads reports and statistics.
test_default_1_2:
  name: test (default, 1, 2, linux.4xlarge.nvidia.gpu)
  needs: build
  runs-on: linux.4xlarge.nvidia.gpu
  timeout-minutes: 390
  env:
    DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
    JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test
    TEST_CONFIG: default
    SHARD_NUMBER: 1
    NUM_TEST_SHARDS: 2
    PR_BODY: ${{ github.event.pull_request.body }}
  steps:
    - name: Display EC2 information
      shell: bash
      run: |
        set -euo pipefail
        function get_ec2_metadata() {
          # Pulled from instance metadata endpoint for EC2
          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
          category=$1
          curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
        }
        echo "ami-id: $(get_ec2_metadata ami-id)"
        echo "instance-id: $(get_ec2_metadata instance-id)"
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"
    - name: Start docker if docker deamon is not running
      run: |
        if systemctl is-active --quiet docker; then
          echo "Docker daemon is running...";
        else
          echo "Starting docker deamon..." && sudo systemctl start docker;
        fi
    - name: Log in to ECR
      env:
        AWS_RETRY_MODE: standard
        AWS_MAX_ATTEMPTS: 5
      run: |
        # Ask the CLI for the account id directly instead of scraping its JSON
        # output, so the result does not depend on the configured output format.
        AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
          --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
    - name: Chown workspace
      run: |
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry docker pull "${ALPINE_IMAGE}"
        # Ensure the working directory gets chowned back to the current user
        docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Clean workspace
      run: |
        rm -rf "${GITHUB_WORKSPACE}"
        mkdir "${GITHUB_WORKSPACE}"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    - name: Preserve github env variables for use in docker
      run: |
        env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
    - name: Checkout PyTorch
      uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
      with:
        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
        # deep clone, to allow use of git merge-base
        fetch-depth: 0
        submodules: recursive
    - name: Clean PyTorch checkout
      run: |
        # Remove any artifacts from the previous checkouts
        git clean -fxd
    - name: Pull Docker image
      run: |
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry docker pull "${DOCKER_IMAGE}"
    - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
      name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: |
          set -ex
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
    # Exports SHM_SIZE for the Test step: 1g by default, 2g for cuda builds,
    # 8g for rocm builds (matched against BUILD_ENVIRONMENT).
    - name: Determine shm-size
      run: |
        shm_size="1g"
        case "${BUILD_ENVIRONMENT}" in
          *cuda*)
            shm_size="2g"
            ;;
          *rocm*)
            shm_size="8g"
            ;;
        esac
        echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
    - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
      name: Download PyTorch Build Artifacts
      with:
        name: ${{ env.BUILD_ENVIRONMENT }}
    - name: Unzip artifacts
      run: |
        unzip -o artifacts.zip
    - name: Output disk space left
      run: |
        sudo df -H
    - name: Parse ref
      shell: bash
      id: parse-ref
      run: ./.github/scripts/parse_ref.py
    - name: Test
      env:
        PR_NUMBER: ${{ github.event.pull_request.number }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
      # Time out the test phase after 360 minutes
      timeout-minutes: 360
      run: |
        set -x
        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.jenkins/caffe2/test.sh
        else
          TEST_COMMAND=.jenkins/pytorch/test.sh
        fi
        PROXY_ENV=
        # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now
        # We should investigate whether or not there's a list of hostnames we can add to no_proxy to
        # make it so that we shouldn't have to fully disable squid for XLA tests
        if [[ $TEST_CONFIG != 'xla' ]]; then
          # shellcheck disable=SC2089
          PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
        fi
        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # No --name flag is passed: `docker run --detach` prints the new
        # container's ID, and that ID is what container_name ends up holding.
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
          -e GITHUB_ACTIONS \
          -e IN_CI \
          -e IS_GHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e JOB_BASE_NAME \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e PR_BODY \
          -e PYTORCH_RETRY_TEST_CASES \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          ${PROXY_ENV} \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --ulimit stack=10485760:83886080 \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
    - name: Chown workspace
      if: always()
      run: |
        # Ensure the working directory gets chowned back to the current user
        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Install render_test_results dependencies
      if: always()
      shell: bash
      run: |
        python3 -m pip install junitparser==2.1.1 rich==10.9.0
    - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
      if: always()
      shell: bash
      # Encoding is weird on windows, just try to default to utf-8 if possible
      env:
        PYTHONIOENCODING: "utf-8"
      run: |
        python3 tools/render_junit.py test/
    - name: Zip JSONs for upload
      if: always()
      env:
        FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu'
      run: |
        # Remove any previous test jsons if they exist
        rm -f test-jsons-*.zip
        zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
    - uses: seemethere/upload-artifact-s3@v3
      name: Store Test Downloaded JSONs on S3
      if: always()
      with:
        retention-days: 14
        if-no-files-found: warn
        path: test-jsons-*.zip
    - name: Zip test reports for upload
      if: always()
      env:
        FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu'
      run: |
        # Remove any previous test reports if they exist
        rm -f test-reports-*.zip
        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
    - uses: seemethere/upload-artifact-s3@v3
      name: Store Test Reports on S3
      if: always()
      with:
        retention-days: 14
        if-no-files-found: error
        path: test-reports-*.zip
    # Core dumps are only uploaded when an earlier step failed.
    - uses: seemethere/upload-artifact-s3@v3
      name: Store Core dumps on S3
      if: failure()
      with:
        name: coredumps-default-1
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*
    - name: Upload test statistics
      if: always()
      env:
        AWS_DEFAULT_REGION: us-east-1
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test
        PR_NUMBER: ${{ github.event.pull_request.number }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        TAG: ${{ steps.parse-ref.outputs.tag }}
        WORKFLOW_ID: '${{ github.run_id }}'
      shell: bash
      run: |
        python3 -m pip install -r requirements.txt
        python3 -m pip install boto3==1.19.12
        python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
    # Teardown: every step below runs even when an earlier step failed.
    - name: Hold runner for 2 hours or until ssh sessions have drained
      # Always hold for active ssh sessions
      if: always()
      run: .github/scripts/wait_for_ssh_to_drain.sh
    - name: Chown workspace
      if: always()
      run: |
        # Ensure the working directory gets chowned back to the current user
        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Kill containers, clean up images
      if: always()
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Prune all of the docker images
        docker system prune -af
# Test job, shard 2 of 2 of the "default" config: downloads the artifacts
# uploaded by the build job, runs the test script inside the CI docker image
# on a GPU runner, then uploads reports and statistics.
test_default_2_2:
  name: test (default, 2, 2, linux.4xlarge.nvidia.gpu)
  needs: build
  runs-on: linux.4xlarge.nvidia.gpu
  timeout-minutes: 390
  env:
    DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
    JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test
    TEST_CONFIG: default
    SHARD_NUMBER: 2
    NUM_TEST_SHARDS: 2
    PR_BODY: ${{ github.event.pull_request.body }}
  steps:
    - name: Display EC2 information
      shell: bash
      run: |
        set -euo pipefail
        function get_ec2_metadata() {
          # Pulled from instance metadata endpoint for EC2
          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
          category=$1
          curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
        }
        echo "ami-id: $(get_ec2_metadata ami-id)"
        echo "instance-id: $(get_ec2_metadata instance-id)"
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"
    - name: Start docker if docker deamon is not running
      run: |
        if systemctl is-active --quiet docker; then
          echo "Docker daemon is running...";
        else
          echo "Starting docker deamon..." && sudo systemctl start docker;
        fi
    - name: Log in to ECR
      env:
        AWS_RETRY_MODE: standard
        AWS_MAX_ATTEMPTS: 5
      run: |
        # Ask the CLI for the account id directly instead of scraping its JSON
        # output, so the result does not depend on the configured output format.
        AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
          --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
    - name: Chown workspace
      run: |
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry docker pull "${ALPINE_IMAGE}"
        # Ensure the working directory gets chowned back to the current user
        docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Clean workspace
      run: |
        rm -rf "${GITHUB_WORKSPACE}"
        mkdir "${GITHUB_WORKSPACE}"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    - name: Preserve github env variables for use in docker
      run: |
        env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
    - name: Checkout PyTorch
      uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
      with:
        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
        # deep clone, to allow use of git merge-base
        fetch-depth: 0
        submodules: recursive
    - name: Clean PyTorch checkout
      run: |
        # Remove any artifacts from the previous checkouts
        git clean -fxd
    - name: Pull Docker image
      run: |
        retry () {
          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
        }
        retry docker pull "${DOCKER_IMAGE}"
    - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
      name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: |
          set -ex
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
    # Exports SHM_SIZE for the Test step: 1g by default, 2g for cuda builds,
    # 8g for rocm builds (matched against BUILD_ENVIRONMENT).
    - name: Determine shm-size
      run: |
        shm_size="1g"
        case "${BUILD_ENVIRONMENT}" in
          *cuda*)
            shm_size="2g"
            ;;
          *rocm*)
            shm_size="8g"
            ;;
        esac
        echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
    - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
      name: Download PyTorch Build Artifacts
      with:
        name: ${{ env.BUILD_ENVIRONMENT }}
    - name: Unzip artifacts
      run: |
        unzip -o artifacts.zip
    - name: Output disk space left
      run: |
        sudo df -H
    - name: Parse ref
      shell: bash
      id: parse-ref
      run: ./.github/scripts/parse_ref.py
    - name: Test
      env:
        PR_NUMBER: ${{ github.event.pull_request.number }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
      # Time out the test phase after 360 minutes
      timeout-minutes: 360
      run: |
        set -x
        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.jenkins/caffe2/test.sh
        else
          TEST_COMMAND=.jenkins/pytorch/test.sh
        fi
        PROXY_ENV=
        # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now
        # We should investigate whether or not there's a list of hostnames we can add to no_proxy to
        # make it so that we shouldn't have to fully disable squid for XLA tests
        if [[ $TEST_CONFIG != 'xla' ]]; then
          # shellcheck disable=SC2089
          PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
        fi
        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # No --name flag is passed: `docker run --detach` prints the new
        # container's ID, and that ID is what container_name ends up holding.
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
          -e GITHUB_ACTIONS \
          -e IN_CI \
          -e IS_GHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e JOB_BASE_NAME \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e PR_BODY \
          -e PYTORCH_RETRY_TEST_CASES \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          ${PROXY_ENV} \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --ulimit stack=10485760:83886080 \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
    - name: Chown workspace
      if: always()
      run: |
        # Ensure the working directory gets chowned back to the current user
        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Install render_test_results dependencies
      if: always()
      shell: bash
      run: |
        python3 -m pip install junitparser==2.1.1 rich==10.9.0
    - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
      if: always()
      shell: bash
      # Encoding is weird on windows, just try to default to utf-8 if possible
      env:
        PYTHONIOENCODING: "utf-8"
      run: |
        python3 tools/render_junit.py test/
    - name: Zip JSONs for upload
      if: always()
      env:
        FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu'
      run: |
        # Remove any previous test jsons if they exist
        rm -f test-jsons-*.zip
        zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
    - uses: seemethere/upload-artifact-s3@v3
      name: Store Test Downloaded JSONs on S3
      if: always()
      with:
        retention-days: 14
        if-no-files-found: warn
        path: test-jsons-*.zip
    - name: Zip test reports for upload
      if: always()
      env:
        FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu'
      run: |
        # Remove any previous test reports if they exist
        rm -f test-reports-*.zip
        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
    - uses: seemethere/upload-artifact-s3@v3
      name: Store Test Reports on S3
      if: always()
      with:
        retention-days: 14
        if-no-files-found: error
        path: test-reports-*.zip
    # Core dumps are only uploaded when an earlier step failed.
    - uses: seemethere/upload-artifact-s3@v3
      name: Store Core dumps on S3
      if: failure()
      with:
        name: coredumps-default-2
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*
    - name: Upload test statistics
      if: always()
      env:
        AWS_DEFAULT_REGION: us-east-1
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test
        PR_NUMBER: ${{ github.event.pull_request.number }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        TAG: ${{ steps.parse-ref.outputs.tag }}
        WORKFLOW_ID: '${{ github.run_id }}'
      shell: bash
      run: |
        python3 -m pip install -r requirements.txt
        python3 -m pip install boto3==1.19.12
        python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
    # Teardown: every step below runs even when an earlier step failed.
    - name: Hold runner for 2 hours or until ssh sessions have drained
      # Always hold for active ssh sessions
      if: always()
      run: .github/scripts/wait_for_ssh_to_drain.sh
    - name: Chown workspace
      if: always()
      run: |
        # Ensure the working directory gets chowned back to the current user
        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
    - name: Kill containers, clean up images
      if: always()
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Prune all of the docker images
        docker system prune -af