# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: periodic-linux-bionic-cuda11.5-py3.7-gcc7
on:
  push:
    tags:
      - 'ciflow/all/*'
      - 'ciflow/cuda/*'
      - 'ciflow/linux/*'
      - 'ciflow/scheduled/*'
  schedule:
    - cron: 45 4,10,16,22 * * *
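    # Scheduled runs fire at minute 45 of hours 4, 10, 16, and 22 UTC, i.e. every
    # six hours; the ciflow/* tag pushes above and workflow_dispatch below let the
    # same workflow be triggered on demand.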
  workflow_dispatch:
env:
  BUILD_ENVIRONMENT: periodic-linux-bionic-cuda11.5-py3.7-gcc7
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1
  IS_GHA: 1
  # This is used for the phase of adding wheel tests only, will be removed once completed
  IN_WHEEL_TEST: 1
  # Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  AWS_DEFAULT_REGION: us-east-1
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  PYTORCH_RETRY_TEST_CASES: 1
concurrency:
  group: periodic-linux-bionic-cuda11.5-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
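# The concurrency group keys on the PR number (or the commit SHA for non-PR events),
# so a newer run for the same PR cancels the in-progress one; the trailing
# workflow_dispatch flag effectively keeps manual runs in a separate group from
# scheduled and tag-triggered runs.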
jobs:
  build:
    runs-on: linux.2xlarge
    timeout-minutes: 240
    env:
      JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-build
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: print labels
        run: echo "${PR_LABELS}"
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: Start docker if docker daemon is not running
        run: |
          if systemctl is-active --quiet docker; then
            echo "Docker daemon is running...";
          else
            echo "Starting docker daemon..." && sudo systemctl start docker;
          fi
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
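      # The inline retry() helper used above makes up to three attempts with 1s and
      # 2s back-off before giving up, e.g. (sketch, copied from the steps above):
      #   retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@"); }
      #   retry docker pull "${ALPINE_IMAGE}"
      # The ECR login and the transient docker pulls are wrapped in it.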
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "::set-output name=docker_tag::${DOCKER_TAG}"
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
      - name: Check if image should be built
        id: check
        env:
          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
          set -x
          # Check if image already exists, if it does then skip building it
          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
            exit 0
          fi
          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
            # if we're on the base branch then use the parent commit
            MERGE_BASE=$(git rev-parse HEAD~)
          else
            # otherwise we're on a PR, so use the most recent base commit
            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
          fi
          # Covers the case where a previous tag doesn't exist for the tree
          # this is only really applicable on trees that don't have `.circleci/docker` at their merge base, i.e. nightly
          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
            exit 1
          fi
          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
          # If no image exists but the hash is the same as the previous hash then we should error out here
          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
            echo "       contact the PyTorch team to restore the original images"
            exit 1
          fi
          echo ::set-output name=rebuild::yes
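      # `rebuild` is only emitted when no usable image exists; the next step's
      # `if: ${{ steps.check.outputs.rebuild }}` gates the docker build on it.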
      - name: Build and push docker image
        if: ${{ steps.check.outputs.rebuild }}
        env:
          DOCKER_SKIP_S3_UPLOAD: 1
        working-directory: .circleci/docker
        run: |
          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
          ./build_docker.sh
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Parse ref
        shell: bash
        id: parse-ref
        run: ./.github/scripts/parse_ref.py
      - name: Build
        env:
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e JOB_BASE_NAME \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e IS_GHA \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e GITHUB_RUN_ID \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
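      # The build runs inside the CI image: host env vars are forwarded with -e,
      # the preserved GITHUB_* variables come in via --env-file, and the workspace
      # is bind-mounted at /var/lib/jenkins/workspace before .jenkins/pytorch/build.sh runs.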
      - name: Display and upload binary build size statistics (Click Me)
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        run: |
          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
          export COMMIT_TIME
          pip3 install requests==2.26 boto3==1.16.34
          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Archive artifacts into zip
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
      - uses: seemethere/upload-artifact-s3@v3
        name: Store PyTorch Build Artifacts on S3
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
          retention-days: 14
          if-no-files-found: error
          path:
            artifacts.zip
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af
  test_distributed_1_1:
    name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)
    needs: build
    runs-on: linux.8xlarge.nvidia.gpu
    timeout-minutes: 270
    env:
      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
      JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test
      TEST_CONFIG: distributed
      SHARD_NUMBER: 1
      NUM_TEST_SHARDS: 1
      PR_BODY: ${{ github.event.pull_request.body }}
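    # The display name encodes (TEST_CONFIG, SHARD_NUMBER, NUM_TEST_SHARDS, runner):
    # this job runs the whole `distributed` config as a single shard, while the two
    # `default` jobs below split the default test config across two shards.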
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: Start docker if docker daemon is not running
        run: |
          if systemctl is-active --quiet docker; then
            echo "Docker daemon is running...";
          else
            echo "Starting docker daemon..." && sudo systemctl start docker;
          fi
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
        name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: |
            set -ex
            bash .github/scripts/install_nvidia_utils_linux.sh
            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          unzip -o artifacts.zip
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Parse ref
        shell: bash
        id: parse-ref
        run: ./.github/scripts/parse_ref.py
      - name: Test
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        # Time out the test phase after 240 minutes
        timeout-minutes: 240
        run: |
          set -x
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          PROXY_ENV=
          # NOTE: XLA multiprocessing tests appear to have issues with the squid proxy, so the proxy is disabled for them for now.
          # We should investigate whether there's a list of hostnames we can add to no_proxy
          # so that we don't have to fully disable squid for the XLA tests.
          if [[ $TEST_CONFIG != 'xla' ]]; then
            # shellcheck disable=SC2089
            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # The shellcheck waivers below exist because GPU_FLAG and PROXY_ENV must stay unquoted so they expand into separate docker run arguments
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e GITHUB_ACTIONS \
            -e IN_CI \
            -e IS_GHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e JOB_BASE_NAME \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            ${PROXY_ENV} \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
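      # The test script is picked from TEST_CONFIG/BUILD_ENVIRONMENT (here the
      # `distributed` config falls through to .jenkins/pytorch/test.sh), and the
      # wheel produced by the `build` job is installed from dist/ before tests run.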
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Install render_test_results dependencies
        if: always()
        shell: bash
        run: |
          python3 -m pip install junitparser==2.1.1 rich==10.9.0
      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
        if: always()
        shell: bash
        # Encoding is weird on Windows, so try to default to utf-8 if possible
        env:
          PYTHONIOENCODING: "utf-8"
        run: |
          python3 tools/render_junit.py test/
      - name: Zip JSONs for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu'
        run: |
          # Remove any previous test jsons if they exist
          rm -f test-jsons-*.zip
          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Downloaded JSONs on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: warn
          path:
            test-jsons-*.zip
      - name: Zip test reports for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu'
        run: |
          # Remove any previous test reports if they exist
          rm -f test-reports-*.zip
          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Reports on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: error
          path:
            test-reports-*.zip
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Core dumps on S3
        if: failure()
        with:
          name: coredumps-distributed-1
          retention-days: 14
          if-no-files-found: ignore
          path:
            ./**/core.[1-9]*
      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        shell: bash
        run: |
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
  test_default_1_2:
    name: test (default, 1, 2, linux.4xlarge.nvidia.gpu)
    needs: build
    runs-on: linux.4xlarge.nvidia.gpu
    timeout-minutes: 270
    env:
      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
      JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test
      TEST_CONFIG: default
      SHARD_NUMBER: 1
      NUM_TEST_SHARDS: 2
      PR_BODY: ${{ github.event.pull_request.body }}
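    # Shard 1 of 2 for the `default` test config; test_default_2_2 below is the
    # identical job with SHARD_NUMBER: 2, and both reuse the image and artifacts
    # produced by the `build` job.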
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: Start docker if docker daemon is not running
        run: |
          if systemctl is-active --quiet docker; then
            echo "Docker daemon is running...";
          else
            echo "Starting docker daemon..." && sudo systemctl start docker;
          fi
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
        name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: |
            set -ex
            bash .github/scripts/install_nvidia_utils_linux.sh
            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          unzip -o artifacts.zip
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Parse ref
        shell: bash
        id: parse-ref
        run: ./.github/scripts/parse_ref.py
      - name: Test
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        # Time out the test phase after 240 minutes
        timeout-minutes: 240
        run: |
          set -x
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          PROXY_ENV=
          # NOTE: XLA multiprocessing tests appear to have issues with the squid proxy, so the proxy is disabled for them for now.
          # We should investigate whether there's a list of hostnames we can add to no_proxy
          # so that we don't have to fully disable squid for the XLA tests.
          if [[ $TEST_CONFIG != 'xla' ]]; then
            # shellcheck disable=SC2089
            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # The shellcheck waivers below exist because GPU_FLAG and PROXY_ENV must stay unquoted so they expand into separate docker run arguments
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e GITHUB_ACTIONS \
            -e IN_CI \
            -e IS_GHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e JOB_BASE_NAME \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            ${PROXY_ENV} \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Install render_test_results dependencies
        if: always()
        shell: bash
        run: |
          python3 -m pip install junitparser==2.1.1 rich==10.9.0
      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
        if: always()
        shell: bash
        # Encoding is weird on Windows, so try to default to utf-8 if possible
        env:
          PYTHONIOENCODING: "utf-8"
        run: |
          python3 tools/render_junit.py test/
      - name: Zip JSONs for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu'
        run: |
          # Remove any previous test jsons if they exist
          rm -f test-jsons-*.zip
          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Downloaded JSONs on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: warn
          path:
            test-jsons-*.zip
      - name: Zip test reports for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu'
        run: |
          # Remove any previous test reports if they exist
          rm -f test-reports-*.zip
          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Reports on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: error
          path:
            test-reports-*.zip
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Core dumps on S3
        if: failure()
        with:
          name: coredumps-default-1
          retention-days: 14
          if-no-files-found: ignore
          path:
            ./**/core.[1-9]*
      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        shell: bash
        run: |
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
  test_default_2_2:
    name: test (default, 2, 2, linux.4xlarge.nvidia.gpu)
    needs: build
    runs-on: linux.4xlarge.nvidia.gpu
    timeout-minutes: 270
    env:
      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
      JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test
      TEST_CONFIG: default
      SHARD_NUMBER: 2
      NUM_TEST_SHARDS: 2
      PR_BODY: ${{ github.event.pull_request.body }}
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: Start docker if docker daemon is not running
        run: |
          if systemctl is-active --quiet docker; then
            echo "Docker daemon is running...";
          else
            echo "Starting docker daemon..." && sudo systemctl start docker;
          fi
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
        name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: |
            set -ex
            bash .github/scripts/install_nvidia_utils_linux.sh
            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          unzip -o artifacts.zip
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Parse ref
        shell: bash
        id: parse-ref
        run: ./.github/scripts/parse_ref.py
      - name: Test
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        # Time out the test phase after 240 minutes
        timeout-minutes: 240
        run: |
          set -x
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          PROXY_ENV=
          # NOTE: XLA multiprocessing tests appear to have issues with the squid proxy, so the proxy is disabled for them for now.
          # We should investigate whether there's a list of hostnames we can add to no_proxy
          # so that we don't have to fully disable squid for the XLA tests.
          if [[ $TEST_CONFIG != 'xla' ]]; then
            # shellcheck disable=SC2089
            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # The shellcheck waivers below exist because GPU_FLAG and PROXY_ENV must stay unquoted so they expand into separate docker run arguments
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e GITHUB_ACTIONS \
            -e IN_CI \
            -e IS_GHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e JOB_BASE_NAME \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            ${PROXY_ENV} \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Install render_test_results dependencies
        if: always()
        shell: bash
        run: |
          python3 -m pip install junitparser==2.1.1 rich==10.9.0
      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
        if: always()
        shell: bash
        # Encoding is weird on Windows, so try to default to utf-8 if possible
        env:
          PYTHONIOENCODING: "utf-8"
        run: |
          python3 tools/render_junit.py test/
      - name: Zip JSONs for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu'
        run: |
          # Remove any previous test jsons if they exist
          rm -f test-jsons-*.zip
          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Downloaded JSONs on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: warn
          path:
            test-jsons-*.zip
      - name: Zip test reports for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu'
        run: |
          # Remove any previous test reports if they exist
          rm -f test-reports-*.zip
          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Reports on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: error
          path:
            test-reports-*.zip
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Core dumps on S3
        if: failure()
        with:
          name: coredumps-default-2
          retention-days: 14
          if-no-files-found: ignore
          path:
            ./**/core.[1-9]*
      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        shell: bash
        run: |
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af