Skip to content

Commit

Permalink
[Feat] Improve TPU CI (#6078)
Browse files Browse the repository at this point in the history
* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* i

* update

* update ci

* i

* i

* i

* i
  • Loading branch information
tchaton authored Jul 19, 2021
1 parent 4bc3d70 commit 8d0df6f
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 17 deletions.
44 changes: 30 additions & 14 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ orbs:
go: circleci/[email protected]
codecov: codecov/[email protected]

# Workflow Steps:
# 1. Checkout
# 2. Install Go
# 3. Checkout ml-testing-accelerators
# 4. GCP GKE install
# 5. Update Kubeconfig with credentials
# 6. Install jsonnet
# 7. Update jsonnet
# 8. Deploy the job on the kubernetes cluster
# 9. Statistics
# 10. Upload coverage results
# 11. Upload coverage to Codecov

references:

make_docs: &make_docs
Expand Down Expand Up @@ -33,25 +46,28 @@ references:
git checkout stable
cd ..
build_push_docker: &build_push_docker
# Reusable step (anchor `install_jsonnet`): fetches the go-jsonnet `jsonnet`
# command with `go get` so later steps can render the .jsonnet test definition.
install_jsonnet: &install_jsonnet
run:
name: Install jsonnet
command: |
go get github.com/google/go-jsonnet/cmd/jsonnet
update_jsonnet: &update_jsonnet
run:
name: Build and push Docker image
environment:
- PYTHON_VER: 3.7
name: Update jsonnet
command: |
gcloud --quiet auth configure-docker
#cd dockers/tpu-tests
docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=$PYTHON_VER" --build-arg "PYTORCH_VERSION=$XLA_VER" .
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f <(git rev-parse HEAD) | awk -F'/' '{print $3}')
export SHA=$(git rev-parse --short HEAD)
python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
cat dockers/tpu-tests/tpu_test_cases.jsonnet
deploy_cluster: &deploy_cluster
run:
name: Deploy the job on the kubernetes cluster
command: |
go get github.com/google/go-jsonnet/cmd/jsonnet
export PATH=$PATH:$HOME/go/bin
python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; fff = open(fname).read().replace('pytorch-VERSION', 'pytorch-$XLA_VER') ; open(fname, 'w').write(fff)"
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -)
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
job_name=${job_name#job.batch/}
job_name=${job_name% created}
echo "Waiting on kubernetes job: $job_name"
Expand All @@ -72,7 +88,6 @@ references:
# First portion is the test logs. Print these to the CircleCI job stdout.
cat xx00 && \
echo "Done with log retrieval attempt." && \
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
exit $status_code
stats: &stats
Expand All @@ -92,6 +107,7 @@ jobs:
- image: circleci/python:3.7
environment:
- XLA_VER: 1.8
- PYTHON_VER: 3.7
- MAX_CHECKS: 240
- CHECK_SPEEP: 5
steps:
Expand All @@ -102,8 +118,8 @@ jobs:
- gcp-gke/update-kubeconfig-with-credentials:
cluster: $GKE_CLUSTER
perform-login: true
- setup_remote_docker
- *build_push_docker
- *install_jsonnet
- *update_jsonnet
- *deploy_cluster
- *stats
- codecov/upload:
Expand Down
17 changes: 14 additions & 3 deletions dockers/tpu-tests/tpu_test_cases.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,28 @@ local tputests = base.BaseTest {

timeout: 900, # 15 minutes, in seconds.

image: std.extVar('image'),
imageTag: std.extVar('image-tag'),
image: 'pytorchlightning/pytorch_lightning',
imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',

tpuSettings+: {
softwareVersion: 'pytorch-VERSION',
softwareVersion: 'pytorch-{PYTORCH_VERSION}',
},
accelerator: tpus.v3_8,

command: utils.scriptCommand(
|||
source ~/.bashrc
conda activate lightning
mkdir -p /home/runner/work/pytorch-lightning && cd /home/runner/work/pytorch-lightning
git clone https://github.com/PyTorchLightning/pytorch-lightning.git
cd pytorch-lightning
echo $PWD
git ls-remote --refs origin
git fetch origin "refs/pull/{PR_NUMBER}/head:pr/{PR_NUMBER}" && git checkout "pr/{PR_NUMBER}"
git checkout {SHA}
pip install -e .
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
coverage run --source=pytorch_lightning -m pytest -v --capture=no \
tests/profiler/test_xla_profiler.py \
pytorch_lightning/utilities/xla_device.py \
Expand Down

0 comments on commit 8d0df6f

Please sign in to comment.