Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

radiation-cloud ML: workflow #2268

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
8 changes: 4 additions & 4 deletions external/vcm/vcm/catalog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,7 @@ sources:
- uflx_coarse
- vflx_coarse
args:
urlpath: "gs://vcm-ml-raw-flexible-retention/2023-05-22-PIRE-like-C3072-ccnorm-true-simulation/C3072-to-C48-diagnostics/gfsphysics_15min_coarse.zarr"
urlpath: "gs://vcm-ml-raw-flexible-retention/2023-06-29-PIRE-and-FV3GFS-like-C3072-ccnorm-true-simulation/C3072-to-C48-diagnostics/gfsphysics_15min_coarse.zarr"
consolidated: True

10day_c48_PIRE_ccnorm_physics_tendencies_may2023:
Expand Down Expand Up @@ -1047,7 +1047,7 @@ sources:
- tendency_of_specific_humidity_due_to_turbulence_coarse
- time_bnds
args:
urlpath: "gs://vcm-ml-raw-flexible-retention/2023-05-22-PIRE-like-C3072-ccnorm-true-simulation/C3072-to-C48-diagnostics/physics_tendencies.zarr"
urlpath: "gs://vcm-ml-raw-flexible-retention/2023-06-29-PIRE-and-FV3GFS-like-C3072-ccnorm-true-simulation/C3072-to-C48-diagnostics/physics_tendencies.zarr"
consolidated: True

10day_c48_PIRE_ccnorm_additional_tendencies_may2023:
Expand Down Expand Up @@ -1076,7 +1076,7 @@ sources:
- t_dt_phys_coarse
- time_bnds
args:
urlpath: "gs://vcm-ml-raw-flexible-retention/2023-05-22-PIRE-like-C3072-ccnorm-true-simulation/C3072-to-C48-diagnostics/atmos_15min_coarse_ave.zarr"
urlpath: "gs://vcm-ml-raw-flexible-retention/2023-06-29-PIRE-and-FV3GFS-like-C3072-ccnorm-true-simulation/C3072-to-C48-diagnostics/atmos_15min_coarse_ave.zarr"
consolidated: True

10day_c48_PIRE_ccnorm_restarts_as_zarr_may2023:
Expand Down Expand Up @@ -1148,7 +1148,7 @@ sources:
- vtype
- zorl
args:
urlpath: "gs://vcm-ml-intermediate/2023-05-22-PIRE-like-C3072-ccnorm-true-simulation-restarts.zarr"
urlpath: "gs://vcm-ml-intermediate/2023-06-30-PIRE-like-C3072-ccnorm-true-simulation-restarts.zarr"
consolidated: True

# Regression testing data. Do not use for analysis, values are randomized.
Expand Down
94 changes: 94 additions & 0 deletions projects/cloud_ml/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Data-preparation targets. Neither one depends on `deploy`.
.PHONY: surface_reference_zarr fine_restarts_to_zarr

# Run scripts/surface_reference_zarr.py from inside the scripts directory.
surface_reference_zarr:
	cd scripts && python ./surface_reference_zarr.py

# Apply the pod manifest in argo/fine-restarts-to-zarr with kubectl.
fine_restarts_to_zarr:
	cd argo/fine-restarts-to-zarr && kubectl apply -f pod.yaml

# Start the nudge-to-fine baseline run through argo/prognostic-run/run.sh once
# the `deploy` prerequisite has completed.
# NOTE(review): the trailing 39 is presumably a segment count - confirm in run.sh.
nudge_to_fine_baseline_run: deploy
	cd argo/prognostic-run && ./run.sh nudge-to-fine-baseline 39
.PHONY: nudge_to_fine_baseline_run

# Extend an existing prognostic run via argo/prognostic-run/extend.sh.
#
# Usage: make extend_prognostic_run URL=<url> SEGMENTS=<n>
#
# URL and SEGMENTS are forwarded verbatim as the two positional arguments of
# extend.sh. Both are mandatory, so they are validated while the Makefile is
# parsed: a forgotten variable then aborts before the `deploy` prerequisite
# runs, instead of invoking extend.sh with missing arguments afterwards.
# The check only fires when this target was requested on the command line.
ifneq ($(filter extend_prognostic_run,$(MAKECMDGOALS)),)
ifeq ($(strip $(URL)),)
$(error URL is required: make extend_prognostic_run URL=<url> SEGMENTS=<n>)
endif
ifeq ($(strip $(SEGMENTS)),)
$(error SEGMENTS is required: make extend_prognostic_run URL=<url> SEGMENTS=<n>)
endif
endif

.PHONY: extend_prognostic_run
extend_prognostic_run: deploy
	cd argo/prognostic-run && ./extend.sh $(URL) $(SEGMENTS)

# Run scripts/training_data_zarr.py with training_data_zarr_config.yaml; both
# paths are resolved relative to the scripts directory.
training_data_zarr:
	cd scripts && python ./training_data_zarr.py training_data_zarr_config.yaml
.PHONY: training_data_zarr

# Prescribed-cloud prognostic runs. Every target requires `deploy` and calls
# argo/prognostic-run/run.sh with a configuration name followed by 39.
# NOTE(review): 39 is presumably the number of segments - confirm in run.sh.
.PHONY: prescribed_cloud_cc_decorr_run \
	prescribed_cloud_cc_max_random_run \
	prescribed_cloud_cc_random_run \
	prescribed_cloud_decorr_run \
	prescribed_cloud_max_random_run \
	prescribed_cloud_random_run

# Configurations whose names carry the "cc" prefix.
prescribed_cloud_cc_decorr_run: deploy
	cd argo/prognostic-run && ./run.sh prescribed-cloud-cc-decorr 39

prescribed_cloud_cc_max_random_run: deploy
	cd argo/prognostic-run && ./run.sh prescribed-cloud-cc-max-random 39

prescribed_cloud_cc_random_run: deploy
	cd argo/prognostic-run && ./run.sh prescribed-cloud-cc-random 39

# The same three variants without the "cc" prefix.
prescribed_cloud_decorr_run: deploy
	cd argo/prognostic-run && ./run.sh prescribed-cloud-decorr 39

prescribed_cloud_max_random_run: deploy
	cd argo/prognostic-run && ./run.sh prescribed-cloud-max-random 39

prescribed_cloud_random_run: deploy
	cd argo/prognostic-run && ./run.sh prescribed-cloud-random 39

# Training targets. Both require `deploy` and call
# argo/training/run_random_seed.sh with a configuration name followed by 4.
# NOTE(review): 4 is presumably the number of random seeds - confirm in the script.
.PHONY: train_cloud_ml_dense_seed train_cloud_ml_dense_nocoarsephys_seed

train_cloud_ml_dense_seed: deploy
	cd argo/training && ./run_random_seed.sh dense 4

train_cloud_ml_dense_nocoarsephys_seed: deploy
	cd argo/training && ./run_random_seed.sh dense-nocoarsephys 4

# Upload targets. Both run scripts/upload-squashed-models/upload.sh with the
# argument 4 and a gs:// location; neither depends on `deploy`.
.PHONY: upload_squashed_models upload_squashed_models_nocoarsephys

# Two-argument form of upload.sh.
upload_squashed_models:
	cd scripts/upload-squashed-models && ./upload.sh \
	4 gs://vcm-ml-experiments/cloud-ml/2023-07-04/train-cloud-ml-dense

# Four-argument form: additionally passes the two nocoarsephys YAML files.
upload_squashed_models_nocoarsephys:
	cd scripts/upload-squashed-models && ./upload.sh \
	4 gs://vcm-ml-experiments/cloud-ml/2023-12-06/train-cloud-ml-dense-nocoarsephys \
	squash_thresholds_nocoarsephys.yaml \
	squashed_output_model_nocoarsephys.yaml

# Prognostic runs driven by argo/prognostic-run/run_seed_squash_threshold.sh.
# Both require `deploy` and pass a configuration name followed by 4 and 39.
# NOTE(review): 4 and 39 presumably mean seeds and segments - confirm in the script.
.PHONY: prescribed_cloud_dense_seed_squash_threshold \
	prescribed_cloud_dense_nocoarsephys_seed_squash_threshold

prescribed_cloud_dense_seed_squash_threshold: deploy
	cd argo/prognostic-run && ./run_seed_squash_threshold.sh prescribed-cloud-dense 4 39

# Same script, with squash_thresholds_nocoarsephys.yaml as a fourth argument.
prescribed_cloud_dense_nocoarsephys_seed_squash_threshold: deploy
	cd argo/prognostic-run && ./run_seed_squash_threshold.sh \
	prescribed-cloud-dense-ncp 4 39 squash_thresholds_nocoarsephys.yaml

# Offline-prediction targets. Each runs scripts/offline_predictions_zarr.py
# from inside the scripts directory with a different configuration file;
# none of them depends on `deploy`.
.PHONY: offline_cloud_predictions_zarr \
	offline_cloud_predictions_squashed_zarr \
	offline_cloud_predictions_nocoarsephys_zarr \
	offline_cloud_predictions_nocoarsephys_squashed_zarr

offline_cloud_predictions_zarr:
	cd scripts && python ./offline_predictions_zarr.py offline_predictions_config.yaml

offline_cloud_predictions_squashed_zarr:
	cd scripts && python ./offline_predictions_zarr.py offline_predictions_squashed_config.yaml

offline_cloud_predictions_nocoarsephys_zarr:
	cd scripts && python ./offline_predictions_zarr.py offline_predictions_nocoarsephys_config.yaml

offline_cloud_predictions_nocoarsephys_squashed_zarr:
	cd scripts && python ./offline_predictions_zarr.py offline_predictions_nocoarsephys_squashed_config.yaml

# Render the kustomization in argo/ with the local ./kustomize binary and pipe
# the result into `kubectl apply`. Prerequisite of the cluster run targets.
deploy: argo/kustomize
	cd argo && ./kustomize build . | kubectl apply -f -
.PHONY: deploy

# File target (deliberately not phony): the recipe only runs when
# argo/kustomize does not exist yet.
# NOTE(review): install_kustomize.sh presumably places kustomize 3.10.0 at
# argo/kustomize - confirm in the script.
argo/kustomize:
	cd argo && ./install_kustomize.sh 3.10.0
24 changes: 0 additions & 24 deletions projects/cloud_ml/argo/Makefile

This file was deleted.

51 changes: 51 additions & 0 deletions projects/cloud_ml/argo/fine-restarts-to-zarr/pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# One-shot Pod that submits the fv3net.pipelines.restarts_to_zarr pipeline
# through dataflow.sh. It is never restarted once the container exits.
apiVersion: v1
kind: Pod
metadata:
  name: fine-restarts-to-zarr
spec:
  restartPolicy: Never
  # Permit scheduling onto nodes tainted dedicated=med-sim-pool:NoSchedule.
  tolerations:
    - key: dedicated
      value: med-sim-pool
      effect: NoSchedule
  # Service-account key, sourced from the `gcp-key` secret.
  volumes:
    - name: gcp-credentials-user-gcp-sa
      secret:
        secretName: gcp-key
  containers:
    - name: main
      image: us.gcr.io/vcm-ml/fv3net:ae9b07271ae0133cc7099aacaf5081db77455739
      workingDir: /home/jovyan/fv3net/workflows/dataflow
      # `bash -x -c <script>`: -x traces each command as it is executed.
      command:
        - bash
        - "-x"
        - "-c"
      args:
        # Positional arguments are the input restart location and the output
        # zarr store. The job name is the pod name plus a random hex suffix.
        - |
          ./dataflow.sh submit -m fv3net.pipelines.restarts_to_zarr \
            gs://vcm-ml-raw-flexible-retention/2023-06-29-PIRE-and-FV3GFS-like-C3072-ccnorm-true-simulation/C3072-to-C48-restarts-re-upload \
            gs://vcm-ml-intermediate/2023-06-30-PIRE-like-C3072-ccnorm-true-simulation-restarts.zarr \
            --no-coarse-suffix \
            --job_name $POD_NAME-$(openssl rand -hex 6) \
            --project vcm-ml \
            --region us-central1 \
            --runner DataFlow \
            --temp_location gs://vcm-ml-scratch/tmp_dataflow \
            --num_workers 64 \
            --autoscaling_algorithm=NONE
      env:
        # Both credential variables point at the key file mounted below.
        - name: GOOGLE_APPLICATION_CREDENTIALS
          value: /secret/gcp-credentials/key.json
        - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE
          value: /secret/gcp-credentials/key.json
        # Downward API: exposes this pod's own name to the script above.
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
      volumeMounts:
        - name: gcp-credentials-user-gcp-sa
          mountPath: /secret/gcp-credentials
      resources:
        requests:
          cpu: "1000m"
          memory: 1Gi
        limits:
          cpu: "1000m"
          memory: 2Gi
8 changes: 5 additions & 3 deletions projects/cloud_ml/argo/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ resources:
kind: Kustomization
images:
- name: us.gcr.io/vcm-ml/fv3net
newTag: 762dfc1e945720f59f5b9e9ab68e10e82a0594a9
newTag: ae9b07271ae0133cc7099aacaf5081db77455739
- name: us.gcr.io/vcm-ml/fv3fit
newTag: ae9b07271ae0133cc7099aacaf5081db77455739
- name: us.gcr.io/vcm-ml/post_process_run
newTag: 762dfc1e945720f59f5b9e9ab68e10e82a0594a9
newTag: ae9b07271ae0133cc7099aacaf5081db77455739
- name: us.gcr.io/vcm-ml/prognostic_run
newTag: 762dfc1e945720f59f5b9e9ab68e10e82a0594a9
newTag: ae9b07271ae0133cc7099aacaf5081db77455739
13 changes: 0 additions & 13 deletions projects/cloud_ml/argo/nudge-to-fine-run/extend.sh

This file was deleted.

Loading