#!/bin/bash
# Copyright 2019 Google Inc. All Rights Reserved. Licensed under the Apache
# License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
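#
# Runs the molecules sample end to end on Google Cloud: data extraction,
# Dataflow preprocessing, AI Platform training, and batch prediction.
#
# Example invocation (the bucket path is a placeholder; use your own):
#   ./run-cloud \
#     --work-dir gs://your-bucket/cloudml-samples/molecules \
#     --max-data-files 5 \
#     --project your-project-id \
#     --region us-central1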
set -e
# Parse command line arguments
unset WORK_DIR
MAX_DATA_FILES=5
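# Default to the project configured in gcloud; if that lookup fails, fall back
# to a PROJECT environment variable (if one is set).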
PROJECT=$(gcloud config get-value project || echo $PROJECT)
REGION=us-central1
while [[ $# -gt 0 ]]; do
  case $1 in
    --work-dir)
      WORK_DIR=$2
      shift
      ;;
    --max-data-files)
      MAX_DATA_FILES=$2
      shift
      ;;
    --project)
      PROJECT=$2
      shift
      ;;
    --region)
      REGION=$2
      shift
      ;;
    *)
      echo "error: unrecognized argument $1"
      exit 1
      ;;
  esac
  shift
done
if [[ -z $WORK_DIR ]]; then
  echo "error: argument --work-dir is required"
  exit 1
fi

if [[ $WORK_DIR != gs://* ]]; then
  echo "error: --work-dir must be a Google Cloud Storage path"
  echo "  example: gs://your-bucket/cloudml-samples/molecules"
  exit 1
fi

if [[ -z $PROJECT ]]; then
  echo 'error: --project is required to run in Google Cloud Platform.'
  exit 1
fi
# Wrapper function to print the command being run
function run {
  echo "$ $@"
  "$@"
}
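# e.g. "run gsutil ls" first prints "$ gsutil ls" and then executes it.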
# Extract the data files
echo '>> Extracting data'
run python data-extractor.py \
  --work-dir $WORK_DIR \
  --max-data-files $MAX_DATA_FILES
echo ''
# Preprocess the datasets using Apache Beam's DataflowRunner
echo '>> Preprocessing'
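# --temp_location holds Beam's temporary/staging files; --setup_file lets the
# Dataflow workers install this sample's dependencies.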
run python preprocess.py \
  --project $PROJECT \
  --runner DataflowRunner \
  --temp_location $WORK_DIR/beam-temp \
  --setup_file ./setup.py \
  --work-dir $WORK_DIR
echo ''
# Train and evaluate the model in AI Platform
echo '>> Training'
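# AI Platform job IDs must be unique within a project, so suffix the job name
# with a timestamp.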
JOB="cloudml_samples_molecules_$(date +%Y%m%d_%H%M%S)"
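# The staging bucket is the bucket portion of the work dir, e.g. gs://your-bucket.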
BUCKET=$(echo $WORK_DIR | egrep -o gs://[^/]+)
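# --stream-logs keeps this script attached until the training job finishes,
# so the exported model exists before the prediction steps below run.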
run gcloud ai-platform jobs submit training $JOB \
  --module-name trainer.task \
  --package-path trainer \
  --staging-bucket $BUCKET \
  --runtime-version 1.13 \
  --region $REGION \
  --stream-logs \
  -- \
  --work-dir $WORK_DIR
echo ''
# Get the model path
EXPORT_DIR=$WORK_DIR/model/export/final
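# Pick the most recent export: assuming exports land in timestamped
# subdirectories, the reverse lexicographic sort puts the newest one first.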
MODEL_DIR=$(gsutil ls -d "$EXPORT_DIR/*" | sort -r | head -n 1)
echo "Model: $MODEL_DIR"
echo ''
# Make batch predictions on SDF files
echo '>> Batch prediction'
run python predict.py \
  --work-dir $WORK_DIR \
  --model-dir $MODEL_DIR \
  batch \
  --project $PROJECT \
  --runner DataflowRunner \
  --temp_location $WORK_DIR/beam-temp \
  --setup_file ./setup.py \
  --inputs-dir $WORK_DIR/data \
  --outputs-dir $WORK_DIR/predictions
# Display some predictions
gsutil cat $WORK_DIR/predictions/* | head -n 10