Skip to content

Commit

Permalink
Feature/custom dataflow container (#707)
Browse files Browse the repository at this point in the history
* build on the beam sdk base image, not gcloud sdk
* add requirements.txt and beam=2.34.0
* fix gcp-variant-transforms image to use python 3.8
* update cloudbuild for custom runner container
* adding documentation for the custom runner image
* use --sdk_container_image vs --custom_runner_image
* pysam MUST be less than 0.16.0 (not equal)
* specify the runner image in integration tests

Co-authored-by: Lawrence, Andrew <[email protected]>
  • Loading branch information
lawrenae and aelawrence authored Jan 28, 2022
1 parent 95824d1 commit 37709f8
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 118 deletions.
9 changes: 9 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,14 @@ steps:
- '--tag=gcr.io/$PROJECT_ID/gcp-variant-transforms:${_CUSTOM_TAG_NAME}'
- '--file=docker/Dockerfile'
- '.'
id: 'build-gcp-variant-transforms-image'
- name: 'gcr.io/cloud-builders/docker'
args:
- 'build'
- '--tag=gcr.io/$PROJECT_ID/variant-transforms-custom-runner:${_CUSTOM_TAG_NAME}'
- '--file=docker/Dockerfile.custom_dataflow_container'
- '.'
id: 'build-variant-transforms-custom-runner-image'
images:
- 'gcr.io/$PROJECT_ID/gcp-variant-transforms:${_CUSTOM_TAG_NAME}'
- 'gcr.io/$PROJECT_ID/variant-transforms-custom-runner:${_CUSTOM_TAG_NAME}'
17 changes: 17 additions & 0 deletions cloudbuild_CI.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,28 @@ steps:
- '.'
id: 'build-gcp-variant-transforms-docker'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'build'
- '--build-arg'
- 'commit_sha=${COMMIT_SHA}'
- '--tag=gcr.io/${PROJECT_ID}/variant-transforms-custom-runner:${COMMIT_SHA}'
- '--file=docker/Dockerfile.custom_dataflow_container'
- '.'
id: 'build-variant-transforms-custom-runner-docker'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'push'
- 'gcr.io/${PROJECT_ID}/gcp-variant-transforms:${COMMIT_SHA}'
id: 'push-gcp-variant-transforms-docker'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'push'
- 'gcr.io/${PROJECT_ID}/variant-transforms-custom-runner:${COMMIT_SHA}'
id: 'push-variant-transforms-custom-runner-docker'

- name: 'gcr.io/${PROJECT_ID}/gcp-variant-transforms:${COMMIT_SHA}'
args:
- '--keep_image'
Expand All @@ -56,4 +72,5 @@ steps:
# - '--gs_dir bashir-variant_integration_test_runs'
images:
- 'gcr.io/${PROJECT_ID}/gcp-variant-transforms:${COMMIT_SHA}'
- 'gcr.io/${PROJECT_ID}/variant-transforms-custom-runner:${COMMIT_SHA}'
timeout: 270m
38 changes: 34 additions & 4 deletions cloudbuild_release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,60 @@ steps:
args:
- 'pull'
- 'gcr.io/${PROJECT_ID}/gcp-variant-transforms:${COMMIT_SHA}'
id: 'pull-image'
id: 'pull-gcp-variant-transforms-image'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'pull'
- 'gcr.io/${PROJECT_ID}/variant-transforms-custom-runner:${COMMIT_SHA}'
id: 'pull-variant-transforms-custom-runner-image'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'tag'
- 'gcr.io/${PROJECT_ID}/gcp-variant-transforms:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:${COMMIT_SHA}'
id: 'tag-image-commit-sha'
id: 'tag-gcp-variant-transforms-commit-sha'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'tag'
- 'gcr.io/${PROJECT_ID}/variant-transforms-custom-runner:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:${COMMIT_SHA}'
id: 'tag-variant-transforms-custom-runner-commit-sha'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'tag'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:${TAG_NAME}'
id: 'tag-image-release-version'
id: 'tag-gcp-variant-transforms-release-version'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'tag'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:${TAG_NAME}'
id: 'tag-variant-transforms-custom-runner-release-version'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'tag'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:latest'
id: 'tag-image-latest'
id: 'tag-gcp-variant-transforms-latest'

- name: 'gcr.io/cloud-builders/docker'
args:
- 'tag'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:latest'
id: 'tag-variant-transforms-custom-runner-latest'

images:
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:${TAG_NAME}'
- 'gcr.io/cloud-lifesciences/gcp-variant-transforms:latest'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:${COMMIT_SHA}'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:${TAG_NAME}'
- 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner:latest'
6 changes: 5 additions & 1 deletion deploy_and_run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ if [[ -z "${image_tag}" ]]; then
image_tag="test_${time_tag}"
fi
full_image_name="gcr.io/${project}/gcp-variant-transforms:${image_tag}"
full_sdk_container_image_name="gcr.io/${project}/variant-transforms-custom-runner:${image_tag}"

if [[ -z "${skip_build}" ]]; then
color_print "Building the Docker image with tag ${image_tag}" "${GREEN}"
Expand All @@ -239,7 +240,7 @@ trap clean_up EXIT

if [[ -n "${run_unit_tests}" ]]; then
python -m pip install --upgrade wheel
python -m pip install --upgrade .
python -m pip install -r /opt/gcp_variant_transforms/src/requirements.txt
python setup.py test
fi
python -m pip install --upgrade wheel
Expand All @@ -255,6 +256,7 @@ python gcp_variant_transforms/testing/integration/run_vcf_to_bq_tests.py \
--staging_location "gs://${gs_dir}/staging" \
--temp_location "gs://${gs_dir}/temp" \
--logging_location "gs://${gs_dir}/temp/logs" \
--sdk_container_image "${full_sdk_container_image_name}" \
--image "${full_image_name}" ${TEST_ARGUMENTS} &
pid_vcf_to_bq="$!"
if [[ -n "${run_preprocessor_tests}" ]]; then
Expand All @@ -264,6 +266,7 @@ if [[ -n "${run_preprocessor_tests}" ]]; then
--staging_location "gs://${gs_dir}/staging" \
--temp_location "gs://${gs_dir}/temp" \
--logging_location "gs://${gs_dir}/temp/logs" \
--sdk_container_image "${full_sdk_container_image_name}" \
--image "${full_image_name}" &
fi
# `pid_preprocess` could be the same as `pid_vcf_to_bq` if preprocessor tests
Expand All @@ -276,6 +279,7 @@ if [[ -n "${run_bq_to_vcf_tests}" ]]; then
--staging_location "gs://${gs_dir}/staging" \
--temp_location "gs://${gs_dir}/temp" \
--logging_location "gs://${gs_dir}/temp/logs" \
--sdk_container_image "${full_sdk_container_image_name}" \
--image "${full_image_name}" &
fi
pid_bq_to_vcf="$!"
Expand Down
28 changes: 16 additions & 12 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,29 @@
# To build a new docker image, run the following from the root source dir:
# $ docker build . -f docker/Dockerfile -t $IMAGE_NAME

FROM golang:latest

RUN go get -ldflags '-extldflags "-fno-PIC -static"' -buildmode pie -tags 'osusergo netgo static_build' github.com/googlegenomics/pipelines-tools/pipelines

FROM google/cloud-sdk:slim

COPY --from=0 /go/bin/pipelines /usr/bin
FROM golang:latest
RUN go install -ldflags '-extldflags "-fno-PIC -static"' -buildmode pie -tags 'osusergo netgo static_build' github.com/googlegenomics/pipelines-tools/pipelines@latest

FROM apache/beam_python3.8_sdk:2.34.0
ARG commit_sha
ENV COMMIT_SHA=${commit_sha}

RUN mkdir -p /opt/gcp_variant_transforms/bin && mkdir -p /opt/gcp_variant_transforms/src
ADD / /opt/gcp_variant_transforms/src/
COPY --from=0 /go/bin/pipelines /usr/bin

# Needed for installing mmh3 (one of the required packages in setup.py).
RUN apt install -y g++
# install gcloud sdk
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

# g++ Needed for installing mmh3 (one of the required packages in setup.py).
# Install Pysam dependencies. These dependencies are only required because we
# have a monolithic binary - they primarily have to be installed on the workers.
RUN apt-get update && apt-get install -y \
apt-transport-https \
autoconf \
automake \
g++ \
gcc \
gnupg \
google-cloud-sdk \
libbz2-dev \
libcurl4-openssl-dev \
liblzma-dev \
Expand All @@ -48,14 +48,18 @@ RUN apt-get update && apt-get install -y \
python3-pip \
python3-venv


RUN mkdir -p /opt/gcp_variant_transforms/bin && mkdir -p /opt/gcp_variant_transforms/src
ADD / /opt/gcp_variant_transforms/src/

# Install dependencies.
RUN python3 -m venv /opt/gcp_variant_transforms/venv3 && \
sed -i 's/$1/${1:-}/' /opt/gcp_variant_transforms/venv3/bin/activate && \
. /opt/gcp_variant_transforms/venv3/bin/activate && \
cd /opt/gcp_variant_transforms/src && \
python3 -m pip install --upgrade pip && \
python3 -m pip install --upgrade wheel && \
python3 -m pip install --upgrade .
python3 -m pip install --upgrade -r requirements.txt

RUN printf '#!/bin/bash\n%s\n%s' \
". /opt/gcp_variant_transforms/venv3/bin/activate && cd /opt/gcp_variant_transforms/src" \
Expand Down
38 changes: 38 additions & 0 deletions docker/Dockerfile.custom_dataflow_container
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# To build a new docker image, run the following from the root source dir:
# $ docker build . -f docker/Dockerfile.custom_dataflow_container -t $IMAGE_NAME

# https://cloud.google.com/dataflow/docs/guides/using-custom-containers#python


FROM apache/beam_python3.8_sdk:2.34.0

# System packages: the same build toolchain and compression/TLS headers that
# docker/Dockerfile installs as its "Pysam dependencies", plus Debian's
# python3-pysam package.
# NOTE(review): python3-pysam is built for the distro's python3 -- confirm it
# is actually needed alongside the pip-installed requirements below.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    autoconf \
    automake \
    gcc \
    libbz2-dev \
    libcurl4-openssl-dev \
    liblzma-dev \
    libssl-dev \
    make \
    perl \
    zlib1g-dev \
    python3-pysam \
    && rm -rf /var/lib/apt/lists/*

# COPY is preferred over ADD for a plain local file: no archive extraction or
# URL fetching is wanted here.
COPY requirements.txt /requirements.txt

# Pre-install the Python dependencies into the worker image; --no-cache-dir
# keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir -r /requirements.txt
15 changes: 13 additions & 2 deletions docker/pipelines_runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ set -euo pipefail
#################################################
function parse_args {
# getopt command is only for checking arguments.
getopt -o '' -l project:,temp_location:,docker_image:,region:,subnetwork:,use_public_ips:,service_account:,location: -- "$@"
getopt -o '' -l project:,temp_location:,docker_image:,sdk_container_image:,region:,subnetwork:,use_public_ips:,service_account:,location: -- "$@"
while [[ "$#" -gt 0 ]]; do
case "$1" in
--project)
Expand All @@ -45,6 +45,10 @@ function parse_args {
vt_docker_image="$2"
;;

--sdk_container_image)
sdk_container_image="$2"
;;

--subnetwork)
subnetwork="$2"
;;
Expand Down Expand Up @@ -76,6 +80,7 @@ function main {
google_cloud_project="${google_cloud_project:-$(gcloud config get-value project)}"
region="${region:-$(gcloud config get-value compute/region)}"
vt_docker_image="${vt_docker_image:-gcr.io/cloud-lifesciences/gcp-variant-transforms}"
sdk_container_image="${sdk_container_image:-gcr.io/cloud-lifesciences/variant-transforms-custom-runner:latest}"

location="${location:-}"
temp_location="${temp_location:-}"
Expand Down Expand Up @@ -112,13 +117,18 @@ function main {
pt_optional_args=""
df_optional_args=""

if [[ ! -z "${sdk_container_image}" ]]; then
echo "Adding --sdk_container_image ${sdk_container_image} to optional_args"
df_optional_args="${df_optional_args} --experiments=use_runner_v2 --sdk_container_image=${sdk_container_image}"
fi

if [[ ! -z "${subnetwork}" ]]; then
echo "Adding --subnetwork ${subnetwork} to optional_args"
pt_optional_args="${pt_optional_args} --subnetwork projects/${google_cloud_project}/regions/${region}/subnetworks/${subnetwork}"
df_optional_args="${df_optional_args} --subnetwork https://www.googleapis.com/compute/v1/projects/${google_cloud_project}/regions/${region}/subnetworks/${subnetwork}"
fi

if [[ ! -z "${use_public_ips}" && "${use_public_ips}" == "false" ]]; then
if [[ ! -z "${use_public_ips}" && "${use_public_ips}" == "false" ]]; then
echo "Adding --private-address and --no_use_public_ips to optional_args"
pt_optional_args="${pt_optional_args} --private-address"
df_optional_args="${df_optional_args} --no_use_public_ips"
Expand Down Expand Up @@ -150,6 +160,7 @@ function main {
--pvm-attempts 0 \
--attempts 1 \
--disk-size 10 \
--boot-disk-size 100 \
${pt_optional_args}
}

Expand Down
20 changes: 20 additions & 0 deletions docs/setting_region.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,23 @@ docker run gcr.io/cloud-lifesciences/gcp-variant-transforms \
--use_public_ips false \
"${COMMAND}"
```
## Custom Dataflow Runner Image
By default, Variant Transforms runs the pipeline in a custom Docker image: `gcr.io/cloud-lifesciences/variant-transforms-custom-runner:latest`.
This image contains all of the Python and Linux dependencies needed to run Variant Transforms, so they do not have to be downloaded from the internet when the pipeline starts.
You can override which container is used by passing the `--sdk_container_image` flag, as in the following example:
```bash
COMMAND="/opt/gcp_variant_transforms/bin/vcf_to_bq ..."
docker run gcr.io/cloud-lifesciences/gcp-variant-transforms \
--project "${GOOGLE_CLOUD_PROJECT}" \
--region us-central1 \
--location us-central1 \
--temp_location "${TEMP_LOCATION}" \
--subnetwork my-subnet \
--use_public_ips false \
--sdk_container_image gcr.io/path/to/my/container \
"${COMMAND}"
```
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(self,
parsed_args.region,
filesystems.FileSystems.join(parsed_args.logging_location,
'_'.join([test_name, timestamp])),
parsed_args.image, _TOOL_NAME, args)
parsed_args.image, parsed_args.sdk_container_image, _TOOL_NAME, args)

def validate_result(self):
"""Validates the results.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def __init__(self,
parser_args.region,
filesystems.FileSystems.join(parser_args.logging_location,
self._report_blob_name),
parser_args.image, _TOOL_NAME, args)
parser_args.image, parser_args.sdk_container_image, _TOOL_NAME, args)

def validate_result(self):
"""Validates the results.
Expand Down
14 changes: 12 additions & 2 deletions gcp_variant_transforms/testing/integration/run_tests_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@


_DEFAULT_IMAGE_NAME = 'gcr.io/cloud-lifesciences/gcp-variant-transforms'
_DEFAULT_SDK_CONTAINER_IMAGE_NAME = 'gcr.io/cloud-lifesciences/variant-transforms-custom-runner'

# `TestCaseState` saves current running test and the remaining tests in the same
# test script (.json).
Expand Down Expand Up @@ -127,13 +128,14 @@ def print_results(self):
return 0


def form_command(project, region, temp_location, image, tool_name, args):
# type: (str, str, str, str, str, List[str]) -> List[str]
def form_command(project, region, temp_location, image, sdk_container_image,
                 tool_name, args):
  # type: (str, str, str, str, str, str, List[str]) -> List[str]
  """Builds the `pipelines_runner.sh` command line for one tool invocation.

  Args:
    project: Value passed to `--project`.
    region: Value passed to `--region`.
    temp_location: Value passed to `--temp_location`.
    image: Value passed to `--docker_image`.
    sdk_container_image: Value passed to `--sdk_container_image`.
    tool_name: Name of the tool to run; first token of the final argument.
    args: Arguments for the tool. A list is expected, but any iterable of
      strings (e.g. a tuple) is accepted.

  Returns:
    The command as a list of strings. The tool name and its arguments are
    joined with single spaces into one trailing element.
  """
  # `list(args)` lets callers pass a tuple or other iterable; the previous
  # `[tool_name] + args` raised TypeError for anything but a list.
  tool_command = ' '.join([tool_name] + list(args))
  return ['/opt/gcp_variant_transforms/src/docker/pipelines_runner.sh',
          '--project', project,
          '--region', region,
          '--docker_image', image,
          '--temp_location', temp_location,
          '--sdk_container_image', sdk_container_image,
          tool_command]


Expand All @@ -153,6 +155,14 @@ def add_args(parser):
'production image {} is used.').format(_DEFAULT_IMAGE_NAME),
default=_DEFAULT_IMAGE_NAME,
required=False)
# The help text must name the SDK container default (the value actually used
# for `default=`), not the default of the `--image` flag.
parser.add_argument(
    '--sdk_container_image',
    help=('The name of the dataflow runner container image to run, for '
          'example: gcr.io/test-gcp-variant-transforms/'
          'variant-transforms-custom-runner:latest. By default the '
          'production image {} is used.').format(
              _DEFAULT_SDK_CONTAINER_IMAGE_NAME),
    default=_DEFAULT_SDK_CONTAINER_IMAGE_NAME,
    required=False)


def get_configs(test_file_dir, required_keys, test_file_suffix=''):
Expand Down
Loading

0 comments on commit 37709f8

Please sign in to comment.