From 6e61ebeda3d99dbff187176d04a2c0495140b33c Mon Sep 17 00:00:00 2001 From: Don Freed Date: Mon, 10 Jul 2023 20:10:33 -0700 Subject: [PATCH] Use the requester project for file upload - Update the container base image - Update the default container image - Respect the requester project for log upload --- pipeline_scripts/Dockerfile | 6 +++--- pipeline_scripts/gc_functions.sh | 6 +++--- pipeline_scripts/gc_germline.sh | 12 ++++++------ pipeline_scripts/gc_somatic.sh | 14 +++++++------- runner/runner_default.json | 2 +- runner/sentieon_runner.py | 11 ++++++++--- 6 files changed, 28 insertions(+), 23 deletions(-) diff --git a/pipeline_scripts/Dockerfile b/pipeline_scripts/Dockerfile index 20385b1..b3a85ec 100644 --- a/pipeline_scripts/Dockerfile +++ b/pipeline_scripts/Dockerfile @@ -1,4 +1,4 @@ -FROM google/cloud-sdk:367.0.0-slim as downloader +FROM google/cloud-sdk:437.0.1-slim as downloader # Install samtools RUN apt-get update && \ @@ -22,9 +22,9 @@ RUN curl -Lo samblaster-v.0.1.24.tar.gz https://github.com/GregoryFaust/samblast make && \ cp samblaster /usr/local/bin/ -FROM google/cloud-sdk:367.0.0-slim +FROM google/cloud-sdk:437.0.1-slim -LABEL container.base.image="google/cloud-sdk:367.0.0-slim" +LABEL container.base.image="google/cloud-sdk:437.0.1-slim" COPY --from=downloader /usr/local/bin/samtools /usr/local/bin COPY --from=downloader /usr/local/bin/samblaster /usr/local/bin diff --git a/pipeline_scripts/gc_functions.sh b/pipeline_scripts/gc_functions.sh index 91e8604..a1a5c7b 100644 --- a/pipeline_scripts/gc_functions.sh +++ b/pipeline_scripts/gc_functions.sh @@ -88,7 +88,7 @@ upload_metrics() eval "fun_metrics_cmd2=\$$fun_var_cmd2" if [[ -n "$fun_metrics_cmd2" && -z "$fun_metrics_cmd1" && -f "${fun_metrics_files[0]}" ]]; then (run "$fun_metrics_cmd2" "Plotting metrics results." && - gsutil cp ${fun_metrics_files[@]} "$out_metrics" && + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp ${fun_metrics_files[@]} "$out_metrics" && rm ${fun_metrics_files[@]}) & eval "$fun_pid=$! " eval "$fun_var_cmd2=''" @@ -297,7 +297,7 @@ bwa_mem_align() fi bwa_cmd="$bwa_cmd | $release_dir/bin/sentieon util sort ${fun_util_sort_xargs} --block_size 512M -o $local_bam -t $nt --sam2bam -i -" run "$bwa_cmd" "BWA-mem and sorting" - gsutil cp $bwa_log "$out_bam" + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $bwa_log "$out_bam" fun_bam_dest+=($local_bam) done echo "BWA ended" @@ -444,7 +444,7 @@ run_bqsr_post() run "$cmd" "BQSR post" run "$fun_bqsr_cmd3" "BQSR CSV" run "$fun_bqsr_cmd4" "BQSR plot" - gsutil cp $fun_plot "$out_metrics" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $fun_plot "$out_metrics" & eval "$fun_upload_pid=$1 " fi diff --git a/pipeline_scripts/gc_germline.sh b/pipeline_scripts/gc_germline.sh index 4a10872..551a24c 100644 --- a/pipeline_scripts/gc_germline.sh +++ b/pipeline_scripts/gc_germline.sh @@ -102,7 +102,7 @@ output_ext="bam" run_mark_duplicates "" "$DEDUP" metrics_cmd1 "$local_bams_str" dedup_bam_str dedup_bams "$dedup_xargs" $output_ext "false" "${local_bams[@]}" if [[ "$DEDUP" != "nodup" ]]; then if [[ -z "$NO_METRICS" ]]; then - (gsutil cp $metrics_dir/dedup_metrics.txt "$out_metrics" && + (gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $metrics_dir/dedup_metrics.txt "$out_metrics" && rm $metrics_dir/dedup_metrics.txt) & upload_dedup_pid=$! else @@ -131,7 +131,7 @@ if [[ -z "$NO_BAM_OUTPUT" && (-z "$bqsr_sites" || -z "$RECALIBRATED_OUTPUT" ) ]] upload_list+=" \"${bam}.crai\" " fi done - eval gsutil cp $upload_list "$out_bam" & + eval gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $upload_list "$out_bam" & upload_deduped_pid=$! fi @@ -146,12 +146,12 @@ if [[ -n "$bqsr_sites" && -z "$NO_BAM_OUTPUT" && -n "$RECALIBRATED_OUTPUT" ]]; t outrecal=$work/recalibrated.bam cmd="$release_dir/bin/sentieon driver $dedup_bam_str -q $bqsr_table --algo ReadWriter $outrecal" (run "$cmd" "ReadWriter"; - gsutil cp $outrecal ${outrecal}.bai "$out_bam") & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $outrecal ${outrecal}.bai "$out_bam") & upload_recal_pid=$! fi if [[ -n "$bqsr_sites" && -z "$NO_BAM_OUTPUT" && -z "$RECALIBRATED_OUTPUT" ]]; then - gsutil cp $bqsr_table "$out_bam" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $bqsr_table "$out_bam" & upload_bqsr_pid=$! fi @@ -218,7 +218,7 @@ if [[ -z $NO_HAPLOTYPER ]]; then run "$cmd" "DNAscope model apply" fi - gsutil cp $outfile ${outfile}.tbi "$out_variants" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $outfile ${outfile}.tbi "$out_variants" & upload_vcf_pid=$! fi @@ -240,7 +240,7 @@ fi if [[ -n $bqsr_cmd3 ]]; then run "$bqsr_cmd3" "BQSR CSV" run "$bqsr_cmd4" "BQSR plot" - gsutil cp $plot "$out_metrics" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $plot "$out_metrics" & upload_bqsr_metrics_pid=$! fi diff --git a/pipeline_scripts/gc_somatic.sh b/pipeline_scripts/gc_somatic.sh index 9874538..f1baa8f 100644 --- a/pipeline_scripts/gc_somatic.sh +++ b/pipeline_scripts/gc_somatic.sh @@ -159,7 +159,7 @@ if [[ "$DEDUP" != "nodup" ]]; then if [[ -n "$dedup_bam_str" ]]; then to_upload+=" $metrics_dir/normal_dedup_metrics.txt" fi - (gsutil cp $to_upload "$out_metrics" && + (gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $to_upload "$out_metrics" && rm $metrics_dir/*_dedup_metrics.txt) & upload_dedup_pid=$! else @@ -188,7 +188,7 @@ if [[ -z "$NO_BAM_OUTPUT" && -z "$REALIGN_SITES" ]]; then upload_list+=" \"${bam}.crai\" " fi done - eval gsutil cp $upload_list "$out_bam" & + eval gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $upload_list "$out_bam" & upload_deduped_pid=$! fi @@ -254,7 +254,7 @@ if [[ -n "$realigned_bam_str" ]]; then fi if [[ -n "$bqsr_sites" && -z "$NO_BAM_OUTPUT" ]]; then - gsutil cp $bqsr_table $tumor_bqsr_table "$out_bam" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $bqsr_table $tumor_bqsr_table "$out_bam" & upload_bqsr_pid=$! fi @@ -296,7 +296,7 @@ if [[ -n "$REALIGN_SITES" && -n "$RUN_TNSNV" && -n "$realigned_bam_str" ]]; then elif [[ -f "${corealigned_bam}.crai" ]]; then upload_list+=" ${corealigned_bam}.crai " fi - gsutil cp $upload_list "$out_bam" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $upload_list "$out_bam" & upload_corealigned_pid=$! corealigned_bam_str=" -i $corealigned_bam " @@ -310,7 +310,7 @@ elif [[ -n "$REALIGN_SITES" && -n "$RUN_TNSNV" ]]; then upload_list+=" \"${bam}.crai\" " fi done - eval gsutil cp $upload_list "$out_bam" & + eval gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $upload_list "$out_bam" & upload_corealigned_pid=$! corealigned_bam_str=" $tumor_realigned_bam_str " @@ -350,7 +350,7 @@ if [[ -z "$NO_VCF" ]]; then fi run "$cmd" "Variant calling" - gsutil cp $vcf ${vcf}.tbi "$out_variants" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $vcf ${vcf}.tbi "$out_variants" & upload_vcf_pid=$! fi @@ -382,7 +382,7 @@ if [[ -n $tumor_bqsr_cmd3 ]]; then fi run "$tumor_bqsr_cmd3" "Tumor BQSR CSV" run "$tumor_bqsr_cmd4" "Tumor BQSR plot" - gsutil cp $upload_list $tumor_bqsr_plot "$out_metrics" & + gsutil ${REQUESTER_PROJECT:+-u $REQUESTER_PROJECT} cp $upload_list $tumor_bqsr_plot "$out_metrics" & upload_bqsr_metrics_pid=$! fi diff --git a/runner/runner_default.json b/runner/runner_default.json index de4940d..59cbd6c 100644 --- a/runner/runner_default.json +++ b/runner/runner_default.json @@ -34,7 +34,7 @@ "CPU_PLATFORM": "Intel Broadwell", "PROJECT_ID": null, "REQUESTER_PROJECT": null, - "DOCKER_IMAGE": "sentieon/sentieon-google-cloud:0.2.4", + "DOCKER_IMAGE": "sentieon/sentieon-google-cloud:0.2.6", "CALLING_ARGS": null, "CALLING_ALGO": "Haplotyper", "DNASCOPE_MODEL": "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.0.model", diff --git a/runner/sentieon_runner.py b/runner/sentieon_runner.py index 674c229..3640e60 100644 --- a/runner/sentieon_runner.py +++ b/runner/sentieon_runner.py @@ -435,11 +435,16 @@ def main( "/bin/bash", "-c", ( - "gsutil cp /google/logs/action/1/stderr " + "gsutil -u {} cp /google/logs/action/1/stderr " '"{}/worker_logs/stderr.txt" && ' - "gsutil cp /google/logs/action/1/stdout " + "gsutil -u {} cp /google/logs/action/1/stdout " '"{}/worker_logs/stdout.txt"' - ).format(job_vars["OUTPUT_BUCKET"], job_vars["OUTPUT_BUCKET"]), + ).format( + job_vars["REQUESTER_PROJECT"], + job_vars["OUTPUT_BUCKET"], + job_vars["REQUESTER_PROJECT"], + job_vars["OUTPUT_BUCKET"] + ), ], "alwaysRun": True, }