Skip to content
This repository has been archived by the owner on Oct 29, 2023. It is now read-only.

Commit

Permalink
Merge pull request #42 from iliat/dev-broad
Browse files Browse the repository at this point in the history
Allow start/end when reading from the API in CountReads, improve script documentation.
  • Loading branch information
deflaux committed Mar 13, 2015
2 parents 70bc9f7 + 213acd0 commit 41953b1
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
import java.util.List;
import java.util.logging.Logger;

import javax.annotation.Nullable;

/**
* Simple read counting pipeline, intended as an example for reading data from
* APIs OR BAM files and invoking GATK tools.
Expand Down Expand Up @@ -150,15 +152,23 @@ private static PCollection<Read> getReadsFromAPI() {
}

private static List<SearchReadsRequest> getReadRequests(CountReadsOptions options) {
List<SearchReadsRequest> requests = Lists.newArrayList();
requests.add(new SearchReadsRequest()
.setReadGroupSetIds(
Collections.singletonList(options.getReadGroupSetId()))
.setReferenceName(options.getReferences())
.setPageSize(2048));

return requests;
}

final String readGroupSetId = options.getReadGroupSetId();
return Lists.newArrayList(Iterables.transform(
Iterables.concat(Iterables.transform(options.getContigs(),
new Function<Contig, Iterable<Contig>>() {
@Override
public Iterable<Contig> apply(Contig contig) {
return contig.getShards();
}
})),
new Function<Contig, SearchReadsRequest>() {
@Override
public SearchReadsRequest apply(Contig shard) {
return shard.getReadsRequest(readGroupSetId);
}
}));
}

private static PCollection<Read> getReadsFromBAMFile() throws IOException {
LOG.info("getReadsFromBAMFile");
Expand Down
51 changes: 38 additions & 13 deletions src/main/scripts/count_reads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,49 @@
# Assumes the script is run from dataflow-java directory.
# Assumes client_secrets.json is located in a parent directory of dataflow-java.

JAR=target/google-genomics-dataflow-v1beta2-0.2-SNAPSHOT.jar
JAR=target/google-genomics-dataflow-v1beta2-0.4-SNAPSHOT.jar
CLIENT_SECRETS=$(readlink -f ../client_secrets.json)
# Assumes the following variables are set
# Please set them before calling or edit this file and set them.
# PROJECT_ID=cloud-project-name
# OUTPUT=gs://test/df/count_reads/output/count.txt
# STAGING=gs://test/df/count_reads/staging
# DATASET_ID=15448427866823121459
# READGROUPSET_ID=CK256frpGBD44IWHwLP22R4
# DESIRED_CONTIGS=20:56311809:62603264
# BAM_FILE_PATH=gs://test/NA12878.chrom20.ILLUMINA.bwa.CEU.exome.20121211.bam
# PROJECT_ID - id of your cloud project
# OUTPUT - GCS path for output file
# STAGING - GCS path for staging files
# DATASET_ID - Id of genomics API data set (only needed if using API)
# READGROUPSET_ID - Id of genomics API read group set (only needed if using API)
# DESIRED_CONTIGS - reference:start:end[,reference:start:end]
# BAM_FILE_PATH - GCS path to BAM file (only needed if using BAM file input)
#
# Example call for API reading
# export PROJECT_ID=your-project-id
# export OUTPUT=gs://test/df/count_reads/output/count.txt
# export STAGING=gs://test/df/count_reads/staging
# export DATASET_ID=15448427866823121459
# export READGROUPSET_ID=CK256frpGBD44IWHwLP22R4
# export DESIRED_CONTIGS=seq1:0:800
# # Local run:
# src/main/scripts/count_reads.sh
# # Cloud run:
# src/main/scripts/count_reads.sh api cloud   # "cloud" must be the 2nd argument; any 1st argument other than "bam" selects the API
#
# Example call for BAM reading
# export PROJECT_ID=your-project-id
# export OUTPUT=gs://test/df/count_reads/output/count.txt
# export STAGING=gs://test/df/count_reads/staging
# export DESIRED_CONTIGS=seq1:0:800
# export BAM_FILE_PATH=gs://test/df/NA12878.chrom20.ILLUMINA.bwa.CEU.exome.20121211.bam
# # Local run:
# src/main/scripts/count_reads.sh bam
# # Cloud run:
# src/main/scripts/count_reads.sh bam cloud


if [ "$1" = "bam" ]; then
bam_argument="--BAMFilePath=$BAM_FILE_PATH"
else
api_argument="--datasetId=$DATASET_ID --readGroupSetId=$READGROUPSET_ID"
fi
if [ "$2" = "cloud" ]; then
additional_arguments="--stagingLocation=${STAGING} --numWorkers=1 --runner=BlockingDataflowPipelineRunner"
additional_arguments="--stagingLocation=${STAGING} --numWorkers=2 --runner=BlockingDataflowPipelineRunner"
else
additional_arguments="--numWorkers=1"
fi
Expand All @@ -34,8 +60,7 @@ com.google.cloud.genomics.dataflow.pipelines.CountReads \
--project=$PROJECT_ID \
--output=$OUTPUT \
--genomicsSecretsFile=$CLIENT_SECRETS \
--datasetId=$DATASET_ID \
--readGroupSetId=$READGROUPSET_ID \
--references=$DESIRED_CONTIGS \
$additional_arguments \
$bam_argument
$bam_argument $api_argument $additional_arguments


0 comments on commit 41953b1

Please sign in to comment.