Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added --sites-only argument to GATK engine #4764

Merged
merged 4 commits into from
May 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ private StandardArgumentDefinitions(){}
public static final String ANNOTATIONS_TO_EXCLUDE_LONG_NAME = "annotations-to-exclude";
public static final String SAMPLE_NAME_LONG_NAME = "sample-name";
public static final String PEDIGREE_FILE_LONG_NAME = "pedigree";
public static final String SITES_ONLY_LONG_NAME = "sites-only-vcf-output";

public static final String INPUT_SHORT_NAME = "I";
public static final String OUTPUT_SHORT_NAME = "O";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ public abstract class GATKTool extends CommandLineProgram {
optional = true)
public boolean disableBamIndexCaching = false;

@Argument(fullName = StandardArgumentDefinitions.SITES_ONLY_LONG_NAME,
doc = "If true, don't emit genotype fields when writing vcf file output.", optional = true)
public boolean outputSitesOnlyVCFs = false;

/**
* Master sequence dictionary to be used instead of all other dictionaries (if provided).
*/
Expand Down Expand Up @@ -728,6 +732,10 @@ public VariantContextWriter createVCFWriter(final File outFile) {
}
}

if (outputSitesOnlyVCFs) {
options.add(Options.DO_NOT_WRITE_GENOTYPES);
}

return GATKVariantContextUtils.createVCFWriter(
outFile,
sequenceDictionary,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.nio.file.Path;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand All @@ -16,9 +15,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down Expand Up @@ -198,7 +195,7 @@ public void onTraversalStart() {

// The HC engine will make the right kind (VCF or GVCF) of writer for us
final SAMSequenceDictionary sequenceDictionary = getHeaderForReads().getSequenceDictionary();
vcfWriter = hcEngine.makeVCFWriter(outputVCF, sequenceDictionary, createOutputVariantIndex, createOutputVariantMD5);
vcfWriter = hcEngine.makeVCFWriter(outputVCF, sequenceDictionary, createOutputVariantIndex, createOutputVariantMD5, outputSitesOnlyVCFs);
hcEngine.writeHeader(vcfWriter, sequenceDictionary, getDefaultToolVCFHeaderLines());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,17 +322,20 @@ public static List<ReadFilter> makeStandardHCReadFilters() {
* @return a VCF or GVCF writer as appropriate, ready to use
*/
public VariantContextWriter makeVCFWriter( final String outputVCF, final SAMSequenceDictionary readsDictionary,
final boolean createOutputVariantIndex, final boolean createOutputVariantMD5 ) {
final boolean createOutputVariantIndex, final boolean createOutputVariantMD5,
final boolean sitesOnlyMode ) {
Utils.nonNull(outputVCF);
Utils.nonNull(readsDictionary);

final List<Options> options = new ArrayList<>(2);
if (createOutputVariantIndex) {options.add(Options.INDEX_ON_THE_FLY);}
if (sitesOnlyMode) {options.add(Options.DO_NOT_WRITE_GENOTYPES);}

VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(
new File(outputVCF),
readsDictionary,
createOutputVariantMD5,
createOutputVariantIndex ?
new Options[]{Options.INDEX_ON_THE_FLY} :
new Options[0]
options.toArray(new Options[options.size()])
);

if ( hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,21 @@ default List<String> injectDefaultVerbosity(final List<String> args) {
/**
* Runs the command line implemented by this test.
*
* Default behaviour uses {@link Main} with the command line arguments created by {@link #makeCommandLineArgs(List)}.
* Default behavior uses {@link Main} with the command line arguments created by {@link #makeCommandLineArgs(List)}.
*/
default Object runCommandLine(final List<String> args) {
return new Main().instanceMain(makeCommandLineArgs(args));
}

/**
* Lets you explicitly specify a tool to run with the provided arguments
*
* Default behavior uses {@link Main} with the command line arguments created by {@link #makeCommandLineArgs(List, String)}.
*/
default Object runCommandLine(final List<String> args, final String toolName) {
return new Main().instanceMain(makeCommandLineArgs(args, toolName));
}

default Object runCommandLine(final String[] args) {
return runCommandLine(Arrays.asList(args));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,9 @@ public Object runCommandLine(final List<String> args) {
return new Main().instanceMain(makeCommandLineArgs(args));
}

@Override
public Object runCommandLine(final List<String> args, final String toolName) {
return new Main().instanceMain(makeCommandLineArgs(args, toolName));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import org.broadinstitute.hellbender.tools.examples.ExampleReadWalkerWithVariants;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants;
import org.broadinstitute.hellbender.utils.test.VariantContextTestUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.util.Arrays;
import java.util.List;

public class GatkToolIntegrationTest extends CommandLineProgramTest {
private static final String TEST_DIRECTORY = publicTestDir + "org/broadinstitute/hellbender/engine/";

@Test
public void testSitesOnlyMode() {
File out = createTempFile("GTStrippedOutput", "vcf");
String[] args = new String[] {
"-V", TEST_DIRECTORY + "vcf_with_genotypes.vcf",
"--" + StandardArgumentDefinitions.SITES_ONLY_LONG_NAME,
"-O",
out.getAbsolutePath()};
runCommandLine(Arrays.asList(args), SelectVariants.class.getSimpleName());

// Assert that the genotype field has been stripped from the file
Pair<VCFHeader, List<VariantContext>> results = VariantContextTestUtils.readEntireVCFIntoMemory(out.getAbsolutePath());

Assert.assertFalse(results.getLeft().hasGenotypingData());
for (VariantContext v: results.getRight()) {
Assert.assertFalse(v.hasGenotypes());
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import htsjdk.tribble.Tribble;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
Expand Down Expand Up @@ -377,6 +376,32 @@ public void testBamoutProducesReasonablySizedOutput() {
}
}


@Test
public void testSitesOnlyMode() {
Utils.resetRandomGenerator();
File out = createTempFile("GTStrippedOutput", "vcf");
final String[] args = {
"-I", NA12878_20_21_WGS_bam,
"-R", b37_reference_20_21,
"-L", "20:10000000-10010000",
"-O", out.getAbsolutePath(),
"-pairHMM", "AVX_LOGLESS_CACHING",
"--" + StandardArgumentDefinitions.SITES_ONLY_LONG_NAME,
"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"
};

runCommandLine(args);

// Assert that the genotype field has been stripped from the file
Pair<VCFHeader, List<VariantContext>> results = VariantContextTestUtils.readEntireVCFIntoMemory(out.getAbsolutePath());

Assert.assertFalse(results.getLeft().hasGenotypingData());
for (VariantContext v: results.getRight()) {
Assert.assertFalse(v.hasGenotypes());
}
}

@DataProvider(name="outputFileVariations")
public Object[][] getOutputFileVariations() {
return new Object[][]{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
20 69491 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800
20 69511 . A G,<NON_REF> 2253.77 . . GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
20 69512 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800
20 69522 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0
20 69549 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:156:99:56:66:0,66,990
20 69635 . C T,<NON_REF> 60.77 . . GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3
20 69762 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270
20 69763 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253
20 69767 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180
20 69772 . TTATC T,<NON_REF> 60.77 . . GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3
20 69773 . T <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0