Skip to content

Commit

Permalink
Added --sites-only-vcf-output argument to GATK engine (#4764)
Browse files Browse the repository at this point in the history
* Added --sites-only-vcf-output argument to the GATK engine, to allow sites-only output when writing VCFs, plus tests

Resolves #4763
  • Loading branch information
jamesemery authored and droazen committed May 17, 2018
1 parent e8b3e8a commit 4e3affb
Show file tree
Hide file tree
Showing 10 changed files with 119 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ private StandardArgumentDefinitions(){}
public static final String ANNOTATIONS_TO_EXCLUDE_LONG_NAME = "annotations-to-exclude";
public static final String SAMPLE_NAME_LONG_NAME = "sample-name";
public static final String PEDIGREE_FILE_LONG_NAME = "pedigree";
public static final String SITES_ONLY_LONG_NAME = "sites-only-vcf-output";

public static final String INPUT_SHORT_NAME = "I";
public static final String OUTPUT_SHORT_NAME = "O";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ public abstract class GATKTool extends CommandLineProgram {
optional = true)
public boolean disableBamIndexCaching = false;

@Argument(fullName = StandardArgumentDefinitions.SITES_ONLY_LONG_NAME,
doc = "If true, don't emit genotype fields when writing vcf file output.", optional = true)
public boolean outputSitesOnlyVCFs = false;

/**
* Master sequence dictionary to be used instead of all other dictionaries (if provided).
*/
Expand Down Expand Up @@ -728,6 +732,10 @@ public VariantContextWriter createVCFWriter(final File outFile) {
}
}

if (outputSitesOnlyVCFs) {
options.add(Options.DO_NOT_WRITE_GENOTYPES);
}

return GATKVariantContextUtils.createVCFWriter(
outFile,
sequenceDictionary,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.nio.file.Path;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand All @@ -16,9 +15,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down Expand Up @@ -198,7 +195,7 @@ public void onTraversalStart() {

// The HC engine will make the right kind (VCF or GVCF) of writer for us
final SAMSequenceDictionary sequenceDictionary = getHeaderForReads().getSequenceDictionary();
vcfWriter = hcEngine.makeVCFWriter(outputVCF, sequenceDictionary, createOutputVariantIndex, createOutputVariantMD5);
vcfWriter = hcEngine.makeVCFWriter(outputVCF, sequenceDictionary, createOutputVariantIndex, createOutputVariantMD5, outputSitesOnlyVCFs);
hcEngine.writeHeader(vcfWriter, sequenceDictionary, getDefaultToolVCFHeaderLines());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,17 +322,20 @@ public static List<ReadFilter> makeStandardHCReadFilters() {
* @return a VCF or GVCF writer as appropriate, ready to use
*/
public VariantContextWriter makeVCFWriter( final String outputVCF, final SAMSequenceDictionary readsDictionary,
final boolean createOutputVariantIndex, final boolean createOutputVariantMD5 ) {
final boolean createOutputVariantIndex, final boolean createOutputVariantMD5,
final boolean sitesOnlyMode ) {
Utils.nonNull(outputVCF);
Utils.nonNull(readsDictionary);

final List<Options> options = new ArrayList<>(2);
if (createOutputVariantIndex) {options.add(Options.INDEX_ON_THE_FLY);}
if (sitesOnlyMode) {options.add(Options.DO_NOT_WRITE_GENOTYPES);}

VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(
new File(outputVCF),
readsDictionary,
createOutputVariantMD5,
createOutputVariantIndex ?
new Options[]{Options.INDEX_ON_THE_FLY} :
new Options[0]
options.toArray(new Options[options.size()])
);

if ( hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,21 @@ default List<String> injectDefaultVerbosity(final List<String> args) {
/**
* Runs the command line implemented by this test.
*
* Default behaviour uses {@link Main} with the command line arguments created by {@link #makeCommandLineArgs(List)}.
* Default behavior uses {@link Main} with the command line arguments created by {@link #makeCommandLineArgs(List)}.
*/
default Object runCommandLine(final List<String> args) {
return new Main().instanceMain(makeCommandLineArgs(args));
}

/**
* Lets you explicitly specify a tool to run with the provided arguments
*
* Default behavior uses {@link Main} with the command line arguments created by {@link #makeCommandLineArgs(List, String)}.
*/
default Object runCommandLine(final List<String> args, final String toolName) {
return new Main().instanceMain(makeCommandLineArgs(args, toolName));
}

default Object runCommandLine(final String[] args) {
return runCommandLine(Arrays.asList(args));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,9 @@ public Object runCommandLine(final List<String> args) {
return new Main().instanceMain(makeCommandLineArgs(args));
}

@Override
public Object runCommandLine(final List<String> args, final String toolName) {
return new Main().instanceMain(makeCommandLineArgs(args, toolName));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import org.broadinstitute.hellbender.tools.examples.ExampleReadWalkerWithVariants;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants;
import org.broadinstitute.hellbender.utils.test.VariantContextTestUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.util.Arrays;
import java.util.List;

public class GatkToolIntegrationTest extends CommandLineProgramTest {
private static final String TEST_DIRECTORY = publicTestDir + "org/broadinstitute/hellbender/engine/";

@Test
public void testSitesOnlyMode() {
File out = createTempFile("GTStrippedOutput", "vcf");
String[] args = new String[] {
"-V", TEST_DIRECTORY + "vcf_with_genotypes.vcf",
"--" + StandardArgumentDefinitions.SITES_ONLY_LONG_NAME,
"-O",
out.getAbsolutePath()};
runCommandLine(Arrays.asList(args), SelectVariants.class.getSimpleName());

// Assert that the genotype field has been stripped from the file
Pair<VCFHeader, List<VariantContext>> results = VariantContextTestUtils.readEntireVCFIntoMemory(out.getAbsolutePath());

Assert.assertFalse(results.getLeft().hasGenotypingData());
for (VariantContext v: results.getRight()) {
Assert.assertFalse(v.hasGenotypes());
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import htsjdk.tribble.Tribble;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
Expand Down Expand Up @@ -377,6 +376,32 @@ public void testBamoutProducesReasonablySizedOutput() {
}
}


@Test
public void testSitesOnlyMode() {
Utils.resetRandomGenerator();
File out = createTempFile("GTStrippedOutput", "vcf");
final String[] args = {
"-I", NA12878_20_21_WGS_bam,
"-R", b37_reference_20_21,
"-L", "20:10000000-10010000",
"-O", out.getAbsolutePath(),
"-pairHMM", "AVX_LOGLESS_CACHING",
"--" + StandardArgumentDefinitions.SITES_ONLY_LONG_NAME,
"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"
};

runCommandLine(args);

// Assert that the genotype field has been stripped from the file
Pair<VCFHeader, List<VariantContext>> results = VariantContextTestUtils.readEntireVCFIntoMemory(out.getAbsolutePath());

Assert.assertFalse(results.getLeft().hasGenotypingData());
for (VariantContext v: results.getRight()) {
Assert.assertFalse(v.hasGenotypes());
}
}

@DataProvider(name="outputFileVariations")
public Object[][] getOutputFileVariations() {
return new Object[][]{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
20 69491 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800
20 69511 . A G,<NON_REF> 2253.77 . . GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
20 69512 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800
20 69522 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0
20 69549 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:156:99:56:66:0,66,990
20 69635 . C T,<NON_REF> 60.77 . . GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3
20 69762 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270
20 69763 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253
20 69767 . C <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180
20 69772 . TTATC T,<NON_REF> 60.77 . . GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3
20 69773 . T <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0

0 comments on commit 4e3affb

Please sign in to comment.