From 1f732a5365b63629ba92455b1b7e5f5d55d257a9 Mon Sep 17 00:00:00 2001 From: Samuel Friedman Date: Tue, 22 May 2018 19:14:11 -0400 Subject: [PATCH 1/3] tranche filtering in java --- .../tools/walkers/vqsr/CNNVariantTrain.java | 2 +- .../walkers/vqsr/CNNVariantWriteTensors.java | 2 +- .../walkers/vqsr/FilterVariantTranches.java | 277 +++++++++++------- .../hellbender/tools/walkers/vqsr/tranches.py | 124 -------- .../walkers/vqsr/CNNVariantPipelineTest.java | 5 +- .../FilterVariantTranchesIntegrationTest.java | 117 ++++++++ .../g94982_20_1m_10m_tranched_95_99_99.9.vcf | 3 + .../expected/g94982_20_1m_10m_tranched_99.vcf | 3 + .../VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz | 3 + .../g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi | 3 + 10 files changed, 296 insertions(+), 243 deletions(-) delete mode 100644 src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java create mode 100644 src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf create mode 100644 src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf create mode 100644 src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz create mode 100644 src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java index cd0bfc3200c..a11449ddcd9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java @@ -103,7 +103,7 @@ protected void onStartup() { @Override protected Object doWork() { - final Resource pythonScriptResource = new Resource("training.py", FilterVariantTranches.class); + final Resource 
pythonScriptResource = new Resource("training.py", CNNVariantTrain.class); List arguments = new ArrayList<>(Arrays.asList( "--data_dir", inputTensorDir, "--output_dir", outputDir, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java index 53d282d617d..b850e5e8cb1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java @@ -121,7 +121,7 @@ protected void onStartup() { @Override protected Object doWork() { - final Resource pythonScriptResource = new Resource("training.py", FilterVariantTranches.class); + final Resource pythonScriptResource = new Resource("training.py", CNNVariantWriteTensors.class); List arguments = new ArrayList<>(Arrays.asList( "--reference_fasta", reference, "--input_vcf", inputVcf, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java index b8813b38e7e..a9528aac4a0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java @@ -8,35 +8,38 @@ import htsjdk.tribble.index.Index; import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.util.TabixUtils; + +import java.util.*; +import java.io.File; +import java.util.stream.Collectors; + +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import 
org.broadinstitute.hellbender.engine.*; + import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.io.Resource; -import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import picard.cmdline.programgroups.VariantFilteringProgramGroup; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; +import picard.cmdline.programgroups.VariantEvaluationProgramGroup; /** - * Apply tranche filtering to VCF based on scores from the INFO field. + * Apply tranche filtering to VCF based on scores from an annotation in the INFO field. * *

Inputs

*
    *
  • The input variants to tranche filter.
  • - *
  • snp-truth-vcf A VCF containing known common SNP sites
  • - *
  • indel-truth-vcf A VCF containing known and common INDEL sites.
  • + *
  • resource A VCF containing known SNP and/or INDEL sites. Can be supplied as many times as necessary.
  • *
  • info-key The key from the INFO field of the VCF which contains the values that will be used to filter.
  • *
  • tranche List of percent sensitivities to the known sites at which we will filter. Must be between 0 and 100.
  • *
@@ -53,139 +56,185 @@ *
  * gatk FilterVariantTranches \
  *   -V input.vcf.gz \
- *   --snp-truth-vcf hapmap.vcf \
- *   --indel-truth-vcf mills.vcf \
+ *   --resource hapmap.vcf \
+ *   --resource mills.vcf \
  *   --info-key CNN_1D \
  *   --tranche 99.9 --tranche 99.0 --tranche 95 \
- *   --max-sites 8000 \
  *   -O filtered.vcf
  * 
* */ +@DocumentedFeature +@ExperimentalFeature @CommandLineProgramProperties( - summary = "Apply tranche filtering based on a truth VCF of known common sites of variation and score from VCF INFO field", + summary = "Apply tranche filtering based on a truth VCF of known common sites of variation and a score from VCF INFO field", oneLineSummary = "Apply tranche filtering", programGroup = VariantFilteringProgramGroup.class ) -@DocumentedFeature -@ExperimentalFeature -public class FilterVariantTranches extends CommandLineProgram { - @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, - shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, - doc = "Input VCF file") - private String inputVcf = null; +public class FilterVariantTranches extends TwoPassVariantWalker { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF file") private String outputVcf = null; - @Argument(fullName = "snp-truth-vcf", shortName = "snp-truth-vcf", doc = "Input file of known common SNP sites.") - private String snpTruthVcf = null; + @Argument(fullName="tranche", + shortName="t", + doc="The levels of truth sensitivity at which to slice the data. (in percents, i.e. 
99.9 for 99.9 percent and 1.0 for 1 percent)", + optional=true) + private List tranches = new ArrayList<>(Arrays.asList(99.9, 99.99)); - @Argument(fullName = "indel-truth-vcf", shortName = "indel-truth-vcf", doc = "Input file of known common INDEL sites.") - private String indelTruthVcf = null; + @Argument(fullName="resource", + shortName = "resource", + doc="A list of validated VCFs with known sites of common variation", + optional=false) + private List> resources = new ArrayList<>(); @Argument(fullName = "info-key", shortName = "info-key", doc = "The key must be in the INFO field of the input VCF.") - private String infoKey = GATKVCFConstants.CNN_1D_KEY; + private String infoKey = GATKVCFConstants.CNN_2D_KEY; - @Argument(fullName = "max-sites", shortName = "max-sites", doc = "Maximum number of truth VCF sites to check.") - private int maxSites = 1200; + private VariantContextWriter vcfWriter; + private List snpScores = new ArrayList<>(); + private List snpCutoffs = new ArrayList<>(); + private List indelScores = new ArrayList<>(); + private List indelCutoffs = new ArrayList<>(); - @Argument(fullName="tranche", - shortName="t", - doc="The levels of truth sensitivity at which to slice the data. (in percents, i.e. 99.9 for 99.9 percent and 1.0 for 1 percent)", - optional=true) - private List tranches = new ArrayList(Arrays.asList(99.9, 99.0, 90.0)); + private int scoredSnps = 0; + private int filteredSnps = 0; + private int scoredIndels = 0; + private int filteredIndels = 0; - // Start the Python executor. 
This does not actually start the Python process, but fails if python can't be located - final PythonScriptExecutor pythonExecutor = new PythonScriptExecutor(true); - private File tempFile, tempFileIdx; - private String trancheString; + @Override + public void onTraversalStart() { + if (tranches.size() < 1 || tranches.stream().anyMatch(d -> d < 0 || d >= 100.0)){ + throw new GATKException("At least 1 tranche value must be given and all tranches must be greater than 0 and less than 100."); + } + tranches = tranches.stream().distinct().collect(Collectors.toList()); + tranches.sort(Double::compareTo); + vcfWriter = createVCFWriter(new File(outputVcf)); + writeVCFHeader(vcfWriter); + } @Override - protected void onStartup() { - /* check for successful import of libraries */ - PythonScriptExecutor.checkPythonEnvironmentForPackage("vqsr_cnn"); - trancheString = " --tranches " + tranches.stream().map(d -> Double.toString(d)).collect(Collectors.joining(" ")); - try { - final String idxExt = ".tbi"; - tempFile = File.createTempFile(outputVcf, "_temp.vcf.gz"); - tempFile.deleteOnExit(); - tempFileIdx = new File(tempFile.getAbsolutePath()+idxExt); - tempFileIdx.deleteOnExit(); - } catch (IOException e) { - throw new GATKException("Error when creating temp files.", e); + public void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + if (!variant.hasAttribute(infoKey)){ + return; + } else if (variant.isSNP()){ + scoredSnps++; + } else if (variant.isIndel()){ + scoredIndels++; + } + + for (FeatureInput featureSource : resources) { + for (VariantContext v : featureContext.getValues(featureSource)) { + if (variant.isSNP()){ + if(variant.getAlternateAlleles().stream().anyMatch(v.getAlternateAlleles()::contains)) { + snpScores.add(Double.parseDouble((String) variant.getAttribute(infoKey))); + return; + } + } else if (variant.isIndel()){ + 
if(variant.getAlternateAlleles().stream().anyMatch(v.getAlternateAlleles()::contains)){ + indelScores.add(Double.parseDouble((String)variant.getAttribute(infoKey))); + return; + } + } + } } } @Override - protected Object doWork() { - final Resource pythonScriptResource = new Resource("tranches.py", FilterVariantTranches.class); - final List snpArguments = new ArrayList<>(Arrays.asList( - "--mode", "write_snp_tranches", - "--input_vcf", inputVcf, - "--train_vcf", snpTruthVcf, - "--score_keys", infoKey, - trancheString, - "--samples", Integer.toString(maxSites), - "--output_vcf", tempFile.getAbsolutePath())); - - logger.info("SNP Args are:"+ Arrays.toString(snpArguments.toArray())); - final boolean pythonReturnCode = pythonExecutor.executeScript( - pythonScriptResource, - null, - snpArguments - ); - - final FeatureCodec codec = FeatureManager.getCodecForFile(tempFile); - final Index index = createAppropriateIndexInMemory(codec, tempFile, tempFileIdx); - - try { - index.write(tempFileIdx); - } catch (final IOException e) { - throw new GATKException(String.format("Could not write temporary index to file: %s", tempFileIdx.getAbsolutePath()), e); + public void afterFirstPass() { + logger.info(String.format("Found %d SNPs %d indels with INFO score key:%s.", scoredSnps, scoredIndels, infoKey)); + logger.info(String.format("Found %d SNPs %d indels in the resources.", snpScores.size(), indelScores.size())); + + if (scoredSnps == 0 || scoredIndels == 0 || snpScores.size() == 0 || indelScores.size() == 0){ + throw new GATKException("VCF must contain SNPs and indels with scores and resources must contain matching SNPs and indels."); + } + + Collections.sort(snpScores, Collections.reverseOrder()); + Collections.sort(indelScores, Collections.reverseOrder()); + + for(double t : tranches) { + int snpIndex = (int)((t/100.0)*(double)(snpScores.size()-1)); + snpCutoffs.add(snpScores.get(snpIndex)); + int indelIndex = (int)((t/100.0)*(double)(indelScores.size()-1)); + 
indelCutoffs.add(indelScores.get(indelIndex)); } - final List indelArguments = new ArrayList<>(Arrays.asList( - "--mode", "write_indel_tranches", - "--input_vcf", tempFile.getAbsolutePath(), - "--train_vcf", indelTruthVcf, - "--score_keys", infoKey, - trancheString, - "--samples", Integer.toString(maxSites), - "--output_vcf", outputVcf)); - - logger.info("Did SNP filtering, now INDELs. Arguments are:"+ Arrays.toString(indelArguments.toArray())); - final boolean pythonReturnCode2 = pythonExecutor.executeScript( - pythonScriptResource, - null, - indelArguments - ); - return pythonReturnCode && pythonReturnCode2; } - // Stolen and adapted from IndexFeatureFile.java - private Index createAppropriateIndexInMemory(final FeatureCodec codec, File featureFile, File indexFile) { - try { - // For block-compression files, write a Tabix index - if (IOUtil.hasBlockCompressedExtension(featureFile)) { - // Creating tabix indices with a non standard extensions can cause problems so we disable it - if (!indexFile.getAbsolutePath().endsWith(TabixUtils.STANDARD_INDEX_EXTENSION)) { - throw new UserException("The index for " + featureFile + " must be written to a file with a \"" + TabixUtils.STANDARD_INDEX_EXTENSION + "\" extension"); - } + @Override + protected void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + + if (variant.hasAttribute(infoKey)) { + final double score = Double.parseDouble((String) variant.getAttribute(infoKey)); + if (variant.isSNP() && isTrancheFiltered(score, snpCutoffs)) { + builder.filter(filterStringFromScore(score, snpCutoffs)); + filteredSnps++; + } else if (variant.isIndel() && isTrancheFiltered(score, indelCutoffs)) { + builder.filter(filterStringFromScore(score, indelCutoffs)); + filteredIndels++; + } + } + + vcfWriter.add(builder.make()); + } + + @Override + public void closeTool() { + 
logger.info(String.format("Filtered %d SNPs out of %d and filtered %d indels out of %d with INFO score: %s.", + filteredSnps, scoredSnps, filteredIndels, scoredIndels, infoKey)); + + if ( vcfWriter != null ) { + vcfWriter.close(); + } + } + + private void writeVCFHeader(VariantContextWriter vcfWriter) { + // setup the header fields + final VCFHeader inputHeader = getHeaderForVariants(); + final Set inputHeaders = inputHeader.getMetaDataInSortedOrder(); + final Set hInfo = new HashSet<>(inputHeaders); + + if( tranches.size() >= 2 ) { + for(int i = 0; i < tranches.size() - 1; i++) { + String filterKey = filterKeyFromTranches(infoKey, tranches.get(i), tranches.get(i+1)); + String filterDescription = filterDescriptionFromTranches(infoKey, tranches.get(i), tranches.get(i+1)); + hInfo.add(new VCFFilterHeaderLine(filterKey, filterDescription)); + } + } + String filterKey = filterKeyFromTranches(infoKey, tranches.get(tranches.size()-1), 100.0); + String filterDescription = filterDescriptionFromTranches(infoKey, tranches.get(tranches.size()-1), 100.0); + hInfo.add(new VCFFilterHeaderLine(filterKey, filterDescription)); + final TreeSet samples = new TreeSet<>(); + samples.addAll(inputHeader.getGenotypeSamples()); + hInfo.addAll(getDefaultToolVCFHeaderLines()); + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } - return IndexFactory.createIndex(featureFile, codec, IndexFactory.IndexType.TABIX, null); + private String filterKeyFromTranches(String infoKey, double t1, double t2){ + return String.format("%s_Tranche_%.2f_%.2f", infoKey, t1, t2); + } + + private String filterDescriptionFromTranches(String infoKey, double t1, double t2){ + return String.format("Truth sensitivity between %.2f and %.2f for info key %s", t1, t2, infoKey); + } + + private boolean isTrancheFiltered(double score, List cutoffs) { + return score <= cutoffs.get(0); + } - } else { - // Optimize indices for other kinds of files for seek time / querying - return 
IndexFactory.createDynamicIndex(featureFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + private String filterStringFromScore(double score, List cutoffs){ + for (int i = 0; i < cutoffs.size(); i++){ + if (score > cutoffs.get(i) && i == 0){ + throw new GATKException("Trying to add a filter to a passing variant."); + } else if (score > cutoffs.get(i)){ + return filterKeyFromTranches(infoKey, tranches.get(i-1), tranches.get(i)); } - } catch (TribbleException e) { - // Underlying cause here is usually a malformed file, but can also be things like - // "codec does not support tabix" - throw new UserException.CouldNotIndexFile(featureFile, e); } + return filterKeyFromTranches(infoKey, tranches.get(tranches.size()-1), 100.0); } } diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py deleted file mode 100644 index c7e6e4c4c25..00000000000 --- a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py +++ /dev/null @@ -1,124 +0,0 @@ -import os -import pysam -import vqsr_cnn -from collections import Counter - - -def run(): - args = vqsr_cnn.parse_args() - if 'write_snp_tranches' == args.mode: - write_tranches(args) - elif 'write_indel_tranches' == args.mode: - write_tranches(args, variant_type='INDEL') - else: - raise ValueError('Unknown tranche mode:', args.mode) - - -def write_tranches(args, variant_type='SNP'): - score_key = args.score_keys[0] - tranches = sorted(args.tranches, reverse=True) - - vcf_truth = pysam.VariantFile(args.train_vcf, 'rb') - vcf_reader = pysam.VariantFile(args.input_vcf, 'r') - - print('Writing sensitivity tranches:', tranches, ' with score_key:', score_key, 'for', variant_type, 'variants.') - stats = Counter() - scores = [] - - print('Iterate over tranche truth VCF.') - for variant in vcf_truth: - if not variant.alts: - stats['Variant without alts'] += 1 - continue - for allele_idx, allele 
in enumerate(variant.alts): - v_scored = allele_in_vcf(allele, variant, vcf_reader) - if not v_scored: - stats['Variant not in input VCF'] += 1 - continue - scores.append(v_scored.info[score_key]) - stats['count'] += 1 - if stats['count']%2000==0: - for k in stats.keys(): - print(k, ' has:', stats[k]) - if stats['count'] >= args.samples: - break - - scores = sorted(scores, reverse=True) - thresholds = [] - - for i,t in enumerate(tranches): - thresholds.append(scores[int(clamp((t/100.0)*len(scores), 0, len(scores)-1))]) - - if len(thresholds) == 1: - filter_id = score_key + 'Tranche' + variant_type + format_tranche(t) + 'to100.0' - vcf_reader.header.filters.add(filter_id, None, None, 'Tranche filtering from '+score_key) - else: - filter_id = score_key + 'Tranche' + variant_type + format_tranche(t) + 'to' + format_tranche(tranches[i-1]) - vcf_reader.header.filters.add(filter_id, None, None, 'Tranche filtering from '+score_key) - print('At tranch:', t, ' score key:', score_key, ' cutoff is:', thresholds[-1], 'filter id is:', filter_id) - - vcf_writer = pysam.VariantFile(args.output_vcf, 'w', header=vcf_reader.header) - for variant in vcf_reader: - if score_key in variant.info: - if variant_type == 'SNP' and is_snp(variant) or variant_type == 'INDEL' and is_indel(variant): - variant.filter.add(score_to_filter_status(variant.info[score_key], - score_key, variant_type, tranches, thresholds)) - - vcf_writer.write(variant) - - for k in stats.keys(): - print(k, ' has:', stats[k]) - - -def allele_in_vcf(allele, variant, vcf_ram): - ''' Check if variant's allele is in a VCF file. 
- - Arguments - allele: the allele from the provided variant that we are checking - variant: the variant whose allele we are looking for - vcf_ram: the VCF we look in, must have an index (tbi, or idx) - - Returns - variant if it is found otherwise None - ''' - variants = vcf_ram.fetch(variant.contig, variant.pos-1, variant.pos) - - for v in variants: - if v.contig == variant.contig and v.pos == variant.pos and allele in v.alts: - return v - - return None - - -def score_to_filter_status(score, score_key, variant_type, tranches, thresholds): - - for i, threshold in enumerate(thresholds): - if score < threshold: - if i == 0: - high_thresh = '100.0' - else: - high_thresh = format_tranche(tranches[i-1]) - low_thresh = format_tranche(tranches[i]) - - return score_key + 'Tranche' + variant_type + low_thresh + 'to' + high_thresh - - return 'PASS' - - -def format_tranche(t): - return '{0:.1f}'.format(100*t) - -def is_snp(variant): - return len(variant.ref) == 1 and any(map(lambda x: len(x) == 1, variant.alts)) - -def is_indel(variant): - return len(variant.ref) > 1 or any(map(lambda x: len(x) > 1, variant.alts)) - - -def clamp(n, minn, maxn): - return max(min(maxn, n), minn) - - -# Back to the top! 
-if "__main__" == __name__: - run() \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java index 04c5cad7d90..51b08c0bca2 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java @@ -95,11 +95,10 @@ public void testTranches() { args.add("FilterVariantTranches") .addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, outputVCF.getAbsolutePath()) - .addArgument("snp-truth-vcf", snpTruthVCF) - .addArgument("indel-truth-vcf", indelTruthVCF) + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) .addArgument("tranche", "99.0") .addArgument("tranche", "95.0") - .addArgument("max-sites", "2000") .addArgument("info-key", "VQSLOD"); new Main().instanceMain(args.getArgsArray()); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java new file mode 100644 index 00000000000..ef43fde0dc7 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java @@ -0,0 +1,117 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr; + +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.test.IntegrationTestSpec; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; + +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.io.IOException; + +public class FilterVariantTranchesIntegrationTest 
extends CommandLineProgramTest { + + + /** + * Run the tool on a small test VCF. + */ + @Test + public void testTrancheFiltering() throws IOException { + final boolean newExpectations = false; + + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "99.0") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + if(newExpectations){ + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf"); + runCommandLine(argsBuilder); + } else { + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf")); + spec.executeTest("testTrancheFiltering", this); + } + } + + @Test + public void testTrancheFilteringDuplicate() throws IOException { + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, 
trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "99.0") + .addArgument("tranche", "99.0") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf")); + spec.executeTest("testTrancheFilteringDuplicate", this); + } + + @Test + public void testTrancheFilteringTranches() throws IOException { + final boolean newExpectations = false; + + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "95.0") + .addArgument("tranche", "99.0") + .addArgument("tranche", "99.9") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + if(newExpectations){ + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf"); + runCommandLine(argsBuilder); + } else { + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf")); + 
spec.executeTest("testTrancheFilteringTranches", this); + } + } + + @Test + public void testTrancheFilteringTranchesOrder() throws IOException { + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "99.9") + .addArgument("tranche", "95.0") + .addArgument("tranche", "99.0") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf")); + spec.executeTest("testTrancheFilteringTranchesOrder", this); + } + +} diff --git a/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf new file mode 100644 index 00000000000..1506afd2f89 --- /dev/null +++ b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5669299f8d1781cd96cad26c8d6983698ab203569485ceb5ccf82bb90b820577 +size 5375112 diff --git a/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf new file mode 100644 index 00000000000..4a90747f375 --- /dev/null +++ 
b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d8ce8df57d0434172d99c55f6b2ac8c68d9e64509914d2ebd4fb12c0c48dcc9 +size 5332769 diff --git a/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz new file mode 100644 index 00000000000..7181008349f --- /dev/null +++ b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a894fbe4544f00bd5c3c643086cef954f489495935cf9c442ac8bad4ffc6c19 +size 1013156 diff --git a/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi new file mode 100644 index 00000000000..23c2a2fe97e --- /dev/null +++ b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3d08f0722db860860c2514be8289f26c0cc3f8be9069d81ab81f779052d067 +size 4601 From 26175adb4fd98363c94ec98d204fbadae3a4cc25 Mon Sep 17 00:00:00 2001 From: Samuel Friedman Date: Wed, 20 Jun 2018 09:03:41 -0400 Subject: [PATCH 2/3] cleanup --- .../tools/walkers/vqsr/FilterVariantTranches.java | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java index a9528aac4a0..4c567da2906 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java @@ -1,14 +1,5 @@ package org.broadinstitute.hellbender.tools.walkers.vqsr; -import htsjdk.samtools.util.IOUtil; -import htsjdk.tribble.AbstractFeatureReader; -import 
htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.tribble.util.TabixUtils; - import java.util.*; import java.io.File; import java.util.stream.Collectors; @@ -21,17 +12,15 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.hellbender.engine.*; - import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; -import picard.cmdline.programgroups.VariantFilteringProgramGroup; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import picard.cmdline.programgroups.VariantEvaluationProgramGroup; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; /** * Apply tranche filtering to VCF based on scores from an annotation in the INFO field. 
From c88dbde5f20e700bcc0594164535f9b29114abdc Mon Sep 17 00:00:00 2001 From: Samuel Friedman Date: Thu, 21 Jun 2018 08:48:26 -0400 Subject: [PATCH 3/3] fix tests --- .../FilterVariantTranchesIntegrationTest.java | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java index ef43fde0dc7..1857781f6b3 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java @@ -18,29 +18,23 @@ public class FilterVariantTranchesIntegrationTest extends CommandLineProgramTes */ @Test public void testTrancheFiltering() throws IOException { - final boolean newExpectations = false; - final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") .addArgument("resource", snpTruthVCF) .addArgument("resource", indelTruthVCF) .addArgument("tranche", "99.0") .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - if(newExpectations){ - argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf"); - runCommandLine(argsBuilder); - } else { - 
argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); - final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf")); - spec.executeTest("testTrancheFiltering", this); - } + spec.executeTest("testTrancheFiltering", this); + } @Test @@ -66,14 +60,13 @@ public void testTrancheFilteringDuplicate() throws IOException { @Test public void testTrancheFilteringTranches() throws IOException { - final boolean newExpectations = false; - final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") .addArgument("resource", snpTruthVCF) .addArgument("resource", indelTruthVCF) .addArgument("tranche", "95.0") @@ -82,15 +75,9 @@ public void testTrancheFilteringTranches() throws IOException { .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - if(newExpectations){ - argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf"); - runCommandLine(argsBuilder); - } else { - argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); - final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), Arrays.asList(largeFileTestDir + 
"VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf")); - spec.executeTest("testTrancheFilteringTranches", this); - } + spec.executeTest("testTrancheFilteringTranches", this); } @Test