From 1f732a5365b63629ba92455b1b7e5f5d55d257a9 Mon Sep 17 00:00:00 2001 From: Samuel Friedman Date: Tue, 22 May 2018 19:14:11 -0400 Subject: [PATCH 1/3] tranche filtering in java --- .../tools/walkers/vqsr/CNNVariantTrain.java | 2 +- .../walkers/vqsr/CNNVariantWriteTensors.java | 2 +- .../walkers/vqsr/FilterVariantTranches.java | 277 +++++++++++------- .../hellbender/tools/walkers/vqsr/tranches.py | 124 -------- .../walkers/vqsr/CNNVariantPipelineTest.java | 5 +- .../FilterVariantTranchesIntegrationTest.java | 117 ++++++++ .../g94982_20_1m_10m_tranched_95_99_99.9.vcf | 3 + .../expected/g94982_20_1m_10m_tranched_99.vcf | 3 + .../VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz | 3 + .../g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi | 3 + 10 files changed, 296 insertions(+), 243 deletions(-) delete mode 100644 src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java create mode 100644 src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf create mode 100644 src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf create mode 100644 src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz create mode 100644 src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java index cd0bfc3200c..a11449ddcd9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java @@ -103,7 +103,7 @@ protected void onStartup() { @Override protected Object doWork() { - final Resource pythonScriptResource = new Resource("training.py", FilterVariantTranches.class); + final Resource 
pythonScriptResource = new Resource("training.py", CNNVariantTrain.class); List arguments = new ArrayList<>(Arrays.asList( "--data_dir", inputTensorDir, "--output_dir", outputDir, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java index 53d282d617d..b850e5e8cb1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java @@ -121,7 +121,7 @@ protected void onStartup() { @Override protected Object doWork() { - final Resource pythonScriptResource = new Resource("training.py", FilterVariantTranches.class); + final Resource pythonScriptResource = new Resource("training.py", CNNVariantWriteTensors.class); List arguments = new ArrayList<>(Arrays.asList( "--reference_fasta", reference, "--input_vcf", inputVcf, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java index b8813b38e7e..a9528aac4a0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java @@ -8,35 +8,38 @@ import htsjdk.tribble.index.Index; import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.util.TabixUtils; + +import java.util.*; +import java.io.File; +import java.util.stream.Collectors; + +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import 
org.broadinstitute.hellbender.engine.*; + import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.io.Resource; -import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import picard.cmdline.programgroups.VariantFilteringProgramGroup; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; +import picard.cmdline.programgroups.VariantEvaluationProgramGroup; /** - * Apply tranche filtering to VCF based on scores from the INFO field. + * Apply tranche filtering to VCF based on scores from an annotation in the INFO field. * *

Inputs

*
    *
  • The input variants to tranche filter.
  • - *
  • snp-truth-vcf A VCF containing known common SNP sites
  • - *
  • indel-truth-vcf A VCF containing known and common INDEL sites.
  • + *
  • resource A VCF containing known SNP and/or INDEL sites. Can be supplied as many times as necessary.
  • *
  • info-key The key from the INFO field of the VCF which contains the values that will be used to filter.
  • *
  • tranche List of percent sensitivities to the known sites at which we will filter. Must be between 0 and 100.
  • *
@@ -53,139 +56,185 @@ *
  * gatk FilterVariantTranches \
  *   -V input.vcf.gz \
- *   --snp-truth-vcf hapmap.vcf \
- *   --indel-truth-vcf mills.vcf \
+ *   --resource hapmap.vcf \
+ *   --resource mills.vcf \
  *   --info-key CNN_1D \
  *   --tranche 99.9 --tranche 99.0 --tranche 95 \
- *   --max-sites 8000 \
  *   -O filtered.vcf
  * 
* */ +@DocumentedFeature +@ExperimentalFeature @CommandLineProgramProperties( - summary = "Apply tranche filtering based on a truth VCF of known common sites of variation and score from VCF INFO field", + summary = "Apply tranche filtering based on a truth VCF of known common sites of variation and a score from VCF INFO field", oneLineSummary = "Apply tranche filtering", programGroup = VariantFilteringProgramGroup.class ) -@DocumentedFeature -@ExperimentalFeature -public class FilterVariantTranches extends CommandLineProgram { - @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, - shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, - doc = "Input VCF file") - private String inputVcf = null; +public class FilterVariantTranches extends TwoPassVariantWalker { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF file") private String outputVcf = null; - @Argument(fullName = "snp-truth-vcf", shortName = "snp-truth-vcf", doc = "Input file of known common SNP sites.") - private String snpTruthVcf = null; + @Argument(fullName="tranche", + shortName="t", + doc="The levels of truth sensitivity at which to slice the data. (in percents, i.e. 
99.9 for 99.9 percent and 1.0 for 1 percent)", + optional=true) + private List tranches = new ArrayList<>(Arrays.asList(99.9, 99.99)); - @Argument(fullName = "indel-truth-vcf", shortName = "indel-truth-vcf", doc = "Input file of known common INDEL sites.") - private String indelTruthVcf = null; + @Argument(fullName="resource", + shortName = "resource", + doc="A list of validated VCFs with known sites of common variation", + optional=false) + private List> resources = new ArrayList<>(); @Argument(fullName = "info-key", shortName = "info-key", doc = "The key must be in the INFO field of the input VCF.") - private String infoKey = GATKVCFConstants.CNN_1D_KEY; + private String infoKey = GATKVCFConstants.CNN_2D_KEY; - @Argument(fullName = "max-sites", shortName = "max-sites", doc = "Maximum number of truth VCF sites to check.") - private int maxSites = 1200; + private VariantContextWriter vcfWriter; + private List snpScores = new ArrayList<>(); + private List snpCutoffs = new ArrayList<>(); + private List indelScores = new ArrayList<>(); + private List indelCutoffs = new ArrayList<>(); - @Argument(fullName="tranche", - shortName="t", - doc="The levels of truth sensitivity at which to slice the data. (in percents, i.e. 99.9 for 99.9 percent and 1.0 for 1 percent)", - optional=true) - private List tranches = new ArrayList(Arrays.asList(99.9, 99.0, 90.0)); + private int scoredSnps = 0; + private int filteredSnps = 0; + private int scoredIndels = 0; + private int filteredIndels = 0; - // Start the Python executor. 
This does not actually start the Python process, but fails if python can't be located - final PythonScriptExecutor pythonExecutor = new PythonScriptExecutor(true); - private File tempFile, tempFileIdx; - private String trancheString; + @Override + public void onTraversalStart() { + if (tranches.size() < 1 || tranches.stream().anyMatch(d -> d < 0 || d >= 100.0)){ + throw new GATKException("At least 1 tranche value must be given and all tranches must be greater than 0 and less than 100."); + } + tranches = tranches.stream().distinct().collect(Collectors.toList()); + tranches.sort(Double::compareTo); + vcfWriter = createVCFWriter(new File(outputVcf)); + writeVCFHeader(vcfWriter); + } @Override - protected void onStartup() { - /* check for successful import of libraries */ - PythonScriptExecutor.checkPythonEnvironmentForPackage("vqsr_cnn"); - trancheString = " --tranches " + tranches.stream().map(d -> Double.toString(d)).collect(Collectors.joining(" ")); - try { - final String idxExt = ".tbi"; - tempFile = File.createTempFile(outputVcf, "_temp.vcf.gz"); - tempFile.deleteOnExit(); - tempFileIdx = new File(tempFile.getAbsolutePath()+idxExt); - tempFileIdx.deleteOnExit(); - } catch (IOException e) { - throw new GATKException("Error when creating temp files.", e); + public void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + if (!variant.hasAttribute(infoKey)){ + return; + } else if (variant.isSNP()){ + scoredSnps++; + } else if (variant.isIndel()){ + scoredIndels++; + } + + for (FeatureInput featureSource : resources) { + for (VariantContext v : featureContext.getValues(featureSource)) { + if (variant.isSNP()){ + if(variant.getAlternateAlleles().stream().anyMatch(v.getAlternateAlleles()::contains)) { + snpScores.add(Double.parseDouble((String) variant.getAttribute(infoKey))); + return; + } + } else if (variant.isIndel()){ + 
if(variant.getAlternateAlleles().stream().anyMatch(v.getAlternateAlleles()::contains)){ + indelScores.add(Double.parseDouble((String)variant.getAttribute(infoKey))); + return; + } + } + } } } @Override - protected Object doWork() { - final Resource pythonScriptResource = new Resource("tranches.py", FilterVariantTranches.class); - final List snpArguments = new ArrayList<>(Arrays.asList( - "--mode", "write_snp_tranches", - "--input_vcf", inputVcf, - "--train_vcf", snpTruthVcf, - "--score_keys", infoKey, - trancheString, - "--samples", Integer.toString(maxSites), - "--output_vcf", tempFile.getAbsolutePath())); - - logger.info("SNP Args are:"+ Arrays.toString(snpArguments.toArray())); - final boolean pythonReturnCode = pythonExecutor.executeScript( - pythonScriptResource, - null, - snpArguments - ); - - final FeatureCodec codec = FeatureManager.getCodecForFile(tempFile); - final Index index = createAppropriateIndexInMemory(codec, tempFile, tempFileIdx); - - try { - index.write(tempFileIdx); - } catch (final IOException e) { - throw new GATKException(String.format("Could not write temporary index to file: %s", tempFileIdx.getAbsolutePath()), e); + public void afterFirstPass() { + logger.info(String.format("Found %d SNPs %d indels with INFO score key:%s.", scoredSnps, scoredIndels, infoKey)); + logger.info(String.format("Found %d SNPs %d indels in the resources.", snpScores.size(), indelScores.size())); + + if (scoredSnps == 0 || scoredIndels == 0 || snpScores.size() == 0 || indelScores.size() == 0){ + throw new GATKException("VCF must contain SNPs and indels with scores and resources must contain matching SNPs and indels."); + } + + Collections.sort(snpScores, Collections.reverseOrder()); + Collections.sort(indelScores, Collections.reverseOrder()); + + for(double t : tranches) { + int snpIndex = (int)((t/100.0)*(double)(snpScores.size()-1)); + snpCutoffs.add(snpScores.get(snpIndex)); + int indelIndex = (int)((t/100.0)*(double)(indelScores.size()-1)); + 
indelCutoffs.add(indelScores.get(indelIndex)); } - final List indelArguments = new ArrayList<>(Arrays.asList( - "--mode", "write_indel_tranches", - "--input_vcf", tempFile.getAbsolutePath(), - "--train_vcf", indelTruthVcf, - "--score_keys", infoKey, - trancheString, - "--samples", Integer.toString(maxSites), - "--output_vcf", outputVcf)); - - logger.info("Did SNP filtering, now INDELs. Arguments are:"+ Arrays.toString(indelArguments.toArray())); - final boolean pythonReturnCode2 = pythonExecutor.executeScript( - pythonScriptResource, - null, - indelArguments - ); - return pythonReturnCode && pythonReturnCode2; } - // Stolen and adapted from IndexFeatureFile.java - private Index createAppropriateIndexInMemory(final FeatureCodec codec, File featureFile, File indexFile) { - try { - // For block-compression files, write a Tabix index - if (IOUtil.hasBlockCompressedExtension(featureFile)) { - // Creating tabix indices with a non standard extensions can cause problems so we disable it - if (!indexFile.getAbsolutePath().endsWith(TabixUtils.STANDARD_INDEX_EXTENSION)) { - throw new UserException("The index for " + featureFile + " must be written to a file with a \"" + TabixUtils.STANDARD_INDEX_EXTENSION + "\" extension"); - } + @Override + protected void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + + if (variant.hasAttribute(infoKey)) { + final double score = Double.parseDouble((String) variant.getAttribute(infoKey)); + if (variant.isSNP() && isTrancheFiltered(score, snpCutoffs)) { + builder.filter(filterStringFromScore(score, snpCutoffs)); + filteredSnps++; + } else if (variant.isIndel() && isTrancheFiltered(score, indelCutoffs)) { + builder.filter(filterStringFromScore(score, indelCutoffs)); + filteredIndels++; + } + } + + vcfWriter.add(builder.make()); + } + + @Override + public void closeTool() { + 
logger.info(String.format("Filtered %d SNPs out of %d and filtered %d indels out of %d with INFO score: %s.", + filteredSnps, scoredSnps, filteredIndels, scoredIndels, infoKey)); + + if ( vcfWriter != null ) { + vcfWriter.close(); + } + } + + private void writeVCFHeader(VariantContextWriter vcfWriter) { + // setup the header fields + final VCFHeader inputHeader = getHeaderForVariants(); + final Set inputHeaders = inputHeader.getMetaDataInSortedOrder(); + final Set hInfo = new HashSet<>(inputHeaders); + + if( tranches.size() >= 2 ) { + for(int i = 0; i < tranches.size() - 1; i++) { + String filterKey = filterKeyFromTranches(infoKey, tranches.get(i), tranches.get(i+1)); + String filterDescription = filterDescriptionFromTranches(infoKey, tranches.get(i), tranches.get(i+1)); + hInfo.add(new VCFFilterHeaderLine(filterKey, filterDescription)); + } + } + String filterKey = filterKeyFromTranches(infoKey, tranches.get(tranches.size()-1), 100.0); + String filterDescription = filterDescriptionFromTranches(infoKey, tranches.get(tranches.size()-1), 100.0); + hInfo.add(new VCFFilterHeaderLine(filterKey, filterDescription)); + final TreeSet samples = new TreeSet<>(); + samples.addAll(inputHeader.getGenotypeSamples()); + hInfo.addAll(getDefaultToolVCFHeaderLines()); + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } - return IndexFactory.createIndex(featureFile, codec, IndexFactory.IndexType.TABIX, null); + private String filterKeyFromTranches(String infoKey, double t1, double t2){ + return String.format("%s_Tranche_%.2f_%.2f", infoKey, t1, t2); + } + + private String filterDescriptionFromTranches(String infoKey, double t1, double t2){ + return String.format("Truth sensitivity between %.2f and %.2f for info key %s", t1, t2, infoKey); + } + + private boolean isTrancheFiltered(double score, List cutoffs) { + return score <= cutoffs.get(0); + } - } else { - // Optimize indices for other kinds of files for seek time / querying - return 
IndexFactory.createDynamicIndex(featureFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + private String filterStringFromScore(double score, List cutoffs){ + for (int i = 0; i < cutoffs.size(); i++){ + if (score > cutoffs.get(i) && i == 0){ + throw new GATKException("Trying to add a filter to a passing variant."); + } else if (score > cutoffs.get(i)){ + return filterKeyFromTranches(infoKey, tranches.get(i-1), tranches.get(i)); } - } catch (TribbleException e) { - // Underlying cause here is usually a malformed file, but can also be things like - // "codec does not support tabix" - throw new UserException.CouldNotIndexFile(featureFile, e); } + return filterKeyFromTranches(infoKey, tranches.get(tranches.size()-1), 100.0); } } diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py deleted file mode 100644 index c7e6e4c4c25..00000000000 --- a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/tranches.py +++ /dev/null @@ -1,124 +0,0 @@ -import os -import pysam -import vqsr_cnn -from collections import Counter - - -def run(): - args = vqsr_cnn.parse_args() - if 'write_snp_tranches' == args.mode: - write_tranches(args) - elif 'write_indel_tranches' == args.mode: - write_tranches(args, variant_type='INDEL') - else: - raise ValueError('Unknown tranche mode:', args.mode) - - -def write_tranches(args, variant_type='SNP'): - score_key = args.score_keys[0] - tranches = sorted(args.tranches, reverse=True) - - vcf_truth = pysam.VariantFile(args.train_vcf, 'rb') - vcf_reader = pysam.VariantFile(args.input_vcf, 'r') - - print('Writing sensitivity tranches:', tranches, ' with score_key:', score_key, 'for', variant_type, 'variants.') - stats = Counter() - scores = [] - - print('Iterate over tranche truth VCF.') - for variant in vcf_truth: - if not variant.alts: - stats['Variant without alts'] += 1 - continue - for allele_idx, allele 
in enumerate(variant.alts): - v_scored = allele_in_vcf(allele, variant, vcf_reader) - if not v_scored: - stats['Variant not in input VCF'] += 1 - continue - scores.append(v_scored.info[score_key]) - stats['count'] += 1 - if stats['count']%2000==0: - for k in stats.keys(): - print(k, ' has:', stats[k]) - if stats['count'] >= args.samples: - break - - scores = sorted(scores, reverse=True) - thresholds = [] - - for i,t in enumerate(tranches): - thresholds.append(scores[int(clamp((t/100.0)*len(scores), 0, len(scores)-1))]) - - if len(thresholds) == 1: - filter_id = score_key + 'Tranche' + variant_type + format_tranche(t) + 'to100.0' - vcf_reader.header.filters.add(filter_id, None, None, 'Tranche filtering from '+score_key) - else: - filter_id = score_key + 'Tranche' + variant_type + format_tranche(t) + 'to' + format_tranche(tranches[i-1]) - vcf_reader.header.filters.add(filter_id, None, None, 'Tranche filtering from '+score_key) - print('At tranch:', t, ' score key:', score_key, ' cutoff is:', thresholds[-1], 'filter id is:', filter_id) - - vcf_writer = pysam.VariantFile(args.output_vcf, 'w', header=vcf_reader.header) - for variant in vcf_reader: - if score_key in variant.info: - if variant_type == 'SNP' and is_snp(variant) or variant_type == 'INDEL' and is_indel(variant): - variant.filter.add(score_to_filter_status(variant.info[score_key], - score_key, variant_type, tranches, thresholds)) - - vcf_writer.write(variant) - - for k in stats.keys(): - print(k, ' has:', stats[k]) - - -def allele_in_vcf(allele, variant, vcf_ram): - ''' Check if variant's allele is in a VCF file. 
- - Arguments - allele: the allele from the provided variant that we are checking - variant: the variant whose allele we are looking for - vcf_ram: the VCF we look in, must have an index (tbi, or idx) - - Returns - variant if it is found otherwise None - ''' - variants = vcf_ram.fetch(variant.contig, variant.pos-1, variant.pos) - - for v in variants: - if v.contig == variant.contig and v.pos == variant.pos and allele in v.alts: - return v - - return None - - -def score_to_filter_status(score, score_key, variant_type, tranches, thresholds): - - for i, threshold in enumerate(thresholds): - if score < threshold: - if i == 0: - high_thresh = '100.0' - else: - high_thresh = format_tranche(tranches[i-1]) - low_thresh = format_tranche(tranches[i]) - - return score_key + 'Tranche' + variant_type + low_thresh + 'to' + high_thresh - - return 'PASS' - - -def format_tranche(t): - return '{0:.1f}'.format(100*t) - -def is_snp(variant): - return len(variant.ref) == 1 and any(map(lambda x: len(x) == 1, variant.alts)) - -def is_indel(variant): - return len(variant.ref) > 1 or any(map(lambda x: len(x) > 1, variant.alts)) - - -def clamp(n, minn, maxn): - return max(min(maxn, n), minn) - - -# Back to the top! 
-if "__main__" == __name__: - run() \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java index 04c5cad7d90..51b08c0bca2 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java @@ -95,11 +95,10 @@ public void testTranches() { args.add("FilterVariantTranches") .addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, outputVCF.getAbsolutePath()) - .addArgument("snp-truth-vcf", snpTruthVCF) - .addArgument("indel-truth-vcf", indelTruthVCF) + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) .addArgument("tranche", "99.0") .addArgument("tranche", "95.0") - .addArgument("max-sites", "2000") .addArgument("info-key", "VQSLOD"); new Main().instanceMain(args.getArgsArray()); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java new file mode 100644 index 00000000000..ef43fde0dc7 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java @@ -0,0 +1,117 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr; + +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.test.IntegrationTestSpec; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; + +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.io.IOException; + +public class FilterVariantTranchesIntegrationTest 
extends CommandLineProgramTest { + + + /** + * Run the tool on a small test VCF. + */ + @Test + public void testTrancheFiltering() throws IOException { + final boolean newExpectations = false; + + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "99.0") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + if(newExpectations){ + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf"); + runCommandLine(argsBuilder); + } else { + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf")); + spec.executeTest("testTrancheFiltering", this); + } + } + + @Test + public void testTrancheFilteringDuplicate() throws IOException { + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, 
trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "99.0") + .addArgument("tranche", "99.0") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf")); + spec.executeTest("testTrancheFilteringDuplicate", this); + } + + @Test + public void testTrancheFilteringTranches() throws IOException { + final boolean newExpectations = false; + + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "95.0") + .addArgument("tranche", "99.0") + .addArgument("tranche", "99.9") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + if(newExpectations){ + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf"); + runCommandLine(argsBuilder); + } else { + argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf")); + 
spec.executeTest("testTrancheFilteringTranches", this); + } + } + + @Test + public void testTrancheFilteringTranchesOrder() throws IOException { + final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; + final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; + final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") + .addArgument("resource", snpTruthVCF) + .addArgument("resource", indelTruthVCF) + .addArgument("tranche", "99.9") + .addArgument("tranche", "95.0") + .addArgument("tranche", "99.0") + .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") + .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); + + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf")); + spec.executeTest("testTrancheFilteringTranchesOrder", this); + } + +} diff --git a/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf new file mode 100644 index 00000000000..1506afd2f89 --- /dev/null +++ b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5669299f8d1781cd96cad26c8d6983698ab203569485ceb5ccf82bb90b820577 +size 5375112 diff --git a/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf new file mode 100644 index 00000000000..4a90747f375 --- /dev/null +++ 
b/src/test/resources/large/VQSR/expected/g94982_20_1m_10m_tranched_99.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d8ce8df57d0434172d99c55f6b2ac8c68d9e64509914d2ebd4fb12c0c48dcc9 +size 5332769 diff --git a/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz new file mode 100644 index 00000000000..7181008349f --- /dev/null +++ b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a894fbe4544f00bd5c3c643086cef954f489495935cf9c442ac8bad4ffc6c19 +size 1013156 diff --git a/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi new file mode 100644 index 00000000000..23c2a2fe97e --- /dev/null +++ b/src/test/resources/large/VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3d08f0722db860860c2514be8289f26c0cc3f8be9069d81ab81f779052d067 +size 4601 From 26175adb4fd98363c94ec98d204fbadae3a4cc25 Mon Sep 17 00:00:00 2001 From: Samuel Friedman Date: Wed, 20 Jun 2018 09:03:41 -0400 Subject: [PATCH 2/3] cleanup --- .../tools/walkers/vqsr/FilterVariantTranches.java | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java index a9528aac4a0..4c567da2906 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java @@ -1,14 +1,5 @@ package org.broadinstitute.hellbender.tools.walkers.vqsr; -import htsjdk.samtools.util.IOUtil; -import htsjdk.tribble.AbstractFeatureReader; -import 
htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.tribble.util.TabixUtils; - import java.util.*; import java.io.File; import java.util.stream.Collectors; @@ -21,17 +12,15 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.hellbender.engine.*; - import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; -import picard.cmdline.programgroups.VariantFilteringProgramGroup; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import picard.cmdline.programgroups.VariantEvaluationProgramGroup; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; /** * Apply tranche filtering to VCF based on scores from an annotation in the INFO field. 
From c88dbde5f20e700bcc0594164535f9b29114abdc Mon Sep 17 00:00:00 2001 From: Samuel Friedman Date: Thu, 21 Jun 2018 08:48:26 -0400 Subject: [PATCH 3/3] fix tests --- .../FilterVariantTranchesIntegrationTest.java | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java index ef43fde0dc7..1857781f6b3 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranchesIntegrationTest.java @@ -18,29 +18,23 @@ public class FilterVariantTranchesIntegrationTest extends CommandLineProgramTes */ @Test public void testTrancheFiltering() throws IOException { - final boolean newExpectations = false; - final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") .addArgument("resource", snpTruthVCF) .addArgument("resource", indelTruthVCF) .addArgument("tranche", "99.0") .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - if(newExpectations){ - argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf"); - runCommandLine(argsBuilder); - } else { - 
argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); - final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99.vcf")); - spec.executeTest("testTrancheFiltering", this); - } + spec.executeTest("testTrancheFiltering", this); + } @Test @@ -66,14 +60,13 @@ public void testTrancheFilteringDuplicate() throws IOException { @Test public void testTrancheFilteringTranches() throws IOException { - final boolean newExpectations = false; - final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf"; final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"; final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) + .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s") .addArgument("resource", snpTruthVCF) .addArgument("resource", indelTruthVCF) .addArgument("tranche", "95.0") @@ -82,15 +75,9 @@ public void testTrancheFilteringTranches() throws IOException { .addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT") .addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - if(newExpectations){ - argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf"); - runCommandLine(argsBuilder); - } else { - argsBuilder.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s"); - final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), + final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(), Arrays.asList(largeFileTestDir + 
"VQSR/expected/g94982_20_1m_10m_tranched_95_99_99.9.vcf")); - spec.executeTest("testTrancheFilteringTranches", this); - } + spec.executeTest("testTrancheFilteringTranches", this); } @Test