From ce34240673d642a7a8b476bc9aad1e0a21dbb8f1 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Sat, 7 Apr 2018 23:51:27 -0400 Subject: [PATCH] GGA mode for Mutect (#4601) --- scripts/mutect2_wdl/mutect2.wdl | 7 +++ .../walkers/genotyper/GenotypingEngine.java | 2 +- .../GenotypingGivenAllelesUtils.java | 4 +- .../StandardCallerArgumentCollection.java | 7 +++ .../HaplotypeCallerEngine.java | 4 +- .../tools/walkers/mutect/Mutect2Engine.java | 16 +++++- .../mutect/SomaticGenotypingEngine.java | 46 ++++++++++++++---- .../mutect/Mutect2IntegrationTest.java | 26 +++++++++- .../hellbender/tools/mutect/gga_mode.vcf | 33 +++++++++++++ .../hellbender/tools/mutect/gga_mode.vcf.idx | Bin 0 -> 369 bytes 10 files changed, 130 insertions(+), 15 deletions(-) create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf.idx diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 7b05f39e44f..23ceb18fd46 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -84,6 +84,8 @@ workflow Mutect2 { Boolean make_bamout_or_default = select_first([make_bamout, false]) Boolean? compress_vcfs Boolean compress = select_first([compress_vcfs, false]) + File? gga_vcf + File? gga_vcf_idx # oncotator inputs Boolean? run_oncotator @@ -182,6 +184,8 @@ workflow Mutect2 { m2_extra_args = m2_extra_args, make_bamout = make_bamout_or_default, compress = compress, + gga_vcf = gga_vcf, + gga_vcf_idx = gga_vcf_idx, gatk_override = gatk_override, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, @@ -425,6 +429,8 @@ task M2 { String? m2_extra_args Boolean? make_bamout Boolean compress + File? gga_vcf + File? gga_vcf_idx String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" @@ -468,6 +474,7 @@ task M2 { ${"--germline-resource " + gnomad} \ ${"-pon " + pon} \ ${"-L " + intervals} \ + ${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \ -O "${output_vcf}" \ ${true='--bam-output bamout.bam' false='' make_bamout} \ ${m2_extra_args} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java index bfc671fdd4f..c2dbbafa47f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java @@ -509,7 +509,7 @@ protected final VariantCallContext emptyCallContext(final FeatureContext feature if ( configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { final VariantContext ggaVc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, - rawContext.getLocation(), false, logger, configuration.alleles); + rawContext.getLocation(), false, configuration.genotypeFilteredAlleles, logger, configuration.alleles); if (ggaVc == null) { return null; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingGivenAllelesUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingGivenAllelesUtils.java index d0301ebad43..43c36a806a8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingGivenAllelesUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingGivenAllelesUtils.java @@ -27,6 +27,7 @@ public final class GenotypingGivenAllelesUtils { * @param tracker the meta data tracker. * @param loc the query location. * @param snpsOnly whether we only should consider SNP variation. + * @param keepFiltered whether to include filtered variants * @param logger where to output warnings. * @param allelesBinding the target variation context binding containing the given alleles. * @return never {@code null} @@ -34,6 +35,7 @@ public final class GenotypingGivenAllelesUtils { public static VariantContext composeGivenAllelesVariantContextFromRod(final FeatureContext tracker, final Locatable loc, final boolean snpsOnly, + final boolean keepFiltered, final Logger logger, final FeatureInput allelesBinding) { Utils.nonNull(tracker, "tracker may not be null"); @@ -43,7 +45,7 @@ public static VariantContext composeGivenAllelesVariantContextFromRod(final Feat // search for usable record for ( final VariantContext rodVc : tracker.getValues(allelesBinding, new SimpleInterval(loc)) ) { - if ( rodVc != null && ! rodVc.isFiltered() && (! snpsOnly || rodVc.isSNP() )) { + if ( rodVc != null && (keepFiltered || rodVc.isNotFiltered()) && (! snpsOnly || rodVc.isSNP() )) { if ( vc == null ) { vc = rodVc; } else { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/StandardCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/StandardCallerArgumentCollection.java index c0aadf5f9ff..677a68113a2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/StandardCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/StandardCallerArgumentCollection.java @@ -55,6 +55,13 @@ public void copyStandardCallerArgsFrom( final StandardCallerArgumentCollection o @Argument(fullName="alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", optional=true) public FeatureInput alleles; + /** + * When set to true an when in GENOTYPE_GIVEN_ALLELES mode all given alleles, even filtered ones, are genotyped + */ + @Advanced + @Argument(fullName = "genotype-filtered-alleles", doc = "Whether to genotype all given alleles, even filtered ones, --genotyping_mode is GENOTYPE_GIVEN_ALLELES", optional = true) + public boolean genotypeFilteredAlleles = false; + /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 43d4e80782c..e957cc66803 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -416,7 +416,7 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence public ActivityProfileState isActive( final AlignmentContext context, final ReferenceContext ref, final FeatureContext features ) { if ( hcArgs.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, ref.getInterval(), false, logger, hcArgs.alleles); + final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, ref.getInterval(), false, hcArgs.genotypeFilteredAlleles, logger, hcArgs.alleles); if( vcFromAllelesRod != null ) { return new ActivityProfileState(ref.getInterval(), 1.0); } @@ -489,7 +489,7 @@ public List callRegion(final AssemblyRegion region, final Featur final List givenAlleles = new ArrayList<>(); if ( hcArgs.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { - features.getValues(hcArgs.alleles).stream().filter(VariantContext::isNotFiltered).forEach(givenAlleles::add); + features.getValues(hcArgs.alleles).stream().filter(vc -> hcArgs.genotypeFilteredAlleles || vc.isNotFiltered()).forEach(givenAlleles::add); // No alleles found in this region so nothing to do! if ( givenAlleles.isEmpty() ) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java index c446c2f731f..025138d9c86 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java @@ -15,6 +15,7 @@ import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingGivenAllelesUtils; import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingOutputMode; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; @@ -37,6 +38,7 @@ import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import java.util.*; +import java.util.stream.Collectors; /** * Created by davidben on 9/15/16. @@ -212,8 +214,12 @@ public List callRegion(final AssemblyRegion originalAssemblyRegi return NO_CALLS; } + final List givenAlleles = MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ? + featureContext.getValues(MTAC.alleles).stream().filter(vc -> MTAC.genotypeFilteredAlleles || vc.isNotFiltered()).collect(Collectors.toList()) : + Collections.emptyList(); + final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header); - final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, Collections.emptyList(), MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner); + final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(); final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents); if (!trimmingResult.isVariationPresent()) { @@ -242,6 +248,7 @@ public List callRegion(final AssemblyRegion originalAssemblyRegi referenceContext, regionForGenotyping.getSpan(), featureContext, + givenAlleles, header); writeBamOutput(assemblyResult, readLikelihoods, calledHaplotypes); @@ -285,6 +292,13 @@ public void shutdown() { @Override public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext featureContext) { + if ( MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(featureContext, ref.getInterval(), false, MTAC.genotypeFilteredAlleles, logger, MTAC.alleles); + if( vcFromAllelesRod != null ) { + return new ActivityProfileState(ref.getInterval(), 1.0); + } + } + final byte refBase = ref.getBase(); final SimpleInterval refInterval = ref.getInterval(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java index 875596490c8..7877baef77d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java @@ -44,9 +44,6 @@ public class SomaticGenotypingEngine extends AssemblyBasedCallerGenotypingEngine private final String matchedNormalSampleName; final boolean hasNormal; - //Mutect2 does not run in GGA mode - private static final List NO_GIVEN_ALLELES = Collections.emptyList(); - // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { @Override @@ -88,6 +85,7 @@ public CalledHaplotypes callMutations( final ReferenceContext referenceContext, final SimpleInterval activeRegionWindow, final FeatureContext featureContext, + final List givenAlleles, final SAMFileHeader header) { Utils.nonNull(log10ReadLikelihoods, "likelihoods are null"); Utils.validateArg(log10ReadLikelihoods.numberOfSamples() > 0, "likelihoods have no samples"); @@ -96,9 +94,11 @@ public CalledHaplotypes callMutations( final List haplotypes = log10ReadLikelihoods.alleles(); - // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference - // that carry events among the haplotypes - final List startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), NO_GIVEN_ALLELES).stream() + // Note: passing an empty list of activeAllelesToGenotype is the correct behavior even when givenAlleles is + // non-empty. At this point any given alleles have already been injected into the haplotypes, and passing + // givenAlleles to {@code decomposeHaplotypesIntoVariantContexts} actually overrides any non-given (discovery) alleles, which + // is not what we want. + final List startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), Collections.emptyList()).stream() .filter(loc -> activeRegionWindow.getStart() <= loc && loc <= activeRegionWindow.getEnd()) .collect(Collectors.toList()); @@ -106,7 +106,9 @@ public CalledHaplotypes callMutations( final List returnCalls = new ArrayList<>(); for( final int loc : startPosKeySet ) { - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES); + // Note: as above, passing an empty list of activeAllelesToGenotype is the correct behavior even when givenAlleles is + // non-empty, for the same reason. If you don't believe this, check {@code testGivenAllelesMode} in {@link Mutect2IntegrationTest}. + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, Collections.emptyList()); final VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc); if( mergedVC == null ) { continue; @@ -126,9 +128,19 @@ public CalledHaplotypes callMutations( final Optional> normalLog10Odds = getForNormal(() -> diploidAltLog10Odds(log10NormalMatrix.get())); final Optional> normalArtifactLog10Odds = getForNormal(() -> somaticLog10Odds(log10NormalMatrix.get())); + final List> givenAltAndRefAllelesInOriginalContext = getVCsAtThisLocation(Collections.emptyList(), loc, givenAlleles).stream() + .flatMap(vc -> vc.getAlternateAlleles().stream().map(allele -> ImmutablePair.of(allele, vc.getReference()))).collect(Collectors.toList()); + + final Set allelesConsistentWithGivenAlleles = mergedVC.getAlternateAlleles().stream() + .map(allele -> ImmutablePair.of(allele, mergedVC.getReference())) + .filter(altAndRef -> givenAltAndRefAllelesInOriginalContext.stream().anyMatch(givenAltAndRef -> allelesAreConsistent(givenAltAndRef, altAndRef))) + .map(altAndRefPair -> altAndRefPair.getLeft()) + .collect(Collectors.toSet()); + final List somaticAltAlleles = mergedVC.getAlternateAlleles().stream() - .filter(allele -> tumorLog10Odds.getAlt(allele) > MTAC.emissionLodThreshold) - .filter(allele -> !hasNormal || MTAC.genotypeGermlineSites || normalLog10Odds.get().getAlt(allele) > MTAC.normalLodThreshold) + .filter(allele -> allelesConsistentWithGivenAlleles.contains(allele) || + ((tumorLog10Odds.getAlt(allele) > MTAC.emissionLodThreshold) && + (!hasNormal || MTAC.genotypeGermlineSites || normalLog10Odds.get().getAlt(allele) > MTAC.normalLodThreshold))) .collect(Collectors.toList()); final List allSomaticAlleles = ListUtils.union(Arrays.asList(mergedVC.getReference()), somaticAltAlleles); if (somaticAltAlleles.isEmpty()) { @@ -176,6 +188,22 @@ public CalledHaplotypes callMutations( return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes); } + // check whether two alleles coming from different variant contexts and with possibly different reference alleles + // could in fact be the same. The condition is that one is a prefix of the other + private boolean allelesAreConsistent(final Pair altAndRef1, final Pair altAndRef2) { + final Allele alt1 = altAndRef1.getLeft(); + final Allele alt2 = altAndRef2.getLeft(); + if (alt1.isSymbolic() || alt2.isSymbolic()) { + return false; + } else { + final int sizeDiff1 = alt1.length() - altAndRef1.getRight().length(); + final int sizeDiff2 = alt2.length() - altAndRef2.getRight().length(); + return (sizeDiff1 == sizeDiff2) && (alt1.length() < alt2.length() ? + alt1.basesMatch(Arrays.copyOf(alt2.getBases(), alt1.length())) : + alt2.basesMatch(Arrays.copyOf(alt1.getBases(), alt2.length()))); + } + } + // compute the likelihoods that each allele is contained at some allele fraction in the sample private PerAlleleCollection somaticLog10Odds(final LikelihoodMatrix log10Matrix) { final double log10EvidenceWithAllAlleles = log10Matrix.numberOfReads() == 0 ? 0 : diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java index 6d7e5e37b3c..905928e5d8c 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.mutect; import htsjdk.samtools.SamFiles; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.collections4.ListUtils; @@ -26,6 +27,7 @@ import java.nio.file.Files; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -201,7 +203,6 @@ public void testTumorNormal() throws Exception { } // run tumor-only using our mini gnomAD on NA12878, which is not a tumor - // we're just making sure nothing blows up @Test public void testTumorOnly() throws Exception { Utils.resetRandomGenerator(); @@ -231,6 +232,29 @@ public void testTumorOnly() throws Exception { Assert.assertTrue(numVariantsPassingFilters < 2); } + @Test + public void testGivenAllelesMode() throws Exception { + Utils.resetRandomGenerator(); + final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); + + final File givenAllelesVcf = new File(toolsTestDir, "mutect/gga_mode.vcf"); + final List args = Arrays.asList("-I", NA12878_20_21_WGS_bam, + "-" + M2ArgumentCollection.TUMOR_SAMPLE_SHORT_NAME, "NA12878", + "-R", b37_reference_20_21, + "-L", "20:10000000-10010000", + "-O", unfilteredVcf.getAbsolutePath(), + "--genotyping-mode", "GENOTYPE_GIVEN_ALLELES", + "--alleles", givenAllelesVcf.getAbsolutePath()); + runCommandLine(args); + + final Map> altAllelesByPosition = StreamSupport.stream(new FeatureDataSource(unfilteredVcf).spliterator(), false) + .collect(Collectors.toMap(vc -> vc.getStart(), vc-> vc.getAlternateAlleles())); + for (final VariantContext vc : new FeatureDataSource(givenAllelesVcf)) { + final List altAllelesAtThisLocus = altAllelesByPosition.get(vc.getStart()); + vc.getAlternateAlleles().forEach(a -> Assert.assertTrue(altAllelesAtThisLocus.contains(a))); + } + } + @Test public void testContaminationFilter() throws Exception { Utils.resetRandomGenerator(); diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf new file mode 100644 index 00000000000..c026c7bb72d --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf @@ -0,0 +1,33 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 10000600 . A T . . . +20 10001900 . C A . . . +20 10004094 . A T . . . +20 10004999 . AGT A . . . +20 10006816 . TAAA T . . . diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/gga_mode.vcf.idx new file mode 100644 index 0000000000000000000000000000000000000000..9b6313fbef396d424d2a54f39f013a391f0544ec GIT binary patch literal 369 zcmb_X&1%9x5T0UtsR!RAlOJPgZ^3L;Xj8Bo>1~^2#~2!2*j@8Hf@h!37wCrSp*?rt zxv92UH@D( zJ7XKWw}cyNTPKvH(JIv%+`4ULv91fTRg%1}TElth65V{Bj+?S?(9f;IGfQ~BWa5Qo zGd>ruEXg4B{J;Bmo{E&MAn?Y)I1HkA5=IY^{}>0*OT^V?l`I!YHf6s&rf+%1A%;FT W^Z^bdxWB|Zf;)%q((k|c`tLWM`%sGj literal 0 HcmV?d00001