GGA mode for Mutect (#4601)
davidbenjamin authored Apr 8, 2018
1 parent d48895f commit ce34240
Showing 10 changed files with 130 additions and 15 deletions.
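For orientation, here is a minimal sketch of how the new mode could be invoked on the command line once this change is in place. It is based on the arguments added below (--genotyping-mode, --alleles, and the new --genotype-filtered-alleles flag); the reference, BAM, intervals, sample name, and output paths are placeholders rather than files from this repository.

    # Hedged example: restrict Mutect2 calling to the alleles in a given VCF.
    # Every file name and the sample name below are placeholders.
    gatk --java-options "-Xmx4g" Mutect2 \
        -R reference.fasta \
        -I tumor.bam \
        -tumor tumor_sample_name \
        -L targets.interval_list \
        --genotyping-mode GENOTYPE_GIVEN_ALLELES \
        --alleles given_alleles.vcf \
        --genotype-filtered-alleles true \
        -O gga_calls.vcf.gz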
7 changes: 7 additions & 0 deletions scripts/mutect2_wdl/mutect2.wdl
@@ -84,6 +84,8 @@ workflow Mutect2 {
Boolean make_bamout_or_default = select_first([make_bamout, false])
Boolean? compress_vcfs
Boolean compress = select_first([compress_vcfs, false])
File? gga_vcf
File? gga_vcf_idx

# oncotator inputs
Boolean? run_oncotator
@@ -182,6 +184,8 @@
m2_extra_args = m2_extra_args,
make_bamout = make_bamout_or_default,
compress = compress,
gga_vcf = gga_vcf,
gga_vcf_idx = gga_vcf_idx,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
@@ -425,6 +429,8 @@ task M2 {
String? m2_extra_args
Boolean? make_bamout
Boolean compress
File? gga_vcf
File? gga_vcf_idx

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
@@ -468,6 +474,7 @@
${"--germline-resource " + gnomad} \
${"-pon " + pon} \
${"-L " + intervals} \
${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \
-O "${output_vcf}" \
${true='--bam-output bamout.bam' false='' make_bamout} \
${m2_extra_args}
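Callers of the WDL workflow opt into GGA mode through the two new optional inputs above. A hedged sketch of the corresponding entries in a Cromwell-style inputs JSON, written here as a shell heredoc; the bucket paths are placeholders and all other required workflow inputs are omitted:

    # Hypothetical fragment of a Mutect2 workflow inputs file enabling GGA mode.
    cat > gga_inputs_fragment.json <<'EOF'
    {
      "Mutect2.gga_vcf": "gs://my-bucket/given_alleles.vcf",
      "Mutect2.gga_vcf_idx": "gs://my-bucket/given_alleles.vcf.idx"
    }
    EOF

The keys follow the workflow name (Mutect2) and the input names declared in mutect2.wdl; the fragment would be merged into a complete inputs JSON before running the workflow.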
GenotypingEngine.java
@@ -509,7 +509,7 @@ protected final VariantCallContext emptyCallContext(final FeatureContext feature

if ( configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext ggaVc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features,
rawContext.getLocation(), false, logger, configuration.alleles);
rawContext.getLocation(), false, configuration.genotypeFilteredAlleles, logger, configuration.alleles);
if (ggaVc == null) {
return null;
}
GenotypingGivenAllelesUtils.java
@@ -27,13 +27,15 @@ public final class GenotypingGivenAllelesUtils {
* @param tracker the meta data tracker.
* @param loc the query location.
* @param snpsOnly whether we only should consider SNP variation.
* @param keepFiltered whether to include filtered variants
* @param logger where to output warnings.
* @param allelesBinding the target variation context binding containing the given alleles.
* @return never {@code null}
*/
public static VariantContext composeGivenAllelesVariantContextFromRod(final FeatureContext tracker,
final Locatable loc,
final boolean snpsOnly,
final boolean keepFiltered,
final Logger logger,
final FeatureInput<VariantContext> allelesBinding) {
Utils.nonNull(tracker, "tracker may not be null");
@@ -43,7 +45,7 @@ public static VariantContext composeGivenAllelesVariantContextFromRod(final Feat

// search for usable record
for ( final VariantContext rodVc : tracker.getValues(allelesBinding, new SimpleInterval(loc)) ) {
if ( rodVc != null && ! rodVc.isFiltered() && (! snpsOnly || rodVc.isSNP() )) {
if ( rodVc != null && (keepFiltered || rodVc.isNotFiltered()) && (! snpsOnly || rodVc.isSNP() )) {
if ( vc == null ) {
vc = rodVc;
} else {
StandardCallerArgumentCollection.java
@@ -55,6 +55,13 @@ public void copyStandardCallerArgsFrom( final StandardCallerArgumentCollection o
@Argument(fullName="alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", optional=true)
public FeatureInput<VariantContext> alleles;

/**
* When set to true and when in GENOTYPE_GIVEN_ALLELES mode, all given alleles, even filtered ones, are genotyped
*/
@Advanced
@Argument(fullName = "genotype-filtered-alleles", doc = "Whether to genotype all given alleles, even filtered ones, --genotyping_mode is GENOTYPE_GIVEN_ALLELES", optional = true)
public boolean genotypeFilteredAlleles = false;

/**
* If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
HaplotypeCallerEngine.java
@@ -416,7 +416,7 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence
public ActivityProfileState isActive( final AlignmentContext context, final ReferenceContext ref, final FeatureContext features ) {

if ( hcArgs.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, ref.getInterval(), false, logger, hcArgs.alleles);
final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, ref.getInterval(), false, hcArgs.genotypeFilteredAlleles, logger, hcArgs.alleles);
if( vcFromAllelesRod != null ) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
@@ -489,7 +489,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur

final List<VariantContext> givenAlleles = new ArrayList<>();
if ( hcArgs.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
features.getValues(hcArgs.alleles).stream().filter(VariantContext::isNotFiltered).forEach(givenAlleles::add);
features.getValues(hcArgs.alleles).stream().filter(vc -> hcArgs.genotypeFilteredAlleles || vc.isNotFiltered()).forEach(givenAlleles::add);

// No alleles found in this region so nothing to do!
if ( givenAlleles.isEmpty() ) {
Mutect2Engine.java
@@ -15,6 +15,7 @@
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingGivenAllelesUtils;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingOutputMode;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
@@ -37,6 +38,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.*;
import java.util.stream.Collectors;

/**
* Created by davidben on 9/15/16.
@@ -212,8 +214,12 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
return NO_CALLS;
}

final List<VariantContext> givenAlleles = MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ?
featureContext.getValues(MTAC.alleles).stream().filter(vc -> MTAC.genotypeFilteredAlleles || vc.isNotFiltered()).collect(Collectors.toList()) :
Collections.emptyList();

final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, Collections.emptyList(), MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents);
if (!trimmingResult.isVariationPresent()) {
@@ -242,6 +248,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
referenceContext,
regionForGenotyping.getSpan(),
featureContext,
givenAlleles,
header);

writeBamOutput(assemblyResult, readLikelihoods, calledHaplotypes);
@@ -285,6 +292,13 @@ public void shutdown() {

@Override
public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext featureContext) {
if ( MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(featureContext, ref.getInterval(), false, MTAC.genotypeFilteredAlleles, logger, MTAC.alleles);
if( vcFromAllelesRod != null ) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
}

final byte refBase = ref.getBase();
final SimpleInterval refInterval = ref.getInterval();

SomaticGenotypingEngine.java
@@ -44,9 +44,6 @@ public class SomaticGenotypingEngine extends AssemblyBasedCallerGenotypingEngine
private final String matchedNormalSampleName;
final boolean hasNormal;

//Mutect2 does not run in GGA mode
private static final List<VariantContext> NO_GIVEN_ALLELES = Collections.emptyList();

// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
@Override
@@ -88,6 +85,7 @@ public CalledHaplotypes callMutations(
final ReferenceContext referenceContext,
final SimpleInterval activeRegionWindow,
final FeatureContext featureContext,
final List<VariantContext> givenAlleles,
final SAMFileHeader header) {
Utils.nonNull(log10ReadLikelihoods, "likelihoods are null");
Utils.validateArg(log10ReadLikelihoods.numberOfSamples() > 0, "likelihoods have no samples");
@@ -96,17 +94,21 @@ public CalledHaplotypes callMutations(

final List<Haplotype> haplotypes = log10ReadLikelihoods.alleles();

// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes
final List<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), NO_GIVEN_ALLELES).stream()
// Note: passing an empty list of activeAllelesToGenotype is the correct behavior even when givenAlleles is
// non-empty. At this point any given alleles have already been injected into the haplotypes, and passing
// givenAlleles to {@code decomposeHaplotypesIntoVariantContexts} actually overrides any non-given (discovery) alleles, which
// is not what we want.
final List<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), Collections.emptyList()).stream()
.filter(loc -> activeRegionWindow.getStart() <= loc && loc <= activeRegionWindow.getEnd())
.collect(Collectors.toList());

final Set<Haplotype> calledHaplotypes = new HashSet<>();
final List<VariantContext> returnCalls = new ArrayList<>();

for( final int loc : startPosKeySet ) {
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES);
// Note: as above, passing an empty list of activeAllelesToGenotype is the correct behavior even when givenAlleles is
// non-empty, for the same reason. If you don't believe this, check {@code testGivenAllelesMode} in {@link Mutect2IntegrationTest}.
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, Collections.emptyList());
final VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
if( mergedVC == null ) {
continue;
@@ -126,9 +128,19 @@ public CalledHaplotypes callMutations(
final Optional<PerAlleleCollection<Double>> normalLog10Odds = getForNormal(() -> diploidAltLog10Odds(log10NormalMatrix.get()));
final Optional<PerAlleleCollection<Double>> normalArtifactLog10Odds = getForNormal(() -> somaticLog10Odds(log10NormalMatrix.get()));

final List<Pair<Allele, Allele>> givenAltAndRefAllelesInOriginalContext = getVCsAtThisLocation(Collections.emptyList(), loc, givenAlleles).stream()
.flatMap(vc -> vc.getAlternateAlleles().stream().map(allele -> ImmutablePair.of(allele, vc.getReference()))).collect(Collectors.toList());

final Set<Allele> allelesConsistentWithGivenAlleles = mergedVC.getAlternateAlleles().stream()
.map(allele -> ImmutablePair.of(allele, mergedVC.getReference()))
.filter(altAndRef -> givenAltAndRefAllelesInOriginalContext.stream().anyMatch(givenAltAndRef -> allelesAreConsistent(givenAltAndRef, altAndRef)))
.map(altAndRefPair -> altAndRefPair.getLeft())
.collect(Collectors.toSet());

final List<Allele> somaticAltAlleles = mergedVC.getAlternateAlleles().stream()
.filter(allele -> tumorLog10Odds.getAlt(allele) > MTAC.emissionLodThreshold)
.filter(allele -> !hasNormal || MTAC.genotypeGermlineSites || normalLog10Odds.get().getAlt(allele) > MTAC.normalLodThreshold)
.filter(allele -> allelesConsistentWithGivenAlleles.contains(allele) ||
((tumorLog10Odds.getAlt(allele) > MTAC.emissionLodThreshold) &&
(!hasNormal || MTAC.genotypeGermlineSites || normalLog10Odds.get().getAlt(allele) > MTAC.normalLodThreshold)))
.collect(Collectors.toList());
final List<Allele> allSomaticAlleles = ListUtils.union(Arrays.asList(mergedVC.getReference()), somaticAltAlleles);
if (somaticAltAlleles.isEmpty()) {
@@ -176,6 +188,22 @@ public CalledHaplotypes callMutations(
return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes);
}

// check whether two alleles coming from different variant contexts and with possibly different reference alleles
// could in fact be the same. The condition is that their alt-minus-ref length differences match and that one alt is a
// prefix of the other; for example, the alt/ref pairs (AC, A) and (ACT, AT) describe the same single-base insertion with different reference padding.
private boolean allelesAreConsistent(final Pair<Allele,Allele> altAndRef1, final Pair<Allele,Allele> altAndRef2) {
final Allele alt1 = altAndRef1.getLeft();
final Allele alt2 = altAndRef2.getLeft();
if (alt1.isSymbolic() || alt2.isSymbolic()) {
return false;
} else {
final int sizeDiff1 = alt1.length() - altAndRef1.getRight().length();
final int sizeDiff2 = alt2.length() - altAndRef2.getRight().length();
return (sizeDiff1 == sizeDiff2) && (alt1.length() < alt2.length() ?
alt1.basesMatch(Arrays.copyOf(alt2.getBases(), alt1.length())) :
alt2.basesMatch(Arrays.copyOf(alt1.getBases(), alt2.length())));
}
}

// compute the likelihoods that each allele is contained at some allele fraction in the sample
private PerAlleleCollection<Double> somaticLog10Odds(final LikelihoodMatrix<Allele> log10Matrix) {
final double log10EvidenceWithAllAlleles = log10Matrix.numberOfReads() == 0 ? 0 :
Mutect2IntegrationTest.java
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.mutect;

import htsjdk.samtools.SamFiles;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.collections4.ListUtils;
@@ -26,6 +27,7 @@
import java.nio.file.Files;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
@@ -201,7 +203,6 @@ public void testTumorNormal() throws Exception {
}

// run tumor-only using our mini gnomAD on NA12878, which is not a tumor
// we're just making sure nothing blows up
@Test
public void testTumorOnly() throws Exception {
Utils.resetRandomGenerator();
@@ -231,6 +232,29 @@ public void testTumorOnly() throws Exception {
Assert.assertTrue(numVariantsPassingFilters < 2);
}

@Test
public void testGivenAllelesMode() throws Exception {
Utils.resetRandomGenerator();
final File unfilteredVcf = createTempFile("unfiltered", ".vcf");

final File givenAllelesVcf = new File(toolsTestDir, "mutect/gga_mode.vcf");
final List<String> args = Arrays.asList("-I", NA12878_20_21_WGS_bam,
"-" + M2ArgumentCollection.TUMOR_SAMPLE_SHORT_NAME, "NA12878",
"-R", b37_reference_20_21,
"-L", "20:10000000-10010000",
"-O", unfilteredVcf.getAbsolutePath(),
"--genotyping-mode", "GENOTYPE_GIVEN_ALLELES",
"--alleles", givenAllelesVcf.getAbsolutePath());
runCommandLine(args);

final Map<Integer, List<Allele>> altAllelesByPosition = StreamSupport.stream(new FeatureDataSource<VariantContext>(unfilteredVcf).spliterator(), false)
.collect(Collectors.toMap(vc -> vc.getStart(), vc-> vc.getAlternateAlleles()));
for (final VariantContext vc : new FeatureDataSource<VariantContext>(givenAllelesVcf)) {
final List<Allele> altAllelesAtThisLocus = altAllelesByPosition.get(vc.getStart());
vc.getAlternateAlleles().forEach(a -> Assert.assertTrue(altAllelesAtThisLocus.contains(a)));
}
}

@Test
public void testContaminationFilter() throws Exception {
Utils.resetRandomGenerator();
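As a closing note, the new integration test should be runnable on its own with a standard Gradle test filter along these lines; the exact task and options depend on the repository's build configuration, so treat this as a sketch:

    # Hedged sketch: run only the new GGA-mode integration test.
    ./gradlew test --tests "org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2IntegrationTest.testGivenAllelesMode"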