GGA mode for Mutect #4601

Merged: 4 commits, Apr 8, 2018
7 changes: 7 additions & 0 deletions scripts/mutect2_wdl/mutect2.wdl
@@ -84,6 +84,8 @@ workflow Mutect2 {
Boolean make_bamout_or_default = select_first([make_bamout, false])
Boolean? compress_vcfs
Boolean compress = select_first([compress_vcfs, false])
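# optional VCF (and index) of alleles to genotype in GENOTYPE_GIVEN_ALLELES mode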
File? gga_vcf
File? gga_vcf_idx

# oncotator inputs
Boolean? run_oncotator
@@ -182,6 +184,8 @@
m2_extra_args = m2_extra_args,
make_bamout = make_bamout_or_default,
compress = compress,
gga_vcf = gga_vcf,
gga_vcf_idx = gga_vcf_idx,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
@@ -425,6 +429,8 @@ task M2 {
String? m2_extra_args
Boolean? make_bamout
Boolean compress
File? gga_vcf
File? gga_vcf_idx

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
@@ -468,6 +474,7 @@ task M2 {
${"--germline-resource " + gnomad} \
${"-pon " + pon} \
${"-L " + intervals} \
${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \
-O "${output_vcf}" \
${true='--bam-output bamout.bam' false='' make_bamout} \
${m2_extra_args}
@@ -509,7 +509,7 @@ protected final VariantCallContext emptyCallContext(final FeatureContext feature

if ( configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext ggaVc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features,
rawContext.getLocation(), false, logger, configuration.alleles);
rawContext.getLocation(), false, configuration.genotypeFilteredAlleles, logger, configuration.alleles);
if (ggaVc == null) {
return null;
}
@@ -27,13 +27,15 @@ public final class GenotypingGivenAllelesUtils {
* @param tracker the meta data tracker.
* @param loc the query location.
* @param snpsOnly whether we only should consider SNP variation.
* @param keepFiltered whether to include filtered variants
* @param logger where to output warnings.
* @param allelesBinding the target variation context binding containing the given alleles.
* @return never {@code null}
*/
public static VariantContext composeGivenAllelesVariantContextFromRod(final FeatureContext tracker,
final Locatable loc,
final boolean snpsOnly,
final boolean keepFiltered,
final Logger logger,
final FeatureInput<VariantContext> allelesBinding) {
Utils.nonNull(tracker, "tracker may not be null");
@@ -43,7 +45,7 @@ public static VariantContext composeGivenAllelesVariantContextFromRod(final Feat

// search for usable record
for ( final VariantContext rodVc : tracker.getValues(allelesBinding, new SimpleInterval(loc)) ) {
if ( rodVc != null && ! rodVc.isFiltered() && (! snpsOnly || rodVc.isSNP() )) {
if ( rodVc != null && (keepFiltered || rodVc.isNotFiltered()) && (! snpsOnly || rodVc.isSNP() )) {
Contributor:
This is not your code, but here I think a stream with findFirst is cleaner and clearer.

Optional<VariantContext> vc = tracker.getValues(allelesBinding, new SimpleInterval(loc)).stream()
                .filter(rodVc -> rodVc != null && (keepFiltered || rodVc.isNotFiltered()) && (! snpsOnly || rodVc.isSNP()))
                .findFirst();

Contributor Author:
I agree, but I don't think the engine team wants us touching HaplotypeCaller code more than absolutely necessary for the moment.
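
For reference, a minimal sketch of what the full suggestion would look like (not adopted in this PR; it reuses the surrounding method's variables and simply takes the first usable record, without whatever handling the original loop applies to additional overlapping records):

final VariantContext vc = tracker.getValues(allelesBinding, new SimpleInterval(loc)).stream()
        .filter(rodVc -> rodVc != null && (keepFiltered || rodVc.isNotFiltered()) && (!snpsOnly || rodVc.isSNP()))
        .findFirst()
        .orElse(null);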

if ( vc == null ) {
vc = rodVc;
} else {
@@ -55,6 +55,13 @@ public void copyStandardCallerArgsFrom( final StandardCallerArgumentCollection o
@Argument(fullName="alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", optional=true)
public FeatureInput<VariantContext> alleles;

/**
* When set to true and when in GENOTYPE_GIVEN_ALLELES mode, all given alleles, even filtered ones, are genotyped
*/
@Advanced
@Argument(fullName = "genotype-filtered-alleles", doc = "Whether to genotype all given alleles, even filtered ones, --genotyping_mode is GENOTYPE_GIVEN_ALLELES", optional = true)
public boolean genotypeFilteredAlleles = false;

/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
@@ -416,7 +416,7 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence
public ActivityProfileState isActive( final AlignmentContext context, final ReferenceContext ref, final FeatureContext features ) {

if ( hcArgs.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, ref.getInterval(), false, logger, hcArgs.alleles);
final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(features, ref.getInterval(), false, hcArgs.genotypeFilteredAlleles, logger, hcArgs.alleles);
if( vcFromAllelesRod != null ) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
@@ -489,7 +489,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur

final List<VariantContext> givenAlleles = new ArrayList<>();
if ( hcArgs.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
features.getValues(hcArgs.alleles).stream().filter(VariantContext::isNotFiltered).forEach(givenAlleles::add);
features.getValues(hcArgs.alleles).stream().filter(vc -> hcArgs.genotypeFilteredAlleles || vc.isNotFiltered()).forEach(givenAlleles::add);

// No alleles found in this region so nothing to do!
if ( givenAlleles.isEmpty() ) {
@@ -15,6 +15,7 @@
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingGivenAllelesUtils;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingOutputMode;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
@@ -37,6 +38,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.*;
import java.util.stream.Collectors;

/**
* Created by davidben on 9/15/16.
@@ -212,8 +214,12 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
return NO_CALLS;
}

final List<VariantContext> givenAlleles = MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ?
featureContext.getValues(MTAC.alleles).stream().filter(vc -> MTAC.genotypeFilteredAlleles || vc.isNotFiltered()).collect(Collectors.toList()) :
Collections.emptyList();

final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header);
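// passing givenAlleles to assembly injects the given alleles into the assembled haplotypes (see the note in SomaticGenotypingEngine.callMutations)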
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, Collections.emptyList(), MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents);
if (!trimmingResult.isVariationPresent()) {
@@ -242,6 +248,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
referenceContext,
regionForGenotyping.getSpan(),
featureContext,
givenAlleles,
header);

writeBamOutput(assemblyResult, readLikelihoods, calledHaplotypes);
@@ -285,6 +292,13 @@ public void shutdown() {

@Override
public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext featureContext) {
if ( MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(featureContext, ref.getInterval(), false, MTAC.genotypeFilteredAlleles, logger, MTAC.alleles);
if( vcFromAllelesRod != null ) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
}

final byte refBase = ref.getBase();
final SimpleInterval refInterval = ref.getInterval();

@@ -44,9 +44,6 @@ public class SomaticGenotypingEngine extends AssemblyBasedCallerGenotypingEngine
private final String matchedNormalSampleName;
final boolean hasNormal;

//Mutect2 does not run in GGA mode
private static final List<VariantContext> NO_GIVEN_ALLELES = Collections.emptyList();

// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
@Override
@@ -88,6 +85,7 @@ public CalledHaplotypes callMutations(
final ReferenceContext referenceContext,
final SimpleInterval activeRegionWindow,
final FeatureContext featureContext,
final List<VariantContext> givenAlleles,
final SAMFileHeader header) {
Utils.nonNull(log10ReadLikelihoods, "likelihoods are null");
Utils.validateArg(log10ReadLikelihoods.numberOfSamples() > 0, "likelihoods have no samples");
@@ -96,17 +94,21 @@ public CalledHaplotypes callMutations(

final List<Haplotype> haplotypes = log10ReadLikelihoods.alleles();

// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes
final List<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), NO_GIVEN_ALLELES).stream()
// Note: passing an empty list of activeAllelesToGenotype is the correct behavior even when givenAlleles is
// non-empty. At this point any given alleles have already been injected into the haplotypes, and passing
// givenAlleles to {@code decomposeHaplotypesIntoVariantContexts} actually overrides any non-given (discovery) alleles, which
// is not what we want.
final List<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), Collections.emptyList()).stream()
.filter(loc -> activeRegionWindow.getStart() <= loc && loc <= activeRegionWindow.getEnd())
.collect(Collectors.toList());

final Set<Haplotype> calledHaplotypes = new HashSet<>();
final List<VariantContext> returnCalls = new ArrayList<>();

for( final int loc : startPosKeySet ) {
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES);
// Note: as above, passing an empty list of activeAllelesToGenotype is the correct behavior even when givenAlleles is
// non-empty, for the same reason. If you don't believe this, check {@code testGivenAllelesMode} in {@link Mutect2IntegrationTest}.
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, Collections.emptyList());
final VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
if( mergedVC == null ) {
continue;
Expand All @@ -126,9 +128,19 @@ public CalledHaplotypes callMutations(
final Optional<PerAlleleCollection<Double>> normalLog10Odds = getForNormal(() -> diploidAltLog10Odds(log10NormalMatrix.get()));
final Optional<PerAlleleCollection<Double>> normalArtifactLog10Odds = getForNormal(() -> somaticLog10Odds(log10NormalMatrix.get()));

final List<Pair<Allele, Allele>> givenAltAndRefAllelesInOriginalContext = getVCsAtThisLocation(Collections.emptyList(), loc, givenAlleles).stream()
.flatMap(vc -> vc.getAlternateAlleles().stream().map(allele -> ImmutablePair.of(allele, vc.getReference()))).collect(Collectors.toList());

final Set<Allele> allelesConsistentWithGivenAlleles = mergedVC.getAlternateAlleles().stream()
.map(allele -> ImmutablePair.of(allele, mergedVC.getReference()))
.filter(altAndRef -> givenAltAndRefAllelesInOriginalContext.stream().anyMatch(givenAltAndRef -> allelesAreConsistent(givenAltAndRef, altAndRef)))
.map(altAndRefPair -> altAndRefPair.getLeft())
.collect(Collectors.toSet());

final List<Allele> somaticAltAlleles = mergedVC.getAlternateAlleles().stream()
.filter(allele -> tumorLog10Odds.getAlt(allele) > MTAC.emissionLodThreshold)
.filter(allele -> !hasNormal || MTAC.genotypeGermlineSites || normalLog10Odds.get().getAlt(allele) > MTAC.normalLodThreshold)
.filter(allele -> allelesConsistentWithGivenAlleles.contains(allele) ||
((tumorLog10Odds.getAlt(allele) > MTAC.emissionLodThreshold) &&
(!hasNormal || MTAC.genotypeGermlineSites || normalLog10Odds.get().getAlt(allele) > MTAC.normalLodThreshold)))
.collect(Collectors.toList());
final List<Allele> allSomaticAlleles = ListUtils.union(Arrays.asList(mergedVC.getReference()), somaticAltAlleles);
if (somaticAltAlleles.isEmpty()) {
@@ -176,6 +188,22 @@ public CalledHaplotypes callMutations(
return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes);
}

// check whether two alleles coming from different variant contexts and with possibly different reference alleles
// could in fact be the same. The condition is that one is a prefix of the other
private boolean allelesAreConsistent(final Pair<Allele,Allele> altAndRef1, final Pair<Allele,Allele> altAndRef2) {
final Allele alt1 = altAndRef1.getLeft();
final Allele alt2 = altAndRef2.getLeft();
if (alt1.isSymbolic() || alt2.isSymbolic()) {
return false;
} else {
final int sizeDiff1 = alt1.length() - altAndRef1.getRight().length();
final int sizeDiff2 = alt2.length() - altAndRef2.getRight().length();
return (sizeDiff1 == sizeDiff2) && (alt1.length() < alt2.length() ?
alt1.basesMatch(Arrays.copyOf(alt2.getBases(), alt1.length())) :
alt2.basesMatch(Arrays.copyOf(alt1.getBases(), alt2.length())));
}
}

// compute the likelihoods that each allele is contained at some allele fraction in the sample
private PerAlleleCollection<Double> somaticLog10Odds(final LikelihoodMatrix<Allele> log10Matrix) {
final double log10EvidenceWithAllAlleles = log10Matrix.numberOfReads() == 0 ? 0 :
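To make the allelesAreConsistent prefix rule concrete, here is a small self-contained example (not part of the PR; the allele values are hypothetical) using the same htsjdk Allele calls as the method above:

import htsjdk.variant.variantcontext.Allele;
import java.util.Arrays;

public class AlleleConsistencyExample {
    public static void main(String[] args) {
        // given allele: A -> AT, a one-base insertion
        final Allele givenRef = Allele.create("A", true);
        final Allele givenAlt = Allele.create("AT", false);
        // the merged variant context spans one extra reference base, so the same insertion appears as AG -> ATG
        final Allele mergedRef = Allele.create("AG", true);
        final Allele mergedAlt = Allele.create("ATG", false);

        final int sizeDiff1 = givenAlt.length() - givenRef.length();   // +1
        final int sizeDiff2 = mergedAlt.length() - mergedRef.length(); // +1

        // consistent: same length difference and the shorter alt is a prefix of the longer one
        final boolean consistent = sizeDiff1 == sizeDiff2
                && givenAlt.basesMatch(Arrays.copyOf(mergedAlt.getBases(), givenAlt.length()));
        System.out.println(consistent); // prints true
    }
}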
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.mutect;

import htsjdk.samtools.SamFiles;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.collections4.ListUtils;
@@ -26,6 +27,7 @@
import java.nio.file.Files;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
@@ -201,7 +203,6 @@ public void testTumorNormal() throws Exception {
}

// run tumor-only using our mini gnomAD on NA12878, which is not a tumor
// we're just making sure nothing blows up
@Test
public void testTumorOnly() throws Exception {
Utils.resetRandomGenerator();
@@ -231,6 +232,29 @@ public void testTumorOnly() throws Exception {
Assert.assertTrue(numVariantsPassingFilters < 2);
}

@Test
public void testGivenAllelesMode() throws Exception {
Utils.resetRandomGenerator();
final File unfilteredVcf = createTempFile("unfiltered", ".vcf");

final File givenAllelesVcf = new File(toolsTestDir, "mutect/gga_mode.vcf");
final List<String> args = Arrays.asList("-I", NA12878_20_21_WGS_bam,
"-" + M2ArgumentCollection.TUMOR_SAMPLE_SHORT_NAME, "NA12878",
"-R", b37_reference_20_21,
"-L", "20:10000000-10010000",
"-O", unfilteredVcf.getAbsolutePath(),
"--genotyping-mode", "GENOTYPE_GIVEN_ALLELES",
"--alleles", givenAllelesVcf.getAbsolutePath());
runCommandLine(args);

final Map<Integer, List<Allele>> altAllelesByPosition = StreamSupport.stream(new FeatureDataSource<VariantContext>(unfilteredVcf).spliterator(), false)
.collect(Collectors.toMap(vc -> vc.getStart(), vc-> vc.getAlternateAlleles()));
for (final VariantContext vc : new FeatureDataSource<VariantContext>(givenAllelesVcf)) {
final List<Allele> altAllelesAtThisLocus = altAllelesByPosition.get(vc.getStart());
vc.getAlternateAlleles().forEach(a -> Assert.assertTrue(altAllelesAtThisLocus.contains(a)));
}
}

@Test
public void testContaminationFilter() throws Exception {
Utils.resetRandomGenerator();