Skip to content

Commit

Permalink
Improved M2 normal artifact filter (#4966)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin authored Jun 29, 2018
1 parent 54260e3 commit cd1b453
Show file tree
Hide file tree
Showing 6 changed files with 316 additions and 265 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
import java.util.OptionalInt;

/**
* Median base quality of bases supporting each alt allele.
* Median base quality of bases supporting each allele.
*
* <p>The output is an array containing, for each alt allele, the median base quality at the variant (for SNVs) and one base before the variant (for indels) over all reads that best match that allele.</p>
* <p>The output is an array containing, for each allele, the median base quality at the variant (for SNVs) and one base before the variant (for indels) over all reads that best match that allele.</p>
*
* <p>For example, for variant context with ref allele A and alt allele C we use base qualities at the C. For variant context with ref allele AG and alt allele A (deletion),
* <p>For example, for variant context with ref allele A and alt allele C we use base qualities at the A and C. For variant context with ref allele AG and alt allele A (deletion),
* we use base qualities at the A. For variant context with ref allele A and alt allele AG (insertion) we use base qualities at the A.</p>
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Median base quality of bases supporting each allele (MBQ)")
Expand All @@ -36,6 +36,9 @@ protected int aggregate(final List<Integer> values) {
@Override
protected String getDescription() { return "median base quality"; }

@Override
protected boolean includeRefAllele() { return true; }

@Override
protected OptionalInt getValueForRead(final GATKRead read, final VariantContext vc) {
Utils.nonNull(read);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.io.File;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -98,7 +99,10 @@ public void onTraversalStart() {
vcfWriter.writeHeader(vcfHeader);

final String tumorSample = getHeaderForVariants().getMetaDataLine(Mutect2Engine.TUMOR_SAMPLE_KEY_IN_VCF_HEADER).getValue();
filteringEngine = new Mutect2FilteringEngine(MTFAC, tumorSample);
final VCFHeaderLine normalSampleHeaderLine = getHeaderForVariants().getMetaDataLine(Mutect2Engine.NORMAL_SAMPLE_KEY_IN_VCF_HEADER);
final Optional<String> normalSample = normalSampleHeaderLine == null ? Optional.empty() : Optional.of(normalSampleHeaderLine.getValue());

filteringEngine = new Mutect2FilteringEngine(MTFAC, tumorSample, normalSample);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class M2FiltersArgumentCollection extends AssemblyBasedCallerArgumentColl
public static final String LOG_SOMATIC_PRIOR_LONG_NAME = "log-somatic-prior";
public static final String TUMOR_LOD_LONG_NAME = "tumor-lod";
public static final String NORMAL_ARTIFACT_LOD_LONG_NAME = "normal-artifact-lod";
public static final String NORMAL_P_VALUE_THRESHOLD_LONG_NAME = "normal-p-value-threshold";
public static final String MAX_GERMLINE_POSTERIOR_LONG_NAME = "max-germline-posterior";
public static final String MAX_ALT_ALLELE_COUNT_LONG_NAME = "max-alt-allele-count";
public static final String MIN_BASES_TO_SUSPECT_PCR_SLIPPAGE_LONG_NAME = "min-pcr-slippage-size";
Expand Down Expand Up @@ -61,6 +62,9 @@ public class M2FiltersArgumentCollection extends AssemblyBasedCallerArgumentColl
@Argument(fullName = NORMAL_ARTIFACT_LOD_LONG_NAME, optional = true, doc = "LOD threshold for calling normal artifacts")
public double NORMAL_ARTIFACT_LOD_THRESHOLD = 0.0;

@Argument(fullName = NORMAL_P_VALUE_THRESHOLD_LONG_NAME, optional = true, doc = "P value threshold for normal artifact filter")
public static final double normalPileupPValueThreshold = 0.001;

@Argument(fullName = MAX_GERMLINE_POSTERIOR_LONG_NAME, optional = true, doc = "Maximum posterior probability that an allele is a germline variant")
public double maxGermlinePosterior = 0.1;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
import java.util.stream.Collectors;
Expand All @@ -21,17 +22,21 @@
*/
public class Mutect2FilteringEngine {
public static final double MIN_ALLELE_FRACTION_FOR_GERMLINE_HOM_ALT = 0.9;
private static final int IMPUTED_NORMAL_BASE_QUALITY = 30; // only used if normal base quality annotation fails somehow
private static final double MIN_NORMAL_ARTIFACT_RATIO = 0.1; // don't call normal artifact if allele fraction in normal is much smaller than allele fraction in tumor
private M2FiltersArgumentCollection MTFAC;
private final double contamination;
private final double somaticPriorProb;
private final String tumorSample;
private final Optional<String> normalSample;
final OverlapDetector<MinorAlleleFractionRecord> tumorSegments;
public static final String FILTERING_STATUS_VCF_KEY = "filtering_status";

public Mutect2FilteringEngine(final M2FiltersArgumentCollection MTFAC, final String tumorSample) {
public Mutect2FilteringEngine(final M2FiltersArgumentCollection MTFAC, final String tumorSample, final Optional<String> normalSample) {
this.MTFAC = MTFAC;
contamination = MTFAC.contaminationTable == null ? 0.0 : ContaminationRecord.readFromFile(MTFAC.contaminationTable).get(0).getContamination();
this.tumorSample = tumorSample;
this.normalSample = normalSample;
somaticPriorProb = Math.pow(10, MTFAC.log10PriorProbOfSomaticEvent);

final List<MinorAlleleFractionRecord> tumorMinorAlleleFractionRecords = MTFAC.tumorSegmentationTable == null ?
Expand Down Expand Up @@ -107,14 +112,17 @@ private static void applyPanelOfNormalsFilter(final M2FiltersArgumentCollection
}
}

private void applyMedianBaseQualityDifferenceFilter(final M2FiltersArgumentCollection MTFAC, final VariantContext vc, final VariantContextBuilder vcb) {
private void applyBaseQualityFilter(final M2FiltersArgumentCollection MTFAC, final VariantContext vc, final VariantContextBuilder vcb) {
final int[] baseQualityByAllele = getIntArrayTumorField(vc, BaseQuality.KEY);
if (baseQualityByAllele != null && baseQualityByAllele[0] < MTFAC.minMedianBaseQuality) {
final double[] tumorLods = getDoubleArrayAttribute(vc, GATKVCFConstants.TUMOR_LOD_KEY);
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);

if (baseQualityByAllele != null && baseQualityByAllele[indexOfMaxTumorLod + 1] < MTFAC.minMedianBaseQuality) {
vcb.filter(GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME);
}
}

private void applyMedianMappingQualityDifferenceFilter(final M2FiltersArgumentCollection MTFAC, final VariantContext vc, final VariantContextBuilder vcb) {
private void applyMappingQualityFilter(final M2FiltersArgumentCollection MTFAC, final VariantContext vc, final VariantContextBuilder vcb) {
final int[] mappingQualityByAllele = getIntArrayTumorField(vc, MappingQuality.KEY);
if (mappingQualityByAllele != null && mappingQualityByAllele[0] < MTFAC.minMedianMappingQuality) {
vcb.filter(GATKVCFConstants.MEDIAN_MAPPING_QUALITY_FILTER_NAME);
Expand Down Expand Up @@ -196,18 +204,50 @@ private static void applyInsufficientEvidenceFilter(final M2FiltersArgumentColle

// filter out anything called in tumor that would also be called in the normal if it were treated as a tumor.
// this handles shared artifacts, such as ones due to alignment and any shared aspects of sequencing
private static void applyArtifactInNormalFilter(final M2FiltersArgumentCollection MTFAC, final VariantContext vc, final VariantContextBuilder vcb) {
private void applyArtifactInNormalFilter(final M2FiltersArgumentCollection MTFAC, final VariantContext vc, final VariantContextBuilder vcb) {
if (!( vc.hasAttribute(GATKVCFConstants.NORMAL_ARTIFACT_LOD_ATTRIBUTE)
&& vc.hasAttribute(GATKVCFConstants.TUMOR_LOD_KEY))) {
return;
}

final double[] normalArtifactLods = getDoubleArrayAttribute(vc, GATKVCFConstants.NORMAL_ARTIFACT_LOD_ATTRIBUTE);
final double[] tumorLods = getDoubleArrayAttribute(vc, GATKVCFConstants.TUMOR_LOD_KEY);
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);

Genotype tumorGenotype = vc.getGenotype(tumorSample);
final int[] tumorAlleleDepths = tumorGenotype.getAD();
final int tumorDepth = (int) MathUtils.sum(tumorAlleleDepths);
final int tumorAltDepth = tumorAlleleDepths[indexOfMaxTumorLod + 1];

Genotype normalGenotype = vc.getGenotype(normalSample.get());
final int[] normalAlleleDepths = normalGenotype.getAD();
final int normalDepth = (int) MathUtils.sum(normalAlleleDepths);
final int normalAltDepth = normalAlleleDepths[indexOfMaxTumorLod + 1];

// if normal AF << tumor AF, don't filter regardless of LOD
final double tumorAlleleFraction = (double) tumorAltDepth / tumorDepth;
final double normalAlleleFraction = normalDepth == 0 ? 0 : (double) normalAltDepth / normalDepth;

if (normalAlleleFraction < MIN_NORMAL_ARTIFACT_RATIO * tumorAlleleFraction) {
return;
}

final double[] normalArtifactLods = getDoubleArrayAttribute(vc, GATKVCFConstants.NORMAL_ARTIFACT_LOD_ATTRIBUTE);
if (normalArtifactLods[indexOfMaxTumorLod] > MTFAC.NORMAL_ARTIFACT_LOD_THRESHOLD) {
vcb.filter(GATKVCFConstants.ARTIFACT_IN_NORMAL_FILTER_NAME);
return;
}

// the above filter misses artifacts whose support in the normal consists entirely of low base quality reads
// Since a lot of low-BQ reads is itself evidence of an artifact, we filter these by hand via an estimated LOD
// that uses the average base quality of *ref* reads in the normal

final int normalMedianRefBaseQuality = GATKProtectedVariantContextUtils.getAttributeAsIntArray(
normalGenotype, BaseQuality.KEY, () -> new int[] {IMPUTED_NORMAL_BASE_QUALITY}, IMPUTED_NORMAL_BASE_QUALITY)[0];
final double normalPValue = 1 - new BinomialDistribution(null, normalDepth, QualityUtils.qualToErrorProb(normalMedianRefBaseQuality))
.cumulativeProbability(normalAltDepth - 1);

if (normalPValue < M2FiltersArgumentCollection.normalPileupPValueThreshold) {
vcb.filter(GATKVCFConstants.ARTIFACT_IN_NORMAL_FILTER_NAME);
}
}

Expand Down Expand Up @@ -273,8 +313,8 @@ public void applyFilters(final M2FiltersArgumentCollection MTFAC, final VariantC
applyStrandArtifactFilter(MTFAC, vc, vcb);
applySTRFilter(vc, vcb);
applyContaminationFilter(MTFAC, vc, vcb);
applyMedianBaseQualityDifferenceFilter(MTFAC, vc, vcb);
applyMedianMappingQualityDifferenceFilter(MTFAC, vc, vcb);
applyBaseQualityFilter(MTFAC, vc, vcb);
applyMappingQualityFilter(MTFAC, vc, vcb);
applyMedianFragmentLengthDifferenceFilter(MTFAC, vc, vcb);
applyReadPositionFilter(MTFAC, vc, vcb);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,6 @@ public void test() {

final int[] medianAltQuals = (int[]) g.getExtendedAttribute(BaseQuality.KEY);

Assert.assertEquals(medianAltQuals[0], 25);
Assert.assertEquals(medianAltQuals[1], 25);
}
}
Loading

0 comments on commit cd1b453

Please sign in to comment.