Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove A-, R-, and G-length attributes when ReblockGVCF subsets an allele #8209

Merged
merged 4 commits into from
Mar 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import htsjdk.variant.vcf.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.collections.Permutation;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
Expand All @@ -33,9 +35,18 @@ private AlleleSubsettingUtils() {} // prevent instantiation
private static final int PL_INDEX_OF_HOM_REF = 0;
public static final int NUM_OF_STRANDS = 2; // forward and reverse strands

private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
private static final OneShotLogger attributesRemovedOneShotLogger = new OneShotLogger(AlleleSubsettingUtils.class);

private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();

/**
 * Create a new GenotypesContext with subsetted PLs and ADs, without updating or removing any
 * allele-count-based (A-, R-, or G-length) FORMAT annotations.
 *
 * <p>Delegates to the full overload with an empty list of allele-based-length annotations,
 * so only the attributes that overload always handles (e.g. PLs, ADs) are adjusted.</p>
 *
 * @param originalGs       the original GenotypesContext; must not be null
 * @param defaultPloidy    ploidy to fall back on when a genotype's ploidy cannot be determined — TODO confirm against callers
 * @param originalAlleles  the original alleles
 * @param allelesToKeep    the subset of alleles to use with the new Genotypes
 * @param gpc              calculator for genotype priors; presumably may be null (callers in this PR pass null) — verify
 * @param assignmentMethod assignment strategy for the (subsetted) PLs
 * @return a new non-null GenotypesContext
 */
public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
                                             final List<Allele> originalAlleles,
                                             final List<Allele> allelesToKeep,
                                             final GenotypePriorCalculator gpc,
                                             final GenotypeAssignmentMethod assignmentMethod) {
    //TODO: if other usages of this method should update or remove A,R, or G length annotations then header parsing is necessary and the method below should be used
    return subsetAlleles(originalGs, defaultPloidy, originalAlleles, allelesToKeep, gpc, assignmentMethod, Collections.emptyList());
}
/**
* Create the new GenotypesContext with the subsetted PLs and ADs
*
Expand All @@ -45,13 +56,15 @@ private AlleleSubsettingUtils() {} // prevent instantiation
* @param originalAlleles the original alleles
* @param allelesToKeep the subset of alleles to use with the new Genotypes
* @param assignmentMethod assignment strategy for the (subsetted) PLs
* @param alleleBasedLengthAnnots list of annotations that have lengths based on the number of alleles (A, R, and G length types)
* @return a new non-null GenotypesContext
*/
public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
final List<Allele> originalAlleles,
final List<Allele> allelesToKeep,
final GenotypePriorCalculator gpc,
final GenotypeAssignmentMethod assignmentMethod) {
final GenotypeAssignmentMethod assignmentMethod,
final List<String> alleleBasedLengthAnnots) {
Utils.nonNull(originalGs, "original GenotypesContext must not be null");
Utils.nonNull(allelesToKeep, "allelesToKeep is null");
Utils.nonEmpty(allelesToKeep, "must keep at least one allele");
Expand Down Expand Up @@ -93,9 +106,23 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
final GenotypeBuilder gb = new GenotypeBuilder(g);
final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
attributes.remove(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY);
//TODO: remove other G-length attributes, although that may require header parsing
attributes.remove(VCFConstants.GENOTYPE_POSTERIORS_KEY);
attributes.remove(GATKVCFConstants.GENOTYPE_PRIOR_KEY);
final List<String> attributesToRemove = new ArrayList<>();
for(final String attribute : attributes.keySet()) {
if (alleleBasedLengthAnnots.contains(attribute)) {
if (attribute.equals(GATKVCFConstants.F1R2_KEY) || attribute.equals(GATKVCFConstants.F2R1_KEY)) {
final String oldAttributeString = (String) attributes.get(attribute);
final String[] oldAttributeArray = oldAttributeString.split(AnnotationUtils.LIST_DELIMITER);
final int[] oldAttribute = Arrays.stream(oldAttributeArray).mapToInt(Integer::parseInt).toArray();
final int[] newAttribute = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, oldAttribute);
attributes.put(attribute, newAttribute);
} else {
attributesToRemove.add(attribute);
}
}
}
attributesToRemove.forEach(attributes::remove);
gb.noPL().noGQ().noAttributes().attributes(attributes); //if alleles are subset, old PLs and GQ are invalid
if (newLog10GQ != Double.NEGATIVE_INFINITY && g.hasGQ()) { //only put GQ if originally present
gb.log10PError(newLog10GQ);
Expand All @@ -112,19 +139,39 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,

// restrict AD to the new allele subset
if(g.hasAD()) {
final int[] oldAD = g.getAD();
final int[] newAD = IntStream.range(0, allelesToKeep.size()).map(n -> oldAD[allelePermutation.fromIndex(n)]).toArray();
final int nonRefIndex = allelesToKeep.indexOf(Allele.NON_REF_ALLELE);
if (nonRefIndex != -1 && nonRefIndex < newAD.length) {
newAD[nonRefIndex] = 0; //we will "lose" coverage here, but otherwise merging NON_REF AD counts with other alleles "creates" reads
}
final int[] newAD = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, g.getAD());
gb.AD(newAD);
// if we have recalculated AD and the original genotype had AF but was then removed, then recalculate AF based on AD counts
if (alleleBasedLengthAnnots.contains(GATKVCFConstants.ALLELE_FRACTION_KEY) && g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY)) {
attributesToRemove.remove(GATKVCFConstants.ALLELE_FRACTION_KEY);
final double[] newAFs = MathUtils.normalizeSumToOne(Arrays.stream(newAD).mapToDouble(x -> x).toArray());
gb.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, Arrays.copyOfRange(newAFs, 1, newAFs.length)); //omit the first entry of the array corresponding to the reference
}
}
if (attributesToRemove.size() > 0) {
attributesRemovedOneShotLogger.warn(() -> "The following attributes have been removed at sites where alleles were subset: " + String.join(",", attributesToRemove));
}
newGTs.add(gb.make());
}
return newGTs;
}

/**
* Subsets allele length annotation to match the alleles that have been kept
* @param allelesToKeep list of alleles that are not being filtered out
* @param allelePermutation permutation that matches the index from the old annotation to the new annotation given which alleles have been removed
* @param oldAnnot the original annotation
* @return the new subset annotation
*/
private static int[] getNewAlleleBasedReadCountAnnotation(List<Allele> allelesToKeep, Permutation<Allele> allelePermutation, int[] oldAnnot) {
final int[] newAnnot = IntStream.range(0, allelesToKeep.size()).map(n -> oldAnnot[allelePermutation.fromIndex(n)]).toArray();
final int nonRefIndex = allelesToKeep.indexOf(Allele.NON_REF_ALLELE);
if (nonRefIndex != -1 && nonRefIndex < newAnnot.length) {
newAnnot[nonRefIndex] = 0; //we will "lose" coverage here, but otherwise merging NON_REF AD counts with other alleles "creates" reads
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I can't follow my own logic here, but you just copied so you're not to blame.

}
return newAnnot;
}


/**
* Remove alternate alleles from a set of genotypes turning removed alleles to no-call and dropping other per-allele attributes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ public final class ReblockGVCF extends MultiVariantWalker {

private CachingIndexedFastaSequenceFile referenceReader;

private static final List<String> alleleBasedLengthAnnots = new ArrayList<>();

public static class AlleleLengthComparator implements Comparator<Allele> {
public int compare(Allele a1, Allele a2) {
return a1.getBaseString().length() - a2.getBaseString().length();
Expand Down Expand Up @@ -256,6 +258,15 @@ public void onTraversalStart() {
}
}

//Allele length and Genotype length annotations need to be subset or removed if alleles are dropped so we need to parse the header for annotation count types
for(final VCFFormatHeaderLine formatHeaderLine : inputHeader.getFormatHeaderLines()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Occasionally the header is wrong, but it's usually A should be R or vice versa, not const should be A. I think this is fine since we're just using this as a list to remove, but it's something to keep in mind.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, in that case I'll add a warning for the logs that at least lists which annotations have been removed for easier debugging in the future.

if(formatHeaderLine.getCountType().equals(VCFHeaderLineCount.A) ||
formatHeaderLine.getCountType().equals(VCFHeaderLineCount.R) ||
formatHeaderLine.getCountType().equals(VCFHeaderLineCount.G)) {
alleleBasedLengthAnnots.add(formatHeaderLine.getID());
}
}

if ( dbsnp.dbsnp != null ) {
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
}
Expand Down Expand Up @@ -599,7 +610,7 @@ VariantContext cleanUpHighQualityVariant(final VariantContext variant) {
if(allelesNeedSubsetting && !keepAllAlts) {
newAlleleSetUntrimmed.removeAll(allelesToDrop);
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(variant.getGenotypes(), genotype.getPloidy(), variant.getAlleles(),
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN);
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, alleleBasedLengthAnnots);
if (gc.get(0).isHomRef() || !gc.get(0).hasGQ() || gc.get(0).getAlleles().contains(Allele.NO_CALL)) { //could be low quality or no-call after subsetting
if (dropLowQuals) {
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -517,4 +517,38 @@ public void testTreeScoreWithNoAnnotation() {

Assert.assertThrows(UserException.class, () -> runCommandLine(args));
}

@Test
public void testDragenGvcfs() {
    // Run ReblockGVCF on a single DRAGEN site where an allele gets dropped.
    final File dragenInput = getTestFile("dragen.g.vcf");
    final File reblockedOutput = createTempFile("reblockedgvcf", ".vcf");

    final ArgumentsBuilder argsBuilder = new ArgumentsBuilder();
    argsBuilder.addReference(new File(hg38Reference))
            .add("V", dragenInput)
            .add("L", "chr1:939436")
            .addOutput(reblockedOutput);
    runCommandLine(argsBuilder);

    final VCFHeader outHeader = VariantContextTestUtils.readEntireVCFIntoMemory(reblockedOutput.getAbsolutePath()).getLeft();
    final VariantContext outVC = VariantContextTestUtils.readEntireVCFIntoMemory(reblockedOutput.getAbsolutePath()).getRight().get(0);
    // The extra allele was dropped so this site is now one alt allele and the <NON_REF> allele
    Assert.assertEquals(outVC.getAlternateAlleles().size(), 2);

    final Genotype genotype = outVC.getGenotype(0);
    // AF was recalculated from the subset ADs; F1R2/F2R1 were subset to the kept alleles.
    Assert.assertEquals(genotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY), "1.00,0.00");
    Assert.assertEquals(genotype.getExtendedAttribute(GATKVCFConstants.F1R2_KEY), "0,3,0");
    Assert.assertEquals(genotype.getExtendedAttribute(GATKVCFConstants.F2R1_KEY), "0,3,0");

    // Every surviving allele-length FORMAT attribute must match the new allele count:
    // A -> #alts (2), R -> #alleles (3), G -> #diploid genotypes (6).
    for (final String key : genotype.getExtendedAttributes().keySet()) {
        final VCFHeaderLineCount countType = outHeader.getFormatHeaderLine(key).getCountType();
        if (countType.equals(VCFHeaderLineCount.A)) {
            Assert.assertEquals(((String) genotype.getExtendedAttribute(key)).split(",").length, 2);
        } else if (countType.equals(VCFHeaderLineCount.R)) {
            Assert.assertEquals(((String) genotype.getExtendedAttribute(key)).split(",").length, 3);
        } else if (countType.equals(VCFHeaderLineCount.G)) {
            Assert.assertEquals(((String) genotype.getExtendedAttribute(key)).split(",").length, 6);
        }
    }
}
}
Loading