Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes LiftoverVCF for indels (again) #1469

Merged
merged 10 commits into from
Apr 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 61 additions & 62 deletions src/main/java/picard/util/LiftoverUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,23 @@
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.ArrayUtils;
import picard.vcf.LiftoverVcf;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LiftoverUtils {
Expand All @@ -48,8 +59,8 @@ public class LiftoverUtils {
/**
* Default list of attributes that need to be reversed or dropped from the INFO field when alleles have been swapped.
*/
public static final Collection<String> DEFAULT_TAGS_TO_REVERSE = Arrays.asList("AF");
public static final Collection<String> DEFAULT_TAGS_TO_DROP = Arrays.asList("MAX_AF");
public static final Collection<String> DEFAULT_TAGS_TO_REVERSE = Collections.singletonList("AF");
public static final Collection<String> DEFAULT_TAGS_TO_DROP = Collections.singletonList("MAX_AF");

public static final Log log = Log.getInstance(LiftoverUtils.class);

Expand Down Expand Up @@ -88,8 +99,7 @@ public static VariantContext liftVariant(final VariantContext source, final Inte
// If any of the source alleles is symbolic, do not populate the END tag (protecting things like <NON_REF> from
// getting screwed up in reverse-complemented variants
if (source.hasAttribute(VCFConstants.END_KEY) && builder.getAlleles().stream().noneMatch(Allele::isSymbolic)) {
// TODO: add start() and stop() methods to the builder in htsjdk and use stop() here.
builder.attribute(VCFConstants.END_KEY, builder.make().getEnd());
builder.attribute(VCFConstants.END_KEY, (int) builder.getStop());
} else {
builder.rmAttribute(VCFConstants.END_KEY);
}
Expand Down Expand Up @@ -149,30 +159,21 @@ protected static VariantContextBuilder reverseComplementVariantContext(final Var
if (target.isPositiveStrand()) {
throw new IllegalArgumentException("This should only be called for negative strand liftovers");
}
final int start = target.getStart();
final int stop = target.getEnd();

final List<Allele> origAlleles = new ArrayList<>(source.getAlleles());
final VariantContextBuilder vcb = new VariantContextBuilder(source);

vcb.rmAttribute(VCFConstants.END_KEY);
vcb.chr(target.getContig());
vcb.rmAttribute(VCFConstants.END_KEY)
.chr(target.getContig())
.start(start)
.stop(stop)
.alleles(reverseComplementAlleles(origAlleles));

// By convention, indels are left aligned and include the base prior to that indel.
// When reverse complementing, will be necessary to include this additional base.
// This check prevents the extremely rare situation in which the indel occurs on the
// first base of the sequence
final boolean addToStart = source.isIndel() && target.getStart() > 1;

final int start = target.getStart() - (addToStart ? 1 : 0);
vcb.start(start);

final int stop = target.getEnd() - (addToStart ? 1 : 0);
vcb.stop(stop);

vcb.alleles(reverseComplementAlleles(origAlleles, target, refSeq, source.isIndel(), addToStart));

if (!source.isSNP()) {
if (isIndelForLiftover(source)) {
// check that the reverse complemented bases match the new reference
if (!referenceAlleleMatchesReferenceForIndel(vcb.getAlleles(), refSeq, start, stop)) {
if (referenceAlleleDiffersFromReferenceForIndel(vcb.getAlleles(), refSeq, start, stop)) {
return null;
}
leftAlignVariant(vcb, start, stop, vcb.getAlleles(), refSeq);
Expand All @@ -183,33 +184,28 @@ protected static VariantContextBuilder reverseComplementVariantContext(final Var
return vcb;
}

private static List<Allele> reverseComplementAlleles(final List<Allele> originalAlleles, final Interval target, final ReferenceSequence refSeq, final boolean isIndel, final boolean addToStart) {
final List<Allele> alleles = new ArrayList<>();

for (final Allele oldAllele : originalAlleles) {
alleles.add(LiftoverUtils.reverseComplement(oldAllele, target, refSeq, isIndel, addToStart));
private static boolean isIndelForLiftover(final VariantContext vc) {
final Allele ref = vc.getReference();
if (ref.length() != 1) {
return true;
}

return alleles;
return vc.getAlleles().stream()
.filter(a -> !a.isSymbolic())
.filter(a -> !a.equals(Allele.SPAN_DEL))
.anyMatch(a -> a.length() != 1);
}

private static Allele reverseComplement(final Allele oldAllele, final Interval target, final ReferenceSequence referenceSequence, final boolean isIndel, final boolean addToStart) {
private static List<Allele> reverseComplementAlleles(final List<Allele> originalAlleles) {
return originalAlleles
.stream()
.map(LiftoverUtils::reverseComplement)
.collect(Collectors.toList());
}

private static Allele reverseComplement(final Allele oldAllele) {
if (oldAllele.isSymbolic() || oldAllele.isNoCall() || oldAllele.equals(Allele.SPAN_DEL)) {
return oldAllele;
} else if (isIndel) {
// target.getStart is 1-based, reference bases are 0-based
final StringBuilder alleleBuilder = new StringBuilder(target.getEnd() - target.getStart() + 1);

if (addToStart) {
alleleBuilder.append((char) referenceSequence.getBases()[target.getStart() - 2]);
}
alleleBuilder.append(SequenceUtil.reverseComplement(oldAllele.getBaseString().substring(1, oldAllele.length())));
if (!addToStart) {
alleleBuilder.append((char) referenceSequence.getBases()[target.getEnd() - 1]);
}

return Allele.create(alleleBuilder.toString(), oldAllele.isReference());
} else {
return Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference());
}
Expand Down Expand Up @@ -335,13 +331,13 @@ public static VariantContext swapRefAlt(final VariantContext vc, final Collectio
* @param end the end position of the actual indel
* @return true if they match, false otherwise
*/
protected static boolean referenceAlleleMatchesReferenceForIndel(final List<Allele> alleles,
final ReferenceSequence referenceSequence,
final int start,
final int end) {
private static boolean referenceAlleleDiffersFromReferenceForIndel(final List<Allele> alleles,
final ReferenceSequence referenceSequence,
final int start,
final int end) {
final String refString = StringUtil.bytesToString(referenceSequence.getBases(), start - 1, end - start + 1);
final Allele refAllele = alleles.stream().filter(Allele::isReference).findAny().orElseThrow(() -> new IllegalStateException("Error: no reference allele was present"));
return (refString.equalsIgnoreCase(refAllele.getBaseString()));
return !refString.equalsIgnoreCase(refAllele.getBaseString());
}

/**
Expand All @@ -355,7 +351,7 @@ protected static boolean referenceAlleleMatchesReferenceForIndel(final List<Alle
protected static void leftAlignVariant(final VariantContextBuilder builder, final int start, final int end, final List<Allele> alleles, final ReferenceSequence referenceSequence) {

// make sure that referenceAllele matches reference
if (!referenceAlleleMatchesReferenceForIndel(alleles, referenceSequence, start, end)) {
if (referenceAlleleDiffersFromReferenceForIndel(alleles, referenceSequence, start, end)) {
throw new IllegalArgumentException(String.format("Reference allele doesn't match reference at %s:%d-%d", referenceSequence.getName(), start, end));
}

Expand All @@ -365,7 +361,9 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina

// Put each allele into the alleleBasesMap unless it is a spanning deletion.
// Spanning deletions are dealt with as a special case later in fixedAlleleMap.
alleles.stream().filter(a->!a.equals(Allele.SPAN_DEL)).forEach(a -> alleleBasesMap.put(a, a.getBases()));
alleles.stream()
.filter(a -> !a.equals(Allele.SPAN_DEL) && !a.isSymbolic())
.forEach(a -> alleleBasesMap.put(a, a.getBases()));

int theStart = start;
int theEnd = end;
Expand All @@ -379,9 +377,7 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
.collect(Collectors.groupingBy(a -> a[a.length - 1], Collectors.toSet()))
.size() == 1 && theEnd > 1) {
// 3. truncate rightmost nucleotide of each allele
for (final Allele allele : alleleBasesMap.keySet()) {
alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), true));
}
alleleBasesMap.replaceAll((a, v) -> truncateBase(alleleBasesMap.get(a), true));
changesInAlleles = true;
theEnd--;
// 4. end if
Expand Down Expand Up @@ -415,12 +411,10 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
alleleBasesMap.values().stream()
.collect(Collectors.groupingBy(a -> a[0], Collectors.toSet()))
.size() == 1
) {
) {

//9. truncate the leftmost base of the alleles
for (final Allele allele : alleleBasesMap.keySet()) {
alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), false));
}
alleleBasesMap.replaceAll((a, v) -> truncateBase(alleleBasesMap.get(a), false));
theStart++;
}

Expand All @@ -435,6 +429,12 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
// a special case here.
fixedAlleleMap.put(Allele.SPAN_DEL, Allele.SPAN_DEL);

// symbolic alleles cannot be left-aligned so they need to be put
// into the map directly
alleles.stream()
.filter(Allele::isSymbolic)
.forEach(a -> fixedAlleleMap.put(a, a));

//retain original order:
List<Allele> fixedAlleles = alleles.stream()
.map(fixedAlleleMap::get)
Expand All @@ -444,14 +444,13 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
}

private static byte[] truncateBase(final byte[] allele, final boolean truncateRightmost) {
return Arrays.copyOfRange(allele, truncateRightmost ? 0 : 1, truncateRightmost ?
allele.length - 1 :
allele.length);
return Arrays.copyOfRange(allele,
truncateRightmost ? 0 : 1,
truncateRightmost ? allele.length - 1 : allele.length);
}

//creates a new byte array with the base added at the beginning
private static byte[] extendOneBase(final byte[] bases, final byte base) {

final byte[] newBases = new byte[bases.length + 1];

System.arraycopy(bases, 0, newBases, 1, bases.length);
Expand Down
63 changes: 41 additions & 22 deletions src/main/java/picard/vcf/LiftoverVcf.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,28 @@
import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.*;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.*;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFRecordCodec;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
Expand All @@ -50,7 +64,15 @@
import java.io.File;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -530,29 +552,26 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re
}

// Check that the reference allele still agrees with the reference sequence
boolean mismatchesReference = false;
for (final Allele allele : vc.getAlleles()) {
if (allele.isReference()) {
final byte[] ref = refSeq.getBases();
final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd() - vc.getStart() + 1);

if (!refString.equalsIgnoreCase(allele.getBaseString())) {
// consider that the ref and the alt may have been swapped in a simple biallelic SNP
if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
if (RECOVER_SWAPPED_REF_ALT) {
totalTrackedAsSwapRefAlt++;
addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source);
return;
} else {
totalTrackedAsSwapRefAlt++;
}
}
mismatchesReference = true;
final boolean mismatchesReference;
final Allele allele = vc.getReference();
final byte[] ref = refSeq.getBases();
final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd() - vc.getStart() + 1);

if (!refString.equalsIgnoreCase(allele.getBaseString())) {
// consider that the ref and the alt may have been swapped in a simple biallelic SNP
if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
totalTrackedAsSwapRefAlt++;
if (RECOVER_SWAPPED_REF_ALT) {
addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source);
return;
}
break;
}
mismatchesReference = true;
} else {
mismatchesReference = false;
}


if (mismatchesReference) {
rejectedRecords.add(new VariantContextBuilder(source)
.filter(FILTER_MISMATCHING_REF_ALLELE)
Expand Down
6 changes: 5 additions & 1 deletion src/test/java/picard/util/LiftoverUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
import org.testng.annotations.Test;
import picard.vcf.VcfTestUtils;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;

/**
Expand Down
Loading