Skip to content

Commit

Permalink
Fixes LiftoverVCF for indels (again) (#1469)
Browse files Browse the repository at this point in the history
* Included getStop from rev'ed htsJDK.
* fixes (again) #1258 which was still broken due to the call to "make()" in the presence of a mismatching "END" attribute and "getEnd()" result.
* Greatly simplified liftover logic for indels.
* Symbolic alleles are now lifted over as well, by simply carrying them along.
  • Loading branch information
Yossi Farjoun authored Apr 6, 2020
1 parent d755973 commit f20d6ec
Show file tree
Hide file tree
Showing 13 changed files with 297 additions and 220 deletions.
123 changes: 61 additions & 62 deletions src/main/java/picard/util/LiftoverUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,23 @@
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.ArrayUtils;
import picard.vcf.LiftoverVcf;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LiftoverUtils {
Expand All @@ -48,8 +59,8 @@ public class LiftoverUtils {
/**
* Default list of attributes that need to be reversed or dropped from the INFO field when alleles have been swapped.
*/
public static final Collection<String> DEFAULT_TAGS_TO_REVERSE = Arrays.asList("AF");
public static final Collection<String> DEFAULT_TAGS_TO_DROP = Arrays.asList("MAX_AF");
public static final Collection<String> DEFAULT_TAGS_TO_REVERSE = Collections.singletonList("AF");
public static final Collection<String> DEFAULT_TAGS_TO_DROP = Collections.singletonList("MAX_AF");

public static final Log log = Log.getInstance(LiftoverUtils.class);

Expand Down Expand Up @@ -88,8 +99,7 @@ public static VariantContext liftVariant(final VariantContext source, final Inte
// If any of the source alleles is symbolic, do not populate the END tag (protecting things like <NON_REF> from
// getting screwed up in reverse-complemented variants
if (source.hasAttribute(VCFConstants.END_KEY) && builder.getAlleles().stream().noneMatch(Allele::isSymbolic)) {
// TODO: add start() and stop() methods to the builder in htsjdk and use stop() here.
builder.attribute(VCFConstants.END_KEY, builder.make().getEnd());
builder.attribute(VCFConstants.END_KEY, (int) builder.getStop());
} else {
builder.rmAttribute(VCFConstants.END_KEY);
}
Expand Down Expand Up @@ -149,30 +159,21 @@ protected static VariantContextBuilder reverseComplementVariantContext(final Var
if (target.isPositiveStrand()) {
throw new IllegalArgumentException("This should only be called for negative strand liftovers");
}
final int start = target.getStart();
final int stop = target.getEnd();

final List<Allele> origAlleles = new ArrayList<>(source.getAlleles());
final VariantContextBuilder vcb = new VariantContextBuilder(source);

vcb.rmAttribute(VCFConstants.END_KEY);
vcb.chr(target.getContig());
vcb.rmAttribute(VCFConstants.END_KEY)
.chr(target.getContig())
.start(start)
.stop(stop)
.alleles(reverseComplementAlleles(origAlleles));

// By convention, indels are left aligned and include the base prior to that indel.
// When reverse complementing, will be necessary to include this additional base.
// This check prevents the extremely rare situation in which the indel occurs on the
// first base of the sequence
final boolean addToStart = source.isIndel() && target.getStart() > 1;

final int start = target.getStart() - (addToStart ? 1 : 0);
vcb.start(start);

final int stop = target.getEnd() - (addToStart ? 1 : 0);
vcb.stop(stop);

vcb.alleles(reverseComplementAlleles(origAlleles, target, refSeq, source.isIndel(), addToStart));

if (!source.isSNP()) {
if (isIndelForLiftover(source)) {
// check that the reverse complemented bases match the new reference
if (!referenceAlleleMatchesReferenceForIndel(vcb.getAlleles(), refSeq, start, stop)) {
if (referenceAlleleDiffersFromReferenceForIndel(vcb.getAlleles(), refSeq, start, stop)) {
return null;
}
leftAlignVariant(vcb, start, stop, vcb.getAlleles(), refSeq);
Expand All @@ -183,33 +184,28 @@ protected static VariantContextBuilder reverseComplementVariantContext(final Var
return vcb;
}

private static List<Allele> reverseComplementAlleles(final List<Allele> originalAlleles, final Interval target, final ReferenceSequence refSeq, final boolean isIndel, final boolean addToStart) {
final List<Allele> alleles = new ArrayList<>();

for (final Allele oldAllele : originalAlleles) {
alleles.add(LiftoverUtils.reverseComplement(oldAllele, target, refSeq, isIndel, addToStart));
private static boolean isIndelForLiftover(final VariantContext vc) {
final Allele ref = vc.getReference();
if (ref.length() != 1) {
return true;
}

return alleles;
return vc.getAlleles().stream()
.filter(a -> !a.isSymbolic())
.filter(a -> !a.equals(Allele.SPAN_DEL))
.anyMatch(a -> a.length() != 1);
}

private static Allele reverseComplement(final Allele oldAllele, final Interval target, final ReferenceSequence referenceSequence, final boolean isIndel, final boolean addToStart) {
private static List<Allele> reverseComplementAlleles(final List<Allele> originalAlleles) {
return originalAlleles
.stream()
.map(LiftoverUtils::reverseComplement)
.collect(Collectors.toList());
}

private static Allele reverseComplement(final Allele oldAllele) {
if (oldAllele.isSymbolic() || oldAllele.isNoCall() || oldAllele.equals(Allele.SPAN_DEL)) {
return oldAllele;
} else if (isIndel) {
// target.getStart is 1-based, reference bases are 0-based
final StringBuilder alleleBuilder = new StringBuilder(target.getEnd() - target.getStart() + 1);

if (addToStart) {
alleleBuilder.append((char) referenceSequence.getBases()[target.getStart() - 2]);
}
alleleBuilder.append(SequenceUtil.reverseComplement(oldAllele.getBaseString().substring(1, oldAllele.length())));
if (!addToStart) {
alleleBuilder.append((char) referenceSequence.getBases()[target.getEnd() - 1]);
}

return Allele.create(alleleBuilder.toString(), oldAllele.isReference());
} else {
return Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference());
}
Expand Down Expand Up @@ -335,13 +331,13 @@ public static VariantContext swapRefAlt(final VariantContext vc, final Collectio
* @param end the end position of the actual indel
* @return true if they match, false otherwise
*/
protected static boolean referenceAlleleMatchesReferenceForIndel(final List<Allele> alleles,
final ReferenceSequence referenceSequence,
final int start,
final int end) {
private static boolean referenceAlleleDiffersFromReferenceForIndel(final List<Allele> alleles,
final ReferenceSequence referenceSequence,
final int start,
final int end) {
final String refString = StringUtil.bytesToString(referenceSequence.getBases(), start - 1, end - start + 1);
final Allele refAllele = alleles.stream().filter(Allele::isReference).findAny().orElseThrow(() -> new IllegalStateException("Error: no reference allele was present"));
return (refString.equalsIgnoreCase(refAllele.getBaseString()));
return !refString.equalsIgnoreCase(refAllele.getBaseString());
}

/**
Expand All @@ -355,7 +351,7 @@ protected static boolean referenceAlleleMatchesReferenceForIndel(final List<Alle
protected static void leftAlignVariant(final VariantContextBuilder builder, final int start, final int end, final List<Allele> alleles, final ReferenceSequence referenceSequence) {

// make sure that referenceAllele matches reference
if (!referenceAlleleMatchesReferenceForIndel(alleles, referenceSequence, start, end)) {
if (referenceAlleleDiffersFromReferenceForIndel(alleles, referenceSequence, start, end)) {
throw new IllegalArgumentException(String.format("Reference allele doesn't match reference at %s:%d-%d", referenceSequence.getName(), start, end));
}

Expand All @@ -365,7 +361,9 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina

// Put each allele into the alleleBasesMap unless it is a spanning deletion.
// Spanning deletions are dealt with as a special case later in fixedAlleleMap.
alleles.stream().filter(a->!a.equals(Allele.SPAN_DEL)).forEach(a -> alleleBasesMap.put(a, a.getBases()));
alleles.stream()
.filter(a -> !a.equals(Allele.SPAN_DEL) && !a.isSymbolic())
.forEach(a -> alleleBasesMap.put(a, a.getBases()));

int theStart = start;
int theEnd = end;
Expand All @@ -379,9 +377,7 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
.collect(Collectors.groupingBy(a -> a[a.length - 1], Collectors.toSet()))
.size() == 1 && theEnd > 1) {
// 3. truncate rightmost nucleotide of each allele
for (final Allele allele : alleleBasesMap.keySet()) {
alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), true));
}
alleleBasesMap.replaceAll((a, v) -> truncateBase(alleleBasesMap.get(a), true));
changesInAlleles = true;
theEnd--;
// 4. end if
Expand Down Expand Up @@ -415,12 +411,10 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
alleleBasesMap.values().stream()
.collect(Collectors.groupingBy(a -> a[0], Collectors.toSet()))
.size() == 1
) {
) {

//9. truncate the leftmost base of the alleles
for (final Allele allele : alleleBasesMap.keySet()) {
alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), false));
}
alleleBasesMap.replaceAll((a, v) -> truncateBase(alleleBasesMap.get(a), false));
theStart++;
}

Expand All @@ -435,6 +429,12 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
// a special case here.
fixedAlleleMap.put(Allele.SPAN_DEL, Allele.SPAN_DEL);

// symbolic alleles cannot be left-aligned so they need to be put
// into the map directly
alleles.stream()
.filter(Allele::isSymbolic)
.forEach(a -> fixedAlleleMap.put(a, a));

//retain original order:
List<Allele> fixedAlleles = alleles.stream()
.map(fixedAlleleMap::get)
Expand All @@ -444,14 +444,13 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
}

private static byte[] truncateBase(final byte[] allele, final boolean truncateRightmost) {
return Arrays.copyOfRange(allele, truncateRightmost ? 0 : 1, truncateRightmost ?
allele.length - 1 :
allele.length);
return Arrays.copyOfRange(allele,
truncateRightmost ? 0 : 1,
truncateRightmost ? allele.length - 1 : allele.length);
}

//creates a new byte array with the base added at the beginning
private static byte[] extendOneBase(final byte[] bases, final byte base) {

final byte[] newBases = new byte[bases.length + 1];

System.arraycopy(bases, 0, newBases, 1, bases.length);
Expand Down
63 changes: 41 additions & 22 deletions src/main/java/picard/vcf/LiftoverVcf.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,28 @@
import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.*;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.*;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFRecordCodec;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
Expand All @@ -50,7 +64,15 @@
import java.io.File;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -530,29 +552,26 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re
}

// Check that the reference allele still agrees with the reference sequence
boolean mismatchesReference = false;
for (final Allele allele : vc.getAlleles()) {
if (allele.isReference()) {
final byte[] ref = refSeq.getBases();
final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd() - vc.getStart() + 1);

if (!refString.equalsIgnoreCase(allele.getBaseString())) {
// consider that the ref and the alt may have been swapped in a simple biallelic SNP
if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
if (RECOVER_SWAPPED_REF_ALT) {
totalTrackedAsSwapRefAlt++;
addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source);
return;
} else {
totalTrackedAsSwapRefAlt++;
}
}
mismatchesReference = true;
final boolean mismatchesReference;
final Allele allele = vc.getReference();
final byte[] ref = refSeq.getBases();
final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd() - vc.getStart() + 1);

if (!refString.equalsIgnoreCase(allele.getBaseString())) {
// consider that the ref and the alt may have been swapped in a simple biallelic SNP
if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
totalTrackedAsSwapRefAlt++;
if (RECOVER_SWAPPED_REF_ALT) {
addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source);
return;
}
break;
}
mismatchesReference = true;
} else {
mismatchesReference = false;
}


if (mismatchesReference) {
rejectedRecords.add(new VariantContextBuilder(source)
.filter(FILTER_MISMATCHING_REF_ALLELE)
Expand Down
6 changes: 5 additions & 1 deletion src/test/java/picard/util/LiftoverUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
import org.testng.annotations.Test;
import picard.vcf.VcfTestUtils;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;

/**
Expand Down
Loading

0 comments on commit f20d6ec

Please sign in to comment.