Skip to content

Commit

Permalink
Size similarity linkage and bug fixes for SV matching tools (#8257)
Browse files Browse the repository at this point in the history
* Add size similarity to SVCluster
* Check that vcf contigs sorted the same, with integration testing
* Add FILTER and QUAL fields to SVCallRecord
  • Loading branch information
mwalker174 authored May 15, 2023
1 parent 642ba63 commit 18edcd3
Show file tree
Hide file tree
Showing 26 changed files with 442 additions and 171 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class SVCallRecord implements SVLocatable {
private final List<Allele> altAlleles;
private final GenotypesContext genotypes;
private final Map<String,Object> attributes;
private final Set<String> filters;
private final Double log10PError;

// CPX related fields
private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;
Expand All @@ -70,8 +72,10 @@ public SVCallRecord(final String id,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String,Object> attributes,
final Set<String> filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes);
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}

Expand All @@ -88,11 +92,14 @@ protected SVCallRecord(final String id,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String, Object> attributes) {
final Map<String, Object> attributes,
final Set<String> filters,
final Double log10PError) {
Utils.nonNull(algorithms);
Utils.nonNull(alleles);
Utils.nonNull(genotypes);
Utils.nonNull(attributes);
Utils.nonNull(filters);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
Expand All @@ -112,6 +119,8 @@ protected SVCallRecord(final String id,
final Pair<Boolean, Boolean> strands = inferStrands(type, strandA, strandB);
this.strandA = strands.getLeft();
this.strandB = strands.getRight();
this.filters = filters;
this.log10PError = log10PError;
}

/**
Expand Down Expand Up @@ -366,4 +375,16 @@ public SimpleInterval getPositionAInterval() {
public SimpleInterval getPositionBInterval() {
return new SimpleInterval(contigB, positionB, positionB);
}

public Set<String> getFilters() {
return filters;
}

public Double getLog10PError() {
return log10PError;
}

public GATKSVVCFConstants.ComplexVariantSubtype getCpxSubtype() {
return cpxSubtype;
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.sv;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
Expand Down Expand Up @@ -99,15 +100,27 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
final GenotypesContext genotypes = GenotypesContext.create(record.getGenotypes().size());
for (final Genotype g : record.getGenotypes()) {
genotypes.add(sanitizeEmptyGenotype(g));
if (!record.getFilters().isEmpty()) {
builder.filters(record.getFilters());
}
if (record.getLog10PError() != null) {
builder.log10PError(record.getLog10PError());
}

// htsjdk vcf encoder does not allow genotypes to have empty alleles
builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList()));
sanitizeNullAttributes(builder);
return builder;
}

/**
* Removes entries with null values. This can cause problems with vcf parsing, for example Integer fields
* set to "." cause exceptions.
*/
private static void sanitizeNullAttributes(final VariantContextBuilder builder) {
builder.attributes(Maps.filterValues(builder.getAttributes(), o -> o != null));
}

/**
* Adds NO_CALL allele if empty
*/
Expand Down Expand Up @@ -170,12 +183,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes());
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map<String, Object> attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr);
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}

/**
Expand Down Expand Up @@ -287,10 +300,10 @@ public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallReco
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}

Expand Down Expand Up @@ -372,12 +385,18 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
}
} else {
contigB = contigA;
positionB = variant.getEnd();
// Force reset of END coordinate
if (type.equals(StructuralVariantType.INS)) {
positionB = positionA + 1;
} else {
positionB = variant.getEnd();
}
}
final Double log10PError = variant.hasLog10PError() ? variant.getLog10PError() : null;

final Map<String, Object> sanitizedAttributes = sanitizeAttributes(attributes);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype, length, algorithms,
variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes);
variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, variant.getFilters(), log10PError);
}

private static Map<String, Object> sanitizeAttributes(final Map<String, Object> attributes) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,12 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) {
final Boolean strandA = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandA();
final Boolean strandB = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandB();

final Set<String> filters = collapseFilters(items);
final Double quality = collapseQuality(items);

return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(),
end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes, dictionary);
end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes,
filters, quality, dictionary);
}

protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Allele refAllele) {
Expand All @@ -193,6 +197,21 @@ protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Alle
return alleles;
}

protected Set<String> collapseFilters(final List<SVCallRecord> items) {
return items.stream()
.map(SVCallRecord::getFilters)
.flatMap(Collection::stream)
.collect(Collectors.toSet());
}

protected Double collapseQuality(final List<SVCallRecord> items) {
if (items.size() == 1) {
return items.get(0).getLog10PError();
} else {
return null;
}
}

/**
* Asserts that the given records are valid for collapsing.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,20 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage
protected ClusteringParameters evidenceParams;

public static final int INSERTION_ASSUMED_LENGTH_FOR_OVERLAP = 50;
public static final int INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY = 1;

public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8;
public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0;
public static final int DEFAULT_WINDOW_DEPTH_ONLY = 0;
public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0;

public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8;
public static final double DEFAULT_SIZE_SIMILARITY_MIXED = 0;
public static final int DEFAULT_WINDOW_MIXED = 1000;
public static final double DEFAULT_SAMPLE_OVERLAP_MIXED = 0;

public static final double DEFAULT_RECIPROCAL_OVERLAP_PESR = 0.5;
public static final double DEFAULT_SIZE_SIMILARITY_PESR = 0;
public static final int DEFAULT_WINDOW_PESR = 500;
public static final double DEFAULT_SAMPLE_OVERLAP_PESR = 0;

Expand All @@ -56,16 +60,19 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage
public static final ClusteringParameters DEFAULT_DEPTH_ONLY_PARAMS =
ClusteringParameters.createDepthParameters(
DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY,
DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY,
DEFAULT_WINDOW_DEPTH_ONLY,
DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY);
public static final ClusteringParameters DEFAULT_MIXED_PARAMS =
ClusteringParameters.createMixedParameters(
DEFAULT_RECIPROCAL_OVERLAP_MIXED,
DEFAULT_SIZE_SIMILARITY_MIXED,
DEFAULT_WINDOW_MIXED,
DEFAULT_SAMPLE_OVERLAP_MIXED);
public static final ClusteringParameters DEFAULT_PESR_PARAMS =
ClusteringParameters.createPesrParameters(
DEFAULT_RECIPROCAL_OVERLAP_PESR,
DEFAULT_SIZE_SIMILARITY_PESR,
DEFAULT_WINDOW_PESR,
DEFAULT_SAMPLE_OVERLAP_PESR);

Expand Down Expand Up @@ -139,28 +146,55 @@ protected boolean strandsMatch(final SVCallRecord a, final SVCallRecord b) {
private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVCallRecord b,
final ClusteringParameters params,
final SAMSequenceDictionary dictionary) {

// Contigs match
if (!(a.getContigA().equals(b.getContigA()) && a.getContigB().equals(b.getContigB()))) {
return false;
}

// Reciprocal overlap
// Reciprocal overlap and size similarity
// Check bypassed if both are inter-chromosomal
final Boolean hasReciprocalOverlapAndSizeSimilarity;
if (a.isIntrachromosomal()) {
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLengthForOverlap(a) - 1);
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLengthForOverlap(b) - 1);
final boolean hasReciprocalOverlap = IntervalUtils.isReciprocalOverlap(intervalA, intervalB, params.getReciprocalOverlap());
if (params.requiresOverlapAndProximity() && !hasReciprocalOverlap) {
final boolean hasReciprocalOverlap = testReciprocalOverlap(a, b, params.getReciprocalOverlap());
final boolean hasSizeSimilarity = testSizeSimilarity(a, b, params.getSizeSimilarity());
hasReciprocalOverlapAndSizeSimilarity = hasReciprocalOverlap && hasSizeSimilarity;
if (params.requiresOverlapAndProximity() && !hasReciprocalOverlapAndSizeSimilarity) {
return false;
} else if (!params.requiresOverlapAndProximity() && hasReciprocalOverlap) {
return true;
}
} else {
hasReciprocalOverlapAndSizeSimilarity = null;
}

// Breakend proximity
final SimpleInterval intervalA1 = a.getPositionAInterval().expandWithinContig(params.getWindow(), dictionary);
final SimpleInterval intervalA2 = a.getPositionBInterval().expandWithinContig(params.getWindow(), dictionary);
final boolean hasBreakendProximity = testBreakendProximity(a, b, params.getWindow(), dictionary);
// Use short-circuiting statements since sample overlap is the least scalable / slowest check
if (hasReciprocalOverlapAndSizeSimilarity == null) {
return hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
} else {
if (params.requiresOverlapAndProximity()) {
return hasReciprocalOverlapAndSizeSimilarity && hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
} else {
return (hasReciprocalOverlapAndSizeSimilarity || hasBreakendProximity) && hasSampleOverlap(a, b, params.getSampleOverlap());
}
}
}

private static boolean testReciprocalOverlap(final SVCallRecord a, final SVCallRecord b, final double threshold) {
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
return IntervalUtils.isReciprocalOverlap(intervalA, intervalB, threshold);
}

private static boolean testSizeSimilarity(final SVCallRecord a, final SVCallRecord b, final double threshold) {
final int sizeSimilarityLengthA = getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
final int sizeSimilarityLengthB = getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
return Math.min(sizeSimilarityLengthA, sizeSimilarityLengthB) / (double) Math.max(sizeSimilarityLengthA, sizeSimilarityLengthB) >= threshold;
}

private static boolean testBreakendProximity(final SVCallRecord a, final SVCallRecord b, final int window,
final SAMSequenceDictionary dictionary) {
final SimpleInterval intervalA1 = a.getPositionAInterval().expandWithinContig(window, dictionary);
final SimpleInterval intervalA2 = a.getPositionBInterval().expandWithinContig(window, dictionary);
if (intervalA1 == null) {
logger.warn("Invalid start position " + a.getPositionA() + " in record " + a.getId() +
" - record will not be matched");
Expand All @@ -173,30 +207,21 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC
}
final SimpleInterval intervalB1 = b.getPositionAInterval();
final SimpleInterval intervalB2 = b.getPositionBInterval();
if (!(intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2))) {
return false;
}

// Sample overlap (possibly the least scalable check)
return hasSampleOverlap(a, b, params.getSampleOverlap());
return intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2);
}

/**
* Gets event length used for overlap testing.
*/
private static int getLengthForOverlap(final SVCallRecord record) {
Utils.validate(record.isIntrachromosomal(), "Record even must be intra-chromosomal");
final Integer length = record.getLength();
if (length == null) {
if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
|| record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX){
return INSERTION_ASSUMED_LENGTH_FOR_OVERLAP;
} else {
return 1;
}
private static int getLength(final SVCallRecord record, final int missingInsertionLength) {
Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal");
if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1);
} else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) {
return record.getPositionB() - record.getPositionA() + 1;
} else {
// TODO lengths less than 1 shouldn't be valid
return Math.max(record.getLength(), 1);
return Math.max(record.getLength() == null ? 1 : record.getLength(), 1);
}
}

Expand Down Expand Up @@ -227,7 +252,7 @@ private static int getMaxClusterableStartingPositionWithParams(final SVCallRecor

// Reciprocal overlap window
final int maxPositionByOverlap;
final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLengthForOverlap(call));
final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLength(call, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP));
maxPositionByOverlap = Math.min(maxPosition, contigLength);

if (params.requiresOverlapAndProximity()) {
Expand Down
Loading

0 comments on commit 18edcd3

Please sign in to comment.