Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Size similarity linkage and bug fixes for SV matching tools #8257

Merged
merged 6 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class SVCallRecord implements SVLocatable {
private final List<Allele> altAlleles;
private final GenotypesContext genotypes;
private final Map<String,Object> attributes;
private final Set<String> filters;
private final Double log10PError;

// CPX related fields
private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;
Expand All @@ -70,8 +72,10 @@ public SVCallRecord(final String id,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String,Object> attributes,
final Set<String> filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes);
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}

Expand All @@ -88,11 +92,14 @@ protected SVCallRecord(final String id,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String, Object> attributes) {
final Map<String, Object> attributes,
final Set<String> filters,
final Double log10PError) {
Utils.nonNull(algorithms);
Utils.nonNull(alleles);
Utils.nonNull(genotypes);
Utils.nonNull(attributes);
Utils.nonNull(filters);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
Expand All @@ -112,6 +119,8 @@ protected SVCallRecord(final String id,
final Pair<Boolean, Boolean> strands = inferStrands(type, strandA, strandB);
this.strandA = strands.getLeft();
this.strandB = strands.getRight();
this.filters = filters;
this.log10PError = log10PError;
}

/**
Expand Down Expand Up @@ -366,4 +375,16 @@ public SimpleInterval getPositionAInterval() {
public SimpleInterval getPositionBInterval() {
return new SimpleInterval(contigB, positionB, positionB);
}

public Set<String> getFilters() {
return filters;
}

public Double getLog10PError() {
return log10PError;
}

public GATKSVVCFConstants.ComplexVariantSubtype getCpxSubtype() {
return cpxSubtype;
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.sv;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
Expand Down Expand Up @@ -99,15 +100,27 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
final GenotypesContext genotypes = GenotypesContext.create(record.getGenotypes().size());
for (final Genotype g : record.getGenotypes()) {
genotypes.add(sanitizeEmptyGenotype(g));
if (!record.getFilters().isEmpty()) {
builder.filters(record.getFilters());
}
if (record.getLog10PError() != null) {
builder.log10PError(record.getLog10PError());
}

// htsjdk vcf encoder does not allow genotypes to have empty alleles
builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList()));
sanitizeNullAttributes(builder);
return builder;
}

/**
* Removes entries with null values. This can cause problems with vcf parsing, for example Integer fields
* set to "." cause exceptions.
*/
private static void sanitizeNullAttributes(final VariantContextBuilder builder) {
builder.attributes(Maps.filterValues(builder.getAttributes(), o -> o != null));
}

/**
* Adds NO_CALL allele if empty
*/
Expand Down Expand Up @@ -170,12 +183,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes());
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map<String, Object> attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr);
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}

/**
Expand Down Expand Up @@ -287,10 +300,10 @@ public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallReco
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}

Expand Down Expand Up @@ -372,12 +385,18 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
}
} else {
contigB = contigA;
positionB = variant.getEnd();
// Force reset of END coordinate
if (type.equals(StructuralVariantType.INS)) {
positionB = positionA + 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have some insertion records with END - POS > 1 to indicate microdeletions at the source? Do we definitely always want to reset the END of those here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that's true, but so far the GATK tools we are using in the pipeline would not affect this where it matters (GenerateBatchMetrics through GenotypeBatch).

In the future, however, I'd like to use POS/END to mark the nominal location of the insertion but separate INFO tags to indicate the left/right SR coordinates. This will avoid difficulties with making sure POS precedes END, which really should not be used for marking the SR positions IMO according to the VCF spec.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally agreed, we should have separate info fields for SR positions.

} else {
positionB = variant.getEnd();
}
}
final Double log10PError = variant.hasLog10PError() ? variant.getLog10PError() : null;

final Map<String, Object> sanitizedAttributes = sanitizeAttributes(attributes);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype, length, algorithms,
variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes);
variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, variant.getFilters(), log10PError);
}

private static Map<String, Object> sanitizeAttributes(final Map<String, Object> attributes) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,12 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) {
final Boolean strandA = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandA();
final Boolean strandB = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandB();

final Set<String> filters = collapseFilters(items);
final Double quality = collapseQuality(items);

return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(),
end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes, dictionary);
end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes,
filters, quality, dictionary);
}

protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Allele refAllele) {
Expand All @@ -193,6 +197,21 @@ protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Alle
return alleles;
}

protected Set<String> collapseFilters(final List<SVCallRecord> items) {
return items.stream()
.map(SVCallRecord::getFilters)
.flatMap(Collection::stream)
.collect(Collectors.toSet());
}

protected Double collapseQuality(final List<SVCallRecord> items) {
if (items.size() == 1) {
return items.get(0).getLog10PError();
} else {
return null;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any case to be made to take the minimum here? I'm not sure what this is used for downstream yet but it seems like if we are collapsing variants that we assume to be the same our confidence that the variant is real should be the min error.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was conservative here by marking them as null. You could take the min, but I'd argue this could be misleading and you'd really want to go back and recalculate variant qualities after collapsing.

}
}

/**
* Asserts that the given records are valid for collapsing.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,20 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage
protected ClusteringParameters evidenceParams;

public static final int INSERTION_ASSUMED_LENGTH_FOR_OVERLAP = 50;
public static final int INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY = 1;

public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8;
public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0;
public static final int DEFAULT_WINDOW_DEPTH_ONLY = 0;
public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0;

public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8;
public static final double DEFAULT_SIZE_SIMILARITY_MIXED = 0;
public static final int DEFAULT_WINDOW_MIXED = 1000;
public static final double DEFAULT_SAMPLE_OVERLAP_MIXED = 0;

public static final double DEFAULT_RECIPROCAL_OVERLAP_PESR = 0.5;
public static final double DEFAULT_SIZE_SIMILARITY_PESR = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason for keeping these defaults all at zero? Is using size similarity something that only needs to happen in special cases? Otherwise I might try to set some good defaults for everyday use.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are what we use for ClusterBatch still. We'll have to go back and re-evaluate the effects of using size similarity, which right now I used more for SVConcordance in the new filtering workflows.

public static final int DEFAULT_WINDOW_PESR = 500;
public static final double DEFAULT_SAMPLE_OVERLAP_PESR = 0;

Expand All @@ -56,16 +60,19 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage
public static final ClusteringParameters DEFAULT_DEPTH_ONLY_PARAMS =
ClusteringParameters.createDepthParameters(
DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY,
DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY,
DEFAULT_WINDOW_DEPTH_ONLY,
DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY);
public static final ClusteringParameters DEFAULT_MIXED_PARAMS =
ClusteringParameters.createMixedParameters(
DEFAULT_RECIPROCAL_OVERLAP_MIXED,
DEFAULT_SIZE_SIMILARITY_MIXED,
DEFAULT_WINDOW_MIXED,
DEFAULT_SAMPLE_OVERLAP_MIXED);
public static final ClusteringParameters DEFAULT_PESR_PARAMS =
ClusteringParameters.createPesrParameters(
DEFAULT_RECIPROCAL_OVERLAP_PESR,
DEFAULT_SIZE_SIMILARITY_PESR,
DEFAULT_WINDOW_PESR,
DEFAULT_SAMPLE_OVERLAP_PESR);

Expand Down Expand Up @@ -139,23 +146,27 @@ protected boolean strandsMatch(final SVCallRecord a, final SVCallRecord b) {
private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVCallRecord b,
final ClusteringParameters params,
final SAMSequenceDictionary dictionary) {

// Contigs match
if (!(a.getContigA().equals(b.getContigA()) && a.getContigB().equals(b.getContigB()))) {
return false;
}

// Reciprocal overlap
// Reciprocal overlap and size similarity
// Check bypassed if both are inter-chromosomal
final Boolean hasReciprocalOverlapAndSizeSimilarity;
if (a.isIntrachromosomal()) {
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLengthForOverlap(a) - 1);
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLengthForOverlap(b) - 1);
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
final boolean hasReciprocalOverlap = IntervalUtils.isReciprocalOverlap(intervalA, intervalB, params.getReciprocalOverlap());
if (params.requiresOverlapAndProximity() && !hasReciprocalOverlap) {
final int sizeSimilarityLengthA = getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
final int sizeSimilarityLengthB = getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
final boolean hasSizeSimilarity = Math.min(sizeSimilarityLengthA, sizeSimilarityLengthB) / (double) Math.max(sizeSimilarityLengthA, sizeSimilarityLengthB) >= params.getSizeSimilarity();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might extract this into a little method, if only to make it easier for someone scanning the code to quickly find the definition of size similarity based on the class method list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Did the same for reciprocal overlap and breakend window as well.

hasReciprocalOverlapAndSizeSimilarity = hasReciprocalOverlap && hasSizeSimilarity;
if (params.requiresOverlapAndProximity() && !hasReciprocalOverlapAndSizeSimilarity) {
return false;
} else if (!params.requiresOverlapAndProximity() && hasReciprocalOverlap) {
return true;
}
} else {
hasReciprocalOverlapAndSizeSimilarity = null;
}

// Breakend proximity
Expand All @@ -173,30 +184,31 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC
}
final SimpleInterval intervalB1 = b.getPositionAInterval();
final SimpleInterval intervalB2 = b.getPositionBInterval();
if (!(intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2))) {
return false;
final boolean hasBreakendProximity = intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2);
// Use short-circuiting statements since sample overlap is the least scalable / slowest check
if (hasReciprocalOverlapAndSizeSimilarity == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not make this a lower case boolean and just use false? This clause won't trigger if it's Boolean(false) which seems counterintuitive.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I used null as a distinct case where the variants are interchromosomal, in which case the logic is different from False as with intrachromosomal variants. I found the code to be clearest this way (it's a bit complicated since there are 3 booleans) although maybe there is a better way to write it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I guess I'd advocating for just re-checking the intrachromosal status explicitly here again here instead of using null, but it's not a big deal either way.

return hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
} else {
if (params.requiresOverlapAndProximity()) {
return hasReciprocalOverlapAndSizeSimilarity && hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
} else {
return (hasReciprocalOverlapAndSizeSimilarity || hasBreakendProximity) && hasSampleOverlap(a, b, params.getSampleOverlap());
}
}

// Sample overlap (possibly the least scalable check)
return hasSampleOverlap(a, b, params.getSampleOverlap());
}

/**
* Gets event length used for overlap testing.
*/
private static int getLengthForOverlap(final SVCallRecord record) {
Utils.validate(record.isIntrachromosomal(), "Record even must be intra-chromosomal");
final Integer length = record.getLength();
if (length == null) {
if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
|| record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX){
return INSERTION_ASSUMED_LENGTH_FOR_OVERLAP;
} else {
return 1;
}
private static int getLength(final SVCallRecord record, final int missingInsertionLength) {
Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal");
if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1);
} else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) {
return record.getPositionB() - record.getPositionA() + 1;
} else {
// TODO lengths less than 1 shouldn't be valid
return Math.max(record.getLength(), 1);
return Math.max(record.getLength() == null ? 1 : record.getLength(), 1);
}
}

Expand Down Expand Up @@ -227,7 +239,7 @@ private static int getMaxClusterableStartingPositionWithParams(final SVCallRecor

// Reciprocal overlap window
final int maxPositionByOverlap;
final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLengthForOverlap(call));
final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLength(call, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP));
maxPositionByOverlap = Math.min(maxPosition, contigLength);

if (params.requiresOverlapAndProximity()) {
Expand Down
Loading