-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Size similarity linkage and bug fixes for SV matching tools #8257
Changes from 5 commits
4e7d682
c3b05b5
d3e8c7a
a071f88
b059070
fc4205d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -182,8 +182,12 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) { | |
final Boolean strandA = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandA(); | ||
final Boolean strandB = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandB(); | ||
|
||
final Set<String> filters = collapseFilters(items); | ||
final Double quality = collapseQuality(items); | ||
|
||
return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(), | ||
end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes, dictionary); | ||
end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes, | ||
filters, quality, dictionary); | ||
} | ||
|
||
protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Allele refAllele) { | ||
|
@@ -193,6 +197,21 @@ protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Alle | |
return alleles; | ||
} | ||
|
||
protected Set<String> collapseFilters(final List<SVCallRecord> items) { | ||
return items.stream() | ||
.map(SVCallRecord::getFilters) | ||
.flatMap(Collection::stream) | ||
.collect(Collectors.toSet()); | ||
} | ||
|
||
protected Double collapseQuality(final List<SVCallRecord> items) { | ||
if (items.size() == 1) { | ||
return items.get(0).getLog10PError(); | ||
} else { | ||
return null; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any case to be made to take the minimum here? I'm not sure what this is used for downstream yet but it seems like if we are collapsing variants that we assume to be the same our confidence that the variant is real should be the min error. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was conservative here by marking them as null. You could take the min, but I'd argue this could be misleading and you'd really want to go back and recalculate variant qualities after collapsing. |
||
} | ||
} | ||
|
||
/** | ||
* Asserts that the given records are valid for collapsing. | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,16 +38,20 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage | |
protected ClusteringParameters evidenceParams; | ||
|
||
public static final int INSERTION_ASSUMED_LENGTH_FOR_OVERLAP = 50; | ||
public static final int INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY = 1; | ||
|
||
public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8; | ||
public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0; | ||
public static final int DEFAULT_WINDOW_DEPTH_ONLY = 0; | ||
public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0; | ||
|
||
public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8; | ||
public static final double DEFAULT_SIZE_SIMILARITY_MIXED = 0; | ||
public static final int DEFAULT_WINDOW_MIXED = 1000; | ||
public static final double DEFAULT_SAMPLE_OVERLAP_MIXED = 0; | ||
|
||
public static final double DEFAULT_RECIPROCAL_OVERLAP_PESR = 0.5; | ||
public static final double DEFAULT_SIZE_SIMILARITY_PESR = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the reason for keeping these defaults all at zero? Is using size similarity something that only needs to happen in special cases? Otherwise I might try to set some good defaults for everyday use. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are what we use for ClusterBatch still. We'll have to go back and re-evaluate the effects of using size similarity, which right now I used more for SVConcordance in the new filtering workflows. |
||
public static final int DEFAULT_WINDOW_PESR = 500; | ||
public static final double DEFAULT_SAMPLE_OVERLAP_PESR = 0; | ||
|
||
|
@@ -56,16 +60,19 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage | |
public static final ClusteringParameters DEFAULT_DEPTH_ONLY_PARAMS = | ||
ClusteringParameters.createDepthParameters( | ||
DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY, | ||
DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY, | ||
DEFAULT_WINDOW_DEPTH_ONLY, | ||
DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY); | ||
public static final ClusteringParameters DEFAULT_MIXED_PARAMS = | ||
ClusteringParameters.createMixedParameters( | ||
DEFAULT_RECIPROCAL_OVERLAP_MIXED, | ||
DEFAULT_SIZE_SIMILARITY_MIXED, | ||
DEFAULT_WINDOW_MIXED, | ||
DEFAULT_SAMPLE_OVERLAP_MIXED); | ||
public static final ClusteringParameters DEFAULT_PESR_PARAMS = | ||
ClusteringParameters.createPesrParameters( | ||
DEFAULT_RECIPROCAL_OVERLAP_PESR, | ||
DEFAULT_SIZE_SIMILARITY_PESR, | ||
DEFAULT_WINDOW_PESR, | ||
DEFAULT_SAMPLE_OVERLAP_PESR); | ||
|
||
|
@@ -139,23 +146,27 @@ protected boolean strandsMatch(final SVCallRecord a, final SVCallRecord b) { | |
private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVCallRecord b, | ||
final ClusteringParameters params, | ||
final SAMSequenceDictionary dictionary) { | ||
|
||
// Contigs match | ||
if (!(a.getContigA().equals(b.getContigA()) && a.getContigB().equals(b.getContigB()))) { | ||
return false; | ||
} | ||
|
||
// Reciprocal overlap | ||
// Reciprocal overlap and size similarity | ||
// Check bypassed if both are inter-chromosomal | ||
final Boolean hasReciprocalOverlapAndSizeSimilarity; | ||
if (a.isIntrachromosomal()) { | ||
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLengthForOverlap(a) - 1); | ||
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLengthForOverlap(b) - 1); | ||
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1); | ||
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1); | ||
final boolean hasReciprocalOverlap = IntervalUtils.isReciprocalOverlap(intervalA, intervalB, params.getReciprocalOverlap()); | ||
if (params.requiresOverlapAndProximity() && !hasReciprocalOverlap) { | ||
final int sizeSimilarityLengthA = getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY); | ||
final int sizeSimilarityLengthB = getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY); | ||
final boolean hasSizeSimilarity = Math.min(sizeSimilarityLengthA, sizeSimilarityLengthB) / (double) Math.max(sizeSimilarityLengthA, sizeSimilarityLengthB) >= params.getSizeSimilarity(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I might extract this into a little method, if only to make it easier for someone scanning the code to quickly find the definition of size similarity based on the class method list. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Did the same for reciprocal overlap and breakend window as well. |
||
hasReciprocalOverlapAndSizeSimilarity = hasReciprocalOverlap && hasSizeSimilarity; | ||
if (params.requiresOverlapAndProximity() && !hasReciprocalOverlapAndSizeSimilarity) { | ||
return false; | ||
} else if (!params.requiresOverlapAndProximity() && hasReciprocalOverlap) { | ||
return true; | ||
} | ||
} else { | ||
hasReciprocalOverlapAndSizeSimilarity = null; | ||
} | ||
|
||
// Breakend proximity | ||
|
@@ -173,30 +184,31 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC | |
} | ||
final SimpleInterval intervalB1 = b.getPositionAInterval(); | ||
final SimpleInterval intervalB2 = b.getPositionBInterval(); | ||
if (!(intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2))) { | ||
return false; | ||
final boolean hasBreakendProximity = intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2); | ||
// Use short-circuiting statements since sample overlap is the least scalable / slowest check | ||
if (hasReciprocalOverlapAndSizeSimilarity == null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not make this a lower case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. I guess I'd advocating for just re-checking the intrachromosal status explicitly here again here instead of using null, but it's not a big deal either way. |
||
return hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap()); | ||
} else { | ||
if (params.requiresOverlapAndProximity()) { | ||
return hasReciprocalOverlapAndSizeSimilarity && hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap()); | ||
} else { | ||
return (hasReciprocalOverlapAndSizeSimilarity || hasBreakendProximity) && hasSampleOverlap(a, b, params.getSampleOverlap()); | ||
} | ||
} | ||
|
||
// Sample overlap (possibly the least scalable check) | ||
return hasSampleOverlap(a, b, params.getSampleOverlap()); | ||
} | ||
|
||
/** | ||
* Gets event length used for overlap testing. | ||
*/ | ||
private static int getLengthForOverlap(final SVCallRecord record) { | ||
Utils.validate(record.isIntrachromosomal(), "Record even must be intra-chromosomal"); | ||
final Integer length = record.getLength(); | ||
if (length == null) { | ||
if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS | ||
|| record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX){ | ||
return INSERTION_ASSUMED_LENGTH_FOR_OVERLAP; | ||
} else { | ||
return 1; | ||
} | ||
private static int getLength(final SVCallRecord record, final int missingInsertionLength) { | ||
Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal"); | ||
if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) { | ||
return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1); | ||
} else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) { | ||
return record.getPositionB() - record.getPositionA() + 1; | ||
} else { | ||
// TODO lengths less than 1 shouldn't be valid | ||
return Math.max(record.getLength(), 1); | ||
return Math.max(record.getLength() == null ? 1 : record.getLength(), 1); | ||
} | ||
} | ||
|
||
|
@@ -227,7 +239,7 @@ private static int getMaxClusterableStartingPositionWithParams(final SVCallRecor | |
|
||
// Reciprocal overlap window | ||
final int maxPositionByOverlap; | ||
final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLengthForOverlap(call)); | ||
final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLength(call, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP)); | ||
maxPositionByOverlap = Math.min(maxPosition, contigLength); | ||
|
||
if (params.requiresOverlapAndProximity()) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have some insertion records with
END - POS > 1
to indicate microdeletions at the source? Do we definitely always want to reset the END of those here?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that's true, but so far the GATK tools we are using in the pipeline would not affect this where it matters (GenerateBatchMetrics through GenotypeBatch).
In the future, however, I'd like to use POS/END to mark the nominal location of the insertion but separate INFO tags to indicate the left/right SR coordinates. This will avoid difficulties with making sure POS precedes END, which really should not be used for marking the SR positions IMO according to the VCF spec.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Totally agreed, we should have separate info fields for SR positions.