broadinstitute · mwalker174 · May 15, 2023 · Nov 23, 2022 · Mar 30, 2023 · Apr 30, 2023
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -52,6 +52,8 @@ public class SVCallRecord implements SVLocatable {
     private final List<Allele> altAlleles;
     private final GenotypesContext genotypes;
     private final Map<String,Object> attributes;
+    private final Set<String> filters;
+    private final Double log10PError;
 
     // CPX related fields
     private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;
@@ -70,8 +72,10 @@ public SVCallRecord(final String id,
                         final List<Allele> alleles,
                         final List<Genotype> genotypes,
                         final Map<String,Object> attributes,
+                        final Set<String> filters,
+                        final Double log10PError,
                         final SAMSequenceDictionary dictionary) {
-        this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes);
+        this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
         validateCoordinates(dictionary);
     }
 
@@ -88,11 +92,14 @@ protected SVCallRecord(final String id,
                            final List<String> algorithms,
                            final List<Allele> alleles,
                            final List<Genotype> genotypes,
-                           final Map<String, Object> attributes) {
+                           final Map<String, Object> attributes,
+                           final Set<String> filters,
+                           final Double log10PError) {
         Utils.nonNull(algorithms);
         Utils.nonNull(alleles);
         Utils.nonNull(genotypes);
         Utils.nonNull(attributes);
+        Utils.nonNull(filters);
         this.id = Utils.nonNull(id);
         this.contigA = contigA;
         this.positionA = positionA;
@@ -112,6 +119,8 @@ protected SVCallRecord(final String id,
         final Pair<Boolean, Boolean> strands = inferStrands(type, strandA, strandB);
         this.strandA = strands.getLeft();
         this.strandB = strands.getRight();
+        this.filters = filters;
+        this.log10PError = log10PError;
     }
 
     /**
@@ -366,4 +375,16 @@ public SimpleInterval getPositionAInterval() {
     public SimpleInterval getPositionBInterval() {
         return new SimpleInterval(contigB, positionB, positionB);
     }
+
+    public Set<String> getFilters() {
+        return filters;
+    }
+
+    public Double getLog10PError() {
+        return log10PError;
+    }
+
+    public GATKSVVCFConstants.ComplexVariantSubtype getCpxSubtype() {
+        return cpxSubtype;
+    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
@@ -1,5 +1,6 @@
 package org.broadinstitute.hellbender.tools.sv;
 
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.util.Locatable;
@@ -99,15 +100,27 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
                 && record.getStrandA() != null && record.getStrandB() != null) {
             builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
         }
-        final GenotypesContext genotypes = GenotypesContext.create(record.getGenotypes().size());
-        for (final Genotype g : record.getGenotypes()) {
-            genotypes.add(sanitizeEmptyGenotype(g));
+        if (!record.getFilters().isEmpty()) {
+            builder.filters(record.getFilters());
         }
+        if (record.getLog10PError() != null) {
+            builder.log10PError(record.getLog10PError());
+        }
+
         // htsjdk vcf encoder does not allow genotypes to have empty alleles
         builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList()));
+        sanitizeNullAttributes(builder);
         return builder;
     }
 
+    /**
+     * Removes entries with null values. This can cause problems with vcf parsing, for example Integer fields
+     * set to "." cause exceptions.
+     */
+    private static void sanitizeNullAttributes(final VariantContextBuilder builder) {
+        builder.attributes(Maps.filterValues(builder.getAttributes(), o -> o != null));
+    }
+
     /**
      * Adds NO_CALL allele if empty
      */
@@ -170,12 +183,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
     public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
         return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
                 record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
-                genotypes, record.getAttributes());
+                genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
     }
     public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map<String, Object> attr) {
         return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
                 record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
-                record.getGenotypes(), attr);
+                record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
     }
 
     /**
@@ -287,10 +300,10 @@ public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallReco
         Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
         final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
                 record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
-                record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
+                record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
         final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
                 record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
-                record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
+                record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
         return Stream.of(positiveBreakend, negativeBreakend);
     }
 
@@ -372,12 +385,18 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
             }
         } else {
             contigB = contigA;
-            positionB = variant.getEnd();
+            // Force reset of END coordinate
+            if (type.equals(StructuralVariantType.INS)) {
+                positionB = positionA + 1;
+            } else {
+                positionB = variant.getEnd();
+            }
         }
+        final Double log10PError = variant.hasLog10PError() ? variant.getLog10PError() : null;
 
         final Map<String, Object> sanitizedAttributes = sanitizeAttributes(attributes);
         return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype, length, algorithms,
-                variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes);
+                variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, variant.getFilters(), log10PError);
     }
 
     private static Map<String, Object> sanitizeAttributes(final Map<String, Object> attributes) {

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
@@ -182,8 +182,12 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) {
         final Boolean strandA = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandA();
         final Boolean strandB = type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV ? null : representative.getStrandB();
 
+        final Set<String> filters = collapseFilters(items);
+        final Double quality = collapseQuality(items);
+
         return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(),
-                end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes, dictionary);
+                end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes,
+                filters, quality, dictionary);
     }
 
     protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Allele refAllele) {
@@ -193,6 +197,21 @@ protected List<Allele> collapseAlleles(final List<Allele> altAlleles, final Alle
         return alleles;
     }
 
+    protected Set<String> collapseFilters(final List<SVCallRecord> items) {
+        return items.stream()
+                .map(SVCallRecord::getFilters)
+                .flatMap(Collection::stream)
+                .collect(Collectors.toSet());
+    }
+
+    protected Double collapseQuality(final List<SVCallRecord> items) {
+        if (items.size() == 1) {
+            return items.get(0).getLog10PError();
+        } else {
+            return null;
+        }
+    }
+
     /**
      * Asserts that the given records are valid for collapsing.
      */

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java
@@ -38,16 +38,20 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage
     protected ClusteringParameters evidenceParams;
 
     public static final int INSERTION_ASSUMED_LENGTH_FOR_OVERLAP = 50;
+    public static final int INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY = 1;
 
     public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8;
+    public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0;
     public static final int DEFAULT_WINDOW_DEPTH_ONLY = 0;
     public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0;
 
     public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8;
+    public static final double DEFAULT_SIZE_SIMILARITY_MIXED = 0;
     public static final int DEFAULT_WINDOW_MIXED = 1000;
     public static final double DEFAULT_SAMPLE_OVERLAP_MIXED = 0;
 
     public static final double DEFAULT_RECIPROCAL_OVERLAP_PESR = 0.5;
+    public static final double DEFAULT_SIZE_SIMILARITY_PESR = 0;
     public static final int DEFAULT_WINDOW_PESR = 500;
     public static final double DEFAULT_SAMPLE_OVERLAP_PESR = 0;
 
@@ -56,16 +60,19 @@ public class CanonicalSVLinkage<T extends SVCallRecord> extends SVClusterLinkage
     public static final ClusteringParameters DEFAULT_DEPTH_ONLY_PARAMS =
             ClusteringParameters.createDepthParameters(
                     DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY,
+                    DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY,
                     DEFAULT_WINDOW_DEPTH_ONLY,
                     DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY);
     public static final ClusteringParameters DEFAULT_MIXED_PARAMS =
             ClusteringParameters.createMixedParameters(
                     DEFAULT_RECIPROCAL_OVERLAP_MIXED,
+                    DEFAULT_SIZE_SIMILARITY_MIXED,
                     DEFAULT_WINDOW_MIXED,
                     DEFAULT_SAMPLE_OVERLAP_MIXED);
     public static final ClusteringParameters DEFAULT_PESR_PARAMS =
             ClusteringParameters.createPesrParameters(
                     DEFAULT_RECIPROCAL_OVERLAP_PESR,
+                    DEFAULT_SIZE_SIMILARITY_PESR,
                     DEFAULT_WINDOW_PESR,
                     DEFAULT_SAMPLE_OVERLAP_PESR);
 
@@ -139,23 +146,27 @@ protected boolean strandsMatch(final SVCallRecord a, final SVCallRecord b) {
     private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVCallRecord b,
                                                      final ClusteringParameters params,
                                                      final SAMSequenceDictionary dictionary) {
-
         // Contigs match
         if (!(a.getContigA().equals(b.getContigA()) && a.getContigB().equals(b.getContigB()))) {
             return false;
         }
 
-        // Reciprocal overlap
+        // Reciprocal overlap and size similarity
         // Check bypassed if both are inter-chromosomal
+        final Boolean hasReciprocalOverlapAndSizeSimilarity;
         if (a.isIntrachromosomal()) {
-            final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLengthForOverlap(a) - 1);
-            final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLengthForOverlap(b) - 1);
+            final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
+            final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
             final boolean hasReciprocalOverlap = IntervalUtils.isReciprocalOverlap(intervalA, intervalB, params.getReciprocalOverlap());
-            if (params.requiresOverlapAndProximity() && !hasReciprocalOverlap) {
+            final int sizeSimilarityLengthA = getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
+            final int sizeSimilarityLengthB = getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
+            final boolean hasSizeSimilarity = Math.min(sizeSimilarityLengthA, sizeSimilarityLengthB) / (double) Math.max(sizeSimilarityLengthA, sizeSimilarityLengthB) >= params.getSizeSimilarity();
+            hasReciprocalOverlapAndSizeSimilarity = hasReciprocalOverlap && hasSizeSimilarity;
+            if (params.requiresOverlapAndProximity() && !hasReciprocalOverlapAndSizeSimilarity) {
                 return false;
-            } else if (!params.requiresOverlapAndProximity() && hasReciprocalOverlap) {
-                return true;
             }
+        } else {
+            hasReciprocalOverlapAndSizeSimilarity = null;
         }
 
         // Breakend proximity
@@ -173,30 +184,31 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC
         }
         final SimpleInterval intervalB1 = b.getPositionAInterval();
         final SimpleInterval intervalB2 = b.getPositionBInterval();
-        if (!(intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2))) {
-            return false;
+        final boolean hasBreakendProximity = intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2);
+        // Use short-circuiting statements since sample overlap is the least scalable / slowest check
+        if (hasReciprocalOverlapAndSizeSimilarity == null) {
+            return hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
+        } else {
+            if (params.requiresOverlapAndProximity()) {
+                return hasReciprocalOverlapAndSizeSimilarity && hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
+            } else {
+                return (hasReciprocalOverlapAndSizeSimilarity || hasBreakendProximity) && hasSampleOverlap(a, b, params.getSampleOverlap());
+            }
         }
-
-        // Sample overlap (possibly the least scalable check)
-        return hasSampleOverlap(a, b, params.getSampleOverlap());
     }
 
     /**
      * Gets event length used for overlap testing.
      */
-    private static int getLengthForOverlap(final SVCallRecord record) {
-        Utils.validate(record.isIntrachromosomal(), "Record even must be intra-chromosomal");
-        final Integer length = record.getLength();
-        if (length == null) {
-            if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
-                || record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX){
-                return INSERTION_ASSUMED_LENGTH_FOR_OVERLAP;
-            } else {
-                return 1;
-            }
+    private static int getLength(final SVCallRecord record, final int missingInsertionLength) {
+        Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal");
+        if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+            return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1);
+        } else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) {
+            return record.getPositionB() - record.getPositionA() + 1;
         } else {
             // TODO lengths less than 1 shouldn't be valid
-            return Math.max(record.getLength(), 1);
+            return Math.max(record.getLength() == null ? 1 : record.getLength(), 1);
         }
     }
 
@@ -227,7 +239,7 @@ private static int getMaxClusterableStartingPositionWithParams(final SVCallRecor
 
         // Reciprocal overlap window
         final int maxPositionByOverlap;
-        final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLengthForOverlap(call));
+        final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLength(call, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP));
         maxPositionByOverlap = Math.min(maxPosition, contigLength);
 
         if (params.requiresOverlapAndProximity()) {