broadinstitute · SHuang-Broad · Sep 26, 2017 · Sep 26, 2017 · Sep 27, 2017 · Oct 5, 2017
diff --git a/...java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducer.java b/...java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducer.java
@@ -197,7 +197,7 @@ private static Map<String, Object> parseComplicationsAndMakeThemAttributeMap(fin
             }
 
             if (breakpointComplications.getDupSeqStrandOnCtg() != null) {
-                attributeMap.put(GATKSVVCFConstants.DUP_INV_ORIENTATIONS,
+                attributeMap.put(GATKSVVCFConstants.DUP_ORIENTATIONS,
                         breakpointComplications.getDupSeqStrandOnCtg().stream().map(Strand::toString).collect(Collectors.joining()));
             }
         }

diff --git a/.../java/org/broadinstitute/hellbender/tools/spark/sv/discovery/BreakpointComplications.java b/.../java/org/broadinstitute/hellbender/tools/spark/sv/discovery/BreakpointComplications.java
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/ChimericAlignment.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/ChimericAlignment.java
@@ -104,9 +104,8 @@ public ChimericAlignment(final AlignmentInterval intervalWithLowerCoordOnContig,
 
         this.strandSwitch = determineStrandSwitch(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig);
 
-        final boolean involvesRefIntervalSwitch = involvesRefPositionSwitch(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig);
         this.isForwardStrandRepresentation = isForwardStrandRepresentation(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig,
-                this.strandSwitch, involvesRefIntervalSwitch);
+                this.strandSwitch, involvesRefPositionSwitch(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig));
 
         this.insertionMappings = insertionMappings;
     }
@@ -161,10 +160,10 @@ public static List<ChimericAlignment> parseOneContig(final AlignedContig aligned
                 }
             }
 
-            final boolean isNotSimpleTranslocation = isNotSimpleTranslocation(current, next,
-                    determineStrandSwitch(current, next), involvesRefPositionSwitch(current, next));
-            if (isNotSimpleTranslocation)
-                results.add(new ChimericAlignment(current, next, insertionMappings, alignedContig.contigName));
+            final ChimericAlignment chimericAlignment = new ChimericAlignment(current, next, insertionMappings, alignedContig.contigName);
+            if (chimericAlignment.isNotSimpleTranslocation() &&
+                    !BreakpointComplications.isLikelyInvertedDuplication(current, next))
+                results.add(chimericAlignment);
 
             current = next;
         }
@@ -191,9 +190,9 @@ static boolean firstAlignmentIsTooShort(final AlignmentInterval first, final Ali
      * To implement the idea that for two consecutive alignment regions of a contig, the one with higher reference coordinate might be a novel insertion.
      */
     @VisibleForTesting
-    public static boolean nextAlignmentMayBeInsertion(final AlignmentInterval current, final AlignmentInterval next,
-                                                      final Integer minAlignLength, final int mapQThresholdInclusive,
-                                                      final boolean filterWhollyContained) {
+    static boolean nextAlignmentMayBeInsertion(final AlignmentInterval current, final AlignmentInterval next,
+                                               final Integer minAlignLength, final int mapQThresholdInclusive,
+                                               final boolean filterWhollyContained) {
         // not unique: inserted sequence may have low mapping quality (low reference uniqueness) or may be very small (low read uniqueness)
         final boolean isNotUnique = mapQualTooLow(next, mapQThresholdInclusive) || firstAlignmentIsTooShort(next, current, minAlignLength);
         return isNotUnique
@@ -202,7 +201,7 @@ public static boolean nextAlignmentMayBeInsertion(final AlignmentInterval curren
     }
 
     @VisibleForTesting
-    public static StrandSwitch determineStrandSwitch(final AlignmentInterval first, final AlignmentInterval second) {
+    static StrandSwitch determineStrandSwitch(final AlignmentInterval first, final AlignmentInterval second) {
         if (first.forwardStrand == second.forwardStrand) {
             return StrandSwitch.NO_SWITCH;
         } else {
@@ -214,8 +213,8 @@ public static StrandSwitch determineStrandSwitch(final AlignmentInterval first,
      * Determine if the region that maps to a lower coordinate on the contig also maps to a lower coordinate on the reference.
      */
     @VisibleForTesting
-    public static boolean involvesRefPositionSwitch(final AlignmentInterval regionWithLowerCoordOnContig,
-                                                    final AlignmentInterval regionWithHigherCoordOnContig) {
+    static boolean involvesRefPositionSwitch(final AlignmentInterval regionWithLowerCoordOnContig,
+                                             final AlignmentInterval regionWithHigherCoordOnContig) {
 
         return regionWithHigherCoordOnContig.referenceSpan.getStart() < regionWithLowerCoordOnContig.referenceSpan.getStart();
     }
@@ -267,10 +266,11 @@ public boolean isNotSimpleTranslocation() {
     }
 
     Tuple2<SimpleInterval, SimpleInterval> getCoordSortedReferenceSpans() {
-        if (involvesRefPositionSwitch(regionWithLowerCoordOnContig, regionWithHigherCoordOnContig))
-            return new Tuple2<>(regionWithHigherCoordOnContig.referenceSpan, regionWithLowerCoordOnContig.referenceSpan);
-        else
+        if (isForwardStrandRepresentation) {
             return new Tuple2<>(regionWithLowerCoordOnContig.referenceSpan, regionWithHigherCoordOnContig.referenceSpan);
+        } else {
+            return new Tuple2<>(regionWithHigherCoordOnContig.referenceSpan, regionWithLowerCoordOnContig.referenceSpan);
+        }
     }
 
 

diff --git a/...ute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java b/...ute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java
@@ -1,5 +1,9 @@
 package org.broadinstitute.hellbender.tools.spark.sv.discovery;
 
+import com.esotericsoftware.kryo.DefaultSerializer;
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
 import com.google.common.annotations.VisibleForTesting;
 import htsjdk.samtools.CigarOperator;
 import htsjdk.samtools.SAMFileHeader;
@@ -26,15 +30,20 @@
 import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection;
 import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
 import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
-import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.PairedStrandedIntervalTree;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import scala.Tuple2;
 
 import java.io.IOException;
 import java.io.Serializable;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -63,7 +72,7 @@ public final class DiscoverVariantsFromContigAlignmentsSAMSpark extends GATKSpar
             discoverStageArgs
             = new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();
 
-    @Argument(doc = "sam file for aligned contigs", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+    @Argument(doc = "output path for discovery (non-genotyped) VCF", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
             fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
     private String vcfOutputFileName;
 
@@ -197,16 +206,15 @@ public static void discoverVariantsAndWriteVCF(final JavaRDD<AlignedContig> alig
         JavaRDD<VariantContext> annotatedVariants =
                 alignedContigs.filter(alignedContig -> alignedContig.alignmentIntervals.size()>1)                                     // filter out any contigs that has less than two alignment records
                         .mapToPair(alignedContig -> new Tuple2<>(alignedContig.contigSequence,                                        // filter a contig's alignment and massage into ordered collection of chimeric alignments
-                                ChimericAlignment.parseOneContig(alignedContig, DEFAULT_MIN_ALIGNMENT_LENGTH, true, true)))
+                                ChimericAlignment.parseOneContig(alignedContig, DEFAULT_MIN_ALIGNMENT_LENGTH, true, false)))
                         .flatMapToPair(DiscoverVariantsFromContigAlignmentsSAMSpark::discoverNovelAdjacencyFromChimericAlignments)    // a filter-passing contig's alignments may or may not produce novel adjacency
-                        .groupByKey()                                                                                                 // group the same novel adjacency produced by different contigs together
+                        .groupByKey()                                                                                                 // group the same novel adjacency and alt haplotype sequence produced by different contigs together
                         .mapToPair(noveltyAndEvidence -> inferType(noveltyAndEvidence._1, noveltyAndEvidence._2))                     // type inference based on novel adjacency and evidence alignments
                         .map(noveltyTypeAndEvidence ->
                                 annotateVariant(                                                                                      // annotate the novel adjacency and inferred type
-                                        noveltyTypeAndEvidence._1,
-                                        noveltyTypeAndEvidence._2._1,
-                                        null,
-                                        noveltyTypeAndEvidence._2._2,
+                                        noveltyTypeAndEvidence._1.novelAdjacencyReferenceLocations,
+                                        noveltyTypeAndEvidence._1.altHaplotypeSequence,
+                                        noveltyTypeAndEvidence._2._1, noveltyTypeAndEvidence._2._2,
                                         broadcastReference));
 
         List<VariantContext> collectedAnnotatedVariants = annotatedVariants.collect();
@@ -270,31 +278,42 @@ private static VariantContext createImpreciseDeletionVariant(final EvidenceTarge
     /**
      * Given contig alignments, emit novel adjacency not present on the reference to which the locally-assembled contigs were aligned.
      */
-    public static Iterator<Tuple2<NovelAdjacencyReferenceLocations, ChimericAlignment>>
-    discoverNovelAdjacencyFromChimericAlignments(final Tuple2<byte[], List<ChimericAlignment>> ticSeqAndChimeras) {
-        return Utils.stream(ticSeqAndChimeras._2)
-                .map(ca -> new Tuple2<>(new NovelAdjacencyReferenceLocations(ca, ticSeqAndChimeras._1), ca))
-                .collect(Collectors.toList()).iterator();
+    public static Iterator<Tuple2<NovelAdjacencyAndInferredAltHaptype, ChimericAlignment>>
+    discoverNovelAdjacencyFromChimericAlignments(final Tuple2<byte[], List<ChimericAlignment>> tigSeqAndChimeras) {
+        return Utils.stream(tigSeqAndChimeras._2)
+                .map(ca -> new Tuple2<>(new NovelAdjacencyReferenceLocations(ca, tigSeqAndChimeras._1), ca))
+                .map(narlAndEvidenceCA -> {
+                    final ChimericAlignment ca = narlAndEvidenceCA._2;
+                    final SimpleInterval referenceSpan1 = ca.regionWithLowerCoordOnContig.referenceSpan;
+                    final SimpleInterval referenceSpan2 = ca.regionWithHigherCoordOnContig.referenceSpan;
+                    final byte[] altHaplotypeSeq;
+                    if (ca.strandSwitch == StrandSwitch.NO_SWITCH && (referenceSpan1.contains(referenceSpan2) || referenceSpan2.contains(referenceSpan1))) {
+                        altHaplotypeSeq = BreakpointComplications.extractAltHaplotypeForTandupExpansionWithContainment(ca.regionWithLowerCoordOnContig,
+                                ca.regionWithHigherCoordOnContig, narlAndEvidenceCA._1.complication, tigSeqAndChimeras._1);
+                    } else {
+                        altHaplotypeSeq = null;
+                    }
+                    return new Tuple2<>(new NovelAdjacencyAndInferredAltHaptype(narlAndEvidenceCA._1, altHaplotypeSeq), ca);
+                }).iterator();
     }
 
     // TODO: 7/6/17 interface to be changed in the new implementation, where a set of NRAL's associated with a single contig is considered together.
     /**
      * Given input novel adjacency and evidence chimeric alignments, infer type of variant.
      */
-    public static Tuple2<NovelAdjacencyReferenceLocations, Tuple2<SvType, Iterable<ChimericAlignment>>> inferType(
-            final NovelAdjacencyReferenceLocations novelAdjacency,
+    public static Tuple2<NovelAdjacencyAndInferredAltHaptype, Tuple2<SvType, Iterable<ChimericAlignment>>> inferType(
+            final NovelAdjacencyAndInferredAltHaptype novelAdjacency,
             final Iterable<ChimericAlignment> chimericAlignments) {
         return new Tuple2<>(novelAdjacency,
-                new Tuple2<>(SvTypeInference.inferFromNovelAdjacency(novelAdjacency),
+                new Tuple2<>(SvTypeInference.inferFromNovelAdjacency(novelAdjacency.novelAdjacencyReferenceLocations),
                         chimericAlignments));
     }
 
     /**
      * Produces annotated variant as described in {@link GATKSVVCFHeaderLines}, given input arguments.
      */
     public static VariantContext annotateVariant(final NovelAdjacencyReferenceLocations novelAdjacency,
-                                                 final SvType inferredType,
-                                                 final byte[] altHaplotypeSeq,
+                                                 final byte[] altHaplotypeSeq, final SvType inferredType,
                                                  final Iterable<ChimericAlignment> chimericAlignments,
                                                  final Broadcast<ReferenceMultiSource> broadcastReference)
             throws IOException {
@@ -305,4 +324,74 @@ public static VariantContext annotateVariant(final NovelAdjacencyReferenceLocati
                         broadcastReference);
     }
 
+    @DefaultSerializer(NovelAdjacencyAndInferredAltHaptype.Serializer.class)
+    public static final class NovelAdjacencyAndInferredAltHaptype {
+
+        private static final NovelAdjacencyReferenceLocations.Serializer localSerializer = new NovelAdjacencyReferenceLocations.Serializer();
+        public final NovelAdjacencyReferenceLocations novelAdjacencyReferenceLocations;
+        public final byte[] altHaplotypeSequence;
+
+        NovelAdjacencyAndInferredAltHaptype(final NovelAdjacencyReferenceLocations novelAdjacencyReferenceLocations,
+                                            final byte[] altHaplotypeSequence) {
+            this.novelAdjacencyReferenceLocations = novelAdjacencyReferenceLocations;
+            this.altHaplotypeSequence = altHaplotypeSequence;
+        }
+
+        public NovelAdjacencyAndInferredAltHaptype(final Kryo kryo, final Input input) {
+            novelAdjacencyReferenceLocations = localSerializer.read(kryo, input, NovelAdjacencyReferenceLocations.class);
+            final boolean altSeqIsNull = input.readBoolean();
+            if (altSeqIsNull) {
+                altHaplotypeSequence = null;
+            } else {
+                final int arraySize = input.readInt();
+                altHaplotypeSequence = new byte[arraySize];
+                for (int i = 0 ; i < arraySize; ++i) {
+                    altHaplotypeSequence[i] = input.readByte();
+                }
+            }
+        }
+
+        private void serialize(final Kryo kryo, final Output output) {
+            localSerializer.write(kryo, output, novelAdjacencyReferenceLocations);
+            if (altHaplotypeSequence==null) {
+                output.writeBoolean(true);
+            } else {
+                output.writeBoolean(false);
+                output.writeInt(altHaplotypeSequence.length);
+                for (final byte b : altHaplotypeSequence) {
+                    output.writeByte(b);
+                }
+            }
+        }
+
+        public static final class Serializer extends com.esotericsoftware.kryo.Serializer<NovelAdjacencyAndInferredAltHaptype> {
+            @Override
+            public void write(final Kryo kryo, final Output output, final NovelAdjacencyAndInferredAltHaptype novelAdjacencyReferenceLocations ) {
+                novelAdjacencyReferenceLocations.serialize(kryo, output);
+            }
+
+            @Override
+            public NovelAdjacencyAndInferredAltHaptype read(final Kryo kryo, final Input input, final Class<NovelAdjacencyAndInferredAltHaptype> klass ) {
+                return new NovelAdjacencyAndInferredAltHaptype(kryo, input);
+            }
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            NovelAdjacencyAndInferredAltHaptype that = (NovelAdjacencyAndInferredAltHaptype) o;
+
+            if (!novelAdjacencyReferenceLocations.equals(that.novelAdjacencyReferenceLocations)) return false;
+            return Arrays.equals(altHaplotypeSequence, that.altHaplotypeSequence);
+        }
+
+        @Override
+        public int hashCode() {
+            int result = novelAdjacencyReferenceLocations.hashCode();
+            result = 31 * result + Arrays.hashCode(altHaplotypeSequence);
+            return result;
+        }
+    }
 }