Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HOLD: Bring in more duplication expansion calls #3668

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ private static Map<String, Object> parseComplicationsAndMakeThemAttributeMap(fin
}

if (breakpointComplications.getDupSeqStrandOnCtg() != null) {
attributeMap.put(GATKSVVCFConstants.DUP_INV_ORIENTATIONS,
attributeMap.put(GATKSVVCFConstants.DUP_ORIENTATIONS,
breakpointComplications.getDupSeqStrandOnCtg().stream().map(Strand::toString).collect(Collectors.joining()));
}
}
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,8 @@ public ChimericAlignment(final AlignmentInterval intervalWithLowerCoordOnContig,

this.strandSwitch = determineStrandSwitch(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig);

final boolean involvesRefIntervalSwitch = involvesRefPositionSwitch(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig);
this.isForwardStrandRepresentation = isForwardStrandRepresentation(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig,
this.strandSwitch, involvesRefIntervalSwitch);
this.strandSwitch, involvesRefPositionSwitch(intervalWithLowerCoordOnContig, intervalWithHigherCoordOnContig));

this.insertionMappings = insertionMappings;
}
Expand Down Expand Up @@ -161,10 +160,10 @@ public static List<ChimericAlignment> parseOneContig(final AlignedContig aligned
}
}

final boolean isNotSimpleTranslocation = isNotSimpleTranslocation(current, next,
determineStrandSwitch(current, next), involvesRefPositionSwitch(current, next));
if (isNotSimpleTranslocation)
results.add(new ChimericAlignment(current, next, insertionMappings, alignedContig.contigName));
final ChimericAlignment chimericAlignment = new ChimericAlignment(current, next, insertionMappings, alignedContig.contigName);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

method to be phased out, but it makes sense to let down stream users to check what type of novel adjacency this CA could point to, which requests the state checkers to be more comprehensive

if (chimericAlignment.isNotSimpleTranslocation() &&
!BreakpointComplications.isLikelyInvertedDuplication(current, next))
results.add(chimericAlignment);

current = next;
}
Expand All @@ -191,9 +190,9 @@ static boolean firstAlignmentIsTooShort(final AlignmentInterval first, final Ali
* To implement the idea that for two consecutive alignment regions of a contig, the one with higher reference coordinate might be a novel insertion.
*/
@VisibleForTesting
public static boolean nextAlignmentMayBeInsertion(final AlignmentInterval current, final AlignmentInterval next,
final Integer minAlignLength, final int mapQThresholdInclusive,
final boolean filterWhollyContained) {
static boolean nextAlignmentMayBeInsertion(final AlignmentInterval current, final AlignmentInterval next,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

check how this is different from the new method splitPairStrongEnoughEvidenceForCA()

final Integer minAlignLength, final int mapQThresholdInclusive,
final boolean filterWhollyContained) {
// not unique: inserted sequence may have low mapping quality (low reference uniqueness) or may be very small (low read uniqueness)
final boolean isNotUnique = mapQualTooLow(next, mapQThresholdInclusive) || firstAlignmentIsTooShort(next, current, minAlignLength);
return isNotUnique
Expand All @@ -202,7 +201,7 @@ public static boolean nextAlignmentMayBeInsertion(final AlignmentInterval curren
}

@VisibleForTesting
public static StrandSwitch determineStrandSwitch(final AlignmentInterval first, final AlignmentInterval second) {
static StrandSwitch determineStrandSwitch(final AlignmentInterval first, final AlignmentInterval second) {
if (first.forwardStrand == second.forwardStrand) {
return StrandSwitch.NO_SWITCH;
} else {
Expand All @@ -214,8 +213,8 @@ public static StrandSwitch determineStrandSwitch(final AlignmentInterval first,
* Determine if the region that maps to a lower coordinate on the contig also maps to a lower coordinate on the reference.
*/
@VisibleForTesting
public static boolean involvesRefPositionSwitch(final AlignmentInterval regionWithLowerCoordOnContig,
final AlignmentInterval regionWithHigherCoordOnContig) {
static boolean involvesRefPositionSwitch(final AlignmentInterval regionWithLowerCoordOnContig,
final AlignmentInterval regionWithHigherCoordOnContig) {

return regionWithHigherCoordOnContig.referenceSpan.getStart() < regionWithLowerCoordOnContig.referenceSpan.getStart();
}
Expand Down Expand Up @@ -267,10 +266,11 @@ public boolean isNotSimpleTranslocation() {
}

Tuple2<SimpleInterval, SimpleInterval> getCoordSortedReferenceSpans() {
if (involvesRefPositionSwitch(regionWithLowerCoordOnContig, regionWithHigherCoordOnContig))
return new Tuple2<>(regionWithHigherCoordOnContig.referenceSpan, regionWithLowerCoordOnContig.referenceSpan);
else
if (isForwardStrandRepresentation) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

used in two places: BC.iniForInsDel() and NARL.leftJustifyBreakpoints().
The first use is trivial as it is type specific, whereas for the 2nd use in NARL, we need to be careful since we are considering expanding the NARL to cover cases of ref order switch.

return new Tuple2<>(regionWithLowerCoordOnContig.referenceSpan, regionWithHigherCoordOnContig.referenceSpan);
} else {
return new Tuple2<>(regionWithHigherCoordOnContig.referenceSpan, regionWithLowerCoordOnContig.referenceSpan);
}
}


Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;

import com.esotericsoftware.kryo.DefaultSerializer;
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
Expand All @@ -26,15 +30,20 @@
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
import org.broadinstitute.hellbender.tools.spark.sv.utils.PairedStrandedIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import scala.Tuple2;

import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

Expand Down Expand Up @@ -63,7 +72,7 @@ public final class DiscoverVariantsFromContigAlignmentsSAMSpark extends GATKSpar
discoverStageArgs
= new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();

@Argument(doc = "sam file for aligned contigs", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
@Argument(doc = "output path for discovery (non-genotyped) VCF", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
private String vcfOutputFileName;

Expand Down Expand Up @@ -197,16 +206,15 @@ public static void discoverVariantsAndWriteVCF(final JavaRDD<AlignedContig> alig
JavaRDD<VariantContext> annotatedVariants =
alignedContigs.filter(alignedContig -> alignedContig.alignmentIntervals.size()>1) // filter out any contigs that has less than two alignment records
.mapToPair(alignedContig -> new Tuple2<>(alignedContig.contigSequence, // filter a contig's alignment and massage into ordered collection of chimeric alignments
ChimericAlignment.parseOneContig(alignedContig, DEFAULT_MIN_ALIGNMENT_LENGTH, true, true)))
ChimericAlignment.parseOneContig(alignedContig, DEFAULT_MIN_ALIGNMENT_LENGTH, true, false)))
.flatMapToPair(DiscoverVariantsFromContigAlignmentsSAMSpark::discoverNovelAdjacencyFromChimericAlignments) // a filter-passing contig's alignments may or may not produce novel adjacency
.groupByKey() // group the same novel adjacency produced by different contigs together
.groupByKey() // group the same novel adjacency and alt haplotype sequence produced by different contigs together
.mapToPair(noveltyAndEvidence -> inferType(noveltyAndEvidence._1, noveltyAndEvidence._2)) // type inference based on novel adjacency and evidence alignments
.map(noveltyTypeAndEvidence ->
annotateVariant( // annotate the novel adjacency and inferred type
noveltyTypeAndEvidence._1,
noveltyTypeAndEvidence._2._1,
null,
noveltyTypeAndEvidence._2._2,
noveltyTypeAndEvidence._1.novelAdjacencyReferenceLocations,
noveltyTypeAndEvidence._1.altHaplotypeSequence,
noveltyTypeAndEvidence._2._1, noveltyTypeAndEvidence._2._2,
broadcastReference));

List<VariantContext> collectedAnnotatedVariants = annotatedVariants.collect();
Expand Down Expand Up @@ -270,31 +278,42 @@ private static VariantContext createImpreciseDeletionVariant(final EvidenceTarge
/**
* Given contig alignments, emit novel adjacency not present on the reference to which the locally-assembled contigs were aligned.
*/
public static Iterator<Tuple2<NovelAdjacencyReferenceLocations, ChimericAlignment>>
discoverNovelAdjacencyFromChimericAlignments(final Tuple2<byte[], List<ChimericAlignment>> ticSeqAndChimeras) {
return Utils.stream(ticSeqAndChimeras._2)
.map(ca -> new Tuple2<>(new NovelAdjacencyReferenceLocations(ca, ticSeqAndChimeras._1), ca))
.collect(Collectors.toList()).iterator();
public static Iterator<Tuple2<NovelAdjacencyAndInferredAltHaptype, ChimericAlignment>>
discoverNovelAdjacencyFromChimericAlignments(final Tuple2<byte[], List<ChimericAlignment>> tigSeqAndChimeras) {
return Utils.stream(tigSeqAndChimeras._2)
.map(ca -> new Tuple2<>(new NovelAdjacencyReferenceLocations(ca, tigSeqAndChimeras._1), ca))
.map(narlAndEvidenceCA -> {
final ChimericAlignment ca = narlAndEvidenceCA._2;
final SimpleInterval referenceSpan1 = ca.regionWithLowerCoordOnContig.referenceSpan;
final SimpleInterval referenceSpan2 = ca.regionWithHigherCoordOnContig.referenceSpan;
final byte[] altHaplotypeSeq;
if (ca.strandSwitch == StrandSwitch.NO_SWITCH && (referenceSpan1.contains(referenceSpan2) || referenceSpan2.contains(referenceSpan1))) {
altHaplotypeSeq = BreakpointComplications.extractAltHaplotypeForTandupExpansionWithContainment(ca.regionWithLowerCoordOnContig,
ca.regionWithHigherCoordOnContig, narlAndEvidenceCA._1.complication, tigSeqAndChimeras._1);
} else {
altHaplotypeSeq = null;
}
return new Tuple2<>(new NovelAdjacencyAndInferredAltHaptype(narlAndEvidenceCA._1, altHaplotypeSeq), ca);
}).iterator();
}

// TODO: 7/6/17 interface to be changed in the new implementation, where a set of NRAL's associated with a single contig is considered together.
/**
* Given input novel adjacency and evidence chimeric alignments, infer type of variant.
*/
public static Tuple2<NovelAdjacencyReferenceLocations, Tuple2<SvType, Iterable<ChimericAlignment>>> inferType(
final NovelAdjacencyReferenceLocations novelAdjacency,
public static Tuple2<NovelAdjacencyAndInferredAltHaptype, Tuple2<SvType, Iterable<ChimericAlignment>>> inferType(
final NovelAdjacencyAndInferredAltHaptype novelAdjacency,
final Iterable<ChimericAlignment> chimericAlignments) {
return new Tuple2<>(novelAdjacency,
new Tuple2<>(SvTypeInference.inferFromNovelAdjacency(novelAdjacency),
new Tuple2<>(SvTypeInference.inferFromNovelAdjacency(novelAdjacency.novelAdjacencyReferenceLocations),
chimericAlignments));
}

/**
* Produces annotated variant as described in {@link GATKSVVCFHeaderLines}, given input arguments.
*/
public static VariantContext annotateVariant(final NovelAdjacencyReferenceLocations novelAdjacency,
final SvType inferredType,
final byte[] altHaplotypeSeq,
final byte[] altHaplotypeSeq, final SvType inferredType,
final Iterable<ChimericAlignment> chimericAlignments,
final Broadcast<ReferenceMultiSource> broadcastReference)
throws IOException {
Expand All @@ -305,4 +324,74 @@ public static VariantContext annotateVariant(final NovelAdjacencyReferenceLocati
broadcastReference);
}

@DefaultSerializer(NovelAdjacencyAndInferredAltHaptype.Serializer.class)
public static final class NovelAdjacencyAndInferredAltHaptype {

private static final NovelAdjacencyReferenceLocations.Serializer localSerializer = new NovelAdjacencyReferenceLocations.Serializer();
public final NovelAdjacencyReferenceLocations novelAdjacencyReferenceLocations;
public final byte[] altHaplotypeSequence;

NovelAdjacencyAndInferredAltHaptype(final NovelAdjacencyReferenceLocations novelAdjacencyReferenceLocations,
final byte[] altHaplotypeSequence) {
this.novelAdjacencyReferenceLocations = novelAdjacencyReferenceLocations;
this.altHaplotypeSequence = altHaplotypeSequence;
}

public NovelAdjacencyAndInferredAltHaptype(final Kryo kryo, final Input input) {
novelAdjacencyReferenceLocations = localSerializer.read(kryo, input, NovelAdjacencyReferenceLocations.class);
final boolean altSeqIsNull = input.readBoolean();
if (altSeqIsNull) {
altHaplotypeSequence = null;
} else {
final int arraySize = input.readInt();
altHaplotypeSequence = new byte[arraySize];
for (int i = 0 ; i < arraySize; ++i) {
altHaplotypeSequence[i] = input.readByte();
}
}
}

private void serialize(final Kryo kryo, final Output output) {
localSerializer.write(kryo, output, novelAdjacencyReferenceLocations);
if (altHaplotypeSequence==null) {
output.writeBoolean(true);
} else {
output.writeBoolean(false);
output.writeInt(altHaplotypeSequence.length);
for (final byte b : altHaplotypeSequence) {
output.writeByte(b);
}
}
}

public static final class Serializer extends com.esotericsoftware.kryo.Serializer<NovelAdjacencyAndInferredAltHaptype> {
@Override
public void write(final Kryo kryo, final Output output, final NovelAdjacencyAndInferredAltHaptype novelAdjacencyReferenceLocations ) {
novelAdjacencyReferenceLocations.serialize(kryo, output);
}

@Override
public NovelAdjacencyAndInferredAltHaptype read(final Kryo kryo, final Input input, final Class<NovelAdjacencyAndInferredAltHaptype> klass ) {
return new NovelAdjacencyAndInferredAltHaptype(kryo, input);
}
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;

NovelAdjacencyAndInferredAltHaptype that = (NovelAdjacencyAndInferredAltHaptype) o;

if (!novelAdjacencyReferenceLocations.equals(that.novelAdjacencyReferenceLocations)) return false;
return Arrays.equals(altHaplotypeSequence, that.altHaplotypeSequence);
}

@Override
public int hashCode() {
int result = novelAdjacencyReferenceLocations.hashCode();
result = 31 * result + Arrays.hashCode(altHaplotypeSequence);
return result;
}
}
}
Loading