(SV) consolidate logic in simple chimera inference, update how variants are represented in VCF emitted by new code path #4663

Merged 1 commit on May 10, 2018
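In the two files shown below, this PR replaces the flat SvDiscoveryInputData bundle with SvDiscoveryInputMetaData, which groups reference-level state under a referenceData member and sample-level state under a sampleSpecificData member, takes the cross-contigs-to-ignore file at construction time, no longer carries the reads RDD, and exposes updateOutputPath(...) so the experimental interpretation path no longer rebuilds the whole bundle. As a reading key for the diff, the field accesses move roughly as follows (assembled from the call sites changed in this diff, not an exhaustive description of the new class):

    svDiscoveryInputData.referenceSequenceDictionaryBroadcast  ->  svDiscoveryInputMetaData.referenceData.referenceSequenceDictionaryBroadcast
    svDiscoveryInputData.referenceBroadcast                     ->  svDiscoveryInputMetaData.referenceData.referenceBroadcast
    svDiscoveryInputData.metadata                               ->  svDiscoveryInputMetaData.sampleSpecificData.readMetadata
    svDiscoveryInputData.evidenceTargetLinks                    ->  svDiscoveryInputMetaData.sampleSpecificData.evidenceTargetLinks
    svDiscoveryInputData.headerBroadcast                        ->  svDiscoveryInputMetaData.sampleSpecificData.headerBroadcast
    outputPath, discoverStageArgs, and toolLogger remain top-level fields on the new class.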
StructuralVariationDiscoveryArgumentCollection.java
@@ -11,6 +11,8 @@
public class StructuralVariationDiscoveryArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

public static final int STRUCTURAL_VARIANT_SIZE_LOWER_BOUND = 50;

public static class FindBreakpointEvidenceSparkArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

@@ -202,7 +204,7 @@ public static class FindBreakpointEvidenceSparkArgumentCollection implements Ser
public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

public static final int GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY = 50; // alignment with gap of size >= 50 will be broken apart.
public static final int GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY = STRUCTURAL_VARIANT_SIZE_LOWER_BOUND; // alignment with gap of size >= 50 will be broken apart.
public static final int CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD = 60;
public static final int DEFAULT_MIN_ALIGNMENT_LENGTH = 50; // Minimum flanking alignment length filters used when going through contig alignments.
public static final int DEFAULT_ASSEMBLED_IMPRECISE_EVIDENCE_OVERLAP_UNCERTAINTY = 100;
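The comment on the changed constant above describes the intended behavior: an alignment containing a gap at least this large gets broken apart. Purely as an illustrative sketch (the class and method below are hypothetical, not the GATK implementation; only the threshold value comes from this diff), a gap check of that kind over an htsjdk CIGAR could look like:

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.TextCigarCodec;

// Hypothetical illustration of the gap-size rule described in the comment above;
// not the GATK code path that actually splits alignments.
public final class GapSplitSketch {
    private static final int STRUCTURAL_VARIANT_SIZE_LOWER_BOUND = 50; // mirrors the new shared constant

    // Returns true if any insertion (I) or deletion (D) element is large enough that
    // the surrounding alignment would be broken apart at that gap.
    static boolean hasSvSizedGap(final Cigar cigar) {
        for (final CigarElement element : cigar.getCigarElements()) {
            final CigarOperator op = element.getOperator();
            if ((op == CigarOperator.I || op == CigarOperator.D)
                    && element.getLength() >= STRUCTURAL_VARIANT_SIZE_LOWER_BOUND) {
                return true;
            }
        }
        return false;
    }

    public static void main(final String[] args) {
        System.out.println(hasSvSizedGap(TextCigarCodec.decode("100M50D100M"))); // true: 50-base deletion gap
        System.out.println(hasSvSizedGap(TextCigarCodec.decode("100M10I100M"))); // false: gap below threshold
    }
}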
StructuralVariationDiscoveryPipelineSpark.java
@@ -22,9 +22,11 @@
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.FindBreakpointEvidenceSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoverFromLocalAssemblyContigAlignmentsSpark;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputData;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.*;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.AssemblyContigAlignmentSignatureClassifier;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputMetaData;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContigGenerator;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.ContigAlignmentsModifier;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ImpreciseVariantDetector;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.AlignedAssemblyOrExcuse;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
@@ -43,7 +45,6 @@

import java.io.IOException;
import java.nio.file.Paths;
import java.util.EnumMap;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -170,30 +171,31 @@ protected void runTool( final JavaSparkContext ctx ) {
// todo: when we call imprecise variants don't return here
if(parsedAlignments.isEmpty()) return;

final SvDiscoveryInputData svDiscoveryInputData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);
final SvDiscoveryInputMetaData svDiscoveryInputMetaData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);

// TODO: 1/14/18 this is to be phased-out: old way of calling precise variants
// assembled breakpoints
@SuppressWarnings("deprecation")
List<VariantContext> assemblyBasedVariants =
org.broadinstitute.hellbender.tools.spark.sv.discovery.DiscoverVariantsFromContigAlignmentsSAMSpark
.discoverVariantsFromChimeras(svDiscoveryInputData, parsedAlignments);
.discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedAlignments);

final List<VariantContext> annotatedVariants = processEvidenceTargetLinks(assemblyBasedVariants, svDiscoveryInputData);
final List<VariantContext> annotatedVariants = processEvidenceTargetLinks(assemblyBasedVariants, svDiscoveryInputMetaData);

final String outputPath = svDiscoveryInputData.outputPath;
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue();
final Logger toolLogger = svDiscoveryInputData.toolLogger;
final String outputPath = svDiscoveryInputMetaData.outputPath;
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.referenceData.referenceSequenceDictionaryBroadcast.getValue();
final Logger toolLogger = svDiscoveryInputMetaData.toolLogger;
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);

// TODO: 1/14/18 this is the next version of precise variant calling
if ( expInterpret != null ) {
experimentalInterpretation(ctx, assembledEvidenceResults, svDiscoveryInputData, evidenceAndAssemblyArgs.crossContigsToIgnoreFile);
experimentalInterpretation(ctx, assembledEvidenceResults, svDiscoveryInputMetaData);
}
}

private SvDiscoveryInputData getSvDiscoveryInputData(final JavaSparkContext ctx,
final SAMFileHeader headerForReads,
final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults) {
private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext ctx,
final SAMFileHeader headerForReads,
final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults) {
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
try {
@@ -207,11 +209,11 @@ private SvDiscoveryInputData getSvDiscoveryInputData(final JavaSparkContext ctx,
final String outputPrefixWithSampleName = variantsOutDir + (variantsOutDir.endsWith("/") ? "" : "/")
+ SVUtils.getSampleId(headerForReads) + "_";

return new SvDiscoveryInputData(ctx, discoverStageArgs, outputPrefixWithSampleName,
return new SvDiscoveryInputMetaData(ctx, discoverStageArgs, evidenceAndAssemblyArgs.crossContigsToIgnoreFile,
outputPrefixWithSampleName,
assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
cnvCallsBroadcast,
getReads(), getHeaderForReads(), getReference(), localLogger);
cnvCallsBroadcast, getHeaderForReads(), getReference(), localLogger);
}
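For orientation, the VCF output locations are produced by plain string concatenation of the prefix built just above; with hypothetical values for the output directory and sample name (neither comes from this diff), the paths compose like this:

// Hypothetical values, purely to illustrate how outputPrefixWithSampleName composes:
final String variantsOutDir = "/work/sv_out"; // hypothetical output directory
final String sampleId = "sample1";            // hypothetical, stands in for SVUtils.getSampleId(headerForReads)
final String outputPrefixWithSampleName =
        variantsOutDir + (variantsOutDir.endsWith("/") ? "" : "/") + sampleId + "_";
// outputPrefixWithSampleName                  -> "/work/sv_out/sample1_"
// outputPath + "inv_del_ins.vcf"              -> "/work/sv_out/sample1_inv_del_ins.vcf"  (written above by SVVCFWriter.writeVCF)
// outputPath + "experimentalInterpretation_"  -> "/work/sv_out/sample1_experimentalInterpretation_"  (prefix handed to updateOutputPath in the new code path below)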

/**
@@ -227,24 +229,24 @@ private SvDiscoveryInputData getSvDiscoveryInputData(final JavaSparkContext ctx,
*
*/
private static List<VariantContext> processEvidenceTargetLinks(List<VariantContext> assemblyBasedVariants,
final SvDiscoveryInputData svDiscoveryInputData) {
final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {

final List<VariantContext> annotatedVariants;
if (svDiscoveryInputData.evidenceTargetLinks != null) {
final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceTargetLinks = svDiscoveryInputData.evidenceTargetLinks;
final ReadMetadata metadata = svDiscoveryInputData.metadata;
final ReferenceMultiSource reference = svDiscoveryInputData.referenceBroadcast.getValue();
final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputData.discoverStageArgs;
final Logger toolLogger = svDiscoveryInputData.toolLogger;
if (svDiscoveryInputMetaData.sampleSpecificData.evidenceTargetLinks != null) {
final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceTargetLinks = svDiscoveryInputMetaData.sampleSpecificData.evidenceTargetLinks;
final ReadMetadata readMetadata = svDiscoveryInputMetaData.sampleSpecificData.readMetadata;
final ReferenceMultiSource reference = svDiscoveryInputMetaData.referenceData.referenceBroadcast.getValue();
final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.discoverStageArgs;
final Logger toolLogger = svDiscoveryInputMetaData.toolLogger;

// annotate with evidence links
annotatedVariants = AnnotatedVariantProducer.
annotateBreakpointBasedCallsWithImpreciseEvidenceLinks(assemblyBasedVariants,
evidenceTargetLinks, metadata, reference, discoverStageArgs, toolLogger);
evidenceTargetLinks, readMetadata, reference, discoverStageArgs, toolLogger);

// then also imprecise deletion
final List<VariantContext> impreciseVariants = ImpreciseVariantDetector.
callImpreciseDeletionFromEvidenceLinks(evidenceTargetLinks, metadata, reference,
callImpreciseDeletionFromEvidenceLinks(evidenceTargetLinks, readMetadata, reference,
discoverStageArgs.impreciseVariantEvidenceThreshold,
discoverStageArgs.maxCallableImpreciseVariantDeletionSize,
toolLogger);
@@ -257,46 +259,40 @@ private static List<VariantContext> processEvidenceTargetLinks(List<VariantConte
return annotatedVariants;
}

// hook up prototyping breakpoint and type inference tool
private void experimentalInterpretation(final JavaSparkContext ctx,
final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults,
final SvDiscoveryInputData svDiscoveryInputData,
final String nonCanonicalChromosomeNamesFile) {
private static void experimentalInterpretation(final JavaSparkContext ctx,
final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults,
final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {

if ( ! expInterpret )
return;
final JavaRDD<GATKRead> assemblyRawAlignments = getContigRawAlignments(ctx, assembledEvidenceResults, svDiscoveryInputMetaData);

final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;
final Broadcast<SAMFileHeader> headerBroadcast = svDiscoveryInputData.headerBroadcast;
final String updatedOutputPath = svDiscoveryInputMetaData.outputPath + "experimentalInterpretation_";
svDiscoveryInputMetaData.updateOutputPath(updatedOutputPath);

SvDiscoverFromLocalAssemblyContigAlignmentsSpark.AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes
= SvDiscoverFromLocalAssemblyContigAlignmentsSpark.preprocess(svDiscoveryInputMetaData, assemblyRawAlignments);

SvDiscoverFromLocalAssemblyContigAlignmentsSpark.dispatchJobs(contigsByPossibleRawTypes, svDiscoveryInputMetaData,
assemblyRawAlignments, true);
}

private static JavaRDD<GATKRead> getContigRawAlignments(final JavaSparkContext ctx,
final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults,
final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {
final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast =
svDiscoveryInputMetaData.referenceData.referenceSequenceDictionaryBroadcast;
final Broadcast<SAMFileHeader> headerBroadcast = svDiscoveryInputMetaData.sampleSpecificData.headerBroadcast;
final SAMFileHeader headerForReads = headerBroadcast.getValue();
final SAMReadGroupRecord contigAlignmentsReadGroup = new SAMReadGroupRecord(SVUtils.GATKSV_CONTIG_ALIGNMENTS_READ_GROUP_ID);
final List<String> refNames = SequenceDictionaryUtils.getContigNamesList(referenceSequenceDictionaryBroadcast.getValue());

List<GATKRead> readsList =
return ctx.parallelize(
assembledEvidenceResults
.getAlignedAssemblyOrExcuseList().stream()
.filter(AlignedAssemblyOrExcuse::isNotFailure)
.flatMap(aa -> aa.toSAMStreamForAlignmentsOfThisAssembly(headerForReads, refNames, contigAlignmentsReadGroup))
.map(SAMRecordToGATKReadAdapter::new)
.collect(Collectors.toList());
JavaRDD<GATKRead> reads = ctx.parallelize(readsList);

final String sampleId = svDiscoveryInputData.sampleId;
final Broadcast<ReferenceMultiSource> referenceBroadcast = svDiscoveryInputData.referenceBroadcast;
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = svDiscoveryInputData.cnvCallsBroadcast;

final SvDiscoveryInputData updatedSvDiscoveryInputData =
new SvDiscoveryInputData(sampleId, svDiscoveryInputData.discoverStageArgs,
svDiscoveryInputData.outputPath + "experimentalInterpretation_",
svDiscoveryInputData.metadata, svDiscoveryInputData.assembledIntervals,
svDiscoveryInputData.evidenceTargetLinks, reads, svDiscoveryInputData.toolLogger,
referenceBroadcast, referenceSequenceDictionaryBroadcast, headerBroadcast, cnvCallsBroadcast);

EnumMap<AssemblyContigAlignmentSignatureClassifier.RawTypes, JavaRDD<AssemblyContigWithFineTunedAlignments>>
contigsByPossibleRawTypes =
SvDiscoverFromLocalAssemblyContigAlignmentsSpark.preprocess(updatedSvDiscoveryInputData, nonCanonicalChromosomeNamesFile,true);

SvDiscoverFromLocalAssemblyContigAlignmentsSpark.dispatchJobs(contigsByPossibleRawTypes, updatedSvDiscoveryInputData);
.collect(Collectors.toList())
);
}

/**