diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java index 206a82548c9..e5b444620f9 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java @@ -2,7 +2,6 @@ import com.google.common.collect.Lists; import htsjdk.samtools.TextCigarCodec; -import org.apache.commons.collections4.IteratorUtils; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.engine.spark.SparkContextFactory; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -17,159 +16,15 @@ import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.TestUtilsForAssemblyBasedSVDiscovery.*; -import static org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigAlignmentsConfigPicker.GoodAndBadMappings; +import static org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigAlignmentsConfigPicker.*; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; public class AssemblyContigAlignmentsConfigPickerUnitTest extends GATKBaseTest { - @DataProvider(name = "contigAlignmentsHeuristicFilter") - private Object[][] createTestData() { + // step 1: parse and primitive filter ============================================================================== - final List data = new ArrayList<>(20); - - AlignmentInterval intervalOne = new 
AlignmentInterval(new SimpleInterval("chr21", 1948156, 1948936), - 1, 787, TextCigarCodec.decode("257M4I182M2I342M361S"), true, 60, 8, 733, ContigAlignmentsModifier.AlnModType.NONE); - AlignmentInterval intervalTwo = new AlignmentInterval(new SimpleInterval("chr21", 1948935, 1949190), - 893, 1148, TextCigarCodec.decode("892H256M"), true, 60, 3, 241, ContigAlignmentsModifier.AlnModType.NONE); - AlignedContig contig = new AlignedContig("asm000063:tig00003", "CCACTGTGCCCGGCCAAGGGTCCCCGGTTCTGAAAGTGGAAGGGGTGCGGCTGCCTCAGGAGTCACCACGGCAACAAGAACCTGGACCTGAGCGCAGGTGGTCAGATTCTGGGGCCAGCAGCTTTTTGGTTTTTAGAGACGAGGTCTCACTCTGTTGCCCAGGCTGGAGTGCAGTGGTGCGATCACTGCACCCTGCAGCCTCGGCCTCCTGGTTTCAAGTGACCACAGATGCATGCAGCCATGCTTGGCATATATAAATATATATATATATATATATTTATGTGTATATTGGTAGAGACATGGTCTTGTTATATTGCCCAGGCTGATCGCAAACATCTGCTTAAGCGATCCTCCTGCGTTGGCTCTCCAAAGTATTGGGATTATAGGCATGAGCTACCATGGCCTGGCCTCCTTATTCTAGTCTTTTCTTTCCTTTCTTCTTGTTTTTTTTTTTTTTTTGGCAGGGTCTCACTCTGTCACCCAGGCTGCAGTGCAGTGGTGTGATCACAGCTCACTGCAGCCTCAACTTCCCAGGCTCAAGCGATCCTCCCGGCTCAGCATCCTGAGTAGCTGGGACTACAGATGCATGTCACCACGCCTGGCTAAATTTTCTTCTTTGTAGATATGGGGTCTCACCATGTAGTACTTTTCAATGTATTAAGCATCCTTATTTGATATTTGATGCCTGATAATACCCATGTCTGAACCATGCAAGATTGCTGCAATTCCTTCCTTCCTTCCCTCCCTCCTTCCCTTCCTTCCTTCCCTTTCCTTCCTTCCTCTTTCCCTCCCTTCTTTCCTTCCCTTTCCCTCCCTCCCTTCCTTCCTCTTTCCTTCCTTCCTTTCCCTCCCTTACTCCTTCCTTCCCTTCCCCTTCCTTCTTCCTTCTCTCCCTCCCTCCCTTCCCCTCCCTTACTCCCTTCCTTCCTCCTTCCCTCCCTCCTTTCCTTCATTCCCTTCCTTCCCCTTCCCCTTCCTTCCTTCTCTCCCTCCCTCCTTCCTTCCCTCCTTTCCTTCCTTCCTTCCTTTCCTTTCCCTCCTTCCTCCCTCCCTCCTTTCCTTCCTTCCTTTCCTTTCCTCCCTTCCCTCCCTCCCTCCCTCCCTTCCTTCCCCTCCCTCCCTCCTTTCCTTCTTTCGACAGAGTCTTG".getBytes(), - Arrays.asList(intervalOne, intervalTwo)); - data.add(new Object[]{contig, Arrays.asList(intervalOne), Arrays.asList(intervalOne, intervalTwo), 1, 2}); - - intervalOne = new AlignmentInterval(new SimpleInterval("chr2", 1422222, 1422435), - 1, 270, TextCigarCodec.decode("75M56I139M"), false, 60, 56, 142, ContigAlignmentsModifier.AlnModType.NONE); - intervalTwo = new 
AlignmentInterval(new SimpleInterval("chr2_KI270774v1_alt", 105288, 105555), - 1, 270, TextCigarCodec.decode("114M1I27M1I127M"), false, 56, 13, 179, ContigAlignmentsModifier.AlnModType.NONE); - contig = new AlignedContig("asm002608:tig00001", "ATGCTGGGGAATTTGTGTGCTCCTTGGGTGGGGACGAGCATGGAAGGCGCGTGGGACTGAAGCCTTGAAGACCCCGCAGGCGCCTCTCCTGGACAGACCTCGTGCAGGCGCCTCTCCTGGACCGACCTCGTGCAGGCGCCTCTCCTGGACAGACCTCGTGCAGGCGCCTCTCCTGGACCGACCTCGTGCAGGCGCCGCGCTGGACCGACCTCGTGCAGGCGCCGCGCTGGGCCATGGGGAGAGCGAGAGCCTGGTGTGCCCCTCAGGGAC".getBytes(), - Arrays.asList(intervalOne, intervalTwo)/*, true*/); - data.add(new Object[]{contig, Arrays.asList(intervalTwo), Arrays.asList(intervalOne), 3, 1}); - - intervalOne = new AlignmentInterval(new SimpleInterval("chr21", 30374719, 30375721), - 1, 1002, TextCigarCodec.decode("966M1D36M2362H"), true, 60, 6, 960, ContigAlignmentsModifier.AlnModType.NONE); - intervalTwo = new AlignmentInterval(new SimpleInterval("chr21", 30375922, 30378473), - 826, 3364, TextCigarCodec.decode("825S33M1D1047M7D553M5D906M"), true, 60, 24, 2423, ContigAlignmentsModifier.AlnModType.NONE); - AlignmentInterval intervalThree = new AlignmentInterval(new SimpleInterval("chr1_KI270760v1_alt", 22529, 23531), - 1, 1002, TextCigarCodec.decode("966M1D36M2362H"), true, 14, 3, 975, ContigAlignmentsModifier.AlnModType.NONE); - AlignmentInterval intervalFour = new AlignmentInterval(new SimpleInterval("chr1_KI270760v1_alt", 23681, 26220), - 826, 3364, TextCigarCodec.decode("825H33M1D2506M"), true, 60, 2, 2517, ContigAlignmentsModifier.AlnModType.NONE); - contig = new AlignedContig("asm027070:tig00000", 
"GAGCCCATCTCCTTGACTGTGGCTCTGATGCTGCCTCCACACTGGGATCTCTCTGCTCTCTTCACCTCATACCTCCTTCCCCCCACCTCACCCCATCGCCCCCGTTCTTGATCCTGCAATTGTAGAAACAGAAAGTTGGCTGATTTCTTGGGCCCGCAAATTGCCCAACAGGGAGACTGGGTGGGCGGCCCCCGCTTCCACTCCATCGCCCACCCTGATGCATCGTCTGACACTTTCAATTTATTTTTCAATTCCTCTACCATCAGAAATGACGATTAGATTTCCAGCATAAATACCGCCTTACCAAACTGAATTAATCACGGCAAGGAGGGGCACACACAGGCTCCAGCAGCCTGGGCAGAACATCCCCAGCATTAACCCTTCCGTCCTCACCCAGGCCCCCACCAGCAGGACGGAGGCTCCAGGCCTCACAGAAGACGCCACTCAAAATATCACTGGGGTCACCTAATCCCATCCCCCTTACCCTTTGCAGCCTCCCTCCTGTGGGAGTTCCTAGGAAGTGTCTTGCCCAAAGCCATCCACTCCATCAGGGCAGAGTCAGAGACACTGGCCCCTCATCTCCAGCCCCATCAGGGAAGGAGGCTCCATCCACATCCAGGACAAGATGTGGGAGTATCCGGGGTTTGGCGTTGTCCAGGACACATACGGGACGGGACTCCTGCAGACCCGAGGGTGGGGGCACCCAGTGATCACAGGGCCTGAACTGAAAGGGGTCTTGGAGAGACCTGGAGGCAGGTTCCAACCCTTGCCCCACAAACAAGACCATCACCCCTCTTTGCTGAGACTGTTCATTGCTCAGTCCAACAACCACAGCTCAGGTTGACCTCCAGCCTCCCCACTTCTCCACCTCCCTGACTCCAACCACAGCTCAGGGTGACCTCCAGCCTCCCCACTTCTCCACCTCCCTGACTCCAACCACAGCTCAGGGTGACATCCAGCCTCCCCACTTCTCCACCTCCCTGACTCCAGCCACAGCTCAGGCTCCTTCCTATGAGACCCCCATGGCCTCTCACAGCCTCTCCACTTCTATGCCTGTTCTCACCCAATCCCCATCCCTCAGCAGTCATCACCTCAAAATGCAAACACTGTCCTATGGTTTCCTGGCTCAGAACCCATCGGGCCCTCCTCTGCTCTCAAATCAGGCCCCCACCCTTCAAGGCCATGAGGACTGGGCTGGCCTGGCCCCTACCGGTCAGTGCACTCCCCCATCCTGGCTGGGTTGTCTCCTCTTTCTCCTTCAAGTTTTTCTATTTAAAATTCCCCTCCTCAGAGAACCTTCTCTGGCCACCATCCCCCAATCTAAATTAGGTTCTCCCTCCTAAGGTTCTTTCTCAAATCCATTTCCTTTCCTTCTGAGCACTTAAGCGAGCGATAATTACACACTAACTTGTGTAATTTGTTTAATAGGATCTTTGGGACAGAGACTTTATCTGACTCGCTTGATGCTGCAGCTGCTAGAACCCAGACCGTAATGTAGTGGGAGCTCAGTGCAGACTTTTGAAGGAGTAAGTGAGTAAAAGAACAACAAGCCCCTCTTGGTGCCCACCAAGTGCCAAGCTGAGACTGGGCCCTGGAGCTGGAGTCAAGATGTGGACCTGGCCTTGGTGTGCTGGGCCCTAACAGATGAGTAGGAGTTTGCCGAGCACTGAAGGTGGGGTTGACATGACCAACTTCTGAGAGGCACTCTTTGCCTCTGGATGGCCCCTTCCCAGTCACCCCAAAAGGAAGCCCTTGCCCTTTCAAAAGTGGTGAATGTGGTGGTTCAGATCGGTAGGTGTTCCTATGAATAGGTGAGGGGCCAGGCTTCAGGTCAGTTGAACCTGGGTTTGAATCCTGATTTTGCTCTTGGTACTAGGGCAGGTCACTGAGACGCTCTGAGCCTCTCTGCTCCAGGATGAGGATCCCTTCATCCATGCTCACTCAAAGTCCTGCCCACCAGGATGGAGGCAGACAGGCTGCAATGCCCTCCCCTCTCAGT
GGGGGAAAAATACCAGGTCAGGCAGCCAGCAGCCGAGAATGCCAGGCAGAGCAAAGGTGTCCTAAGGGATGGACAGAATAAGGGCTTGAGAGCCTAGCCAAGGGTGAGGCTAGGAGAGGCTTCCCGGAGGACGAGGCAAGTCAGAGCTCTTTGCCTCTTACTCCCATGACTGTGGGTGCCTTTCTCCTCCTCCTCTCATTCTCTCTCCTTTCCAGCTCCTGCTCTGCTCATTTCTTCACCTCAGTCTCTCTGCCCCGACAGGAGCCCTGAGGGACACAACCCCGTCCCGAGGAATGTATCTGCCCACTTCCAGCAGGTTCCTGGAGGCCCTCTAAATTCCCCTTCCCCCCAAAGTCATCTCCCAACACTGCTGCTCCCAGGGTGGGACGCCTGCTGCTGCACCTCCACACACGTGCACACACCCAGCCAGGTGCAGACAGCGTGGGCAGTGCAGAGGGGAGGGCTGGGGATTAAGGAGTTCGTGTTCTTGAGCAGCCTGGAAAGCAGCAGGGCTTCCACAGGAGCCGCCCCTGCCCTCACCCCTGCCCAGTAGGGTTAAGGGGCTGGCTTAGATGTCACCCCAAGCCAAGGCTGTCCTTCTCAGAGGCTCCTTCCCAGCTCCCCTGAGTGGGTCAGTCCCTTCCCCTCTCTGAGCCCCTCTTTCCTCTTCTGTAAAGCAGACTCAGTGATGTTGCTCAGAGGATTGAAGGACAAAGAAAAGCAACACAATGGACAGCAGGGATTTGCAAACAGCCGGGTGCTGTACCCAAGACAGGGTATTGCTGGTGATGTCTGATGGATGGGGAGTTGAAAGACTCAGCTGTCACTGGGCAGCTGGGTCTGGTTCCCCTGAGTCATTCGTAATTCACCAACCCAGTCTATAGAAGCTTATTAAGCACTTATTGTGTGCCATGCTCCATGCAAGGGCCAAAGACACCATGAGCAGAGCCAGACCCCACCCTCAGGTTCCCCCATGGGATGGGGTTAGCCAGATGACCTGAAGGCCTCTCCAGCCAGCTCAACCCCCTTAATCCAGAATTACTCCCTGTGCCAGGCTGACGGTGTGGCCAGAGAGGCCAGGGCCTGGGAGGGGGCCTGGCAGTGGGTGGTGGGAAGAGATGGAGTGGCTGTGTCAGGGGAAGGAGAGAGCAGGTTGTTCCTGTACAGGTTTCGCTCCTCGGATAGGGGGCTGCAATGACAGCTTCCAGGAAAGACCAGGCAAGTGCCTCACCCCATCCATTCTTGCTCACCCCTGCGGCCTCTTGGCCAATGGCTGCTGTGACCCTGTCCTCCTCTGGGAATCTGGTCTCGGGGAGGAGCCCTGGACCCTGACATTGACTAGAAACCTGACCCCATGTCTGAGCA".getBytes(), - Arrays.asList(intervalOne, intervalTwo, intervalThree, intervalFour)); - data.add(new Object[]{contig, Arrays.asList(intervalOne, intervalFour), Arrays.asList(intervalThree, intervalFour), 2, 2}); - - // this is a case where {intervalOne} is equally good with {intervalOne, intervalTwo}, but somehow the score for latter case is tiny bit better than the first - intervalOne = new AlignmentInterval(new SimpleInterval("chr20", 60230348, 60231029), - 1, 682, TextCigarCodec.decode("682M"), false, 57, 68, 342, ContigAlignmentsModifier.AlnModType.NONE); - intervalTwo = new AlignmentInterval(new 
SimpleInterval("chrUn_JTFH01001804v1_decoy", 3674, 4300), - 1, 627, TextCigarCodec.decode("627M55H"), true, 60, 1, 622, ContigAlignmentsModifier.AlnModType.NONE); - contig = new AlignedContig("asm005003:tig00056", "AAAACTGCTCTATCAGAAGAAAGGTTAAGCTCTGAGAGTTGAACGCACACATCACAAAGTAGTTTCTAAGAATCATTCTGTCTGGTTTTCCTATGAAGATATTGCCTTTTCTACCATAGGCCTCAAACGGCACTAAATATCCTCTTTGAAATCCTTCAAAAAGAGACTCTCAAAACTTCTCTATCGAAAGGAAGGTTCAACACCGTGAGTTGAAAGCACACATCAGAAAGAAGTTTCTGAGAAGTATTCTGTCTAGTTTTATAGGAAGAAATCACGTTTCAAAAGAAGGCCACAAAGAGGTCCAAATATCCACTTGCAGATTCTACAAAAAGAGTGTTTCAAAACTGCTCTATCAAGAGAAATGTTCATCTCCGTGAGGTGAATGCAAATATTTCAATGTAGTTTCTGACAGTGCTTCTGTCTAGTTTTTATGTGAAGATATTTCCTTTTCTACCGTAGGCCTCAAAACACTCTCAATATACACTTGCAAATTCCACAAAAAGAGTGATTCAAAACTGCTCTATCAAAAGAAATTTTAAACGCTGTAAGCTGAATGCACACATCACAAAGTAGTTTCTGAGAATGATTCTGTCTAGTTTTTCTATGAAGATATTTCCTTTTCTACCATAGGCCTTGAAGCGCTCTAAATATCCACTTGGAAATTCTACAAAAAGAGTATTTC".getBytes(), - Arrays.asList(intervalOne, intervalTwo)/*, true*/); - data.add(new Object[]{contig, Arrays.asList(intervalOne, intervalTwo), Arrays.asList(intervalOne), 2, 1}); - - return data.toArray(new Object[data.size()][]); - } - - @Test(dataProvider = "contigAlignmentsHeuristicFilter", groups = "sv") - public void testSuite(final AlignedContig contig, - final List configuration, - final List configurationEquallyGoodOrBetter, - final int expectedConfigurationCount, - final int expectedAICount) { - - final double scoreOne = AssemblyContigAlignmentsConfigPicker.computeScoreOfConfiguration(configuration, b38_canonicalChromosomes, 60); - final double equallyGoodOrBetterScore = AssemblyContigAlignmentsConfigPicker.computeScoreOfConfiguration(configurationEquallyGoodOrBetter, b38_canonicalChromosomes, 60); - assertTrue( scoreOne < equallyGoodOrBetterScore || scoreOne - equallyGoodOrBetterScore <= Math.ulp(equallyGoodOrBetterScore)); - - assertEquals(AssemblyContigAlignmentsConfigPicker.pickBestConfigurations(contig, b38_canonicalChromosomes, 0.0).size(), expectedConfigurationCount); - - 
if (expectedConfigurationCount == 1) { - - final List alignments = AssemblyContigAlignmentsConfigPicker - .gatherBestConfigurationsForOneContig( - SparkContextFactory.getTestSparkContext().parallelize(Collections.singletonList(contig)), - b38_canonicalChromosomes, 0.0).values().collect().get(0).get(0).getGoodMappings(); - assertEquals(alignments.size(), expectedAICount, - alignments.stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()).toString()); - } - } - - - @DataProvider(name = "gapSplitFineTuning") - private Object[][] createTestDataForGapSplit() { - final List data = new ArrayList<>(20); - - final AlignmentInterval alignmentOne = new AlignmentInterval(new SimpleInterval("chrUn_JTFH01000492v1_decoy", 501, 1597), - 1, 1097, TextCigarCodec.decode("1097M6H"), true, 60, 1, 1092, ContigAlignmentsModifier.AlnModType.NONE); - final AlignmentInterval alignmentTwo = new AlignmentInterval(new SimpleInterval("chr17", 26962248, 26962806), - 483, 1103, CigarUtils.invertCigar(TextCigarCodec.decode("121M1D142M1I165M62I130M482S")), false, 60, 97, 281, ContigAlignmentsModifier.AlnModType.NONE); - - final Iterable split = ContigAlignmentsModifier.splitGappedAlignment(alignmentTwo, GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, 1103); - data.add(new Object[]{ - new GoodAndBadMappings(Arrays.asList(alignmentOne, alignmentTwo), Collections.emptyList()), - new GoodAndBadMappings(Collections.singletonList(alignmentOne), Lists.newArrayList(split)) - }); - - return data.toArray(new Object[data.size()][]); - } - - @Test(dataProvider = "gapSplitFineTuning", groups = "sv") - public void testGapSplit(final GoodAndBadMappings inputConfiguration, - final GoodAndBadMappings expectedOutputConfiguration) { - - final GoodAndBadMappings configuration = AssemblyContigAlignmentsConfigPicker.splitGaps(inputConfiguration); - Assert.assertEquals(configuration, expectedOutputConfiguration); - } - - @DataProvider(name = "forFilterSecondaryConfigurationsByMappingQualityThreshold") - 
private Object[][] forFilterSecondaryConfigurationsByMappingQualityThreshold() { - - final List data = new ArrayList<>(20); - - AlignmentInterval intervalOne = new AlignmentInterval( - new SimpleInterval("chr21", 100001, 100100), - 1, 100, TextCigarCodec.decode("100M220S"), - true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE); - AlignmentInterval intervalTwo = new AlignmentInterval( - new SimpleInterval("chr21", 100099, 100122), - 99, 122, TextCigarCodec.decode("98S24M78S"), - true, 10, 3, 241, ContigAlignmentsModifier.AlnModType.NONE); - AlignmentInterval intervalThree = new AlignmentInterval( - new SimpleInterval("chr21", 100123, 100200), - 223, 300, TextCigarCodec.decode("222S78M"), - true, 60, 0, 78, ContigAlignmentsModifier.AlnModType.NONE); - final GoodAndBadMappings rep1 = - new GoodAndBadMappings(Arrays.asList(intervalOne, intervalThree), - Collections.singletonList(intervalThree)); - final GoodAndBadMappings rep2 = - new GoodAndBadMappings(Arrays.asList(intervalOne, intervalTwo, intervalThree), - Collections.emptyList()); - - data.add(new Object[]{Arrays.asList(rep1, rep2), 0, Arrays.asList(rep1, rep2)}); - - data.add(new Object[]{Arrays.asList(rep1, rep2), 10, Collections.singletonList(rep1)}); - - final AlignedContig alignedContig = 
fromPrimarySAMRecordString("asm031090:tig00000\t16\tchr5\t49659827\t60\t332S112M161S\t*\t0\t0\tCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCTATTCGGGTTAATTCCATTCCATTCCATTCGATTGCAATCGAGTTGATTCCATTCCCTAACATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCTTTCCATTCCATTACGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGTACTCGGGTTGATTCCATTCCATTGCATTCCAATCCATGCCCTTCCACTCGTGTTGATTCCATTCTTTCCATTCCATTCAAGTTGAATCCATTCCATTGCAATCCATTCCATTCGATTCCATTCGATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCATTCCTTTCCATTACATTCGGATTGATTCTATTCAATTCCCTTACACTCCATTACATTCCATTTCATTCCGGTAGTTTTCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTTGGGTAGTTTCCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCC\t*\tSA:Z:chr22_KI270736v1_random,101512,+,455S56M94S,0,1;chr10,41903518,+,372S74M159S,48,7;chr20,31162579,+,37S59M509S,0,5;chr20,31188805,+,298S43M264S,0,2;chr4,49639434,+,331S37M237S,60,1;chrUn_KI270519v1,137524,+,101S37M467S,3,1;chrUn_KN707896v1_decoy,6014,-,81M15I253M5D189M67S,0,34;chrUn_KN707896v1_decoy,6436,-,517S88M,60,3;\tMD:Z:58A7C7G18T12C5\tRG:Z:GATKSVContigAlignments\tNM:i:5\tAS:i:87\tXS:i:55", - true); - final List goodAndBadMappings = AssemblyContigAlignmentsConfigPicker.pickBestConfigurations(alignedContig, - new HashSet<>(Arrays.asList("chr4", "chr5", "chr10", "chr20", "")), 0.0); - final List goodAfterTieBreak = 
fromPrimarySAMRecordString("asm031090:tig00000\t16\tchr5\t49659827\t60\t332S112M161S\t*\t0\t0\tCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCTATTCGGGTTAATTCCATTCCATTCCATTCGATTGCAATCGAGTTGATTCCATTCCCTAACATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCTTTCCATTCCATTACGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGTACTCGGGTTGATTCCATTCCATTGCATTCCAATCCATGCCCTTCCACTCGTGTTGATTCCATTCTTTCCATTCCATTCAAGTTGAATCCATTCCATTGCAATCCATTCCATTCGATTCCATTCGATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCATTCCTTTCCATTACATTCGGATTGATTCTATTCAATTCCCTTACACTCCATTACATTCCATTTCATTCCGGTAGTTTTCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTTGGGTAGTTTCCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCC\t*\tSA:Z:chr10,41903518,+,372S74M159S,48,7;chr4,49639434,+,331S37M237S,60,1;chrUn_KI270519v1,137524,+,101S37M467S,3,1;chrUn_KN707896v1_decoy,6436,-,517S88M,60,3;\tMD:Z:58A7C7G18T12C5\tRG:Z:GATKSVContigAlignments\tNM:i:5\tAS:i:87\tXS:i:55", - true).getAlignments(); - final ArrayList copy = new ArrayList<>(alignedContig.getAlignments()); - copy.removeAll(goodAfterTieBreak); - data.add(new Object[]{goodAndBadMappings, 0, Collections.singletonList(new GoodAndBadMappings(goodAfterTieBreak, copy))}); - - return data.toArray(new Object[data.size()][]); - } - - @Test(groups = "sv", dataProvider = "forFilterSecondaryConfigurationsByMappingQualityThreshold") - public void testFilterSecondaryConfigurationsByMappingQualityThreshold(final List representations, - final int threshold, - final List expectedResult) { - - Assert.assertEquals(AssemblyContigAlignmentsConfigPicker.filterSecondaryConfigurationsByMappingQualityThreshold(representations, threshold), - expectedResult); - } - - @DataProvider(name = "forTestingNotDiscardForBadMQ") + @DataProvider private Object[][] forTestingNotDiscardForBadMQ() { final List data = new ArrayList<>(20); @@ -211,14 +66,31 @@ private Object[][] forTestingNotDiscardForBadMQ() { return data.toArray(new Object[data.size()][]); } - @Test(dataProvider = 
"forTestingNotDiscardForBadMQ", groups = "sv") public void testNotDiscardForBadMQ(final AlignedContig contig, final boolean shouldKeep) { - Assert.assertEquals(AssemblyContigAlignmentsConfigPicker.notDiscardForBadMQ(contig), shouldKeep); + Assert.assertEquals(notDiscardForBadMQ(contig), shouldKeep); + } + + // step 2: score possible configurations and pick the best ones ==================================================== + + @Test(groups = "sv") + public void testHeuristicSpeedUpWhenFacingManyMappings() { + final AlignedContig alignedContig = fromPrimarySAMRecordString("asm010147:tig00010\t16\tchr6\t31427489\t43\t4S54M2I26M7D22M4I33M4I70M2I109M2I103M1525S\t*\t0\t0\tt*\tSA:Z:chr6,31428609,-,1753H65M9D61M4I77M,17,22,113;chr6,31428242,-,883H188M4D95M794H,43,36,103;chr3,152694227,+,1245H26M2I118M569H,17,9,91;chr7,15531019,-,1172H147M7I62M2D28M544H,19,31,86;chr6,31428246,-,679H125M1156H,31,9,80;chr4,12386256,-,780H100M1080H,0,7,65;chrX,50228609,+,739H24M2D22M4I49M1122H,16,7,52;chr3,151030799,+,150H24M2D35M2D57M1694H,43,10,50;chr3,151030872,+,399H47M1514H,10,0,47;chr6,31428687,-,1467H40M453H,43,1,35;chr6_GL000253v2_alt,2736084,-,215M4I26M4I142M1569H,7,38,193;chr6_GL000256v2_alt,2730310,-,1017H129M13D57M15D57M2D38M9I134M2I58M459H,30,76,177;chrUn_JTFH01001202v1_decoy,266,-,306H38M6D23M6D89M4I24M1D139M2D64M1273H,34,45,148;chrUn_JTFH01001628v1_decoy,267,+,918H273M4I24M2I33M4D28M2D55M623H,60,52,137;\tMD:Z:57A17C4^TCTATTC3A8A3C21C5T18C0T34T0C1A92A0T1C22T23T10T23T12T1T10T12T1T1T13\tRG:Z:GATKSVContigAlignments\tNM:i:46\tAS:i:175\tXS:i:138", + true); + + final GoodAndBadMappings goodAndBadMappings = + heuristicSpeedUpWhenFacingManyMappings(alignedContig, hg38CanonicalChromosomes, 175); + + final List alignments = alignedContig.getAlignments(); + final List expectedBad = Arrays.asList(new AlignmentInterval("chr3,151030872,+,399H47M1514H,10,0,47"), + new AlignmentInterval("chr4,12386256,-,780H100M1080H,0,7,65")); + alignments.removeAll(expectedBad); + 
Assert.assertEquals(goodAndBadMappings.getGoodMappings(), alignments); + Assert.assertEquals(goodAndBadMappings.getBadMappings(), expectedBad); } - @DataProvider(name = "forTestSpecialChanelForSingleNonCanonicalMappings") - private Object[][] forTestSpecialChanelForSingleNonCanonicalMappings() { + @DataProvider + private Object[][] forGetBetterNonCanonicalMapping() { final List data = new ArrayList<>(20); // note chromosome names is hacked to use test seq dict @@ -245,99 +117,136 @@ private Object[][] forTestSpecialChanelForSingleNonCanonicalMappings() { data.add(new Object[]{Arrays.asList(normalOne, normalTwo), 416, null}); return data.toArray(new Object[data.size()][]); } - - @Test(groups = "sv", dataProvider = "forTestSpecialChanelForSingleNonCanonicalMappings") - public void testSpecialChanelForSingleNonCanonicalMappings(final List configuration, - final int maxCanonicalAS, - final AlignmentInterval expectedOutput) { - final AlignmentInterval result = AssemblyContigAlignmentsConfigPicker.getBetterNonCanonicalMapping( - b38_canonicalChromosomes, configuration, maxCanonicalAS); + @Test(groups = "sv", dataProvider = "forGetBetterNonCanonicalMapping") + public void testGetBetterNonCanonicalMapping(final List configuration, final int maxCanonicalAS, + final AlignmentInterval expectedOutput) { + final AlignmentInterval result = AssemblyContigAlignmentsConfigPicker + .getBetterNonCanonicalMapping(b38_canonicalChromosomes, configuration, maxCanonicalAS); Assert.assertEquals(result, expectedOutput); } - @Test(groups = "sv") - public void testHeuristicSpeedUpWhenFacingManyMappings() { - final AlignedContig alignedContig = 
fromPrimarySAMRecordString("asm010147:tig00010\t16\tchr6\t31427489\t43\t4S54M2I26M7D22M4I33M4I70M2I109M2I103M1525S\t*\t0\t0\tt*\tSA:Z:chr6,31428609,-,1753H65M9D61M4I77M,17,22,113;chr6,31428242,-,883H188M4D95M794H,43,36,103;chr3,152694227,+,1245H26M2I118M569H,17,9,91;chr7,15531019,-,1172H147M7I62M2D28M544H,19,31,86;chr6,31428246,-,679H125M1156H,31,9,80;chr4,12386256,-,780H100M1080H,0,7,65;chrX,50228609,+,739H24M2D22M4I49M1122H,16,7,52;chr3,151030799,+,150H24M2D35M2D57M1694H,43,10,50;chr3,151030872,+,399H47M1514H,10,0,47;chr6,31428687,-,1467H40M453H,43,1,35;chr6_GL000253v2_alt,2736084,-,215M4I26M4I142M1569H,7,38,193;chr6_GL000256v2_alt,2730310,-,1017H129M13D57M15D57M2D38M9I134M2I58M459H,30,76,177;chrUn_JTFH01001202v1_decoy,266,-,306H38M6D23M6D89M4I24M1D139M2D64M1273H,34,45,148;chrUn_JTFH01001628v1_decoy,267,+,918H273M4I24M2I33M4D28M2D55M623H,60,52,137;\tMD:Z:57A17C4^TCTATTC3A8A3C21C5T18C0T34T0C1A92A0T1C22T23T10T23T12T1T10T12T1T1T13\tRG:Z:GATKSVContigAlignments\tNM:i:46\tAS:i:175\tXS:i:138", + @DataProvider + private Object[][] forFilterSecondaryConfigurationsByMappingQualityThreshold() { + + final List data = new ArrayList<>(20); + + AlignmentInterval intervalOne = new AlignmentInterval( + new SimpleInterval("chr21", 100001, 100100), + 1, 100, TextCigarCodec.decode("100M220S"), + true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE); + AlignmentInterval intervalTwo = new AlignmentInterval( + new SimpleInterval("chr21", 100099, 100122), + 99, 122, TextCigarCodec.decode("98S24M78S"), + true, 10, 3, 241, ContigAlignmentsModifier.AlnModType.NONE); + AlignmentInterval intervalThree = new AlignmentInterval( + new SimpleInterval("chr21", 100123, 100200), + 223, 300, TextCigarCodec.decode("222S78M"), + true, 60, 0, 78, ContigAlignmentsModifier.AlnModType.NONE); + final GoodAndBadMappings rep1 = + new GoodAndBadMappings(Arrays.asList(intervalOne, intervalThree), + Collections.singletonList(intervalThree)); + final GoodAndBadMappings rep2 = + new 
GoodAndBadMappings(Arrays.asList(intervalOne, intervalTwo, intervalThree), + Collections.emptyList()); + + data.add(new Object[]{Arrays.asList(rep1, rep2), 0, Arrays.asList(rep1, rep2)}); + + data.add(new Object[]{Arrays.asList(rep1, rep2), 10, Collections.singletonList(rep1)}); + + final AlignedContig alignedContig = fromPrimarySAMRecordString("asm031090:tig00000\t16\tchr5\t49659827\t60\t332S112M161S\t*\t0\t0\tCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCTATTCGGGTTAATTCCATTCCATTCCATTCGATTGCAATCGAGTTGATTCCATTCCCTAACATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCTTTCCATTCCATTACGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGTACTCGGGTTGATTCCATTCCATTGCATTCCAATCCATGCCCTTCCACTCGTGTTGATTCCATTCTTTCCATTCCATTCAAGTTGAATCCATTCCATTGCAATCCATTCCATTCGATTCCATTCGATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCATTCCTTTCCATTACATTCGGATTGATTCTATTCAATTCCCTTACACTCCATTACATTCCATTTCATTCCGGTAGTTTTCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTTGGGTAGTTTCCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCC\t*\tSA:Z:chr22_KI270736v1_random,101512,+,455S56M94S,0,1;chr10,41903518,+,372S74M159S,48,7;chr20,31162579,+,37S59M509S,0,5;chr20,31188805,+,298S43M264S,0,2;chr4,49639434,+,331S37M237S,60,1;chrUn_KI270519v1,137524,+,101S37M467S,3,1;chrUn_KN707896v1_decoy,6014,-,81M15I253M5D189M67S,0,34;chrUn_KN707896v1_decoy,6436,-,517S88M,60,3;\tMD:Z:58A7C7G18T12C5\tRG:Z:GATKSVContigAlignments\tNM:i:5\tAS:i:87\tXS:i:55", true); + final List goodAndBadMappings = AssemblyContigAlignmentsConfigPicker.pickBestConfigurations(alignedContig, + new HashSet<>(Arrays.asList("chr4", "chr5", "chr10", "chr20", "")), 0.0); + final List goodAfterTieBreak = 
fromPrimarySAMRecordString("asm031090:tig00000\t16\tchr5\t49659827\t60\t332S112M161S\t*\t0\t0\tCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCTATTCGGGTTAATTCCATTCCATTCCATTCGATTGCAATCGAGTTGATTCCATTCCCTAACATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCTTTCCATTCCATTACGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGTACTCGGGTTGATTCCATTCCATTGCATTCCAATCCATGCCCTTCCACTCGTGTTGATTCCATTCTTTCCATTCCATTCAAGTTGAATCCATTCCATTGCAATCCATTCCATTCGATTCCATTCGATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCATTCCTTTCCATTACATTCGGATTGATTCTATTCAATTCCCTTACACTCCATTACATTCCATTTCATTCCGGTAGTTTTCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCCATTGCATTCCATTCCATTTGGGTAGTTTCCACTCCATTCCATTCCATTTCTCTCCATTCCATTGCACTCGGGTTGATTCCATTCC\t*\tSA:Z:chr10,41903518,+,372S74M159S,48,7;chr4,49639434,+,331S37M237S,60,1;chrUn_KI270519v1,137524,+,101S37M467S,3,1;chrUn_KN707896v1_decoy,6436,-,517S88M,60,3;\tMD:Z:58A7C7G18T12C5\tRG:Z:GATKSVContigAlignments\tNM:i:5\tAS:i:87\tXS:i:55", + true).getAlignments(); + final ArrayList copy = new ArrayList<>(alignedContig.getAlignments()); + copy.removeAll(goodAfterTieBreak); + data.add(new Object[]{goodAndBadMappings, 0, Collections.singletonList(new GoodAndBadMappings(goodAfterTieBreak, copy))}); - final GoodAndBadMappings goodAndBadMappings = - AssemblyContigAlignmentsConfigPicker - .speedUpWhenTooManyMappings(alignedContig, hg38CanonicalChromosomes, 175); + return data.toArray(new Object[data.size()][]); + } + @Test(groups = "sv", dataProvider = "forFilterSecondaryConfigurationsByMappingQualityThreshold") + public void testFilterSecondaryConfigurationsByMappingQualityThreshold(final List representations, + final int threshold, + final List expectedResult) { - final List alignments = alignedContig.getAlignments(); - final List expectedBad = Arrays.asList(new AlignmentInterval("chr3,151030872,+,399H47M1514H,10,0,47"), - new AlignmentInterval("chr4,12386256,-,780H100M1080H,0,7,65")); - alignments.removeAll(expectedBad); - Assert.assertEquals(goodAndBadMappings.getGoodMappings(), 
alignments); - Assert.assertEquals(goodAndBadMappings.getBadMappings(), expectedBad); + Assert.assertEquals(filterSecondaryConfigurationsByMappingQualityThreshold(representations, threshold), + expectedResult); } - @DataProvider(name = "forConfigurationSorting") - private Object[][] forConfigurationSorting() { + @DataProvider + private Object[][] forMiscFunctions() { + final List data = new ArrayList<>(20); - // case for two equally-good configurations, one has fewer alignments - String sam = "asm001160:tig00000\t16\tchr1\t93876139\t60\t516S1317M\t*\t0\t0\tCATGTTGCCCAAGCCAGTCTTGAACTCCAGGGCTCAAAATGCTGAAATTACAGGCACGAGTCACTTACTGCTCTTAACAATCACGTACAAAAATCTTAACATATGATTTTTTTTTTTTTTTTTTGAGACAACATCTCCCTCCATTGCCCAGGCTGGAGTGCAGCGGCACAATCATGGCTCACCGCAGCCTCAATGTCCAGGGCTCAAGCAATCCTCCCACCTCAGCTTCCCAAGTAGCTGGGACCACAGGCGCACAGGGCACGGCTAATTTAAAAAAAATTTTTTGTGTAGAGATAGGGTCTCCTTATATTGCCCAGGCTGATCTCAAACACCTACTTGGGCTCAAGTGATCCTCCTGCCTCAGCCTCACAAAGTGCTGGGATTACAGGCATGAGTCACTGCATCCAACAGATTGATTTCTAATATGTCACCAAAAGGAGCACCTTTAGCTATGATTGGTGGGAAAAATATGACTAAAATAGGTATCCAAAAAGACAAGGGAAATGCTGGATAGAAGAGCCATTCCATGAAGAACCCAAGGCAGTGATTTTCTCATTCCCCAGGCTAACATTTCATATTTTTATGGTAAATTAACCACTTGAAATACATGTATCAAAAACTTATAAAAATAAAGGAAAAACTTACAGTTTAGCCTTTGTGCTATTTAGGAAGTCTTCTTCATCACTAAACTCATCTTCATTTTCGTCATGGTCTGATGAATCTTCTTCACTTTTTTCATCCTCTTCCTCTTCTTTTTCTTCCTCAATGGCAACCTCACTTGCCTTGTCTTCCTCTTCCAAGTAAAAATTTTTATCAGCACTCATTCCAGGAGTTGTGTCAATTACAAACAATGCATTGTCACATGACAGACTTCCTGTGTCTCCACTTAATGACTCCCTTTGGCCACTATTTTCAACAAAACATAAAGTATCCTCTTCATTCTCACTGTTTTCAGACTGTTGGCTTTCATCACTGCTGAGAACTAGTAAGACAGAATTATCTTTACCCTGAGATGTGTTGGGCGCAGACGTGTATAGTTTGGTATCACATTCAAAATCTACATTCCCTTCACTGTTCATGTCTTCACTGACACTTATAACTGTGGACTCTTCTTCATCATCACTACCACCACAATCACCAAACTTTGTCAAGTCACTTGCTTTTATGGGGCTCTTTTTGTTGTTATTCCATCTGCCTACTTCCACAGTTGCAAATGTTTGAGTTAATGATTTCATTACAGCCTCAGAGTTCAGATTAGAGTGCACTGATACAGCATTTTTATTTTGGGGGGTTGAATGTCTCTGAGAAACTAACTGTTGAAGGCTAGTGTCCTGAAGTTCAGAAAGATTCTTCAGCTGAGAACTTTTCTCATTAATTTCTTTCCCCTCATCTGTTATTCCATTGGCATCTTCATCCAAATCCTTACAATTCTGTTTT
GTTTCTTTAAGAGATTCAACATTGGCCTGTTCGTGCACTGTTAATATATTTTCTGAACTTCTGTGGGAGAAATCATCATCAAAGTCATTATTATAGAAATTTGGCTTATTTATCTCAGAAAGAGATCTTGCTTGTAAATGGGAAGTTTGTCTGGTATCTGAATCCTCTGAATTCACAGGTGTACCCACGATCTGTTTCTCATTTCCTGGTACAATCTTACTATCTTTCTTTTCAGTTTGTGCCTTTAATTTCCTCTGCATACTCCTGGTTCTTCTAGTTGCAATTCCAGAGAATGAAATGTCTGAGCTTGATGTCTCAGCATCAGATATAGCTTCTGTATGAGATTCTTGGCTTGGATCTGTCAGAGATTTAGCCTTACTTCTTCTGGCTCCTGTA\t*\tSA:Z:chr1,61662787,-,165H74M1594H,12,2,64;chr15,43085606,+,1427H65M341H,0,2,55;chr10,96642377,+,1660H67M106H,60,5,42;chrUn_JTFH01001621v1_decoy,641,-,106M2I408M1317H,60,5,481;\tMD:Z:1317\tRG:Z:GATKSVContigAlignments\tNM:i:0\tAS:i:1317\tXS:i:0"; - AlignedContig alignedContig = fromPrimarySAMRecordString(sam, true); - data.add(new Object[]{alignedContig, - Arrays.asList( - new AssemblyContigWithFineTunedAlignments( - new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), - Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1))), - alignedContig.getAlignments().subList(2,alignedContig.getAlignments().size()).stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()), - true, - (AlignmentInterval)null - ), - new AssemblyContigWithFineTunedAlignments( - new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), - Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1), alignedContig.getAlignments().get(4))), - alignedContig.getAlignments().subList(2,4).stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()), - true, - (AlignmentInterval)null) - ) - }); + AlignmentInterval intervalOne = new AlignmentInterval(new SimpleInterval("chr21", 1948156, 1948936), + 1, 787, TextCigarCodec.decode("257M4I182M2I342M361S"), true, 60, 8, 733, ContigAlignmentsModifier.AlnModType.NONE); + AlignmentInterval intervalTwo = new AlignmentInterval(new SimpleInterval("chr21", 1948935, 1949190), + 893, 1148, TextCigarCodec.decode("892H256M"), 
true, 60, 3, 241, ContigAlignmentsModifier.AlnModType.NONE); + AlignedContig contig = new AlignedContig("asm000063:tig00003", "CCACTGTGCCCGGCCAAGGGTCCCCGGTTCTGAAAGTGGAAGGGGTGCGGCTGCCTCAGGAGTCACCACGGCAACAAGAACCTGGACCTGAGCGCAGGTGGTCAGATTCTGGGGCCAGCAGCTTTTTGGTTTTTAGAGACGAGGTCTCACTCTGTTGCCCAGGCTGGAGTGCAGTGGTGCGATCACTGCACCCTGCAGCCTCGGCCTCCTGGTTTCAAGTGACCACAGATGCATGCAGCCATGCTTGGCATATATAAATATATATATATATATATATTTATGTGTATATTGGTAGAGACATGGTCTTGTTATATTGCCCAGGCTGATCGCAAACATCTGCTTAAGCGATCCTCCTGCGTTGGCTCTCCAAAGTATTGGGATTATAGGCATGAGCTACCATGGCCTGGCCTCCTTATTCTAGTCTTTTCTTTCCTTTCTTCTTGTTTTTTTTTTTTTTTTGGCAGGGTCTCACTCTGTCACCCAGGCTGCAGTGCAGTGGTGTGATCACAGCTCACTGCAGCCTCAACTTCCCAGGCTCAAGCGATCCTCCCGGCTCAGCATCCTGAGTAGCTGGGACTACAGATGCATGTCACCACGCCTGGCTAAATTTTCTTCTTTGTAGATATGGGGTCTCACCATGTAGTACTTTTCAATGTATTAAGCATCCTTATTTGATATTTGATGCCTGATAATACCCATGTCTGAACCATGCAAGATTGCTGCAATTCCTTCCTTCCTTCCCTCCCTCCTTCCCTTCCTTCCTTCCCTTTCCTTCCTTCCTCTTTCCCTCCCTTCTTTCCTTCCCTTTCCCTCCCTCCCTTCCTTCCTCTTTCCTTCCTTCCTTTCCCTCCCTTACTCCTTCCTTCCCTTCCCCTTCCTTCTTCCTTCTCTCCCTCCCTCCCTTCCCCTCCCTTACTCCCTTCCTTCCTCCTTCCCTCCCTCCTTTCCTTCATTCCCTTCCTTCCCCTTCCCCTTCCTTCCTTCTCTCCCTCCCTCCTTCCTTCCCTCCTTTCCTTCCTTCCTTCCTTTCCTTTCCCTCCTTCCTCCCTCCCTCCTTTCCTTCCTTCCTTTCCTTTCCTCCCTTCCCTCCCTCCCTCCCTCCCTTCCTTCCCCTCCCTCCCTCCTTTCCTTCTTTCGACAGAGTCTTG".getBytes(), + Arrays.asList(intervalOne, intervalTwo)); + data.add(new Object[]{contig, Arrays.asList(intervalOne), Arrays.asList(intervalOne, intervalTwo), 1, 2}); - // case for two equally-good configurations, having same num of alignments, one has lower total NM - sam = 
"asm000168:tig00027\t0\tchr1\t4939534\t60\t54S139M96S\t*\t0\t0\tGTCCTCCGTATGACGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGACATCAATATCCTCCATATGATGTCAGTGTGCTCCATATGACATCAATATCCTCCATATGATGTCAATATCCTGCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAATATCCTCTGTATGATGTCAGTGTCCTCCATATGATGTCAATCGCCTCCATATGATGCCAATATCCTCCGTATGATGTCAATGCCCTCCGTATGATGTCAATGTCCTCCGT\t*\tSA:Z:chr1,4939535,+,155H134M,60,15,61;chr1,4939436,+,66M223H,23,3,51;chrUn_JTFH01000538v1_decoy,1338,-,153M136H,60,1,148;\tMD:Z:14A13C10T0G5G24C52G1G12\tRG:Z:GATKSVContigAlignments\tNM:i:8\tAS:i:99\tXS:i:45"; - alignedContig = fromPrimarySAMRecordString(sam, true); + intervalOne = new AlignmentInterval(new SimpleInterval("chr2", 1422222, 1422435), + 1, 270, TextCigarCodec.decode("75M56I139M"), false, 60, 56, 142, ContigAlignmentsModifier.AlnModType.NONE); + intervalTwo = new AlignmentInterval(new SimpleInterval("chr2_KI270774v1_alt", 105288, 105555), + 1, 270, TextCigarCodec.decode("114M1I27M1I127M"), false, 56, 13, 179, ContigAlignmentsModifier.AlnModType.NONE); + contig = new AlignedContig("asm002608:tig00001", "ATGCTGGGGAATTTGTGTGCTCCTTGGGTGGGGACGAGCATGGAAGGCGCGTGGGACTGAAGCCTTGAAGACCCCGCAGGCGCCTCTCCTGGACAGACCTCGTGCAGGCGCCTCTCCTGGACCGACCTCGTGCAGGCGCCTCTCCTGGACAGACCTCGTGCAGGCGCCTCTCCTGGACCGACCTCGTGCAGGCGCCGCGCTGGACCGACCTCGTGCAGGCGCCGCGCTGGGCCATGGGGAGAGCGAGAGCCTGGTGTGCCCCTCAGGGAC".getBytes(), + Arrays.asList(intervalOne, intervalTwo)); + data.add(new Object[]{contig, Arrays.asList(intervalTwo), Arrays.asList(intervalOne), 3, 1}); - data.add(new Object[]{alignedContig, - Arrays.asList( - new AssemblyContigWithFineTunedAlignments( - new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), - Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1), new AlignmentInterval("chrUn_JTFH01000538v1_decoy,1338,-,153M136H,60,1,148"))), - Collections.singletonList(new AlignmentInterval("chr1,4939535,+,155H134M,60,15,61").toPackedString()), - true, - 
(AlignmentInterval)null - ), - new AssemblyContigWithFineTunedAlignments( - new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), - Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1), new AlignmentInterval("chr1,4939535,+,155H134M,60,15,61"))), - Collections.singletonList(new AlignmentInterval("chrUn_JTFH01000538v1_decoy,1338,-,153M136H,60,1,148").toPackedString()), - true, - (AlignmentInterval)null) - ) - }); + intervalOne = new AlignmentInterval(new SimpleInterval("chr21", 30374719, 30375721), + 1, 1002, TextCigarCodec.decode("966M1D36M2362H"), true, 60, 6, 960, ContigAlignmentsModifier.AlnModType.NONE); + intervalTwo = new AlignmentInterval(new SimpleInterval("chr21", 30375922, 30378473), + 826, 3364, TextCigarCodec.decode("825S33M1D1047M7D553M5D906M"), true, 60, 24, 2423, ContigAlignmentsModifier.AlnModType.NONE); + AlignmentInterval intervalThree = new AlignmentInterval(new SimpleInterval("chr1_KI270760v1_alt", 22529, 23531), + 1, 1002, TextCigarCodec.decode("966M1D36M2362H"), true, 14, 3, 975, ContigAlignmentsModifier.AlnModType.NONE); + AlignmentInterval intervalFour = new AlignmentInterval(new SimpleInterval("chr1_KI270760v1_alt", 23681, 26220), + 826, 3364, TextCigarCodec.decode("825H33M1D2506M"), true, 60, 2, 2517, ContigAlignmentsModifier.AlnModType.NONE); + contig = new AlignedContig("asm027070:tig00000", 
"GAGCCCATCTCCTTGACTGTGGCTCTGATGCTGCCTCCACACTGGGATCTCTCTGCTCTCTTCACCTCATACCTCCTTCCCCCCACCTCACCCCATCGCCCCCGTTCTTGATCCTGCAATTGTAGAAACAGAAAGTTGGCTGATTTCTTGGGCCCGCAAATTGCCCAACAGGGAGACTGGGTGGGCGGCCCCCGCTTCCACTCCATCGCCCACCCTGATGCATCGTCTGACACTTTCAATTTATTTTTCAATTCCTCTACCATCAGAAATGACGATTAGATTTCCAGCATAAATACCGCCTTACCAAACTGAATTAATCACGGCAAGGAGGGGCACACACAGGCTCCAGCAGCCTGGGCAGAACATCCCCAGCATTAACCCTTCCGTCCTCACCCAGGCCCCCACCAGCAGGACGGAGGCTCCAGGCCTCACAGAAGACGCCACTCAAAATATCACTGGGGTCACCTAATCCCATCCCCCTTACCCTTTGCAGCCTCCCTCCTGTGGGAGTTCCTAGGAAGTGTCTTGCCCAAAGCCATCCACTCCATCAGGGCAGAGTCAGAGACACTGGCCCCTCATCTCCAGCCCCATCAGGGAAGGAGGCTCCATCCACATCCAGGACAAGATGTGGGAGTATCCGGGGTTTGGCGTTGTCCAGGACACATACGGGACGGGACTCCTGCAGACCCGAGGGTGGGGGCACCCAGTGATCACAGGGCCTGAACTGAAAGGGGTCTTGGAGAGACCTGGAGGCAGGTTCCAACCCTTGCCCCACAAACAAGACCATCACCCCTCTTTGCTGAGACTGTTCATTGCTCAGTCCAACAACCACAGCTCAGGTTGACCTCCAGCCTCCCCACTTCTCCACCTCCCTGACTCCAACCACAGCTCAGGGTGACCTCCAGCCTCCCCACTTCTCCACCTCCCTGACTCCAACCACAGCTCAGGGTGACATCCAGCCTCCCCACTTCTCCACCTCCCTGACTCCAGCCACAGCTCAGGCTCCTTCCTATGAGACCCCCATGGCCTCTCACAGCCTCTCCACTTCTATGCCTGTTCTCACCCAATCCCCATCCCTCAGCAGTCATCACCTCAAAATGCAAACACTGTCCTATGGTTTCCTGGCTCAGAACCCATCGGGCCCTCCTCTGCTCTCAAATCAGGCCCCCACCCTTCAAGGCCATGAGGACTGGGCTGGCCTGGCCCCTACCGGTCAGTGCACTCCCCCATCCTGGCTGGGTTGTCTCCTCTTTCTCCTTCAAGTTTTTCTATTTAAAATTCCCCTCCTCAGAGAACCTTCTCTGGCCACCATCCCCCAATCTAAATTAGGTTCTCCCTCCTAAGGTTCTTTCTCAAATCCATTTCCTTTCCTTCTGAGCACTTAAGCGAGCGATAATTACACACTAACTTGTGTAATTTGTTTAATAGGATCTTTGGGACAGAGACTTTATCTGACTCGCTTGATGCTGCAGCTGCTAGAACCCAGACCGTAATGTAGTGGGAGCTCAGTGCAGACTTTTGAAGGAGTAAGTGAGTAAAAGAACAACAAGCCCCTCTTGGTGCCCACCAAGTGCCAAGCTGAGACTGGGCCCTGGAGCTGGAGTCAAGATGTGGACCTGGCCTTGGTGTGCTGGGCCCTAACAGATGAGTAGGAGTTTGCCGAGCACTGAAGGTGGGGTTGACATGACCAACTTCTGAGAGGCACTCTTTGCCTCTGGATGGCCCCTTCCCAGTCACCCCAAAAGGAAGCCCTTGCCCTTTCAAAAGTGGTGAATGTGGTGGTTCAGATCGGTAGGTGTTCCTATGAATAGGTGAGGGGCCAGGCTTCAGGTCAGTTGAACCTGGGTTTGAATCCTGATTTTGCTCTTGGTACTAGGGCAGGTCACTGAGACGCTCTGAGCCTCTCTGCTCCAGGATGAGGATCCCTTCATCCATGCTCACTCAAAGTCCTGCCCACCAGGATGGAGGCAGACAGGCTGCAATGCCCTCCCCTCTCAGT
GGGGGAAAAATACCAGGTCAGGCAGCCAGCAGCCGAGAATGCCAGGCAGAGCAAAGGTGTCCTAAGGGATGGACAGAATAAGGGCTTGAGAGCCTAGCCAAGGGTGAGGCTAGGAGAGGCTTCCCGGAGGACGAGGCAAGTCAGAGCTCTTTGCCTCTTACTCCCATGACTGTGGGTGCCTTTCTCCTCCTCCTCTCATTCTCTCTCCTTTCCAGCTCCTGCTCTGCTCATTTCTTCACCTCAGTCTCTCTGCCCCGACAGGAGCCCTGAGGGACACAACCCCGTCCCGAGGAATGTATCTGCCCACTTCCAGCAGGTTCCTGGAGGCCCTCTAAATTCCCCTTCCCCCCAAAGTCATCTCCCAACACTGCTGCTCCCAGGGTGGGACGCCTGCTGCTGCACCTCCACACACGTGCACACACCCAGCCAGGTGCAGACAGCGTGGGCAGTGCAGAGGGGAGGGCTGGGGATTAAGGAGTTCGTGTTCTTGAGCAGCCTGGAAAGCAGCAGGGCTTCCACAGGAGCCGCCCCTGCCCTCACCCCTGCCCAGTAGGGTTAAGGGGCTGGCTTAGATGTCACCCCAAGCCAAGGCTGTCCTTCTCAGAGGCTCCTTCCCAGCTCCCCTGAGTGGGTCAGTCCCTTCCCCTCTCTGAGCCCCTCTTTCCTCTTCTGTAAAGCAGACTCAGTGATGTTGCTCAGAGGATTGAAGGACAAAGAAAAGCAACACAATGGACAGCAGGGATTTGCAAACAGCCGGGTGCTGTACCCAAGACAGGGTATTGCTGGTGATGTCTGATGGATGGGGAGTTGAAAGACTCAGCTGTCACTGGGCAGCTGGGTCTGGTTCCCCTGAGTCATTCGTAATTCACCAACCCAGTCTATAGAAGCTTATTAAGCACTTATTGTGTGCCATGCTCCATGCAAGGGCCAAAGACACCATGAGCAGAGCCAGACCCCACCCTCAGGTTCCCCCATGGGATGGGGTTAGCCAGATGACCTGAAGGCCTCTCCAGCCAGCTCAACCCCCTTAATCCAGAATTACTCCCTGTGCCAGGCTGACGGTGTGGCCAGAGAGGCCAGGGCCTGGGAGGGGGCCTGGCAGTGGGTGGTGGGAAGAGATGGAGTGGCTGTGTCAGGGGAAGGAGAGAGCAGGTTGTTCCTGTACAGGTTTCGCTCCTCGGATAGGGGGCTGCAATGACAGCTTCCAGGAAAGACCAGGCAAGTGCCTCACCCCATCCATTCTTGCTCACCCCTGCGGCCTCTTGGCCAATGGCTGCTGTGACCCTGTCCTCCTCTGGGAATCTGGTCTCGGGGAGGAGCCCTGGACCCTGACATTGACTAGAAACCTGACCCCATGTCTGAGCA".getBytes(), + Arrays.asList(intervalOne, intervalTwo, intervalThree, intervalFour)); + data.add(new Object[]{contig, Arrays.asList(intervalOne, intervalFour), Arrays.asList(intervalThree, intervalFour), 2, 2}); + + // this is a case where {intervalOne} is equally good with {intervalOne, intervalTwo}, but somehow the score for latter case is tiny bit better than the first + intervalOne = new AlignmentInterval(new SimpleInterval("chr20", 60230348, 60231029), + 1, 682, TextCigarCodec.decode("682M"), false, 57, 68, 342, ContigAlignmentsModifier.AlnModType.NONE); + intervalTwo = new AlignmentInterval(new 
SimpleInterval("chrUn_JTFH01001804v1_decoy", 3674, 4300), + 1, 627, TextCigarCodec.decode("627M55H"), true, 60, 1, 622, ContigAlignmentsModifier.AlnModType.NONE); + contig = new AlignedContig("asm005003:tig00056", "AAAACTGCTCTATCAGAAGAAAGGTTAAGCTCTGAGAGTTGAACGCACACATCACAAAGTAGTTTCTAAGAATCATTCTGTCTGGTTTTCCTATGAAGATATTGCCTTTTCTACCATAGGCCTCAAACGGCACTAAATATCCTCTTTGAAATCCTTCAAAAAGAGACTCTCAAAACTTCTCTATCGAAAGGAAGGTTCAACACCGTGAGTTGAAAGCACACATCAGAAAGAAGTTTCTGAGAAGTATTCTGTCTAGTTTTATAGGAAGAAATCACGTTTCAAAAGAAGGCCACAAAGAGGTCCAAATATCCACTTGCAGATTCTACAAAAAGAGTGTTTCAAAACTGCTCTATCAAGAGAAATGTTCATCTCCGTGAGGTGAATGCAAATATTTCAATGTAGTTTCTGACAGTGCTTCTGTCTAGTTTTTATGTGAAGATATTTCCTTTTCTACCGTAGGCCTCAAAACACTCTCAATATACACTTGCAAATTCCACAAAAAGAGTGATTCAAAACTGCTCTATCAAAAGAAATTTTAAACGCTGTAAGCTGAATGCACACATCACAAAGTAGTTTCTGAGAATGATTCTGTCTAGTTTTTCTATGAAGATATTTCCTTTTCTACCATAGGCCTTGAAGCGCTCTAAATATCCACTTGGAAATTCTACAAAAAGAGTATTTC".getBytes(), + Arrays.asList(intervalOne, intervalTwo)); + data.add(new Object[]{contig, Arrays.asList(intervalOne, intervalTwo), Arrays.asList(intervalOne), 2, 1}); return data.toArray(new Object[data.size()][]); } - @Test(groups = "sv", dataProvider = "forConfigurationSorting") - public void testConfigurationSorting(final AlignedContig alignedContig, - final List expectedRepresentationsInOrder) { + @Test(dataProvider = "forMiscFunctions", groups = "sv") + public void testMiscFunctions(final AlignedContig contig, + final List configuration, + final List configurationEquallyGoodOrBetter, + final int expectedConfigurationCount, + final int expectedAICount) { + + final double scoreOne = computeScoreOfConfiguration(configuration, b38_canonicalChromosomes, 60); + final double equallyGoodOrBetterScore = computeScoreOfConfiguration(configurationEquallyGoodOrBetter, b38_canonicalChromosomes, 60); + assertTrue( scoreOne < equallyGoodOrBetterScore || scoreOne - equallyGoodOrBetterScore <= Math.ulp(equallyGoodOrBetterScore)); - List result = - IteratorUtils.toList( - 
AssemblyContigAlignmentsConfigPicker.reConstructContigFromPickedConfiguration( - new Tuple2<>(new Tuple2<>(alignedContig.getContigName(), alignedContig.getContigSequence()), - AssemblyContigAlignmentsConfigPicker.pickBestConfigurations(alignedContig, - hg38CanonicalChromosomes, 0.0)))); + assertEquals(pickBestConfigurations(contig, b38_canonicalChromosomes, 0.0).size(), expectedConfigurationCount); - Assert.assertEquals(result, expectedRepresentationsInOrder); + if (expectedConfigurationCount == 1) { + + final List alignments = + gatherBestConfigurationsForOneContig( + SparkContextFactory.getTestSparkContext().parallelize(Collections.singletonList(contig)), + b38_canonicalChromosomes, 0.0).values().collect().get(0).get(0).getGoodMappings(); + assertEquals(alignments.size(), expectedAICount, + alignments.stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()).toString()); + } } + // step 3: reconstruction from the picked configurations =========================================================== - @DataProvider(name = "forSpecialCaseGapSplit") - private Object[][] forSpecialCaseGapSplit() { + // functionality group 3.1: split gaps + @DataProvider + private Object[] forSplitGapsAndKeepChildrenTogether() { final List data = new ArrayList<>(20); AlignmentInterval noGap; @@ -353,7 +262,19 @@ private Object[][] forSpecialCaseGapSplit() { false, new GoodAndBadMappings(Collections.singletonList(noGap), Arrays.asList(new AlignmentInterval(new SimpleInterval("chr1", 1_000_101, 1_000_400), 101, 400, TextCigarCodec.decode("100S300M600S"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT), - new AlignmentInterval(new SimpleInterval("chr1", 1_000_601, 1_001_200), 401, 1000, TextCigarCodec.decode("400S600M"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT))) + new AlignmentInterval(new SimpleInterval("chr1", 
1_000_601, 1_001_200), 401, 1000, TextCigarCodec.decode("400S600M"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT))) + }); + + noGap = new AlignmentInterval(new SimpleInterval("chrUn_JTFH01000492v1_decoy", 501, 1597), + 1, 1097, TextCigarCodec.decode("1097M6H"), + true, 60, 1, 1092, ContigAlignmentsModifier.AlnModType.NONE); + gapped = new AlignmentInterval(new SimpleInterval("chr17", 26962248, 26962806), + 483, 1103, CigarUtils.invertCigar(TextCigarCodec.decode("121M1D142M1I165M62I130M482S")), + false, 60, 97, 281, ContigAlignmentsModifier.AlnModType.NONE); + data.add(new Object[]{new Tuple2<>(noGap, gapped), + false, + new GoodAndBadMappings(Collections.singletonList(noGap), + Lists.newArrayList(ContigAlignmentsModifier.splitGappedAlignment(gapped, GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, 1103))) }); // case two: gapped alignment provides better coverage with a D-gap @@ -367,7 +288,7 @@ private Object[][] forSpecialCaseGapSplit() { data.add(new Object[]{new Tuple2<>(noGap, gapped), true, new GoodAndBadMappings(Arrays.asList(new AlignmentInterval(new SimpleInterval("chr1", 1_000_101, 1_000_400), 101, 400, TextCigarCodec.decode("100S300M600S"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT), - new AlignmentInterval(new SimpleInterval("chr1", 1_000_601, 1_001_200), 401, 1000, TextCigarCodec.decode("400S600M"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT)), + new AlignmentInterval(new SimpleInterval("chr1", 1_000_601, 1_001_200), 401, 1000, TextCigarCodec.decode("400S600M"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT)), Collections.singletonList(noGap)) }); @@ -378,27 +299,173 @@ private Object[][] forSpecialCaseGapSplit() { data.add(new Object[]{new 
Tuple2<>(noGap, gapped), true, new GoodAndBadMappings(Arrays.asList(new AlignmentInterval(new SimpleInterval("chr1", 1_000_101, 1_000_400), 101, 400, TextCigarCodec.decode("100S300M600S"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT), - new AlignmentInterval(new SimpleInterval("chr1", 1_000_401, 1_000_850), 551, 1000, TextCigarCodec.decode("550S450M"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT)), + new AlignmentInterval(new SimpleInterval("chr1", 1_000_401, 1_000_850), 551, 1000, TextCigarCodec.decode("550S450M"), true, 60, AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT)), Collections.singletonList(noGap)) }); return data.toArray(new Object[data.size()][]); } - @Test(groups = "sv", dataProvider = "forSpecialCaseGapSplit") - public void testSpecialCaseGapSplit(final Tuple2 nonGappedAndGappedAlignment, - final boolean expectedGappedAlignmentOffersBetterCoverage, - final GoodAndBadMappings expectedOutput) { + @Test(groups = "sv", dataProvider = "forSplitGapsAndKeepChildrenTogether") + public void testSplitGapsAndKeepChildrenTogether(final Tuple2 nonGappedAndGappedAlignment, + final boolean expectedGappedAlignmentOffersBetterCoverage, + final GoodAndBadMappings expectedOutput) { Assert.assertEquals( - AssemblyContigAlignmentsConfigPicker - .gappedAlignmentOffersBetterCoverage(nonGappedAndGappedAlignment._2, nonGappedAndGappedAlignment._1), + gappedAlignmentOffersBetterCoverage(nonGappedAndGappedAlignment._2, nonGappedAndGappedAlignment._1), expectedGappedAlignmentOffersBetterCoverage ); - Assert.assertEquals( - AssemblyContigAlignmentsConfigPicker.splitGaps( - new GoodAndBadMappings(Arrays.asList( nonGappedAndGappedAlignment._1, nonGappedAndGappedAlignment._2))), - expectedOutput + final GoodAndBadMappings actual = 
splitGapsAndKeepChildrenTogether( + new GoodAndBadMappings(Arrays.asList(nonGappedAndGappedAlignment._1, nonGappedAndGappedAlignment._2))); + Assert.assertEquals(actual, expectedOutput); + Assert.assertEquals(actual.hashCode(), expectedOutput.hashCode()); + } + + @DataProvider + private Object[][] forSplitGapsAndDropContainedAln() { + final List data = new ArrayList<>(20); + + String samString = "asm005554:tig00000\t0\tchr3\t96003552\t60\t1137M58I16M438S\t*\t00\tTATTAAAACCTAGCATTGATACTGATAGACAATGAGACTGGAGGATTTTTCTCTAGTGAAATAAATGAAGCAAGTAAAAAAATTGTCAGTTATGAAAATAAATCCTTGACCTTCACCATTTTATACTTCAATCACACAAACCCAAAATAAAACACTACTCATGATCACACAGATTCCATTCAACTTTTTAGTACTTTGATTTTAAATATTACTAGAAAATTTATGATTAGTGAATATTAGAAAAAGAAATCAGTTTCTACCATAAAAATACAGAAATACAAATAAGAAGAAAAAAAGAAAATTAGAGAAAACAGAGGCATTATCAAAGGTGATAGAACTTAAAAAAAAACCTTAATTTAATATACCAAGGTATATATGGTAATACATTACATCCATTAATATACCACAAGTTTAGATATCCCTTCTGTTCATATCCCATTGATAAATTTTTATGTAAATTGCCACATCTAAAATCAAGGAAAACAGGTAACATAGTTTGGCTGAATAGCAAATTATTTGAGAGGGGGAAAAGCAAATATACGTATCCACTAAAAAAAAACAAACAAAAACTAATATCATTATAAAAAGAGTTTTAAAAAAAGGAGAACAAAAAATATAGAGATCACAACATCAAATAACTTTTTCAATAAATATGTTCTGGAGAAAGGGATGACAACAGTTGTCTTGTGTTCATTTGCCCCACAAAAAGAGACCACAACGTCAATAAACAAGTTGACTTCAACGCCAGTGACTGAGGAGGTATGGTGGAGAGAACCAGGGGAGCAGCAAAATCTCTGTAGAACATAGAAGCCCAGGATAACACCATAGAGAGGGGAATAAGACATCCCACCCCTGCCACACTGTCTCCCCTGATAGGATTGGCTCAGTGGTGGGAGGACTTATTTTAGGAAAAAGATAAGCTGGAGATCCCTCTTGGTCCCCATTTCTACCATGGACACAAGACAACCTTTCTACAGGAGAGACCCACTGTCCTCACAGCCCCTAAATCCAGTTTGGAGAGTTCTTAAAAGTTTACACCATGCATCCCCCACTCTCATGATCTAAAAATTATACTTTTTTATATATGTATGCATACATATATAATTTTTATATATGTATGCATACATATATAATTTTTATATATGTATGCATACATATATAATTTTTATATATGTATGCATACATATAATTTTTATATATGTATGCATACATATATAATTTTTATATGTATGCATACATATATAATTTTTTATATGTATGCATACATATATAATTTTTATATGTATGCATACAATATAATTTTTTATATATTGTATGCATACATATACTGTTTTATATATGTATGCATACATATATACTTTTTTGTATGTACACATACACATATAATTTTTTATATATGCATGCATACATATATGATTTTTATATAGGCATGCATACATATATAATTTTTTATATATGCATGCATATATAAATGCATATAACTTTTTATATATGTATGCATACATAACTTTTTATATATGTATGCATACATAA
CTTTTTATATATGTATGCATACATAACTTTTATATATGTATGCATACATAACTTTTTATATATGTATGCATACATATATAACTTTTTATATATGTATGCATACATATATAACTTTT\t*\tSA:Z:chr3,96004627,+,1189S35M26I399M,60,26;\tMD:Z:1153\tRG:Z:GATKSVContigAlignments\tNM:i:58\tAS:i:1079\tXS:i:58"; + AlignmentInterval gapped = fromSAMRecordString(samString, true); + samString = "asm005554:tig00000\t2048\tchr3\t96004627\t60\t1189H35M26I399M\t*\t00\tTTTTTATATATGTATGCATACATATATAATTTTTATATGTATGCATACATATATAATTTTTTATATGTATGCATACATATATAATTTTTATATGTATGCATACAATATAATTTTTTATATATTGTATGCATACATATACTGTTTTATATATGTATGCATACATATATACTTTTTTGTATGTACACATACACATATAATTTTTTATATATGCATGCATACATATATGATTTTTATATAGGCATGCATACATATATAATTTTTTATATATGCATGCATATATAAATGCATATAACTTTTTATATATGTATGCATACATAACTTTTTATATATGTATGCATACATAACTTTTTATATATGTATGCATACATAACTTTTATATATGTATGCATACATAACTTTTTATATATGTATGCATACATATATAACTTTTTATATATGTATGCATACATATATAACTTTT\t*\tSA:Z:chr3,96003552,+,1137M58I16M438S,60,58;\tMD:Z:434\tRG:Z:GATKSVContigAlignments\tNM:i:26\tAS:i:392\tXS:i:204"; + AlignmentInterval nonGapped = fromSAMRecordString(samString, true); + ArrayList splitAlignments = Lists.newArrayList((ContigAlignmentsModifier.splitGappedAlignment(gapped, GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, 1649))); + data.add(new Object[]{new GoodAndBadMappings(Arrays.asList(gapped, nonGapped)), + new GoodAndBadMappings(Arrays.asList(splitAlignments.get(0), nonGapped), Collections.singletonList(splitAlignments.get(1))) + }); + + return data.toArray(new Object[data.size()][]); + } + @Test(groups = "sv", dataProvider = "forSplitGapsAndDropContainedAln") + public void testSplitGapsAndDropContainedAln(final GoodAndBadMappings input, final GoodAndBadMappings expectedOutput) { + Assert.assertEquals(splitGapsAndDropAlignmentContainedByOtherOnRead(input), + expectedOutput); + } + + // functionality group 3.2: after gap split, further reclassify some good mapping as bad + + @DataProvider + private Object[][] forRemoveNonUniqueMappings() { + final List data = new ArrayList<>(20); + + // case zero: only one or two 
alignments + AlignmentInterval singleAlignment = fromSAMRecordString("asm000146:tig00004\t2048\tchrUn_JTFH01001925v1_decoy\t600\t60\t646M4D370M\t*\t0\t0\tTGACTGAGCAAGTGTGGGAGTGTGAGTGAATGAGTGAGTGAGTGAATGAGTGACTGAGTGTGAGTGAACGAGTGACTGAGCAAGTGTGTGAGTGACTGAGCGAGTGTGTGTGAGTGAATAAGTGAGTGAGTGAATGAATCAGTGACTGTGTGTGATGAGCAAGTGTATGTGAATGAGTGACTGAGCAAGTGAATGAGTGTGAGTGTGTGAGTGACTGTGAGTGAGTGAATGAGCAAGTGTGAGTGAATGAGTGTGTGTGTGAATGAGCGAGTGAGTGAATAAGTGAGTGAATGAGTGTGCGAGTGAGTGAATGATTGTGAGTGAATGAGTGAGTGAGTAAGTGTGAGCGTGTGAGTGAATGAGTGTGTGTGAGTGAATGAGTGGATGAGTGTGAGTGAATGAGTGACTGAGCGAGTATGTGAGTGAATGAGTGACTGTGAGTGAGTGAGCAAGTGTGAGTGAATGAGTGTATGAGTGAATGAATGAGTGAGCGAGTGTGTGTGACTGAATGAGTGTGAGTGTGTGAGTAAATGAGTGTGTGAATGAGTGACTGTGACTGAGTGTGAATGAGTGACAGCAAGTGTGTGAGTGAATGAGTGTGAATATGAGTGAGTGAATGAGTGAGCAAGTGTGTGAGTGAATGAGTGTATGAGTGAATGAATGAGCGAGTGTGTGTGACTGAATGAGTGTGTGTGAGTAAATGAGTGTGAGTGAATGAGTGAGTGACTGTGAGTGTGAATGAGTGACTGAACAACTGTGTGTGAATGAGTGTATGAGTGAATGAATGAATGAGCGTGTGTGTGAGTGACTGAATGAGAGTGTGAGTAAATGAGTGTGAGTGAATGAGTGAGTGACTGAGTGACTGAATGAGTGACTGAGCAAGTGTGTGAGTGAATGAGTGTGAGTATGAGTGAATGAGTGAGCATGTGTGTGAGTGAGTGAGTGGGTGTGAGTGAGTGAATGAGTGACTGAATGTGTGAGTGTGAGTGAATGAGTGACTGAATGTGAGTGTGACT\t*SA:Z:chr1,4066380,-,441S22M2D18M2I138M2I10M4D28M1I24M10D13M4D49M4D51M4D65M2D50M10I34M2I56M,60,59;chr1,4064439,-,33S55M4I58M4D34M832S,20,15;chr1,4064651,-,255S32M2D99M630S,35,15;chr1,4064590,-,368S73M2D23M552S,60,8;\tMD:Z:314G2A0T0T4G3A317^GTGA370\tRG:Z:GATKSVContigAlignments\tNM:i:10\tAS:i:966\tXS:i:259", true); + data.add(new Object[]{new GoodAndBadMappings(Collections.singletonList(singleAlignment)), new GoodAndBadMappings(Collections.singletonList(singleAlignment)), + AssemblyContigAlignmentsConfigPicker.ALIGNMENT_MQ_THRESHOLD, AssemblyContigAlignmentsConfigPicker.ALIGNMENT_LOW_READ_UNIQUENESS_THRESHOLD}); + + AlignmentInterval head = new AlignmentInterval(new SimpleInterval("chr1:1-100"), 1, 100, TextCigarCodec.decode("100M94S"), true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE); + AlignmentInterval 
tail = new AlignmentInterval(new SimpleInterval("chr1:137-200"), 137, 200, TextCigarCodec.decode("130S64M"), true, 60, 0, 64, ContigAlignmentsModifier.AlnModType.NONE); + data.add(new Object[]{new GoodAndBadMappings(Arrays.asList(head, tail)), + new GoodAndBadMappings(Arrays.asList(head, tail)), + AssemblyContigAlignmentsConfigPicker.ALIGNMENT_MQ_THRESHOLD, AssemblyContigAlignmentsConfigPicker.ALIGNMENT_LOW_READ_UNIQUENESS_THRESHOLD + }); + + // case one: some alignments are bad MQ + int middleAlnMQ = 19; + AlignmentInterval middle = new AlignmentInterval(new SimpleInterval("chr2:1-29"), 102, 130, TextCigarCodec.decode("101S29M64S"), false, middleAlnMQ, 2, 20, ContigAlignmentsModifier.AlnModType.NONE); + data.add(new Object[]{new GoodAndBadMappings(Arrays.asList(head, middle, tail)), + new GoodAndBadMappings(Arrays.asList(head, tail), Collections.singletonList(middle)), + AssemblyContigAlignmentsConfigPicker.ALIGNMENT_MQ_THRESHOLD, AssemblyContigAlignmentsConfigPicker.ALIGNMENT_LOW_READ_UNIQUENESS_THRESHOLD + }); + middleAlnMQ = 20; + middle = new AlignmentInterval(new SimpleInterval("chr2:1-29"), 102, 130, TextCigarCodec.decode("101S29M64S"), false, middleAlnMQ, 2, 20, ContigAlignmentsModifier.AlnModType.NONE); + data.add(new Object[]{new GoodAndBadMappings(Arrays.asList(head, middle, tail)), + new GoodAndBadMappings(Arrays.asList(head, middle, tail)), + AssemblyContigAlignmentsConfigPicker.ALIGNMENT_MQ_THRESHOLD, AssemblyContigAlignmentsConfigPicker.ALIGNMENT_LOW_READ_UNIQUENESS_THRESHOLD + }); + + // case two: some alignments would be too short after overlap removal + + // first, no overlap but still too short + middleAlnMQ = 40; + middle = new AlignmentInterval(new SimpleInterval("chr2:1-9"), 122, 130, TextCigarCodec.decode("121S9M64S"), false, middleAlnMQ, 0, 9, ContigAlignmentsModifier.AlnModType.NONE); + data.add(new Object[]{new GoodAndBadMappings(Arrays.asList(head, middle, tail)), + new GoodAndBadMappings(Arrays.asList(head, tail), 
Collections.singletonList(middle)), + AssemblyContigAlignmentsConfigPicker.ALIGNMENT_MQ_THRESHOLD, AssemblyContigAlignmentsConfigPicker.ALIGNMENT_LOW_READ_UNIQUENESS_THRESHOLD + }); + // or just long enough + middle = new AlignmentInterval(new SimpleInterval("chr2:1-10"), 121, 130, TextCigarCodec.decode("120S10M64S"), false, middleAlnMQ, 1, 9, ContigAlignmentsModifier.AlnModType.NONE); + data.add(new Object[]{new GoodAndBadMappings(Arrays.asList(head, middle, tail)), + new GoodAndBadMappings(Arrays.asList(head, middle, tail)), + AssemblyContigAlignmentsConfigPicker.ALIGNMENT_MQ_THRESHOLD, AssemblyContigAlignmentsConfigPicker.ALIGNMENT_LOW_READ_UNIQUENESS_THRESHOLD + }); + + // now real data + String samString = "asm000266:tig00003\t0\tchr1\t10817996\t60\t1017S43M2I870M\t*\t0\t0\tCTTCTGCCGCCCAGGCTCCCCTGGGATTCTGCAGCCTCCTCCTTGATGGCTGCTGGCCCTGCCCACCTGCCGTTCTTGCAGTGGCAAACCTGAGCCCACAGTCCCCTGCTCAAAGCCCATCGGAGGCTCCTGGGGCCTGCAGGGCCTGGTCCAGGTCCCTTCACATGACTCGCAAGGTCCCACCACCCTCTCTGGCCTCACCCTCTCCTCTCTTCGCTGGGGCTCCCCCTCTCCAATGCACTGGCCTGCACTCACTTCCCCAGGCCCAGGTGGTCTAGCCCCCACCTTTGCCCCTGCTGTGGCTTCCCAGGGAATGCTCTTCCTACCTGCTCCCTGCCCCCACCCCTCTGTTGTAAGATCTCAAATGAGACAGCACCTTCCTGGCTCCTGCCTCCCTAGCCTTGACCCCCCTGCAAGTTCCCAGAAACTCTGGCTTTTCCTGCGTGTAGGACATCACCTGGTCCCTGTCTTCAGAGAAGGACATGAAGCAAGCCCACTGGTACTGGCACCTTCATTCAGCTCATTCTTCAACCAGCAAGGATTTATTGAGCACATACTATGAACAGCTGCCAGGGCTGAGCCTGGGGTGCTTGCGCCCCTGAGGACTGGGGCCCTCAGACCCAGGGGGTATGGGTGGAAGAAGAACTTGGCTATTTAGAAAGGGACTCTAGGAAGGCACATGTCATCTCCTCTCCTCCAGGCCTGAGAGCATATACAAGGCCAGTACCATGAGCTAATAATATTTTACTTTTCCCCGTAGAGCACAGCATTGGGCTTGGCATACAGTAGGGGCTCAACCAATGCAGGCAGAAGAGAACTGACAGATGATAAGGTTTTCTTTCTTTCTTTCTTTCTCTCTTTCTCCCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCCTTTCCTTTCCTTTCCTTTCCTTTCCTTCCTTTCCTTTCCTTTCCTTTCCTTTCCCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTTTTGAGACAGAGTCTTGCTCTGCCGCCCAGGCTGGAGTGCAGTGGTGTGATCTCGGTTCACTGCAACCTCCGCCTCCCGGGTTCAAGTGATTCTTGCACCTTAGCCTCCCAAGTAGCTGGGATTACAGTTGCCCCCAACCA
TGCCTGGCTAATTTTTGTATTTTTAGTAGAGGCAGGGTTTCACCATGTTGGCCAGGCTGGTCTCAAACTCCTGACCTCAGATGATCCACCCGCCTCGGCCTCCAAAAGTGCTGGGATTATAGGCATGAGCCACCGTGCCTGGCCAAAGATAACCCTACACCAGGAACTTCATGAGTTCCAGGAGGGAAAGGCAGACTAGTGTGTGTTGCAGCAGGCAGGGAGGGCTTCCTGAGGGAGGTGCTGAGAACAGGGCCTTGAAGCCCGTGGAGGCTCAAAGTAGTTGGGAGGGAGGAGGGCGGATGCTTTCTAGGGATTGTGGAGACCAGGATACAGGCAACAGGAGCAAGAGGCGTGAGGTTGAAAGCAGGTGGGAGGGGTGGGGCATCTGTACAAACATCGTGGGTGATGTTTAGGAGAGTGCCAGGCTGTGCCTCTGGCCACCACCATACCTAAGACCCCTAAGTCTTGCTCTGGCTGGGGGTGACTGCGGGCCACAGTTCTTGTCTGCAGGGAAACCAATGGCTGCAGTTAAAGACAAGGCTGCCCTCCCCCCAAGCTCCAGAGACTGGGGAGTGCCCCGGGCAGGGCTTGCCCAGACCTGGCACTCCAGCTGCACCCTCCGCCCTGGGACATCTTGTACCCAGGAGGACCTATTAAAGGGACAAAGGTCCCCATGGGGTGCAGGCACCCCAGGCTCAGCCCTGGCAGCTGGCCCGGGCTTG\t*\tSA:Z:chr1,10817090,+,854M1078S,60,3,843;chr22,22585882,+,945S125M862S,20,0,125;chr8,6150682,-,1005S91M836S,0,3,76;chr11,48640941,+,906S55M971S,60,0,55;\tMD:Z:913\tRG:Z:GATKSVContigAlignments\tNM:i:2\tAS:i:895\tXS:i:134"; + AlignedContig alignedContig = fromPrimarySAMRecordString(samString, true); + GoodAndBadMappings expected = new GoodAndBadMappings( + 
Arrays.asList(fromSAMRecordString("asm000266:tig00003\t2048\tchr1\t10817090\t60\t854M1078S\t*\t0\t0\tCTTCTGCCGCCCAGGCTCCCCTGGGATTCTGCAGCCTCCTCCTTGATGGCTGCTGGCCCTGCCCACCTGCCGTTCTTGCAGTGGCAAACCTGAGCCCACAGTCCCCTGCTCAAAGCCCATCGGAGGCTCCTGGGGCCTGCAGGGCCTGGTCCAGGTCCCTTCACATGACTCGCAAGGTCCCACCACCCTCTCTGGCCTCACCCTCTCCTCTCTTCGCTGGGGCTCCCCCTCTCCAATGCACTGGCCTGCACTCACTTCCCCAGGCCCAGGTGGTCTAGCCCCCACCTTTGCCCCTGCTGTGGCTTCCCAGGGAATGCTCTTCCTACCTGCTCCCTGCCCCCACCCCTCTGTTGTAAGATCTCAAATGAGACAGCACCTTCCTGGCTCCTGCCTCCCTAGCCTTGACCCCCCTGCAAGTTCCCAGAAACTCTGGCTTTTCCTGCGTGTAGGACATCACCTGGTCCCTGTCTTCAGAGAAGGACATGAAGCAAGCCCACTGGTACTGGCACCTTCATTCAGCTCATTCTTCAACCAGCAAGGATTTATTGAGCACATACTATGAACAGCTGCCAGGGCTGAGCCTGGGGTGCTTGCGCCCCTGAGGACTGGGGCCCTCAGACCCAGGGGGTATGGGTGGAAGAAGAACTTGGCTATTTAGAAAGGGACTCTAGGAAGGCACATGTCATCTCCTCTCCTCCAGGCCTGAGAGCATATACAAGGCCAGTACCATGAGCTAATAATATTTTACTTTTCCCCGTAGAGCACAGCATTGGGCTTGGCATACAGTAGGGGCTCAACCAATGCAGGCAGAAGAGAACTGACAGATGATAAGGTTTTCTTTCTTTCTTTCTTTC\t*\tSA:Z:chr1,10817996,+,1017S43M2I870M,60,2;chr22,22585882,+,945S125M862S,20,0;chr8,6150682,-,1005S91M836S,0,3;chr11,48640941,+,906S55M971S,60,0;\tMD:Z:0A356G251A244\tRG:Z:GATKSVContigAlignments\tNM:i:3\tAS:i:843\tXS:i:0", true), + fromSAMRecordString("asm000266:tig00003\t2048\tchr22\t22585882\t20\t945S125M862S\t*\t0\t0\tCTTTCCTTTCCTTTCCCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTT\t*\tSA:Z:chr1,10817996,+,1017S43M2I870M,60,2;chr1,10817090,+,854M1078S,60,3;chr8,6150682,-,1005S91M836S,0,3;chr11,48640941,+,906S55M971S,60,0;\tMD:Z:125\tRG:Z:GATKSVContigAlignments\tNM:i:0\tAS:i:125\tXS:i:111", true), + 
fromSAMRecordString("asm000266:tig00003\t0\tchr1\t10817996\t60\t1017S43M2I870M\t*\t0\t0\tCTTCTGCCGCCCAGGCTCCCCTGGGATTCTGCAGCCTCCTCCTTGATGGCTGCTGGCCCTGCCCACCTGCCGTTCTTGCAGTGGCAAACCTGAGCCCACAGTCCCCTGCTCAAAGCCCATCGGAGGCTCCTGGGGCCTGCAGGGCCTGGTCCAGGTCCCTTCACATGACTCGCAAGGTCCCACCACCCTCTCTGGCCTCACCCTCTCCTCTCTTCGCTGGGGCTCCCCCTCTCCAATGCACTGGCCTGCACTCACTTCCCCAGGCCCAGGTGGTCTAGCCCCCACCTTTGCCCCTGCTGTGGCTTCCCAGGGAATGCTCTTCCTACCTGCTCCCTGCCCCCACCCCTCTGTTGTAAGATCTCAAATGAGACAGCACCTTCCTGGCTCCTGCCTCCCTAGCCTTGACCCCCCTGCAAGTTCCCAGAAACTCTGGCTTTTCCTGCGTGTAGGACATCACCTGGTCCCTGTCTTCAGAGAAGGACATGAAGCAAGCCCACTGGTACTGGCACCTTCATTCAGCTCATTCTTCAACCAGCAAGGATTTATTGAGCACATACTATGAACAGCTGCCAGGGCTGAGCCTGGGGTGCTTGCGCCCCTGAGGACTGGGGCCCTCAGACCCAGGGGGTATGGGTGGAAGAAGAACTTGGCTATTTAGAAAGGGACTCTAGGAAGGCACATGTCATCTCCTCTCCTCCAGGCCTGAGAGCATATACAAGGCCAGTACCATGAGCTAATAATATTTTACTTTTCCCCGTAGAGCACAGCATTGGGCTTGGCATACAGTAGGGGCTCAACCAATGCAGGCAGAAGAGAACTGACAGATGATAAGGTTTTCTTTCTTTCTTTCTTTCTCTCTTTCTCCCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCCTTTCCTTTCCTTTCCTTTCCTTTCCTTCCTTTCCTTTCCTTTCCTTTCCTTTCCCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTTTTGAGACAGAGTCTTGCTCTGCCGCCCAGGCTGGAGTGCAGTGGTGTGATCTCGGTTCACTGCAACCTCCGCCTCCCGGGTTCAAGTGATTCTTGCACCTTAGCCTCCCAAGTAGCTGGGATTACAGTTGCCCCCAACCATGCCTGGCTAATTTTTGTATTTTTAGTAGAGGCAGGGTTTCACCATGTTGGCCAGGCTGGTCTCAAACTCCTGACCTCAGATGATCCACCCGCCTCGGCCTCCAAAAGTGCTGGGATTATAGGCATGAGCCACCGTGCCTGGCCAAAGATAACCCTACACCAGGAACTTCATGAGTTCCAGGAGGGAAAGGCAGACTAGTGTGTGTTGCAGCAGGCAGGGAGGGCTTCCTGAGGGAGGTGCTGAGAACAGGGCCTTGAAGCCCGTGGAGGCTCAAAGTAGTTGGGAGGGAGGAGGGCGGATGCTTTCTAGGGATTGTGGAGACCAGGATACAGGCAACAGGAGCAAGAGGCGTGAGGTTGAAAGCAGGTGGGAGGGGTGGGGCATCTGTACAAACATCGTGGGTGATGTTTAGGAGAGTGCCAGGCTGTGCCTCTGGCCACCACCATACCTAAGACCCCTAAGTCTTGCTCTGGCTGGGGGTGACTGCGGGCCACAGTTCTTGTCTGCAGGGAAACCAATGGCTGCAGTTAAAGACAAGGCTGCCCTCCCCCCAAGCTCCAGAGACTGGGGAGTGCCCCGGGCAGGGCTTGCCCAGACCTGGCACTCCAGCTGCACCCTCCGCCCTGGGACATCTTGTACCCAGGAGGACCTATTAAAGGGACAAAGGTCCCCATGGGGTGCAGGCACCCCAGGCTCAGCC
CTGGCAGCTGGCCCGGGCTTG\t*\tSA:Z:chr1,10817090,+,854M1078S,60,3;chr22,22585882,+,945S125M862S,20,0;chr8,6150682,-,1005S91M836S,0,3;chr11,48640941,+,906S55M971S,60,0;\tMD:Z:913\tRG:Z:GATKSVContigAlignments\tNM:i:2\tAS:i:895\tXS:i:134", true)), + Arrays.asList(fromSAMRecordString("asm000266:tig00003\t2064\tchr8\t6150682\t0\t1005S91M836S\t*\t0\t0\tGGAAAGGAAAGGAAAGGAAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGGGAGAAAGAGAGAAAGAAAGAAAGAAAGA\t*\tSA:Z:chr1,10817996,+,1017S43M2I870M,60,2;chr1,10817090,+,854M1078S,60,3;chr22,22585882,+,945S125M862S,20,0;chr11,48640941,+,906S55M971S,60,0;\tMD:Z:62A0A7A19\tRG:Z:GATKSVContigAlignments\tNM:i:3\tAS:i:76\tXS:i:72", true), + fromSAMRecordString("asm000266:tig00003\t2048\tchr11\t48640941\t60\t906S55M971S\t*\t0\t0\tCTTTCCTTTCCTTTCCTTTCCTTTCCTTCCTTTCCTTTCCTTTCCTTTCCTTTCC\t*\tSA:Z:chr1,10817996,+,1017S43M2I870M,60,2;chr1,10817090,+,854M1078S,60,3;chr22,22585882,+,945S125M862S,20,0;chr8,6150682,-,1005S91M836S,0,3;\tMD:Z:55\tRG:Z:GATKSVContigAlignments\tNM:i:0\tAS:i:55\tXS:i:0", true)) + ); + data.add(new Object[]{new GoodAndBadMappings(alignedContig.getAlignments(), Collections.emptyList()), + expected, + 20, 40 + }); + + return data.toArray(new Object[data.size()][]); + } + @Test(groups = "sv", dataProvider = "forRemoveNonUniqueMappings") + public void testRemoveNonUniqueMappings(final GoodAndBadMappings input, final GoodAndBadMappings expectedOutput, + final int mapQThresholdInclusive, final int uniqReadLenInclusive) { + GoodAndBadMappings actual = AssemblyContigAlignmentsConfigPicker.removeNonUniqueMappings(input, mapQThresholdInclusive, uniqReadLenInclusive); + Assert.assertEquals(actual, expectedOutput); + } + + // misc + + @DataProvider + private Object[][] forConfigurationSorting() { + final List data = new ArrayList<>(20); + // case for two equally-good configurations, one has fewer alignments + String sam = 
"asm001160:tig00000\t16\tchr1\t93876139\t60\t516S1317M\t*\t0\t0\tCATGTTGCCCAAGCCAGTCTTGAACTCCAGGGCTCAAAATGCTGAAATTACAGGCACGAGTCACTTACTGCTCTTAACAATCACGTACAAAAATCTTAACATATGATTTTTTTTTTTTTTTTTTGAGACAACATCTCCCTCCATTGCCCAGGCTGGAGTGCAGCGGCACAATCATGGCTCACCGCAGCCTCAATGTCCAGGGCTCAAGCAATCCTCCCACCTCAGCTTCCCAAGTAGCTGGGACCACAGGCGCACAGGGCACGGCTAATTTAAAAAAAATTTTTTGTGTAGAGATAGGGTCTCCTTATATTGCCCAGGCTGATCTCAAACACCTACTTGGGCTCAAGTGATCCTCCTGCCTCAGCCTCACAAAGTGCTGGGATTACAGGCATGAGTCACTGCATCCAACAGATTGATTTCTAATATGTCACCAAAAGGAGCACCTTTAGCTATGATTGGTGGGAAAAATATGACTAAAATAGGTATCCAAAAAGACAAGGGAAATGCTGGATAGAAGAGCCATTCCATGAAGAACCCAAGGCAGTGATTTTCTCATTCCCCAGGCTAACATTTCATATTTTTATGGTAAATTAACCACTTGAAATACATGTATCAAAAACTTATAAAAATAAAGGAAAAACTTACAGTTTAGCCTTTGTGCTATTTAGGAAGTCTTCTTCATCACTAAACTCATCTTCATTTTCGTCATGGTCTGATGAATCTTCTTCACTTTTTTCATCCTCTTCCTCTTCTTTTTCTTCCTCAATGGCAACCTCACTTGCCTTGTCTTCCTCTTCCAAGTAAAAATTTTTATCAGCACTCATTCCAGGAGTTGTGTCAATTACAAACAATGCATTGTCACATGACAGACTTCCTGTGTCTCCACTTAATGACTCCCTTTGGCCACTATTTTCAACAAAACATAAAGTATCCTCTTCATTCTCACTGTTTTCAGACTGTTGGCTTTCATCACTGCTGAGAACTAGTAAGACAGAATTATCTTTACCCTGAGATGTGTTGGGCGCAGACGTGTATAGTTTGGTATCACATTCAAAATCTACATTCCCTTCACTGTTCATGTCTTCACTGACACTTATAACTGTGGACTCTTCTTCATCATCACTACCACCACAATCACCAAACTTTGTCAAGTCACTTGCTTTTATGGGGCTCTTTTTGTTGTTATTCCATCTGCCTACTTCCACAGTTGCAAATGTTTGAGTTAATGATTTCATTACAGCCTCAGAGTTCAGATTAGAGTGCACTGATACAGCATTTTTATTTTGGGGGGTTGAATGTCTCTGAGAAACTAACTGTTGAAGGCTAGTGTCCTGAAGTTCAGAAAGATTCTTCAGCTGAGAACTTTTCTCATTAATTTCTTTCCCCTCATCTGTTATTCCATTGGCATCTTCATCCAAATCCTTACAATTCTGTTTTGTTTCTTTAAGAGATTCAACATTGGCCTGTTCGTGCACTGTTAATATATTTTCTGAACTTCTGTGGGAGAAATCATCATCAAAGTCATTATTATAGAAATTTGGCTTATTTATCTCAGAAAGAGATCTTGCTTGTAAATGGGAAGTTTGTCTGGTATCTGAATCCTCTGAATTCACAGGTGTACCCACGATCTGTTTCTCATTTCCTGGTACAATCTTACTATCTTTCTTTTCAGTTTGTGCCTTTAATTTCCTCTGCATACTCCTGGTTCTTCTAGTTGCAATTCCAGAGAATGAAATGTCTGAGCTTGATGTCTCAGCATCAGATATAGCTTCTGTATGAGATTCTTGGCTTGGATCTGTCAGAGATTTAGCCTTACTTCTTCTGGCTCCTGTA\t*\tSA:Z:chr1,61662787,-,165H74M1594H,12,2,64;chr15,43085606,+,1427H65M341H,0,2,55;chr10,96642377,+,1
660H67M106H,60,5,42;chrUn_JTFH01001621v1_decoy,641,-,106M2I408M1317H,60,5,481;\tMD:Z:1317\tRG:Z:GATKSVContigAlignments\tNM:i:0\tAS:i:1317\tXS:i:0"; + AlignedContig alignedContig = fromPrimarySAMRecordString(sam, true); + + AssemblyContigWithFineTunedAlignments tigOne = new AssemblyContigWithFineTunedAlignments( + new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), + Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1))), + alignedContig.getAlignments().subList(2, alignedContig.getAlignments().size()).stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()), + true, + (AlignmentInterval) null + ); + AssemblyContigWithFineTunedAlignments tigTwo = new AssemblyContigWithFineTunedAlignments( + new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), + Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1), alignedContig.getAlignments().get(4))), + alignedContig.getAlignments().subList(2, 4).stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()), + true, + (AlignmentInterval) null); + + data.add(new Object[]{Arrays.asList(tigTwo, tigOne), Arrays.asList(tigOne, tigTwo)}); + + // case for two equally-good configurations, having same num of alignments, one has lower total NM + sam = "asm000168:tig00027\t0\tchr1\t4939534\t60\t54S139M96S\t*\t0\t0\tGTCCTCCGTATGACGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGACATCAATATCCTCCATATGATGTCAGTGTGCTCCATATGACATCAATATCCTCCATATGATGTCAATATCCTGCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAATATCCTCTGTATGATGTCAGTGTCCTCCATATGATGTCAATCGCCTCCATATGATGCCAATATCCTCCGTATGATGTCAATGCCCTCCGTATGATGTCAATGTCCTCCGT\t*\tSA:Z:chr1,4939535,+,155H134M,60,15,61;chr1,4939436,+,66M223H,23,3,51;chrUn_JTFH01000538v1_decoy,1338,-,153M136H,60,1,148;\tMD:Z:14A13C10T0G5G24C52G1G12\tRG:Z:GATKSVContigAlignments\tNM:i:8\tAS:i:99\tXS:i:45"; + alignedContig = fromPrimarySAMRecordString(sam, true); + + tigOne 
= new AssemblyContigWithFineTunedAlignments( + new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), + Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1), new AlignmentInterval("chrUn_JTFH01000538v1_decoy,1338,-,153M136H,60,1,148"))), + Collections.singletonList(new AlignmentInterval("chr1,4939535,+,155H134M,60,15,61").toPackedString()), + true, + (AlignmentInterval)null ); + tigTwo = new AssemblyContigWithFineTunedAlignments( + new AlignedContig(alignedContig.getContigName(), alignedContig.getContigSequence(), + Arrays.asList(alignedContig.getAlignments().get(0), alignedContig.getAlignments().get(1), new AlignmentInterval("chr1,4939535,+,155H134M,60,15,61"))), + Collections.singletonList(new AlignmentInterval("chrUn_JTFH01000538v1_decoy,1338,-,153M136H,60,1,148").toPackedString()), + true, + (AlignmentInterval)null); + data.add(new Object[]{Arrays.asList(tigTwo, tigOne), Arrays.asList(tigOne, tigTwo)}); + + + return data.toArray(new Object[data.size()][]); + } + /** Sorts {@code tobeSorted} in place with the comparator returned by {@code getConfigurationComparator()} and asserts that the resulting list equals {@code expectedRepresentationsInOrder}. Each case is supplied by the "forConfigurationSorting" data provider as a pair of lists: the configurations to sort, and the same configurations in the expected order. */ @Test(groups = "sv", dataProvider = "forConfigurationSorting") + public void testConfigurationSorting(final List tobeSorted, + final List expectedRepresentationsInOrder) { + + tobeSorted.sort(getConfigurationComparator()); + Assert.assertEquals(tobeSorted, expectedRepresentationsInOrder); } }