diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java new file mode 100644 index 00000000000..71f8ad9967f --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -0,0 +1,258 @@ +package org.broadinstitute.hellbender.tools.walkers.qc; + +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileHeader; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.http.annotation.Experimental; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * Performs post-processing steps to get a bam aligned to a transcriptome ready for RSEM (https://github.com/deweylab/RSEM) + * + * + * Suppose the read name "Q1" aligns to multiple loci in the transcriptome. + * STAR aligner outputs the reads in the following order: + * + * Q1: First-of-pair (Chr 1:1000) + * Q1: Second-of-pair (Chr 1:2000) + * Q1: First-of-pair (Chr 20:5000) + * Q1: Second-of-pair (Chr 20:6000) + * + * This is the format required by RSEM. After query-name sorting the reads for duplicate marking, + * the reads will be ordered as follows; + * + * Q1: First-of-pair (Chr 1:1000) + * Q1: First-of-pair (Chr 20:5000) + * Q1: Second-of-pair (Chr 1:2000) + * Q1: Second-of-pair (Chr 20:6000) + * + * That is, all the read1's come first, then read2's. + * + * This tool reorders such that the alignment pair appears together as in the first example. + * + * Caveat: It may be desirable to remove duplicate reads before running RSEM. + * This tool does not remove duplicate reads; it assumes they have been removed upstream e.g. + * MarkDuplicates with REMOVE_DUPLICATES=true + * + * + * Usage Example: + * + * gatk PostProcessReadsForRSEM \ + * -I input.bam \ + * -O output.bam + */ +@CommandLineProgramProperties( + summary = "Reorder reads before running RSEM", + oneLineSummary = "Reorder reads before running RSEM", + programGroup = ReadDataManipulationProgramGroup.class +) + +@BetaFeature +@DocumentedFeature +public class PostProcessReadsForRSEM extends GATKTool { + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) + public GATKPath outSam; + + SAMFileGATKReadWriter writer; + + // Should use CountingFilter or another existing framework for counting and printing filtered reads + int totalOutputPair = 0; + int notBothMapped = 0; + int chimera = 0; + int unsupportedCigar = 0; + + @Override + public boolean requiresReads(){ + return true; + } + + @Override + public void onTraversalStart(){ + Utils.nonNull(getHeaderForReads(), "Input bam must have a header"); + if (getHeaderForReads().getSortOrder() != SAMFileHeader.SortOrder.queryname){ + throw new UserException("Input must be query-name sorted."); + } + + writer = createSAMWriter(outSam, true); + } + + @Override + public List getDefaultReadFilters() { + return Collections.singletonList(ReadFilterLibrary.NOT_SUPPLEMENTARY_ALIGNMENT); + } + + @Override + public void traverse() { + final CountingReadFilter countingReadFilter = makeReadFilter(); + final Iterator readIterator = getTransformedReadStream(countingReadFilter).iterator(); + ReadPair currentReadPair = null; + + // Initialize the first ReadPair object + if (readIterator.hasNext()) { + currentReadPair = new ReadPair(readIterator.next()); + totalOutputPair++; + } + + while (readIterator.hasNext()){ + final GATKRead read = readIterator.next(); + if (!currentReadPair.getName().equals(read.getName())){ + // We have gathered all the reads with the same query name (primary, secondary, and supplementary alignments) + // Write the reads to output file, reordering the reads as needed. + writeReads(currentReadPair); + currentReadPair = new ReadPair(read); + } else { + currentReadPair.add(read); + } + progressMeter.update(read); + } + + if (currentReadPair != null){ + writeReads(currentReadPair); + } + + } + + /** + * For the list of conditions required by RSEM, see: https://github.com/deweylab/RSEM/blob/master/samValidator.cpp + */ + public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { + if (read1 == null || read2 == null){ + logger.warn("read1 or read2 is null. This read will not be output. " + read1.getName()); + return false; + } + // If either of the pair is unmapped, throw out. + // With the STAR argument --quantTranscriptomeBan IndelSoftclipSingleend this should not occur, + // but we check just to be thorough. + if (read1.isUnmapped() || read2.isUnmapped()){ + notBothMapped++; + return false; + } + + // Chimeric reads are not allowed. Chimeric alignments should be just as suspect (or potentially interesting) + // in the case of transcriptome as in the genome. + if (!read1.contigsMatch(read2)){ + chimera++; + return false; + } + + // Cigar must have a single operator M. + if (read1.numCigarElements() != 1 || read2.numCigarElements() != 1){ + unsupportedCigar++; + return false; + } + + if (read1.getCigarElement(0).getOperator() != CigarOperator.M || + read2.getCigarElement(0).getOperator() != CigarOperator.M){ + unsupportedCigar++; + return false; + } else { + return true; + } + } + + /** Write reads in this order + * 1. Primary. First of pair, second of pair + * 2. For each secondary. First of pair, second of pair. + * **/ + private void writeReads(final ReadPair readPair){ + // Update read by side effect + final GATKRead firstOfPair = readPair.getFirstOfPair(); + final GATKRead secondOfPair = readPair.getSecondOfPair(); + + // Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments. + if (passesRSEMFilter(firstOfPair, secondOfPair)){ + writer.addRead(firstOfPair); + writer.addRead(secondOfPair); + totalOutputPair += 1; + + // Now handle secondary alignments. + final List> secondaryAlignmentPairs = groupSecondaryReads(readPair.getSecondaryAlignments()); + for (Pair mates : secondaryAlignmentPairs){ + // The pair is either both written or both not written. + if (passesRSEMFilter(mates.getLeft(), mates.getRight())) { + writer.addRead(mates.getLeft()); + writer.addRead(mates.getRight()); + totalOutputPair += 1; + + } + } + + // Supplementary reads are not handled i.e. removed + } + } + + /** + * + * Reorder the secondary alignments as described above. + * @param secondaryAlignments may be empty. + * @return a list of pair of matching reads (i.e. read1 with the corresponding read2) + */ + private List> groupSecondaryReads(final List secondaryAlignments){ + if (secondaryAlignments.isEmpty()){ + return Collections.emptyList(); + } + + final boolean isRead1 = true; + final Map> groupedByRead1 = + secondaryAlignments.stream().collect(Collectors.groupingBy(r -> r.isFirstOfPair())); + final List read1Reads = groupedByRead1.get(isRead1); + final List read2Reads = groupedByRead1.get(!isRead1); + if(read1Reads.size() != read2Reads.size()){ + logger.warn("Num read1s != num read2s among the secondary alignments; " + + secondaryAlignments.get(0).getName()); + return Collections.emptyList(); + } + + + final List> result = new ArrayList<>(read1Reads.size()); + for (GATKRead read1 : read1Reads){ + final List read2s = read2Reads.stream() + .filter(r -> r.contigsMatch(read1) + && r.getStart() == read1.getMateStart() + && r.getMateStart() == read1.getStart()) + .collect(Collectors.toList()); + if (read2s.size() == 1){ + result.add(new ImmutablePair<>(read1, read2s.get(0))); + } else if (read2s.size() > 1){ + logger.warn("Multiple mates found for the secondary alignment " + read1.getName()); + } else { + logger.warn("Mate not found for the secondary alignment " + read1.getName()); + } + } + return result; + } + + @Override + public Object onTraversalSuccess(){ + // Write out the last set of reads + logger.info("Total read pairs output: " + totalOutputPair); + logger.info("Read pairs filtered due to unmapped mate: " + notBothMapped); + logger.info("Read pairs filtered due to chimeric alignment: " + chimera); + logger.info("Read pairs filtered due to a cigar element not supported by RSEM: " + unsupportedCigar); + return "SUCCESS"; + } + + @Override + public void closeTool(){ + if (writer != null){ + writer.close(); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java new file mode 100644 index 00000000000..87d991cd253 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java @@ -0,0 +1,105 @@ +package org.broadinstitute.hellbender.tools.walkers.qc; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.read.GATKRead; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Data structure that contains the set of reads sharing the same queryname, including + * the primary, secondary (i.e. multi-mapping), and supplementary (e.g. chimeric) alignments. + * + */ +public class ReadPair { + private GATKRead firstOfPair; + private GATKRead secondOfPair; + private final List secondaryAlignments = new ArrayList<>(); + private final List supplementaryAlignments = new ArrayList<>(); + private final String queryName; + + public ReadPair(final GATKRead read) { + this.queryName = read.getName(); + add(read); + } + + public int numberOfAlignments(){ + int num = 0; + num = firstOfPair != null ? num + 1 : num; + num = secondOfPair != null ? num + 1 : num; + num += secondaryAlignments.size(); + num += supplementaryAlignments.size(); + return num; + } + + public String getName() { + return queryName; + } + + public GATKRead getFirstOfPair(){ + return firstOfPair; + } + + public GATKRead getSecondOfPair(){ + return secondOfPair; + } + + public List getSecondaryAlignments() { + return secondaryAlignments; + } + + public void add(final GATKRead read) { + if (! this.queryName.equals(read.getName())){ + throw new UserException("Read names do not match: " + this.queryName + " vs " + read.getName()); + } + + if (isPrimaryAlignment(read) && read.isFirstOfPair()) { + Utils.validate(this.firstOfPair == null, + "The primary firstOfPair is already set. Read = " + read.getName()); + this.firstOfPair = read; + } else if (isPrimaryAlignment(read) && read.isSecondOfPair()) { + Utils.validate(this.secondOfPair == null, + "The primary secondOfPair is already set. Read = " + read.getName()); + this.secondOfPair = read; + } else if (read.isSecondaryAlignment()) { + this.secondaryAlignments.add(read); + } else if (read.isSupplementaryAlignment()) { + this.supplementaryAlignments.add(read); + } else { + throw new UserException("Unknown read type: " + read.getContig()); + } + } + + private boolean isPrimaryAlignment(final GATKRead read){ + return ! read.isSecondaryAlignment() && ! read.isSupplementaryAlignment(); + } + + public boolean isDuplicateMarked() { + if (firstOfPair.isDuplicate()) { + // Make sure the rest is duplicate-marked + if (!secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> !r.isDuplicate())) { + throw new UserException("First of pair a duplicate but the rest is not" + secondOfPair.getName()); + } + } else { + // Make sure the rest is not duplicate-marked + if (secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> r.isDuplicate())) { + throw new UserException("First of pair a not duplicate but the rest is " + secondOfPair.getName()); + } + } + + return firstOfPair.isDuplicate(); + } + + public List getReads(final boolean onlyPrimaryAlignments){ + List reads = Arrays.asList(firstOfPair, secondOfPair); + if (onlyPrimaryAlignments){ + return(reads); + } else { + reads.addAll(secondaryAlignments); + reads.addAll(supplementaryAlignments); + return(reads); + } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java new file mode 100644 index 00000000000..48fafe2e988 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java @@ -0,0 +1,59 @@ +package org.broadinstitute.hellbender.tools.walkers.qc; + +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.engine.ReadsPathDataSource; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class PostProcessReadsForRSEMIntegrationTest extends CommandLineProgramTest { + @Test + public void test(){ + final ArgumentsBuilder args = new ArgumentsBuilder(); + final String testTranscriptomeSam = publicTestDir + "transcriptome_query_sorted_abbr_header.bam"; + final File output = createTempFile("output", "bam"); + + args.addInput(testTranscriptomeSam); + args.addOutput(output.getAbsolutePath()); + args.add("XL", publicTestDir + "gencode_v19_MT_transcriptome_abbr_header.interval_list"); + runCommandLine(args, PostProcessReadsForRSEM.class.getSimpleName()); + + final ReadsPathDataSource outputReadSource = new ReadsPathDataSource(output.toPath()); + final Iterator outputSamIterator = outputReadSource.iterator(); + + final List inputReadNames = Arrays.asList("SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:11804", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:27367", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:30906", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:36761", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:9079", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10276:1689"); // aligns to a transcript in MT + final int[] expectedReadPairCount = new int[] { 4, 5, 2, 1, 7, 0 }; + String currentReadName = inputReadNames.get(0); + int i = 0; + int j = 0; + int count = 0; + + while (outputSamIterator.hasNext()){ + final GATKRead outputRead = outputSamIterator.next(); + // Check that read1 and read2 alternate + Assert.assertTrue(i % 2 == 0 ? outputRead.isFirstOfPair() : ! outputRead.isFirstOfPair()); + if (! outputRead.getName().equals(currentReadName)){ + Assert.assertEquals(count, 2*expectedReadPairCount[j++]); + currentReadName = outputRead.getName(); + count = 1; + i++; + continue; + } + + count++; + i++; + } + + } +} \ No newline at end of file diff --git a/src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list b/src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list new file mode 100644 index 00000000000..521939b0255 --- /dev/null +++ b/src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list @@ -0,0 +1,94 @@ +@HD VN:1.6 SO:queryname +@SQ SN:ENST00000375267.2 LN:15945 +@SQ SN:ENST00000375217.2 LN:15861 +@SQ SN:ENST00000375226.2 LN:15810 +@SQ SN:ENST00000375254.3 LN:15906 +@SQ SN:ENST00000375218.3 LN:4031 +@SQ SN:ENST00000321358.7 LN:1514 +@SQ SN:ENST00000436427.1 LN:1524 +@SQ SN:ENST00000361813.5 LN:4559 +@SQ SN:ENST00000422945.2 LN:4481 +@SQ SN:ENST00000328724.5 LN:4045 +@SQ SN:ENST00000557268.1 LN:2013 +@SQ SN:ENST00000350249.3 LN:3845 +@SQ SN:ENST00000445439.3 LN:1526 +@SQ SN:ENST00000334743.5 LN:4328 +@SQ SN:ENST00000557095.1 LN:1307 +@SQ SN:ENST00000415930.3 LN:4000 +@SQ SN:ENST00000588991.2 LN:2021 +@SQ SN:ENST00000356487.5 LN:2210 +@SQ SN:ENST00000586392.1 LN:1706 +@SQ SN:ENST00000387314.1 LN:71 +@SQ SN:ENST00000389680.2 LN:954 +@SQ SN:ENST00000387342.1 LN:69 +@SQ SN:ENST00000387347.2 LN:1559 +@SQ SN:ENST00000386347.1 LN:75 +@SQ SN:ENST00000361390.2 LN:956 +@SQ SN:ENST00000387365.1 LN:69 +@SQ SN:ENST00000387372.1 LN:72 +@SQ SN:ENST00000387377.1 LN:68 +@SQ SN:ENST00000361453.3 LN:1042 +@SQ SN:ENST00000387382.1 LN:68 +@SQ SN:ENST00000387392.1 LN:69 +@SQ SN:ENST00000387400.1 LN:73 +@SQ SN:ENST00000387405.1 LN:66 +@SQ SN:ENST00000387409.1 LN:66 +@SQ SN:ENST00000361624.2 LN:1542 +@SQ SN:ENST00000387416.2 LN:69 +@SQ SN:ENST00000387419.1 LN:68 +@SQ SN:ENST00000361739.1 LN:684 +@SQ SN:ENST00000387421.1 LN:70 +@SQ SN:ENST00000361851.1 LN:207 +@SQ SN:ENST00000361899.2 LN:681 +@SQ SN:ENST00000362079.2 LN:784 +@SQ SN:ENST00000387429.1 LN:68 +@SQ SN:ENST00000361227.2 LN:346 +@SQ SN:ENST00000387439.1 LN:65 +@SQ SN:ENST00000361335.1 LN:297 +@SQ SN:ENST00000361381.2 LN:1378 +@SQ SN:ENST00000387441.1 LN:69 +@SQ SN:ENST00000387449.1 LN:59 +@SQ SN:ENST00000387456.1 LN:71 +@SQ SN:ENST00000361567.2 LN:1812 +@SQ SN:ENST00000361681.2 LN:525 +@SQ SN:ENST00000387459.1 LN:69 +@SQ SN:ENST00000361789.2 LN:1141 +@SQ SN:ENST00000387460.2 LN:66 +@SQ SN:ENST00000387461.2 LN:68 +ENST00000387314.1 1 71 + . +ENST00000389680.2 1 954 + . +ENST00000387342.1 1 69 + . +ENST00000387347.2 1 1559 + . +ENST00000386347.1 1 75 + . +ENST00000361390.2 1 956 + . +ENST00000387365.1 1 69 + . +ENST00000387372.1 1 72 + . +ENST00000387377.1 1 68 + . +ENST00000361453.3 1 1042 + . +ENST00000387382.1 1 68 + . +ENST00000387392.1 1 69 + . +ENST00000387400.1 1 73 + . +ENST00000387405.1 1 66 + . +ENST00000387409.1 1 66 + . +ENST00000361624.2 1 1542 + . +ENST00000387416.2 1 69 + . +ENST00000387419.1 1 68 + . +ENST00000361739.1 1 684 + . +ENST00000387421.1 1 70 + . +ENST00000361851.1 1 207 + . +ENST00000361899.2 1 681 + . +ENST00000362079.2 1 784 + . +ENST00000387429.1 1 68 + . +ENST00000361227.2 1 346 + . +ENST00000387439.1 1 65 + . +ENST00000361335.1 1 297 + . +ENST00000361381.2 1 1378 + . +ENST00000387441.1 1 69 + . +ENST00000387449.1 1 59 + . +ENST00000387456.1 1 71 + . +ENST00000361567.2 1 1812 + . +ENST00000361681.2 1 525 + . +ENST00000387459.1 1 69 + . +ENST00000361789.2 1 1141 + . +ENST00000387460.2 1 66 + . +ENST00000387461.2 1 68 + . diff --git a/src/test/resources/transcriptome_query_sorted_abbr_header.bam b/src/test/resources/transcriptome_query_sorted_abbr_header.bam new file mode 100644 index 00000000000..ac50e1cfb9b Binary files /dev/null and b/src/test/resources/transcriptome_query_sorted_abbr_header.bam differ