diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java index bad4c131d79..6918789d45d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java @@ -22,6 +22,7 @@ import org.broadinstitute.hellbender.utils.read.markduplicates.*; import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*; import picard.sam.markduplicates.util.OpticalDuplicateFinder; +import picard.sam.markduplicates.util.ReadEnds; import scala.Tuple2; import java.io.Serializable; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/EstimateLibraryComplexityGATK.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/EstimateLibraryComplexityGATK.java deleted file mode 100644 index 1e1b35f50b4..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/EstimateLibraryComplexityGATK.java +++ /dev/null @@ -1,558 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.markduplicates; - -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SamReader; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.metrics.MetricsFile; -import htsjdk.samtools.util.CloserUtil; -import htsjdk.samtools.util.Histogram; -import htsjdk.samtools.util.IOUtil; -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.util.SequenceUtil; -import htsjdk.samtools.util.SortingCollection; -import htsjdk.samtools.util.StringUtil; -import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.BetaFeature; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.utils.io.IOUtils; -import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup; -import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.utils.read.markduplicates.AbstractOpticalDuplicateFinderCommandLineProgram; -import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics; -import picard.sam.markduplicates.util.OpticalDuplicateFinder; -import org.broadinstitute.hellbender.utils.runtime.ProgressLogger; -import picard.sam.util.PhysicalLocation; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.EOFException; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.Serializable; -import java.util.*; - -import static java.lang.Math.pow; - -/** - * Estimate library complexity from the sequence of read pairs - * - *

- * The estimation is done by sorting all reads by the first N bases (defined by --min-identical-bases with default of 5) - * of each read and then comparing reads with the first N bases identical to each other for duplicates. - * Reads are considered to be duplicates if they match each other with no gaps and an overall mismatch - * rate less than or equal to MAX_DIFF_RATE (0.03 by default). The approach differs from that taken by - * Picard MarkDuplicates to estimate library complexity in that here alignment is not a factor. - *

- * - *

- * Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering - * removes reads with any no-calls in the first N bases or with a mean base quality lower than - * MIN_MEAN_QUALITY across either the first or second read. Unpaired reads are ignored in this computation. - *

- * - *

- * The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes - * these in the calculation of library size. Also, since there is no alignment to screen out technical - * reads one further filter is applied on the data. After examining all reads a Histogram is built of - * [#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are - * then removed from the Histogram as outliers before library size is estimated. - *

- * - *

Input

- * - * - *

Output

- * - * - *

Usage Example

- *
- *   gatk EstimateLibraryComplexityGATK \
- *     -I input.bam \
- *     -O metrics.txt
- * 
- * - * @author Tim Fennell - */ -@BetaFeature -@DocumentedFeature -@CommandLineProgramProperties( - summary = "Estimate library complexity from the sequence of read pairs", - oneLineSummary = "Estimate library complexity from the sequence of read pairs", - programGroup = DiagnosticsAndQCProgramGroup.class -) -public final class EstimateLibraryComplexityGATK extends AbstractOpticalDuplicateFinderCommandLineProgram { - - @Argument( - fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, - doc = "One or more files to combine and " + - "estimate library complexity from. Reads can be mapped or unmapped." - ) - public List INPUT; - - @Argument( - fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, - doc = "Output file to writes per-library metrics to." - ) - public File OUTPUT; - - @Argument( - fullName = "min-identical-bases", - doc = "The minimum number of bases at the starts of reads that must be identical for reads to " + - "be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will " + - "be compared at a time, so lower numbers will produce more accurate results but consume " + - "exponentially more memory and CPU." - ) - public int MIN_IDENTICAL_BASES = 5; - - @Argument( - fullName = "max-diff-rate", - doc = "The maximum rate of differences between two reads to call them identical." - ) - public double MAX_DIFF_RATE = 0.03; - - @Argument( - fullName = "min-mean-quality", - doc = "The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with " + - "lower average quality are filtered out and not considered in any calculations." - ) - public int MIN_MEAN_QUALITY = 20; - - @Argument( - fullName = "max-group-ratio", - doc = "Do not process self-similar groups that are this many times over the mean expected group size. " + - "I.e. if the input contains 10m read pairs and MIN_IDENTICAL_BASES is set to 5, then the mean expected " + - "group size would be approximately 10 reads." - ) - public int MAX_GROUP_RATIO = 500; - - /** - * Little class to hold the sequence of a pair of reads and tile location information. - */ - static class PairedReadSequence implements PhysicalLocation { - static int size_in_bytes = 2 + 1 + 4 + 1 + 300; // rough guess at memory footprint - short readGroup = -1; - short tile = -1; - int x = -1, y = -1; - boolean qualityOk = true; - byte[] read1; - byte[] read2; - short libraryId; - - @Override - public short getReadGroup() { return this.readGroup; } - - @Override - public void setReadGroup(final short readGroup) { this.readGroup = readGroup; } - - @Override - public short getTile() { return this.tile; } - - @Override - public void setTile(final short tile) { this.tile = tile; } - - @Override - public int getX() { return this.x; } - - @Override - public void setX(final int x) { this.x = x; } - - @Override - public int getY() { return this.y; } - - @Override - public void setY(final int y) { this.y = y; } - - @Override - public short getLibraryId() { return this.libraryId; } - - @Override - public void setLibraryId(final short libraryId) { this.libraryId = libraryId; } - } - - /** - * Codec class for writing and read PairedReadSequence objects. - */ - static class PairedReadCodec implements SortingCollection.Codec { - private DataOutputStream out; - private DataInputStream in; - - @Override - public void setOutputStream(final OutputStream out) { - this.out = new DataOutputStream(out); - } - - @Override - public void setInputStream(final InputStream in) { - this.in = new DataInputStream(in); - } - - @Override - public void encode(final PairedReadSequence val) { - try { - this.out.writeShort(val.readGroup); - this.out.writeShort(val.tile); - this.out.writeShort(val.x); - this.out.writeShort(val.y); - this.out.writeInt(val.read1.length); - this.out.write(val.read1); - this.out.writeInt(val.read2.length); - this.out.write(val.read2); - } catch (IOException ioe) { - throw new GATKException("Error write out read pair.", ioe); - } - } - - @Override - public PairedReadSequence decode() { - try { - final PairedReadSequence val = new PairedReadSequence(); - try { - val.readGroup = this.in.readShort(); - } catch (EOFException eof) { - return null; - } - - val.tile = this.in.readShort(); - val.x = this.in.readShort(); - val.y = this.in.readShort(); - - int length = this.in.readInt(); - val.read1 = new byte[length]; - if (this.in.read(val.read1) != length) { - throw new GATKException("Could not read " + length + " bytes from temporary file."); - } - - length = this.in.readInt(); - val.read2 = new byte[length]; - if (this.in.read(val.read2) != length) { - throw new GATKException("Could not read " + length + " bytes from temporary file."); - } - - return val; - } catch (IOException ioe) { - throw new GATKException("Exception reading read pair.", ioe); - } - } - - /** - * For an explanation of why codecs must implement clone(), - * see the HTSJDK documentation for {#link htsjdk.samtools.util.SortingCollection.Codec}. - */ - @Override - public PairedReadCodec clone() { return new PairedReadCodec(); } - } - - /** - * Comparator that orders read pairs on the first N bases of both reads. - */ - class PairedReadComparator implements Comparator, Serializable { - private static final long serialVersionUID = 7452449563074722818L; - - final int BASES = EstimateLibraryComplexityGATK.this.MIN_IDENTICAL_BASES; - - @Override - public int compare(final PairedReadSequence lhs, final PairedReadSequence rhs) { - // First compare the first N bases of the first read - for (int i = 0; i < BASES; ++i) { - final int retval = lhs.read1[i] - rhs.read1[i]; - if (retval != 0) return retval; - } - - // Then compare the first N bases of the second read - for (int i = 0; i < BASES; ++i) { - final int retval = lhs.read2[i] - rhs.read2[i]; - if (retval != 0) return retval; - } - - return System.identityHashCode(lhs) - System.identityHashCode(rhs); - } - } - - public EstimateLibraryComplexityGATK() { - MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / PairedReadSequence.size_in_bytes) / 2; - } - - /** - * Method that does most of the work. Reads through the input BAM file and extracts the - * read sequences of each read pair and sorts them via a SortingCollection. Then traverses - * the sorted reads and looks at small groups at a time to find duplicates. - */ - @Override - protected Object doWork() { - for (final File f : INPUT) IOUtil.assertFileIsReadable(f); - - logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting."); - - final List readGroups = new ArrayList<>(); - final int recordsRead = 0; - final SortingCollection sorter = SortingCollection.newInstanceFromPaths(PairedReadSequence.class, - new PairedReadCodec(), - new PairedReadComparator(), - MAX_RECORDS_IN_RAM, - Collections.singletonList(IOUtils.getPath(tmpDir))); - - // Loop through the input files and pick out the read sequences etc. - final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read"); - for (final File f : INPUT) { - final Map pendingByName = new HashMap<>(); - final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f); - readGroups.addAll(in.getFileHeader().getReadGroups()); - - for (final SAMRecord rec : in) { - if (!rec.getReadPairedFlag()) continue; - if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) { - continue; - } - - PairedReadSequence prs = pendingByName.remove(rec.getReadName()); - if (prs == null) { - // Make a new paired read object and add RG and physical location information to it - prs = new PairedReadSequence(); - if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) { - final SAMReadGroupRecord rg = rec.getReadGroup(); - if (rg != null) prs.setReadGroup((short) readGroups.indexOf(rg)); - } - - pendingByName.put(rec.getReadName(), prs); - } - - // Read passes quality check if both ends meet the mean quality criteria - final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), - rec.getBaseQualities(), - MIN_IDENTICAL_BASES, - MIN_MEAN_QUALITY); - prs.qualityOk = prs.qualityOk && passesQualityCheck; - - // Get the bases and restore them to their original orientation if necessary - final byte[] bases = rec.getReadBases(); - if (rec.getReadNegativeStrandFlag()) SequenceUtil.reverseComplement(bases); - - if (rec.getFirstOfPairFlag()) { - prs.read1 = bases; - } else { - prs.read2 = bases; - } - - if (prs.read1 != null && prs.read2 != null && prs.qualityOk) { - sorter.add(prs); - } - - progress.record(rec); - } - CloserUtil.close(in); - } - - logger.info("Finished reading - moving on to scanning for duplicates."); - - // Now go through the sorted reads and attempt to find duplicates - try (final PeekableIterator iterator = new PeekableIterator<>(sorter.iterator())) { - - final Map> duplicationHistosByLibrary = new HashMap<>(); - final Map> opticalHistosByLibrary = new HashMap<>(); - - int groupsProcessed = 0; - long lastLogTime = System.currentTimeMillis(); - final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2)); - - while (iterator.hasNext()) { - // Get the next group and split it apart by library - final List group = getNextGroup(iterator); - - if (group.size() > meanGroupSize * MAX_GROUP_RATIO) { - final PairedReadSequence prs = group.get(0); - logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + - "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + - StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + - " / " + - StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES)); - } else { - final Map> sequencesByLibrary = splitByLibrary(group, readGroups); - - // Now process the reads by library - for (final Map.Entry> entry : sequencesByLibrary.entrySet()) { - final String library = entry.getKey(); - final List seqs = entry.getValue(); - - Histogram duplicationHisto = duplicationHistosByLibrary.get(library); - Histogram opticalHisto = opticalHistosByLibrary.get(library); - if (duplicationHisto == null) { - duplicationHisto = new Histogram<>("duplication_group_count", library); - opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates"); - duplicationHistosByLibrary.put(library, duplicationHisto); - opticalHistosByLibrary.put(library, opticalHisto); - } - - // Figure out if any reads within this group are duplicates of one another - for (int i = 0; i < seqs.size(); ++i) { - final PairedReadSequence lhs = seqs.get(i); - if (lhs == null) continue; - final List dupes = new ArrayList<>(); - - for (int j = i + 1; j < seqs.size(); ++j) { - final PairedReadSequence rhs = seqs.get(j); - if (rhs == null) continue; - - if (matches(lhs, rhs, MAX_DIFF_RATE)) { - dupes.add(rhs); - seqs.set(j, null); - } - } - - if (!dupes.isEmpty()) { - dupes.add(lhs); - final int duplicateCount = dupes.size(); - duplicationHisto.increment(duplicateCount); - - final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes, null); - for (final boolean b : flags) { - if (b) opticalHisto.increment(duplicateCount); - } - } else { - duplicationHisto.increment(1); - } - } - } - - ++groupsProcessed; - if (lastLogTime < System.currentTimeMillis() - 60000) { - logger.info("Processed " + groupsProcessed + " groups."); - lastLogTime = System.currentTimeMillis(); - } - } - } - sorter.cleanup(); - - final MetricsFile file = getMetricsFile(); - for (final String library : duplicationHistosByLibrary.keySet()) { - final Histogram duplicationHisto = duplicationHistosByLibrary.get(library); - final Histogram opticalHisto = opticalHistosByLibrary.get(library); - final GATKDuplicationMetrics metrics = new GATKDuplicationMetrics(); - metrics.LIBRARY = library; - - // Filter out any bins that have only a single entry in them and calcu - for (final Integer bin : duplicationHisto.keySet()) { - final double duplicateGroups = duplicationHisto.get(bin).getValue(); - final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue(); - - if (duplicateGroups > 1) { - metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups); - metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups); - metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates; - } - } - - metrics.calculateDerivedFields(); - file.addMetric(metrics); - file.addHistogram(duplicationHisto); - - } - - file.write(OUTPUT); - } - return null; - } - - /** - * Checks to see if two reads pairs have sequence that are the same, give or take a few - * errors/diffs as dictated by the maxDiffRate. - */ - private boolean matches(final PairedReadSequence lhs, final PairedReadSequence rhs, final double maxDiffRate) { - final int read1Length = Math.min(lhs.read1.length, rhs.read1.length); - final int read2Length = Math.min(lhs.read2.length, rhs.read2.length); - final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate); - int errors = 0; - - // The loop can start from MIN_IDENTICAL_BASES because we've already confirmed that - // at least those first few bases are identical when sorting. - for (int i = MIN_IDENTICAL_BASES; i < read1Length; ++i) { - if (lhs.read1[i] != rhs.read1[i]) { - if (++errors > maxErrors) return false; - } - } - - for (int i = MIN_IDENTICAL_BASES; i < read2Length; ++i) { - if (lhs.read2[i] != rhs.read2[i]) { - if (++errors > maxErrors) return false; - } - } - - return true; - } - - /** - * Pulls out of the iterator the next group of reads that can be compared to each other to - * identify duplicates. - */ - List getNextGroup(final PeekableIterator iterator) { - final List group = new ArrayList<>(); - final PairedReadSequence first = iterator.next(); - group.add(first); - - outer: - while (iterator.hasNext()) { - final PairedReadSequence next = iterator.peek(); - for (int i = 0; i < MIN_IDENTICAL_BASES; ++i) { - if (first.read1[i] != next.read1[i] || first.read2[i] != next.read2[i]) break outer; - } - - group.add(iterator.next()); - - } - - return group; - } - - /** - * Takes a list of PairedReadSequence objects and splits them into lists by library. - */ - Map> splitByLibrary(final List input, - final List rgs) { - - final Map> out = new HashMap<>(); - for (final PairedReadSequence seq : input) { - String library = null; - if (seq.getReadGroup() != -1) { - library = rgs.get(seq.getReadGroup()).getLibrary(); - if (library == null) library = "Unknown"; - } else { - library = "Unknown"; - } - - List librarySeqs = out.get(library); - if (librarySeqs == null) { - librarySeqs = new ArrayList<>(); - out.put(library, librarySeqs); - } - librarySeqs.add(seq); - } - - return out; - } - - /** - * Checks that the average quality over the entire read is >= min, and that the first N bases do - * not contain any no-calls. - */ - boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int seedLength, final int minQuality) { - if (bases.length < seedLength) return false; - - for (int i = 0; i < seedLength; ++i) { - if (SequenceUtil.isNoCall(bases[i])) return false; - } - - int total = 0; - for (final byte b : quals) total += b; - return total / quals.length >= minQuality; - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATK.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATK.java deleted file mode 100644 index c05285db0b4..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATK.java +++ /dev/null @@ -1,492 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.markduplicates; - -import com.google.common.annotations.VisibleForTesting; -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.IOUtil; -import htsjdk.samtools.util.SortingCollection; -import htsjdk.samtools.util.SortingLongCollection; -import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.ExperimentalFeature; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.utils.io.IOUtils; -import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; -import org.broadinstitute.hellbender.utils.read.ReadUtils; -import org.broadinstitute.hellbender.utils.read.markduplicates.*; -import org.broadinstitute.hellbender.utils.runtime.ProgressLogger; - -import java.io.Serializable; -import java.util.*; - -/** - * A better duplication marking algorithm that handles all cases including clipped - * and gapped alignments. - * - * @author Tim Fennell - */ -@ExperimentalFeature -@DocumentedFeature -@CommandLineProgramProperties( - summary = "(Experimental) Examines aligned records in the supplied SAM/BAM/CRAM file to locate duplicate molecules. " + - "All records are then written to the output file with the duplicate records flagged.", - oneLineSummary = "Examines aligned records in the supplied SAM/BAM/CRAM file to locate duplicate molecules.", - programGroup = ReadDataManipulationProgramGroup.class -) -public final class MarkDuplicatesGATK extends AbstractMarkDuplicatesCommandLineProgram { - /** - * If more than this many sequences in SAM file, don't spill to disk because there will not - * be enough file handles. - */ - - @Argument(shortName = "MAX_SEQS", - doc = "This option is obsolete. ReadEnds will always be spilled to disk.") - public int MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = 50000; - - @Argument(shortName = "MAX_FILE_HANDLES", - doc = "Maximum number of file handles to keep open when spilling read ends to disk. " + - "Set this number a little lower than the per-process maximum number of file that may be open. " + - "This number can be found by executing the 'ulimit -n' command on a Unix system.") - public int MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = 8000; - - @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by " + - "some of the sorting collections. If you are running out of memory, try reducing this number.") - public double SORTING_COLLECTION_SIZE_RATIO = 0.25; - - @Argument(doc = "Report Memory Stats at various times during the run") - public boolean reportMemoryStats = false; - - - private SortingCollection pairSort; - private SortingCollection fragSort; - private SortingLongCollection duplicateIndexes; - private int numDuplicateIndices = 0; - - private LibraryIdGenerator libraryIdGenerator = null; // this is initialized in buildSortedReadEndLists - - public MarkDuplicatesGATK() { - DUPLICATE_SCORING_STRATEGY = DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES; - } - - /** - * Main work method. Reads the BAM file once and collects sorted information about - * the 5' ends of both ends of each read (or just one end in the case of pairs). - * Then makes a pass through those determining duplicates before re-reading the - * input file and writing it out with duplication flags set correctly. - */ - @Override - protected Object doWork() { - IOUtil.assertFilesAreReadable(INPUT); - IOUtil.assertFileIsWritable(OUTPUT); - IOUtil.assertFileIsWritable(METRICS_FILE); - - reportMemoryStats("Start of doWork"); - logger.info("Reading input file and constructing read end information."); - buildSortedReadEndLists(); - reportMemoryStats("After buildSortedReadEndLists"); - generateDuplicateIndexes(); - reportMemoryStats("After generateDuplicateIndexes"); - logger.info("Marking " + this.numDuplicateIndices + " records as duplicates."); - - if (this.opticalDuplicatesArgumentCollection.READ_NAME_REGEX == null) { - logger.warn("Skipped optical duplicate cluster discovery; library size estimation may be inaccurate!"); - } else { - logger.info("Found " + (this.libraryIdGenerator.getNumberOfOpticalDuplicateClusters()) + " optical duplicate clusters."); - } - - try( final SamHeaderAndIterator headerAndIterator = openInputs()) { - final SAMFileHeader header = headerAndIterator.header; - - final SAMFileHeader outputHeader = ReadUtils.cloneSAMFileHeader(header); - outputHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - for (final String comment : COMMENT) outputHeader.addComment(comment); - - // Key: previous PG ID on a SAM Record (or null). Value: New PG ID to replace it. - final Map chainedPgIds = getChainedPgIds(outputHeader); - - try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outputHeader, true)) { - - // Now copy over the file while marking all the necessary indexes as duplicates - long recordInFileIndex = 0; - long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : -1); - - final ProgressLogger progress = new ProgressLogger(logger, (int) 1e7, "Written"); - try (final CloseableIterator iterator = headerAndIterator.iterator) { - while (iterator.hasNext()) { - final SAMRecord rec = iterator.next(); - final String library = LibraryIdGenerator.getLibraryName(header, rec); - GATKDuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library); - if (metrics == null) { - metrics = new GATKDuplicationMetrics(); - metrics.LIBRARY = library; - libraryIdGenerator.addMetricsByLibrary(library, metrics); - } - - if (recordInFileIndex == nextDuplicateIndex) { - rec.setDuplicateReadFlag(true); - // Now try and figure out the next duplicate index - if (this.duplicateIndexes.hasNext()) { - nextDuplicateIndex = this.duplicateIndexes.next(); - } else { - // Only happens once we've marked all the duplicates - nextDuplicateIndex = -1; - } - } else { - rec.setDuplicateReadFlag(false); - } - - metrics.updateMetrics(rec); - - recordInFileIndex++; - - if (!this.REMOVE_DUPLICATES || !rec.getDuplicateReadFlag()) { - if (PROGRAM_RECORD_ID != null) { - rec.setAttribute(SAMTag.PG.name(), chainedPgIds.get(rec.getStringAttribute(SAMTag.PG.name()))); - } - out.addAlignment(rec); - progress.record(rec); - } - } - } - this.duplicateIndexes.cleanup(); - - reportMemoryStats("Before output close"); - } - reportMemoryStats("After output close"); - } - - // Write out the metrics - finalizeAndWriteMetrics(libraryIdGenerator); - - return null; - } - - @VisibleForTesting - long numOpticalDuplicates() { return ((long) this.libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap().getSumOfValues()); } // cast as long due to returning a double - - /** Print out some quick JVM memory stats. */ - private void reportMemoryStats(final String stage) { - if(reportMemoryStats) { - System.gc(); - final Runtime runtime = Runtime.getRuntime(); - logger.info(stage + " freeMemory: " + runtime.freeMemory() + "; totalMemory: " + runtime.totalMemory() + - "; maxMemory: " + runtime.maxMemory()); - } - } - - /** - * Goes through all the records in a file and generates a set of ReadEndsForMarkDuplicates objects that - * hold the necessary information (reference sequence, 5' read coordinate) to do - * duplication, caching to disk as necessary to sort them. - */ - private void buildSortedReadEndLists() { - final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / ReadEndsForMarkDuplicates.SIZE_OF); - logger.info("Will retain up to " + maxInMemory + " data points before spilling to disk."); - - this.pairSort = SortingCollection.newInstanceFromPaths(ReadEndsForMarkDuplicates.class, - new ReadEndsForMarkDuplicatesCodec(), - new ReadEndsMDComparator(), - maxInMemory, - Collections.singletonList(IOUtils.getPath(tmpDir))); - - this.fragSort = SortingCollection.newInstanceFromPaths(ReadEndsForMarkDuplicates.class, - new ReadEndsForMarkDuplicatesCodec(), - new ReadEndsMDComparator(), - maxInMemory, - Collections.singletonList(IOUtils.getPath(tmpDir))); - - try(final SamHeaderAndIterator headerAndIterator = openInputs()) { - final SAMFileHeader header = headerAndIterator.header; - final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP); - long index = 0; - final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read"); - final CloseableIterator iterator = headerAndIterator.iterator; - - if (null == this.libraryIdGenerator) { - this.libraryIdGenerator = new LibraryIdGenerator(header); - } - - while (iterator.hasNext()) { - final SAMRecord rec = iterator.next(); - - // This doesn't have anything to do with building sorted ReadEnd lists, but it can be done in the same pass - // over the input - if (PROGRAM_RECORD_ID != null) { - // Gather all PG IDs seen in merged input files in first pass. These are gathered for two reasons: - // - to know how many different PG records to create to represent this program invocation. - // - to know what PG IDs are already used to avoid collisions when creating new ones. - // Note that if there are one or more records that do not have a PG tag, then a null value - // will be stored in this set. - pgIdsSeen.add(rec.getStringAttribute(SAMTag.PG.name())); - } - - if (rec.getReadUnmappedFlag()) { - if (rec.getReferenceIndex() == -1) { - // When we hit the unmapped reads with no coordinate, no reason to continue. - break; - } - // If this read is unmapped but sorted with the mapped reads, just skip it. - } else if (!rec.isSecondaryOrSupplementary()) { - final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, index, rec); - this.fragSort.add(fragmentEnd); - - if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { - final String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName(); - ReadEndsForMarkDuplicates pairedEnds = tmp.remove(rec.getReferenceIndex(), key); - - // See if we've already seen the first end or not - if (pairedEnds == null) { - pairedEnds = buildReadEnds(header, index, rec); - tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds); - } else { - final int sequence = fragmentEnd.read1ReferenceIndex; - final int coordinate = fragmentEnd.read1Coordinate; - - // Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this - // before updating the orientation later. - if (rec.getFirstOfPairFlag()) { - pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R); - } else { - pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag()); - } - - // If the second read is actually later, just add the second read data, else flip the reads - if (sequence > pairedEnds.read1ReferenceIndex || - (sequence == pairedEnds.read1ReferenceIndex && coordinate >= pairedEnds.read1Coordinate)) { - pairedEnds.read2ReferenceIndex = sequence; - pairedEnds.read2Coordinate = coordinate; - pairedEnds.read2IndexInFile = index; - pairedEnds.orientation = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, - rec.getReadNegativeStrandFlag()); - } else { - pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex; - pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; - pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; - pairedEnds.read1ReferenceIndex = sequence; - pairedEnds.read1Coordinate = coordinate; - pairedEnds.read1IndexInFile = index; - pairedEnds.orientation = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), - pairedEnds.orientation == ReadEnds.R); - } - - pairedEnds.score += DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY); - this.pairSort.add(pairedEnds); - } - } - } - - // Print out some stats every 1m reads - ++index; - if (progress.record(rec)) { - logger.info("Tracking " + tmp.size() + " as yet unmatched pairs. " + tmp.sizeInRam() + " records in RAM."); - } - } - - logger.info("Read " + index + " records. " + tmp.size() + " pairs never matched."); - iterator.close(); - } - - // Tell these collections to free up memory if possible. - this.pairSort.doneAdding(); - this.fragSort.doneAdding(); - } - - /** Builds a read ends object that represents a single read. */ - private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec) { - final ReadEndsForMarkDuplicates ends = new ReadEndsForMarkDuplicates(); - ends.read1ReferenceIndex = rec.getReferenceIndex(); - ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart(); - ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F; - ends.read1IndexInFile = index; - ends.score = DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY); - - // Doing this lets the ends object know that it's part of a pair - if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { - ends.read2ReferenceIndex = rec.getMateReferenceIndex(); - } - - // Fill in the library ID - ends.libraryId = libraryIdGenerator.getLibraryId(rec); - - // Fill in the location information for optical duplicates - if (this.opticalDuplicateFinder.addLocationInformation(rec.getReadName(), ends)) { - // calculate the RG number (nth in list) - ends.readGroup = 0; - final String rg = (String) rec.getAttribute("RG"); - final List readGroups = header.getReadGroups(); - - if (rg != null && readGroups != null) { - for (final SAMReadGroupRecord readGroup : readGroups) { - if (readGroup.getReadGroupId().equals(rg)) break; - else ends.readGroup++; - } - } - } - - return ends; - } - - /** - * Goes through the accumulated ReadEndsForMarkDuplicates objects and determines which of them are - * to be marked as duplicates. - * - * @return an array with an ordered list of indexes into the source file - */ - private void generateDuplicateIndexes() { - // Keep this number from getting too large even if there is a huge heap. - final int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF, - (double) (Integer.MAX_VALUE - 5)); - logger.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk."); - this.duplicateIndexes = new SortingLongCollection(maxInMemory, IOUtils.getPath(tmpDir)); - - ReadEndsForMarkDuplicates firstOfNextChunk = null; - final List nextChunk = new ArrayList<>(200); - - // First just do the pairs - logger.info("Traversing read pair information and detecting duplicates."); - for (final ReadEndsForMarkDuplicates next : this.pairSort) { - if (firstOfNextChunk == null) { - firstOfNextChunk = next; - nextChunk.add(firstOfNextChunk); - } else if (areComparableForDuplicates(firstOfNextChunk, next, true)) { - nextChunk.add(next); - } else { - if (nextChunk.size() > 1) { - markDuplicatePairs(nextChunk); - } - - nextChunk.clear(); - nextChunk.add(next); - firstOfNextChunk = next; - } - } - if (nextChunk.size() > 1) markDuplicatePairs(nextChunk); - this.pairSort.cleanup(); - this.pairSort = null; - - // Now deal with the fragments - logger.info("Traversing fragment information and detecting duplicates."); - boolean containsPairs = false; - boolean containsFrags = false; - - for (final ReadEndsForMarkDuplicates next : this.fragSort) { - if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) { - nextChunk.add(next); - containsPairs = containsPairs || next.isPaired(); - containsFrags = containsFrags || !next.isPaired(); - } else { - if (nextChunk.size() > 1 && containsFrags) { - markDuplicateFragments(nextChunk, containsPairs); - } - - nextChunk.clear(); - nextChunk.add(next); - firstOfNextChunk = next; - containsPairs = next.isPaired(); - containsFrags = !next.isPaired(); - } - } - markDuplicateFragments(nextChunk, containsPairs); - this.fragSort.cleanup(); - this.fragSort = null; - - logger.info("Sorting list of duplicate records."); - this.duplicateIndexes.doneAddingStartIteration(); - } - - private static boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2) { - boolean retval = lhs.libraryId == rhs.libraryId && - lhs.read1ReferenceIndex == rhs.read1ReferenceIndex && - lhs.read1Coordinate == rhs.read1Coordinate && - lhs.orientation == rhs.orientation; - - if (retval && compareRead2) { - retval = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex && - lhs.read2Coordinate == rhs.read2Coordinate; - } - - return retval; - } - - private void addIndexAsDuplicate(final long bamIndex) { - this.duplicateIndexes.add(bamIndex); - ++this.numDuplicateIndices; - } - - /** - * Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should - * not be marked as duplicates. This assumes that the list contains objects representing pairs. - * - * @param list - */ - private void markDuplicatePairs(final List list) { - short maxScore = 0; - ReadEndsForMarkDuplicates best = null; - - /** All read ends should have orientation FF, FR, RF, or RR **/ - for (final ReadEndsForMarkDuplicates end : list) { - if (end.score > maxScore || best == null) { - maxScore = end.score; - best = end; - } - } - - for (final ReadEndsForMarkDuplicates end : list) { - if (end != best) { - addIndexAsDuplicate(end.read1IndexInFile); - addIndexAsDuplicate(end.read2IndexInFile); - } - } - - if (this.opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null) { - AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(list, best, opticalDuplicateFinder, libraryIdGenerator); - } - } - - /** - * Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should - * not be marked as duplicates. This will set the duplicate index for only list items are fragments. - * - * @param list - * @param containsPairs true if the list also contains objects containing pairs, false otherwise. - */ - private void markDuplicateFragments(final List list, final boolean containsPairs) { - if (containsPairs) { - for (final ReadEndsForMarkDuplicates end : list) { - if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile); - } - } else { - short maxScore = 0; - ReadEndsForMarkDuplicates best = null; - for (final ReadEndsForMarkDuplicates end : list) { - if (end.score > maxScore || best == null) { - maxScore = end.score; - best = end; - } - } - - for (final ReadEndsForMarkDuplicates end : list) { - if (end != best) { - addIndexAsDuplicate(end.read1IndexInFile); - } - } - } - } - - /** Comparator for ReadEndsForMarkDuplicates that orders by read1 position then pair orientation then read2 position. */ - static class ReadEndsMDComparator implements Comparator, Serializable { - private static final long serialVersionUID = 1L; - @Override - public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs) { - int retval = lhs.libraryId - rhs.libraryId; - if (retval == 0) retval = lhs.read1ReferenceIndex - rhs.read1ReferenceIndex; - if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate; - if (retval == 0) retval = lhs.orientation - rhs.orientation; - if (retval == 0) retval = lhs.read2ReferenceIndex - rhs.read2ReferenceIndex; - if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate; - if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile); - if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile); - return retval; - } - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractMarkDuplicatesCommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractMarkDuplicatesCommandLineProgram.java deleted file mode 100644 index 65192d3b6b1..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractMarkDuplicatesCommandLineProgram.java +++ /dev/null @@ -1,302 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import htsjdk.samtools.*; -import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; -import htsjdk.samtools.metrics.MetricsFile; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.CloserUtil; -import htsjdk.samtools.util.Histogram; -import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.Utils; -import picard.sam.markduplicates.util.OpticalDuplicateFinder; -import picard.sam.util.PhysicalLocation; - -import java.io.File; -import java.util.*; - -/** - * Abstract class that holds parameters and methods common to classes that perform duplicate - * detection and/or marking within SAM/BAM/CRAM files. - * - * @author Nils Homer - */ -public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractOpticalDuplicateFinderCommandLineProgram { - - @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, - doc = "One or more input SAM/BAM/CRAM files to analyze. Must be coordinate sorted.") - public List INPUT; - - @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, - doc = "The output file to write marked records to") - public File OUTPUT; - - @Argument(shortName = StandardArgumentDefinitions.METRICS_FILE_SHORT_NAME, - fullName = StandardArgumentDefinitions.METRICS_FILE_LONG_NAME, - doc = "File to write duplication metrics to") - public File METRICS_FILE; - - @Argument(shortName = StandardArgumentDefinitions.PROGRAM_RECORD_ID_SHORT_NAME, - doc = "The program record ID for the @PG record(s) created by this program. Set to null to disable " + - "PG record creation. This string may have a suffix appended to avoid collision with other " + - "program record IDs.", - optional = true) - public String PROGRAM_RECORD_ID = "MarkDuplicatesGATK"; - - @Argument(shortName = "PG_VERSION", - doc = "Value of VN tag of PG record to be created. If not specified, the version will be detected automatically.", - optional = true) - public String PROGRAM_GROUP_VERSION; - - @Argument(shortName = "PG_COMMAND", - doc = "Value of CL tag of PG record to be created. If not supplied the command line will be detected automatically.", - optional = true) - public String PROGRAM_GROUP_COMMAND_LINE; - - @Argument(shortName = "PG_NAME", - doc = "Value of PN tag of PG record to be created.") - public String PROGRAM_GROUP_NAME = getClass().getSimpleName(); - - @Argument(shortName = "CO", - doc = "Comment(s) to include in the output file's header.", - optional = true) - public List COMMENT = new ArrayList<>(); - - @Argument(doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.") - public boolean REMOVE_DUPLICATES = false; - - @Argument(shortName = StandardArgumentDefinitions.ASSUME_SORTED_SHORT_NAME, - doc = "If true, assume that the input file is coordinate sorted even if the header says otherwise.") - public boolean ASSUME_SORTED = false; - - @Argument(shortName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_SHORT_NAME, - fullName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME, - doc = "The scoring strategy for choosing the non-duplicate among candidates.") - public DuplicateScoringStrategy.ScoringStrategy DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH; - - /** The program groups that have been seen during the course of examining the input records. */ - protected final Set pgIdsSeen = new LinkedHashSet<>(); - - /** - * We have to re-chain the program groups based on this algorithm. This returns the map from existing program group ID - * to new program group ID. - */ - protected Map getChainedPgIds(final SAMFileHeader outputHeader) { - final Map chainedPgIds; - // Generate new PG record(s) - if (PROGRAM_RECORD_ID != null) { - final PgIdGenerator pgIdGenerator = new PgIdGenerator(outputHeader); - if (PROGRAM_GROUP_VERSION == null) { - PROGRAM_GROUP_VERSION = this.getVersion(); - } - if (PROGRAM_GROUP_COMMAND_LINE == null) { - PROGRAM_GROUP_COMMAND_LINE = this.getCommandLine(); - } - chainedPgIds = new LinkedHashMap<>(); - for (final String existingId : this.pgIdsSeen) { - final String newPgId = pgIdGenerator.getNonCollidingId(PROGRAM_RECORD_ID); - chainedPgIds.put(existingId, newPgId); - final SAMProgramRecord programRecord = new SAMProgramRecord(newPgId); - programRecord.setProgramVersion(PROGRAM_GROUP_VERSION); - programRecord.setCommandLine(PROGRAM_GROUP_COMMAND_LINE); - programRecord.setProgramName(PROGRAM_GROUP_NAME); - programRecord.setPreviousProgramGroupId(existingId); - outputHeader.addProgramRecord(programRecord); - } - } else { - chainedPgIds = null; - } - return chainedPgIds; - } - - /** - * Writes the metrics given by the libraryIdGenerator to the METRICS_FILE. - * - * @param libraryIdGenerator - */ - protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerator) { - //We want to sort libraries by name - final SortedMap metricsByLibrary = new TreeMap<>(Utils.COMPARE_STRINGS_NULLS_FIRST); - metricsByLibrary.putAll(libraryIdGenerator.getMetricsByLibraryMap()); - - final Histogram opticalDuplicatesByLibraryId = libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(); - final Map libraryIds = libraryIdGenerator.getLibraryIdsMap(); - - // Write out the metrics - final MetricsFile file = getMetricsFile(); - for (final Map.Entry entry : metricsByLibrary.entrySet()) { - final String libraryName = entry.getKey(); - final GATKDuplicationMetrics metrics = entry.getValue(); - - metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2; - metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2; - - // Add the optical dupes to the metrics - final Short libraryId = libraryIds.get(libraryName); - if (libraryId != null) { - @SuppressWarnings("unchecked")//type-checker being annoying here - final Histogram.Bin bin = opticalDuplicatesByLibraryId.get(libraryId); - if (bin != null) { - metrics.READ_PAIR_OPTICAL_DUPLICATES = (long) bin.getValue(); - } - } - metrics.calculateDerivedFields(); - file.addMetric(metrics); - } - - if (metricsByLibrary.size() == 1) { - file.setHistogram(metricsByLibrary.values().iterator().next().calculateRoiHistogram()); - } - - file.write(METRICS_FILE); - } - - /** Little class to generate program group IDs */ - static class PgIdGenerator { - private int recordCounter; - - private final Set idsThatAreAlreadyTaken = new LinkedHashSet<>(); - - PgIdGenerator(final SAMFileHeader header) { - for (final SAMProgramRecord pgRecord : header.getProgramRecords()) { - idsThatAreAlreadyTaken.add(pgRecord.getProgramGroupId()); - } - recordCounter = idsThatAreAlreadyTaken.size(); - } - - String getNonCollidingId(final String recordId) { - if (!idsThatAreAlreadyTaken.contains(recordId)) { - // don't remap 1st record. If there are more records - // with this id, they will be remapped in the 'else'. - idsThatAreAlreadyTaken.add(recordId); - ++recordCounter; - return recordId; - } else { - String newId; - // Below we tack on one of roughly 1.7 million possible 4 digit base36 at random. We do this because - // our old process of just counting from 0 upward and adding that to the previous id led to 1000s of - // calls idsThatAreAlreadyTaken.contains() just to resolve 1 collision when merging 1000s of similarly - // processed bams. - while (idsThatAreAlreadyTaken.contains(newId = recordId + "." + SamFileHeaderMerger.positiveFourDigitBase36Str(recordCounter++))) - ; - - idsThatAreAlreadyTaken.add(newId); - return newId; - } - - } - } - - /** Little class used to package up a header and an iterable/iterator. */ - public static final class SamHeaderAndIterator implements AutoCloseable{ - public final SAMFileHeader header; - public final CloseableIterator iterator; - private final List readers; - - public SamHeaderAndIterator(final SAMFileHeader header, final CloseableIterator iterator, final List readers) { - this.header = header; - this.iterator = iterator; - this.readers = readers; - } - - @Override - public void close(){ - CloserUtil.close(readers); - CloserUtil.close(iterator); - } - } - - /** - * Since this may read it's inputs more than once this method does all the opening - * and checking of the inputs. - */ - protected SamHeaderAndIterator openInputs() { - final List headers = new ArrayList<>(INPUT.size()); - final List readers = new ArrayList<>(INPUT.size()); - - for (final File f : INPUT) { - final SamReader reader = SamReaderFactory.makeDefault() - .enable(SamReaderFactory.Option.EAGERLY_DECODE) - .referenceSequence(REFERENCE_SEQUENCE) - .open(f); // eager decode - final SAMFileHeader header = reader.getFileHeader(); - - if (!ASSUME_SORTED && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - throw new UserException("Input file " + f.getAbsolutePath() + " is not coordinate sorted."); - } - - headers.add(header); - readers.add(reader); - } - - if (headers.size() == 1) { - return new SamHeaderAndIterator(headers.get(0), readers.get(0).iterator(), readers); - } else { - final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false); - final MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger, readers, ASSUME_SORTED); - return new SamHeaderAndIterator(headerMerger.getMergedHeader(), iterator, readers); - } - } - - /** - * Looks through the set of reads and identifies how many of the duplicates are - * in fact optical duplicates, and stores the data in the instance level histogram. - */ - public static void trackOpticalDuplicates(List ends, - final ReadEnds keeper, - final OpticalDuplicateFinder opticalDuplicateFinder, - final LibraryIdGenerator libraryIdGenerator) { - boolean hasFR = false, hasRF = false; - - // Check to see if we have a mixture of FR/RF - for (final ReadEnds end : ends) { - if (ReadEnds.FR == end.orientationForOpticalDuplicates) { - hasFR = true; - } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) { - hasRF = true; - } - } - - // Check if we need to partition since the orientations could have changed - if (hasFR && hasRF) { // need to track them independently - // Variables used for optical duplicate detection and tracking - final List trackOpticalDuplicatesF = new ArrayList<>(); - final List trackOpticalDuplicatesR = new ArrayList<>(); - - // Split into two lists: first of pairs and second of pairs, since they must have orientation and same starting end - for (final ReadEnds end : ends) { - if (ReadEnds.FR == end.orientationForOpticalDuplicates) { - trackOpticalDuplicatesF.add(end); - } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) { - trackOpticalDuplicatesR.add(end); - } else { - throw new UserException("Found an unexpected orientation: " + end.orientation); - } - } - - // track the duplicates - trackOpticalDuplicates(trackOpticalDuplicatesF, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(), keeper); - trackOpticalDuplicates(trackOpticalDuplicatesR, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(), keeper); - } else { // No need to partition - AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(ends, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(), keeper); - } - } - - /** - * Looks through the set of reads and identifies how many of the duplicates are - * in fact optical duplicates, and stores the data in the instance level histogram. - */ - private static void trackOpticalDuplicates(final List list, - final OpticalDuplicateFinder opticalDuplicateFinder, - final Histogram opticalDuplicatesByLibraryId, - final ReadEnds keeper) { - final boolean[] opticalDuplicateFlags = opticalDuplicateFinder.findOpticalDuplicates(list, keeper); - - int opticalDuplicates = 0; - for (final boolean b : opticalDuplicateFlags) if (b) ++opticalDuplicates; - if (opticalDuplicates > 0) { - opticalDuplicatesByLibraryId.increment(list.get(0).getLibraryId(), opticalDuplicates); - } - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractOpticalDuplicateFinderCommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractOpticalDuplicateFinderCommandLineProgram.java deleted file mode 100644 index 1bc28b4fd45..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractOpticalDuplicateFinderCommandLineProgram.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.LogManager; -import org.broadinstitute.barclay.argparser.ArgumentCollection; -import org.broadinstitute.hellbender.cmdline.PicardCommandLineProgram; -import org.broadinstitute.hellbender.cmdline.argumentcollections.OpticalDuplicatesArgumentCollection; -import picard.sam.markduplicates.util.OpticalDuplicateFinder; - -/** - * Abstract class that holds parameters and methods common to classes that optical duplicate detection. We put them here so that - * the explanation about how read names are parsed is in once place - * - * @author Tim Fennell - */ -public abstract class AbstractOpticalDuplicateFinderCommandLineProgram extends PicardCommandLineProgram { - protected static Logger LOG = LogManager.getLogger(AbstractOpticalDuplicateFinderCommandLineProgram.class); - - @ArgumentCollection - protected OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection(); - - // The tool with which to find optical duplicates - protected OpticalDuplicateFinder opticalDuplicateFinder = null; - - // Needed for testing - public void setupOpticalDuplicateFinder() { - this.opticalDuplicateFinder = new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, - opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE,null);//TODO logger firgure out, logger); - } - - @Override - protected String[] customCommandLineValidation() { - setupOpticalDuplicateFinder(); - return super.customCommandLineValidation(); - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/DiskBasedReadEndsForMarkDuplicatesMap.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/DiskBasedReadEndsForMarkDuplicatesMap.java deleted file mode 100644 index 7b1ab967a3e..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/DiskBasedReadEndsForMarkDuplicatesMap.java +++ /dev/null @@ -1,96 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import htsjdk.samtools.CoordinateSortedPairInfoMap; -import org.broadinstitute.hellbender.exceptions.GATKException; - - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.AbstractMap; -import java.util.Map; - -/** - * Disk-based implementation of ReadEndsForMarkDuplicatesMap. A subdirectory of the system tmpdir is created to store - * files, one for each reference sequence. The reference sequence that is currently being queried (i.e. the - * sequence for which remove() has been most recently called) is stored in RAM. ReadEnds for all other sequences - * are stored on disk. - *

- * When put() is called for a sequence that is the current one in RAM, the ReadEnds object is merely put into the - * in-memory map. If put() is called for a sequence ID that is not the current RAM one, the ReadEnds object is - * appended to the file for that sequence, creating the file if necessary. - *

- * When remove() is called for a sequence that is the current one in RAM, remove() is called on the in-memory map. - * If remove() is called for a sequence other than the current RAM sequence, then the current RAM sequence is written - * to disk, the new sequence is read from disk into RAM map, and the file for the new sequence is deleted. - *

- * If things work properly, and reads are processed in genomic order, records will be written for mates that are in - * a later sequence. When the mate is reached in the input SAM file, the file that was written will be deleted. - * This should result in all temporary files being deleted by the time all the reads are processed. The temp - * directory is marked to be deleted on exit so everything should get cleaned up. - * - * @author alecw@broadinstitute.org - */ -public final class DiskBasedReadEndsForMarkDuplicatesMap implements ReadEndsForMarkDuplicatesMap { - private final CoordinateSortedPairInfoMap pairInfoMap; - - public DiskBasedReadEndsForMarkDuplicatesMap(int maxOpenFiles) { - pairInfoMap = new CoordinateSortedPairInfoMap<>(maxOpenFiles, new Codec()); - } - - @Override - public ReadEndsForMarkDuplicates remove(int mateSequenceIndex, String key) { - return pairInfoMap.remove(mateSequenceIndex, key); - } - - @Override - public void put(int mateSequenceIndex, String key, ReadEndsForMarkDuplicates readEnds) { - pairInfoMap.put(mateSequenceIndex, key, readEnds); - } - - @Override - public int size() { - return pairInfoMap.size(); - } - - @Override - public int sizeInRam() { - return pairInfoMap.sizeInRam(); - } - - private static class Codec implements CoordinateSortedPairInfoMap.Codec { - private final ReadEndsForMarkDuplicatesCodec readEndsForMarkDuplicatesCodec = new ReadEndsForMarkDuplicatesCodec(); - - @Override - public void setInputStream(final InputStream is) { - readEndsForMarkDuplicatesCodec.setInputStream(is); - } - - @Override - public void setOutputStream(final OutputStream os) { - readEndsForMarkDuplicatesCodec.setOutputStream(os); - } - - @Override - public Map.Entry decode() { - try { - final String key = readEndsForMarkDuplicatesCodec.getInputStream().readUTF(); - final ReadEndsForMarkDuplicates record = readEndsForMarkDuplicatesCodec.decode(); - return new AbstractMap.SimpleEntry<>(key, record); - } catch (IOException e) { - throw new GATKException("Error loading ReadEndsForMarkDuplicatesMap from disk", e); - } - } - - @Override - public void encode(final String key, final ReadEndsForMarkDuplicates readEnds) { - try { - readEndsForMarkDuplicatesCodec.getOutputStream().writeUTF(key); - readEndsForMarkDuplicatesCodec.encode(readEnds); - } catch (IOException e) { - throw new GATKException("Error spilling ReadEndsForMarkDuplicatesMap to disk.", e); - } - } - } - -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEnds.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEnds.java deleted file mode 100644 index 9b0382d039c..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEnds.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import org.broadinstitute.hellbender.exceptions.GATKException; -import picard.sam.util.PhysicalLocation; - -/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */ -public abstract class ReadEnds implements PhysicalLocation { - - public static final byte F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5; - - public short libraryId; - public byte orientation; - public int read1ReferenceIndex = -1; - public int read1Coordinate = -1; - public int read2ReferenceIndex = -1; - public int read2Coordinate = -1; - - // Information used to detect optical dupes - public short readGroup = -1; - public short tile = -1; - public int x = -1, y = -1; - - /** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */ - public byte orientationForOpticalDuplicates = -1; - - - public boolean isPaired() { return this.read2ReferenceIndex != -1; } - - @Override - public short getReadGroup() { return this.readGroup; } - - @Override - public void setReadGroup(final short readGroup) { this.readGroup = readGroup; } - - @Override - public short getTile() { return this.tile; } - - @Override - public void setTile(final short tile) { this.tile = tile; } - - @Override - public int getX() { return this.x; } - - @Override - public void setX(final int x) { this.x = x; } - - @Override - public int getY() { return this.y; } - - @Override - public void setY(final int y) { this.y = y; } - - @Override - public short getLibraryId() { return this.libraryId; } - - @Override - public void setLibraryId(final short libraryId) { this.libraryId = libraryId; } - - /** - * Returns a single byte that encodes the orientation of the two reads in a pair. - */ - public static byte getOrientationByte(final boolean read1NegativeStrand, final boolean read2NegativeStrand) { - if (read1NegativeStrand) { - if (read2NegativeStrand) return ReadEnds.RR; - else return ReadEnds.RF; - } else { - if (read2NegativeStrand) return ReadEnds.FR; - else return ReadEnds.FF; - } - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicates.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicates.java deleted file mode 100644 index 3719d4307fc..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicates.java +++ /dev/null @@ -1,23 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -/** - * Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar - * - * @author Nils Homer - */ -public final class ReadEndsForMarkDuplicates extends ReadEnds { - /* - What do we need to store you ask? Well, we need to store: - - byte: orientation - - short: libraryId, readGroup, tile, x, y, score - - int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate - - long: read1IndexInFile, read2IndexInFile - */ - public static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1 - + 8 + // last 8 == reference overhead - 13; // This is determined experimentally with JProfiler - - public short score = 0; - public long read1IndexInFile = -1; - public long read2IndexInFile = -1; -} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesCodec.java deleted file mode 100644 index 0067a32881a..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesCodec.java +++ /dev/null @@ -1,98 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import htsjdk.samtools.util.SortingCollection; -import org.broadinstitute.hellbender.exceptions.GATKException; - - -import java.io.*; - -/** Codec for ReadEnds that just outputs the primitive fields and reads them back. */ -public final class ReadEndsForMarkDuplicatesCodec implements SortingCollection.Codec { - private DataInputStream in; - private DataOutputStream out; - - /** - * For an explanation of why codecs must implement clone(), - * see the HTSJDK documentation for {@link SortingCollection.Codec}. - */ - @Override - public ReadEndsForMarkDuplicatesCodec clone() { - return new ReadEndsForMarkDuplicatesCodec(); - } - - @Override - public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); } - - @Override - public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); } - - public DataInputStream getInputStream() { - return in; - } - - public DataOutputStream getOutputStream() { - return out; - } - - @Override - public void encode(final ReadEndsForMarkDuplicates read) { - try { - this.out.writeShort(read.score); - this.out.writeShort(read.libraryId); - this.out.writeByte(read.orientation); - this.out.writeInt(read.read1ReferenceIndex); - this.out.writeInt(read.read1Coordinate); - this.out.writeLong(read.read1IndexInFile); - this.out.writeInt(read.read2ReferenceIndex); - - if (read.orientation > ReadEnds.R) { - this.out.writeInt(read.read2Coordinate); - this.out.writeLong(read.read2IndexInFile); - } - - this.out.writeShort(read.readGroup); - this.out.writeShort(read.tile); - this.out.writeShort(read.x); - this.out.writeShort(read.y); - this.out.writeByte(read.orientationForOpticalDuplicates); - } catch (final IOException ioe) { - throw new GATKException("Exception writing ReadEnds to file.", ioe); - } - } - - @Override - public ReadEndsForMarkDuplicates decode() { - final ReadEndsForMarkDuplicates read = new ReadEndsForMarkDuplicates(); - try { - // If the first read results in an EOF we've exhausted the stream - try { - read.score = this.in.readShort(); - } catch (final EOFException eof) { - return null; - } - - read.libraryId = this.in.readShort(); - read.orientation = this.in.readByte(); - read.read1ReferenceIndex = this.in.readInt(); - read.read1Coordinate = this.in.readInt(); - read.read1IndexInFile = this.in.readLong(); - read.read2ReferenceIndex = this.in.readInt(); - - if (read.orientation > ReadEnds.R) { - read.read2Coordinate = this.in.readInt(); - read.read2IndexInFile = this.in.readLong(); - } - - read.readGroup = this.in.readShort(); - read.tile = this.in.readShort(); - read.x = this.in.readShort(); - read.y = this.in.readShort(); - - read.orientationForOpticalDuplicates = this.in.readByte(); - - return read; - } catch (final IOException ioe) { - throw new GATKException("Exception writing ReadEnds to file.", ioe); - } - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesMap.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesMap.java deleted file mode 100644 index 9b33f0d903b..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesMap.java +++ /dev/null @@ -1,38 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -/** - * Interface for storing and retrieving ReadEnds objects. An implementation may be disk-based to - * reduce memory footprint. - */ -public interface ReadEndsForMarkDuplicatesMap { - /** - * Remove element with given key from the map. Because an implementation may be disk-based, - * the object returned may not be the same object that was put into the map - * - * @param mateSequenceIndex must agree with the value used when the object was put into the map - * @param key typically, concatenation of read group ID and read name - * @return null if the key is not found, otherwise the object removed. - */ - ReadEndsForMarkDuplicates remove(int mateSequenceIndex, String key); - - /** - * Store the element in the map with the given key. It is assumed that the element - * is not already present in the map. - * - * @param mateSequenceIndex use to optimize storage & retrieval. The same value must be used when trying - * to remove this element. It is not valid to store the same key with two different mateSequenceIndexes. - * @param key typically, concatenation of read group ID and read name - * @param readEnds the object to be stored - */ - void put(int mateSequenceIndex, String key, ReadEndsForMarkDuplicates readEnds); - - /** - * @return number of elements stored in map - */ - int size(); - - /** - * @return number of elements stored in RAM. Always <= size() - */ - int sizeInRam(); -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SamRecordWithOrdinalAndSetDuplicateReadFlag.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SamRecordWithOrdinalAndSetDuplicateReadFlag.java deleted file mode 100644 index 7058709d0d0..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SamRecordWithOrdinalAndSetDuplicateReadFlag.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.SamRecordWithOrdinal; - -/** - * This class sets the duplicate read flag as the result state when examining sets of records. - * - * @author nhomer - */ -public final class SamRecordWithOrdinalAndSetDuplicateReadFlag extends SamRecordWithOrdinal { - - public SamRecordWithOrdinalAndSetDuplicateReadFlag() { - super(); - } - - public SamRecordWithOrdinalAndSetDuplicateReadFlag(final SAMRecord record, final long recordIndex) { - super(record, recordIndex); - } - - @Override - public void setResultState(final boolean resultState) { - this.getRecord().setDuplicateReadFlag(resultState); - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java index 24d704fd121..d09bcd7649f 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java @@ -4,8 +4,8 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator; -import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds; import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey; +import picard.sam.markduplicates.util.ReadEnds; import java.util.Map; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java index ac5bfd01b45..835171457c5 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java @@ -1,14 +1,12 @@ package org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords; import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator; import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy; -import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds; import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey; -import picard.sam.util.PhysicalLocation; +import picard.sam.markduplicates.util.ReadEnds; import java.util.Map; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java index 50aae58190e..77c781498d1 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java @@ -9,9 +9,8 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy; -import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds; import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey; -import picard.sam.util.PhysicalLocation; +import picard.sam.markduplicates.util.ReadEnds; import java.util.Map; diff --git a/src/test/java/org/broadinstitute/hellbender/testutils/testers/AbstractMarkDuplicatesTester.java b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java similarity index 86% rename from src/test/java/org/broadinstitute/hellbender/testutils/testers/AbstractMarkDuplicatesTester.java rename to src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java index 788bd3d64f5..3cc4848274a 100644 --- a/src/test/java/org/broadinstitute/hellbender/testutils/testers/AbstractMarkDuplicatesTester.java +++ b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java @@ -9,6 +9,8 @@ import htsjdk.samtools.util.TestUtil; import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; +import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark; import org.testng.Assert; import picard.sam.DuplicationMetrics; @@ -20,16 +22,17 @@ * This class is an extension of SamFileTester used to test AbstractMarkDuplicatesCommandLineProgram's with SAM files generated on the fly. * This performs the underlying tests defined by classes such as AbstractMarkDuplicatesCommandLineProgramTest. */ -public abstract class AbstractMarkDuplicatesTester extends SamFileTester { +public class MarkDuplicatesSparkTester extends SamFileTester { final private File metricsFile; final DuplicationMetrics expectedMetrics; + boolean markUnmappedReads = false; - public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder) { + public MarkDuplicatesSparkTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder) { this(duplicateScoringStrategy, sortOrder, true); } - public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder, final boolean recordNeedSorting) { + public MarkDuplicatesSparkTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder, final boolean recordNeedSorting) { super(50, true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, duplicateScoringStrategy, sortOrder, recordNeedSorting); expectedMetrics = new DuplicationMetrics(); @@ -40,16 +43,17 @@ public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrate addArg("--"+StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME, duplicateScoringStrategy.name()); } - public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrategy) { + public MarkDuplicatesSparkTester(final ScoringStrategy duplicateScoringStrategy, boolean markUnmappedReads) { this(duplicateScoringStrategy, SAMFileHeader.SortOrder.coordinate); + this.markUnmappedReads = markUnmappedReads; } - public AbstractMarkDuplicatesTester() { - this(SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY); + public MarkDuplicatesSparkTester(boolean markUnmappedReads) { + this(DuplicateScoringStrategy.ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH, markUnmappedReads); } - public File getMetricsFile() { - return metricsFile; + public MarkDuplicatesSparkTester() { + this(false); } @Override @@ -158,5 +162,12 @@ public void test() { } } - protected abstract CommandLineProgram getProgram(); + protected CommandLineProgram getProgram() { return new MarkDuplicatesSpark(); } + + @Override + protected void addArgs() { + if (!markUnmappedReads) { + addArg("--" + MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME); + } + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java index c8a5ff2b0e6..4dad0bfcba4 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java @@ -3,26 +3,20 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import htsjdk.samtools.metrics.MetricsFile; -import org.apache.spark.SparkException; import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.walkers.markduplicates.MarkDuplicatesGATKIntegrationTest; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark; +import org.broadinstitute.hellbender.tools.walkers.markduplicates.AbstractMarkDuplicatesCommandLineProgramTest; +import org.broadinstitute.hellbender.testutils.testers.MarkDuplicatesSparkTester; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics; -import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesSparkTester; -import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; -import org.broadinstitute.hellbender.tools.walkers.markduplicates.AbstractMarkDuplicatesCommandLineProgramTest; -import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import picard.sam.util.PhysicalLocationInt; -import picard.sam.util.ReadNameParser; import java.io.File; import java.io.FileNotFoundException; @@ -37,7 +31,7 @@ public class MarkDuplicatesSparkIntegrationTest extends AbstractMarkDuplicatesCommandLineProgramTest { @Override - protected AbstractMarkDuplicatesTester getTester() { + protected MarkDuplicatesSparkTester getTester() { MarkDuplicatesSparkTester markDuplicatesSparkTester = new MarkDuplicatesSparkTester(); markDuplicatesSparkTester.addArg("--"+ MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME); return markDuplicatesSparkTester; @@ -68,31 +62,42 @@ public Object[][] md(){ //Note: in each of those cases, we'd really want to pass null as the last parameter (not 0L) but IntelliJ // does not like it and skips the test (rendering issue) - so we pass 0L and account for it at test time // (see comment in testMarkDuplicatesSparkIntegrationTestLocal) - {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"), 20, 0, + {new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"), 20, 0, ImmutableMap.of("Solexa-16419", ImmutableList.of(0L, 3L, 0L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16416", ImmutableList.of(0L, 1L, 0L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16404", ImmutableList.of(0L, 3L, 0L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16406", ImmutableList.of(0L, 1L, 0L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16412", ImmutableList.of(0L, 1L, 0L, 0L, 0L, 0L, 0.0, 0L))}, - {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.bam"), 90, 6, + {new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.bam"), 90, 6, ImmutableMap.of("Solexa-16419", ImmutableList.of(4L, 4L, 4L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16416", ImmutableList.of(2L, 2L, 2L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16404", ImmutableList.of(3L, 9L, 3L, 0L, 2L, 0L, 0.190476, 17L), "Solexa-16406", ImmutableList.of(1L, 10L, 1L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16412", ImmutableList.of(3L, 6L, 3L, 0L, 1L, 0L, 0.133333, 15L))}, - {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR,"example.chr1.1-1K.markedDups.bam"), 90, 6, + {new File(TEST_DATA_DIR,"example.chr1.1-1K.markedDups.bam"), 90, 6, ImmutableMap.of("Solexa-16419", ImmutableList.of(4L, 4L, 4L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16416", ImmutableList.of(2L, 2L, 2L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16404", ImmutableList.of(3L, 9L, 3L, 0L, 2L, 0L, 0.190476, 17L), "Solexa-16406", ImmutableList.of(1L, 10L, 1L, 0L, 0L, 0L, 0.0, 0L), "Solexa-16412", ImmutableList.of(3L, 6L, 3L, 0L, 1L, 0L, 0.133333, 15L))}, - {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR, "optical_dupes.bam"), 4, 2, + {new File(TEST_DATA_DIR, "optical_dupes.bam"), 4, 2, ImmutableMap.of("mylib", ImmutableList.of(0L, 2L, 0L, 0L, 1L, 1L, 0.5, 0L))}, - {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR, "optical_dupes_casava.bam"), 4, 2, + {new File(TEST_DATA_DIR, "optical_dupes_casava.bam"), 4, 2, ImmutableMap.of("mylib", ImmutableList.of(0L, 2L, 0L, 0L, 1L, 1L, 0.5, 0L))}, }; } + // Tests asserting that without --do-not-mark-unmapped-mates argument that unmapped mates are still duplicate marked with their partner + @Test + public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() { + final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester(true); + tester.addMatePair(1, 10040, 10040, false, true, true, true, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate, + // second end unmapped + tester.addMappedPair(1, 10189, 10040, false, false, "41S35M", "65M11S", true, false, false, ELIGIBLE_BASE_QUALITY); // mapped OK + tester.addMappedFragment(1, 10040, true, DEFAULT_BASE_QUALITY); // duplicate + tester.runTest(); + } + @Test( dataProvider = "md") public void testMarkDuplicatesSparkIntegrationTestLocal( final File input, final long totalExpected, final long dupsExpected, @@ -170,25 +175,6 @@ public void testMarkDuplicatesSparkIntegrationTestLocal( } } - @Test - public void testSupplementaryReadUnmappedMate() { - File output = createTempFile("supplementaryReadUnmappedMate", "bam"); - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addOutput(output); - args.addInput(getTestFile("supplementaryReadUnmappedmate.bam")); - runCommandLine(args); - - try ( final ReadsDataSource outputReadsSource = new ReadsDataSource(output.toPath()) ) { - final List actualReads = new ArrayList<>(); - for ( final GATKRead read : outputReadsSource ) { - Assert.assertFalse(read.isDuplicate()); - actualReads.add(read); - } - - Assert.assertEquals(actualReads.size(), 3, "Wrong number of reads output"); - } - } - @Test public void testHashCollisionHandling() { // This test asserts that the handling of two read pairs with the same start positions but on different in such a way @@ -209,137 +195,4 @@ public void testHashCollisionHandling() { Assert.assertEquals(actualReads.size(), 4, "Wrong number of reads output"); } } - - // Tests asserting that without --do-not-mark-unmapped-mates argument that unmapped mates are still duplicate marked with their partner - @Test - public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() { - final AbstractMarkDuplicatesTester tester = new MarkDuplicatesSparkTester(true); - tester.addMatePair(1, 10040, 10040, false, true, true, true, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate, - // second end unmapped - tester.addMappedPair(1, 10189, 10040, false, false, "41S35M", "65M11S", true, false, false, ELIGIBLE_BASE_QUALITY); // mapped OK - tester.addMappedFragment(1, 10040, true, DEFAULT_BASE_QUALITY); // duplicate - tester.runTest(); - } - - @Test - public void testNonExistantReadGroupInRead() { - final AbstractMarkDuplicatesTester tester = new MarkDuplicatesSparkTester(true); - tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "NotADuplicateGroup"); - try { - tester.runTest(); - Assert.fail("Should have thrown an exception"); - } catch (Exception e){ - Assert.assertTrue(e instanceof SparkException); - Assert.assertTrue(e.getCause() instanceof UserException.HeaderMissingReadGroup); - } - } - - @Test - public void testNoReadGroupInRead() { - final AbstractMarkDuplicatesTester tester = new MarkDuplicatesSparkTester(true); - tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, null); - - try { - tester.runTest(); - Assert.fail("Should have thrown an exception"); - } catch (Exception e){ - Assert.assertTrue(e instanceof SparkException); - Assert.assertTrue(e.getCause() instanceof UserException.ReadMissingReadGroup); - } - } - - @Test(dataProvider = "readNameData") - public void testOpticalDuplicatesTiebrokenByPhysicalLocationNotStartPosition(final String readName1, final String readName2) { - // This tests the readname based tiebreaking code in mark duplicates. Since it's ambiguous which read should be marked - // as duplicate or not if scores match we break ties by evaluating the readname for consistencies sake. - - final ReadNameParser parser = new ReadNameParser(); - - final PhysicalLocationInt position1 = new PhysicalLocationInt(); - final PhysicalLocationInt position2 = new PhysicalLocationInt(); - - parser.addLocationInformation(readName1, position1); - parser.addLocationInformation(readName2, position2); - - final AbstractMarkDuplicatesTester tester = getTester(); - tester.getSamRecordSetBuilder().setReadLength(101); - tester.setExpectedOpticalDuplicate(0); - - int compare = position1.tile - position2.tile; - if (compare == 0) { - compare = (short)position1.x - (short)position2.x; - } - - if (compare == 0) { - compare = (short)position1.y - (short)position2.y; - } - - final boolean isDuplicate = compare < 0; - - // NOTE these reads are offset slightly but should have the same unclipped start postitions - tester.addMatePair(readName1, 1,2, 46, false, false, !isDuplicate, !isDuplicate, "6S42M28S", "3S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY); - tester.addMatePair(readName2, 1,2, 51, false, false, isDuplicate, isDuplicate, "6S42M28S", "8S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY); - tester.runTest(); - } - - @DataProvider - public Object[][] readNameData(){ - return new Object[][]{ - {"RUNID:7:1203:2886:82292", "RUNID:7:1205:3886:16834"}, - - {"RUNID:7:1203:2886:16756", "RUNID:7:1205:3886:16756"}, - {"RUNID:7:1204:2886:16756", "RUNID:7:1205:3886:16756"}, - {"RUNID:7:1205:2886:16756", "RUNID:7:1205:3886:16756"}, - {"RUNID:7:1206:2886:16756", "RUNID:7:1205:3886:16756"}, - {"RUNID:7:1207:2886:16756", "RUNID:7:1205:3886:16756"}, - - {"RUNID:7:1203:2886:16756", "RUNID:7:1203:4886:26756"}, - {"RUNID:7:1203:3886:16756", "RUNID:7:1203:4886:26756"}, - {"RUNID:7:1203:4886:16756", "RUNID:7:1203:4886:26756"}, - {"RUNID:7:1203:5886:16756", "RUNID:7:1203:4886:26756"}, - {"RUNID:7:1203:6886:16756", "RUNID:7:1203:4886:26756"}, - - {"RUNID:7:1203:2886:34756", "RUNID:7:1203:2886:36756"}, - {"RUNID:7:1203:2886:35756", "RUNID:7:1203:2886:36756"}, - {"RUNID:7:1203:2886:37756", "RUNID:7:1203:2886:36756"}, - {"RUNID:7:1203:2886:38756", "RUNID:7:1203:2886:36756"}, - - //Added a test for tiebreaking accounting for the short casting done in picard - {"HK3T5CCXX160204:3:1112:11586:37067", "HK3T5CCXX160204:3:1112:11586:32144"} - }; - } - - @Test(dataProvider = "readNameData") - public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicates(final String readName1, final String readName2) { - // This tests the readname based tiebreaking code in mark duplicates. Since it's ambiguous which read should be marked - // as duplicate or not if scores match we break ties by evaluating the readname for consistencies sake. - - final ReadNameParser parser = new ReadNameParser(); - - final PhysicalLocationInt position1 = new PhysicalLocationInt(); - final PhysicalLocationInt position2 = new PhysicalLocationInt(); - - parser.addLocationInformation(readName1, position1); - parser.addLocationInformation(readName2, position2); - - final AbstractMarkDuplicatesTester tester = getTester(); - tester.getSamRecordSetBuilder().setReadLength(101); - tester.setExpectedOpticalDuplicate(0); - - int compare = position1.tile - position2.tile; - if (compare == 0) { - compare = (short)position1.x - (short)position2.x; - } - - if (compare == 0) { - compare = (short)position1.y - (short)position2.y; - } - - final boolean isDuplicate = compare < 0; - - tester.addMatePair(readName1, 1,485253, 485253, false, false, !isDuplicate, !isDuplicate, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY); - tester.addMatePair(readName2, 1,485253, 485253, false, false, isDuplicate, isDuplicate, "59S42M", "42M59S", true, false, false, false, false, DEFAULT_BASE_QUALITY); - - tester.runTest(); - } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java index 3b961f99e56..552d46ac0e0 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java @@ -1,17 +1,23 @@ package org.broadinstitute.hellbender.tools.walkers.markduplicates; import htsjdk.samtools.*; +import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkException; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.ReadsDataSource; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.testutils.BaseTest; import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.broadinstitute.hellbender.testutils.SamAssertionUtils; -import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester; +import org.broadinstitute.hellbender.testutils.testers.MarkDuplicatesSparkTester; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -19,9 +25,14 @@ import picard.sam.util.ReadNameParser; import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; /** * This class defines the individual test cases to run. The actual running of the test is done @@ -31,7 +42,7 @@ public abstract class AbstractMarkDuplicatesCommandLineProgramTest extends Comma public static final File TEST_DATA_DIR = new File(getTestDataDir(), "walkers/MarkDuplicatesGATK/"); - protected abstract AbstractMarkDuplicatesTester getTester(); + protected abstract MarkDuplicatesSparkTester getTester(); protected abstract CommandLineProgram getCommandLineProgramInstance(); @@ -45,14 +56,14 @@ public abstract class AbstractMarkDuplicatesCommandLineProgramTest extends Comma @Test public void testSingleUnmappedFragment() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY); tester.runTest(); } @Test public void testTwoUnmappedFragments() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY); tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY); tester.runTest(); @@ -60,21 +71,21 @@ public void testTwoUnmappedFragments() { @Test public void testSingleUnmappedPair() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addUnmappedPair(-1, DEFAULT_BASE_QUALITY); tester.runTest(); } @Test public void testSingleMappedFragment() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedFragment(1, 1, false, DEFAULT_BASE_QUALITY); tester.runTest(); } @Test public void testTwoMappedFragments() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedFragment(0, 1, false, ELIGIBLE_BASE_QUALITY); tester.addMappedFragment(0, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.runTest(); @@ -82,14 +93,14 @@ public void testTwoMappedFragments() { @Test public void testSingleMappedPair() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(1, 1, 100, false, false, DEFAULT_BASE_QUALITY); tester.runTest(); } @Test public void testSingleMappedFragmentAndSingleMappedPair() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedFragment(1, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.runTest(); @@ -97,7 +108,7 @@ public void testSingleMappedFragmentAndSingleMappedPair() { @Test public void testTwoMappedPairs() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.runTest(); @@ -105,7 +116,7 @@ public void testTwoMappedPairs() { @Test public void testThreeMappedPairs() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! @@ -114,7 +125,7 @@ public void testThreeMappedPairs() { @Test public void testSingleMappedFragmentAndTwoMappedPairs() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedFragment(1, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! @@ -123,7 +134,7 @@ public void testSingleMappedFragmentAndTwoMappedPairs() { @Test public void testTwoMappedPairsAndTerminalUnmappedFragment() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY); // unmapped fragment at end of file @@ -132,7 +143,7 @@ public void testTwoMappedPairsAndTerminalUnmappedFragment() { @Test public void testTwoMappedPairsAndTerminalUnmappedPair() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addUnmappedPair(-1, DEFAULT_BASE_QUALITY); // unmapped pair at end of file @@ -141,7 +152,7 @@ public void testTwoMappedPairsAndTerminalUnmappedPair() { @Test public void testOpticalDuplicateFinding() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); // explicitly creating 1 expected optical duplicate pair tester.setExpectedOpticalDuplicate(1); @@ -158,7 +169,7 @@ public void testOpticalDuplicateFinding() { @Test public void testOpticalDuplicatesDifferentReadGroups() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.setExpectedOpticalDuplicate(0); tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate tester.addMatePair("RUNID:7:1203:2886:82242", 19, 19, 485253, 485253, false, false, false, false, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.1"); @@ -174,7 +185,7 @@ public void testOpticalDuplicatesDifferentReadGroups() { @Test public void testOpticalDuplicatesTheSameReadGroup() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.setExpectedOpticalDuplicate(1); tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate (tie broken by readname) tester.addMatePair("RUNID:7:1203:2886:82242", 19, 19, 485253, 485253, false, false, false, false, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); @@ -191,7 +202,7 @@ public void testOpticalDuplicatesTheSameReadGroup() { @Test public void testOpticalDuplicatesAndPCRDuplicatesOrientationDifference() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.setExpectedOpticalDuplicate(0); tester.addMatePair("RUNID:7:1203:2886:16838", 19, 19, 485253, 486253, false, false, true, true, "101M", "101M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // duplicate tester.addMatePair("RUNID:7:1203:2886:16834", 19, 19, 486253, 485253, false, false, false, false, "101M", "101M", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1"); @@ -204,7 +215,7 @@ public void testOpticalDuplicatesAndPCRDuplicatesOrientationDifference() { // to FR if they point in opposite directions. This test asserts that two pairs of reads that ultimately all map to 93470499-93470499 with clipping // and orientations will ultimately be marked and mapped together even though they have opposite orientations (one is FR while the other is RF). public void testReadSameStartPositionOrientationOverride() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.setExpectedOpticalDuplicate(0); tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21526", 19, 19, 93470380, 93470499, false, false, false, false, "31S111M9S", "64M87S", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499 tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21527", 19, 19, 93470499, 93470380, false, false, true, true, "64M87S", "31S111M9S", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499 @@ -215,7 +226,7 @@ public void testReadSameStartPositionOrientationOverride() { // This asserts that we are flipping reads in the Pair object based on both start position and contig index, this does not // make a difference unless the start position is the same across two contigs, so we assert it is handled properly public void testReadsHaveSameStartPositionButDifferentChromosomeNonEquivalence() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.setExpectedOpticalDuplicate(0); tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21526", 19, 20, 93470380, 93470499, false, false, false, false, "31S111M9S", "64M87S", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499 tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21527", 20, 19, 93470499, 93470380, false, false, true, true, "64M87S", "31S111M9S", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499 @@ -224,7 +235,7 @@ public void testReadsHaveSameStartPositionButDifferentChromosomeNonEquivalence() @Test public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicatesWithinPixelDistance() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(101); tester.setExpectedOpticalDuplicate(0); tester.addMatePair("RUNID:7:1203:2886:16834", 1, 485253, 485253, false, false, true, true, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY); // duplicate @@ -234,7 +245,7 @@ public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicatesWithinPixe @Test public void testOpticalDuplicateClusterSamePositionOneOpticalDuplicatesWithinPixelDistance() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(45); tester.setExpectedOpticalDuplicate(1); tester.addMatePair("RUNID:7:1203:2886:16834", 1, 485253, 485253, false, false, true, true, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); // duplicate @@ -244,7 +255,7 @@ public void testOpticalDuplicateClusterSamePositionOneOpticalDuplicatesWithinPix @Test public void testOpticalDuplicateClusterOneEndSamePositionOneCluster() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(101); tester.setExpectedOpticalDuplicate(1); tester.addMatePair("RUNID:7:2205:17939:39728", 1, 485328, 485312, false, false, false, false, "55M46S", "30S71M", false, true, false, false, false, ELIGIBLE_BASE_QUALITY); @@ -254,7 +265,7 @@ public void testOpticalDuplicateClusterOneEndSamePositionOneCluster() { @Test public void testTwoMappedPairsAndMappedSecondaryFragment() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedFragment(1, 200, false, DEFAULT_BASE_QUALITY, true); // mapped non-primary fragment @@ -263,7 +274,7 @@ public void testTwoMappedPairsAndMappedSecondaryFragment() { @Test public void testMappedFragmentAndMappedPairFirstOfPairNonPrimary() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedFragment(1, 1, false, ELIGIBLE_BASE_QUALITY); tester.addMatePair(1, 200, 0, false, true, false, false, "54M22S", null, false, false, true, true, false, DEFAULT_BASE_QUALITY); @@ -272,7 +283,7 @@ public void testMappedFragmentAndMappedPairFirstOfPairNonPrimary() { @Test public void testTwoMappedPairsMatesSoftClipped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(1, 10022, 10051, false, false, "76M", "8S68M", false, true, false, DEFAULT_BASE_QUALITY); tester.addMappedPair(1, 10022, 10063, false, false, "76M", "5S71M", false, true, false, DEFAULT_BASE_QUALITY); @@ -282,7 +293,7 @@ public void testTwoMappedPairsMatesSoftClipped() { @Test public void testTwoMappedPairsWithSoftClipping() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); //todo Picard says no duplicates, we say duplicate, what's going on? // NB: no duplicates @@ -295,7 +306,7 @@ public void testTwoMappedPairsWithSoftClipping() { @Test public void testTwoMappedPairsWithSoftClippingFirstOfPairOnlyNoMateCigar() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.setNoMateCigars(true); // NB: no duplicates @@ -308,7 +319,7 @@ public void testTwoMappedPairsWithSoftClippingFirstOfPairOnlyNoMateCigar() { @Test public void testTwoMappedPairsWithSoftClippingBoth() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); // mapped reference length: 73 + 42 = 115 tester.addMappedPair(1, 10046, 10002, true, true, "3S73M", "6S42M28S", true, false, false, DEFAULT_BASE_QUALITY); // duplicate @@ -319,7 +330,7 @@ public void testTwoMappedPairsWithSoftClippingBoth() { @Test public void testMatePairSecondUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10049, 10049, false, true, false, false, "11M2I63M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // neither are duplicates tester.runTest(); @@ -327,7 +338,7 @@ public void testMatePairSecondUnmapped() { @Test public void testMatePairFirstUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10056, 10056, true, false, false, false, null, "54M22S", false, false, false, false, false, DEFAULT_BASE_QUALITY); // neither are duplicates tester.runTest(); @@ -335,7 +346,7 @@ public void testMatePairFirstUnmapped() { @Test public void testMappedFragmentAndMatePairSecondUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10049, 10049, false, true, false, false, "11M2I63M", null, false, false, false, false, false, ELIGIBLE_BASE_QUALITY); // We set the length here separately because some of the MD variants use TOTAL_MAPPED_REFERENCE_LENGTH as their DUPLICATE_SCORING_STRATEGY. If we kept the read length @@ -347,7 +358,7 @@ public void testMappedFragmentAndMatePairSecondUnmapped() { @Test public void testMappedFragmentAndMatePairFirstUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10049, 10049, true, false, false, false, null, "11M2I63M", false, false, false, false, false, ELIGIBLE_BASE_QUALITY); // We set the length here separately because some of the MD variants use TOTAL_MAPPED_REFERENCE_LENGTH as their DUPLICATE_SCORING_STRATEGY. If we kept the read length @@ -359,7 +370,7 @@ public void testMappedFragmentAndMatePairFirstUnmapped() { @Test public void testMappedPairAndMatePairSecondUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10040, 10040, false, true, true, false, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // second a duplicate, // second end unmapped @@ -369,7 +380,7 @@ public void testMappedPairAndMatePairSecondUnmapped() { @Test public void testMappedPairAndMatePairFirstUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10040, 10040, true, false, false, true, null, "76M", false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate, // first end unmapped @@ -380,7 +391,7 @@ public void testMappedPairAndMatePairFirstUnmapped() { // TODO: fails on MarkDuplicatesWithMateCigar @Test public void testMappedPairAndMatePairFirstOppositeStrandSecondUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(101); // first end mapped OK -, second end unmapped tester.addMatePair(1, 484071, 484071, false, true, false, false, "66S35M", null, true, false, false, false, false, DEFAULT_BASE_QUALITY); @@ -389,9 +400,28 @@ public void testMappedPairAndMatePairFirstOppositeStrandSecondUnmapped() { tester.runTest(); } + @Test + public void testSupplementaryReadUnmappedMate() { + File output = createTempFile("supplementaryReadUnmappedMate", "bam"); + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addOutput(output); + args.addInput(getTestFile("supplementaryReadUnmappedmate.bam")); + runCommandLine(args); + + try ( final ReadsDataSource outputReadsSource = new ReadsDataSource(output.toPath()) ) { + final List actualReads = new ArrayList<>(); + for ( final GATKRead read : outputReadsSource ) { + Assert.assertFalse(read.isDuplicate()); + actualReads.add(read); + } + + Assert.assertEquals(actualReads.size(), 3, "Wrong number of reads output"); + } + } + @Test public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10040, 10040, false, true, true, false, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate, // second end unmapped @@ -402,7 +432,7 @@ public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() { @Test public void testMappedPairAndMappedFragmentAndMatePairFirstUnmapped() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMatePair(1, 10040, 10040, true, false, false, true, null, "76M", false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate, // first end unmapped @@ -413,7 +443,7 @@ public void testMappedPairAndMappedFragmentAndMatePairFirstUnmapped() { @Test public void testTwoMappedPairsWithOppositeOrientations() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(1, 10182, 10038, true, true, "32S44M", "66M10S", true, false, false, DEFAULT_BASE_QUALITY); // -/+ tester.addMappedPair(1, 10038, 10182, false, false, "70M6S", "32S44M", false, true, false, ELIGIBLE_BASE_QUALITY); // +/-, both are duplicates @@ -422,7 +452,7 @@ public void testTwoMappedPairsWithOppositeOrientations() { @Test public void testTwoMappedPairsWithOppositeOrientationsNumberTwo() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(1, 10038, 10182, false, false, "70M6S", "32S44M", false, true, false, ELIGIBLE_BASE_QUALITY); // +/-, both are duplicates tester.addMappedPair(1, 10182, 10038, true, true, "32S44M", "66M10S", true, false, false, DEFAULT_BASE_QUALITY); // -/+ @@ -431,7 +461,7 @@ public void testTwoMappedPairsWithOppositeOrientationsNumberTwo() { @Test public void testThreeMappedPairsWithMatchingSecondMate() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); // Read0 and Read2 are duplicates // 10181+41=10220, 10058 @@ -445,7 +475,7 @@ public void testThreeMappedPairsWithMatchingSecondMate() { @Test public void testMappedPairWithSamePosition() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(1, 4914, 4914, false, false, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+ tester.runTest(); @@ -453,7 +483,7 @@ public void testMappedPairWithSamePosition() { @Test public void testMappedPairWithSamePositionSameCigar() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(1, 4914, 4914, false, false, "37M39S", "37M39S", false, false, false, DEFAULT_BASE_QUALITY); // +/+ tester.runTest(); @@ -461,7 +491,7 @@ public void testMappedPairWithSamePositionSameCigar() { @Test public void testTwoMappedPairWithSamePosition() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(0, 5604914, 5604914, false, false, "37M39S", "73M3S", false, false, false, ELIGIBLE_BASE_QUALITY); // +/+ tester.addMappedPair(0, 5604914, 5604914, true, true, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+ @@ -470,7 +500,7 @@ public void testTwoMappedPairWithSamePosition() { @Test public void testTwoMappedPairWithSamePositionDifferentStrands() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(0, 5604914, 5604914, false, false, "50M", "50M", true, false, false, ELIGIBLE_BASE_QUALITY); // +/- tester.addMappedPair(0, 5604914, 5604914, true, true, "50M", "50M", false, true, false, DEFAULT_BASE_QUALITY); // -/+ tester.runTest(); @@ -478,7 +508,7 @@ public void testTwoMappedPairWithSamePositionDifferentStrands() { @Test public void testTwoMappedPairWithSamePositionDifferentStrands2() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(0, 5604914, 5604915, false, false, "50M", "50M", true, false, false, ELIGIBLE_BASE_QUALITY); // +/- tester.addMappedPair(0, 5604915, 5604914, true, true, "50M", "50M", false, true, false, DEFAULT_BASE_QUALITY); // -/+ tester.runTest(); @@ -486,7 +516,7 @@ public void testTwoMappedPairWithSamePositionDifferentStrands2() { @Test public void testMappedPairWithFirstEndSamePositionAndOther() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(76); tester.addMappedPair(0, 5604914, 5605914, false, false, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+ tester.addMappedPair(0, 5604914, 5604914, false, false, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+ @@ -495,7 +525,7 @@ public void testMappedPairWithFirstEndSamePositionAndOther() { @Test public void testTwoGroupsOnDifferentChromosomesOfTwoFragments() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedFragment(0, 1, false, ELIGIBLE_BASE_QUALITY); tester.addMappedFragment(0, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedFragment(1, 1, false, ELIGIBLE_BASE_QUALITY); @@ -506,7 +536,7 @@ public void testTwoGroupsOnDifferentChromosomesOfTwoFragments() { @Test public void testTwoGroupsOnDifferentChromosomesOfTwoMappedPairs() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(0, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); @@ -516,7 +546,7 @@ public void testTwoGroupsOnDifferentChromosomesOfTwoMappedPairs() { @Test public void testTwoGroupsOnDifferentChromosomesOfThreeMappedPairs() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(0, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! @@ -528,7 +558,7 @@ public void testTwoGroupsOnDifferentChromosomesOfThreeMappedPairs() { @Test public void testThreeGroupsOnDifferentChromosomesOfThreeMappedPairs() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.addMappedPair(0, 1, 100, false, false, ELIGIBLE_BASE_QUALITY); tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!! @@ -543,7 +573,7 @@ public void testThreeGroupsOnDifferentChromosomesOfThreeMappedPairs() { @Test public void testBulkFragmentsNoDuplicates() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(100); for(int position = 1; position <= 10000; position += 1) { tester.addMappedFragment(0, position, false, "100M", DEFAULT_BASE_QUALITY); @@ -553,7 +583,7 @@ public void testBulkFragmentsNoDuplicates() { @Test public void testBulkFragmentsWithDuplicates() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(100); for(int position = 1; position <= 10000; position += 1) { tester.addMappedFragment(0, position, false, "100M", ELIGIBLE_BASE_QUALITY); @@ -567,7 +597,7 @@ public void testBulkFragmentsWithDuplicates() { @Test public void testStackOverFlowPairSetSwap() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(68); final File input = new File(TEST_DATA_DIR, "markDuplicatesWithMateCigar.pairSet.swap.sam"); @@ -583,7 +613,7 @@ public void testStackOverFlowPairSetSwap() { @Test public void testSecondEndIsBeforeFirstInCoordinate() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(68); tester.addMappedPair(0, 108855339, 108855323, false, false, "33S35M", "17S51M", true, true, false, DEFAULT_BASE_QUALITY); // +/- tester.runTest(); @@ -591,7 +621,7 @@ public void testSecondEndIsBeforeFirstInCoordinate() { @Test public void testPathologicalOrderingAtTheSamePosition() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(68); tester.setExpectedOpticalDuplicate(1); @@ -620,7 +650,7 @@ public void testPathologicalOrderingAtTheSamePosition() { @Test public void testDifferentChromosomesInOppositeOrder() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(101); tester.setExpectedOpticalDuplicate(1); tester.addMatePair("RUNID:6:101:17642:6835", 0, 1, 123989, 18281, false, false, true, true, "37S64M", "52M49S", false, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); @@ -630,7 +660,7 @@ public void testDifferentChromosomesInOppositeOrder() { @Test public void testOpticalDuplicateClustersAddingSecondEndFirstSameCoordinate() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(68); tester.setExpectedOpticalDuplicate(1); tester.addMatePair("RUNID:1:1:15993:13361", 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, ELIGIBLE_BASE_QUALITY); @@ -652,7 +682,7 @@ public Object[][] secondarySupplementaryData() { @Test(dataProvider = "secondarySupplementaryData") public void testTwoMappedPairsWithSupplementaryReads(final Boolean additionalFragSecondary, final Boolean additionalFragSupplementary, final Boolean fragLikeFirst) { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); tester.getSamRecordSetBuilder().setReadLength(68); tester.setExpectedOpticalDuplicate(1); tester.addMatePair("RUNID:1:1:15993:13361", 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); @@ -720,7 +750,7 @@ public void testMDOrder(final File input, final File expectedOutput) throws Exce @Test public void testTwoMappedPairsWithSoftClippingFirstOfPairOnly() { - final AbstractMarkDuplicatesTester tester = getTester(); + final MarkDuplicatesSparkTester tester = getTester(); // NB: no duplicates // 5'1: 2, 5'2:46+73M=118 // 5'1: 2, 5'2:51+68M=118 @@ -751,4 +781,125 @@ protected void testMDOrderImpl(final File input, final File expectedOutput, fina } + @Test + public void testNonExistantReadGroupInRead() { + final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester(true); + tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "NotADuplicateGroup"); + try { + tester.runTest(); + Assert.fail("Should have thrown an exception"); + } catch (Exception e){ + Assert.assertTrue(e instanceof SparkException); + Assert.assertTrue(e.getCause() instanceof UserException.HeaderMissingReadGroup); + } + } + + @Test + public void testNoReadGroupInRead() { + final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester(true); + tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, null); + + try { + tester.runTest(); + Assert.fail("Should have thrown an exception"); + } catch (Exception e){ + Assert.assertTrue(e instanceof SparkException); + Assert.assertTrue(e.getCause() instanceof UserException.ReadMissingReadGroup); + } + } + + @Test(dataProvider = "readNameData") + public void testOpticalDuplicatesTiebrokenByPhysicalLocationNotStartPosition(final String readName1, final String readName2) { + // This tests the readname based tiebreaking code in mark duplicates. Since it's ambiguous which read should be marked + // as duplicate or not if scores match we break ties by evaluating the readname for consistencies sake. + + final ReadNameParser parser = new ReadNameParser(); + + final PhysicalLocationInt position1 = new PhysicalLocationInt(); + final PhysicalLocationInt position2 = new PhysicalLocationInt(); + + parser.addLocationInformation(readName1, position1); + parser.addLocationInformation(readName2, position2); + + final MarkDuplicatesSparkTester tester = getTester(); + tester.getSamRecordSetBuilder().setReadLength(101); + tester.setExpectedOpticalDuplicate(0); + + int compare = position1.tile - position2.tile; + if (compare == 0) { + compare = (short)position1.x - (short)position2.x; + } + + if (compare == 0) { + compare = (short)position1.y - (short)position2.y; + } + + final boolean isDuplicate = compare < 0; + + // NOTE these reads are offset slightly but should have the same unclipped start postitions + tester.addMatePair(readName1, 1,2, 46, false, false, !isDuplicate, !isDuplicate, "6S42M28S", "3S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair(readName2, 1,2, 51, false, false, isDuplicate, isDuplicate, "6S42M28S", "8S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY); + tester.runTest(); + } + + @DataProvider + public Object[][] readNameData(){ + return new Object[][]{ + {"RUNID:7:1203:2886:82292", "RUNID:7:1205:3886:16834"}, + + {"RUNID:7:1203:2886:16756", "RUNID:7:1205:3886:16756"}, + {"RUNID:7:1204:2886:16756", "RUNID:7:1205:3886:16756"}, + {"RUNID:7:1205:2886:16756", "RUNID:7:1205:3886:16756"}, + {"RUNID:7:1206:2886:16756", "RUNID:7:1205:3886:16756"}, + {"RUNID:7:1207:2886:16756", "RUNID:7:1205:3886:16756"}, + + {"RUNID:7:1203:2886:16756", "RUNID:7:1203:4886:26756"}, + {"RUNID:7:1203:3886:16756", "RUNID:7:1203:4886:26756"}, + {"RUNID:7:1203:4886:16756", "RUNID:7:1203:4886:26756"}, + {"RUNID:7:1203:5886:16756", "RUNID:7:1203:4886:26756"}, + {"RUNID:7:1203:6886:16756", "RUNID:7:1203:4886:26756"}, + + {"RUNID:7:1203:2886:34756", "RUNID:7:1203:2886:36756"}, + {"RUNID:7:1203:2886:35756", "RUNID:7:1203:2886:36756"}, + {"RUNID:7:1203:2886:37756", "RUNID:7:1203:2886:36756"}, + {"RUNID:7:1203:2886:38756", "RUNID:7:1203:2886:36756"}, + + //Added a test for tiebreaking accounting for the short casting done in picard + {"HK3T5CCXX160204:3:1112:11586:37067", "HK3T5CCXX160204:3:1112:11586:32144"} + }; + } + + @Test(dataProvider = "readNameData") + public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicates(final String readName1, final String readName2) { + // This tests the readname based tiebreaking code in mark duplicates. Since it's ambiguous which read should be marked + // as duplicate or not if scores match we break ties by evaluating the readname for consistencies sake. + + final ReadNameParser parser = new ReadNameParser(); + + final PhysicalLocationInt position1 = new PhysicalLocationInt(); + final PhysicalLocationInt position2 = new PhysicalLocationInt(); + + parser.addLocationInformation(readName1, position1); + parser.addLocationInformation(readName2, position2); + + final MarkDuplicatesSparkTester tester = getTester(); + tester.getSamRecordSetBuilder().setReadLength(101); + tester.setExpectedOpticalDuplicate(0); + + int compare = position1.tile - position2.tile; + if (compare == 0) { + compare = (short)position1.x - (short)position2.x; + } + + if (compare == 0) { + compare = (short)position1.y - (short)position2.y; + } + + final boolean isDuplicate = compare < 0; + + tester.addMatePair(readName1, 1,485253, 485253, false, false, !isDuplicate, !isDuplicate, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair(readName2, 1,485253, 485253, false, false, isDuplicate, isDuplicate, "59S42M", "42M59S", true, false, false, false, false, DEFAULT_BASE_QUALITY); + + tester.runTest(); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATKIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATKIntegrationTest.java deleted file mode 100644 index 06e8209a5f1..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATKIntegrationTest.java +++ /dev/null @@ -1,273 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.markduplicates; - -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloserUtil; -import htsjdk.samtools.util.CollectionUtil; -import htsjdk.samtools.util.IOUtil; -import htsjdk.samtools.util.TestUtil; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesTester; -import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; -import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * This class defines the individual test cases to run. The actual running of the test is done - * by MarkDuplicatesTester (see getTester). - */ -public final class MarkDuplicatesGATKIntegrationTest extends AbstractMarkDuplicatesCommandLineProgramTest { - protected static String TEST_BASE_NAME = null; - - @BeforeClass - public void setUp() { - TEST_BASE_NAME = "MarkDuplicatesGATK"; - } - - @Override - protected AbstractMarkDuplicatesTester getTester() { - return new MarkDuplicatesTester(); - } - - @DataProvider(name="strictlyBadBams") - public Object[][] strictlyBadBams() { - return new Object[][]{ - {new File(getTestDataDir(), "non_strict.bam")}, - }; - } - - @Override - protected CommandLineProgram getCommandLineProgramInstance() { - return new MarkDuplicatesGATK(); - } - - // NB: this test should return different results than MarkDuplicatesWithMateCigar - - /** - * Test that PG header records are created & chained appropriately (or not created), and that the PG record chains - * are as expected. MarkDuplicatesGATK is used both to merge and to mark dupes in this case. - * @param suppressPg If true, do not create PG header record. - * @param expectedPnVnByReadName For each read, info about the expect chain of PG records. - */ - @Test(dataProvider = "pgRecordChainingTest") - public void pgRecordChainingTest(final boolean suppressPg, - final Map> expectedPnVnByReadName) { - final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp"); - outputDir.deleteOnExit(); - try { - // Run MarkDuplicatesGATK, merging the 3 input files, and either enabling or suppressing PG header - // record creation according to suppressPg. - final MarkDuplicatesGATK markDuplicatesGATK = new MarkDuplicatesGATK(); - final ArgumentsBuilder args = new ArgumentsBuilder(); - for (int i = 1; i <= 3; ++i) { - args.add("--input"); - args.add(new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath()); - } - final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam"); - args.add("--output"); - args.add(outputSam.getAbsolutePath()); - args.add("--"+StandardArgumentDefinitions.METRICS_FILE_LONG_NAME); - args.add(new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics").getAbsolutePath()); - if (suppressPg) { - args.add("--PROGRAM_RECORD_ID"); - args.add("null"); - } - - // I generally prefer to call doWork rather than invoking the argument parser, but it is necessary - // in this case to initialize the command line. - // Note that for the unit test, version won't come through because it is obtained through jar - // manifest, and unit test doesn't run code from a jar. - markDuplicatesGATK.instanceMain(args.getArgsArray()); - - // Read the MarkDuplicatesGATK output file, and get the PG ID for each read. In this particular test, - // the PG ID should be the same for both ends of a pair. - final SamReader reader = SamReaderFactory.makeDefault().open(outputSam); - - final Map pgIdForReadName = new HashMap<>(); - for (final SAMRecord rec : reader) { - final String existingPgId = pgIdForReadName.get(rec.getReadName()); - final String thisPgId = rec.getStringAttribute(SAMTag.PG.name()); - if (existingPgId != null) { - Assert.assertEquals(thisPgId, existingPgId); - } else { - pgIdForReadName.put(rec.getReadName(), thisPgId); - } - } - final SAMFileHeader header = reader.getFileHeader(); - CloserUtil.close(reader); - - // Confirm that for each read name, the chain of PG records contains exactly the number that is expected, - // and that values in the PG chain are as expected. - for (final Map.Entry> entry : expectedPnVnByReadName.entrySet()) { - final String readName = entry.getKey(); - final List expectedList = entry.getValue(); - String pgId = pgIdForReadName.get(readName); - for (final ExpectedPnAndVn expected : expectedList) { - final SAMProgramRecord programRecord = header.getProgramRecord(pgId); - if (expected.expectedPn != null) { - Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn); - } - if (expected.expectedVn != null) { - Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn); - } - pgId = programRecord.getPreviousProgramGroupId(); - } - Assert.assertNull(pgId); - } - - } finally { - TestUtil.recursiveDelete(outputDir); - } - } - - /** - * Represents an expected PN value and VN value for a PG record. If one of thexe is null, any value is allowed - * in the PG record being tested. - */ - private static class ExpectedPnAndVn { - final String expectedPn; - final String expectedVn; - - private ExpectedPnAndVn(final String expectedPn, final String expectedVn) { - this.expectedPn = expectedPn; - this.expectedVn = expectedVn; - } - } - - @DataProvider(name = "pgRecordChainingTest") - public Object[][] pgRecordChainingTestDataProvider() { - // Two test cases: One in which PG record generation is enabled, the other in which it is turned off. - final Map> withPgMap = new HashMap<>(); - withPgMap.put("1AAXX.1.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, null), new ExpectedPnAndVn(TEST_BASE_NAME, "1"), new ExpectedPnAndVn("bwa", "1"))); - withPgMap.put("1AAXX.2.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, null), new ExpectedPnAndVn("bwa", "2"))); - withPgMap.put("1AAXX.3.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, null))); - - final Map> suppressPgMap = new HashMap<>(); - suppressPgMap .put("1AAXX.1.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, "1"), new ExpectedPnAndVn("bwa", "1"))); - suppressPgMap .put("1AAXX.2.1", Arrays.asList(new ExpectedPnAndVn("bwa", "2"))); - suppressPgMap .put("1AAXX.3.1", new ArrayList<>(0)); - return new Object[][] { - { false, withPgMap}, - { true, suppressPgMap} - }; - } - - @Test(dataProvider = "testOpticalDuplicateDetectionDataProvider") - public void testOpticalDuplicateDetection(final File sam, final File reference, final String outputExtension, final long expectedNumOpticalDuplicates) { - final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp"); - outputDir.deleteOnExit(); - final File outputSam = new File(outputDir, TEST_BASE_NAME + outputExtension); - outputSam.deleteOnExit(); - final File metricsFile = new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics"); - metricsFile.deleteOnExit(); - // Run MarkDuplicatesGATK, merging the 3 input files, and either enabling or suppressing PG header - // record creation according to suppressPg. - final MarkDuplicatesGATK markDuplicatesGATK = new MarkDuplicatesGATK(); - markDuplicatesGATK.setupOpticalDuplicateFinder(); - markDuplicatesGATK.INPUT = CollectionUtil.makeList(sam); - if (null != reference) { - markDuplicatesGATK.REFERENCE_SEQUENCE = reference; - } - markDuplicatesGATK.OUTPUT = outputSam; - markDuplicatesGATK.METRICS_FILE = metricsFile; - markDuplicatesGATK.tmpDir = outputDir.toString(); - // Needed to suppress calling CommandLineProgram.getVersion(), which doesn't work for code not in a jar - markDuplicatesGATK.PROGRAM_RECORD_ID = null; - Assert.assertEquals(markDuplicatesGATK.doWork(), null); - Assert.assertEquals(markDuplicatesGATK.numOpticalDuplicates(), expectedNumOpticalDuplicates); - } - - @Test(dataProvider = "strictlyBadBams") - public void testLenientStringency(File input) { - // TODO: This test should really be a PicardCommandLineProgramTest, not here (see #1170). - ArgumentsBuilder args = new ArgumentsBuilder(); - args.add("--"+StandardArgumentDefinitions.INPUT_SHORT_NAME); - args.add(input.getPath()); - args.add("--"+"VALIDATION_STRINGENCY"); - args.add(ValidationStringency.LENIENT); - args.add("--"+StandardArgumentDefinitions.OUTPUT_SHORT_NAME); - - File outputFile = createTempFile("markdups", ".bam"); - outputFile.delete(); - args.add(outputFile.getAbsolutePath()); - - args.add("--"+StandardArgumentDefinitions.METRICS_FILE_LONG_NAME); - File metricsFile = createTempFile("markdups_metrics", ".txt"); - args.add(metricsFile.getAbsolutePath()); - - runCommandLine(args.getArgsArray()); - // We don't care about the results, only that the program doesn't crash because of stringency. - } - - @Test(dataProvider = "strictlyBadBams", expectedExceptions = SAMFormatException.class) - public void testStrictStringency(File input) { - // TODO: This test should really be a PicardCommandLineProgramTest, not here (see #1170). - // For these inputs, we expect an expection to be thrown because the bams have some issues. - ArgumentsBuilder args = new ArgumentsBuilder(); - args.add("--"+StandardArgumentDefinitions.INPUT_SHORT_NAME); - args.add(input.getPath()); - args.add("--"+"VALIDATION_STRINGENCY"); - args.add(ValidationStringency.STRICT); - args.add("--"+StandardArgumentDefinitions.OUTPUT_SHORT_NAME); - - File outputFile = createTempFile("markdups", ".bam"); - outputFile.delete(); - args.add(outputFile.getAbsolutePath()); - - args.add("--"+StandardArgumentDefinitions.METRICS_FILE_LONG_NAME); - File metricsFile = createTempFile("markdups_metrics", ".txt"); - args.add(metricsFile.getAbsolutePath()); - - runCommandLine(args.getArgsArray()); - // We don't care about the results, only that the program it throws an exception because of stringency. - } - - @DataProvider(name="testOpticalDuplicateDetectionDataProvider") - public Object[][] testOpticalDuplicateDetectionDataProvider() { - return new Object[][] { - {new File(TEST_DATA_DIR, "optical_dupes.sam"), null, ".sam", 1L}, - {new File(TEST_DATA_DIR, "optical_dupes.cram"), new File(TEST_DATA_DIR, "optical_dupes.fasta"), ".cram", 1L}, - {new File(TEST_DATA_DIR, "optical_dupes_casava.sam"), null, ".sam", 1L} - }; - } - - @Test(dataProvider = "testDuplicateDetectionDataProvider") - public void testDuplicateDetection(final File sam, final long expectedNumOpticalDuplicates) { - final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp"); - outputDir.deleteOnExit(); - final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam"); - outputSam.deleteOnExit(); - final File metricsFile = new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics"); - metricsFile.deleteOnExit(); - // Run MarkDuplicatesGATK, merging the 3 input files, and either enabling or suppressing PG header - // record creation according to suppressPg. - final MarkDuplicatesGATK markDuplicatesGATK = new MarkDuplicatesGATK(); - markDuplicatesGATK.setupOpticalDuplicateFinder(); - markDuplicatesGATK.INPUT = CollectionUtil.makeList(sam); - markDuplicatesGATK.OUTPUT = outputSam; - markDuplicatesGATK.METRICS_FILE = metricsFile; - markDuplicatesGATK.tmpDir = outputDir.toString(); - // Needed to suppress calling CommandLineProgram.getVersion(), which doesn't work for code not in a jar - markDuplicatesGATK.PROGRAM_RECORD_ID = null; - Assert.assertEquals(markDuplicatesGATK.doWork(), null); - Assert.assertEquals(markDuplicatesGATK.numOpticalDuplicates(), expectedNumOpticalDuplicates); - } - - @DataProvider(name="testDuplicateDetectionDataProvider") - public Object[][] testDuplicateDetectionDataProvider() { - return new Object[][] { - {new File(TEST_DATA_DIR, "markDups.test2reads.bam"), 0L}, - {new File(TEST_DATA_DIR, "example.chr1.1-1K.unmarkedDups.noDups.bam"), 0L}, - {new File(TEST_DATA_DIR, "example.chr1.1-1K.markedDups.bam"), 0L}, - {new File(TEST_DATA_DIR, "example.chr1.1-1K.unmarkedDups.bam"), 0L}, - }; - } - - -} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesSparkTester.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesSparkTester.java deleted file mode 100644 index 888bf81b352..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesSparkTester.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import htsjdk.samtools.DuplicateScoringStrategy; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; -import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark; -import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester; - -/** - * A tester class for {@link MarkDuplicatesSpark}. - */ -public final class MarkDuplicatesSparkTester extends AbstractMarkDuplicatesTester { - boolean markUnmappedReads = false; - - public MarkDuplicatesSparkTester() { - this(false); - } - - public MarkDuplicatesSparkTester(boolean markUnmappedReads) { - super(DuplicateScoringStrategy.ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH); - this.markUnmappedReads = markUnmappedReads; - } - - @Override - protected CommandLineProgram getProgram() { return new MarkDuplicatesSpark(); } - - @Override - protected void addArgs() { - if (!markUnmappedReads) { - addArg("--" + MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME); - } - } -} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesTester.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesTester.java deleted file mode 100644 index f7a83b534b7..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesTester.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import htsjdk.samtools.DuplicateScoringStrategy; -import htsjdk.samtools.ValidationStringency; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.tools.walkers.markduplicates.MarkDuplicatesGATK; -import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester; - -/** - * This class is an extension of AbstractMarkDuplicatesCommandLineProgramTester used to test MarkDuplicatesGATK with SAM files generated on the fly. - * This performs the underlying tests defined by classes such as see AbstractMarkDuplicatesCommandLineProgramTest and MarkDuplicatesTest. - */ -public final class MarkDuplicatesTester extends AbstractMarkDuplicatesTester { - - public MarkDuplicatesTester() { - super(DuplicateScoringStrategy.ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH); - addArg("--VALIDATION_STRINGENCY", ValidationStringency.LENIENT.name()); - } - - @Override - protected CommandLineProgram getProgram() { return new MarkDuplicatesGATK(); } -} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java index 2795c12ea60..3793fc56b72 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java @@ -1,19 +1,15 @@ package org.broadinstitute.hellbender.utils.read.markduplicates; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils; -import org.broadinstitute.hellbender.tools.walkers.markduplicates.MarkDuplicatesGATKIntegrationTest; import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.Map; import java.util.Random; diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java index 3aa98b55ab8..e8ea98e9b6b 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java @@ -6,10 +6,10 @@ import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator; import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy; -import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import picard.sam.markduplicates.util.ReadEnds; import java.util.Collections;