diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
index bad4c131d79..6918789d45d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
@@ -22,6 +22,7 @@
import org.broadinstitute.hellbender.utils.read.markduplicates.*;
import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
+import picard.sam.markduplicates.util.ReadEnds;
import scala.Tuple2;
import java.io.Serializable;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/EstimateLibraryComplexityGATK.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/EstimateLibraryComplexityGATK.java
deleted file mode 100644
index 1e1b35f50b4..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/EstimateLibraryComplexityGATK.java
+++ /dev/null
@@ -1,558 +0,0 @@
-package org.broadinstitute.hellbender.tools.walkers.markduplicates;
-
-import htsjdk.samtools.SAMReadGroupRecord;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SamReader;
-import htsjdk.samtools.SamReaderFactory;
-import htsjdk.samtools.metrics.MetricsFile;
-import htsjdk.samtools.util.CloserUtil;
-import htsjdk.samtools.util.Histogram;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.PeekableIterator;
-import htsjdk.samtools.util.SequenceUtil;
-import htsjdk.samtools.util.SortingCollection;
-import htsjdk.samtools.util.StringUtil;
-import org.broadinstitute.barclay.argparser.Argument;
-import org.broadinstitute.barclay.argparser.BetaFeature;
-import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
-import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.utils.io.IOUtils;
-import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
-import org.broadinstitute.hellbender.exceptions.GATKException;
-import org.broadinstitute.hellbender.utils.read.markduplicates.AbstractOpticalDuplicateFinderCommandLineProgram;
-import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
-import picard.sam.markduplicates.util.OpticalDuplicateFinder;
-import org.broadinstitute.hellbender.utils.runtime.ProgressLogger;
-import picard.sam.util.PhysicalLocation;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.util.*;
-
-import static java.lang.Math.pow;
-
-/**
- * Estimate library complexity from the sequence of read pairs.
- * <p>
- * The estimation is done by sorting all reads by the first N bases (defined by --min-identical-bases, 5 by default)
- * of each read and then comparing reads whose first N bases are identical to each other for duplicates.
- * Reads are considered to be duplicates if they match each other with no gaps and an overall mismatch
- * rate less than or equal to MAX_DIFF_RATE (0.03 by default). The approach differs from the one taken by
- * Picard MarkDuplicates to estimate library complexity in that here alignment is not a factor.
- * <p>
- * Reads of poor quality are filtered out in order to provide a more accurate estimate. The filtering
- * removes reads with any no-calls in the first N bases, or with a mean base quality lower than
- * MIN_MEAN_QUALITY across either the first or second read. Unpaired reads are ignored in this computation.
- * <p>
- * The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes
- * them from the calculation of library size. Also, since there is no alignment with which to screen out technical
- * reads, one further filter is applied to the data. After examining all reads, a histogram is built of
- * [#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are
- * then removed from the histogram as outliers before library size is estimated.
- *
- * <h4>Input</h4>
- * A BAM or CRAM file containing aligned read data.
- *
- * <h4>Output</h4>
- * A text file with per-library complexity metrics.
- *
- * <h4>Usage Example</h4>
- * <pre>
- * gatk EstimateLibraryComplexityGATK \
- *     -I input.bam \
- *     -O metrics.txt
- * </pre>
- *
- * @author Tim Fennell
- */
-@BetaFeature
-@DocumentedFeature
-@CommandLineProgramProperties(
- summary = "Estimate library complexity from the sequence of read pairs",
- oneLineSummary = "Estimate library complexity from the sequence of read pairs",
- programGroup = DiagnosticsAndQCProgramGroup.class
-)
-public final class EstimateLibraryComplexityGATK extends AbstractOpticalDuplicateFinderCommandLineProgram {
-
- @Argument(
- fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
- shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
- doc = "One or more files to combine and " +
- "estimate library complexity from. Reads can be mapped or unmapped."
- )
- public List<File> INPUT;
-
- @Argument(
- fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
- shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
- doc = "Output file to writes per-library metrics to."
- )
- public File OUTPUT;
-
- @Argument(
- fullName = "min-identical-bases",
- doc = "The minimum number of bases at the starts of reads that must be identical for reads to " +
- "be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will " +
- "be compared at a time, so lower numbers will produce more accurate results but consume " +
- "exponentially more memory and CPU."
- )
- public int MIN_IDENTICAL_BASES = 5;
-
- @Argument(
- fullName = "max-diff-rate",
- doc = "The maximum rate of differences between two reads to call them identical."
- )
- public double MAX_DIFF_RATE = 0.03;
-
- @Argument(
- fullName = "min-mean-quality",
- doc = "The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with " +
- "lower average quality are filtered out and not considered in any calculations."
- )
- public int MIN_MEAN_QUALITY = 20;
-
- @Argument(
- fullName = "max-group-ratio",
- doc = "Do not process self-similar groups that are this many times over the mean expected group size. " +
- "I.e. if the input contains 10m read pairs and MIN_IDENTICAL_BASES is set to 5, then the mean expected " +
- "group size would be approximately 10 reads."
- )
- public int MAX_GROUP_RATIO = 500;
-
- /**
- * Little class to hold the sequence of a pair of reads and tile location information.
- */
- static class PairedReadSequence implements PhysicalLocation {
- static int size_in_bytes = 2 + 1 + 4 + 1 + 300; // rough guess at memory footprint
- short readGroup = -1;
- short tile = -1;
- int x = -1, y = -1;
- boolean qualityOk = true;
- byte[] read1;
- byte[] read2;
- short libraryId;
-
- @Override
- public short getReadGroup() { return this.readGroup; }
-
- @Override
- public void setReadGroup(final short readGroup) { this.readGroup = readGroup; }
-
- @Override
- public short getTile() { return this.tile; }
-
- @Override
- public void setTile(final short tile) { this.tile = tile; }
-
- @Override
- public int getX() { return this.x; }
-
- @Override
- public void setX(final int x) { this.x = x; }
-
- @Override
- public int getY() { return this.y; }
-
- @Override
- public void setY(final int y) { this.y = y; }
-
- @Override
- public short getLibraryId() { return this.libraryId; }
-
- @Override
- public void setLibraryId(final short libraryId) { this.libraryId = libraryId; }
- }
-
- /**
- * Codec class for writing and reading PairedReadSequence objects.
- */
- static class PairedReadCodec implements SortingCollection.Codec<PairedReadSequence> {
- private DataOutputStream out;
- private DataInputStream in;
-
- @Override
- public void setOutputStream(final OutputStream out) {
- this.out = new DataOutputStream(out);
- }
-
- @Override
- public void setInputStream(final InputStream in) {
- this.in = new DataInputStream(in);
- }
-
- @Override
- public void encode(final PairedReadSequence val) {
- try {
- this.out.writeShort(val.readGroup);
- this.out.writeShort(val.tile);
- this.out.writeInt(val.x);
- this.out.writeInt(val.y);
- this.out.writeInt(val.read1.length);
- this.out.write(val.read1);
- this.out.writeInt(val.read2.length);
- this.out.write(val.read2);
- } catch (IOException ioe) {
- throw new GATKException("Error write out read pair.", ioe);
- }
- }
-
- @Override
- public PairedReadSequence decode() {
- try {
- final PairedReadSequence val = new PairedReadSequence();
- try {
- val.readGroup = this.in.readShort();
- } catch (EOFException eof) {
- return null;
- }
-
- val.tile = this.in.readShort();
- val.x = this.in.readInt();
- val.y = this.in.readInt();
-
- int length = this.in.readInt();
- val.read1 = new byte[length];
- if (this.in.read(val.read1) != length) {
- throw new GATKException("Could not read " + length + " bytes from temporary file.");
- }
-
- length = this.in.readInt();
- val.read2 = new byte[length];
- if (this.in.read(val.read2) != length) {
- throw new GATKException("Could not read " + length + " bytes from temporary file.");
- }
-
- return val;
- } catch (IOException ioe) {
- throw new GATKException("Exception reading read pair.", ioe);
- }
- }
-
- /**
- * For an explanation of why codecs must implement clone(),
- * see the HTSJDK documentation for {@link htsjdk.samtools.util.SortingCollection.Codec}.
- */
- @Override
- public PairedReadCodec clone() { return new PairedReadCodec(); }
- }
-
- /**
- * Comparator that orders read pairs on the first N bases of both reads.
- */
- class PairedReadComparator implements Comparator<PairedReadSequence>, Serializable {
- private static final long serialVersionUID = 7452449563074722818L;
-
- final int BASES = EstimateLibraryComplexityGATK.this.MIN_IDENTICAL_BASES;
-
- @Override
- public int compare(final PairedReadSequence lhs, final PairedReadSequence rhs) {
- // First compare the first N bases of the first read
- for (int i = 0; i < BASES; ++i) {
- final int retval = lhs.read1[i] - rhs.read1[i];
- if (retval != 0) return retval;
- }
-
- // Then compare the first N bases of the second read
- for (int i = 0; i < BASES; ++i) {
- final int retval = lhs.read2[i] - rhs.read2[i];
- if (retval != 0) return retval;
- }
-
- return System.identityHashCode(lhs) - System.identityHashCode(rhs);
- }
- }
-
- public EstimateLibraryComplexityGATK() {
- MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / PairedReadSequence.size_in_bytes) / 2;
- }
-
- /**
- * Method that does most of the work. Reads through the input BAM file and extracts the
- * read sequences of each read pair and sorts them via a SortingCollection. Then traverses
- * the sorted reads and looks at small groups at a time to find duplicates.
- */
- @Override
- protected Object doWork() {
- for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
-
- logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
-
- final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
- final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstanceFromPaths(PairedReadSequence.class,
- new PairedReadCodec(),
- new PairedReadComparator(),
- MAX_RECORDS_IN_RAM,
- Collections.singletonList(IOUtils.getPath(tmpDir)));
-
- // Loop through the input files and pick out the read sequences etc.
- final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
- for (final File f : INPUT) {
- final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
- final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
- readGroups.addAll(in.getFileHeader().getReadGroups());
-
- for (final SAMRecord rec : in) {
- if (!rec.getReadPairedFlag()) continue;
- if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
- continue;
- }
-
- PairedReadSequence prs = pendingByName.remove(rec.getReadName());
- if (prs == null) {
- // Make a new paired read object and add RG and physical location information to it
- prs = new PairedReadSequence();
- if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
- final SAMReadGroupRecord rg = rec.getReadGroup();
- if (rg != null) prs.setReadGroup((short) readGroups.indexOf(rg));
- }
-
- pendingByName.put(rec.getReadName(), prs);
- }
-
- // Read passes quality check if both ends meet the mean quality criteria
- final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(),
- rec.getBaseQualities(),
- MIN_IDENTICAL_BASES,
- MIN_MEAN_QUALITY);
- prs.qualityOk = prs.qualityOk && passesQualityCheck;
-
- // Get the bases and restore them to their original orientation if necessary
- final byte[] bases = rec.getReadBases();
- if (rec.getReadNegativeStrandFlag()) SequenceUtil.reverseComplement(bases);
-
- if (rec.getFirstOfPairFlag()) {
- prs.read1 = bases;
- } else {
- prs.read2 = bases;
- }
-
- if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
- sorter.add(prs);
- }
-
- progress.record(rec);
- }
- CloserUtil.close(in);
- }
-
- logger.info("Finished reading - moving on to scanning for duplicates.");
-
- // Now go through the sorted reads and attempt to find duplicates
- try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
-
- final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
- final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
-
- int groupsProcessed = 0;
- long lastLogTime = System.currentTimeMillis();
- final int meanGroupSize = (int) Math.max(1, (progress.getCount() / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
-
- while (iterator.hasNext()) {
- // Get the next group and split it apart by library
- final List<PairedReadSequence> group = getNextGroup(iterator);
-
- if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
- final PairedReadSequence prs = group.get(0);
- logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " +
- "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " +
- StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) +
- " / " +
- StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
- } else {
- final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
-
- // Now process the reads by library
- for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
- final String library = entry.getKey();
- final List<PairedReadSequence> seqs = entry.getValue();
-
- Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
- Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
- if (duplicationHisto == null) {
- duplicationHisto = new Histogram<>("duplication_group_count", library);
- opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
- duplicationHistosByLibrary.put(library, duplicationHisto);
- opticalHistosByLibrary.put(library, opticalHisto);
- }
-
- // Figure out if any reads within this group are duplicates of one another
- for (int i = 0; i < seqs.size(); ++i) {
- final PairedReadSequence lhs = seqs.get(i);
- if (lhs == null) continue;
- final List<PairedReadSequence> dupes = new ArrayList<>();
-
- for (int j = i + 1; j < seqs.size(); ++j) {
- final PairedReadSequence rhs = seqs.get(j);
- if (rhs == null) continue;
-
- if (matches(lhs, rhs, MAX_DIFF_RATE)) {
- dupes.add(rhs);
- seqs.set(j, null);
- }
- }
-
- if (!dupes.isEmpty()) {
- dupes.add(lhs);
- final int duplicateCount = dupes.size();
- duplicationHisto.increment(duplicateCount);
-
- final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes, null);
- for (final boolean b : flags) {
- if (b) opticalHisto.increment(duplicateCount);
- }
- } else {
- duplicationHisto.increment(1);
- }
- }
- }
-
- ++groupsProcessed;
- if (lastLogTime < System.currentTimeMillis() - 60000) {
- logger.info("Processed " + groupsProcessed + " groups.");
- lastLogTime = System.currentTimeMillis();
- }
- }
- }
- sorter.cleanup();
-
- final MetricsFile<GATKDuplicationMetrics, Integer> file = getMetricsFile();
- for (final String library : duplicationHistosByLibrary.keySet()) {
- final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
- final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
- final GATKDuplicationMetrics metrics = new GATKDuplicationMetrics();
- metrics.LIBRARY = library;
-
- // Filter out any bins that have only a single entry in them and calculate the metrics
- for (final Integer bin : duplicationHisto.keySet()) {
- final double duplicateGroups = duplicationHisto.get(bin).getValue();
- final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
-
- if (duplicateGroups > 1) {
- metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
- metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
- metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
- }
- }
-
- metrics.calculateDerivedFields();
- file.addMetric(metrics);
- file.addHistogram(duplicationHisto);
-
- }
-
- file.write(OUTPUT);
- }
- return null;
- }
-
- /**
- * Checks to see if two read pairs have sequences that are the same, give or take a few
- * errors/diffs as dictated by the maxDiffRate.
- */
- private boolean matches(final PairedReadSequence lhs, final PairedReadSequence rhs, final double maxDiffRate) {
- final int read1Length = Math.min(lhs.read1.length, rhs.read1.length);
- final int read2Length = Math.min(lhs.read2.length, rhs.read2.length);
- final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate);
- int errors = 0;
-
- // The loop can start from MIN_IDENTICAL_BASES because we've already confirmed that
- // at least those first few bases are identical when sorting.
- for (int i = MIN_IDENTICAL_BASES; i < read1Length; ++i) {
- if (lhs.read1[i] != rhs.read1[i]) {
- if (++errors > maxErrors) return false;
- }
- }
-
- for (int i = MIN_IDENTICAL_BASES; i < read2Length; ++i) {
- if (lhs.read2[i] != rhs.read2[i]) {
- if (++errors > maxErrors) return false;
- }
- }
-
- return true;
- }
-
- /**
- * Pulls out of the iterator the next group of reads that can be compared to each other to
- * identify duplicates.
- */
- List<PairedReadSequence> getNextGroup(final PeekableIterator<PairedReadSequence> iterator) {
- final List group = new ArrayList<>();
- final PairedReadSequence first = iterator.next();
- group.add(first);
-
- outer:
- while (iterator.hasNext()) {
- final PairedReadSequence next = iterator.peek();
- for (int i = 0; i < MIN_IDENTICAL_BASES; ++i) {
- if (first.read1[i] != next.read1[i] || first.read2[i] != next.read2[i]) break outer;
- }
-
- group.add(iterator.next());
-
- }
-
- return group;
- }
-
- /**
- * Takes a list of PairedReadSequence objects and splits them into lists by library.
- */
- Map<String, List<PairedReadSequence>> splitByLibrary(final List<PairedReadSequence> input,
- final List<SAMReadGroupRecord> rgs) {
-
- final Map<String, List<PairedReadSequence>> out = new HashMap<>();
- for (final PairedReadSequence seq : input) {
- String library = null;
- if (seq.getReadGroup() != -1) {
- library = rgs.get(seq.getReadGroup()).getLibrary();
- if (library == null) library = "Unknown";
- } else {
- library = "Unknown";
- }
-
- List<PairedReadSequence> librarySeqs = out.get(library);
- if (librarySeqs == null) {
- librarySeqs = new ArrayList<>();
- out.put(library, librarySeqs);
- }
- librarySeqs.add(seq);
- }
-
- return out;
- }
-
- /**
- * Checks that the average quality over the entire read is >= min, and that the first N bases do
- * not contain any no-calls.
- */
- boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int seedLength, final int minQuality) {
- if (bases.length < seedLength) return false;
-
- for (int i = 0; i < seedLength; ++i) {
- if (SequenceUtil.isNoCall(bases[i])) return false;
- }
-
- int total = 0;
- for (final byte b : quals) total += b;
- return total / quals.length >= minQuality;
- }
-}
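
For reviewers tracing the deleted logic above: the following is a minimal, self-contained sketch of the
mismatch-rate test that decided whether two read pairs were duplicates. The class and method names are
hypothetical and the byte-array plumbing is simplified relative to the deleted PairedReadSequence code;
only the counting rule (mismatches after the first N sort-guaranteed-identical bases, capped at
floor((len1 + len2) * maxDiffRate)) mirrors the removed matches() method.

    // Hypothetical standalone illustration of the removed duplicate test.
    public final class PairMatchSketch {
        static final int MIN_IDENTICAL_BASES = 5;   // mirrors --min-identical-bases

        static boolean matches(final byte[] lhs1, final byte[] lhs2,
                               final byte[] rhs1, final byte[] rhs2, final double maxDiffRate) {
            final int len1 = Math.min(lhs1.length, rhs1.length);
            final int len2 = Math.min(lhs2.length, rhs2.length);
            final int maxErrors = (int) Math.floor((len1 + len2) * maxDiffRate);
            int errors = 0;
            // The first N bases are already known to be identical from the sort order.
            for (int i = MIN_IDENTICAL_BASES; i < len1; ++i) {
                if (lhs1[i] != rhs1[i] && ++errors > maxErrors) return false;
            }
            for (int i = MIN_IDENTICAL_BASES; i < len2; ++i) {
                if (lhs2[i] != rhs2[i] && ++errors > maxErrors) return false;
            }
            return true;
        }

        public static void main(String[] args) {
            final byte[] a = "ACGTAACGTACGTACGT".getBytes();
            final byte[] b = "ACGTAACGTACGTACGA".getBytes();  // one mismatch in the tail
            System.out.println(matches(a, a, a, b, 0.03));     // true: 1 error <= floor(34 * 0.03) = 1
        }
    }
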
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATK.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATK.java
deleted file mode 100644
index c05285db0b4..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATK.java
+++ /dev/null
@@ -1,492 +0,0 @@
-package org.broadinstitute.hellbender.tools.walkers.markduplicates;
-
-import com.google.common.annotations.VisibleForTesting;
-import htsjdk.samtools.*;
-import htsjdk.samtools.util.CloseableIterator;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.SortingCollection;
-import htsjdk.samtools.util.SortingLongCollection;
-import org.broadinstitute.barclay.argparser.Argument;
-import org.broadinstitute.barclay.argparser.ExperimentalFeature;
-import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
-import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.hellbender.utils.io.IOUtils;
-import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
-import org.broadinstitute.hellbender.utils.read.ReadUtils;
-import org.broadinstitute.hellbender.utils.read.markduplicates.*;
-import org.broadinstitute.hellbender.utils.runtime.ProgressLogger;
-
-import java.io.Serializable;
-import java.util.*;
-
-/**
- * A better duplication marking algorithm that handles all cases including clipped
- * and gapped alignments.
- *
- * @author Tim Fennell
- */
-@ExperimentalFeature
-@DocumentedFeature
-@CommandLineProgramProperties(
- summary = "(Experimental) Examines aligned records in the supplied SAM/BAM/CRAM file to locate duplicate molecules. " +
- "All records are then written to the output file with the duplicate records flagged.",
- oneLineSummary = "Examines aligned records in the supplied SAM/BAM/CRAM file to locate duplicate molecules.",
- programGroup = ReadDataManipulationProgramGroup.class
-)
-public final class MarkDuplicatesGATK extends AbstractMarkDuplicatesCommandLineProgram {
- /**
- * If more than this many sequences in SAM file, don't spill to disk because there will not
- * be enough file handles.
- */
-
- @Argument(shortName = "MAX_SEQS",
- doc = "This option is obsolete. ReadEnds will always be spilled to disk.")
- public int MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = 50000;
-
- @Argument(shortName = "MAX_FILE_HANDLES",
- doc = "Maximum number of file handles to keep open when spilling read ends to disk. " +
- "Set this number a little lower than the per-process maximum number of file that may be open. " +
- "This number can be found by executing the 'ulimit -n' command on a Unix system.")
- public int MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = 8000;
-
- @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by " +
- "some of the sorting collections. If you are running out of memory, try reducing this number.")
- public double SORTING_COLLECTION_SIZE_RATIO = 0.25;
-
- @Argument(doc = "Report Memory Stats at various times during the run")
- public boolean reportMemoryStats = false;
-
-
- private SortingCollection<ReadEndsForMarkDuplicates> pairSort;
- private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
- private SortingLongCollection duplicateIndexes;
- private int numDuplicateIndices = 0;
-
- private LibraryIdGenerator libraryIdGenerator = null; // this is initialized in buildSortedReadEndLists
-
- public MarkDuplicatesGATK() {
- DUPLICATE_SCORING_STRATEGY = DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES;
- }
-
- /**
- * Main work method. Reads the BAM file once and collects sorted information about
- * the 5' ends of both ends of each read pair (or of the single end, for unpaired reads).
- * Then makes a pass through those determining duplicates before re-reading the
- * input file and writing it out with duplication flags set correctly.
- */
- @Override
- protected Object doWork() {
- IOUtil.assertFilesAreReadable(INPUT);
- IOUtil.assertFileIsWritable(OUTPUT);
- IOUtil.assertFileIsWritable(METRICS_FILE);
-
- reportMemoryStats("Start of doWork");
- logger.info("Reading input file and constructing read end information.");
- buildSortedReadEndLists();
- reportMemoryStats("After buildSortedReadEndLists");
- generateDuplicateIndexes();
- reportMemoryStats("After generateDuplicateIndexes");
- logger.info("Marking " + this.numDuplicateIndices + " records as duplicates.");
-
- if (this.opticalDuplicatesArgumentCollection.READ_NAME_REGEX == null) {
- logger.warn("Skipped optical duplicate cluster discovery; library size estimation may be inaccurate!");
- } else {
- logger.info("Found " + (this.libraryIdGenerator.getNumberOfOpticalDuplicateClusters()) + " optical duplicate clusters.");
- }
-
- try( final SamHeaderAndIterator headerAndIterator = openInputs()) {
- final SAMFileHeader header = headerAndIterator.header;
-
- final SAMFileHeader outputHeader = ReadUtils.cloneSAMFileHeader(header);
- outputHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
- for (final String comment : COMMENT) outputHeader.addComment(comment);
-
- // Key: previous PG ID on a SAM Record (or null). Value: New PG ID to replace it.
- final Map<String, String> chainedPgIds = getChainedPgIds(outputHeader);
-
- try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outputHeader, true)) {
-
- // Now copy over the file while marking all the necessary indexes as duplicates
- long recordInFileIndex = 0;
- long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : -1);
-
- final ProgressLogger progress = new ProgressLogger(logger, (int) 1e7, "Written");
- try (final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator) {
- while (iterator.hasNext()) {
- final SAMRecord rec = iterator.next();
- final String library = LibraryIdGenerator.getLibraryName(header, rec);
- GATKDuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library);
- if (metrics == null) {
- metrics = new GATKDuplicationMetrics();
- metrics.LIBRARY = library;
- libraryIdGenerator.addMetricsByLibrary(library, metrics);
- }
-
- if (recordInFileIndex == nextDuplicateIndex) {
- rec.setDuplicateReadFlag(true);
- // Now try and figure out the next duplicate index
- if (this.duplicateIndexes.hasNext()) {
- nextDuplicateIndex = this.duplicateIndexes.next();
- } else {
- // Only happens once we've marked all the duplicates
- nextDuplicateIndex = -1;
- }
- } else {
- rec.setDuplicateReadFlag(false);
- }
-
- metrics.updateMetrics(rec);
-
- recordInFileIndex++;
-
- if (!this.REMOVE_DUPLICATES || !rec.getDuplicateReadFlag()) {
- if (PROGRAM_RECORD_ID != null) {
- rec.setAttribute(SAMTag.PG.name(), chainedPgIds.get(rec.getStringAttribute(SAMTag.PG.name())));
- }
- out.addAlignment(rec);
- progress.record(rec);
- }
- }
- }
- this.duplicateIndexes.cleanup();
-
- reportMemoryStats("Before output close");
- }
- reportMemoryStats("After output close");
- }
-
- // Write out the metrics
- finalizeAndWriteMetrics(libraryIdGenerator);
-
- return null;
- }
-
- @VisibleForTesting
- long numOpticalDuplicates() { return ((long) this.libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap().getSumOfValues()); } // cast as long due to returning a double
-
- /** Print out some quick JVM memory stats. */
- private void reportMemoryStats(final String stage) {
- if(reportMemoryStats) {
- System.gc();
- final Runtime runtime = Runtime.getRuntime();
- logger.info(stage + " freeMemory: " + runtime.freeMemory() + "; totalMemory: " + runtime.totalMemory() +
- "; maxMemory: " + runtime.maxMemory());
- }
- }
-
- /**
- * Goes through all the records in a file and generates a set of ReadEndsForMarkDuplicates objects that
- * hold the necessary information (reference sequence, 5' read coordinate) to do
- * duplicate marking, caching to disk as necessary to sort them.
- */
- private void buildSortedReadEndLists() {
- final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / ReadEndsForMarkDuplicates.SIZE_OF);
- logger.info("Will retain up to " + maxInMemory + " data points before spilling to disk.");
-
- this.pairSort = SortingCollection.newInstanceFromPaths(ReadEndsForMarkDuplicates.class,
- new ReadEndsForMarkDuplicatesCodec(),
- new ReadEndsMDComparator(),
- maxInMemory,
- Collections.singletonList(IOUtils.getPath(tmpDir)));
-
- this.fragSort = SortingCollection.newInstanceFromPaths(ReadEndsForMarkDuplicates.class,
- new ReadEndsForMarkDuplicatesCodec(),
- new ReadEndsMDComparator(),
- maxInMemory,
- Collections.singletonList(IOUtils.getPath(tmpDir)));
-
- try(final SamHeaderAndIterator headerAndIterator = openInputs()) {
- final SAMFileHeader header = headerAndIterator.header;
- final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP);
- long index = 0;
- final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
- final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
-
- if (null == this.libraryIdGenerator) {
- this.libraryIdGenerator = new LibraryIdGenerator(header);
- }
-
- while (iterator.hasNext()) {
- final SAMRecord rec = iterator.next();
-
- // This doesn't have anything to do with building sorted ReadEnd lists, but it can be done in the same pass
- // over the input
- if (PROGRAM_RECORD_ID != null) {
- // Gather all PG IDs seen in merged input files in first pass. These are gathered for two reasons:
- // - to know how many different PG records to create to represent this program invocation.
- // - to know what PG IDs are already used to avoid collisions when creating new ones.
- // Note that if there are one or more records that do not have a PG tag, then a null value
- // will be stored in this set.
- pgIdsSeen.add(rec.getStringAttribute(SAMTag.PG.name()));
- }
-
- if (rec.getReadUnmappedFlag()) {
- if (rec.getReferenceIndex() == -1) {
- // When we hit the unmapped reads with no coordinate, no reason to continue.
- break;
- }
- // If this read is unmapped but sorted with the mapped reads, just skip it.
- } else if (!rec.isSecondaryOrSupplementary()) {
- final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, index, rec);
- this.fragSort.add(fragmentEnd);
-
- if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
- final String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName();
- ReadEndsForMarkDuplicates pairedEnds = tmp.remove(rec.getReferenceIndex(), key);
-
- // See if we've already seen the first end or not
- if (pairedEnds == null) {
- pairedEnds = buildReadEnds(header, index, rec);
- tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds);
- } else {
- final int sequence = fragmentEnd.read1ReferenceIndex;
- final int coordinate = fragmentEnd.read1Coordinate;
-
- // Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
- // before updating the orientation later.
- if (rec.getFirstOfPairFlag()) {
- pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
- } else {
- pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
- }
-
- // If the second read is actually later, just add the second read data, else flip the reads
- if (sequence > pairedEnds.read1ReferenceIndex ||
- (sequence == pairedEnds.read1ReferenceIndex && coordinate >= pairedEnds.read1Coordinate)) {
- pairedEnds.read2ReferenceIndex = sequence;
- pairedEnds.read2Coordinate = coordinate;
- pairedEnds.read2IndexInFile = index;
- pairedEnds.orientation = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R,
- rec.getReadNegativeStrandFlag());
- } else {
- pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
- pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
- pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
- pairedEnds.read1ReferenceIndex = sequence;
- pairedEnds.read1Coordinate = coordinate;
- pairedEnds.read1IndexInFile = index;
- pairedEnds.orientation = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(),
- pairedEnds.orientation == ReadEnds.R);
- }
-
- pairedEnds.score += DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);
- this.pairSort.add(pairedEnds);
- }
- }
- }
-
- // Print out some stats every 1m reads
- ++index;
- if (progress.record(rec)) {
- logger.info("Tracking " + tmp.size() + " as yet unmatched pairs. " + tmp.sizeInRam() + " records in RAM.");
- }
- }
-
- logger.info("Read " + index + " records. " + tmp.size() + " pairs never matched.");
- iterator.close();
- }
-
- // Tell these collections to free up memory if possible.
- this.pairSort.doneAdding();
- this.fragSort.doneAdding();
- }
-
- /** Builds a read ends object that represents a single read. */
- private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec) {
- final ReadEndsForMarkDuplicates ends = new ReadEndsForMarkDuplicates();
- ends.read1ReferenceIndex = rec.getReferenceIndex();
- ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart();
- ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F;
- ends.read1IndexInFile = index;
- ends.score = DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);
-
- // Doing this lets the ends object know that it's part of a pair
- if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
- ends.read2ReferenceIndex = rec.getMateReferenceIndex();
- }
-
- // Fill in the library ID
- ends.libraryId = libraryIdGenerator.getLibraryId(rec);
-
- // Fill in the location information for optical duplicates
- if (this.opticalDuplicateFinder.addLocationInformation(rec.getReadName(), ends)) {
- // calculate the RG number (nth in list)
- ends.readGroup = 0;
- final String rg = (String) rec.getAttribute("RG");
- final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
-
- if (rg != null && readGroups != null) {
- for (final SAMReadGroupRecord readGroup : readGroups) {
- if (readGroup.getReadGroupId().equals(rg)) break;
- else ends.readGroup++;
- }
- }
- }
-
- return ends;
- }
-
- /**
- * Goes through the accumulated ReadEndsForMarkDuplicates objects and determines which of them are
- * to be marked as duplicates.
- *
- * The ordered list of duplicate indexes into the source file is accumulated in duplicateIndexes.
- */
- private void generateDuplicateIndexes() {
- // Keep this number from getting too large even if there is a huge heap.
- final int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF,
- (double) (Integer.MAX_VALUE - 5));
- logger.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
- this.duplicateIndexes = new SortingLongCollection(maxInMemory, IOUtils.getPath(tmpDir));
-
- ReadEndsForMarkDuplicates firstOfNextChunk = null;
- final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<>(200);
-
- // First just do the pairs
- logger.info("Traversing read pair information and detecting duplicates.");
- for (final ReadEndsForMarkDuplicates next : this.pairSort) {
- if (firstOfNextChunk == null) {
- firstOfNextChunk = next;
- nextChunk.add(firstOfNextChunk);
- } else if (areComparableForDuplicates(firstOfNextChunk, next, true)) {
- nextChunk.add(next);
- } else {
- if (nextChunk.size() > 1) {
- markDuplicatePairs(nextChunk);
- }
-
- nextChunk.clear();
- nextChunk.add(next);
- firstOfNextChunk = next;
- }
- }
- if (nextChunk.size() > 1) markDuplicatePairs(nextChunk);
- this.pairSort.cleanup();
- this.pairSort = null;
-
- // Now deal with the fragments
- logger.info("Traversing fragment information and detecting duplicates.");
- boolean containsPairs = false;
- boolean containsFrags = false;
-
- for (final ReadEndsForMarkDuplicates next : this.fragSort) {
- if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) {
- nextChunk.add(next);
- containsPairs = containsPairs || next.isPaired();
- containsFrags = containsFrags || !next.isPaired();
- } else {
- if (nextChunk.size() > 1 && containsFrags) {
- markDuplicateFragments(nextChunk, containsPairs);
- }
-
- nextChunk.clear();
- nextChunk.add(next);
- firstOfNextChunk = next;
- containsPairs = next.isPaired();
- containsFrags = !next.isPaired();
- }
- }
- markDuplicateFragments(nextChunk, containsPairs);
- this.fragSort.cleanup();
- this.fragSort = null;
-
- logger.info("Sorting list of duplicate records.");
- this.duplicateIndexes.doneAddingStartIteration();
- }
-
- private static boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2) {
- boolean retval = lhs.libraryId == rhs.libraryId &&
- lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
- lhs.read1Coordinate == rhs.read1Coordinate &&
- lhs.orientation == rhs.orientation;
-
- if (retval && compareRead2) {
- retval = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
- lhs.read2Coordinate == rhs.read2Coordinate;
- }
-
- return retval;
- }
-
- private void addIndexAsDuplicate(final long bamIndex) {
- this.duplicateIndexes.add(bamIndex);
- ++this.numDuplicateIndices;
- }
-
- /**
- * Takes a list of ReadEndsForMarkDuplicates objects and marks as duplicates all but the
- * single best-scoring entry. This assumes that the list contains objects representing pairs.
- *
- * @param list the list of read ends for a single duplicate set
- */
- private void markDuplicatePairs(final List<ReadEndsForMarkDuplicates> list) {
- short maxScore = 0;
- ReadEndsForMarkDuplicates best = null;
-
- // All read ends should have orientation FF, FR, RF, or RR
- for (final ReadEndsForMarkDuplicates end : list) {
- if (end.score > maxScore || best == null) {
- maxScore = end.score;
- best = end;
- }
- }
-
- for (final ReadEndsForMarkDuplicates end : list) {
- if (end != best) {
- addIndexAsDuplicate(end.read1IndexInFile);
- addIndexAsDuplicate(end.read2IndexInFile);
- }
- }
-
- if (this.opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null) {
- AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(list, best, opticalDuplicateFinder, libraryIdGenerator);
- }
- }
-
- /**
- * Takes a list of ReadEndsForMarkDuplicates objects and marks the appropriate ones as duplicates.
- * This will set the duplicate index only for list items that are fragments.
- *
- * @param list the list of read ends for a single duplicate set
- * @param containsPairs true if the list also contains objects containing pairs, false otherwise.
- */
- private void markDuplicateFragments(final List<ReadEndsForMarkDuplicates> list, final boolean containsPairs) {
- if (containsPairs) {
- for (final ReadEndsForMarkDuplicates end : list) {
- if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile);
- }
- } else {
- short maxScore = 0;
- ReadEndsForMarkDuplicates best = null;
- for (final ReadEndsForMarkDuplicates end : list) {
- if (end.score > maxScore || best == null) {
- maxScore = end.score;
- best = end;
- }
- }
-
- for (final ReadEndsForMarkDuplicates end : list) {
- if (end != best) {
- addIndexAsDuplicate(end.read1IndexInFile);
- }
- }
- }
- }
-
- /** Comparator for ReadEndsForMarkDuplicates that orders by read1 position then pair orientation then read2 position. */
- static class ReadEndsMDComparator implements Comparator<ReadEndsForMarkDuplicates>, Serializable {
- private static final long serialVersionUID = 1L;
- @Override
- public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs) {
- int retval = lhs.libraryId - rhs.libraryId;
- if (retval == 0) retval = lhs.read1ReferenceIndex - rhs.read1ReferenceIndex;
- if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate;
- if (retval == 0) retval = lhs.orientation - rhs.orientation;
- if (retval == 0) retval = lhs.read2ReferenceIndex - rhs.read2ReferenceIndex;
- if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate;
- if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile);
- if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile);
- return retval;
- }
- }
-}
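
The heart of the deleted generateDuplicateIndexes() is a single sweep over sorted read ends: gather a run
of ends that compare equal on their positional key, keep the best-scoring one, and record the rest as
duplicates. A simplified, hypothetical sketch of that sweep follows (a single String key stands in for the
library/reference/coordinate/orientation comparison of areComparableForDuplicates; assumes Java 16+ records):

    import java.util.ArrayList;
    import java.util.List;

    public final class DuplicateChunkSketch {
        record End(String key, short score, long indexInFile) {}

        // Returns the file indexes of records to flag, given ends pre-sorted by key.
        static List<Long> duplicateIndices(final List<End> sorted) {
            final List<Long> dups = new ArrayList<>();
            final List<End> chunk = new ArrayList<>();
            for (final End next : sorted) {
                if (!chunk.isEmpty() && !chunk.get(0).key().equals(next.key())) {
                    markAllButBest(chunk, dups);    // key changed: close out the previous run
                    chunk.clear();
                }
                chunk.add(next);
            }
            markAllButBest(chunk, dups);            // close out the final run
            return dups;
        }

        static void markAllButBest(final List<End> chunk, final List<Long> dups) {
            if (chunk.size() < 2) return;           // singletons are not duplicates
            End best = chunk.get(0);
            for (final End e : chunk) if (e.score() > best.score()) best = e;
            for (final End e : chunk) if (e != best) dups.add(e.indexInFile());
        }
    }
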
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractMarkDuplicatesCommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractMarkDuplicatesCommandLineProgram.java
deleted file mode 100644
index 65192d3b6b1..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractMarkDuplicatesCommandLineProgram.java
+++ /dev/null
@@ -1,302 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import htsjdk.samtools.*;
-import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
-import htsjdk.samtools.metrics.MetricsFile;
-import htsjdk.samtools.util.CloseableIterator;
-import htsjdk.samtools.util.CloserUtil;
-import htsjdk.samtools.util.Histogram;
-import org.broadinstitute.barclay.argparser.Argument;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.utils.Utils;
-import picard.sam.markduplicates.util.OpticalDuplicateFinder;
-import picard.sam.util.PhysicalLocation;
-
-import java.io.File;
-import java.util.*;
-
-/**
- * Abstract class that holds parameters and methods common to classes that perform duplicate
- * detection and/or marking within SAM/BAM/CRAM files.
- *
- * @author Nils Homer
- */
-public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractOpticalDuplicateFinderCommandLineProgram {
-
- @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
- doc = "One or more input SAM/BAM/CRAM files to analyze. Must be coordinate sorted.")
- public List<File> INPUT;
-
- @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
- doc = "The output file to write marked records to")
- public File OUTPUT;
-
- @Argument(shortName = StandardArgumentDefinitions.METRICS_FILE_SHORT_NAME,
- fullName = StandardArgumentDefinitions.METRICS_FILE_LONG_NAME,
- doc = "File to write duplication metrics to")
- public File METRICS_FILE;
-
- @Argument(shortName = StandardArgumentDefinitions.PROGRAM_RECORD_ID_SHORT_NAME,
- doc = "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
- "PG record creation. This string may have a suffix appended to avoid collision with other " +
- "program record IDs.",
- optional = true)
- public String PROGRAM_RECORD_ID = "MarkDuplicatesGATK";
-
- @Argument(shortName = "PG_VERSION",
- doc = "Value of VN tag of PG record to be created. If not specified, the version will be detected automatically.",
- optional = true)
- public String PROGRAM_GROUP_VERSION;
-
- @Argument(shortName = "PG_COMMAND",
- doc = "Value of CL tag of PG record to be created. If not supplied the command line will be detected automatically.",
- optional = true)
- public String PROGRAM_GROUP_COMMAND_LINE;
-
- @Argument(shortName = "PG_NAME",
- doc = "Value of PN tag of PG record to be created.")
- public String PROGRAM_GROUP_NAME = getClass().getSimpleName();
-
- @Argument(shortName = "CO",
- doc = "Comment(s) to include in the output file's header.",
- optional = true)
- public List<String> COMMENT = new ArrayList<>();
-
- @Argument(doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.")
- public boolean REMOVE_DUPLICATES = false;
-
- @Argument(shortName = StandardArgumentDefinitions.ASSUME_SORTED_SHORT_NAME,
- doc = "If true, assume that the input file is coordinate sorted even if the header says otherwise.")
- public boolean ASSUME_SORTED = false;
-
- @Argument(shortName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_SHORT_NAME,
- fullName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME,
- doc = "The scoring strategy for choosing the non-duplicate among candidates.")
- public DuplicateScoringStrategy.ScoringStrategy DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH;
-
- /** The program groups that have been seen during the course of examining the input records. */
- protected final Set<String> pgIdsSeen = new LinkedHashSet<>();
-
- /**
- * We have to re-chain the program groups based on this algorithm. This returns the map from existing program group ID
- * to new program group ID.
- */
- protected Map<String, String> getChainedPgIds(final SAMFileHeader outputHeader) {
- final Map<String, String> chainedPgIds;
- // Generate new PG record(s)
- if (PROGRAM_RECORD_ID != null) {
- final PgIdGenerator pgIdGenerator = new PgIdGenerator(outputHeader);
- if (PROGRAM_GROUP_VERSION == null) {
- PROGRAM_GROUP_VERSION = this.getVersion();
- }
- if (PROGRAM_GROUP_COMMAND_LINE == null) {
- PROGRAM_GROUP_COMMAND_LINE = this.getCommandLine();
- }
- chainedPgIds = new LinkedHashMap<>();
- for (final String existingId : this.pgIdsSeen) {
- final String newPgId = pgIdGenerator.getNonCollidingId(PROGRAM_RECORD_ID);
- chainedPgIds.put(existingId, newPgId);
- final SAMProgramRecord programRecord = new SAMProgramRecord(newPgId);
- programRecord.setProgramVersion(PROGRAM_GROUP_VERSION);
- programRecord.setCommandLine(PROGRAM_GROUP_COMMAND_LINE);
- programRecord.setProgramName(PROGRAM_GROUP_NAME);
- programRecord.setPreviousProgramGroupId(existingId);
- outputHeader.addProgramRecord(programRecord);
- }
- } else {
- chainedPgIds = null;
- }
- return chainedPgIds;
- }
-
- /**
- * Writes the metrics given by the libraryIdGenerator to the METRICS_FILE.
- *
- * @param libraryIdGenerator the generator holding the per-library metrics and optical duplicate counts
- */
- protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerator) {
- //We want to sort libraries by name
- final SortedMap<String, GATKDuplicationMetrics> metricsByLibrary = new TreeMap<>(Utils.COMPARE_STRINGS_NULLS_FIRST);
- metricsByLibrary.putAll(libraryIdGenerator.getMetricsByLibraryMap());
-
- final Histogram<Short> opticalDuplicatesByLibraryId = libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap();
- final Map<String, Short> libraryIds = libraryIdGenerator.getLibraryIdsMap();
-
- // Write out the metrics
- final MetricsFile<GATKDuplicationMetrics, Double> file = getMetricsFile();
- for (final Map.Entry<String, GATKDuplicationMetrics> entry : metricsByLibrary.entrySet()) {
- final String libraryName = entry.getKey();
- final GATKDuplicationMetrics metrics = entry.getValue();
-
- metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
- metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;
-
- // Add the optical dupes to the metrics
- final Short libraryId = libraryIds.get(libraryName);
- if (libraryId != null) {
- @SuppressWarnings("unchecked")//type-checker being annoying here
- final Histogram<Short>.Bin bin = opticalDuplicatesByLibraryId.get(libraryId);
- if (bin != null) {
- metrics.READ_PAIR_OPTICAL_DUPLICATES = (long) bin.getValue();
- }
- }
- metrics.calculateDerivedFields();
- file.addMetric(metrics);
- }
-
- if (metricsByLibrary.size() == 1) {
- file.setHistogram(metricsByLibrary.values().iterator().next().calculateRoiHistogram());
- }
-
- file.write(METRICS_FILE);
- }
-
- /** Little class to generate program group IDs */
- static class PgIdGenerator {
- private int recordCounter;
-
- private final Set<String> idsThatAreAlreadyTaken = new LinkedHashSet<>();
-
- PgIdGenerator(final SAMFileHeader header) {
- for (final SAMProgramRecord pgRecord : header.getProgramRecords()) {
- idsThatAreAlreadyTaken.add(pgRecord.getProgramGroupId());
- }
- recordCounter = idsThatAreAlreadyTaken.size();
- }
-
- String getNonCollidingId(final String recordId) {
- if (!idsThatAreAlreadyTaken.contains(recordId)) {
- // don't remap 1st record. If there are more records
- // with this id, they will be remapped in the 'else'.
- idsThatAreAlreadyTaken.add(recordId);
- ++recordCounter;
- return recordId;
- } else {
- String newId;
- // Below we tack on one of roughly 1.7 million possible 4-digit base36 suffixes at random. We do this because
- // our old process of just counting from 0 upward and adding that to the previous id led to 1000s of
- // calls to idsThatAreAlreadyTaken.contains() just to resolve 1 collision when merging 1000s of similarly
- // processed bams.
- while (idsThatAreAlreadyTaken.contains(newId = recordId + "." + SamFileHeaderMerger.positiveFourDigitBase36Str(recordCounter++)))
- ;
-
- idsThatAreAlreadyTaken.add(newId);
- return newId;
- }
-
- }
- }
-
- /** Little class used to package up a header and an iterable/iterator. */
- public static final class SamHeaderAndIterator implements AutoCloseable{
- public final SAMFileHeader header;
- public final CloseableIterator iterator;
- private final List readers;
-
- public SamHeaderAndIterator(final SAMFileHeader header, final CloseableIterator iterator, final List readers) {
- this.header = header;
- this.iterator = iterator;
- this.readers = readers;
- }
-
- @Override
- public void close(){
- CloserUtil.close(readers);
- CloserUtil.close(iterator);
- }
- }
-
- /**
- * Since this may read its inputs more than once, this method does all the opening
- * and checking of the inputs.
- */
- protected SamHeaderAndIterator openInputs() {
- final List<SAMFileHeader> headers = new ArrayList<>(INPUT.size());
- final List<SamReader> readers = new ArrayList<>(INPUT.size());
-
- for (final File f : INPUT) {
- final SamReader reader = SamReaderFactory.makeDefault()
- .enable(SamReaderFactory.Option.EAGERLY_DECODE)
- .referenceSequence(REFERENCE_SEQUENCE)
- .open(f); // eager decode
- final SAMFileHeader header = reader.getFileHeader();
-
- if (!ASSUME_SORTED && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
- throw new UserException("Input file " + f.getAbsolutePath() + " is not coordinate sorted.");
- }
-
- headers.add(header);
- readers.add(reader);
- }
-
- if (headers.size() == 1) {
- return new SamHeaderAndIterator(headers.get(0), readers.get(0).iterator(), readers);
- } else {
- final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
- final MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger, readers, ASSUME_SORTED);
- return new SamHeaderAndIterator(headerMerger.getMergedHeader(), iterator, readers);
- }
- }
-
- /**
- * Looks through the set of reads and identifies how many of the duplicates are
- * in fact optical duplicates, and stores the data in the instance level histogram.
- */
- public static void trackOpticalDuplicates(List<? extends ReadEnds> ends,
- final ReadEnds keeper,
- final OpticalDuplicateFinder opticalDuplicateFinder,
- final LibraryIdGenerator libraryIdGenerator) {
- boolean hasFR = false, hasRF = false;
-
- // Check to see if we have a mixture of FR/RF
- for (final ReadEnds end : ends) {
- if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
- hasFR = true;
- } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
- hasRF = true;
- }
- }
-
- // Check if we need to partition since the orientations could have changed
- if (hasFR && hasRF) { // need to track them independently
- // Variables used for optical duplicate detection and tracking
- final List<ReadEnds> trackOpticalDuplicatesF = new ArrayList<>();
- final List<ReadEnds> trackOpticalDuplicatesR = new ArrayList<>();
-
- // Split the ends into FR and RF lists, since optical duplicates must share the same orientation and starting end
- for (final ReadEnds end : ends) {
- if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
- trackOpticalDuplicatesF.add(end);
- } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
- trackOpticalDuplicatesR.add(end);
- } else {
- throw new UserException("Found an unexpected orientation: " + end.orientation);
- }
- }
-
- // track the duplicates
- trackOpticalDuplicates(trackOpticalDuplicatesF, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(), keeper);
- trackOpticalDuplicates(trackOpticalDuplicatesR, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(), keeper);
- } else { // No need to partition
- AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(ends, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(), keeper);
- }
- }
-
- /**
- * Looks through the set of reads and identifies how many of the duplicates are
- * in fact optical duplicates, and stores the data in the instance level histogram.
- */
- private static void trackOpticalDuplicates(final List<? extends PhysicalLocation> list,
- final OpticalDuplicateFinder opticalDuplicateFinder,
- final Histogram<Short> opticalDuplicatesByLibraryId,
- final ReadEnds keeper) {
- final boolean[] opticalDuplicateFlags = opticalDuplicateFinder.findOpticalDuplicates(list, keeper);
-
- int opticalDuplicates = 0;
- for (final boolean b : opticalDuplicateFlags) if (b) ++opticalDuplicates;
- if (opticalDuplicates > 0) {
- opticalDuplicatesByLibraryId.increment(list.get(0).getLibraryId(), opticalDuplicates);
- }
- }
-}
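
The PG ID chaining deleted above resolves collisions by appending 4-digit base-36 suffixes instead of
linear probing, as the inline comment explains. A hedged, self-contained sketch of that scheme follows
(the base36 helper below is a stand-in for SamFileHeaderMerger.positiveFourDigitBase36Str, which the
deleted code called):

    import java.util.LinkedHashSet;
    import java.util.Set;

    public final class PgIdSketch {
        private final Set<String> taken = new LinkedHashSet<>();
        private int counter = 0;

        String nonColliding(final String id) {
            if (taken.add(id)) return id;           // first use of this ID: keep it unchanged
            String candidate;
            do {
                candidate = id + "." + base36(counter++);
            } while (!taken.add(candidate));        // probe until an unused suffix is found
            return candidate;
        }

        private static String base36(final int n) {
            // Zero-padded to 4 digits: 36^4 is roughly 1.7 million possibilities.
            final String s = Integer.toString(n % (36 * 36 * 36 * 36), 36).toUpperCase();
            return "0000".substring(s.length()) + s;
        }

        public static void main(String[] args) {
            final PgIdSketch gen = new PgIdSketch();
            System.out.println(gen.nonColliding("MarkDuplicatesGATK"));  // MarkDuplicatesGATK
            System.out.println(gen.nonColliding("MarkDuplicatesGATK"));  // MarkDuplicatesGATK.0000
        }
    }
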
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractOpticalDuplicateFinderCommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractOpticalDuplicateFinderCommandLineProgram.java
deleted file mode 100644
index 1bc28b4fd45..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/AbstractOpticalDuplicateFinderCommandLineProgram.java
+++ /dev/null
@@ -1,36 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import org.apache.logging.log4j.Logger;
-import org.apache.logging.log4j.LogManager;
-import org.broadinstitute.barclay.argparser.ArgumentCollection;
-import org.broadinstitute.hellbender.cmdline.PicardCommandLineProgram;
-import org.broadinstitute.hellbender.cmdline.argumentcollections.OpticalDuplicatesArgumentCollection;
-import picard.sam.markduplicates.util.OpticalDuplicateFinder;
-
-/**
- * Abstract class that holds parameters and methods common to classes that perform optical duplicate detection.
- * We put them here so that the explanation of how read names are parsed is in one place.
- *
- * @author Tim Fennell
- */
-public abstract class AbstractOpticalDuplicateFinderCommandLineProgram extends PicardCommandLineProgram {
- protected static Logger LOG = LogManager.getLogger(AbstractOpticalDuplicateFinderCommandLineProgram.class);
-
- @ArgumentCollection
- protected OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection();
-
- // The tool with which to find optical duplicates
- protected OpticalDuplicateFinder opticalDuplicateFinder = null;
-
- // Needed for testing
- public void setupOpticalDuplicateFinder() {
- this.opticalDuplicateFinder = new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX,
- opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null); // TODO: figure out how to pass a logger through
- }
-
- @Override
- protected String[] customCommandLineValidation() {
- setupOpticalDuplicateFinder();
- return super.customCommandLineValidation();
- }
-}
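
The READ_NAME_REGEX machinery configured by this deleted base class exists to recover tile/x/y
coordinates from read names for optical duplicate detection. As an illustration only (Picard's
OpticalDuplicateFinder uses a configurable regex, not a split, and the class below is hypothetical),
a standard 7-field Illumina read name decomposes like this:

    public final class ReadNameLocationSketch {
        record Location(short tile, int x, int y) {}

        static Location parse(final String readName) {
            final String[] fields = readName.split(":");
            if (fields.length < 7) return null;     // not a standard Illumina-style name
            // machine:run:flowcell:lane:tile:x:y - the last three fields carry the location
            return new Location(Short.parseShort(fields[4]),
                                Integer.parseInt(fields[5]),
                                Integer.parseInt(fields[6]));
        }

        public static void main(String[] args) {
            System.out.println(parse("M001:123:FCX:1:2106:13462:10343"));
            // prints: Location[tile=2106, x=13462, y=10343]
        }
    }
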
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/DiskBasedReadEndsForMarkDuplicatesMap.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/DiskBasedReadEndsForMarkDuplicatesMap.java
deleted file mode 100644
index 7b1ab967a3e..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/DiskBasedReadEndsForMarkDuplicatesMap.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import htsjdk.samtools.CoordinateSortedPairInfoMap;
-import org.broadinstitute.hellbender.exceptions.GATKException;
-
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.AbstractMap;
-import java.util.Map;
-
-/**
- * Disk-based implementation of ReadEndsForMarkDuplicatesMap. A subdirectory of the system tmpdir is created to store
- * files, one for each reference sequence. The reference sequence that is currently being queried (i.e. the
- * sequence for which remove() has been most recently called) is stored in RAM. ReadEnds for all other sequences
- * are stored on disk.
- *
- * When put() is called for a sequence that is the current one in RAM, the ReadEnds object is merely put into the
- * in-memory map. If put() is called for a sequence ID that is not the current RAM one, the ReadEnds object is
- * appended to the file for that sequence, creating the file if necessary.
- *
- * When remove() is called for a sequence that is the current one in RAM, remove() is called on the in-memory map.
- * If remove() is called for a sequence other than the current RAM sequence, then the current RAM sequence is written
- * to disk, the new sequence is read from disk into RAM map, and the file for the new sequence is deleted.
- *
- * If things work properly, and reads are processed in genomic order, records will be written for mates that are in
- * a later sequence. When the mate is reached in the input SAM file, the file that was written will be deleted.
- * This should result in all temporary files being deleted by the time all the reads are processed. The temp
- * directory is marked to be deleted on exit so everything should get cleaned up.
- *
- * @author alecw@broadinstitute.org
- */
-public final class DiskBasedReadEndsForMarkDuplicatesMap implements ReadEndsForMarkDuplicatesMap {
- private final CoordinateSortedPairInfoMap pairInfoMap;
-
- public DiskBasedReadEndsForMarkDuplicatesMap(int maxOpenFiles) {
- pairInfoMap = new CoordinateSortedPairInfoMap<>(maxOpenFiles, new Codec());
- }
-
- @Override
- public ReadEndsForMarkDuplicates remove(int mateSequenceIndex, String key) {
- return pairInfoMap.remove(mateSequenceIndex, key);
- }
-
- @Override
- public void put(int mateSequenceIndex, String key, ReadEndsForMarkDuplicates readEnds) {
- pairInfoMap.put(mateSequenceIndex, key, readEnds);
- }
-
- @Override
- public int size() {
- return pairInfoMap.size();
- }
-
- @Override
- public int sizeInRam() {
- return pairInfoMap.sizeInRam();
- }
-
- private static class Codec implements CoordinateSortedPairInfoMap.Codec {
- private final ReadEndsForMarkDuplicatesCodec readEndsForMarkDuplicatesCodec = new ReadEndsForMarkDuplicatesCodec();
-
- @Override
- public void setInputStream(final InputStream is) {
- readEndsForMarkDuplicatesCodec.setInputStream(is);
- }
-
- @Override
- public void setOutputStream(final OutputStream os) {
- readEndsForMarkDuplicatesCodec.setOutputStream(os);
- }
-
- @Override
- public Map.Entry decode() {
- try {
- final String key = readEndsForMarkDuplicatesCodec.getInputStream().readUTF();
- final ReadEndsForMarkDuplicates record = readEndsForMarkDuplicatesCodec.decode();
- return new AbstractMap.SimpleEntry<>(key, record);
- } catch (IOException e) {
- throw new GATKException("Error loading ReadEndsForMarkDuplicatesMap from disk", e);
- }
- }
-
- @Override
- public void encode(final String key, final ReadEndsForMarkDuplicates readEnds) {
- try {
- readEndsForMarkDuplicatesCodec.getOutputStream().writeUTF(key);
- readEndsForMarkDuplicatesCodec.encode(readEnds);
- } catch (IOException e) {
- throw new GATKException("Error spilling ReadEndsForMarkDuplicatesMap to disk.", e);
- }
- }
- }
-
-}
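To make the put()/remove() contract described in the class comment concrete, here is a small usage sketch (illustrative only; the key format and coordinates are invented, and it assumes the deleted classes are still on the classpath).

    import org.broadinstitute.hellbender.utils.read.markduplicates.*;

    public class ReadEndsMapUsageSketch {
        public static void main(final String[] args) {
            // One spill file per reference sequence may be created in a tmp
            // subdirectory; at most 100 files are held open at once.
            final ReadEndsForMarkDuplicatesMap map = new DiskBasedReadEndsForMarkDuplicatesMap(100);

            final ReadEndsForMarkDuplicates ends = new ReadEndsForMarkDuplicates();
            ends.read1ReferenceIndex = 0;
            ends.read1Coordinate = 10040;

            // Keyed by the mate's sequence index; the key format is illustrative.
            map.put(1, "readGroup1:read001", ends);

            // When the mate is reached, the entry comes back (possibly decoded from
            // disk, so it need not be the same object instance that was stored).
            final ReadEndsForMarkDuplicates stored = map.remove(1, "readGroup1:read001");
            System.out.println(stored.read1Coordinate + ", in RAM: " + map.sizeInRam());
        }
    }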
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEnds.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEnds.java
deleted file mode 100644
index 9b0382d039c..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEnds.java
+++ /dev/null
@@ -1,71 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import org.broadinstitute.hellbender.exceptions.GATKException;
-import picard.sam.util.PhysicalLocation;
-
-/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
-public abstract class ReadEnds implements PhysicalLocation {
-
- public static final byte F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
-
- public short libraryId;
- public byte orientation;
- public int read1ReferenceIndex = -1;
- public int read1Coordinate = -1;
- public int read2ReferenceIndex = -1;
- public int read2Coordinate = -1;
-
- // Information used to detect optical dupes
- public short readGroup = -1;
- public short tile = -1;
- public int x = -1, y = -1;
-
-    /** For optical duplicate detection the orientation matters with regard to whether this is the 1st or 2nd end of the pair. */
- public byte orientationForOpticalDuplicates = -1;
-
-
- public boolean isPaired() { return this.read2ReferenceIndex != -1; }
-
- @Override
- public short getReadGroup() { return this.readGroup; }
-
- @Override
- public void setReadGroup(final short readGroup) { this.readGroup = readGroup; }
-
- @Override
- public short getTile() { return this.tile; }
-
- @Override
- public void setTile(final short tile) { this.tile = tile; }
-
- @Override
- public int getX() { return this.x; }
-
- @Override
- public void setX(final int x) { this.x = x; }
-
- @Override
- public int getY() { return this.y; }
-
- @Override
- public void setY(final int y) { this.y = y; }
-
- @Override
- public short getLibraryId() { return this.libraryId; }
-
- @Override
- public void setLibraryId(final short libraryId) { this.libraryId = libraryId; }
-
- /**
- * Returns a single byte that encodes the orientation of the two reads in a pair.
- */
- public static byte getOrientationByte(final boolean read1NegativeStrand, final boolean read2NegativeStrand) {
- if (read1NegativeStrand) {
- if (read2NegativeStrand) return ReadEnds.RR;
- else return ReadEnds.RF;
- } else {
- if (read2NegativeStrand) return ReadEnds.FR;
- else return ReadEnds.FF;
- }
- }
-}
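A quick illustration of the encoding above (a fragment, not part of the diff): F and R describe a single end's strand, while the two-letter constants combine the strands of read 1 and read 2, in order.

    // read 1 forward + read 2 reverse -> FR (3); read 1 reverse + read 2 forward -> RF (5)
    final byte fr = ReadEnds.getOrientationByte(false, true);
    final byte rf = ReadEnds.getOrientationByte(true, false);
    assert fr == ReadEnds.FR && rf == ReadEnds.RF;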
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicates.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicates.java
deleted file mode 100644
index 3719d4307fc..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicates.java
+++ /dev/null
@@ -1,23 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-/**
- * Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar
- *
- * @author Nils Homer
- */
-public final class ReadEndsForMarkDuplicates extends ReadEnds {
- /*
-    What do we need to store, you ask? Well, we need to store:
- - byte: orientation
- - short: libraryId, readGroup, tile, x, y, score
- - int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate
- - long: read1IndexInFile, read2IndexInFile
- */
- public static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1
- + 8 + // last 8 == reference overhead
- 13; // This is determined experimentally with JProfiler
-
- public short score = 0;
- public long read1IndexInFile = -1;
- public long read2IndexInFile = -1;
-}
\ No newline at end of file
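For reference, the SIZE_OF estimate above works out to 1 + 10 + 16 + 16 + 1 + 8 + 13 = 65 bytes per ReadEndsForMarkDuplicates record.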
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesCodec.java
deleted file mode 100644
index 0067a32881a..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesCodec.java
+++ /dev/null
@@ -1,98 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import htsjdk.samtools.util.SortingCollection;
-import org.broadinstitute.hellbender.exceptions.GATKException;
-
-
-import java.io.*;
-
-/** Codec for ReadEnds that just outputs the primitive fields and reads them back. */
-public final class ReadEndsForMarkDuplicatesCodec implements SortingCollection.Codec {
- private DataInputStream in;
- private DataOutputStream out;
-
- /**
- * For an explanation of why codecs must implement clone(),
- * see the HTSJDK documentation for {@link SortingCollection.Codec}.
- */
- @Override
- public ReadEndsForMarkDuplicatesCodec clone() {
- return new ReadEndsForMarkDuplicatesCodec();
- }
-
- @Override
- public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); }
-
- @Override
- public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); }
-
- public DataInputStream getInputStream() {
- return in;
- }
-
- public DataOutputStream getOutputStream() {
- return out;
- }
-
- @Override
- public void encode(final ReadEndsForMarkDuplicates read) {
- try {
- this.out.writeShort(read.score);
- this.out.writeShort(read.libraryId);
- this.out.writeByte(read.orientation);
- this.out.writeInt(read.read1ReferenceIndex);
- this.out.writeInt(read.read1Coordinate);
- this.out.writeLong(read.read1IndexInFile);
- this.out.writeInt(read.read2ReferenceIndex);
-
- if (read.orientation > ReadEnds.R) {
- this.out.writeInt(read.read2Coordinate);
- this.out.writeLong(read.read2IndexInFile);
- }
-
- this.out.writeShort(read.readGroup);
- this.out.writeShort(read.tile);
- this.out.writeShort(read.x);
- this.out.writeShort(read.y);
- this.out.writeByte(read.orientationForOpticalDuplicates);
- } catch (final IOException ioe) {
- throw new GATKException("Exception writing ReadEnds to file.", ioe);
- }
- }
-
- @Override
- public ReadEndsForMarkDuplicates decode() {
- final ReadEndsForMarkDuplicates read = new ReadEndsForMarkDuplicates();
- try {
- // If the first read results in an EOF we've exhausted the stream
- try {
- read.score = this.in.readShort();
- } catch (final EOFException eof) {
- return null;
- }
-
- read.libraryId = this.in.readShort();
- read.orientation = this.in.readByte();
- read.read1ReferenceIndex = this.in.readInt();
- read.read1Coordinate = this.in.readInt();
- read.read1IndexInFile = this.in.readLong();
- read.read2ReferenceIndex = this.in.readInt();
-
- if (read.orientation > ReadEnds.R) {
- read.read2Coordinate = this.in.readInt();
- read.read2IndexInFile = this.in.readLong();
- }
-
- read.readGroup = this.in.readShort();
- read.tile = this.in.readShort();
- read.x = this.in.readShort();
- read.y = this.in.readShort();
-
- read.orientationForOpticalDuplicates = this.in.readByte();
-
- return read;
- } catch (final IOException ioe) {
-            throw new GATKException("Exception reading ReadEnds from file.", ioe);
- }
- }
-}
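A round-trip fragment for the codec above (illustrative; assumes java.io.ByteArrayInputStream and java.io.ByteArrayOutputStream are imported). Note that read2Coordinate and read2IndexInFile survive serialization only for paired orientations (orientation > ReadEnds.R), matching the conditional writes in encode() and decode().

    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final ReadEndsForMarkDuplicatesCodec codec = new ReadEndsForMarkDuplicatesCodec();
    codec.setOutputStream(buffer);

    final ReadEndsForMarkDuplicates ends = new ReadEndsForMarkDuplicates();
    ends.orientation = ReadEnds.FR;  // a paired orientation, so the read2 fields get written
    ends.read2Coordinate = 10189;
    codec.encode(ends);

    codec.setInputStream(new ByteArrayInputStream(buffer.toByteArray()));
    final ReadEndsForMarkDuplicates decoded = codec.decode();
    assert decoded.read2Coordinate == 10189;  // preserved because orientation > ReadEnds.R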
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesMap.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesMap.java
deleted file mode 100644
index 9b33f0d903b..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadEndsForMarkDuplicatesMap.java
+++ /dev/null
@@ -1,38 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-/**
- * Interface for storing and retrieving ReadEnds objects. An implementation may be disk-based to
- * reduce memory footprint.
- */
-public interface ReadEndsForMarkDuplicatesMap {
- /**
- * Remove element with given key from the map. Because an implementation may be disk-based,
- * the object returned may not be the same object that was put into the map
- *
- * @param mateSequenceIndex must agree with the value used when the object was put into the map
- * @param key typically, concatenation of read group ID and read name
- * @return null if the key is not found, otherwise the object removed.
- */
- ReadEndsForMarkDuplicates remove(int mateSequenceIndex, String key);
-
- /**
- * Store the element in the map with the given key. It is assumed that the element
- * is not already present in the map.
- *
-     * @param mateSequenceIndex used to optimize storage & retrieval. The same value must be used when trying
- * to remove this element. It is not valid to store the same key with two different mateSequenceIndexes.
- * @param key typically, concatenation of read group ID and read name
- * @param readEnds the object to be stored
- */
- void put(int mateSequenceIndex, String key, ReadEndsForMarkDuplicates readEnds);
-
- /**
- * @return number of elements stored in map
- */
- int size();
-
- /**
- * @return number of elements stored in RAM. Always <= size()
- */
- int sizeInRam();
-}
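For contrast with the disk-based implementation deleted above, a minimal, purely in-memory implementation of this interface could look like the following sketch (not code from this repository).

    import java.util.HashMap;
    import java.util.Map;

    final class InMemoryReadEndsForMarkDuplicatesMap implements ReadEndsForMarkDuplicatesMap {
        private final Map<String, ReadEndsForMarkDuplicates> map = new HashMap<>();

        @Override
        public ReadEndsForMarkDuplicates remove(final int mateSequenceIndex, final String key) {
            return map.remove(key);  // the sequence index is only a storage hint; unused here
        }

        @Override
        public void put(final int mateSequenceIndex, final String key, final ReadEndsForMarkDuplicates readEnds) {
            map.put(key, readEnds);
        }

        @Override
        public int size() { return map.size(); }

        @Override
        public int sizeInRam() { return map.size(); }  // everything lives in RAM, so sizeInRam == size
    }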
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SamRecordWithOrdinalAndSetDuplicateReadFlag.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SamRecordWithOrdinalAndSetDuplicateReadFlag.java
deleted file mode 100644
index 7058709d0d0..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SamRecordWithOrdinalAndSetDuplicateReadFlag.java
+++ /dev/null
@@ -1,25 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.util.SamRecordWithOrdinal;
-
-/**
- * This class sets the duplicate read flag as the result state when examining sets of records.
- *
- * @author nhomer
- */
-public final class SamRecordWithOrdinalAndSetDuplicateReadFlag extends SamRecordWithOrdinal {
-
- public SamRecordWithOrdinalAndSetDuplicateReadFlag() {
- super();
- }
-
- public SamRecordWithOrdinalAndSetDuplicateReadFlag(final SAMRecord record, final long recordIndex) {
- super(record, recordIndex);
- }
-
- @Override
- public void setResultState(final boolean resultState) {
- this.getRecord().setDuplicateReadFlag(resultState);
- }
-}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java
index 24d704fd121..d09bcd7649f 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/EmptyFragment.java
@@ -4,8 +4,8 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator;
-import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
+import picard.sam.markduplicates.util.ReadEnds;
import java.util.Map;
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java
index ac5bfd01b45..835171457c5 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Fragment.java
@@ -1,14 +1,12 @@
package org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords;
import htsjdk.samtools.SAMFileHeader;
-import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
-import picard.sam.util.PhysicalLocation;
+import picard.sam.markduplicates.util.ReadEnds;
import java.util.Map;
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java
index 50aae58190e..77c781498d1 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/Pair.java
@@ -9,9 +9,8 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
-import picard.sam.util.PhysicalLocation;
+import picard.sam.markduplicates.util.ReadEnds;
import java.util.Map;
diff --git a/src/test/java/org/broadinstitute/hellbender/testutils/testers/AbstractMarkDuplicatesTester.java b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java
similarity index 86%
rename from src/test/java/org/broadinstitute/hellbender/testutils/testers/AbstractMarkDuplicatesTester.java
rename to src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java
index 788bd3d64f5..3cc4848274a 100644
--- a/src/test/java/org/broadinstitute/hellbender/testutils/testers/AbstractMarkDuplicatesTester.java
+++ b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java
@@ -9,6 +9,8 @@
import htsjdk.samtools.util.TestUtil;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
+import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark;
import org.testng.Assert;
import picard.sam.DuplicationMetrics;
@@ -20,16 +22,17 @@
* This class is an extension of SamFileTester used to test subclasses of AbstractMarkDuplicatesCommandLineProgram with SAM files generated on the fly.
* This performs the underlying tests defined by classes such as AbstractMarkDuplicatesCommandLineProgramTest.
*/
-public abstract class AbstractMarkDuplicatesTester extends SamFileTester {
+public class MarkDuplicatesSparkTester extends SamFileTester {
final private File metricsFile;
final DuplicationMetrics expectedMetrics;
+ boolean markUnmappedReads = false;
- public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder) {
+ public MarkDuplicatesSparkTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder) {
this(duplicateScoringStrategy, sortOrder, true);
}
- public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder, final boolean recordNeedSorting) {
+ public MarkDuplicatesSparkTester(final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder, final boolean recordNeedSorting) {
super(50, true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, duplicateScoringStrategy, sortOrder, recordNeedSorting);
expectedMetrics = new DuplicationMetrics();
@@ -40,16 +43,17 @@ public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrate
addArg("--"+StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME, duplicateScoringStrategy.name());
}
- public AbstractMarkDuplicatesTester(final ScoringStrategy duplicateScoringStrategy) {
+ public MarkDuplicatesSparkTester(final ScoringStrategy duplicateScoringStrategy, boolean markUnmappedReads) {
this(duplicateScoringStrategy, SAMFileHeader.SortOrder.coordinate);
+ this.markUnmappedReads = markUnmappedReads;
}
- public AbstractMarkDuplicatesTester() {
- this(SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY);
+ public MarkDuplicatesSparkTester(boolean markUnmappedReads) {
+ this(DuplicateScoringStrategy.ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH, markUnmappedReads);
}
- public File getMetricsFile() {
- return metricsFile;
+ public MarkDuplicatesSparkTester() {
+ this(false);
}
@Override
@@ -158,5 +162,12 @@ public void test() {
}
}
- protected abstract CommandLineProgram getProgram();
+ protected CommandLineProgram getProgram() { return new MarkDuplicatesSpark(); }
+
+ @Override
+ protected void addArgs() {
+ if (!markUnmappedReads) {
+ addArg("--" + MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
+ }
+ }
}
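In use, the renamed tester defaults to suppressing unmapped-mate marking via the flag appended in addArgs() above; a hypothetical test body (using the DEFAULT_BASE_QUALITY constant from the surrounding test classes) might read:

    // Default constructor: markUnmappedReads == false, so addArgs() appends
    // --do-not-mark-unmapped-mates before MarkDuplicatesSpark runs.
    final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester();
    tester.addMappedPair(1, 1, 100, false, false, DEFAULT_BASE_QUALITY);
    tester.runTest();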
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java
index c8a5ff2b0e6..4dad0bfcba4 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java
@@ -3,26 +3,20 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import htsjdk.samtools.metrics.MetricsFile;
-import org.apache.spark.SparkException;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.engine.ReadsDataSource;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
-import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.walkers.markduplicates.MarkDuplicatesGATKIntegrationTest;
+import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark;
+import org.broadinstitute.hellbender.tools.walkers.markduplicates.AbstractMarkDuplicatesCommandLineProgramTest;
+import org.broadinstitute.hellbender.testutils.testers.MarkDuplicatesSparkTester;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
-import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesSparkTester;
-import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
-import org.broadinstitute.hellbender.tools.walkers.markduplicates.AbstractMarkDuplicatesCommandLineProgramTest;
-import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
-import picard.sam.util.PhysicalLocationInt;
-import picard.sam.util.ReadNameParser;
import java.io.File;
import java.io.FileNotFoundException;
@@ -37,7 +31,7 @@
public class MarkDuplicatesSparkIntegrationTest extends AbstractMarkDuplicatesCommandLineProgramTest {
@Override
- protected AbstractMarkDuplicatesTester getTester() {
+ protected MarkDuplicatesSparkTester getTester() {
MarkDuplicatesSparkTester markDuplicatesSparkTester = new MarkDuplicatesSparkTester();
markDuplicatesSparkTester.addArg("--"+ MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
return markDuplicatesSparkTester;
@@ -68,31 +62,42 @@ public Object[][] md(){
//Note: in each of those cases, we'd really want to pass null as the last parameter (not 0L) but IntelliJ
// does not like it and skips the test (rendering issue) - so we pass 0L and account for it at test time
// (see comment in testMarkDuplicatesSparkIntegrationTestLocal)
- {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"), 20, 0,
+ {new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"), 20, 0,
ImmutableMap.of("Solexa-16419", ImmutableList.of(0L, 3L, 0L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16416", ImmutableList.of(0L, 1L, 0L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16404", ImmutableList.of(0L, 3L, 0L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16406", ImmutableList.of(0L, 1L, 0L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16412", ImmutableList.of(0L, 1L, 0L, 0L, 0L, 0L, 0.0, 0L))},
- {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.bam"), 90, 6,
+ {new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.bam"), 90, 6,
ImmutableMap.of("Solexa-16419", ImmutableList.of(4L, 4L, 4L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16416", ImmutableList.of(2L, 2L, 2L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16404", ImmutableList.of(3L, 9L, 3L, 0L, 2L, 0L, 0.190476, 17L),
"Solexa-16406", ImmutableList.of(1L, 10L, 1L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16412", ImmutableList.of(3L, 6L, 3L, 0L, 1L, 0L, 0.133333, 15L))},
- {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR,"example.chr1.1-1K.markedDups.bam"), 90, 6,
+ {new File(TEST_DATA_DIR,"example.chr1.1-1K.markedDups.bam"), 90, 6,
ImmutableMap.of("Solexa-16419", ImmutableList.of(4L, 4L, 4L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16416", ImmutableList.of(2L, 2L, 2L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16404", ImmutableList.of(3L, 9L, 3L, 0L, 2L, 0L, 0.190476, 17L),
"Solexa-16406", ImmutableList.of(1L, 10L, 1L, 0L, 0L, 0L, 0.0, 0L),
"Solexa-16412", ImmutableList.of(3L, 6L, 3L, 0L, 1L, 0L, 0.133333, 15L))},
- {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR, "optical_dupes.bam"), 4, 2,
+ {new File(TEST_DATA_DIR, "optical_dupes.bam"), 4, 2,
ImmutableMap.of("mylib", ImmutableList.of(0L, 2L, 0L, 0L, 1L, 1L, 0.5, 0L))},
- {new File(MarkDuplicatesGATKIntegrationTest.TEST_DATA_DIR, "optical_dupes_casava.bam"), 4, 2,
+ {new File(TEST_DATA_DIR, "optical_dupes_casava.bam"), 4, 2,
ImmutableMap.of("mylib", ImmutableList.of(0L, 2L, 0L, 0L, 1L, 1L, 0.5, 0L))},
};
}
+    // Tests asserting that, without the --do-not-mark-unmapped-mates argument, unmapped mates are still duplicate-marked with their partner
+ @Test
+ public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() {
+ final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester(true);
+ tester.addMatePair(1, 10040, 10040, false, true, true, true, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate,
+ // second end unmapped
+ tester.addMappedPair(1, 10189, 10040, false, false, "41S35M", "65M11S", true, false, false, ELIGIBLE_BASE_QUALITY); // mapped OK
+ tester.addMappedFragment(1, 10040, true, DEFAULT_BASE_QUALITY); // duplicate
+ tester.runTest();
+ }
+
@Test( dataProvider = "md")
public void testMarkDuplicatesSparkIntegrationTestLocal(
final File input, final long totalExpected, final long dupsExpected,
@@ -170,25 +175,6 @@ public void testMarkDuplicatesSparkIntegrationTestLocal(
}
}
- @Test
- public void testSupplementaryReadUnmappedMate() {
- File output = createTempFile("supplementaryReadUnmappedMate", "bam");
- final ArgumentsBuilder args = new ArgumentsBuilder();
- args.addOutput(output);
- args.addInput(getTestFile("supplementaryReadUnmappedmate.bam"));
- runCommandLine(args);
-
- try ( final ReadsDataSource outputReadsSource = new ReadsDataSource(output.toPath()) ) {
- final List actualReads = new ArrayList<>();
- for ( final GATKRead read : outputReadsSource ) {
- Assert.assertFalse(read.isDuplicate());
- actualReads.add(read);
- }
-
- Assert.assertEquals(actualReads.size(), 3, "Wrong number of reads output");
- }
- }
-
@Test
public void testHashCollisionHandling() {
// This test asserts that the handling of two read pairs with the same start positions but on different in such a way
@@ -209,137 +195,4 @@ public void testHashCollisionHandling() {
Assert.assertEquals(actualReads.size(), 4, "Wrong number of reads output");
}
}
-
-    // Tests asserting that, without the --do-not-mark-unmapped-mates argument, unmapped mates are still duplicate-marked with their partner
- @Test
- public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() {
- final AbstractMarkDuplicatesTester tester = new MarkDuplicatesSparkTester(true);
- tester.addMatePair(1, 10040, 10040, false, true, true, true, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate,
- // second end unmapped
- tester.addMappedPair(1, 10189, 10040, false, false, "41S35M", "65M11S", true, false, false, ELIGIBLE_BASE_QUALITY); // mapped OK
- tester.addMappedFragment(1, 10040, true, DEFAULT_BASE_QUALITY); // duplicate
- tester.runTest();
- }
-
- @Test
- public void testNonExistantReadGroupInRead() {
- final AbstractMarkDuplicatesTester tester = new MarkDuplicatesSparkTester(true);
- tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "NotADuplicateGroup");
- try {
- tester.runTest();
- Assert.fail("Should have thrown an exception");
- } catch (Exception e){
- Assert.assertTrue(e instanceof SparkException);
- Assert.assertTrue(e.getCause() instanceof UserException.HeaderMissingReadGroup);
- }
- }
-
- @Test
- public void testNoReadGroupInRead() {
- final AbstractMarkDuplicatesTester tester = new MarkDuplicatesSparkTester(true);
- tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, null);
-
- try {
- tester.runTest();
- Assert.fail("Should have thrown an exception");
- } catch (Exception e){
- Assert.assertTrue(e instanceof SparkException);
- Assert.assertTrue(e.getCause() instanceof UserException.ReadMissingReadGroup);
- }
- }
-
- @Test(dataProvider = "readNameData")
- public void testOpticalDuplicatesTiebrokenByPhysicalLocationNotStartPosition(final String readName1, final String readName2) {
-        // This tests the read-name-based tiebreaking code in MarkDuplicates. Since it is ambiguous which read should be marked
-        // as the duplicate when scores match, we break ties by evaluating the read name, for consistency's sake.
-
- final ReadNameParser parser = new ReadNameParser();
-
- final PhysicalLocationInt position1 = new PhysicalLocationInt();
- final PhysicalLocationInt position2 = new PhysicalLocationInt();
-
- parser.addLocationInformation(readName1, position1);
- parser.addLocationInformation(readName2, position2);
-
- final AbstractMarkDuplicatesTester tester = getTester();
- tester.getSamRecordSetBuilder().setReadLength(101);
- tester.setExpectedOpticalDuplicate(0);
-
- int compare = position1.tile - position2.tile;
- if (compare == 0) {
- compare = (short)position1.x - (short)position2.x;
- }
-
- if (compare == 0) {
- compare = (short)position1.y - (short)position2.y;
- }
-
- final boolean isDuplicate = compare < 0;
-
-        // NOTE these reads are offset slightly but should have the same unclipped start positions
- tester.addMatePair(readName1, 1,2, 46, false, false, !isDuplicate, !isDuplicate, "6S42M28S", "3S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY);
- tester.addMatePair(readName2, 1,2, 51, false, false, isDuplicate, isDuplicate, "6S42M28S", "8S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY);
- tester.runTest();
- }
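The short-casting entry in the data provider below exists because of the arithmetic in this tie-break; here is a worked fragment using the HK3T5CCXX pair's coordinates:

    // Tiles and x tie, so y decides; (short) 37067 overflows to -28469.
    int compare = 1112 - 1112;                                  // tile: 0
    if (compare == 0) compare = (short) 11586 - (short) 11586;  // x:    0
    if (compare == 0) compare = (short) 37067 - (short) 32144;  // y:    -28469 - 32144 < 0
    final boolean isDuplicate = compare < 0;  // true, so readName2 gets marked as the duplicate
    // Without the casts, 37067 - 32144 > 0 and the tie would break the other way.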
-
- @DataProvider
- public Object[][] readNameData(){
- return new Object[][]{
- {"RUNID:7:1203:2886:82292", "RUNID:7:1205:3886:16834"},
-
- {"RUNID:7:1203:2886:16756", "RUNID:7:1205:3886:16756"},
- {"RUNID:7:1204:2886:16756", "RUNID:7:1205:3886:16756"},
- {"RUNID:7:1205:2886:16756", "RUNID:7:1205:3886:16756"},
- {"RUNID:7:1206:2886:16756", "RUNID:7:1205:3886:16756"},
- {"RUNID:7:1207:2886:16756", "RUNID:7:1205:3886:16756"},
-
- {"RUNID:7:1203:2886:16756", "RUNID:7:1203:4886:26756"},
- {"RUNID:7:1203:3886:16756", "RUNID:7:1203:4886:26756"},
- {"RUNID:7:1203:4886:16756", "RUNID:7:1203:4886:26756"},
- {"RUNID:7:1203:5886:16756", "RUNID:7:1203:4886:26756"},
- {"RUNID:7:1203:6886:16756", "RUNID:7:1203:4886:26756"},
-
- {"RUNID:7:1203:2886:34756", "RUNID:7:1203:2886:36756"},
- {"RUNID:7:1203:2886:35756", "RUNID:7:1203:2886:36756"},
- {"RUNID:7:1203:2886:37756", "RUNID:7:1203:2886:36756"},
- {"RUNID:7:1203:2886:38756", "RUNID:7:1203:2886:36756"},
-
-                // Added a test for tiebreaking that accounts for the short casting done in Picard
- {"HK3T5CCXX160204:3:1112:11586:37067", "HK3T5CCXX160204:3:1112:11586:32144"}
- };
- }
-
- @Test(dataProvider = "readNameData")
- public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicates(final String readName1, final String readName2) {
-        // This tests the read-name-based tiebreaking code in MarkDuplicates. Since it is ambiguous which read should be marked
-        // as the duplicate when scores match, we break ties by evaluating the read name, for consistency's sake.
-
- final ReadNameParser parser = new ReadNameParser();
-
- final PhysicalLocationInt position1 = new PhysicalLocationInt();
- final PhysicalLocationInt position2 = new PhysicalLocationInt();
-
- parser.addLocationInformation(readName1, position1);
- parser.addLocationInformation(readName2, position2);
-
- final AbstractMarkDuplicatesTester tester = getTester();
- tester.getSamRecordSetBuilder().setReadLength(101);
- tester.setExpectedOpticalDuplicate(0);
-
- int compare = position1.tile - position2.tile;
- if (compare == 0) {
- compare = (short)position1.x - (short)position2.x;
- }
-
- if (compare == 0) {
- compare = (short)position1.y - (short)position2.y;
- }
-
- final boolean isDuplicate = compare < 0;
-
- tester.addMatePair(readName1, 1,485253, 485253, false, false, !isDuplicate, !isDuplicate, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
- tester.addMatePair(readName2, 1,485253, 485253, false, false, isDuplicate, isDuplicate, "59S42M", "42M59S", true, false, false, false, false, DEFAULT_BASE_QUALITY);
-
- tester.runTest();
- }
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java
index 3b961f99e56..552d46ac0e0 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java
@@ -1,17 +1,23 @@
package org.broadinstitute.hellbender.tools.walkers.markduplicates;
import htsjdk.samtools.*;
+import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.ReadsDataSource;
+import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.BaseTest;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.testutils.SamAssertionUtils;
-import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester;
+import org.broadinstitute.hellbender.testutils.testers.MarkDuplicatesSparkTester;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@@ -19,9 +25,14 @@
import picard.sam.util.ReadNameParser;
import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
/**
* This class defines the individual test cases to run. The actual running of the test is done
@@ -31,7 +42,7 @@ public abstract class AbstractMarkDuplicatesCommandLineProgramTest extends Comma
public static final File TEST_DATA_DIR = new File(getTestDataDir(), "walkers/MarkDuplicatesGATK/");
- protected abstract AbstractMarkDuplicatesTester getTester();
+ protected abstract MarkDuplicatesSparkTester getTester();
protected abstract CommandLineProgram getCommandLineProgramInstance();
@@ -45,14 +56,14 @@ public abstract class AbstractMarkDuplicatesCommandLineProgramTest extends Comma
@Test
public void testSingleUnmappedFragment() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY);
tester.runTest();
}
@Test
public void testTwoUnmappedFragments() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY);
tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY);
tester.runTest();
@@ -60,21 +71,21 @@ public void testTwoUnmappedFragments() {
@Test
public void testSingleUnmappedPair() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addUnmappedPair(-1, DEFAULT_BASE_QUALITY);
tester.runTest();
}
@Test
public void testSingleMappedFragment() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedFragment(1, 1, false, DEFAULT_BASE_QUALITY);
tester.runTest();
}
@Test
public void testTwoMappedFragments() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedFragment(0, 1, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedFragment(0, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.runTest();
@@ -82,14 +93,14 @@ public void testTwoMappedFragments() {
@Test
public void testSingleMappedPair() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(1, 1, 100, false, false, DEFAULT_BASE_QUALITY);
tester.runTest();
}
@Test
public void testSingleMappedFragmentAndSingleMappedPair() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedFragment(1, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.runTest();
@@ -97,7 +108,7 @@ public void testSingleMappedFragmentAndSingleMappedPair() {
@Test
public void testTwoMappedPairs() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.runTest();
@@ -105,7 +116,7 @@ public void testTwoMappedPairs() {
@Test
public void testThreeMappedPairs() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
@@ -114,7 +125,7 @@ public void testThreeMappedPairs() {
@Test
public void testSingleMappedFragmentAndTwoMappedPairs() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedFragment(1, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
@@ -123,7 +134,7 @@ public void testSingleMappedFragmentAndTwoMappedPairs() {
@Test
public void testTwoMappedPairsAndTerminalUnmappedFragment() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addUnmappedFragment(-1, DEFAULT_BASE_QUALITY); // unmapped fragment at end of file
@@ -132,7 +143,7 @@ public void testTwoMappedPairsAndTerminalUnmappedFragment() {
@Test
public void testTwoMappedPairsAndTerminalUnmappedPair() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addUnmappedPair(-1, DEFAULT_BASE_QUALITY); // unmapped pair at end of file
@@ -141,7 +152,7 @@ public void testTwoMappedPairsAndTerminalUnmappedPair() {
@Test
public void testOpticalDuplicateFinding() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
// explicitly creating 1 expected optical duplicate pair
tester.setExpectedOpticalDuplicate(1);
@@ -158,7 +169,7 @@ public void testOpticalDuplicateFinding() {
@Test
public void testOpticalDuplicatesDifferentReadGroups() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate
tester.addMatePair("RUNID:7:1203:2886:82242", 19, 19, 485253, 485253, false, false, false, false, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.1");
@@ -174,7 +185,7 @@ public void testOpticalDuplicatesDifferentReadGroups() {
@Test
public void testOpticalDuplicatesTheSameReadGroup() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate (tie broken by readname)
tester.addMatePair("RUNID:7:1203:2886:82242", 19, 19, 485253, 485253, false, false, false, false, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2");
@@ -191,7 +202,7 @@ public void testOpticalDuplicatesTheSameReadGroup() {
@Test
public void testOpticalDuplicatesAndPCRDuplicatesOrientationDifference() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("RUNID:7:1203:2886:16838", 19, 19, 485253, 486253, false, false, true, true, "101M", "101M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // duplicate
tester.addMatePair("RUNID:7:1203:2886:16834", 19, 19, 486253, 485253, false, false, false, false, "101M", "101M", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1");
@@ -204,7 +215,7 @@ public void testOpticalDuplicatesAndPCRDuplicatesOrientationDifference() {
// to FR if they point in opposite directions. This test asserts that two pairs of reads that, after clipping, all map to 93470499-93470499
// will be marked as duplicates of one another even though they have opposite orientations (one is FR while the other is RF).
public void testReadSameStartPositionOrientationOverride() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21526", 19, 19, 93470380, 93470499, false, false, false, false, "31S111M9S", "64M87S", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499
tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21527", 19, 19, 93470499, 93470380, false, false, true, true, "64M87S", "31S111M9S", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499
@@ -215,7 +226,7 @@ public void testReadSameStartPositionOrientationOverride() {
// This asserts that we are flipping reads in the Pair object based on both start position and contig index; this only
// makes a difference when the start position is the same across two contigs, so we assert that case is handled properly
public void testReadsHaveSameStartPositionButDifferentChromosomeNonEquivalence() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21526", 19, 20, 93470380, 93470499, false, false, false, false, "31S111M9S", "64M87S", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499
tester.addMatePair("HJYFJCCXX160204:8:1124:31659:21527", 20, 19, 93470499, 93470380, false, false, true, true, "64M87S", "31S111M9S", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1"); // after clipping these both start on 93470499
@@ -224,7 +235,7 @@ public void testReadsHaveSameStartPositionButDifferentChromosomeNonEquivalence()
@Test
public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicatesWithinPixelDistance() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(101);
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("RUNID:7:1203:2886:16834", 1, 485253, 485253, false, false, true, true, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY); // duplicate
@@ -234,7 +245,7 @@ public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicatesWithinPixe
@Test
public void testOpticalDuplicateClusterSamePositionOneOpticalDuplicatesWithinPixelDistance() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(45);
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:7:1203:2886:16834", 1, 485253, 485253, false, false, true, true, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); // duplicate
@@ -244,7 +255,7 @@ public void testOpticalDuplicateClusterSamePositionOneOpticalDuplicatesWithinPix
@Test
public void testOpticalDuplicateClusterOneEndSamePositionOneCluster() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(101);
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:7:2205:17939:39728", 1, 485328, 485312, false, false, false, false, "55M46S", "30S71M", false, true, false, false, false, ELIGIBLE_BASE_QUALITY);
@@ -254,7 +265,7 @@ public void testOpticalDuplicateClusterOneEndSamePositionOneCluster() {
@Test
public void testTwoMappedPairsAndMappedSecondaryFragment() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(1, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedFragment(1, 200, false, DEFAULT_BASE_QUALITY, true); // mapped non-primary fragment
@@ -263,7 +274,7 @@ public void testTwoMappedPairsAndMappedSecondaryFragment() {
@Test
public void testMappedFragmentAndMappedPairFirstOfPairNonPrimary() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedFragment(1, 1, false, ELIGIBLE_BASE_QUALITY);
tester.addMatePair(1, 200, 0, false, true, false, false, "54M22S", null, false, false, true, true, false, DEFAULT_BASE_QUALITY);
@@ -272,7 +283,7 @@ public void testMappedFragmentAndMappedPairFirstOfPairNonPrimary() {
@Test
public void testTwoMappedPairsMatesSoftClipped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(1, 10022, 10051, false, false, "76M", "8S68M", false, true, false, DEFAULT_BASE_QUALITY);
tester.addMappedPair(1, 10022, 10063, false, false, "76M", "5S71M", false, true, false, DEFAULT_BASE_QUALITY);
@@ -282,7 +293,7 @@ public void testTwoMappedPairsMatesSoftClipped() {
@Test
public void testTwoMappedPairsWithSoftClipping() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
//todo Picard says no duplicates, we say duplicate, what's going on?
// NB: no duplicates
@@ -295,7 +306,7 @@ public void testTwoMappedPairsWithSoftClipping() {
@Test
public void testTwoMappedPairsWithSoftClippingFirstOfPairOnlyNoMateCigar() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.setNoMateCigars(true);
// NB: no duplicates
@@ -308,7 +319,7 @@ public void testTwoMappedPairsWithSoftClippingFirstOfPairOnlyNoMateCigar() {
@Test
public void testTwoMappedPairsWithSoftClippingBoth() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
// mapped reference length: 73 + 42 = 115
tester.addMappedPair(1, 10046, 10002, true, true, "3S73M", "6S42M28S", true, false, false, DEFAULT_BASE_QUALITY); // duplicate
@@ -319,7 +330,7 @@ public void testTwoMappedPairsWithSoftClippingBoth() {
@Test
public void testMatePairSecondUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10049, 10049, false, true, false, false, "11M2I63M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // neither are duplicates
tester.runTest();
@@ -327,7 +338,7 @@ public void testMatePairSecondUnmapped() {
@Test
public void testMatePairFirstUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10056, 10056, true, false, false, false, null, "54M22S", false, false, false, false, false, DEFAULT_BASE_QUALITY); // neither are duplicates
tester.runTest();
@@ -335,7 +346,7 @@ public void testMatePairFirstUnmapped() {
@Test
public void testMappedFragmentAndMatePairSecondUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10049, 10049, false, true, false, false, "11M2I63M", null, false, false, false, false, false, ELIGIBLE_BASE_QUALITY);
// We set the length here separately because some of the MD variants use TOTAL_MAPPED_REFERENCE_LENGTH as their DUPLICATE_SCORING_STRATEGY. If we kept the read length
@@ -347,7 +358,7 @@ public void testMappedFragmentAndMatePairSecondUnmapped() {
@Test
public void testMappedFragmentAndMatePairFirstUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10049, 10049, true, false, false, false, null, "11M2I63M", false, false, false, false, false, ELIGIBLE_BASE_QUALITY);
// We set the length here separately because some of the MD variants use TOTAL_MAPPED_REFERENCE_LENGTH as their DUPLICATE_SCORING_STRATEGY. If we kept the read length
@@ -359,7 +370,7 @@ public void testMappedFragmentAndMatePairFirstUnmapped() {
@Test
public void testMappedPairAndMatePairSecondUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10040, 10040, false, true, true, false, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // second a duplicate,
// second end unmapped
@@ -369,7 +380,7 @@ public void testMappedPairAndMatePairSecondUnmapped() {
@Test
public void testMappedPairAndMatePairFirstUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10040, 10040, true, false, false, true, null, "76M", false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate,
// first end unmapped
@@ -380,7 +391,7 @@ public void testMappedPairAndMatePairFirstUnmapped() {
// TODO: fails on MarkDuplicatesWithMateCigar
@Test
public void testMappedPairAndMatePairFirstOppositeStrandSecondUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(101);
// first end mapped OK -, second end unmapped
tester.addMatePair(1, 484071, 484071, false, true, false, false, "66S35M", null, true, false, false, false, false, DEFAULT_BASE_QUALITY);
@@ -389,9 +400,28 @@ public void testMappedPairAndMatePairFirstOppositeStrandSecondUnmapped() {
tester.runTest();
}
+ @Test
+ public void testSupplementaryReadUnmappedMate() {
+ File output = createTempFile("supplementaryReadUnmappedMate", "bam");
+ final ArgumentsBuilder args = new ArgumentsBuilder();
+ args.addOutput(output);
+ args.addInput(getTestFile("supplementaryReadUnmappedmate.bam"));
+ runCommandLine(args);
+
+ try ( final ReadsDataSource outputReadsSource = new ReadsDataSource(output.toPath()) ) {
+ final List actualReads = new ArrayList<>();
+ for ( final GATKRead read : outputReadsSource ) {
+ Assert.assertFalse(read.isDuplicate());
+ actualReads.add(read);
+ }
+
+ Assert.assertEquals(actualReads.size(), 3, "Wrong number of reads output");
+ }
+ }
+
@Test
public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10040, 10040, false, true, true, false, "76M", null, false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate,
// second end unmapped
@@ -402,7 +432,7 @@ public void testMappedPairAndMappedFragmentAndMatePairSecondUnmapped() {
@Test
public void testMappedPairAndMappedFragmentAndMatePairFirstUnmapped() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMatePair(1, 10040, 10040, true, false, false, true, null, "76M", false, false, false, false, false, DEFAULT_BASE_QUALITY); // first a duplicate,
// first end unmapped
@@ -413,7 +443,7 @@ public void testMappedPairAndMappedFragmentAndMatePairFirstUnmapped() {
@Test
public void testTwoMappedPairsWithOppositeOrientations() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(1, 10182, 10038, true, true, "32S44M", "66M10S", true, false, false, DEFAULT_BASE_QUALITY); // -/+
tester.addMappedPair(1, 10038, 10182, false, false, "70M6S", "32S44M", false, true, false, ELIGIBLE_BASE_QUALITY); // +/-, both are duplicates
@@ -422,7 +452,7 @@ public void testTwoMappedPairsWithOppositeOrientations() {
@Test
public void testTwoMappedPairsWithOppositeOrientationsNumberTwo() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(1, 10038, 10182, false, false, "70M6S", "32S44M", false, true, false, ELIGIBLE_BASE_QUALITY); // +/-, both are duplicates
tester.addMappedPair(1, 10182, 10038, true, true, "32S44M", "66M10S", true, false, false, DEFAULT_BASE_QUALITY); // -/+
@@ -431,7 +461,7 @@ public void testTwoMappedPairsWithOppositeOrientationsNumberTwo() {
@Test
public void testThreeMappedPairsWithMatchingSecondMate() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
// Read0 and Read2 are duplicates
// 10181+41=10220, 10058
@@ -445,7 +475,7 @@ public void testThreeMappedPairsWithMatchingSecondMate() {
@Test
public void testMappedPairWithSamePosition() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(1, 4914, 4914, false, false, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+
tester.runTest();
@@ -453,7 +483,7 @@ public void testMappedPairWithSamePosition() {
@Test
public void testMappedPairWithSamePositionSameCigar() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(1, 4914, 4914, false, false, "37M39S", "37M39S", false, false, false, DEFAULT_BASE_QUALITY); // +/+
tester.runTest();
@@ -461,7 +491,7 @@ public void testMappedPairWithSamePositionSameCigar() {
@Test
public void testTwoMappedPairWithSamePosition() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(0, 5604914, 5604914, false, false, "37M39S", "73M3S", false, false, false, ELIGIBLE_BASE_QUALITY); // +/+
tester.addMappedPair(0, 5604914, 5604914, true, true, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+
@@ -470,7 +500,7 @@ public void testTwoMappedPairWithSamePosition() {
@Test
public void testTwoMappedPairWithSamePositionDifferentStrands() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(0, 5604914, 5604914, false, false, "50M", "50M", true, false, false, ELIGIBLE_BASE_QUALITY); // +/-
tester.addMappedPair(0, 5604914, 5604914, true, true, "50M", "50M", false, true, false, DEFAULT_BASE_QUALITY); // -/+
tester.runTest();
@@ -478,7 +508,7 @@ public void testTwoMappedPairWithSamePositionDifferentStrands() {
@Test
public void testTwoMappedPairWithSamePositionDifferentStrands2() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(0, 5604914, 5604915, false, false, "50M", "50M", true, false, false, ELIGIBLE_BASE_QUALITY); // +/-
tester.addMappedPair(0, 5604915, 5604914, true, true, "50M", "50M", false, true, false, DEFAULT_BASE_QUALITY); // -/+
tester.runTest();
@@ -486,7 +516,7 @@ public void testTwoMappedPairWithSamePositionDifferentStrands2() {
@Test
public void testMappedPairWithFirstEndSamePositionAndOther() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(76);
tester.addMappedPair(0, 5604914, 5605914, false, false, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+
tester.addMappedPair(0, 5604914, 5604914, false, false, "37M39S", "73M3S", false, false, false, DEFAULT_BASE_QUALITY); // +/+
@@ -495,7 +525,7 @@ public void testMappedPairWithFirstEndSamePositionAndOther() {
@Test
public void testTwoGroupsOnDifferentChromosomesOfTwoFragments() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedFragment(0, 1, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedFragment(0, 1, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedFragment(1, 1, false, ELIGIBLE_BASE_QUALITY);
@@ -506,7 +536,7 @@ public void testTwoGroupsOnDifferentChromosomesOfTwoFragments() {
@Test
public void testTwoGroupsOnDifferentChromosomesOfTwoMappedPairs() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(0, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedPair(1, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
@@ -516,7 +546,7 @@ public void testTwoGroupsOnDifferentChromosomesOfTwoMappedPairs() {
@Test
public void testTwoGroupsOnDifferentChromosomesOfThreeMappedPairs() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(0, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
@@ -528,7 +558,7 @@ public void testTwoGroupsOnDifferentChromosomesOfThreeMappedPairs() {
@Test
public void testThreeGroupsOnDifferentChromosomesOfThreeMappedPairs() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.addMappedPair(0, 1, 100, false, false, ELIGIBLE_BASE_QUALITY);
tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
tester.addMappedPair(0, 1, 100, true, true, DEFAULT_BASE_QUALITY); // duplicate!!!
@@ -543,7 +573,7 @@ public void testThreeGroupsOnDifferentChromosomesOfThreeMappedPairs() {
@Test
public void testBulkFragmentsNoDuplicates() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(100);
for(int position = 1; position <= 10000; position += 1) {
tester.addMappedFragment(0, position, false, "100M", DEFAULT_BASE_QUALITY);
@@ -553,7 +583,7 @@ public void testBulkFragmentsNoDuplicates() {
@Test
public void testBulkFragmentsWithDuplicates() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(100);
for(int position = 1; position <= 10000; position += 1) {
tester.addMappedFragment(0, position, false, "100M", ELIGIBLE_BASE_QUALITY);
@@ -567,7 +597,7 @@ public void testBulkFragmentsWithDuplicates() {
@Test
public void testStackOverFlowPairSetSwap() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(68);
final File input = new File(TEST_DATA_DIR, "markDuplicatesWithMateCigar.pairSet.swap.sam");
@@ -583,7 +613,7 @@ public void testStackOverFlowPairSetSwap() {
@Test
public void testSecondEndIsBeforeFirstInCoordinate() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(68);
tester.addMappedPair(0, 108855339, 108855323, false, false, "33S35M", "17S51M", true, true, false, DEFAULT_BASE_QUALITY); // +/-
tester.runTest();
@@ -591,7 +621,7 @@ public void testSecondEndIsBeforeFirstInCoordinate() {
@Test
public void testPathologicalOrderingAtTheSamePosition() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(68);
tester.setExpectedOpticalDuplicate(1);
@@ -620,7 +650,7 @@ public void testPathologicalOrderingAtTheSamePosition() {
@Test
public void testDifferentChromosomesInOppositeOrder() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(101);
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:6:101:17642:6835", 0, 1, 123989, 18281, false, false, true, true, "37S64M", "52M49S", false, false, false, false, false, DEFAULT_BASE_QUALITY, "1");
@@ -630,7 +660,7 @@ public void testDifferentChromosomesInOppositeOrder() {
@Test
public void testOpticalDuplicateClustersAddingSecondEndFirstSameCoordinate() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(68);
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:1:1:15993:13361", 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, ELIGIBLE_BASE_QUALITY);
@@ -652,7 +682,7 @@ public Object[][] secondarySupplementaryData() {
@Test(dataProvider = "secondarySupplementaryData")
public void testTwoMappedPairsWithSupplementaryReads(final Boolean additionalFragSecondary, final Boolean additionalFragSupplementary, final Boolean fragLikeFirst) {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(68);
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:1:1:15993:13361", 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
@@ -720,7 +750,7 @@ public void testMDOrder(final File input, final File expectedOutput) throws Exce
@Test
public void testTwoMappedPairsWithSoftClippingFirstOfPairOnly() {
- final AbstractMarkDuplicatesTester tester = getTester();
+ final MarkDuplicatesSparkTester tester = getTester();
// NB: no duplicates
// 5'1: 2, 5'2:46+73M=118
// 5'1: 2, 5'2:51+68M=118
@@ -751,4 +781,125 @@ protected void testMDOrderImpl(final File input, final File expectedOutput, fina
}
+ @Test
+ public void testNonExistantReadGroupInRead() {
+ final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester(true);
+ tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "NotADuplicateGroup");
+ try {
+ tester.runTest();
+ Assert.fail("Should have thrown an exception");
+ } catch (Exception e) {
+ Assert.assertTrue(e instanceof SparkException);
+ Assert.assertTrue(e.getCause() instanceof UserException.HeaderMissingReadGroup);
+ }
+ }
+
+ @Test
+ public void testNoReadGroupInRead() {
+ final MarkDuplicatesSparkTester tester = new MarkDuplicatesSparkTester(true);
+ tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, null);
+
+ try {
+ tester.runTest();
+ Assert.fail("Should have thrown an exception");
+ } catch (Exception e) {
+ Assert.assertTrue(e instanceof SparkException);
+ Assert.assertTrue(e.getCause() instanceof UserException.ReadMissingReadGroup);
+ }
+ }
+
+ @Test(dataProvider = "readNameData")
+ public void testOpticalDuplicatesTiebrokenByPhysicalLocationNotStartPosition(final String readName1, final String readName2) {
+ // This tests the read-name-based tiebreaking code in MarkDuplicates. Since it is ambiguous which read should be
+ // marked as the duplicate when scores match, we break ties by evaluating the read name, for consistency's sake.
+
+ final ReadNameParser parser = new ReadNameParser();
+
+ final PhysicalLocationInt position1 = new PhysicalLocationInt();
+ final PhysicalLocationInt position2 = new PhysicalLocationInt();
+
+ parser.addLocationInformation(readName1, position1);
+ parser.addLocationInformation(readName2, position2);
+
+ final MarkDuplicatesSparkTester tester = getTester();
+ tester.getSamRecordSetBuilder().setReadLength(101);
+ tester.setExpectedOpticalDuplicate(0);
+
+ int compare = position1.tile - position2.tile;
+ if (compare == 0) {
+ compare = (short)position1.x - (short)position2.x;
+ }
+
+ if (compare == 0) {
+ compare = (short)position1.y - (short)position2.y;
+ }
+
+ final boolean isDuplicate = compare < 0;
+
+ // NOTE: these reads are offset slightly but should have the same unclipped start positions
+ tester.addMatePair(readName1, 1,2, 46, false, false, !isDuplicate, !isDuplicate, "6S42M28S", "3S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.addMatePair(readName2, 1,2, 51, false, false, isDuplicate, isDuplicate, "6S42M28S", "8S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.runTest();
+ }
+
+ @DataProvider
+ public Object[][] readNameData(){
+ return new Object[][]{
+ {"RUNID:7:1203:2886:82292", "RUNID:7:1205:3886:16834"},
+
+ {"RUNID:7:1203:2886:16756", "RUNID:7:1205:3886:16756"},
+ {"RUNID:7:1204:2886:16756", "RUNID:7:1205:3886:16756"},
+ {"RUNID:7:1205:2886:16756", "RUNID:7:1205:3886:16756"},
+ {"RUNID:7:1206:2886:16756", "RUNID:7:1205:3886:16756"},
+ {"RUNID:7:1207:2886:16756", "RUNID:7:1205:3886:16756"},
+
+ {"RUNID:7:1203:2886:16756", "RUNID:7:1203:4886:26756"},
+ {"RUNID:7:1203:3886:16756", "RUNID:7:1203:4886:26756"},
+ {"RUNID:7:1203:4886:16756", "RUNID:7:1203:4886:26756"},
+ {"RUNID:7:1203:5886:16756", "RUNID:7:1203:4886:26756"},
+ {"RUNID:7:1203:6886:16756", "RUNID:7:1203:4886:26756"},
+
+ {"RUNID:7:1203:2886:34756", "RUNID:7:1203:2886:36756"},
+ {"RUNID:7:1203:2886:35756", "RUNID:7:1203:2886:36756"},
+ {"RUNID:7:1203:2886:37756", "RUNID:7:1203:2886:36756"},
+ {"RUNID:7:1203:2886:38756", "RUNID:7:1203:2886:36756"},
+
+ // A case added to test tiebreaking that must account for the short casting done in Picard
+ {"HK3T5CCXX160204:3:1112:11586:37067", "HK3T5CCXX160204:3:1112:11586:32144"}
+ };
+ }
+
+ @Test(dataProvider = "readNameData")
+ public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicates(final String readName1, final String readName2) {
+ // This tests the read-name-based tiebreaking code in MarkDuplicates. Since it is ambiguous which read should be
+ // marked as the duplicate when scores match, we break ties by evaluating the read name, for consistency's sake.
+
+ final ReadNameParser parser = new ReadNameParser();
+
+ final PhysicalLocationInt position1 = new PhysicalLocationInt();
+ final PhysicalLocationInt position2 = new PhysicalLocationInt();
+
+ parser.addLocationInformation(readName1, position1);
+ parser.addLocationInformation(readName2, position2);
+
+ final MarkDuplicatesSparkTester tester = getTester();
+ tester.getSamRecordSetBuilder().setReadLength(101);
+ tester.setExpectedOpticalDuplicate(0);
+
+ int compare = position1.tile - position2.tile;
+ if (compare == 0) {
+ compare = (short)position1.x - (short)position2.x;
+ }
+
+ if (compare == 0) {
+ compare = (short)position1.y - (short)position2.y;
+ }
+
+ final boolean isDuplicate = compare < 0;
+
+ tester.addMatePair(readName1, 1,485253, 485253, false, false, !isDuplicate, !isDuplicate, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.addMatePair(readName2, 1,485253, 485253, false, false, isDuplicate, isDuplicate, "59S42M", "42M59S", true, false, false, false, false, DEFAULT_BASE_QUALITY);
+
+ tester.runTest();
+ }
}
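
(Aside on the data provider above: the final HK3T5CCXX160204 pair exists because Picard narrows x and y to `short` before subtracting, so a coordinate above `Short.MAX_VALUE` wraps negative and can invert the tiebreak relative to plain int arithmetic. Below is a self-contained sketch of that arithmetic — not the actual Picard parser — assuming the five-field Illumina read-name layout (`...:lane:tile:x:y`) used by these tests.)

```java
// Sketch of the short-cast tiebreak the tests above exercise (assumed layout, not Picard's parser).
public final class ShortCastTiebreakDemo {
    // Take the last three colon-separated fields as tile, x, y.
    static int[] tileXY(final String readName) {
        final String[] f = readName.split(":");
        return new int[]{ Integer.parseInt(f[f.length - 3]),
                          Integer.parseInt(f[f.length - 2]),
                          Integer.parseInt(f[f.length - 1]) };
    }

    static int compare(final int[] a, final int[] b) {
        int cmp = a[0] - b[0];                            // tile first
        if (cmp == 0) cmp = (short) a[1] - (short) b[1];  // then x, narrowed to short as in Picard
        if (cmp == 0) cmp = (short) a[2] - (short) b[2];  // then y, narrowed to short as in Picard
        return cmp;
    }

    public static void main(final String[] args) {
        final int[] p1 = tileXY("HK3T5CCXX160204:3:1112:11586:37067");
        final int[] p2 = tileXY("HK3T5CCXX160204:3:1112:11586:32144");
        // y = 37067 wraps to -28469 as a short while 32144 fits, so the sign flips:
        System.out.println(compare(p1, p2));   // negative (short-cast order)
        System.out.println(p1[2] - p2[2]);     // positive (plain int order)
    }
}
```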
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATKIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATKIntegrationTest.java
deleted file mode 100644
index 06e8209a5f1..00000000000
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/MarkDuplicatesGATKIntegrationTest.java
+++ /dev/null
@@ -1,273 +0,0 @@
-package org.broadinstitute.hellbender.tools.walkers.markduplicates;
-
-import htsjdk.samtools.*;
-import htsjdk.samtools.util.CloserUtil;
-import htsjdk.samtools.util.CollectionUtil;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.TestUtil;
-import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesTester;
-import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
-import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester;
-import org.testng.Assert;
-import org.testng.annotations.BeforeClass;
-import org.testng.annotations.DataProvider;
-import org.testng.annotations.Test;
-
-import java.io.File;
-import java.util.*;
-
-/**
- * This class defines the individual test cases to run. The actual running of the test is done
- * by MarkDuplicatesTester (see getTester).
- */
-public final class MarkDuplicatesGATKIntegrationTest extends AbstractMarkDuplicatesCommandLineProgramTest {
- protected static String TEST_BASE_NAME = null;
-
- @BeforeClass
- public void setUp() {
- TEST_BASE_NAME = "MarkDuplicatesGATK";
- }
-
- @Override
- protected AbstractMarkDuplicatesTester getTester() {
- return new MarkDuplicatesTester();
- }
-
- @DataProvider(name="strictlyBadBams")
- public Object[][] strictlyBadBams() {
- return new Object[][]{
- {new File(getTestDataDir(), "non_strict.bam")},
- };
- }
-
- @Override
- protected CommandLineProgram getCommandLineProgramInstance() {
- return new MarkDuplicatesGATK();
- }
-
- // NB: this test should return different results than MarkDuplicatesWithMateCigar
-
- /**
- * Test that PG header records are created & chained appropriately (or not created), and that the PG record chains
- * are as expected. MarkDuplicatesGATK is used both to merge and to mark dupes in this case.
- * @param suppressPg If true, do not create PG header record.
- * @param expectedPnVnByReadName For each read, info about the expected chain of PG records.
- */
- @Test(dataProvider = "pgRecordChainingTest")
- public void pgRecordChainingTest(final boolean suppressPg,
- final Map<String, List<ExpectedPnAndVn>> expectedPnVnByReadName) {
- final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
- outputDir.deleteOnExit();
- try {
- // Run MarkDuplicatesGATK, merging the 3 input files, and either enabling or suppressing PG header
- // record creation according to suppressPg.
- final MarkDuplicatesGATK markDuplicatesGATK = new MarkDuplicatesGATK();
- final ArgumentsBuilder args = new ArgumentsBuilder();
- for (int i = 1; i <= 3; ++i) {
- args.add("--input");
- args.add(new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath());
- }
- final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam");
- args.add("--output");
- args.add(outputSam.getAbsolutePath());
- args.add("--"+StandardArgumentDefinitions.METRICS_FILE_LONG_NAME);
- args.add(new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics").getAbsolutePath());
- if (suppressPg) {
- args.add("--PROGRAM_RECORD_ID");
- args.add("null");
- }
-
- // I generally prefer to call doWork rather than invoking the argument parser, but it is necessary
- // in this case to initialize the command line.
- // Note that for the unit test, version won't come through because it is obtained through jar
- // manifest, and unit test doesn't run code from a jar.
- markDuplicatesGATK.instanceMain(args.getArgsArray());
-
- // Read the MarkDuplicatesGATK output file, and get the PG ID for each read. In this particular test,
- // the PG ID should be the same for both ends of a pair.
- final SamReader reader = SamReaderFactory.makeDefault().open(outputSam);
-
- final Map<String, String> pgIdForReadName = new HashMap<>();
- for (final SAMRecord rec : reader) {
- final String existingPgId = pgIdForReadName.get(rec.getReadName());
- final String thisPgId = rec.getStringAttribute(SAMTag.PG.name());
- if (existingPgId != null) {
- Assert.assertEquals(thisPgId, existingPgId);
- } else {
- pgIdForReadName.put(rec.getReadName(), thisPgId);
- }
- }
- final SAMFileHeader header = reader.getFileHeader();
- CloserUtil.close(reader);
-
- // Confirm that for each read name, the chain of PG records contains exactly the number that is expected,
- // and that values in the PG chain are as expected.
- for (final Map.Entry<String, List<ExpectedPnAndVn>> entry : expectedPnVnByReadName.entrySet()) {
- final String readName = entry.getKey();
- final List<ExpectedPnAndVn> expectedList = entry.getValue();
- String pgId = pgIdForReadName.get(readName);
- for (final ExpectedPnAndVn expected : expectedList) {
- final SAMProgramRecord programRecord = header.getProgramRecord(pgId);
- if (expected.expectedPn != null) {
- Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn);
- }
- if (expected.expectedVn != null) {
- Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn);
- }
- pgId = programRecord.getPreviousProgramGroupId();
- }
- Assert.assertNull(pgId);
- }
-
- } finally {
- TestUtil.recursiveDelete(outputDir);
- }
- }
-
- /**
- * Represents an expected PN value and VN value for a PG record. If one of these is null, any value is allowed
- * in the PG record being tested.
- */
- private static class ExpectedPnAndVn {
- final String expectedPn;
- final String expectedVn;
-
- private ExpectedPnAndVn(final String expectedPn, final String expectedVn) {
- this.expectedPn = expectedPn;
- this.expectedVn = expectedVn;
- }
- }
-
- @DataProvider(name = "pgRecordChainingTest")
- public Object[][] pgRecordChainingTestDataProvider() {
- // Two test cases: One in which PG record generation is enabled, the other in which it is turned off.
- final Map<String, List<ExpectedPnAndVn>> withPgMap = new HashMap<>();
- withPgMap.put("1AAXX.1.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, null), new ExpectedPnAndVn(TEST_BASE_NAME, "1"), new ExpectedPnAndVn("bwa", "1")));
- withPgMap.put("1AAXX.2.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, null), new ExpectedPnAndVn("bwa", "2")));
- withPgMap.put("1AAXX.3.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, null)));
-
- final Map<String, List<ExpectedPnAndVn>> suppressPgMap = new HashMap<>();
- suppressPgMap.put("1AAXX.1.1", Arrays.asList(new ExpectedPnAndVn(TEST_BASE_NAME, "1"), new ExpectedPnAndVn("bwa", "1")));
- suppressPgMap.put("1AAXX.2.1", Arrays.asList(new ExpectedPnAndVn("bwa", "2")));
- suppressPgMap.put("1AAXX.3.1", new ArrayList<>(0));
- return new Object[][] {
- { false, withPgMap},
- { true, suppressPgMap}
- };
- }
-
- @Test(dataProvider = "testOpticalDuplicateDetectionDataProvider")
- public void testOpticalDuplicateDetection(final File sam, final File reference, final String outputExtension, final long expectedNumOpticalDuplicates) {
- final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
- outputDir.deleteOnExit();
- final File outputSam = new File(outputDir, TEST_BASE_NAME + outputExtension);
- outputSam.deleteOnExit();
- final File metricsFile = new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics");
- metricsFile.deleteOnExit();
- // Run MarkDuplicatesGATK directly on the single input file; PROGRAM_RECORD_ID is
- // nulled below so no PG record is created.
- final MarkDuplicatesGATK markDuplicatesGATK = new MarkDuplicatesGATK();
- markDuplicatesGATK.setupOpticalDuplicateFinder();
- markDuplicatesGATK.INPUT = CollectionUtil.makeList(sam);
- if (null != reference) {
- markDuplicatesGATK.REFERENCE_SEQUENCE = reference;
- }
- markDuplicatesGATK.OUTPUT = outputSam;
- markDuplicatesGATK.METRICS_FILE = metricsFile;
- markDuplicatesGATK.tmpDir = outputDir.toString();
- // Needed to suppress calling CommandLineProgram.getVersion(), which doesn't work for code not in a jar
- markDuplicatesGATK.PROGRAM_RECORD_ID = null;
- Assert.assertEquals(markDuplicatesGATK.doWork(), null);
- Assert.assertEquals(markDuplicatesGATK.numOpticalDuplicates(), expectedNumOpticalDuplicates);
- }
-
- @Test(dataProvider = "strictlyBadBams")
- public void testLenientStringency(File input) {
- // TODO: This test should really be a PicardCommandLineProgramTest, not here (see #1170).
- ArgumentsBuilder args = new ArgumentsBuilder();
- args.add("--"+StandardArgumentDefinitions.INPUT_SHORT_NAME);
- args.add(input.getPath());
- args.add("--"+"VALIDATION_STRINGENCY");
- args.add(ValidationStringency.LENIENT);
- args.add("--"+StandardArgumentDefinitions.OUTPUT_SHORT_NAME);
-
- File outputFile = createTempFile("markdups", ".bam");
- outputFile.delete();
- args.add(outputFile.getAbsolutePath());
-
- args.add("--"+StandardArgumentDefinitions.METRICS_FILE_LONG_NAME);
- File metricsFile = createTempFile("markdups_metrics", ".txt");
- args.add(metricsFile.getAbsolutePath());
-
- runCommandLine(args.getArgsArray());
- // We don't care about the results, only that the program doesn't crash because of stringency.
- }
-
- @Test(dataProvider = "strictlyBadBams", expectedExceptions = SAMFormatException.class)
- public void testStrictStringency(File input) {
- // TODO: This test should really be a PicardCommandLineProgramTest, not here (see #1170).
- // For these inputs, we expect an exception to be thrown because the bams have some issues.
- ArgumentsBuilder args = new ArgumentsBuilder();
- args.add("--"+StandardArgumentDefinitions.INPUT_SHORT_NAME);
- args.add(input.getPath());
- args.add("--"+"VALIDATION_STRINGENCY");
- args.add(ValidationStringency.STRICT);
- args.add("--"+StandardArgumentDefinitions.OUTPUT_SHORT_NAME);
-
- File outputFile = createTempFile("markdups", ".bam");
- outputFile.delete();
- args.add(outputFile.getAbsolutePath());
-
- args.add("--"+StandardArgumentDefinitions.METRICS_FILE_LONG_NAME);
- File metricsFile = createTempFile("markdups_metrics", ".txt");
- args.add(metricsFile.getAbsolutePath());
-
- runCommandLine(args.getArgsArray());
- // We don't care about the results, only that the program throws an exception because of stringency.
- }
-
- @DataProvider(name="testOpticalDuplicateDetectionDataProvider")
- public Object[][] testOpticalDuplicateDetectionDataProvider() {
- return new Object[][] {
- {new File(TEST_DATA_DIR, "optical_dupes.sam"), null, ".sam", 1L},
- {new File(TEST_DATA_DIR, "optical_dupes.cram"), new File(TEST_DATA_DIR, "optical_dupes.fasta"), ".cram", 1L},
- {new File(TEST_DATA_DIR, "optical_dupes_casava.sam"), null, ".sam", 1L}
- };
- }
-
- @Test(dataProvider = "testDuplicateDetectionDataProvider")
- public void testDuplicateDetection(final File sam, final long expectedNumOpticalDuplicates) {
- final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
- outputDir.deleteOnExit();
- final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam");
- outputSam.deleteOnExit();
- final File metricsFile = new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics");
- metricsFile.deleteOnExit();
- // Run MarkDuplicatesGATK directly on the single input file; PROGRAM_RECORD_ID is
- // nulled below so no PG record is created.
- final MarkDuplicatesGATK markDuplicatesGATK = new MarkDuplicatesGATK();
- markDuplicatesGATK.setupOpticalDuplicateFinder();
- markDuplicatesGATK.INPUT = CollectionUtil.makeList(sam);
- markDuplicatesGATK.OUTPUT = outputSam;
- markDuplicatesGATK.METRICS_FILE = metricsFile;
- markDuplicatesGATK.tmpDir = outputDir.toString();
- // Needed to suppress calling CommandLineProgram.getVersion(), which doesn't work for code not in a jar
- markDuplicatesGATK.PROGRAM_RECORD_ID = null;
- Assert.assertEquals(markDuplicatesGATK.doWork(), null);
- Assert.assertEquals(markDuplicatesGATK.numOpticalDuplicates(), expectedNumOpticalDuplicates);
- }
-
- @DataProvider(name="testDuplicateDetectionDataProvider")
- public Object[][] testDuplicateDetectionDataProvider() {
- return new Object[][] {
- {new File(TEST_DATA_DIR, "markDups.test2reads.bam"), 0L},
- {new File(TEST_DATA_DIR, "example.chr1.1-1K.unmarkedDups.noDups.bam"), 0L},
- {new File(TEST_DATA_DIR, "example.chr1.1-1K.markedDups.bam"), 0L},
- {new File(TEST_DATA_DIR, "example.chr1.1-1K.unmarkedDups.bam"), 0L},
- };
- }
-
-
-}
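
(For reference, the assertion loop deleted above verifies @PG chaining by walking each record's PP pointer back to the root. A minimal sketch of that traversal, using only the htsjdk calls that appear in the deleted test:)

```java
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMProgramRecord;

import java.util.ArrayList;
import java.util.List;

final class PgChainWalker {
    /** Collect the @PG records from pgId back to the root (where PP is absent). */
    static List<SAMProgramRecord> chain(final SAMFileHeader header, String pgId) {
        final List<SAMProgramRecord> records = new ArrayList<>();
        while (pgId != null) {
            final SAMProgramRecord rec = header.getProgramRecord(pgId);
            if (rec == null) {
                break; // dangling PP reference; the deleted test would have failed here
            }
            records.add(rec);
            pgId = rec.getPreviousProgramGroupId();
        }
        return records;
    }
}
```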
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesSparkTester.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesSparkTester.java
deleted file mode 100644
index 888bf81b352..00000000000
--- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesSparkTester.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import htsjdk.samtools.DuplicateScoringStrategy;
-import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
-import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
-import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark;
-import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester;
-
-/**
- * A tester class for {@link MarkDuplicatesSpark}.
- */
-public final class MarkDuplicatesSparkTester extends AbstractMarkDuplicatesTester {
- boolean markUnmappedReads = false;
-
- public MarkDuplicatesSparkTester() {
- this(false);
- }
-
- public MarkDuplicatesSparkTester(boolean markUnmappedReads) {
- super(DuplicateScoringStrategy.ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH);
- this.markUnmappedReads = markUnmappedReads;
- }
-
- @Override
- protected CommandLineProgram getProgram() { return new MarkDuplicatesSpark(); }
-
- @Override
- protected void addArgs() {
- if (!markUnmappedReads) {
- addArg("--" + MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
- }
- }
-}
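
(The try/catch blocks in the new read-group tests exist because MarkDuplicatesSpark surfaces worker failures wrapped in a SparkException, so TestNG's `expectedExceptions` cannot match the cause type directly. A hypothetical helper that factors out that pattern — `assertSparkWrapped` is not part of the test suite:)

```java
import org.apache.spark.SparkException;
import org.testng.Assert;

final class SparkAssertions {
    /** Run the action and assert it fails with a SparkException whose cause has the given type. */
    static void assertSparkWrapped(final Runnable action, final Class<? extends Throwable> causeType) {
        try {
            action.run();
            Assert.fail("Should have thrown an exception");
        } catch (Exception e) {
            Assert.assertTrue(e instanceof SparkException);
            Assert.assertTrue(causeType.isInstance(e.getCause()));
        }
    }
}
```

With such a helper, each new test body would reduce to a single call, e.g. `assertSparkWrapped(tester::runTest, UserException.HeaderMissingReadGroup.class)`.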
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesTester.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesTester.java
deleted file mode 100644
index f7a83b534b7..00000000000
--- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/MarkDuplicatesTester.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import htsjdk.samtools.DuplicateScoringStrategy;
-import htsjdk.samtools.ValidationStringency;
-import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
-import org.broadinstitute.hellbender.tools.walkers.markduplicates.MarkDuplicatesGATK;
-import org.broadinstitute.hellbender.testutils.testers.AbstractMarkDuplicatesTester;
-
-/**
- * This class is an extension of AbstractMarkDuplicatesTester used to test MarkDuplicatesGATK with SAM files generated on the fly.
- * It performs the underlying tests defined by classes such as AbstractMarkDuplicatesCommandLineProgramTest and MarkDuplicatesTest.
- */
-public final class MarkDuplicatesTester extends AbstractMarkDuplicatesTester {
-
- public MarkDuplicatesTester() {
- super(DuplicateScoringStrategy.ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH);
- addArg("--VALIDATION_STRINGENCY", ValidationStringency.LENIENT.name());
- }
-
- @Override
- protected CommandLineProgram getProgram() { return new MarkDuplicatesGATK(); }
-}
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java
index 2795c12ea60..3793fc56b72 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/ReadsKeyUnitTest.java
@@ -1,19 +1,15 @@
package org.broadinstitute.hellbender.utils.read.markduplicates;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils;
-import org.broadinstitute.hellbender.tools.walkers.markduplicates.MarkDuplicatesGATKIntegrationTest;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
-import java.io.File;
import java.util.Map;
import java.util.Random;
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java
index 3aa98b55ab8..e8ea98e9b6b 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/PairedEndsUnitTest.java
@@ -6,10 +6,10 @@
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import picard.sam.markduplicates.util.ReadEnds;
import java.util.Collections;
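
(The last hunk swaps GATK's private copy of ReadEnds for Picard's `picard.sam.markduplicates.util.ReadEnds`, matching the import added to MarkDuplicatesSparkUtils. A quick sanity sketch against the Picard class, assuming `getOrientationByte` keeps its documented semantics — first argument is read 1 on the negative strand:)

```java
import picard.sam.markduplicates.util.ReadEnds;

public final class OrientationCheck {
    public static void main(final String[] args) {
        // +/- pair: read 1 forward, read 2 reverse
        System.out.println(ReadEnds.getOrientationByte(false, true) == ReadEnds.FR); // expected: true
        // -/+ pair: read 1 reverse, read 2 forward
        System.out.println(ReadEnds.getOrientationByte(true, false) == ReadEnds.RF); // expected: true
    }
}
```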