Minor refactoring in MarkDuplicates, IlluminaBasecallsToFastq, and MarkIlluminaAdapters (#1868)

Minor refactoring and code cleanup in MarkDuplicates, IlluminaBasecallsToFastq, and MarkIlluminaAdapters
broadinstitute · Jun 13, 2023 · 802089a · 802089a
1 parent 00dba54
commit 802089a
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 45 deletions.
diff --git a/src/main/java/picard/illumina/IlluminaBasecallsToFastq.java b/src/main/java/picard/illumina/IlluminaBasecallsToFastq.java
@@ -441,23 +441,30 @@ private Writer<ClusterData> buildWriter(final File outputPrefix, final int numSa
         final File[] templateFiles = new File[inputReadStructure.templates.length()];
         final File[] sampleBarcodeFiles = new File[inputReadStructure.sampleBarcodes.length()];
         final File[] molecularBarcodeFiles = new File[inputReadStructure.molecularBarcode.length()];
+        final String templateFormat = "%s.%d.%s";
+        final String sampleBarcodeFormat = "%s.barcode_%d.%s";
+        final String molecularBarcodeFormat = "%s.index_%d.%s";
 
-        for (int i = 0; i < templateFiles.length; ++i) {
-            templateFiles[i] = new File(outputDir, String.format("%s.%d.%s", prefixString, i + 1, suffixString));
-        }
+        // build the output file paths for the template reads
+        writeFileWithFormat(outputDir, templateFormat, prefixString, suffixString, templateFiles);
 
-        for (int i = 0; i < sampleBarcodeFiles.length; ++i) {
-            sampleBarcodeFiles[i] = new File(outputDir, String.format("%s.barcode_%d.%s", prefixString, i + 1, suffixString));
-        }
+        // build the output file paths for the sample barcode reads
+        writeFileWithFormat(outputDir, sampleBarcodeFormat, prefixString, suffixString, sampleBarcodeFiles);
 
-        for (int i = 0; i < molecularBarcodeFiles.length; ++i) {
-            molecularBarcodeFiles[i] = new File(outputDir, String.format("%s.index_%d.%s", prefixString, i + 1, suffixString));
-        }
+        // build the output file paths for the molecular barcode reads
+        writeFileWithFormat(outputDir, molecularBarcodeFormat, prefixString, suffixString, molecularBarcodeFiles);
 
         int queueSize = (MAX_RECORDS_IN_RAM / 2) / numSamples;
         return writerPool.pool(new ClusterToFastqWriter(templateFiles, sampleBarcodeFiles, molecularBarcodeFiles, TRIMMING_QUALITY, adapters), new LinkedBlockingQueue<>(queueSize), (int) (queueSize * 0.5));
     }
-
+    /**
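+     * Fills {@code files} with paths under {@code outputDir}; entry i is named by String.format(format, prefixString, i + 1, suffixString). Only File objects are created here, nothing is written to disk.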
+     */
+    private void writeFileWithFormat(File outputDir, String format,String prefixString, String suffixString, File[] files) {
+        for (int i = 0; i < files.length; ++i) {
+            files[i] = new File(outputDir, String.format(format, prefixString, i + 1, suffixString));
+        }
+    }
     /**
      * Trivial class to avoid converting ClusterData to another type when not sorting outputs.
      */

diff --git a/src/main/java/picard/illumina/MarkIlluminaAdapters.java b/src/main/java/picard/illumina/MarkIlluminaAdapters.java
@@ -140,13 +140,17 @@ public class MarkIlluminaAdapters extends CommandLineProgram {
 
     @Override
     protected String[] customCommandLineValidation() {
-        if ((FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null)) {
+        if (hasEitherAdapter()) {
             return new String[]{"THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must either both be null or both be set."};
         } else {
             return null;
         }
     }
 
+    private boolean hasEitherAdapter() {
+        return (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null);
+    }
+
     @Override
     protected int doWork() {
         IOUtil.assertFileIsReadable(INPUT);
@@ -167,7 +171,7 @@ protected int doWork() {
         {
             final List<AdapterPair> tmp = new ArrayList<AdapterPair>();
             tmp.addAll(ADAPTERS);
-            if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
+            if (hasBothAdapters()) {
                 tmp.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
             }
             adapters = tmp.toArray(new AdapterPair[tmp.size()]);
@@ -246,4 +250,8 @@ protected int doWork() {
         CloserUtil.close(in);
         return 0;
     }
+
+    private boolean hasBothAdapters(){
+        return FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null;
+    }
 }
diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
@@ -709,7 +709,7 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final
      * Goes through the accumulated ReadEndsForMarkDuplicates objects and determines which of them are
      * to be marked as duplicates.
      */
-    public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
+    protected void sortIndicesForDuplicates(final boolean indexOpticalDuplicates){
         final int entryOverhead;
         if (TAG_DUPLICATE_SET_MEMBERS) {
             // Memory requirements for RepresentativeReadIndexer:
@@ -735,6 +735,9 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in
                     maxInMemory,
                     TMP_DIR);
         }
+    }
+    public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
+        sortIndicesForDuplicates(indexOpticalDuplicates);
 
         ReadEndsForMarkDuplicates firstOfNextChunk = null;
         final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<>(200);

diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java
@@ -29,15 +29,9 @@
 import htsjdk.samtools.SAMReadGroupRecord;
 import htsjdk.samtools.SAMRecord;
 import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.SortingCollection;
-import htsjdk.samtools.util.SortingLongCollection;
 import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
-import picard.sam.markduplicates.util.RepresentativeReadIndexerCodec;
-import picard.sam.util.RepresentativeReadIndexer;
 
-import java.io.File;
 import java.util.ArrayList;
-import java.util.Comparator;
 import java.util.List;
 
 /**
@@ -86,32 +80,7 @@ private void validateFlowParameteres() {
      * applicable for flow mode invocation.
      */
     public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
-        final int entryOverhead;
-        if (md.TAG_DUPLICATE_SET_MEMBERS) {
-            // Memory requirements for RepresentativeReadIndexer:
-            // three int entries + overhead: (3 * 4) + 4 = 16 bytes
-            entryOverhead = 16;
-        } else {
-            entryOverhead = SortingLongCollection.SIZEOF;
-        }
-        // Keep this number from getting too large even if there is a huge heap.
-        int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
-        // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
-        if (indexOpticalDuplicates) {
-            maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
-            md.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, md.TMP_DIR.toArray(new File[md.TMP_DIR.size()]));
-        }
-        log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
-        md.duplicateIndexes = new SortingLongCollection(maxInMemory, md.TMP_DIR.toArray(new File[md.TMP_DIR.size()]));
-        if (md.TAG_DUPLICATE_SET_MEMBERS) {
-            final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
-            md.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
-                    representativeIndexCodec,
-                    Comparator.comparing(read -> read.readIndexInFile),
-                    maxInMemory,
-                    md.TMP_DIR);
-        }
-
+        md.sortIndicesForDuplicates(indexOpticalDuplicates);
         // this code does not support pairs at this time
         if ( md.pairSort.iterator().hasNext() ) {
             throw new IllegalArgumentException("Flow based code does not support paired reads");