Skip to content

Commit

Permalink
Minor refactoring in MarkDuplicates, IlluminaBasecallsToFastq, and Ma…
Browse files Browse the repository at this point in the history
…rkIlluminaAdapters (#1868)

Minor refactoring and code cleanup in MarkDuplicates, IlluminaBasecallsToFastq, and MarkIlluminaAdapters
  • Loading branch information
LadDeep authored Jun 13, 2023
1 parent 00dba54 commit 802089a
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 45 deletions.
27 changes: 17 additions & 10 deletions src/main/java/picard/illumina/IlluminaBasecallsToFastq.java
Original file line number Diff line number Diff line change
Expand Up @@ -441,23 +441,30 @@ private Writer<ClusterData> buildWriter(final File outputPrefix, final int numSa
final File[] templateFiles = new File[inputReadStructure.templates.length()];
final File[] sampleBarcodeFiles = new File[inputReadStructure.sampleBarcodes.length()];
final File[] molecularBarcodeFiles = new File[inputReadStructure.molecularBarcode.length()];
final String templateFormat = "%s.%d.%s";
final String sampleBarcodeFormat = "%s.barcode_%d.%s";
final String molecularBarcodeFormat = "%s.index_%d.%s";

for (int i = 0; i < templateFiles.length; ++i) {
templateFiles[i] = new File(outputDir, String.format("%s.%d.%s", prefixString, i + 1, suffixString));
}
// build the output paths for the template files (nothing is written to disk here)
writeFileWithFormat(outputDir, templateFormat, prefixString, suffixString, templateFiles);

for (int i = 0; i < sampleBarcodeFiles.length; ++i) {
sampleBarcodeFiles[i] = new File(outputDir, String.format("%s.barcode_%d.%s", prefixString, i + 1, suffixString));
}
// build the output paths for the sample barcode files (nothing is written to disk here)
writeFileWithFormat(outputDir, sampleBarcodeFormat, prefixString, suffixString, sampleBarcodeFiles);

for (int i = 0; i < molecularBarcodeFiles.length; ++i) {
molecularBarcodeFiles[i] = new File(outputDir, String.format("%s.index_%d.%s", prefixString, i + 1, suffixString));
}
// build the output paths for the molecular barcode files (nothing is written to disk here)
writeFileWithFormat(outputDir, molecularBarcodeFormat, prefixString, suffixString, molecularBarcodeFiles);

int queueSize = (MAX_RECORDS_IN_RAM / 2) / numSamples;
return writerPool.pool(new ClusterToFastqWriter(templateFiles, sampleBarcodeFiles, molecularBarcodeFiles, TRIMMING_QUALITY, adapters), new LinkedBlockingQueue<>(queueSize), (int) (queueSize * 0.5));
}

/**
 * Populates every slot of {@code files} with a {@link File} located under {@code outputDir}.
 * The name for slot {@code i} is {@code String.format(format, prefixString, i + 1, suffixString)},
 * i.e. the numeric component is 1-based. Despite the method name, this only constructs
 * {@code File} path objects; it does not create or write anything on disk.
 *
 * @param outputDir    parent directory handed to each {@code File}
 * @param format       format string receiving (prefix, 1-based index, suffix) in that order
 * @param prefixString first format argument
 * @param suffixString third format argument
 * @param files        array that is filled in place; its length determines how many names are built
 */
private void writeFileWithFormat(File outputDir, String format, String prefixString, String suffixString, File[] files) {
    for (int ordinal = 1; ordinal <= files.length; ordinal++) {
        final String fileName = String.format(format, prefixString, ordinal, suffixString);
        files[ordinal - 1] = new File(outputDir, fileName);
    }
}
/**
* Trivial class to avoid converting ClusterData to another type when not sorting outputs.
*/
Expand Down
12 changes: 10 additions & 2 deletions src/main/java/picard/illumina/MarkIlluminaAdapters.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,17 @@ public class MarkIlluminaAdapters extends CommandLineProgram {

@Override
protected String[] customCommandLineValidation() {
if ((FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null)) {
if (hasEitherAdapter()) {
return new String[]{"THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must either both be null or both be set."};
} else {
return null;
}
}

/**
 * Returns true when exactly one of FIVE_PRIME_ADAPTER and THREE_PRIME_ADAPTER is null,
 * i.e. one adapter was supplied without the other. Returns false when both are null
 * or both are non-null.
 */
private boolean hasEitherAdapter() {
    final boolean fivePrimeMissing = FIVE_PRIME_ADAPTER == null;
    final boolean threePrimeMissing = THREE_PRIME_ADAPTER == null;
    // The two "missing" flags differ exactly when one adapter is set and the other is not.
    return fivePrimeMissing != threePrimeMissing;
}

@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
Expand All @@ -167,7 +171,7 @@ protected int doWork() {
{
final List<AdapterPair> tmp = new ArrayList<AdapterPair>();
tmp.addAll(ADAPTERS);
if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
if (hasBothAdapters()) {
tmp.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
}
adapters = tmp.toArray(new AdapterPair[tmp.size()]);
Expand Down Expand Up @@ -246,4 +250,8 @@ protected int doWork() {
CloserUtil.close(in);
return 0;
}

/**
 * Returns true only when both FIVE_PRIME_ADAPTER and THREE_PRIME_ADAPTER are non-null.
 */
private boolean hasBothAdapters() {
    final boolean anyMissing = FIVE_PRIME_ADAPTER == null || THREE_PRIME_ADAPTER == null;
    return !anyMissing;
}
}
5 changes: 4 additions & 1 deletion src/main/java/picard/sam/markduplicates/MarkDuplicates.java
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final
* Goes through the accumulated ReadEndsForMarkDuplicates objects and determines which of them are
* to be marked as duplicates.
*/
public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
protected void sortIndicesForDuplicates(final boolean indexOpticalDuplicates){
final int entryOverhead;
if (TAG_DUPLICATE_SET_MEMBERS) {
// Memory requirements for RepresentativeReadIndexer:
Expand All @@ -735,6 +735,9 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in
maxInMemory,
TMP_DIR);
}
}
public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
sortIndicesForDuplicates(indexOpticalDuplicates);

ReadEndsForMarkDuplicates firstOfNextChunk = null;
final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<>(200);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,9 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.SortingLongCollection;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
import picard.sam.markduplicates.util.RepresentativeReadIndexerCodec;
import picard.sam.util.RepresentativeReadIndexer;

import java.io.File;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
Expand Down Expand Up @@ -86,32 +80,7 @@ private void validateFlowParameteres() {
* applicable for flow mode invocation.
*/
public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
final int entryOverhead;
if (md.TAG_DUPLICATE_SET_MEMBERS) {
// Memory requirements for RepresentativeReadIndexer:
// three int entries + overhead: (3 * 4) + 4 = 16 bytes
entryOverhead = 16;
} else {
entryOverhead = SortingLongCollection.SIZEOF;
}
// Keep this number from getting too large even if there is a huge heap.
int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
// If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
if (indexOpticalDuplicates) {
maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
md.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, md.TMP_DIR.toArray(new File[md.TMP_DIR.size()]));
}
log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
md.duplicateIndexes = new SortingLongCollection(maxInMemory, md.TMP_DIR.toArray(new File[md.TMP_DIR.size()]));
if (md.TAG_DUPLICATE_SET_MEMBERS) {
final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
md.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
representativeIndexCodec,
Comparator.comparing(read -> read.readIndexInFile),
maxInMemory,
md.TMP_DIR);
}

md.sortIndicesForDuplicates(indexOpticalDuplicates);
// this code does not support pairs at this time
if ( md.pairSort.iterator().hasNext() ) {
throw new IllegalArgumentException("Flow based code does not support paired reads");
Expand Down

0 comments on commit 802089a

Please sign in to comment.