Skip to content

Commit

Permalink
Updated htsjdk/picard to latest version
Browse files Browse the repository at this point in the history
Direct incorporation of samtools/htsjdk#1249
Added async CollectGridssMetrics incorporating the performance improvements
  • Loading branch information
d-cameron committed Jan 7, 2019
1 parent cbafa85 commit 1243674
Show file tree
Hide file tree
Showing 14 changed files with 4,452 additions and 97 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
bin
/chr12.*
/hg19.*
/.idea
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@
<dependency>
<groupId>com.github.samtools</groupId>
<artifactId>htsjdk</artifactId>
<version>2.16.0</version>
<version>2.18.1</version>
</dependency>
<dependency>
<groupId>com.github.broadinstitute</groupId>
<artifactId>picard</artifactId>
<version>2.18.11</version>
<version>2.18.22</version>
</dependency>
<dependency>
<groupId>org.broadinstitute</groupId>
Expand Down
36 changes: 29 additions & 7 deletions src/main/java/au/edu/wehi/idsv/FullReadExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import au.edu.wehi.idsv.bed.IntervalBed;
import au.edu.wehi.idsv.sam.ChimericAlignment;
import au.edu.wehi.idsv.util.AsyncBufferedIterator;
import au.edu.wehi.idsv.util.FileHelper;
import com.google.common.collect.Range;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.TextCigarCodec;
import gridss.ExtractFullReads;
import htsjdk.samtools.*;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;

import java.io.File;
import java.io.IOException;
Expand All @@ -17,8 +19,8 @@
import static htsjdk.samtools.SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
import static htsjdk.samtools.SAMRecord.NO_ALIGNMENT_START;

public abstract class FullReadExtractor {

public class FullReadExtractor {
private static final Log log = Log.getInstance(FullReadExtractor.class);
private final LinearGenomicCoordinate lgc;
private final IntervalBed bed;
private final boolean extractMates;
Expand Down Expand Up @@ -101,7 +103,6 @@ public boolean shouldExtract(SAMRecord r) {
}
return false;
}
public abstract void extract(File input, File output, int workerThreads) throws IOException;

public boolean shouldExtractMates() {
return extractMates;
Expand All @@ -110,4 +111,25 @@ public boolean shouldExtractMates() {
/**
 * Whether records for reads with a chimeric alignment mapping to an export region
 * should be extracted (see the EXTRACT_SPLITS argument on the calling programs).
 */
public boolean shouldExtractSplits() {
return extractSplits;
}

/**
 * Performs a single linear pass over {@code input}, writing every record for which
 * {@link #shouldExtract} returns true to {@code output}.
 *
 * @param input coordinate ordering is not required for a linear scan
 * @param output destination SAM/BAM; header is copied from the input
 * @param workerThreads not used by this implementation (reading is delegated to a
 *                      single background thread via AsyncBufferedIterator) — kept so
 *                      subclasses/overloads share one signature
 * @throws IOException if the final move of the temporary output file fails
 */
public void extract(File input, File output, int workerThreads) throws IOException {
// Write to a working file first so a crash never leaves a truncated file at the
// final output path; skipped when OUTPUT_TO_TEMP_FILE is disabled.
File tmpOut = gridss.Defaults.OUTPUT_TO_TEMP_FILE ? FileSystemContext.getWorkingFileFor(output) : output;
try (SamReader reader = SamReaderFactory.makeDefault().open(input)) {
SAMFileHeader header = reader.getFileHeader();
// Background-thread read-ahead; labelled with the input name for diagnostics.
try (AsyncBufferedIterator<SAMRecord> asyncIt = new AsyncBufferedIterator<>(reader.iterator(), input.getName())) {
ProgressLoggingSAMRecordIterator it = new ProgressLoggingSAMRecordIterator(asyncIt, new ProgressLogger(log));
// presorted=true: records are emitted in input order and the header sort order is reused
try (SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, tmpOut)) {
while (it.hasNext()) {
SAMRecord r = it.next();
if (shouldExtract(r)) {
writer.addAlignment(r);
}
}
}
}
}
// Atomically publish the result (reference inequality: tmpOut is only a distinct
// object when a working file was actually created above).
if (tmpOut != output) {
FileHelper.move(tmpOut, output, true);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ public IndexedLookupFullReadExtractor(LinearGenomicCoordinate lgc, IntervalBed b
super(lgc, bed, extractMates, extractSplits);
this.regionPaddingSize = regionPaddingSize;
}
@Override
public void extract(File input, File output, int workerThreads) throws IOException {
IntervalBed lookupIntervals = new IntervalBed(getLinearGenomicCoordinate().getDictionary(), getLinearGenomicCoordinate());
for (QueryInterval qi : getRegionBed().asQueryInterval()) {
Expand Down
40 changes: 0 additions & 40 deletions src/main/java/au/edu/wehi/idsv/LinearScanFullReadExtractor.java

This file was deleted.

98 changes: 98 additions & 0 deletions src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package gridss;

import au.edu.wehi.idsv.ReadPairConcordanceMethod;
import gridss.analysis.CollectGridssMetrics;
import gridss.cmdline.CommandLineProgramHelper;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.analysis.MetricAccumulationLevel;
import picard.analysis.SinglePassSamProgram;

import java.io.File;
import java.util.Collection;
import java.util.Set;

/**
 * Runs the GRIDSS metrics programs and full read extraction in a single pass over the
 * input SAM/BAM. Each {@link SinglePassSamProgram} is fed records as the file is read,
 * so the input is parsed only once.
 */
@CommandLineProgramProperties(
        summary = "Merging of CollectGridssMetrics and ExtractFullReads designed for pipelines that already have insert size metrics. "
                + "Combining the two programs removes the unnecessary CPU overhead of parsing the input file multiple times.",
        oneLineSummary = "A \"meta-metrics\" calculating program that produces multiple metrics for the provided SAM/BAM and extracts SV reads.",
        programGroup = gridss.cmdline.programgroups.DataConversion.class
)
public class CollectGridssMetricsAndExtractFullReads extends CollectGridssMetrics {
    // NOTE(review): the doc previously read "Output file containing SV metrics", a
    // copy-paste from a metrics-output argument. ExtractFullReads documents the same
    // field as the BED of regions to export. shortName "MO" is retained so existing
    // command lines keep working — TODO confirm whether it should be "B" to match
    // ExtractFullReads.
    @Argument(shortName="MO", doc="BED file containing regions to export", optional=true)
    public File REGION_BED;
    @Argument(doc = "Extract reads whose mate maps to an export region. " +
            "If the MC tag is not present, only the starting alignment position of the mate is considered. " +
            "When determining whether the mate maps to an export region " +
            "only the primary alignment of that mate is considered. Secondary " +
            "and supplementary alignments are ignored.")
    public boolean EXTRACT_MATES = true;
    @Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
    public boolean EXTRACT_SPLITS = true;
    @Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
            "Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access" +
            "IO requests made. " +
            "This parameter is not used if a linear scan is performed.")
    public int REGION_PADDING_SIZE = 2000;
    @Argument(doc = "File to write the output reads to.")
    public File READ_OUTPUT;
    @Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
            + " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.",
            shortName="THREADS")
    public int WORKER_THREADS = 1;

    public static void main(final String[] args) {
        new CollectGridssMetricsAndExtractFullReads().instanceMainWithExit(args);
    }

    /**
     * Builds the ExtractFullReads program configured from this program's arguments.
     * Shared inputs are copied first, then extraction-specific arguments are applied.
     */
    protected ExtractFullReads getExtractFullReads() {
        ExtractFullReads extract = new ExtractFullReads();
        CommandLineProgramHelper.copyInputs(this, extract);
        extract.OUTPUT = READ_OUTPUT;
        extract.INPUT = this.INPUT;
        extract.REGION_BED = REGION_BED;
        extract.EXTRACT_MATES = EXTRACT_MATES;
        extract.EXTRACT_SPLITS = EXTRACT_SPLITS;
        extract.REGION_PADDING_SIZE = REGION_PADDING_SIZE;
        return extract;
    }

    /**
     * Wraps the read extractor in the ProgramInterface contract used by
     * CollectGridssMetrics to schedule single-pass programs.
     */
    public ProgramInterface createExtractFullReads() {
        return new ProgramInterface() {
            @Override
            public SinglePassSamProgram makeInstance(final String outbase,
                                                     final String outext,
                                                     final File input,
                                                     final File reference,
                                                     final Set<MetricAccumulationLevel> metricAccumulationLevel,
                                                     final File dbSnp,
                                                     final File intervals,
                                                     final File refflat,
                                                     final Set<String> ignoreSequence) {
                // All configuration comes from the enclosing program's arguments;
                // the per-metric parameters supplied here are intentionally unused.
                final ExtractFullReads program = getExtractFullReads();
                return program;
            }
            @Override
            public boolean needsReferenceSequence() {
                return false;
            }
            @Override
            public boolean supportsMetricAccumulationLevel() {
                return false;
            }
        };
    }

    @Override
    public void setProgramsToRun(Collection<ProgramInterface> programsToRun) {
        // Inject SV read extraction alongside the metrics programs selected upstream
        programsToRun.add(createExtractFullReads());
        super.setProgramsToRun(programsToRun);
    }
}
89 changes: 42 additions & 47 deletions src/main/java/gridss/ExtractFullReads.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,32 @@

import au.edu.wehi.idsv.*;
import au.edu.wehi.idsv.bed.IntervalBed;
import au.edu.wehi.idsv.util.AsyncBufferedIterator;
import au.edu.wehi.idsv.util.FileHelper;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.analysis.SinglePassSamProgram;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import sun.reflect.generics.reflectiveObjects.NotImplementedException;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

@CommandLineProgramProperties(
summary = "Extracts reads and read pairs with a mapping to a region to extract.",
oneLineSummary = "Extracts reads and read pairs with a mapping to a region to extract.",
programGroup = picard.cmdline.programgroups.ReadDataManipulationProgramGroup.class
)
public class ExtractFullReads extends CommandLineProgram {
public class ExtractFullReads extends SinglePassSamProgram {
private static final Log log = Log.getInstance(ExtractFullReads.class);
@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file")
public File OUTPUT;
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file")
public File INPUT;
@Argument(shortName = "B", doc = "BED file containing regions to export")
public File REGION_BED;
@Argument(doc = "Extract reads whose mate maps to an export region. " +
Expand All @@ -35,61 +38,53 @@ public class ExtractFullReads extends CommandLineProgram {
public boolean EXTRACT_MATES = true;
@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
public boolean EXTRACT_SPLITS = true;
@Argument(doc = "Process the entire input file. A linear scan of the entire input file is" +
"faster for Small files, files containing many export regions, " +
"and sequencing data with high discordant read pair mapping rate.")
public boolean LINEAR_SCAN = false;
@Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
"Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access" +
"IO requests made. " +
"This parameter is not used if a linear scan is performed.")
@Argument(doc = "Number of bases surrounding each export region to include in the index query. ")
public int REGION_PADDING_SIZE = 2000;

@Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.",
shortName="THREADS")
public int WORKER_THREADS = 1;
private File tmpOut;
private SAMFileWriter writer;
private FullReadExtractor extractor;

public static void main(String[] argv) {
System.exit(new ExtractFullReads().instanceMain(argv));
}

@Override
protected int doWork() {
protected void setup(SAMFileHeader header, File samFile) {
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REGION_BED);
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
throw new IllegalArgumentException("Missing input file sequence dictionary. Ensure the correct @SQ headers exist.");
}
LinearGenomicCoordinate lgc = new PaddedLinearGenomicCoordinate(dict, GenomicProcessingContext.LINEAR_COORDINATE_CHROMOSOME_BUFFER, true);
tmpOut = gridss.Defaults.OUTPUT_TO_TEMP_FILE ? FileSystemContext.getWorkingFileFor(OUTPUT) : OUTPUT;
IntervalBed bed;
try {
SamReader reader = SamReaderFactory.make().open(INPUT);
SAMFileHeader header = reader.getFileHeader();
if (!LINEAR_SCAN) {
if (header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
log.error("INPUT is not coordinate sorted. "
+ "Specify LINEAR_SCAN=true when processing files that are not coordinate sorted.");
return -1;
} else if (!reader.hasIndex()) {
log.error("INPUT does not have an associated index. "
+ "Specify LINEAR_SCAN=true or create an index file.");
return -1;
}
}
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
throw new IllegalArgumentException("Missing input file sequence dictionary. Ensure the correct @SQ headers exist.");
}
LinearGenomicCoordinate lgc = new PaddedLinearGenomicCoordinate(dict, GenomicProcessingContext.LINEAR_COORDINATE_CHROMOSOME_BUFFER, true);
IntervalBed bed = new IntervalBed(dict, lgc, REGION_BED);
bed = new IntervalBed(dict, lgc, REGION_BED);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
extractor = new FullReadExtractor(lgc, bed, EXTRACT_MATES, EXTRACT_SPLITS);
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, tmpOut);
}

FullReadExtractor extractor;
if (LINEAR_SCAN) {
extractor = new LinearScanFullReadExtractor(lgc, bed, EXTRACT_MATES, EXTRACT_SPLITS);
} else {
extractor = new IndexedLookupFullReadExtractor(lgc, bed, EXTRACT_MATES, EXTRACT_SPLITS, REGION_PADDING_SIZE);
/**
 * Single-pass callback: writes the record to the output writer when it matches
 * the extraction criteria. The reference sequence argument is not used.
 */
@Override
protected void acceptRead(SAMRecord rec, ReferenceSequence ref) {
if (extractor.shouldExtract(rec)) {
writer.addAlignment(rec);
}
}

@Override
protected void finish() {
writer.close();
if (tmpOut != OUTPUT) {
try {
FileHelper.move(tmpOut, OUTPUT, true);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
extractor.extract(INPUT, OUTPUT, WORKER_THREADS);
} catch (IOException e) {
throw new RuntimeException(e);
}
return 0;
}
}
Loading

0 comments on commit 1243674

Please sign in to comment.