Skip to content

Commit

Permalink
Updated htsjdk/picard to latest version
Browse files Browse the repository at this point in the history
Direct incorporation of samtools/htsjdk#1249
Added async CollectGridssMetrics incorporating the performance improvements
  • Loading branch information
d-cameron committed Jan 7, 2019
1 parent cbafa85 commit 1243674
Show file tree
Hide file tree
Showing 14 changed files with 4,452 additions and 97 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
bin
/chr12.*
/hg19.*
/.idea
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@
<dependency>
<groupId>com.github.samtools</groupId>
<artifactId>htsjdk</artifactId>
<version>2.16.0</version>
<version>2.18.1</version>
</dependency>
<dependency>
<groupId>com.github.broadinstitute</groupId>
<artifactId>picard</artifactId>
<version>2.18.11</version>
<version>2.18.22</version>
</dependency>
<dependency>
<groupId>org.broadinstitute</groupId>
Expand Down
36 changes: 29 additions & 7 deletions src/main/java/au/edu/wehi/idsv/FullReadExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import au.edu.wehi.idsv.bed.IntervalBed;
import au.edu.wehi.idsv.sam.ChimericAlignment;
import au.edu.wehi.idsv.util.AsyncBufferedIterator;
import au.edu.wehi.idsv.util.FileHelper;
import com.google.common.collect.Range;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.TextCigarCodec;
import gridss.ExtractFullReads;
import htsjdk.samtools.*;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;

import java.io.File;
import java.io.IOException;
Expand All @@ -17,8 +19,8 @@
import static htsjdk.samtools.SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
import static htsjdk.samtools.SAMRecord.NO_ALIGNMENT_START;

public abstract class FullReadExtractor {

public class FullReadExtractor {
private static final Log log = Log.getInstance(FullReadExtractor.class);
private final LinearGenomicCoordinate lgc;
private final IntervalBed bed;
private final boolean extractMates;
Expand Down Expand Up @@ -101,7 +103,6 @@ public boolean shouldExtract(SAMRecord r) {
}
return false;
}
public abstract void extract(File input, File output, int workerThreads) throws IOException;

public boolean shouldExtractMates() {
return extractMates;
Expand All @@ -110,4 +111,25 @@ public boolean shouldExtractMates() {
/**
 * Whether records for reads with a chimeric alignment mapping to an export region
 * should be extracted (see the EXTRACT_SPLITS argument on the calling programs).
 */
public boolean shouldExtractSplits() {
return extractSplits;
}

/**
 * Performs a single linear pass over {@code input}, writing every record for which
 * {@link #shouldExtract} returns true to {@code output}.
 *
 * @param input coordinate ordering is not required for a linear scan
 * @param output destination SAM/BAM; header is copied from the input
 * @param workerThreads not used by this implementation (reading is delegated to a
 *                      single background thread via AsyncBufferedIterator) — kept so
 *                      subclasses/overloads share one signature
 * @throws IOException if the final move of the temporary output file fails
 */
public void extract(File input, File output, int workerThreads) throws IOException {
// Write to a working file first so a crash never leaves a truncated file at the
// final output path; skipped when OUTPUT_TO_TEMP_FILE is disabled.
File tmpOut = gridss.Defaults.OUTPUT_TO_TEMP_FILE ? FileSystemContext.getWorkingFileFor(output) : output;
try (SamReader reader = SamReaderFactory.makeDefault().open(input)) {
SAMFileHeader header = reader.getFileHeader();
// Background-thread read-ahead; labelled with the input name for diagnostics.
try (AsyncBufferedIterator<SAMRecord> asyncIt = new AsyncBufferedIterator<>(reader.iterator(), input.getName())) {
ProgressLoggingSAMRecordIterator it = new ProgressLoggingSAMRecordIterator(asyncIt, new ProgressLogger(log));
// presorted=true: records are emitted in input order and the header sort order is reused
try (SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, tmpOut)) {
while (it.hasNext()) {
SAMRecord r = it.next();
if (shouldExtract(r)) {
writer.addAlignment(r);
}
}
}
}
}
// Atomically publish the result (reference inequality: tmpOut is only a distinct
// object when a working file was actually created above).
if (tmpOut != output) {
FileHelper.move(tmpOut, output, true);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ public IndexedLookupFullReadExtractor(LinearGenomicCoordinate lgc, IntervalBed b
super(lgc, bed, extractMates, extractSplits);
this.regionPaddingSize = regionPaddingSize;
}
@Override
public void extract(File input, File output, int workerThreads) throws IOException {
IntervalBed lookupIntervals = new IntervalBed(getLinearGenomicCoordinate().getDictionary(), getLinearGenomicCoordinate());
for (QueryInterval qi : getRegionBed().asQueryInterval()) {
Expand Down
40 changes: 0 additions & 40 deletions src/main/java/au/edu/wehi/idsv/LinearScanFullReadExtractor.java

This file was deleted.

98 changes: 98 additions & 0 deletions src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package gridss;

import au.edu.wehi.idsv.ReadPairConcordanceMethod;
import gridss.analysis.CollectGridssMetrics;
import gridss.cmdline.CommandLineProgramHelper;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.analysis.MetricAccumulationLevel;
import picard.analysis.SinglePassSamProgram;

import java.io.File;
import java.util.Collection;
import java.util.Set;

/**
 * Runs the GRIDSS metrics programs and full read extraction in a single pass over the
 * input SAM/BAM. Each {@link SinglePassSamProgram} is fed records as the file is read,
 * so the input is parsed only once.
 */
@CommandLineProgramProperties(
        summary = "Merging of CollectGridssMetrics and ExtractFullReads designed for pipelines that already have insert size metrics. "
                + "Combining the two programs removes the unnecessary CPU overhead of parsing the input file multiple times.",
        oneLineSummary = "A \"meta-metrics\" calculating program that produces multiple metrics for the provided SAM/BAM and extracts SV reads.",
        programGroup = gridss.cmdline.programgroups.DataConversion.class
)
public class CollectGridssMetricsAndExtractFullReads extends CollectGridssMetrics {
    // NOTE(review): the doc previously read "Output file containing SV metrics", a
    // copy-paste from a metrics-output argument. ExtractFullReads documents the same
    // field as the BED of regions to export. shortName "MO" is retained so existing
    // command lines keep working — TODO confirm whether it should be "B" to match
    // ExtractFullReads.
    @Argument(shortName="MO", doc="BED file containing regions to export", optional=true)
    public File REGION_BED;
    @Argument(doc = "Extract reads whose mate maps to an export region. " +
            "If the MC tag is not present, only the starting alignment position of the mate is considered. " +
            "When determining whether the mate maps to an export region " +
            "only the primary alignment of that mate is considered. Secondary " +
            "and supplementary alignments are ignored.")
    public boolean EXTRACT_MATES = true;
    @Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
    public boolean EXTRACT_SPLITS = true;
    @Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
            "Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access" +
            "IO requests made. " +
            "This parameter is not used if a linear scan is performed.")
    public int REGION_PADDING_SIZE = 2000;
    @Argument(doc = "File to write the output reads to.")
    public File READ_OUTPUT;
    @Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
            + " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.",
            shortName="THREADS")
    public int WORKER_THREADS = 1;

    public static void main(final String[] args) {
        new CollectGridssMetricsAndExtractFullReads().instanceMainWithExit(args);
    }

    /**
     * Builds the ExtractFullReads program configured from this program's arguments.
     * Shared inputs are copied first, then extraction-specific arguments are applied.
     */
    protected ExtractFullReads getExtractFullReads() {
        ExtractFullReads extract = new ExtractFullReads();
        CommandLineProgramHelper.copyInputs(this, extract);
        extract.OUTPUT = READ_OUTPUT;
        extract.INPUT = this.INPUT;
        extract.REGION_BED = REGION_BED;
        extract.EXTRACT_MATES = EXTRACT_MATES;
        extract.EXTRACT_SPLITS = EXTRACT_SPLITS;
        extract.REGION_PADDING_SIZE = REGION_PADDING_SIZE;
        return extract;
    }

    /**
     * Wraps the read extractor in the ProgramInterface contract used by
     * CollectGridssMetrics to schedule single-pass programs.
     */
    public ProgramInterface createExtractFullReads() {
        return new ProgramInterface() {
            @Override
            public SinglePassSamProgram makeInstance(final String outbase,
                                                     final String outext,
                                                     final File input,
                                                     final File reference,
                                                     final Set<MetricAccumulationLevel> metricAccumulationLevel,
                                                     final File dbSnp,
                                                     final File intervals,
                                                     final File refflat,
                                                     final Set<String> ignoreSequence) {
                // All configuration comes from the enclosing program's arguments;
                // the per-metric parameters supplied here are intentionally unused.
                final ExtractFullReads program = getExtractFullReads();
                return program;
            }
            @Override
            public boolean needsReferenceSequence() {
                return false;
            }
            @Override
            public boolean supportsMetricAccumulationLevel() {
                return false;
            }
        };
    }

    @Override
    public void setProgramsToRun(Collection<ProgramInterface> programsToRun) {
        // Inject SV read extraction alongside the metrics programs selected upstream
        programsToRun.add(createExtractFullReads());
        super.setProgramsToRun(programsToRun);
    }
}
89 changes: 42 additions & 47 deletions src/main/java/gridss/ExtractFullReads.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,32 @@

import au.edu.wehi.idsv.*;
import au.edu.wehi.idsv.bed.IntervalBed;
import au.edu.wehi.idsv.util.AsyncBufferedIterator;
import au.edu.wehi.idsv.util.FileHelper;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.analysis.SinglePassSamProgram;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import sun.reflect.generics.reflectiveObjects.NotImplementedException;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

@CommandLineProgramProperties(
summary = "Extracts reads and read pairs with a mapping to a region to extract.",
oneLineSummary = "Extracts reads and read pairs with a mapping to a region to extract.",
programGroup = picard.cmdline.programgroups.ReadDataManipulationProgramGroup.class
)
public class ExtractFullReads extends CommandLineProgram {
public class ExtractFullReads extends SinglePassSamProgram {
private static final Log log = Log.getInstance(ExtractFullReads.class);
@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file")
public File OUTPUT;
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file")
public File INPUT;
@Argument(shortName = "B", doc = "BED file containing regions to export")
public File REGION_BED;
@Argument(doc = "Extract reads whose mate maps to an export region. " +
Expand All @@ -35,61 +38,53 @@ public class ExtractFullReads extends CommandLineProgram {
public boolean EXTRACT_MATES = true;
@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
public boolean EXTRACT_SPLITS = true;
@Argument(doc = "Process the entire input file. A linear scan of the entire input file is" +
"faster for Small files, files containing many export regions, " +
"and sequencing data with high discordant read pair mapping rate.")
public boolean LINEAR_SCAN = false;
@Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
"Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access" +
"IO requests made. " +
"This parameter is not used if a linear scan is performed.")
@Argument(doc = "Number of bases surrounding each export region to include in the index query. ")
public int REGION_PADDING_SIZE = 2000;

@Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.",
shortName="THREADS")
public int WORKER_THREADS = 1;
private File tmpOut;
private SAMFileWriter writer;
private FullReadExtractor extractor;

public static void main(String[] argv) {
System.exit(new ExtractFullReads().instanceMain(argv));
}

@Override
protected int doWork() {
protected void setup(SAMFileHeader header, File samFile) {
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REGION_BED);
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
throw new IllegalArgumentException("Missing input file sequence dictionary. Ensure the correct @SQ headers exist.");
}
LinearGenomicCoordinate lgc = new PaddedLinearGenomicCoordinate(dict, GenomicProcessingContext.LINEAR_COORDINATE_CHROMOSOME_BUFFER, true);
tmpOut = gridss.Defaults.OUTPUT_TO_TEMP_FILE ? FileSystemContext.getWorkingFileFor(OUTPUT) : OUTPUT;
IntervalBed bed;
try {
SamReader reader = SamReaderFactory.make().open(INPUT);
SAMFileHeader header = reader.getFileHeader();
if (!LINEAR_SCAN) {
if (header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
log.error("INPUT is not coordinate sorted. "
+ "Specify LINEAR_SCAN=true when processing files that are not coordinate sorted.");
return -1;
} else if (!reader.hasIndex()) {
log.error("INPUT does not have an associated index. "
+ "Specify LINEAR_SCAN=true or create an index file.");
return -1;
}
}
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
throw new IllegalArgumentException("Missing input file sequence dictionary. Ensure the correct @SQ headers exist.");
}
LinearGenomicCoordinate lgc = new PaddedLinearGenomicCoordinate(dict, GenomicProcessingContext.LINEAR_COORDINATE_CHROMOSOME_BUFFER, true);
IntervalBed bed = new IntervalBed(dict, lgc, REGION_BED);
bed = new IntervalBed(dict, lgc, REGION_BED);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
extractor = new FullReadExtractor(lgc, bed, EXTRACT_MATES, EXTRACT_SPLITS);
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, tmpOut);
}

FullReadExtractor extractor;
if (LINEAR_SCAN) {
extractor = new LinearScanFullReadExtractor(lgc, bed, EXTRACT_MATES, EXTRACT_SPLITS);
} else {
extractor = new IndexedLookupFullReadExtractor(lgc, bed, EXTRACT_MATES, EXTRACT_SPLITS, REGION_PADDING_SIZE);
/**
 * Single-pass callback: writes the record to the output writer when it matches
 * the extraction criteria. The reference sequence argument is not used.
 */
@Override
protected void acceptRead(SAMRecord rec, ReferenceSequence ref) {
if (extractor.shouldExtract(rec)) {
writer.addAlignment(rec);
}
}

@Override
protected void finish() {
writer.close();
if (tmpOut != OUTPUT) {
try {
FileHelper.move(tmpOut, OUTPUT, true);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
extractor.extract(INPUT, OUTPUT, WORKER_THREADS);
} catch (IOException e) {
throw new RuntimeException(e);
}
return 0;
}
}
Loading

0 comments on commit 1243674

Please sign in to comment.