Skip to content

Commit

Permalink
Rename SetNmAndUqTags and fix #622 (MergeBamAlignment MD tag). (#636)
Browse files Browse the repository at this point in the history
1. Rename SetNmAndUqTags to SetNmMDAndUqTags.
2. Fixes: #622
  • Loading branch information
nh13 authored Sep 19, 2016
1 parent e7d6c8f commit 915ffa7
Show file tree
Hide file tree
Showing 14 changed files with 275 additions and 137 deletions.
30 changes: 23 additions & 7 deletions src/main/java/picard/sam/AbstractAlignmentMerger.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.FilteringIterator;
import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CigarUtil;
Expand Down Expand Up @@ -73,7 +72,7 @@
* 2. Merge the alignment information and public tags ONLY from the aligned SAMRecords
* 3. Do additional modifications -- handle clipping, trimming, etc.
* 4. Fix up mate information on paired reads
* 5. Do a final calculation of the NM and UQ tags.
* 5. Do a final calculation of the NM and UQ tags (coordinate sorted only)
* 6. Write the records to the output file.
* <p/>
* Concrete subclasses which extend AbstractAlignmentMerger should implement getQueryNameSortedAlignedRecords.
Expand Down Expand Up @@ -477,7 +476,7 @@ SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator()

for (final SAMRecord rec : sink.sorter) {
if (!rec.getReadUnmappedFlag() && refSeq != null) {
fixNMandUQ(rec, refSeq, bisulfiteSequence);
fixNmMdAndUq(rec, refSeq, bisulfiteSequence);
}
writer.addAlignment(rec);
finalProgress.record(rec);
Expand All @@ -490,7 +489,7 @@ SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator()
log.info("Wrote " + aligned + " alignment records and " + (alignedReadsOnly ? 0 : unmapped) + " unmapped reads.");
}

/** Calculates and sets the NM and UQ tags from the record and the reference
/** Calculates and sets the NM, MD, and and UQ tags from the record and the reference
*
* @param record the record to be fixed
* @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
Expand All @@ -499,10 +498,13 @@ SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator()
*
* No return value, modifies the provided record.
*/
public static void fixNMandUQ(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
final byte[] referenceBases = refSeqWalker.get(refSeqWalker.getSequenceDictionary().getSequenceIndex(record.getReferenceName())).getBases();
record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));

// only recalculate NM if it isn't bisulfite, since it needs to be treated specially below
SequenceUtil.calculateMdAndNmTags(record, referenceBases, true, !isBisulfiteSequence);
if (isBisulfiteSequence) { // recalculate the NM tag for bisulfite data
record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));
}
if (record.getBaseQualities() != SAMRecord.NULL_QUALS) {
record.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(record, referenceBases, 0, isBisulfiteSequence));
}
Expand Down Expand Up @@ -605,11 +607,13 @@ protected void clipForOverlappingReads(final SAMRecord read1, final SAMRecord re
if (posDiff > 0) {
CigarUtil.softClip3PrimeEndOfRead(pos, Math.min(pos.getReadLength(),
pos.getReadLength() - posDiff + 1));
removeNmMdAndUqTags(pos); // these tags are now invalid!
}

if (negDiff > 0) {
CigarUtil.softClip3PrimeEndOfRead(neg, Math.min(neg.getReadLength(),
neg.getReadLength() - negDiff + 1));
removeNmMdAndUqTags(neg); // these tags are now invalid!
}

}
Expand Down Expand Up @@ -736,6 +740,7 @@ protected void updateCigarForTrimmedOrClippedBases(final SAMRecord rec, final SA
// If the adapter sequence is marked and clipAdapter is true, clip it
if (this.clipAdapters && rec.getAttribute(ReservedTagConstants.XT) != null) {
CigarUtil.softClip3PrimeEndOfRead(rec, rec.getIntegerAttribute(ReservedTagConstants.XT));
removeNmMdAndUqTags(rec); // these tags are now invalid!
}
}

Expand Down Expand Up @@ -797,4 +802,15 @@ public void setIncludeSecondaryAlignments(final boolean includeSecondaryAlignmen
public void close() {
CloserUtil.close(this.refSeq);
}


/** Removes the NM, MD, and UQ tags. This is useful if we modify the read and are not able to recompute these tags,
* for example when no reference is available.
* @param rec the record to modify.
*/
private static void removeNmMdAndUqTags(final SAMRecord rec) {
rec.setAttribute(SAMTag.NM.name(), null);
rec.setAttribute(SAMTag.MD.name(), null);
rec.setAttribute(SAMTag.UQ.name(), null);
}
}
4 changes: 3 additions & 1 deletion src/main/java/picard/sam/MergeBamAlignment.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ public class MergeBamAlignment extends CommandLineProgram {
" The purpose of this tool is to use information from the unmapped BAM to fix up aligner output. The resulting file will be valid " +
"for use by other Picard tools. For simple BAM file merges, use MergeSamFiles. Note that MergeBamAlignment expects to " +
"find a sequence dictionary in the same directory as REFERENCE_SEQUENCE and expects it " +
"to have the same base name as the reference FASTA except with the extension \".dict\". " +
"to have the same base name as the reference FASTA except with the extension \".dict\". If " +
"the output sort order is not coordinate, then reads that are clipped due to adapters or overlapping " +
"will not contain the NM, MD, or UQ tags." +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar MergeBamAlignment \\<br /> " +
Expand Down
79 changes: 9 additions & 70 deletions src/main/java/picard/sam/SetNmAndUqTags.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,94 +21,33 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.sam;

import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;

import java.io.File;
import java.util.stream.StreamSupport;

/**
* @author Yossi Farjoun
*/
@Deprecated
@CommandLineProgramProperties(
usage = SetNmAndUqTags.USAGE_SUMMARY + SetNmAndUqTags.USAGE_DETAILS,
usage = SetNmAndUqTags.USAGE_SUMMARY + SetNmMdAndUqTags.USAGE_DETAILS,
usageShort = SetNmAndUqTags.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class SetNmAndUqTags extends CommandLineProgram {
static final String USAGE_SUMMARY = "Fixes the UQ and NM tags in a SAM file. ";
static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM and UQ tags by comparing with the reference."+
public class SetNmAndUqTags extends SetNmMdAndUqTags {
static final String USAGE_SUMMARY = "DEPRECATED: Use SetNmMdAndUqTags instead.";
static final String USAGE_DETAILS = "DEPRECATED: Use SetNmMdAndUqTags instead. This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+
"<br />" +
"This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
"these tags then.<br />"+
"This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
"these tags then.<br />"+
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar SetNmAndUqTags \\<br />" +
" I=sorted.bam \\<br />" +
" O=fixed.bam \\<br />"+
"</pre>" +
"<hr />";
@Option(doc = "The BAM or SAM file to fix.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;

@Option(doc = "The fixed BAM or SAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME)
public File OUTPUT;

@Option(doc = "Whether the file contains bisulfite sequence (used when calculating the NM tag).")
public boolean IS_BISULFITE_SEQUENCE = false;

@Override
protected String[] customCommandLineValidation() {
if (REFERENCE_SEQUENCE == null) {
return new String[]{"Must have a non-null REFERENCE_SEQUENCE"};
}
return super.customCommandLineValidation();
}

private final Log log = Log.getInstance(SetNmAndUqTags.class);

public static void main(final String[] argv) {
new SetNmAndUqTags().instanceMainWithExit(argv);
}

protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new SAMException("Input must be coordinate-sorted for this program to run. Found: " + reader.getFileHeader().getSortOrder());
}

final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT);
writer.setProgressLogger(
new ProgressLogger(log, (int) 1e7, "Wrote", "records"));

final ReferenceSequenceFileWalker refSeq = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

StreamSupport.stream(reader.spliterator(),false)
.peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNMandUQ(rec, refSeq, IS_BISULFITE_SEQUENCE);})
.forEach(writer::addAlignment);

CloserUtil.close(reader);
writer.close();
return 0;
}
}

113 changes: 113 additions & 0 deletions src/main/java/picard/sam/SetNmMdAndUqTags.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;

import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;

import java.io.File;
import java.util.stream.StreamSupport;

/**
* @author Yossi Farjoun
*/
@CommandLineProgramProperties(
usage = SetNmMdAndUqTags.USAGE_SUMMARY + SetNmMdAndUqTags.USAGE_DETAILS,
usageShort = SetNmMdAndUqTags.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class SetNmMdAndUqTags extends CommandLineProgram {
static final String USAGE_SUMMARY = "Fixes the NM, MD, and UQ tags in a SAM file. ";
static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+
"<br />" +
"This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
"these tags then.<br />"+
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar SetNmMDAndUqTags \\<br />" +
" I=sorted.bam \\<br />" +
" O=fixed.bam \\<br />"+
"</pre>" +
"<hr />";
@Option(doc = "The BAM or SAM file to fix.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;

@Option(doc = "The fixed BAM or SAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME)
public File OUTPUT;

@Option(doc = "Whether the file contains bisulfite sequence (used when calculating the NM tag).")
public boolean IS_BISULFITE_SEQUENCE = false;

@Override
protected String[] customCommandLineValidation() {
if (REFERENCE_SEQUENCE == null) {
return new String[]{"Must have a non-null REFERENCE_SEQUENCE"};
}
return super.customCommandLineValidation();
}

private final Log log = Log.getInstance(SetNmMdAndUqTags.class);

public static void main(final String[] argv) {
new SetNmMdAndUqTags().instanceMainWithExit(argv);
}

protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new SAMException("Input must be coordinate-sorted for this program to run. Found: " + reader.getFileHeader().getSortOrder());
}

final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT);
writer.setProgressLogger(
new ProgressLogger(log, (int) 1e7, "Wrote", "records"));

final ReferenceSequenceFileWalker refSeq = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

StreamSupport.stream(reader.spliterator(),false)
.peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNmMdAndUq(rec, refSeq, IS_BISULFITE_SEQUENCE);})
.forEach(writer::addAlignment);

CloserUtil.close(reader);
writer.close();
return 0;
}
}
Loading

0 comments on commit 915ffa7

Please sign in to comment.