Skip to content

Commit

Permalink
Rename SetNmAndUqTags and fix #622 (MergeBamAlignment MD tag).
Browse files Browse the repository at this point in the history
1. Rename SetNmAndUqTags to SetNmMDAndUqTags.
WIP
2. Fixes: #622
  • Loading branch information
nh13 committed Aug 20, 2016
1 parent e478eb5 commit 9d2018a
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 43 deletions.
30 changes: 8 additions & 22 deletions src/main/java/picard/sam/AbstractAlignmentMerger.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.FilteringIterator;
import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CigarUtil;
Expand Down Expand Up @@ -477,7 +476,7 @@ SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator()

for (final SAMRecord rec : sink.sorter) {
if (!rec.getReadUnmappedFlag() && refSeq != null) {
fixNMandUQ(rec, refSeq, bisulfiteSequence);
fixNmMdAndUQ(rec, refSeq, bisulfiteSequence);
}
writer.addAlignment(rec);
finalProgress.record(rec);
Expand All @@ -490,7 +489,7 @@ SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator()
log.info("Wrote " + aligned + " alignment records and " + (alignedReadsOnly ? 0 : unmapped) + " unmapped reads.");
}

/** Calculates and sets the NM and UQ tags from the record and the reference
/** Calculates and sets the NM, MD, and and UQ tags from the record and the reference
*
* @param record the record to be fixed
* @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
Expand All @@ -499,10 +498,13 @@ SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator()
*
* No return value, modifies the provided record.
*/
public static void fixNMandUQ(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
public static void fixNmMdAndUQ(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
final byte[] referenceBases = refSeqWalker.get(refSeqWalker.getSequenceDictionary().getSequenceIndex(record.getReferenceName())).getBases();
record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));

// only recalculate NM if it isn't bisulfite, since it needs to be treated specially below
SequenceUtil.calculateMdAndNmTags(record, referenceBases, true, !isBisulfiteSequence);
if (isBisulfiteSequence) { // recalculate the NM tag for bisulfite data
record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));
}
if (record.getBaseQualities() != SAMRecord.NULL_QUALS) {
record.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(record, referenceBases, 0, isBisulfiteSequence));
}
Expand Down Expand Up @@ -584,20 +586,6 @@ private void transferAlignmentInfoToPairedRead(final SAMRecord firstUnaligned, f
}
}

/**
* Updates the MD and NM tags if present.
* @param rec
*/
private void calculateMdAndNmTags(final SAMRecord rec) {
// update NM and MD
final boolean calculateNm = rec.getIntegerAttribute(ReservedTagConstants.NM) != null;
final boolean calculateMd = rec.getStringAttribute(SAMTag.MD.name()) != null;
if (calculateNm || calculateMd) {
final byte[] referenceBases = this.refSeq.get(this.refSeq.getSequenceDictionary().getSequenceIndex(rec.getReferenceName())).getBases();
SequenceUtil.calculateMdAndNmTags(rec, referenceBases, calculateNm, calculateMd);
}
}

/**
* Checks to see whether the ends of the reads overlap and soft clips reads
* them if necessary.
Expand All @@ -619,13 +607,11 @@ protected void clipForOverlappingReads(final SAMRecord read1, final SAMRecord re
if (posDiff > 0) {
CigarUtil.softClip3PrimeEndOfRead(pos, Math.min(pos.getReadLength(),
pos.getReadLength() - posDiff + 1));
calculateMdAndNmTags(pos);
}

if (negDiff > 0) {
CigarUtil.softClip3PrimeEndOfRead(neg, Math.min(neg.getReadLength(),
neg.getReadLength() - negDiff + 1));
calculateMdAndNmTags(neg);
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
Expand All @@ -48,19 +47,19 @@
* @author Yossi Farjoun
*/
@CommandLineProgramProperties(
usage = SetNmAndUqTags.USAGE_SUMMARY + SetNmAndUqTags.USAGE_DETAILS,
usageShort = SetNmAndUqTags.USAGE_SUMMARY,
usage = SetNmMDAndUqTags.USAGE_SUMMARY + SetNmMDAndUqTags.USAGE_DETAILS,
usageShort = SetNmMDAndUqTags.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class SetNmAndUqTags extends CommandLineProgram {
static final String USAGE_SUMMARY = "Fixes the UQ and NM tags in a SAM file. ";
static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM and UQ tags by comparing with the reference."+
public class SetNmMDAndUqTags extends CommandLineProgram {
static final String USAGE_SUMMARY = "Fixes the NM, MD, and UQ tags in a SAM file. ";
static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+
"<br />" +
"This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
"these tags then.<br />"+
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar SetNmAndUqTags \\<br />" +
"java -jar picard.jar SetNmMDAndUqTags \\<br />" +
" I=sorted.bam \\<br />" +
" O=fixed.bam \\<br />"+
"</pre>" +
Expand All @@ -82,10 +81,10 @@ protected String[] customCommandLineValidation() {
return super.customCommandLineValidation();
}

private final Log log = Log.getInstance(SetNmAndUqTags.class);
private final Log log = Log.getInstance(SetNmMDAndUqTags.class);

public static void main(final String[] argv) {
new SetNmAndUqTags().instanceMainWithExit(argv);
new SetNmMDAndUqTags().instanceMainWithExit(argv);
}

protected int doWork() {
Expand All @@ -104,7 +103,7 @@ protected int doWork() {
final ReferenceSequenceFileWalker refSeq = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

StreamSupport.stream(reader.spliterator(),false)
.peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNMandUQ(rec, refSeq, IS_BISULFITE_SEQUENCE);})
.peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNmMdAndUQ(rec, refSeq, IS_BISULFITE_SEQUENCE);})
.forEach(writer::addAlignment);

CloserUtil.close(reader);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import java.io.File;
import java.io.IOException;

public class SetNmAndUqTagsTest {
public class SetNmMDAndUqTagsTest {

private static final File fasta = new File("testdata/picard/sam/merger.fasta");
@DataProvider(name="filesToFix")
Expand Down Expand Up @@ -64,7 +64,7 @@ private void fixFile(final File input, final File output, final File reference)
"OUTPUT="+output,
"REFERENCE_SEQUENCE="+reference };

SetNmAndUqTags setNmAndUqTags = new SetNmAndUqTags();
Assert.assertEquals(setNmAndUqTags.instanceMain(args), 0, "Fix did not succeed");
SetNmMDAndUqTags setNmMDAndUqTags = new SetNmMDAndUqTags();
Assert.assertEquals(setNmMDAndUqTags.instanceMain(args), 0, "Fix did not succeed");
}
}
4 changes: 2 additions & 2 deletions testdata/picard/sam/MergeBamAlignment/cliptest.aligned.sam
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ FF:80:175 65 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTT
FF:80:175 129 chr1 80 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
FR_clip:100:155:56:0 97 chr1 100 20 76M chr1 80 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
FR_clip:100:155:56:0 145 chr1 80 20 76M chr1 100 0 TTAGAGTACGTTAACACTCCATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAA GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
FR_noclip:100:575:76:1 97 chr1 100 20 76M chr1 500 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
FR_noclip:100:575:76:1 145 chr1 500 20 76M chr1 100 0 TTTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
FR_noclip:100:575:75C0:1 97 chr1 100 20 76M chr1 500 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAC ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
FR_noclip:100:575:75C0:1 145 chr1 500 20 76M chr1 100 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
RF:100:575 81 chr1 100 20 76M chr1 500 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
RF:100:575 161 chr1 500 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
RR:80:175 113 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
Expand Down
4 changes: 2 additions & 2 deletions testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.sam
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ FF:80:175 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGA
FF:80:175 141 * 0 0 * * 0 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
FR_clip:100:155:56:0 77 * 0 0 * * 0 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
FR_clip:100:155:56:0 141 * 0 0 * * 0 0 TTAAAGGTTTGTTAATATTTGCATCTGTACGATCGTAAGAGGGCTTCAGCATGAATGGAGTGTTAACGTACTCTAA GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
FR_noclip:100:575:76:1 77 * 0 0 * * 0 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
FR_noclip:100:575:76:1 141 * 0 0 * * 0 0 GTATTGTTTTTTTTTTGCCCTTAAAGGTTTGTTAATATTTGCATCTGTACGATCGTAAGAGGGCTTCAGCATGAAA GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
FR_noclip:100:575:75C0:1 77 * 0 0 * * 0 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
FR_noclip:100:575:75C0:1 141 * 0 0 * * 0 0 CTATTGTTTTTTTTTTGCCCTTAAAGGTTTGTTAATATTTGCATCTGTACGATCGTAAGAGGGCTTCAGCATGAAT GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
RF:100:575 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
RF:100:575 141 * 0 0 * * 0 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
RR:80:175 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
Expand Down
8 changes: 4 additions & 4 deletions testdata/picard/sam/MergeBamAlignment/contam.expected.sam
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
@RG ID:0 SM:Hi,Mom! PL:ILLUMINA
@PG ID:0 VN:1.0 CL:align! PN:myAligner
frag_multiple_primary_1 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
frag_primary_clipped 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
r1_clipped_r2_clipped 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
r1_clipped_r2_perfect 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
r1_clipped_r2_unmapped 77 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
Expand Down

0 comments on commit 9d2018a

Please sign in to comment.