From 6f70e98c1e03129a57239b9c3b4b3ecb8d041ed9 Mon Sep 17 00:00:00 2001 From: Roland Ewald Date: Sun, 16 Aug 2020 22:45:27 +0200 Subject: [PATCH] Fix for right-shifting deletions on AA sequences. **This is a combination of 4 commits.** **This is the 1st commit message:** Problem: The previous code addressed the problems with deletions introducing frameshifts by avoiding right-shifting in those cases altogether, by checking change.getAlt().length() != 0 in AminoAcidChangeNormalizer#normalizeDeletion. This may be wrong if the frame-shifted sequence introduces the same amino acid(s) that would have followed in the wild type as well. It also does not address the following corner case: - consider a deletion coinciding with the start of a codon, so that changeBeginPos.getFrameshift() == 0 in the calling code (DeletionAnnotationBuilder), which means that insAA will be empty and thus normalization will be attempted by AminoAcidChangeNormalizer#normalizeDeletion - assume the length of the deletion is not divisible by three, so it will itself introduce a frameshift, so that varAASeq (the AA sequence containing the variant) will differ significantly from from wtAASeq (the wildtype AA sequence) at the position of the deletion (and downstream) - since AminoAcidChangeNormalizer#normalizeDeletion only operates on the wildtype AA sequence (wtAASeq), this goes unnoticed, and in case the sequence of deleted amino acids starts with the same amino acid(s) as those after the deletion in the wild type, the variant will be erroneously right-shifted on that wild-type sequence (the right-shifting currently did not consider varAASeq at all) Example: H | T | G | E | K | P |fs F| E | C | GGAAACAT|ACT|GGG|GAG|AAA|CCC| TTT|GAG|TGT|CCCAAATGTGGGAAGTGTTACTTTCGG... GGAAACAT|ACT|GGG|---|---|---|-TTG|AGT|GTC|CCAAATGT.. | L | S | V | Before AminoAcidChangeNormalizer#normalizeDeletion gets called, AminoAcidChange will be 'EKPF'>''. However, due to the above problem this will be right-shifted to 'KPFE'>'', which leads to a wrong position and a wrong AA displayed in the variant's protein change HGVS annotation. In this example, the protein change should be p.(Glu316Leufs*25) (in one-letter codes, p.(E316Lfs*25)) but is erroneously changed to p.(Lys317Serfs*24) (p.(K317Sfs*24)). Solution: In contrast to the right-shifting code for nucleotides, it seems a better approach here would be to simply compare both wild-type and variant AA sequence, as they already have been computed by the AminoAcidChangeNormalizer anyhow. So, varAASeq is now passed into AminoAcidChangeNormalizer#normalizeDeletion and compared to the wildtype AA sequence. This also allows us to apply right-shifting to deletions that do not coincide with the beginning of a codon. Tests: This adds both an 'integration' test with the above example to DeletionAnnotationBuilderTest and some unit tests to AminoAcidChangeTest. The latter has also been adjusted to make it clear that the tested operations should work on AA sequences, not nucleotide sequences (globally replaced 'A' by 'L'). **This is the commit message #2:** **#498: fix normalizeDeletion(...) javadoc** **This is the commit message #3:** **498: refactor+fix shift for synonymous AA changes** This introduces the changes from 2190296 to BlockSubstitutionAnnotationBuilder and InsertionAnnotationBuilder as well. Additionally, this removes now-obsolete code and moves the call to 'trim' the AA change before shifting to AminoAcidChangeNormalizer itself, and also adds and fixes tests. Co-authored-by: Roland Ewald --- CHANGELOG.md | 6 ++ .../BlockSubstitutionAnnotationBuilder.java | 3 + .../builders/DeletionAnnotationBuilder.java | 3 +- .../builders/InsertionAnnotationBuilder.java | 3 +- .../reference/AminoAcidChangeNormalizer.java | 89 ++++++------------- .../DeletionAnnotationBuilderTest.java | 16 +++- .../reference/AminoAcidChangeTest.java | 60 +++++++++++-- 7 files changed, 106 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2893900116..358fab5cae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ * Switching to Github Workflows for continuous integration. * Bumping a couple of dependencies. +### jannovar-core + +* Apply fix for (#498, PR #499 by @roland-ewald of @limbus-medtec). + This fixes a problem with right-shifting deletions on amino acid sequences. + See the tickes and merge request for details. + ## v0.34 ### jannovar-cli diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/BlockSubstitutionAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/BlockSubstitutionAnnotationBuilder.java index 10d38f305b..b011221ae4 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/BlockSubstitutionAnnotationBuilder.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/BlockSubstitutionAnnotationBuilder.java @@ -153,6 +153,9 @@ public CDSExonicAnnotationBuilder() { wtAASeq.substring(refChangeBeginPos.getPos() / 3, Math.min((refChangeLastPos.getPos() + 1 + 2) / 3, wtAASeq.length())), varAASeq.substring(varChangeBeginPos.getPos() / 3, Math.min((varChangeLastPos.getPos() + 1 + 2) / 3, varAASeq.length()))); + // Shift change in case it is by chance synonymous + this.aaChange = AminoAcidChangeNormalizer.shiftSynonymousChange(this.aaChange, wtAASeq, varAASeq); + // Look for stop codon, starting at change position. this.varAAStopPos = varAASeq.indexOf('*', refChangeBeginPos.getPos() / 3); } diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilder.java index ffd99f4948..f72f0a88b8 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilder.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilder.java @@ -135,8 +135,7 @@ public CDSExonicAnnotationBuilder() { final int varAAEndPos = Math.min(changeBeginPos.getPos() / 3 + delta, varAASeq.length()); final String insAA = varAASeq.substring(changeBeginPos.getPos() / 3, varAAEndPos); this.aaChange = new AminoAcidChange(changeBeginPos.getPos() / 3, delAA, insAA); - this.aaChange = AminoAcidChangeNormalizer.truncateBothSides(this.aaChange); - this.aaChange = AminoAcidChangeNormalizer.normalizeDeletion(wtAASeq, this.aaChange); + this.aaChange = AminoAcidChangeNormalizer.shiftSynonymousChange(this.aaChange, wtAASeq, varAASeq); } public Annotation build() { diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java index 7723d64da3..a9f226fdec 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java @@ -179,8 +179,7 @@ public CDSExonicAnnotationBuilder() { final String insertAA = varAASeq.substring(insertAAPos, insertAAPos + insertAALength); this.aaChange = new AminoAcidChange(insertAAPos, delAA, insertAA); this.aaChange = AminoAcidChangeNormalizer.truncateAltAfterStopCodon(aaChange); - this.aaChange = AminoAcidChangeNormalizer.truncateBothSides(aaChange); - this.aaChange = AminoAcidChangeNormalizer.shiftInsertion(aaChange, wtAASeq); + this.aaChange = AminoAcidChangeNormalizer.shiftSynonymousChange(aaChange, wtAASeq, varAASeq); // Obtain amino acid insertion position. this.varAAInsertPos = this.aaChange.getPos(); } diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/AminoAcidChangeNormalizer.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/AminoAcidChangeNormalizer.java index b6b215f6cb..f3e9e617d8 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/AminoAcidChangeNormalizer.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/AminoAcidChangeNormalizer.java @@ -7,6 +7,9 @@ */ public final class AminoAcidChangeNormalizer { + /** Utility class, should not be instantiated.*/ + private AminoAcidChangeNormalizer() {} + /** * Search for stop codon in change.alt and truncate afterwards. * @@ -21,44 +24,44 @@ public static AminoAcidChange truncateAltAfterStopCodon(AminoAcidChange change) } /** - * Normalize deletion {@link AminoAcidChange} for amino acid string - *

- * Return change if it is not a clean deletion. + * This shifts the amino acid and its position to be reported in the proteinChange HGVS annotation to the + * first position where the amino acids actually differ. This is necessary because + * {@link AminoAcidChangeNormalizer#truncateBothSides(AminoAcidChange)} does not suffice in all situations. * - * @param ref reference amino acid string to change - * @param change the {@link AminoAcidChange} to normalize - * @return normalized AminoAcidChange + * @param change the original amino acid change to be shifted + * @param wtAASeq the wildtype amino acid sequence (i.e. the translated reference sequence) + * @param varAASeq the predicted amino acid sequence induced by the variant + * @return the amino acid change shifted to the first difference in the sequence, or the end of the ref/alt CDS */ - public static AminoAcidChange normalizeDeletion(String ref, AminoAcidChange change) { - if (change.getRef().length() == 0 || change.getAlt().length() != 0) - return change; - - // Compute shift of deletion. - int shift = 0; - final int LEN = change.getRef().length(); - while (change.getPos() + LEN + shift < ref.length() - && ref.charAt(change.getPos()) == ref.charAt(change.getPos() + LEN + shift)) - shift += 1; - if (shift == 0) - return change; - - // Build new AminoAcidChange. - StringBuilder changeRefBuilder = new StringBuilder(); - changeRefBuilder.append(ref.substring(change.getPos() + shift, change.getPos() + shift + change.getRef().length())); - return new AminoAcidChange(change.getPos() + shift, changeRefBuilder.toString(), ""); + public static AminoAcidChange shiftSynonymousChange(AminoAcidChange change, String wtAASeq, String varAASeq) { + AminoAcidChange aminoAcidChange = truncateBothSides(change); + int position = aminoAcidChange.getPos(); + int originalPosition = position; + int maxPosition = Math.min( + wtAASeq.length() - aminoAcidChange.getRef().length(), + varAASeq.length() - aminoAcidChange.getAlt().length()); + while (position < maxPosition && wtAASeq.charAt(position) != '*' + && wtAASeq.charAt(position) == varAASeq.charAt(position)) { + position++; + } + if (position == originalPosition) { + return aminoAcidChange; + } + return new AminoAcidChange(position, + wtAASeq.substring(position, position + aminoAcidChange.getRef().length()), + varAASeq.substring(position, position + aminoAcidChange.getAlt().length())); } /** * Truncate {@link AminoAcidChange} from both sides for matching ref/alt prefixes/suffixes. *

* Truncating of the prefixes is given higher priority to conform with the HGVS notation (you have to call - * {@link #shiftInsertion}) afterwards. + * {@link #shiftSynonymousChange}) afterwards. * * @param aaChange {@link AminoAcidChange} to truncate * @return updated {@link AminoAcidChange} */ - public static AminoAcidChange truncateBothSides(AminoAcidChange aaChange) { - // TODO(holtgrem): Test me! + private static AminoAcidChange truncateBothSides(AminoAcidChange aaChange) { // Truncate suffixes / from the right. final int REFLEN = aaChange.getRef().length() - 1; @@ -84,38 +87,4 @@ public static AminoAcidChange truncateBothSides(AminoAcidChange aaChange) { return aaChange; } - /** - * Shift insertion {@link AminoAcidChange} to the right in WT AA sequence. - *

- * Returns aaChange if aaChange.ref is not the empty string. - * - * @param aaChange {@link AminoAcidChange} to normalize - * @param wtAASeq WT AA sequence to use for shifting - * @return updated {@link AminoAcidChange} - */ - public static AminoAcidChange shiftInsertion(AminoAcidChange aaChange, String wtAASeq) { - // TODO(holtgrem): Test me! - if (aaChange.getRef().length() != 0) - return aaChange; - - // Insert the alternative bases at the position indicated by txPos. - StringBuilder builder = new StringBuilder(wtAASeq); - builder.insert(aaChange.getPos(), aaChange.getAlt()); - - // Execute algorithm and compute the shift. - int pos = aaChange.getPos(); - int shift = 0; - final int LEN = aaChange.getAlt().length(); - final String seq = builder.toString(); - while ((pos + LEN < seq.length()) && (seq.charAt(pos) == seq.charAt(pos + LEN))) { - ++shift; - ++pos; - } - - if (shift == 0) // only rebuild if shift > 0 - return aaChange; - else - return new AminoAcidChange(pos, "", seq.substring(pos, pos + LEN)); - } - } diff --git a/jannovar-core/src/test/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilderTest.java b/jannovar-core/src/test/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilderTest.java index 9df61b0139..f79a8c397a 100644 --- a/jannovar-core/src/test/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilderTest.java +++ b/jannovar-core/src/test/java/de/charite/compbio/jannovar/annotation/builders/DeletionAnnotationBuilderTest.java @@ -326,6 +326,20 @@ public void testForwardFrameShiftDeletion() throws InvalidGenomeVariant { Assert.assertEquals(ImmutableSortedSet.of(VariantEffect.FRAMESHIFT_TRUNCATION), annotation1.getEffects()); } + @Test + public void testForwardInternalFrameShiftNormalization() throws InvalidGenomeVariant { + // The following starts with a codon but still causes a shift in the nucleotide sequence. + GenomeVariant change1 = new GenomeVariant(new GenomePosition(refDict, Strand.FWD, 1, 6645991, + PositionType.ZERO_BASED), "GAGAAACCCT", ""); + Annotation annotation1 = new DeletionAnnotationBuilder(infoForward, change1, new AnnotationBuilderOptions()) + .build(); + Assert.assertEquals(infoForward.getAccession(), annotation1.getTranscript().getAccession()); + Assert.assertEquals(3, annotation1.getAnnoLoc().getRank()); + Assert.assertEquals("946_955del", annotation1.getCDSNTChange().toHGVSString()); + Assert.assertEquals("(Glu316Leufs*25)", annotation1.getProteinChange().toHGVSString(AminoAcidCode.THREE_LETTER)); + Assert.assertEquals(ImmutableSortedSet.of(VariantEffect.FRAMESHIFT_TRUNCATION), annotation1.getEffects()); + } + @Test public void testForwardNonFrameShiftDeletion() throws InvalidGenomeVariant { // clean (FS of begin position is 0) deletion of one codon, starting in intron (thus no "exon3" annotation is @@ -1207,7 +1221,7 @@ public void testRealWorldCase_uc011mcs_() throws InvalidGenomeVariant { Assert.assertEquals(infoForward.getAccession(), annotation1.getTranscript().getAccession()); Assert.assertEquals(12, annotation1.getAnnoLoc().getRank()); Assert.assertEquals("1068_1071del", annotation1.getCDSNTChange().toHGVSString()); - Assert.assertEquals("(Glu358del)", annotation1.getProteinChange().toHGVSString(AminoAcidCode.THREE_LETTER)); + Assert.assertEquals("(Glu357Lysfs*?)", annotation1.getProteinChange().toHGVSString(AminoAcidCode.THREE_LETTER)); Assert.assertEquals(ImmutableSortedSet.of(VariantEffect.FRAMESHIFT_TRUNCATION), annotation1.getEffects()); } diff --git a/jannovar-core/src/test/java/de/charite/compbio/jannovar/reference/AminoAcidChangeTest.java b/jannovar-core/src/test/java/de/charite/compbio/jannovar/reference/AminoAcidChangeTest.java index 6803c866f2..48013859f7 100644 --- a/jannovar-core/src/test/java/de/charite/compbio/jannovar/reference/AminoAcidChangeTest.java +++ b/jannovar-core/src/test/java/de/charite/compbio/jannovar/reference/AminoAcidChangeTest.java @@ -13,42 +13,84 @@ public class AminoAcidChangeTest { @Before public void setUp() throws Exception { - this.ref = "AAACCCAAACCC"; - this.ref2 = "CATCATCTTCA"; + this.ref = "LLLCCCLLLCCC"; + this.ref2 = "CLTCLTCTTCL"; } @Test public void testShiftNoRefChange() { AminoAcidChange origChange = new AminoAcidChange(3, "CC", ""); - AminoAcidChange modChange = AminoAcidChangeNormalizer.normalizeDeletion(ref, origChange); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, ref, "LLLCLLLCCC"); Assert.assertEquals(new AminoAcidChange(4, "CC", ""), modChange); } @Test public void testShiftNoRefChangeEnd() { AminoAcidChange origChange = new AminoAcidChange(9, "CC", ""); - AminoAcidChange modChange = AminoAcidChangeNormalizer.normalizeDeletion(ref, origChange); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, ref, "LLLCCCLLLC"); Assert.assertEquals(new AminoAcidChange(10, "CC", ""), modChange); } @Test public void testShiftRefChange() { - AminoAcidChange origChange = new AminoAcidChange(3, "CAT", ""); - AminoAcidChange modChange = AminoAcidChangeNormalizer.normalizeDeletion(ref2, origChange); - Assert.assertEquals(new AminoAcidChange(4, "ATC", ""), modChange); + AminoAcidChange origChange = new AminoAcidChange(3, "CLT", ""); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, ref2, "CLTCTTCL"); + Assert.assertEquals(new AminoAcidChange(4, "LTC", ""), modChange); + } + + @Test + public void testShiftDeletionFrameshiftNoInsAA() { + AminoAcidChange origChange = new AminoAcidChange(3, "CQY", ""); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, "LLLCQYCLLL", "LLLCQYVA"); + Assert.assertEquals(new AminoAcidChange(6, "CLL", ""), modChange); + } + + @Test + public void testShiftDeletionFrameshiftInsAA() { + AminoAcidChange origChange = new AminoAcidChange(3, "CQY", "C"); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, "LLLCQYCLLL", "LLLCQYVA"); + Assert.assertEquals(new AminoAcidChange(6, "CL", ""), modChange); + } + + @Test + public void testShiftInsertionNoFrameshift() { + AminoAcidChange origChange = new AminoAcidChange(3, "C", "CQY"); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, "LLLCQYCLLL", "LLLCQYQYCLLL"); + Assert.assertEquals(new AminoAcidChange(6, "", "QY"), modChange); + } + + @Test + public void testShiftInsertionFrameshift() { + AminoAcidChange origChange = new AminoAcidChange(3, "C", "CQY"); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, "LLLCQYCLLL", "LLLCQYQYVAAA"); + Assert.assertEquals(new AminoAcidChange(6, "", "QY"), modChange); + } + + @Test + public void testShiftSubstitutionNoFrameshift() { + AminoAcidChange origChange = new AminoAcidChange(3, "CQYC", "CQLC"); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, "LLLCQYCLLL", "LLLCQLCLLL"); + Assert.assertEquals(new AminoAcidChange(5, "Y", "L"), modChange); + } + + @Test + public void testShiftSubstitutionFrameshift() { + AminoAcidChange origChange = new AminoAcidChange(3, "CQYC", "CQYQ"); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, "LLLCQYCLLL", "LLLCQYQYVAAA"); + Assert.assertEquals(new AminoAcidChange(6, "C", "Q"), modChange); } @Test public void testNoShift() { AminoAcidChange origChange = new AminoAcidChange(3, "CCC", ""); - AminoAcidChange modChange = AminoAcidChangeNormalizer.normalizeDeletion(ref, origChange); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, ref, "LLLLLLCCC"); Assert.assertEquals(new AminoAcidChange(3, "CCC", ""), modChange); } @Test public void testNoShiftEnd() { AminoAcidChange origChange = new AminoAcidChange(9, "CCC", ""); - AminoAcidChange modChange = AminoAcidChangeNormalizer.normalizeDeletion(ref, origChange); + AminoAcidChange modChange = AminoAcidChangeNormalizer.shiftSynonymousChange(origChange, ref, "LLLCCCLLL"); Assert.assertEquals(new AminoAcidChange(9, "CCC", ""), modChange); }