Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix variant region for ins and dup on intron-exon boundary #748

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/hgvs/_data/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ inferred_p_is_uncertain = True
normalize = True
prevalidation_level = EXTRINSIC
replace_reference = True
ins_at_boundary_is_intronic = True

# strict_bounds: require transcript variants to be within transcript sequence bounds
strict_bounds = True
Expand Down
15 changes: 12 additions & 3 deletions src/hgvs/assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,24 @@ def t_to_p(self, var_t):
)

def c_to_n(self, var_c):
var_out = super(AssemblyMapper, self).c_to_n(var_c)
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
var_out = super(AssemblyMapper, self).c_to_n(
var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def n_to_c(self, var_n):
var_out = super(AssemblyMapper, self).n_to_c(var_n)
alt_ac = self._alt_ac_for_tx_ac(var_n.ac)
var_out = super(AssemblyMapper, self).n_to_c(
var_n, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def c_to_p(self, var_c):
var_out = super(AssemblyMapper, self).c_to_p(var_c)
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
var_out = super(AssemblyMapper, self).c_to_p(
var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def relevant_transcripts(self, var_g):
Expand Down
4 changes: 2 additions & 2 deletions src/hgvs/sequencevariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __repr__(self):
", ".join((a.name + "=" + str(getattr(self, a.name))) for a in self.__attrs_attrs__),
)

def fill_ref(self, hdp):
def fill_ref(self, hdp, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
# TODO: Refactor. SVs should not operate on themselves when
# external resources are required
# replace_reference should be moved outside function
Expand All @@ -67,7 +67,7 @@ def fill_ref(self, hdp):
type in ["del", "delins", "identity", "dup", "repeat"]
and self.posedit.edit.ref_s is None
):
vm._replace_reference(self)
vm._replace_reference(self, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if type == "identity" and isinstance(self.posedit.edit, hgvs.edit.NARefAlt):
self.posedit.edit.alt = self.posedit.edit.ref
return self
Expand Down
52 changes: 47 additions & 5 deletions src/hgvs/utils/altseqbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import logging
import math

from hgvs import global_config
from hgvs.exceptions import HGVSError

from bioutils.sequences import reverse_complement, translate_cds

from ..edit import Dup, Inv, NARefAlt, Repeat
Expand Down Expand Up @@ -194,8 +197,44 @@ def _get_variant_region(self):
and self._var_c.posedit.pos.end.datum == Datum.CDS_END
):
result = self.WHOLE_GENE
elif (
global_config.mapping.ins_at_boundary_is_intronic
and self._var_c.posedit.edit.type == "dup"
and self._var_c.posedit.pos.start.base in self._transcript_data.exon_start_positions
):
result = self.INTRON
elif (
global_config.mapping.ins_at_boundary_is_intronic
and self._var_c.posedit.edit.type == "dup"
and self._var_c.posedit.pos.end.base in self._transcript_data.exon_end_positions
):
result = self.INTRON
elif (
not global_config.mapping.ins_at_boundary_is_intronic
and self._var_c.posedit.edit.type == "ins"
and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0
):
result = self.EXON
elif (
not global_config.mapping.ins_at_boundary_is_intronic
and self._var_c.posedit.edit.type == "ins"
and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1
):
result = self.EXON
elif (
not global_config.mapping.ins_at_boundary_is_intronic
and self._var_c.posedit.edit.type == "dup"
and self._var_c.posedit.pos.end.offset == -1
):
result = self.EXON
elif (
not global_config.mapping.ins_at_boundary_is_intronic
and self._var_c.posedit.edit.type == "dup"
and self._var_c.posedit.pos.start.offset == 1
):
result = self.EXON
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
# leave out anything intronic for now
# leave out anything else intronic for now
result = self.INTRON
else: # anything else that contains an exon
result = self.EXON
Expand Down Expand Up @@ -265,7 +304,10 @@ def _incorporate_dup(self):
"""Incorporate dup into sequence"""
seq, cds_start, cds_stop, start, end = self._setup_incorporate()

dup_seq = seq[start:end]
if not self._var_c.posedit.edit.ref:
raise HGVSError('Duplication variant is missing reference sequence')

dup_seq = self._var_c.posedit.edit.ref
seq[end:end] = dup_seq

is_frameshift = len(dup_seq) % 3 != 0
Expand Down Expand Up @@ -327,10 +369,10 @@ def _setup_incorporate(self):
if pos.base < 0: # 5' UTR
result = cds_start - 1
else: # cds/intron
if pos.offset <= 0:
result = (cds_start - 1) + pos.base - 1
if pos.offset < 0:
result = (cds_start - 1) + pos.base - 2
else:
result = (cds_start - 1) + pos.base
result = (cds_start - 1) + pos.base - 1
elif pos.datum == Datum.CDS_END: # 3' UTR
result = cds_stop + pos.base - 1
else:
Expand Down
8 changes: 8 additions & 0 deletions src/hgvs/utils/reftranscriptdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,16 @@ def __init__(self, hdp, tx_ac, pro_ac):
# TODO: drop get_acs_for_protein_seq; use known mapping or digest (wo/pro ac inference)
pro_ac = hdp.get_pro_ac_for_tx_ac(tx_ac) or hdp.get_acs_for_protein_seq(protein_seq)[0]

exon_start_positions = [-tx_info["cds_start_i"]]
exon_end_positions = [exon_start_positions[0] + tx_info["lengths"][0]]
for exon_length in tx_info["lengths"][1:]:
exon_start_positions.append(exon_end_positions[-1] + 1)
exon_end_positions.append(exon_end_positions[-1] + exon_length)

self.transcript_sequence = tx_seq
self.aa_sequence = protein_seq
self.cds_start = cds_start
self.cds_stop = cds_stop
self.protein_accession = pro_ac
self.exon_start_positions = exon_start_positions
self.exon_end_positions = exon_end_positions
85 changes: 58 additions & 27 deletions src/hgvs/variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def t_to_g(self, var_t, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a c. or n. variant; got " + str(var_t))
if self._validator:
self._validator.validate(var_t)
var_t.fill_ref(self.hdp)
var_t.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if var_t.type == "c":
var_out = VariantMapper.c_to_g(
self, var_c=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method
Expand Down Expand Up @@ -212,7 +212,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
Expand All @@ -234,7 +234,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g)
)
if self.replace_reference:
self._replace_reference(var_g)
self._replace_reference(var_g, alt_ac, alt_aln_method)
# No gene symbol for g. variants (actually, *should* for NG, but no way to distinguish)
return var_g

Expand Down Expand Up @@ -306,7 +306,7 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
Expand All @@ -331,12 +331,12 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g)
)
if self.replace_reference:
self._replace_reference(var_g)
self._replace_reference(var_g, alt_ac, alt_aln_method)
return var_g

# ############################################################################
# c⟷n
def c_to_n(self, var_c):
def c_to_n(self, var_c, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed c. variant, return a n. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
Expand All @@ -351,7 +351,7 @@ def c_to_n(self, var_c):
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript"
)
Expand All @@ -370,12 +370,12 @@ def c_to_n(self, var_c):
ac=var_c.ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n)
)
if self.replace_reference:
self._replace_reference(var_n)
self._replace_reference(var_n, alt_ac, alt_aln_method)
if self.add_gene_symbol:
self._update_gene_symbol(var_n, var_c.gene)
return var_n

def n_to_c(self, var_n):
def n_to_c(self, var_n, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed n. variant, return a c. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
Expand All @@ -390,7 +390,7 @@ def n_to_c(self, var_n):
raise HGVSInvalidVariantError("Expected n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=var_n.ac, alt_aln_method="transcript"
)
Expand All @@ -409,14 +409,14 @@ def n_to_c(self, var_n):
ac=var_n.ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c)
)
if self.replace_reference:
self._replace_reference(var_c)
self._replace_reference(var_c, alt_ac, alt_aln_method)
if self.add_gene_symbol:
self._update_gene_symbol(var_c, var_n.gene)
return var_c

# ############################################################################
# c ⟶ p
def c_to_p(self, var_c, pro_ac=None):
def c_to_p(self, var_c, pro_ac=None, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""
Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession
Author: Rudy Rico
Expand All @@ -431,6 +431,7 @@ def c_to_p(self, var_c, pro_ac=None):
raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

Expand All @@ -455,7 +456,7 @@ def c_to_p(self, var_c, pro_ac=None):
############################################################################
# Internal methods

def _replace_reference(self, var):
def _replace_reference(self, var, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""fetch reference sequence for variant and update (in-place) if necessary"""

if var.type not in "cgmnr":
Expand All @@ -465,24 +466,51 @@ def _replace_reference(self, var):
# these types have no reference sequence (zero-width), so return as-is
return var

pos = var.posedit.pos
if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or (
isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0
mapper = None
if (
var.type in "cnr"
and var.posedit.pos is not None
and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0)
):
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var

# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript"
)
pos = mapper.c_to_n(var.posedit.pos)
if var.type == "r":
_logger.info("Can't update reference sequence for intronic variant %s", var)
return var
if alt_ac is None:
_logger.info("Can't update reference sequence for intronic variant %s without alt_ac", var)
return var
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
pos = mapper.c_to_g(var.posedit.pos)
ac = alt_ac
_type = "g"
if var.type == "n":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
pos = mapper.n_to_g(var.posedit.pos)
ac = alt_ac
_type = "g"
else:
pos = var.posedit.pos
# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript"
)
pos = mapper.c_to_n(var.posedit.pos)
ac = var.ac
_type = var.type
else:
pos = var.posedit.pos
ac = var.ac
_type = var.type

seq_start = pos.start.base - 1
seq_end = pos.end.base
if _type in "cnr":
seq_start += pos.start.offset
seq_end += pos.end.offset

# When strict_bounds is False and an error occurs, return
# variant as-is
Expand All @@ -491,12 +519,15 @@ def _replace_reference(self, var):
# this is an out-of-bounds variant
return var

seq = self.hdp.get_seq(var.ac, seq_start, seq_end)
seq = self.hdp.get_seq(ac, seq_start, seq_end)

if len(seq) != seq_end - seq_start:
# tried to read beyond seq end; this is an out-of-bounds variant
return var

if _type == "g" and mapper and mapper.strand == -1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, just out of curiosity, when can the strand be negative for a g.? Inversions?

Copy link
Contributor Author

@b0d0nne11 b0d0nne11 Jan 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not really sure. I don't have a simple, science-y explanation for why this is the case. My first pass at this PR didn't include it and we noticed some mappings gave the reverse compliments of the expected sequences. After some more digging we found that only g type variants on the negative strand as described by the alignment mapper were affected.

seq = reverse_complement(seq)

edit = var.posedit.edit
if edit.ref != seq:
_logger.debug(
Expand Down
Binary file modified tests/data/cache-py3.hdp
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/data/gcp/real.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ ID00048 NC_000010.10:g.89692922T>C NM_000314.4:c.406T>C NP_000305.3:p.(Cys136Arg
ID00049 NC_000010.10:g.89692921dupA NM_000314.4:c.405dupA NP_000305.3:p.(Cys136Metfs*44)
ID00050 NC_000010.10:g.89692923_89692939delGTGCATATTTATTACAT NM_000314.4:c.407_423delGTGCATATTTATTACAT NP_000305.3:p.(Cys136Serfs*38)
ID00051 NC_000010.10:g.89712015C>A NM_000314.4:c.633C>A NP_000305.3:p.(Cys211*)
ID00052 NC_000010.10:g.89685314dupT NM_000314.4:c.209dupT NP_000305.3:p.(Cys71Leufs*3)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I verified that this is indeed a duplication of the last base of the exon. Therefore it will now be p.? since the requested default behavior was to have insertions at the exon-intron boundary count as intronic.

ID00052 NC_000010.10:g.89685314dupT NM_000314.4:c.209dupT NP_000305.3:p.?
ID00053 NC_000010.10:g.89711893C>T NM_000314.4:c.511C>T NP_000305.3:p.(Gln171*)
ID00054 NC_000010.10:g.89692963dupA NM_000314.4:c.447dupA NP_000305.3:p.(Glu150Argfs*30)
ID00055 NC_000010.10:g.89685315G>A NM_000314.4:c.209+1G>A NP_000305.3:p.?
Expand Down
20 changes: 10 additions & 10 deletions tests/data/sanity_cp.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
accession transcript_sequence cds_start_i cds_end_i
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40
accession transcript_sequence cds_start_i cds_end_i lengths
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30]
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30]
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30]
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30]
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30]
Loading
Loading