diff --git a/src/hgvs/_data/defaults.ini b/src/hgvs/_data/defaults.ini index 7e1ecd2d..0b6cae8f 100644 --- a/src/hgvs/_data/defaults.ini +++ b/src/hgvs/_data/defaults.ini @@ -6,6 +6,7 @@ inferred_p_is_uncertain = True normalize = True prevalidation_level = EXTRINSIC replace_reference = True +ins_at_boundary_is_intronic = True # strict_bounds: require transcript variants to be within transcript sequence bounds strict_bounds = True diff --git a/src/hgvs/assemblymapper.py b/src/hgvs/assemblymapper.py index 25980b86..219d1630 100644 --- a/src/hgvs/assemblymapper.py +++ b/src/hgvs/assemblymapper.py @@ -164,15 +164,24 @@ def t_to_p(self, var_t): ) def c_to_n(self, var_c): - var_out = super(AssemblyMapper, self).c_to_n(var_c) + alt_ac = self._alt_ac_for_tx_ac(var_c.ac) + var_out = super(AssemblyMapper, self).c_to_n( + var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method + ) return self._maybe_normalize(var_out) def n_to_c(self, var_n): - var_out = super(AssemblyMapper, self).n_to_c(var_n) + alt_ac = self._alt_ac_for_tx_ac(var_n.ac) + var_out = super(AssemblyMapper, self).n_to_c( + var_n, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method + ) return self._maybe_normalize(var_out) def c_to_p(self, var_c): - var_out = super(AssemblyMapper, self).c_to_p(var_c) + alt_ac = self._alt_ac_for_tx_ac(var_c.ac) + var_out = super(AssemblyMapper, self).c_to_p( + var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method + ) return self._maybe_normalize(var_out) def relevant_transcripts(self, var_g): diff --git a/src/hgvs/sequencevariant.py b/src/hgvs/sequencevariant.py index 86e1aad7..8150bf49 100644 --- a/src/hgvs/sequencevariant.py +++ b/src/hgvs/sequencevariant.py @@ -53,7 +53,7 @@ def __repr__(self): ", ".join((a.name + "=" + str(getattr(self, a.name))) for a in self.__attrs_attrs__), ) - def fill_ref(self, hdp): + def fill_ref(self, hdp, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method): # TODO: Refactor. SVs should not operate on themselves when # external resources are required # replace_reference should be moved outside function @@ -67,7 +67,7 @@ def fill_ref(self, hdp): type in ["del", "delins", "identity", "dup", "repeat"] and self.posedit.edit.ref_s is None ): - vm._replace_reference(self) + vm._replace_reference(self, alt_ac=alt_ac, alt_aln_method=alt_aln_method) if type == "identity" and isinstance(self.posedit.edit, hgvs.edit.NARefAlt): self.posedit.edit.alt = self.posedit.edit.ref return self diff --git a/src/hgvs/utils/altseqbuilder.py b/src/hgvs/utils/altseqbuilder.py index 03fe5759..809b4374 100644 --- a/src/hgvs/utils/altseqbuilder.py +++ b/src/hgvs/utils/altseqbuilder.py @@ -9,6 +9,9 @@ import logging import math +from hgvs import global_config +from hgvs.exceptions import HGVSError + from bioutils.sequences import reverse_complement, translate_cds from ..edit import Dup, Inv, NARefAlt, Repeat @@ -194,8 +197,44 @@ def _get_variant_region(self): and self._var_c.posedit.pos.end.datum == Datum.CDS_END ): result = self.WHOLE_GENE + elif ( + global_config.mapping.ins_at_boundary_is_intronic + and self._var_c.posedit.edit.type == "dup" + and self._var_c.posedit.pos.start.base in self._transcript_data.exon_start_positions + ): + result = self.INTRON + elif ( + global_config.mapping.ins_at_boundary_is_intronic + and self._var_c.posedit.edit.type == "dup" + and self._var_c.posedit.pos.end.base in self._transcript_data.exon_end_positions + ): + result = self.INTRON + elif ( + not global_config.mapping.ins_at_boundary_is_intronic + and self._var_c.posedit.edit.type == "ins" + and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0 + ): + result = self.EXON + elif ( + not global_config.mapping.ins_at_boundary_is_intronic + and self._var_c.posedit.edit.type == "ins" + and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1 + ): + result = self.EXON + elif ( + not global_config.mapping.ins_at_boundary_is_intronic + and self._var_c.posedit.edit.type == "dup" + and self._var_c.posedit.pos.end.offset == -1 + ): + result = self.EXON + elif ( + not global_config.mapping.ins_at_boundary_is_intronic + and self._var_c.posedit.edit.type == "dup" + and self._var_c.posedit.pos.start.offset == 1 + ): + result = self.EXON elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0: - # leave out anything intronic for now + # leave out anything else intronic for now result = self.INTRON else: # anything else that contains an exon result = self.EXON @@ -265,7 +304,10 @@ def _incorporate_dup(self): """Incorporate dup into sequence""" seq, cds_start, cds_stop, start, end = self._setup_incorporate() - dup_seq = seq[start:end] + if not self._var_c.posedit.edit.ref: + raise HGVSError('Duplication variant is missing reference sequence') + + dup_seq = self._var_c.posedit.edit.ref seq[end:end] = dup_seq is_frameshift = len(dup_seq) % 3 != 0 @@ -327,10 +369,10 @@ def _setup_incorporate(self): if pos.base < 0: # 5' UTR result = cds_start - 1 else: # cds/intron - if pos.offset <= 0: - result = (cds_start - 1) + pos.base - 1 + if pos.offset < 0: + result = (cds_start - 1) + pos.base - 2 else: - result = (cds_start - 1) + pos.base + result = (cds_start - 1) + pos.base - 1 elif pos.datum == Datum.CDS_END: # 3' UTR result = cds_stop + pos.base - 1 else: diff --git a/src/hgvs/utils/reftranscriptdata.py b/src/hgvs/utils/reftranscriptdata.py index 63f7c5fe..5f21b93a 100644 --- a/src/hgvs/utils/reftranscriptdata.py +++ b/src/hgvs/utils/reftranscriptdata.py @@ -34,8 +34,16 @@ def __init__(self, hdp, tx_ac, pro_ac): # TODO: drop get_acs_for_protein_seq; use known mapping or digest (wo/pro ac inference) pro_ac = hdp.get_pro_ac_for_tx_ac(tx_ac) or hdp.get_acs_for_protein_seq(protein_seq)[0] + exon_start_positions = [-tx_info["cds_start_i"]] + exon_end_positions = [exon_start_positions[0] + tx_info["lengths"][0]] + for exon_length in tx_info["lengths"][1:]: + exon_start_positions.append(exon_end_positions[-1] + 1) + exon_end_positions.append(exon_end_positions[-1] + exon_length) + self.transcript_sequence = tx_seq self.aa_sequence = protein_seq self.cds_start = cds_start self.cds_stop = cds_stop self.protein_accession = pro_ac + self.exon_start_positions = exon_start_positions + self.exon_end_positions = exon_end_positions diff --git a/src/hgvs/variantmapper.py b/src/hgvs/variantmapper.py index d73b98b7..0f2c3a48 100644 --- a/src/hgvs/variantmapper.py +++ b/src/hgvs/variantmapper.py @@ -120,7 +120,7 @@ def t_to_g(self, var_t, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al raise HGVSInvalidVariantError("Expected a c. or n. variant; got " + str(var_t)) if self._validator: self._validator.validate(var_t) - var_t.fill_ref(self.hdp) + var_t.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method) if var_t.type == "c": var_out = VariantMapper.c_to_g( self, var_c=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method @@ -212,7 +212,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al raise HGVSInvalidVariantError("Expected a n. variant; got " + str(var_n)) if self._validator: self._validator.validate(var_n) - var_n.fill_ref(self.hdp) + var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method) mapper = self._fetch_AlignmentMapper( tx_ac=var_n.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method ) @@ -234,7 +234,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g) ) if self.replace_reference: - self._replace_reference(var_g) + self._replace_reference(var_g, alt_ac, alt_aln_method) # No gene symbol for g. variants (actually, *should* for NG, but no way to distinguish) return var_g @@ -306,7 +306,7 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c)) if self._validator: self._validator.validate(var_c) - var_c.fill_ref(self.hdp) + var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method) mapper = self._fetch_AlignmentMapper( tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method ) @@ -331,12 +331,12 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g) ) if self.replace_reference: - self._replace_reference(var_g) + self._replace_reference(var_g, alt_ac, alt_aln_method) return var_g # ############################################################################ # c⟷n - def c_to_n(self, var_c): + def c_to_n(self, var_c, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method): """Given a parsed c. variant, return a n. variant on the specified transcript using the specified alignment method (default is "transcript" indicating a self alignment). @@ -351,7 +351,7 @@ def c_to_n(self, var_c): raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c)) if self._validator: self._validator.validate(var_c) - var_c.fill_ref(self.hdp) + var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method) mapper = self._fetch_AlignmentMapper( tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript" ) @@ -370,12 +370,12 @@ def c_to_n(self, var_c): ac=var_c.ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n) ) if self.replace_reference: - self._replace_reference(var_n) + self._replace_reference(var_n, alt_ac, alt_aln_method) if self.add_gene_symbol: self._update_gene_symbol(var_n, var_c.gene) return var_n - def n_to_c(self, var_n): + def n_to_c(self, var_n, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method): """Given a parsed n. variant, return a c. variant on the specified transcript using the specified alignment method (default is "transcript" indicating a self alignment). @@ -390,7 +390,7 @@ def n_to_c(self, var_n): raise HGVSInvalidVariantError("Expected n. variant; got " + str(var_n)) if self._validator: self._validator.validate(var_n) - var_n.fill_ref(self.hdp) + var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method) mapper = self._fetch_AlignmentMapper( tx_ac=var_n.ac, alt_ac=var_n.ac, alt_aln_method="transcript" ) @@ -409,14 +409,14 @@ def n_to_c(self, var_n): ac=var_n.ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c) ) if self.replace_reference: - self._replace_reference(var_c) + self._replace_reference(var_c, alt_ac, alt_aln_method) if self.add_gene_symbol: self._update_gene_symbol(var_c, var_n.gene) return var_c # ############################################################################ # c ⟶ p - def c_to_p(self, var_c, pro_ac=None): + def c_to_p(self, var_c, pro_ac=None, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method): """ Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession Author: Rudy Rico @@ -431,6 +431,7 @@ def c_to_p(self, var_c, pro_ac=None): raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c)) if self._validator: self._validator.validate(var_c) + var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method) reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac) builder = altseqbuilder.AltSeqBuilder(var_c, reference_data) @@ -455,7 +456,7 @@ def c_to_p(self, var_c, pro_ac=None): ############################################################################ # Internal methods - def _replace_reference(self, var): + def _replace_reference(self, var, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method): """fetch reference sequence for variant and update (in-place) if necessary""" if var.type not in "cgmnr": @@ -465,24 +466,51 @@ def _replace_reference(self, var): # these types have no reference sequence (zero-width), so return as-is return var - pos = var.posedit.pos - if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or ( - isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0 + mapper = None + if ( + var.type in "cnr" + and var.posedit.pos is not None + and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0) ): - _logger.info("Can't update reference sequence for intronic variant {}".format(var)) - return var - - # For c. variants, we need coords on underlying sequences - if var.type == "c": - mapper = self._fetch_AlignmentMapper( - tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript" - ) - pos = mapper.c_to_n(var.posedit.pos) + if var.type == "r": + _logger.info("Can't update reference sequence for intronic variant %s", var) + return var + if alt_ac is None: + _logger.info("Can't update reference sequence for intronic variant %s without alt_ac", var) + return var + if var.type == "c": + mapper = self._fetch_AlignmentMapper( + tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method + ) + pos = mapper.c_to_g(var.posedit.pos) + ac = alt_ac + _type = "g" + if var.type == "n": + mapper = self._fetch_AlignmentMapper( + tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method + ) + pos = mapper.n_to_g(var.posedit.pos) + ac = alt_ac + _type = "g" else: - pos = var.posedit.pos + # For c. variants, we need coords on underlying sequences + if var.type == "c": + mapper = self._fetch_AlignmentMapper( + tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript" + ) + pos = mapper.c_to_n(var.posedit.pos) + ac = var.ac + _type = var.type + else: + pos = var.posedit.pos + ac = var.ac + _type = var.type seq_start = pos.start.base - 1 seq_end = pos.end.base + if _type in "cnr": + seq_start += pos.start.offset + seq_end += pos.end.offset # When strict_bounds is False and an error occurs, return # variant as-is @@ -491,12 +519,15 @@ def _replace_reference(self, var): # this is an out-of-bounds variant return var - seq = self.hdp.get_seq(var.ac, seq_start, seq_end) + seq = self.hdp.get_seq(ac, seq_start, seq_end) if len(seq) != seq_end - seq_start: # tried to read beyond seq end; this is an out-of-bounds variant return var + if _type == "g" and mapper and mapper.strand == -1: + seq = reverse_complement(seq) + edit = var.posedit.edit if edit.ref != seq: _logger.debug( diff --git a/tests/data/cache-py3.hdp b/tests/data/cache-py3.hdp index 97ad36b0..323bade2 100644 Binary files a/tests/data/cache-py3.hdp and b/tests/data/cache-py3.hdp differ diff --git a/tests/data/gcp/real.tsv b/tests/data/gcp/real.tsv index 49c8378f..ea86bdb5 100644 --- a/tests/data/gcp/real.tsv +++ b/tests/data/gcp/real.tsv @@ -50,7 +50,7 @@ ID00048 NC_000010.10:g.89692922T>C NM_000314.4:c.406T>C NP_000305.3:p.(Cys136Arg ID00049 NC_000010.10:g.89692921dupA NM_000314.4:c.405dupA NP_000305.3:p.(Cys136Metfs*44) ID00050 NC_000010.10:g.89692923_89692939delGTGCATATTTATTACAT NM_000314.4:c.407_423delGTGCATATTTATTACAT NP_000305.3:p.(Cys136Serfs*38) ID00051 NC_000010.10:g.89712015C>A NM_000314.4:c.633C>A NP_000305.3:p.(Cys211*) -ID00052 NC_000010.10:g.89685314dupT NM_000314.4:c.209dupT NP_000305.3:p.(Cys71Leufs*3) +ID00052 NC_000010.10:g.89685314dupT NM_000314.4:c.209dupT NP_000305.3:p.? ID00053 NC_000010.10:g.89711893C>T NM_000314.4:c.511C>T NP_000305.3:p.(Gln171*) ID00054 NC_000010.10:g.89692963dupA NM_000314.4:c.447dupA NP_000305.3:p.(Glu150Argfs*30) ID00055 NC_000010.10:g.89685315G>A NM_000314.4:c.209+1G>A NP_000305.3:p.? diff --git a/tests/data/sanity_cp.tsv b/tests/data/sanity_cp.tsv index 81fc9dee..3d23a3d7 100644 --- a/tests/data/sanity_cp.tsv +++ b/tests/data/sanity_cp.tsv @@ -1,10 +1,10 @@ -accession transcript_sequence cds_start_i cds_end_i -NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 -NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 -NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 -NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 -NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 -NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 -NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 -NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 -NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 +accession transcript_sequence cds_start_i cds_end_i lengths +NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30] +NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30] +NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30] +NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30] +NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30] diff --git a/tests/issues/test_655.py b/tests/issues/test_655.py new file mode 100644 index 00000000..1026ad62 --- /dev/null +++ b/tests/issues/test_655.py @@ -0,0 +1,374 @@ +import os +from contextlib import contextmanager + +import hgvs +from hgvs.exceptions import HGVSError +import pytest +from support import CACHE +import support.mock_input_source as mock_input_data_source + +sanity_cases = [ + { + "name": "ins at intron/exon boundary", + "var_c": "NM_999999.1:c.2-1_2insG", + "exonic":{ + "var_p": "MOCK:p.Met1?", + }, + "intronic":{ + "var_p": "MOCK:p.?", + }, + }, + { + "name": "ins at exon/intron boundary", + "var_c": "NM_999999.1:c.26_26+1insC", + "exonic":{ + "var_p": "MOCK:p.(Lys9AsnfsTer?)", + }, + "intronic":{ + "var_p": "MOCK:p.?", + }, + }, + { + "name": "dup at intron/exon boundary", + "var_c": "NM_999999.1:c.2dup", + "exonic":{ + "var_p": "MOCK:p.Met1?", + }, + "intronic":{ + "var_p": "MOCK:p.?", + }, + }, + { + "name": "dup at intron/exon boundary", + "var_c": "NM_999999.1:c.2-1dup", + "exonic":{ + "exc": HGVSError, + }, + "intronic":{ + "var_p": "MOCK:p.?", + }, + + }, + { + "name": "dup at exon/intron boundary", + "var_c": "NM_999999.1:c.26dup", + "exonic":{ + "var_p": "MOCK:p.(Ter10IleextTer?)", + }, + "intronic":{ + "var_p": "MOCK:p.?", + }, + }, + { + "name": "dup at exon/intron boundary", + "var_c": "NM_999999.1:c.26+1dup", + "exonic":{ + "exc": HGVSError, + }, + "intronic":{ + "var_p": "MOCK:p.?", + }, + }, +] + +real_cases = [ + { + "name": "ins at intron/exon boundary", + "var_c": "NM_004380.2:c.3251-1_3251insA", + "exonic":{ + "var_p": "NP_004371.2:p.(Ile1084AsnfsTer3)", + }, + "intronic":{ + "var_p": "NP_004371.2:p.?", + }, + }, + { + "name": "ins at exon/intron boundary", + "var_c": "NM_004380.2:c.3250_3250+1insT", + "exonic":{ + "var_p": "NP_004371.2:p.(Phe1085LeufsTer2)", + }, + "intronic":{ + "var_p": "NP_004371.2:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in exon, single-base)", + "var_c": "NM_024529.4:c.132dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45AspfsTer21)", + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in exon, multi-base)", + "var_c": "NM_024529.4:c.132_133dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45ArgfsTer65)", + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in intron, single-base)", + "var_c": "NM_024529.4:c.132-1dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45AspfsTer21)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in intron, multi-base)", + "var_c": "NM_024529.4:c.132-2_132-1dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45GlyfsTer65)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in exon, single-base, negative strand)", + "var_c": "NM_004380.2:c.3251dup", + "exonic":{ + "var_p": "NP_004371.2:p.(Phe1085LeufsTer2)", + }, + "intronic":{ + "var_p": "NP_004371.2:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in exon, multi-base, negative strand)", + "var_c": "NM_004380.2:c.3251_3252dup", + "exonic":{ + "var_p": "NP_004371.2:p.(Phe1085SerfsTer15)", + }, + "intronic":{ + "var_p": "NP_004371.2:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in intron, single-base, negative strand)", + "var_c": "NM_004380.2:c.3251-1dup", + "exonic":{ + "var_p": "NP_004371.2:p.(Ile1084SerfsTer3)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_004371.2:p.?", + }, + }, + { + "name": "dup at intron/exon boundary (in intron, multi-base, negative strand)", + "var_c": "NM_004380.2:c.3251-2_3251-1dup", + "exonic":{ + "var_p": "NP_004371.2:p.(Ile1084LysfsTer16)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_004371.2:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in exon, single-base)", + "var_c": "NM_024529.4:c.131dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45AspfsTer21)", + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in exon, multi-base)", + "var_c": "NM_024529.4:c.130_131dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45GlyfsTer65)", + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in intron, single-base)", + "var_c": "NM_024529.4:c.131+1dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45AspfsTer21)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in intron, multi-base)", + "var_c": "NM_024529.4:c.131+1_131+3dup", + "exonic":{ + "var_p": "NP_078805.3:p.(Thr45Ter)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_078805.3:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in exon, single-base, negative strand)", + "var_c": "NM_004985.4:c.111dup", + "exonic":{ + "var_p": "NP_004976.2:p.(Asp38GlyfsTer10)", + }, + "intronic":{ + "var_p": "NP_004976.2:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in exon, multi-base, negative strand)", + "var_c": "NM_004985.4:c.110_111dup", + "exonic":{ + "var_p": "NP_004976.2:p.(Asp38ArgfsTer8)", + }, + "intronic":{ + "var_p": "NP_004976.2:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in intron, single-base, negative strand)", + "var_c": "NM_004985.4:c.111+1dup", + "exonic":{ + "var_p": "NP_004976.2:p.(Asp38GlyfsTer10)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_004976.2:p.?", + }, + }, + { + "name": "dup at exon/intron boundary (in intron, multi-base, negative strand)", + "var_c": "NM_004985.4:c.111+1_111+4dup", + "exonic":{ + "var_p": "NP_004976.2:p.(Asp38ValfsTer11)", + "exc": HGVSError, + }, + "intronic":{ + "var_p": "NP_004976.2:p.?", + }, + }, + { + "name": "snv with broken cigar mapping", + "var_c": "NM_014638.2:c.515-99G>T", + "exonic":{ + "var_p": "NP_055453.2:p.?", + }, + "intronic":{ + "var_p": "NP_055453.2:p.?", + }, + }, + { + "name": "ins with broken cigar mapping", + "var_c": "NM_004799.2:c.71-7048_71-7047insATAT", + "exonic":{ + "var_p": "NP_004790.2:p.?", + }, + "intronic":{ + "var_p": "NP_004790.2:p.?", + }, + }, + { + "name": "del with broken cigar mapping", + "var_c": "NM_007324.2:c.71-7826_71-7802del", + "exonic":{ + "var_p": "NP_015563.2:p.?", + }, + "intronic":{ + "var_p": "NP_015563.2:p.?", + }, + }, +] + + +@pytest.fixture(scope="module") +def hp(): + return hgvs.parser.Parser() + + +@pytest.fixture(scope="module") +def hdp(): + return hgvs.dataproviders.uta.connect( + mode=os.environ.get("HGVS_CACHE_MODE", "run"), cache=CACHE + ) + + +@pytest.fixture(scope="module") +def vm(hdp): + return hgvs.variantmapper.VariantMapper(hdp) + + +@pytest.fixture(scope="module") +def am37(hdp): + return hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name="GRCh37") + + +@pytest.fixture(scope="module") +def mock_hdp(): + fn = os.path.join(os.path.dirname(__file__), "..", "data", "sanity_cp.tsv") + return mock_input_data_source.MockInputSource(fn) + + +@pytest.fixture(scope="module") +def mock_vm(mock_hdp): + return hgvs.variantmapper.VariantMapper(mock_hdp, prevalidation_level="INTRINSIC") + + +@contextmanager +def config_setting(module, setting, temp_value): + old_value = getattr(module, setting) + try: + setattr(module, setting, temp_value) + yield + finally: + setattr(module, setting, old_value) + + +@pytest.mark.parametrize("case", sanity_cases) +def test_sanity_c_to_p(case, hp, mock_vm): + with config_setting(hgvs.global_config.mapping, 'ins_at_boundary_is_intronic', False): + var_c = hp.parse(case["var_c"]) + if "exc" in case["exonic"]: + with pytest.raises(case["exonic"]["exc"]): + mock_vm.c_to_p(var_c, "MOCK") + else: + assert str(mock_vm.c_to_p(var_c, "MOCK")) == case["exonic"]["var_p"] + with config_setting(hgvs.global_config.mapping, 'ins_at_boundary_is_intronic', True): + var_c = hp.parse(case["var_c"]) + if "exc" in case["intronic"]: + with pytest.raises(case["intronic"]["exc"]): + mock_vm.c_to_p(var_c, "MOCK") + else: + assert str(mock_vm.c_to_p(var_c, "MOCK")) == case["intronic"]["var_p"] + + +@pytest.mark.parametrize("case", real_cases) +def test_real_c_to_p(case, hp, vm, am37): + with config_setting(hgvs.global_config.mapping, 'ins_at_boundary_is_intronic', False): + var_c = hp.parse(case["var_c"]) + if "exc" in case["exonic"]: + with pytest.raises(case["exonic"]["exc"]): + vm.c_to_p(var_c) + else: + assert str(vm.c_to_p(var_c)) == case["exonic"]["var_p"] + assert str(am37.c_to_p(var_c)) == case["exonic"]["var_p"] + with config_setting(hgvs.global_config.mapping, 'ins_at_boundary_is_intronic', True): + var_c = hp.parse(case["var_c"]) + if "exc" in case["intronic"]: + with pytest.raises(case["intronic"]["exc"]): + vm.c_to_p(var_c) + else: + assert str(vm.c_to_p(var_c)) == case["intronic"]["var_p"] + assert str(am37.c_to_p(var_c)) == case["intronic"]["var_p"] diff --git a/tests/support/mock_input_source.py b/tests/support/mock_input_source.py index 90a2dd6a..b646f809 100644 --- a/tests/support/mock_input_source.py +++ b/tests/support/mock_input_source.py @@ -31,7 +31,7 @@ def fetch_transcript_info(self, ac): result = None data = self._mock_data.get(ac) if data: # interbase coordinates - result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"]} + result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"], "lengths": data["lengths"]} return result def get_tx_identity_info(self, ac): @@ -72,6 +72,7 @@ def _read_input(self, in_file): "transcript_sequence": row["transcript_sequence"], "cds_start_i": int(row["cds_start_i"]), "cds_end_i": int(row["cds_end_i"]), + "lengths": eval(row["lengths"]), } return result diff --git a/tests/test_hgvs_variantmapper_cp_altseqbuilder.py b/tests/test_hgvs_variantmapper_cp_altseqbuilder.py index 21f16cb0..f3657ea4 100644 --- a/tests/test_hgvs_variantmapper_cp_altseqbuilder.py +++ b/tests/test_hgvs_variantmapper_cp_altseqbuilder.py @@ -116,6 +116,7 @@ def test_sequence_with_length_that_is_not_divisible_by_3(self): def _run_comparison(self, hgvsc, expected_sequence): ac_p = "DUMMY" var = self._parser.parse_hgvs_variant(hgvsc) + var.fill_ref(self._datasource) transcript_data = RefTranscriptData(hdp=self._datasource, tx_ac=var.ac, pro_ac=ac_p) builder = altseqbuilder.AltSeqBuilder(var, transcript_data)