From 2f36fdbd68495a8698cc3f87ea121467da67ca8b Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Wed, 4 Dec 2024 09:53:10 -0500 Subject: [PATCH 01/16] Add work to support residue coords + refactor offset calculation --- .../mappers/exon_genomic_coords.py | 112 ++++++++------ tests/mappers/test_exon_genomic_coords.py | 139 ++++++++++++++++-- 2 files changed, 196 insertions(+), 55 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 82d4a72..2717012 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -10,6 +10,7 @@ from cool_seq_tool.schemas import ( Assembly, BaseModelForbidExtra, + CoordinateType, ServiceMeta, Strand, ) @@ -412,6 +413,7 @@ async def genomic_to_tx_segment( transcript: str | None = None, get_nearest_transcript_junction: bool = False, gene: str | None = None, + coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. @@ -485,6 +487,7 @@ async def genomic_to_tx_segment( gene=gene, get_nearest_transcript_junction=get_nearest_transcript_junction, is_seg_start=True, + coordinate_type=coordinate_type, ) if start_tx_seg_data.errors: return _return_service_errors(start_tx_seg_data.errors) @@ -505,6 +508,7 @@ async def genomic_to_tx_segment( gene=gene, get_nearest_transcript_junction=get_nearest_transcript_junction, is_seg_start=False, + coordinate_type=coordinate_type, ) if end_tx_seg_data.errors: return _return_service_errors(end_tx_seg_data.errors) @@ -735,6 +739,7 @@ async def _genomic_to_tx_segment( gene: str | None = None, get_nearest_transcript_junction: bool = False, is_seg_start: bool = True, + coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, ) -> GenomicTxSeg: """Given genomic data, generate a boundary for a transcript segment. @@ -763,6 +768,8 @@ async def _genomic_to_tx_segment( preceding the breakpoint for the 3' end. :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. ``False`` if ``genomic_pos`` is where the transcript segment ends. + :param coordinate_type: Coordinate type for ``seg_start_genomic`` and + ``seg_end_genomic`` :return: Data for a transcript segment boundary (inter-residue coordinates) """ params = {key: None for key in GenomicTxSeg.model_fields} @@ -835,6 +842,13 @@ async def _genomic_to_tx_segment( strand = Strand(tx_exons[0].alt_strand) params["strand"] = strand + use_alt_start_i = self._use_alt_start_i( + is_seg_start=is_seg_start, strand=strand + ) + if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: + genomic_pos = ( + genomic_pos - 1 + ) # Convert residue coordinate to inter-residue # Check if breakpoint occurs on an exon. # If not, determine the adjacent exon given the selected transcript @@ -847,15 +861,11 @@ async def _genomic_to_tx_segment( ) offset = self._get_exon_offset( - start_i=tx_exons[exon_num].alt_start_i, - end_i=tx_exons[exon_num].alt_end_i, + genomic_pos=genomic_pos, + exon_boundary=tx_exons[exon_num].alt_start_i + if use_alt_start_i + else tx_exons[exon_num].alt_end_i, strand=strand, - use_start_i=strand == Strand.POSITIVE - if is_seg_start - else strand != Strand.POSITIVE, - is_in_exon=False, - start=genomic_pos if is_seg_start else None, - end=genomic_pos if not is_seg_start else None, ) genomic_location, err_msg = self._get_vrs_seq_loc( @@ -931,7 +941,12 @@ async def _genomic_to_tx_segment( ) return await self._get_tx_seg_genomic_metadata( - genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript + genomic_ac, + genomic_pos, + is_seg_start, + gene, + tx_ac=transcript, + coordinate_type=coordinate_type, ) async def _get_grch38_ac_pos( @@ -1049,6 +1064,7 @@ async def _get_tx_seg_genomic_metadata( is_seg_start: bool, gene: str, tx_ac: str | None, + coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, ) -> GenomicTxSeg: """Get transcript segment data and associated genomic metadata. @@ -1110,15 +1126,18 @@ async def _get_tx_seg_genomic_metadata( ) tx_exon_aln_data = tx_exon_aln_data[0] - + strand = Strand(tx_exon_aln_data.alt_strand) + use_alt_start_i = self._use_alt_start_i( + is_seg_start=is_seg_start, strand=strand + ) + if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: + genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue offset = self._get_exon_offset( - start_i=tx_exon_aln_data.alt_start_i, - end_i=tx_exon_aln_data.alt_end_i, - strand=Strand(tx_exon_aln_data.alt_strand), - use_start_i=False, # This doesn't impact anything since we're on the exon - is_in_exon=True, - start=genomic_pos if is_seg_start else None, - end=genomic_pos if not is_seg_start else None, + genomic_pos=genomic_pos, + exon_boundary=tx_exon_aln_data.alt_start_i + if use_alt_start_i + else tx_exon_aln_data.alt_end_i, + strand=strand, ) genomic_location, err_msg = self._get_vrs_seq_loc( @@ -1150,6 +1169,24 @@ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords ) + @staticmethod + def _use_alt_start_i(is_seg_start: bool, strand: Strand) -> bool: + """Determine whether to use alt_start_i or alt_end_i from UTA when computing + exon offset + + :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. + ``False`` if ``genomic_pos`` is where the transcript segment ends. + :param strand: The transcribed strand + :return ``True`` if alt_start_i should be used, ``False`` if alt_end_i should + be used + """ + return bool( + is_seg_start + and strand == Strand.POSITIVE + or not is_seg_start + and strand == Strand.NEGATIVE + ) + @staticmethod def _get_adjacent_exon( tx_exons_genomic_coords: list[_ExonCoord], @@ -1205,38 +1242,21 @@ def _get_adjacent_exon( @staticmethod def _get_exon_offset( - start_i: int, - end_i: int, + genomic_pos: int, + exon_boundary: int, strand: Strand, - use_start_i: bool = True, - is_in_exon: bool = True, - start: int | None = None, - end: int | None = None, ) -> int: """Compute offset from exon start or end index - :param start_i: Exon start index (inter-residue) - :param end_i: Exon end index (inter-residue) - :param strand: Strand - :param use_start_i: Whether or not ``start_i`` should be used to compute the - offset, defaults to ``True``. This is only used when ``is_in_exon`` is - ``False``. - :param is_in_exon: Whether or not the position occurs in an exon, defaults to - ``True`` - :param start: Provided start position, defaults to ``None``. Must provide - ``start`` or ``end``, not both. - :param end: Provided end position, defaults to ``None``. Must provide ``start`` - or ``end``, not both + :param genomic_pos: The supplied genomic position. This can represent, for + example, a fusion junction breakpoint + :param exon_boundary: The genomic position for the exon boundary that the offset + is being computed against + :paran strand: The transcribed strand :return: Offset from exon start or end index """ - if is_in_exon: - if start is not None: - offset = start - start_i if strand == Strand.POSITIVE else end_i - start - else: - offset = end - end_i if strand == Strand.POSITIVE else start_i - end - else: - if strand == Strand.POSITIVE: - offset = start - start_i if use_start_i else end - end_i - else: - offset = start_i - end if use_start_i else end_i - start - return offset + return ( + genomic_pos - exon_boundary + if strand == Strand.POSITIVE + else (genomic_pos - exon_boundary) * -1 + ) diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index c779953..4b61564 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -10,6 +10,7 @@ _ExonCoord, ) from cool_seq_tool.schemas import ( + CoordinateType, Strand, ) @@ -880,6 +881,21 @@ def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords assert resp is True # Breakpoint does occur on an exon +def test_use_alt_start_i(test_egc_mapper): + """Test when to use alt_start_i or alt_end_i from UTA""" + resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.POSITIVE) + assert resp + + resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.NEGATIVE) + assert resp + + resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.NEGATIVE) + assert not resp + + resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.POSITIVE) + assert not resp + + @pytest.mark.asyncio() async def test_genomic_to_transcript_fusion_context( test_egc_mapper, @@ -900,15 +916,6 @@ async def test_genomic_to_transcript_fusion_context( resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, zbtb10_exon3_end) - inputs = { - "chromosome": "chr8", - "seg_end_genomic": 80514010, - "gene": "ZBTB10", - "get_nearest_transcript_junction": True, - } - resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_tx_seg_service_checks(resp, zbtb10_exon3_end) - inputs = { "chromosome": "8", "seg_start_genomic": 80518580, @@ -981,6 +988,67 @@ async def test_genomic_to_transcript_fusion_context( resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) + # Test with residue coordinates + inputs = { + "chromosome": "8", + "seg_end_genomic": 80514010, + "gene": "ZBTB10", + "get_nearest_transcript_junction": True, + "coordinate_type": CoordinateType.RESIDUE, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(resp, zbtb10_exon3_end) + + inputs = { + "chromosome": "8", + "seg_start_genomic": 80518581, + "gene": "ZBTB10", + "get_nearest_transcript_junction": True, + "coordinate_type": CoordinateType.RESIDUE, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(resp, zbtb10_exon5_start) + + inputs = { + "chromosome": "1", + "seg_end_genomic": 154171411, + "gene": "TPM3", + "get_nearest_transcript_junction": True, + "coordinate_type": CoordinateType.RESIDUE, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(resp, tpm3_exon6_end) + + inputs = { + "chromosome": "1", + "seg_start_genomic": 154173080, + "gene": "TPM3", + "get_nearest_transcript_junction": True, + "coordinate_type": CoordinateType.RESIDUE, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(resp, tpm3_exon5_start) + + inputs = { + "chromosome": "5", + "seg_end_genomic": 69680765, + "gene": "GUSBP3", + "get_nearest_transcript_junction": True, + "coordinate_type": CoordinateType.RESIDUE, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(resp, gusbp3_exon2_end) + + inputs = { + "chromosome": "5", + "seg_start_genomic": 69645878, + "gene": "GUSBP3", + "get_nearest_transcript_junction": True, + "coordinate_type": CoordinateType.RESIDUE, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) + @pytest.mark.asyncio() async def test_get_alt_ac_start_and_end( @@ -1070,6 +1138,59 @@ async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): ) genomic_tx_seg_checks(resp, tpm3_exon8) + # Test with residue coordinates + resp = await test_egc_mapper._genomic_to_tx_segment( + 154192135, + genomic_ac="NC_000001.11", + transcript="NM_152263.3", + gene="TPM3", + coordinate_type=CoordinateType.RESIDUE, + ) + genomic_tx_seg_checks(resp, tpm3_exon1) + + resp = await test_egc_mapper._genomic_to_tx_segment( + 154192135, + chromosome="1", + transcript="NM_152263.3", + coordinate_type=CoordinateType.RESIDUE, + ) + genomic_tx_seg_checks(resp, tpm3_exon1) + + resp = await test_egc_mapper._genomic_to_tx_segment( + 154192135, + chromosome="1", + transcript="NM_152263.3", + coordinate_type=CoordinateType.RESIDUE, + ) + genomic_tx_seg_checks(resp, tpm3_exon1) + + resp = await test_egc_mapper._genomic_to_tx_segment( + 154170400, + genomic_ac="NC_000001.11", + transcript="NM_152263.3", + is_seg_start=False, + coordinate_type=CoordinateType.RESIDUE, + ) + genomic_tx_seg_checks(resp, tpm3_exon8) + + resp = await test_egc_mapper._genomic_to_tx_segment( + 154170400, + chromosome="1", + transcript="NM_152263.3", + is_seg_start=False, + coordinate_type=CoordinateType.RESIDUE, + ) + genomic_tx_seg_checks(resp, tpm3_exon8) + + resp = await test_egc_mapper._genomic_to_tx_segment( + 154170400, + chromosome="1", + transcript="NM_152263.3", + is_seg_start=False, + coordinate_type=CoordinateType.RESIDUE, + ) + genomic_tx_seg_checks(resp, tpm3_exon8) + @pytest.mark.asyncio() async def test_tpm3( From 49ef3dfbcc5a4ceef69f5fe83cf2aa137d63829d Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Mon, 9 Dec 2024 09:09:11 -0500 Subject: [PATCH 02/16] Fix docstrings --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 2717012..683df69 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -461,7 +461,7 @@ async def genomic_to_tx_segment( :param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript`` value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and - ``seg_end_genomic`` + ``seg_end_genomic``. Expects inter-residue coordinates by default :return: Genomic data (inter-residue coordinates) """ errors = [] @@ -747,7 +747,6 @@ async def _genomic_to_tx_segment( errors. :param genomic_pos: Genomic position where the transcript segment starts or ends - (inter-residue based) :param chromosome: Chromosome. Must give chromosome without a prefix (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used. @@ -769,7 +768,7 @@ async def _genomic_to_tx_segment( :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. ``False`` if ``genomic_pos`` is where the transcript segment ends. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and - ``seg_end_genomic`` + ``seg_end_genomic``. Expects inter-residue coordinates by default :return: Data for a transcript segment boundary (inter-residue coordinates) """ params = {key: None for key in GenomicTxSeg.model_fields} @@ -1180,7 +1179,7 @@ def _use_alt_start_i(is_seg_start: bool, strand: Strand) -> bool: :return ``True`` if alt_start_i should be used, ``False`` if alt_end_i should be used """ - return bool( + return ( is_seg_start and strand == Strand.POSITIVE or not is_seg_start @@ -1249,7 +1248,8 @@ def _get_exon_offset( """Compute offset from exon start or end index :param genomic_pos: The supplied genomic position. This can represent, for - example, a fusion junction breakpoint + example, a fusion junction breakpoint. This position is represented using + inter-residue coordinates :param exon_boundary: The genomic position for the exon boundary that the offset is being computed against :paran strand: The transcribed strand From 99beca087bc4b3833bbb5a191925dfcb1f979938 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Thu, 2 Jan 2025 11:32:49 -0500 Subject: [PATCH 03/16] Refactor genomic_to_tx_segment --- .../mappers/exon_genomic_coords.py | 475 +++++++++++------- src/cool_seq_tool/sources/uta_database.py | 32 ++ tests/mappers/test_exon_genomic_coords.py | 38 +- tests/sources/test_uta_database.py | 20 + 4 files changed, 362 insertions(+), 203 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 41e982f..f12438b 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -411,7 +411,6 @@ async def genomic_to_tx_segment( seg_start_genomic: int | None = None, seg_end_genomic: int | None = None, transcript: str | None = None, - get_nearest_transcript_junction: bool = False, gene: str | None = None, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, ) -> GenomicTxSegService: @@ -451,13 +450,6 @@ async def genomic_to_tx_segment( following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>` page. - :param get_nearest_transcript_junction: If ``True``, this will return the - adjacent exon if the position specified by``seg_start_genomic`` or - ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent - is defined as the exon preceding the breakpoint for the 5' end and the exon - following the breakpoint for the 3' end. For the negative strand, adjacent - is defined as the exon following the breakpoint for the 5' end and the exon - preceding the breakpoint for the 3' end. :param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript`` value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and @@ -485,7 +477,6 @@ async def genomic_to_tx_segment( genomic_ac=genomic_ac, transcript=transcript, gene=gene, - get_nearest_transcript_junction=get_nearest_transcript_junction, is_seg_start=True, coordinate_type=coordinate_type, ) @@ -506,7 +497,6 @@ async def genomic_to_tx_segment( genomic_ac=genomic_ac, transcript=transcript, gene=gene, - get_nearest_transcript_junction=get_nearest_transcript_junction, is_seg_start=False, coordinate_type=coordinate_type, ) @@ -730,14 +720,13 @@ def _get_vrs_seq_loc( end=genomic_pos if not use_start else None, ), None - async def _genomic_to_tx_segment( + async def _genomic_to_tx_segment_old( self, genomic_pos: int, chromosome: str | None = None, genomic_ac: str | None = None, transcript: str | None = None, gene: str | None = None, - get_nearest_transcript_junction: bool = False, is_seg_start: bool = True, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, ) -> GenomicTxSeg: @@ -758,13 +747,6 @@ async def _genomic_to_tx_segment( following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript :param gene: Valid, case-sensitive HGNC gene symbol - :param get_nearest_transcript_junction: If ``True``, this will return the - adjacent exon if the position specified by``seg_start_genomic`` or - ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent - is defined as the exon preceding the breakpoint for the 5' end and the exon - following the breakpoint for the 3' end. For the negative strand, adjacent - is defined as the exon following the breakpoint for the 5' end and the exon - preceding the breakpoint for the 3' end. :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. ``False`` if ``genomic_pos`` is where the transcript segment ends. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and @@ -773,171 +755,314 @@ async def _genomic_to_tx_segment( """ params = {key: None for key in GenomicTxSeg.model_fields} - if get_nearest_transcript_junction: - if not gene and not transcript: + if not gene and not transcript: + return GenomicTxSeg( + errors=[ + "`gene` or `transcript` must be provided to select the adjacent transcript junction" + ] + ) + + if not genomic_ac: + genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) + + if not genomic_acs: return GenomicTxSeg( - errors=[ - "`gene` or `transcript` must be provided to select the adjacent transcript junction" - ] + errors=[err_msg], ) + genomic_ac = genomic_acs[0] - if not genomic_ac: - genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) + # Validate gene symbol exists + if gene: + valid_gene = await self.uta_db.validate_gene_symbol(gene=gene) + if not valid_gene: + return GenomicTxSeg(errors=[f"{gene} does not exist in UTA"]) - if not genomic_acs: - return GenomicTxSeg( - errors=[err_msg], - ) - genomic_ac = genomic_acs[0] + # Always liftover to GRCh38 + genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( + genomic_ac, genomic_pos + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) - # Always liftover to GRCh38 - genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( - genomic_ac, genomic_pos + # Validate coordinate is plausible + coordinate_check = await self._validate_gene_coordinates( + pos=genomic_pos, genomic_ac=genomic_ac, gene=gene + ) + if not coordinate_check: + return GenomicTxSeg( + errors=[ + f"{genomic_pos} on {genomic_ac} does not occur within the exons for {gene}" + ] ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - if not transcript: - # Select a transcript if not provided - mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data( - gene + if not transcript: + # Select a transcript if not provided + mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene) + + if mane_transcripts: + transcript = mane_transcripts[0]["RefSeq_nuc"] + else: + # Attempt to find a coding transcript if a MANE transcript + # cannot be found + results = await self.uta_db.get_transcripts( + gene=gene, alt_ac=genomic_ac ) - if mane_transcripts: - transcript = mane_transcripts[0]["RefSeq_nuc"] + if not results.is_empty(): + transcript = results[0]["tx_ac"][0] else: - # Attempt to find a coding transcript if a MANE transcript - # cannot be found - results = await self.uta_db.get_transcripts( - gene=gene, alt_ac=genomic_ac - ) - - if not results.is_empty(): - transcript = results[0]["tx_ac"][0] + # Run if gene is for a noncoding transcript + query = f""" + SELECT DISTINCT tx_ac + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE hgnc = '{gene}' + AND alt_ac = '{genomic_ac}' + """ # noqa: S608 + result = await self.uta_db.execute_query(query) + + if result: + transcript = result[0]["tx_ac"] else: - # Run if gene is for a noncoding transcript - query = f""" - SELECT DISTINCT tx_ac - FROM {self.uta_db.schema}.tx_exon_aln_v - WHERE hgnc = '{gene}' - AND alt_ac = '{genomic_ac}' - """ # noqa: S608 - result = await self.uta_db.execute_query(query) - - if result: - transcript = result[0]["tx_ac"] - else: - return GenomicTxSeg( - errors=[ - f"Could not find a transcript for {gene} on {genomic_ac}" - ] - ) - - tx_exons = await self._get_all_exon_coords( - tx_ac=transcript, genomic_ac=genomic_ac + return GenomicTxSeg( + errors=[ + f"Could not find a transcript for {gene} on {genomic_ac}" + ] + ) + + tx_exons = await self._get_all_exon_coords( + tx_ac=transcript, genomic_ac=genomic_ac + ) + if not tx_exons: + return GenomicTxSeg(errors=[f"No exons found given {transcript}"]) + + strand = Strand(tx_exons[0].alt_strand) + params["strand"] = strand + use_alt_start_i = self._use_alt_start_i( + is_seg_start=is_seg_start, strand=strand + ) + if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: + genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue + + # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene + if not gene: + _gene, err_msg = await self._get_tx_ac_gene(transcript) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + gene = _gene + + # Check if breakpoint occurs on an exon. + # If not, determine the adjacent exon given the selected transcript + if not self._is_exonic_breakpoint(genomic_pos, tx_exons): + exon_num = self._get_adjacent_exon( + tx_exons_genomic_coords=tx_exons, + strand=strand, + start=genomic_pos if is_seg_start else None, + end=genomic_pos if not is_seg_start else None, ) - if not tx_exons: - return GenomicTxSeg(errors=[f"No exons found given {transcript}"]) - strand = Strand(tx_exons[0].alt_strand) - params["strand"] = strand - use_alt_start_i = self._use_alt_start_i( - is_seg_start=is_seg_start, strand=strand + offset = self._get_exon_offset( + genomic_pos=genomic_pos, + exon_boundary=tx_exons[exon_num].alt_start_i + if use_alt_start_i + else tx_exons[exon_num].alt_end_i, + strand=strand, ) - if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: - genomic_pos = ( - genomic_pos - 1 - ) # Convert residue coordinate to inter-residue - - # Check if breakpoint occurs on an exon. - # If not, determine the adjacent exon given the selected transcript - if not self._is_exonic_breakpoint(genomic_pos, tx_exons): - exon_num = self._get_adjacent_exon( - tx_exons_genomic_coords=tx_exons, - strand=strand, - start=genomic_pos if is_seg_start else None, - end=genomic_pos if not is_seg_start else None, - ) - offset = self._get_exon_offset( - genomic_pos=genomic_pos, - exon_boundary=tx_exons[exon_num].alt_start_i - if use_alt_start_i - else tx_exons[exon_num].alt_end_i, - strand=strand, - ) + genomic_location, err_msg = self._get_vrs_seq_loc( + genomic_ac, genomic_pos, is_seg_start, strand + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) - genomic_location, err_msg = self._get_vrs_seq_loc( - genomic_ac, genomic_pos, is_seg_start, strand - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) + return GenomicTxSeg( + gene=gene, + genomic_ac=genomic_ac, + tx_ac=transcript, + seg=TxSegment( + exon_ord=exon_num, + offset=offset, + genomic_location=genomic_location, + ), + ) - # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene - if not gene: - _gene, err_msg = await self._get_tx_ac_gene(transcript) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - gene = _gene + return await self._get_tx_seg_genomic_metadata( + genomic_ac, + genomic_pos, + is_seg_start, + gene, + tx_ac=transcript, + ) - return GenomicTxSeg( - gene=gene, - genomic_ac=genomic_ac, - tx_ac=transcript, - seg=TxSegment( - exon_ord=exon_num, - offset=offset, - genomic_location=genomic_location, - ), - ) + async def _genomic_to_tx_segment( + self, + genomic_pos: int, + chromosome: str | None = None, + genomic_ac: str | None = None, + transcript: str | None = None, + gene: str | None = None, + is_seg_start: bool = True, + coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, + ) -> GenomicTxSeg: + """Given genomic data, generate a boundary for a transcript segment. + + Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return + errors. + + :param genomic_pos: Genomic position where the transcript segment starts or ends + :param chromosome: Chromosome. Must give chromosome without a prefix + (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If + position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used. + If ``genomic_ac`` is also provided, ``genomic_ac`` will be used. + :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, + must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` + will be used. + :param transcript: The transcript to use. If this is not given, we will try the + following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining + Compatible Transcript + :param gene: Valid, case-sensitive HGNC gene symbol + :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. + ``False`` if ``genomic_pos`` is where the transcript segment ends. + :param coordinate_type: Coordinate type for ``seg_start_genomic`` and + ``seg_end_genomic``. Expects inter-residue coordinates by default + :return: Data for a transcript segment boundary (inter-residue coordinates) + """ + params = {key: None for key in GenomicTxSeg.model_fields} + + if not gene and not transcript: + return GenomicTxSeg(errors=["`gene` or `transcript` must be provided"]) + + # Validate inputs exist in UTA + if gene: + gene_validation = await self.uta_db.validate_gene_symbol(gene) + if not gene_validation: + return GenomicTxSeg(errors=[f"{gene} does not exist in UTA"]) + + if transcript: + transcript_validation = await self.uta_db.validate_transcript(transcript) + if not transcript_validation: + return GenomicTxSeg(errors=[f"{transcript} does not exist in UTA"]) if genomic_ac: - _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac) + genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) + if not genomic_ac_validation: + return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"]) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) + if not genomic_ac: + genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) - if gene and _gene != gene: + if not genomic_acs: return GenomicTxSeg( - errors=[f"Expected gene, {gene}, but found {_gene}"] + errors=[err_msg], ) + genomic_ac = genomic_acs[0] - gene = _gene - elif chromosome: - # Try GRCh38 first - for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]: - _genomic_acs, err_msg = self.seqrepo_access.translate_identifier( - f"{assembly}:chr{chromosome}", "refseq" - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - _genomic_ac = _genomic_acs[0].split(":")[-1] + # Always liftover to GRCh38 + genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( + genomic_ac, + genomic_pos, + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) - _gene, err_msg = await self._get_genomic_ac_gene( - genomic_pos, _genomic_ac + # Select a transcript if not provided + if not transcript: + mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene) + + if mane_transcripts: + transcript = mane_transcripts[0]["RefSeq_nuc"] + else: + # Attempt to find a coding transcript if a MANE transcript + # cannot be found + results = await self.uta_db.get_transcripts( + gene=gene, alt_ac=genomic_ac ) - if _gene: - if gene and _gene != gene: + + if not results.is_empty(): + transcript = results[0]["tx_ac"][0] + else: + # Run if gene is for a noncoding transcript + query = f""" + SELECT DISTINCT tx_ac + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE hgnc = '{gene}' + AND alt_ac = '{genomic_ac}' + """ # noqa: S608 + result = await self.uta_db.execute_query(query) + + if result: + transcript = result[0]["tx_ac"] + else: return GenomicTxSeg( - errors=[f"Expected gene, {gene}, but found {_gene}"] + errors=[ + f"Could not find a transcript for {gene} on {genomic_ac}" + ] ) - gene = _gene - genomic_ac = _genomic_ac - break + tx_exons = await self._get_all_exon_coords( + tx_ac=transcript, genomic_ac=genomic_ac + ) + if not tx_exons: + return GenomicTxSeg(errors=[f"No exons found given {transcript}"]) - if not genomic_ac: - return GenomicTxSeg( - errors=[ - f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}" - ] - ) + strand = Strand(tx_exons[0].alt_strand) + params["strand"] = strand + use_alt_start_i = self._use_alt_start_i( + is_seg_start=is_seg_start, strand=strand + ) + if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: + genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue - if not gene: - return GenomicTxSeg( - errors=[ - f"Unable to get gene given {genomic_ac} on position {genomic_pos}" - ] - ) + # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene + if not gene: + _gene, err_msg = await self._get_tx_ac_gene(transcript) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + gene = _gene + + # Validate that the breakpoint occurs on a transcript given a gene + coordinate_check = await self._validate_gene_coordinates( + pos=genomic_pos, genomic_ac=genomic_ac, gene=gene + ) + if not coordinate_check: + return GenomicTxSeg( + errors=[ + f"{genomic_pos} on {genomic_ac} does not occur within the exons for {gene}" + ] + ) + + # Check if breakpoint occurs on an exon. + # If not, determine the adjacent exon given the selected transcript + if not self._is_exonic_breakpoint(genomic_pos, tx_exons): + exon_num = self._get_adjacent_exon( + tx_exons_genomic_coords=tx_exons, + strand=strand, + start=genomic_pos if is_seg_start else None, + end=genomic_pos if not is_seg_start else None, + ) + + offset = self._get_exon_offset( + genomic_pos=genomic_pos, + exon_boundary=tx_exons[exon_num].alt_start_i + if use_alt_start_i + else tx_exons[exon_num].alt_end_i, + strand=strand, + ) + + genomic_location, err_msg = self._get_vrs_seq_loc( + genomic_ac, genomic_pos, is_seg_start, strand + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + + return GenomicTxSeg( + gene=gene, + genomic_ac=genomic_ac, + tx_ac=transcript, + seg=TxSegment( + exon_ord=exon_num, + offset=offset, + genomic_location=genomic_location, + ), + ) return await self._get_tx_seg_genomic_metadata( genomic_ac, @@ -945,11 +1070,13 @@ async def _genomic_to_tx_segment( is_seg_start, gene, tx_ac=transcript, - coordinate_type=coordinate_type, ) async def _get_grch38_ac_pos( - self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None + self, + genomic_ac: str, + genomic_pos: int, + grch38_ac: str | None = None, ) -> tuple[str | None, int | None, str | None]: """Get GRCh38 genomic representation for accession and position @@ -960,6 +1087,7 @@ async def _get_grch38_ac_pos( :return: Tuple containing GRCh38 accession, GRCh38 position, and error message if unable to get GRCh38 representation """ + # Validate accession exists if not grch38_ac: grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) if not grch38_ac: @@ -983,7 +1111,6 @@ async def _get_grch38_ac_pos( None, f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.", ) - chromosome = chromosome[-1].split(":")[-1] liftover_data = self.liftover.get_liftover( chromosome, genomic_pos, Assembly.GRCH38 @@ -994,17 +1121,17 @@ async def _get_grch38_ac_pos( None, f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.", ) - genomic_pos = liftover_data[1] genomic_ac = grch38_ac return genomic_ac, genomic_pos, None - async def _get_genomic_ac_gene( + async def _validate_gene_coordinates( self, pos: int, genomic_ac: str, - ) -> tuple[str | None, str | None]: + gene: str, + ) -> bool: """Get gene given a genomic accession and position. If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one @@ -1012,23 +1139,30 @@ async def _get_genomic_ac_gene( :param pos: Genomic position on ``genomic_ac`` :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"`` - :return: HGNC gene symbol associated to genomic accession and position and - warning + :param gene: A gene symbol + :return: ``True`` if the coordinate falls within the first and last exon + for the gene, ``False`` if not """ query = f""" + WITH tx_boundaries AS ( + SELECT + tx_ac, + hgnc, + MIN(alt_start_i) as min_start, + MAX(alt_end_i) as max_end + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE hgnc = '{gene}' + AND alt_ac = '{genomic_ac}' + GROUP BY tx_ac, hgnc + ) SELECT DISTINCT hgnc - FROM {self.uta_db.schema}.tx_exon_aln_v - WHERE alt_ac = '{genomic_ac}' - AND alt_aln_method = 'splign' - AND {pos} BETWEEN alt_start_i AND alt_end_i + FROM tx_boundaries + WHERE {pos} between tx_boundaries.min_start and tx_boundaries.max_end ORDER BY hgnc LIMIT 1; """ # noqa: S608 results = await self.uta_db.execute_query(query) - if not results: - return None, f"No gene(s) found given {genomic_ac} on position {pos}" - - return results[0]["hgnc"], None + return bool(results) async def _get_tx_ac_gene( self, @@ -1063,7 +1197,6 @@ async def _get_tx_seg_genomic_metadata( is_seg_start: bool, gene: str, tx_ac: str | None, - coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, ) -> GenomicTxSeg: """Get transcript segment data and associated genomic metadata. @@ -1129,8 +1262,6 @@ async def _get_tx_seg_genomic_metadata( use_alt_start_i = self._use_alt_start_i( is_seg_start=is_seg_start, strand=strand ) - if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: - genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue offset = self._get_exon_offset( genomic_pos=genomic_pos, exon_boundary=tx_exon_aln_data.alt_start_i diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index ee0c631..ab853bf 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -378,6 +378,38 @@ async def validate_genomic_ac(self, ac: str) -> bool: result = await self.execute_query(query) return result[0][0] + async def validate_gene_symbol(self, gene: str) -> bool: + """Return whether or not a gene symbol exists in UTA gene table + + :param gene: Gene symbol + :return ``True`` if gene symbol exists in UTA, ``False`` if not + """ + query = f""" + SELECT EXISTS( + SELECT hgnc + FROM {self.schema}.gene + WHERE hgnc = '{gene}' + ); + """ # noqa: S608 + result = await self.execute_query(query) + return result[0][0] + + async def validate_transcript(self, transcript: str) -> bool: + """Return whether or not a transcript exists in the UTA tx_exon_aln_v table + + :param transcript: A transcript accession + :return ``True`` if transcript exists in UTA, ``False`` if not + """ + query = f""" + SELECT EXISTS( + SELECT tx_ac + FROM {self.schema}.tx_exon_aln_v + WHERE tx_ac = '{transcript}' + ); + """ # noqa: S608 + result = await self.execute_query(query) + return result[0][0] + async def get_ac_descr(self, ac: str) -> str | None: """Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds. diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 9a3522e..b783b4e 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -923,7 +923,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "8", "seg_end_genomic": 80514010, "gene": "ZBTB10", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, zbtb10_exon3_end) @@ -932,7 +931,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "8", "seg_start_genomic": 80518580, "gene": "ZBTB10", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, zbtb10_exon5_start) @@ -941,7 +939,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "1", "seg_end_genomic": 154171410, "gene": "TPM3", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, tpm3_exon6_end) @@ -950,7 +947,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "1", "seg_start_genomic": 154173080, "gene": "TPM3", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, tpm3_exon5_start) @@ -959,7 +955,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "5", "seg_end_genomic": 69680764, "gene": "GUSBP3", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, gusbp3_exon2_end) @@ -968,7 +963,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "5", "seg_start_genomic": 69645878, "gene": "GUSBP3", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) @@ -976,7 +970,6 @@ async def test_genomic_to_transcript_fusion_context( inputs = { # Test when gene and transcript are not provided "chromosome": "5", "seg_start_genomic": 69645878, - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert resp.errors[0] == "Must provide either `gene` or `transcript`" @@ -986,7 +979,6 @@ async def test_genomic_to_transcript_fusion_context( "seg_start_genomic": 69645878, "gene": "GUSBP3", "transcript": "NR_027386.2", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) @@ -995,7 +987,6 @@ async def test_genomic_to_transcript_fusion_context( "genomic_ac": "NC_000005.10", "seg_start_genomic": 69645878, "transcript": "NR_027386.2", - "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) @@ -1005,7 +996,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "8", "seg_end_genomic": 80514010, "gene": "ZBTB10", - "get_nearest_transcript_junction": True, "coordinate_type": CoordinateType.RESIDUE, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1015,7 +1005,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "8", "seg_start_genomic": 80518581, "gene": "ZBTB10", - "get_nearest_transcript_junction": True, "coordinate_type": CoordinateType.RESIDUE, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1025,7 +1014,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "1", "seg_end_genomic": 154171411, "gene": "TPM3", - "get_nearest_transcript_junction": True, "coordinate_type": CoordinateType.RESIDUE, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1035,7 +1023,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "1", "seg_start_genomic": 154173080, "gene": "TPM3", - "get_nearest_transcript_junction": True, "coordinate_type": CoordinateType.RESIDUE, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1045,7 +1032,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "5", "seg_end_genomic": 69680765, "gene": "GUSBP3", - "get_nearest_transcript_junction": True, "coordinate_type": CoordinateType.RESIDUE, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1055,7 +1041,6 @@ async def test_genomic_to_transcript_fusion_context( "chromosome": "5", "seg_start_genomic": 69645878, "gene": "GUSBP3", - "get_nearest_transcript_junction": True, "coordinate_type": CoordinateType.RESIDUE, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1452,9 +1437,9 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) - inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9609996} - resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) + # inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9609996} + # resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + # assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) inputs = {"transcript": "NM_003390.3", "exon_start": 2} resp = await test_egc_mapper.tx_segment_to_genomic(**inputs) @@ -1486,7 +1471,6 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): seg_start_genomic=73442503, seg_end_genomic=73457929, # not on an exon gene="ELN", - get_nearest_transcript_junction=True, ) genomic_tx_seg_service_checks(resp, eln_grch38_intronic) @@ -1500,7 +1484,7 @@ async def test_invalid(test_egc_mapper): seg_end_genomic=154170399, genomic_ac="NC_000001.11", ) - assert resp.errors == ["No exons found given NM_152263 3"] + assert resp.errors == ["NM_152263 3 does not exist in UTA"] # start and end not given resp = await test_egc_mapper.genomic_to_tx_segment( @@ -1524,7 +1508,7 @@ async def test_invalid(test_egc_mapper): gene="dummy gene", ) genomic_tx_seg_service_checks(resp, is_valid=False) - assert resp.errors == ["Expected gene, dummy gene, but found TPM3"] + assert resp.errors == ["dummy gene does not exist in UTA"] # Invalid accession resp = await test_egc_mapper.genomic_to_tx_segment( @@ -1534,7 +1518,7 @@ async def test_invalid(test_egc_mapper): transcript="NM_152263.3", ) genomic_tx_seg_service_checks(resp, is_valid=False) - assert resp.errors == ["No gene(s) found given NC_000001.200 on position 154191901"] + assert resp.errors == ["NC_000001.200 does not exist in UTA"] # Invalid coordinates resp = await test_egc_mapper.genomic_to_tx_segment( @@ -1545,17 +1529,9 @@ async def test_invalid(test_egc_mapper): ) genomic_tx_seg_service_checks(resp, is_valid=False) assert resp.errors == [ - "No gene(s) found given NC_000001.11 on position 9999999999998" + "9999999999998 on NC_000001.11 does not occur within the exons for TPM3" ] - resp = await test_egc_mapper.genomic_to_tx_segment( - chromosome="1", - seg_start_genomic=154192135, - transcript="NM_002529.3", - ) - genomic_tx_seg_service_checks(resp, is_valid=False) - assert resp.errors == ["Must find exactly one row for genomic data, but found: 0"] - # Must supply either gene or transcript resp = await test_egc_mapper.genomic_to_tx_segment( seg_start_genomic=154191901, genomic_ac="NC_000001.11" diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index 4dccf21..6440d7d 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -87,6 +87,26 @@ async def test_validate_genomic_ac(test_db): assert resp is False +@pytest.mark.asyncio() +async def test_validate_gene_symbol(test_db): + """Test validate_gene_symbol""" + resp = await test_db.validate_gene_symbol("TPM3") + assert resp is True + + resp = await test_db.validate_gene_symbol("dummy gene") + assert resp is False + + +@pytest.mark.asyncio() +async def test_validate_transcript(test_db): + """Tests validate_transcript""" + resp = await test_db.validate_transcript("NM_152263.3") + assert resp is True + + resp = await test_db.validate_transcript("NM_152263 3") + assert resp is False + + @pytest.mark.asyncio() async def test_get_ac_descr(test_db): """Test that get_ac_descr works correctly.""" From fc541514e090025fe4b241467257867aad956fb0 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Thu, 2 Jan 2025 13:29:45 -0500 Subject: [PATCH 04/16] Add translate_identifier --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index f12438b..055384f 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -763,13 +763,13 @@ async def _genomic_to_tx_segment_old( ) if not genomic_ac: - genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) - - if not genomic_acs: - return GenomicTxSeg( - errors=[err_msg], + for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]: + _genomic_acs, err_msg = self.seqrepo_access.translate_identifier( + f"{assembly}:chr{chromosome}", "refseq" ) - genomic_ac = genomic_acs[0] + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + genomic_ac = _genomic_acs[0].split(":")[-1] # Validate gene symbol exists if gene: From 677492702c2810b4c781d5b650a14606d8d70914 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Thu, 2 Jan 2025 14:19:18 -0500 Subject: [PATCH 05/16] Fix liftover issues --- .../mappers/exon_genomic_coords.py | 180 ------------------ tests/mappers/test_exon_genomic_coords.py | 6 +- 2 files changed, 3 insertions(+), 183 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 055384f..faf1ea9 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -720,179 +720,6 @@ def _get_vrs_seq_loc( end=genomic_pos if not use_start else None, ), None - async def _genomic_to_tx_segment_old( - self, - genomic_pos: int, - chromosome: str | None = None, - genomic_ac: str | None = None, - transcript: str | None = None, - gene: str | None = None, - is_seg_start: bool = True, - coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, - ) -> GenomicTxSeg: - """Given genomic data, generate a boundary for a transcript segment. - - Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return - errors. - - :param genomic_pos: Genomic position where the transcript segment starts or ends - :param chromosome: Chromosome. Must give chromosome without a prefix - (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If - position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used. - If ``genomic_ac`` is also provided, ``genomic_ac`` will be used. - :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, - must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` - will be used. - :param transcript: The transcript to use. If this is not given, we will try the - following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining - Compatible Transcript - :param gene: Valid, case-sensitive HGNC gene symbol - :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. - ``False`` if ``genomic_pos`` is where the transcript segment ends. - :param coordinate_type: Coordinate type for ``seg_start_genomic`` and - ``seg_end_genomic``. Expects inter-residue coordinates by default - :return: Data for a transcript segment boundary (inter-residue coordinates) - """ - params = {key: None for key in GenomicTxSeg.model_fields} - - if not gene and not transcript: - return GenomicTxSeg( - errors=[ - "`gene` or `transcript` must be provided to select the adjacent transcript junction" - ] - ) - - if not genomic_ac: - for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]: - _genomic_acs, err_msg = self.seqrepo_access.translate_identifier( - f"{assembly}:chr{chromosome}", "refseq" - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - genomic_ac = _genomic_acs[0].split(":")[-1] - - # Validate gene symbol exists - if gene: - valid_gene = await self.uta_db.validate_gene_symbol(gene=gene) - if not valid_gene: - return GenomicTxSeg(errors=[f"{gene} does not exist in UTA"]) - - # Always liftover to GRCh38 - genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( - genomic_ac, genomic_pos - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - - # Validate coordinate is plausible - coordinate_check = await self._validate_gene_coordinates( - pos=genomic_pos, genomic_ac=genomic_ac, gene=gene - ) - if not coordinate_check: - return GenomicTxSeg( - errors=[ - f"{genomic_pos} on {genomic_ac} does not occur within the exons for {gene}" - ] - ) - - if not transcript: - # Select a transcript if not provided - mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene) - - if mane_transcripts: - transcript = mane_transcripts[0]["RefSeq_nuc"] - else: - # Attempt to find a coding transcript if a MANE transcript - # cannot be found - results = await self.uta_db.get_transcripts( - gene=gene, alt_ac=genomic_ac - ) - - if not results.is_empty(): - transcript = results[0]["tx_ac"][0] - else: - # Run if gene is for a noncoding transcript - query = f""" - SELECT DISTINCT tx_ac - FROM {self.uta_db.schema}.tx_exon_aln_v - WHERE hgnc = '{gene}' - AND alt_ac = '{genomic_ac}' - """ # noqa: S608 - result = await self.uta_db.execute_query(query) - - if result: - transcript = result[0]["tx_ac"] - else: - return GenomicTxSeg( - errors=[ - f"Could not find a transcript for {gene} on {genomic_ac}" - ] - ) - - tx_exons = await self._get_all_exon_coords( - tx_ac=transcript, genomic_ac=genomic_ac - ) - if not tx_exons: - return GenomicTxSeg(errors=[f"No exons found given {transcript}"]) - - strand = Strand(tx_exons[0].alt_strand) - params["strand"] = strand - use_alt_start_i = self._use_alt_start_i( - is_seg_start=is_seg_start, strand=strand - ) - if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: - genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue - - # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene - if not gene: - _gene, err_msg = await self._get_tx_ac_gene(transcript) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - gene = _gene - - # Check if breakpoint occurs on an exon. - # If not, determine the adjacent exon given the selected transcript - if not self._is_exonic_breakpoint(genomic_pos, tx_exons): - exon_num = self._get_adjacent_exon( - tx_exons_genomic_coords=tx_exons, - strand=strand, - start=genomic_pos if is_seg_start else None, - end=genomic_pos if not is_seg_start else None, - ) - - offset = self._get_exon_offset( - genomic_pos=genomic_pos, - exon_boundary=tx_exons[exon_num].alt_start_i - if use_alt_start_i - else tx_exons[exon_num].alt_end_i, - strand=strand, - ) - - genomic_location, err_msg = self._get_vrs_seq_loc( - genomic_ac, genomic_pos, is_seg_start, strand - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - - return GenomicTxSeg( - gene=gene, - genomic_ac=genomic_ac, - tx_ac=transcript, - seg=TxSegment( - exon_ord=exon_num, - offset=offset, - genomic_location=genomic_location, - ), - ) - - return await self._get_tx_seg_genomic_metadata( - genomic_ac, - genomic_pos, - is_seg_start, - gene, - tx_ac=transcript, - ) - async def _genomic_to_tx_segment( self, genomic_pos: int, @@ -1232,13 +1059,6 @@ async def _get_tx_seg_genomic_metadata( tx_ac = mane_data["RefSeq_nuc"] grch38_ac = mane_data["GRCh38_chr"] - # Always liftover to GRCh38 - genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( - genomic_ac, genomic_pos, grch38_ac=grch38_ac - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) - tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac) if not tx_exons: return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"]) diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index b783b4e..a77158f 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1437,9 +1437,9 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) - # inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9609996} - # resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - # assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) + inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449} + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) inputs = {"transcript": "NM_003390.3", "exon_start": 2} resp = await test_egc_mapper.tx_segment_to_genomic(**inputs) From 9bc2c31ff434f5fed5ef859f732190c40f9cf8cb Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Fri, 3 Jan 2025 11:06:52 -0500 Subject: [PATCH 06/16] Store requested changes --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index faf1ea9..549166f 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -418,8 +418,6 @@ async def genomic_to_tx_segment( If liftover to GRCh38 is unsuccessful, will return errors. - Must provide inter-residue coordinates. - MANE Transcript data will be returned if and only if ``transcript`` is not supplied. ``gene`` must be given in order to retrieve MANE Transcript data. @@ -773,8 +771,7 @@ async def _genomic_to_tx_segment( genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) if not genomic_ac_validation: return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"]) - - if not genomic_ac: + else: genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) if not genomic_acs: From 3a7c91459e7cdf5f01ce2a9a14ee2dbeb47f0f40 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Fri, 3 Jan 2025 12:58:41 -0500 Subject: [PATCH 07/16] Add support for providing assembly --- .../mappers/exon_genomic_coords.py | 95 +++++++------------ tests/mappers/test_exon_genomic_coords.py | 61 ++++-------- 2 files changed, 56 insertions(+), 100 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 549166f..32725d9 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -413,6 +413,7 @@ async def genomic_to_tx_segment( transcript: str | None = None, gene: str | None = None, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, + assembly: Assembly = Assembly.GRCH38, ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. @@ -452,6 +453,8 @@ async def genomic_to_tx_segment( value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default + :param assembly: The assembly that the supplied coordinate comes from. Set to + GRCh38 be default :return: Genomic data (inter-residue coordinates) """ errors = [] @@ -477,6 +480,7 @@ async def genomic_to_tx_segment( gene=gene, is_seg_start=True, coordinate_type=coordinate_type, + assembly=assembly, ) if start_tx_seg_data.errors: return _return_service_errors(start_tx_seg_data.errors) @@ -497,6 +501,7 @@ async def genomic_to_tx_segment( gene=gene, is_seg_start=False, coordinate_type=coordinate_type, + assembly=assembly, ) if end_tx_seg_data.errors: return _return_service_errors(end_tx_seg_data.errors) @@ -727,6 +732,7 @@ async def _genomic_to_tx_segment( gene: str | None = None, is_seg_start: bool = True, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, + assembly: Assembly = Assembly.GRCH38, ) -> GenomicTxSeg: """Given genomic data, generate a boundary for a transcript segment. @@ -749,6 +755,8 @@ async def _genomic_to_tx_segment( ``False`` if ``genomic_pos`` is where the transcript segment ends. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default + :param assembly: The assembly that the supplied coordinate comes from. Set to + GRCh38 be default :return: Data for a transcript segment boundary (inter-residue coordinates) """ params = {key: None for key in GenomicTxSeg.model_fields} @@ -771,6 +779,9 @@ async def _genomic_to_tx_segment( genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) if not genomic_ac_validation: return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"]) + if assembly == Assembly.GRCH37: + grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) + genomic_ac = grch38_ac[0] else: genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) @@ -780,13 +791,15 @@ async def _genomic_to_tx_segment( ) genomic_ac = genomic_acs[0] - # Always liftover to GRCh38 - genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( - genomic_ac, - genomic_pos, - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) + # Liftover to GRCh38 if the provided assembly is GRCh37 + if assembly == Assembly.GRCH37: + genomic_pos = await self._get_grch38_pos(genomic_pos, genomic_ac) + if not genomic_pos: + return GenomicTxSeg( + errors=[ + f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful." + ] + ) # Select a transcript if not provided if not transcript: @@ -896,59 +909,23 @@ async def _genomic_to_tx_segment( tx_ac=transcript, ) - async def _get_grch38_ac_pos( - self, - genomic_ac: str, - genomic_pos: int, - grch38_ac: str | None = None, - ) -> tuple[str | None, int | None, str | None]: - """Get GRCh38 genomic representation for accession and position - - :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly) - :param genomic_pos: Genomic position on ``genomic_ac`` - :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not - provided, will attempt to retrieve associated GRCh38 accession from UTA. - :return: Tuple containing GRCh38 accession, GRCh38 position, and error message - if unable to get GRCh38 representation - """ - # Validate accession exists - if not grch38_ac: - grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) - if not grch38_ac: - return None, None, f"Unrecognized genomic accession: {genomic_ac}." - - grch38_ac = grch38_ac[0] - - if grch38_ac != genomic_ac: - # Ensure genomic_ac is GRCh37 - chromosome, _ = self.seqrepo_access.translate_identifier( - genomic_ac, Assembly.GRCH37.value - ) - if not chromosome: - _logger.warning( - "SeqRepo could not find associated %s assembly for genomic accession %s.", - Assembly.GRCH37.value, - genomic_ac, - ) - return ( - None, - None, - f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.", - ) - chromosome = chromosome[-1].split(":")[-1] - liftover_data = self.liftover.get_liftover( - chromosome, genomic_pos, Assembly.GRCH38 - ) - if liftover_data is None: - return ( - None, - None, - f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.", - ) - genomic_pos = liftover_data[1] - genomic_ac = grch38_ac + async def _get_grch38_pos(self, genomic_pos: int, genomic_ac: str) -> int | None: + """Liftover a GRCh37 coordinate to GRCh38 - return genomic_ac, genomic_pos, None + :param genomic_pos: A genomic coordinate in GRCh37 + :param genomic_ac: The genomic accession in GRCh38 + :return The genomic coordinate in GRCh38 + """ + chromosome, _ = self.seqrepo_access.translate_identifier( + genomic_ac, target_namespaces=Assembly.GRCH38.value + ) + chromosome = chromosome[-1].split(":")[-1] + liftover_data = self.liftover.get_liftover( + chromosome, genomic_pos, Assembly.GRCH38 + ) + if liftover_data is None: + return None + return liftover_data[1] async def _validate_gene_coordinates( self, diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index a77158f..d259818 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -10,6 +10,7 @@ _ExonCoord, ) from cool_seq_tool.schemas import ( + Assembly, CoordinateType, Strand, ) @@ -704,48 +705,15 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True): @pytest.mark.asyncio() -async def test_get_grch38_ac_pos(test_egc_mapper): - """Test that _get_grch38_ac_pos works correctly""" - grch38_ac = "NC_000001.11" - grch38_pos = 154192135 - expected = grch38_ac, grch38_pos, None - - # GRCh37 provided - grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611) - assert grch38_data == expected - - # GRCh38 provided, no grch38_ac - grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos) - assert grch38_data == expected - - # GRCh38 and grch38_ac provided - grch38_data = await test_egc_mapper._get_grch38_ac_pos( - grch38_ac, grch38_pos, grch38_ac=grch38_ac - ) - assert grch38_data == expected - - # Unrecognized accession - invalid_ac = "NC_0000026.10" - grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611) - assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.") - - # GRCh36 used - grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611) - assert grch38_data == ( - None, - None, - "`genomic_ac` must use GRCh37 or GRCh38 assembly.", - ) +async def test_get_grch38_pos(test_egc_mapper): + """Test that get_grch38_pos works correctly""" + genomic_pos = await test_egc_mapper._get_grch38_pos(9609996, "NC_000011.10") + assert genomic_pos == 9588449 - # Unsuccessful liftover - grch38_data = await test_egc_mapper._get_grch38_ac_pos( - "NC_000001.10", 9999999999999999999 - ) - assert grch38_data == ( - None, - None, - "Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.", + genomic_pos = await test_egc_mapper._get_grch38_pos( + 9609996999999999, "NC_000011.10" ) + assert genomic_pos is None @pytest.mark.asyncio() @@ -1268,6 +1236,7 @@ async def test_braf(test_egc_mapper, mane_braf): "seg_start_genomic": 140501359, # GRCh38 coords: 140801559 "seg_end_genomic": 140453136, # GRCh38 coords: 140753336 "gene": "BRAF", + "assembly": Assembly.GRCH37.value, } # MANE g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1291,6 +1260,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_start_genomic": 9597639, "seg_end_genomic": 9609996, "transcript": "NM_003390.3", + "assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1307,6 +1277,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_end_genomic": 9609996, "transcript": "NM_003390.3", "gene": "WEE1", + "assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1322,6 +1293,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_start_genomic": 9597639, # GRCh38 coords: 9576092 "seg_end_genomic": 9609996, # GRCh38 coords: 9588449 "gene": "WEE1", + "assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) @@ -1433,11 +1405,17 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): "gene": "WEE1", "genomic_ac": "NC_000011.9", "seg_end_genomic": 9609996, + "assembly": Assembly.GRCH37.value, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) - inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449} + inputs = { + "gene": "WEE1", + "chromosome": "11", + "seg_end_genomic": 9609996, + "assembly": Assembly.GRCH37.value, + } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) @@ -1471,6 +1449,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): seg_start_genomic=73442503, seg_end_genomic=73457929, # not on an exon gene="ELN", + assembly=Assembly.GRCH37.value, ) genomic_tx_seg_service_checks(resp, eln_grch38_intronic) From 18f74d92e80a97b06d538f5dc0ea1981f4091f71 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Mon, 6 Jan 2025 12:09:54 -0500 Subject: [PATCH 08/16] Add more comments --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 32725d9..cbb560f 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -840,6 +840,7 @@ async def _genomic_to_tx_segment( if not tx_exons: return GenomicTxSeg(errors=[f"No exons found given {transcript}"]) + # Determine if genomic_pos needs to be modified strand = Strand(tx_exons[0].alt_strand) params["strand"] = strand use_alt_start_i = self._use_alt_start_i( @@ -848,7 +849,7 @@ async def _genomic_to_tx_segment( if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue - # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene + # Extract gene symbol given transcript accession if not gene: _gene, err_msg = await self._get_tx_ac_gene(transcript) if err_msg: From 232673dfb3346491abdd6449d17cedb2c68f2f06 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Mon, 6 Jan 2025 14:01:05 -0500 Subject: [PATCH 09/16] Correct docstring in _validate_gene_coordinates --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index cbb560f..d8a2afe 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -934,10 +934,8 @@ async def _validate_gene_coordinates( genomic_ac: str, gene: str, ) -> bool: - """Get gene given a genomic accession and position. - - If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one - gene will be returned. + """Validate that a genomic coordinate falls within the first and last exon + given a gene and accession :param pos: Genomic position on ``genomic_ac`` :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"`` From 1c03cacab81f915d70d6d4435e9b4f62e90a9c4b Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Mon, 6 Jan 2025 15:03:11 -0500 Subject: [PATCH 10/16] Update assembly parameter, add chromosome as an optional parameter --- .../mappers/exon_genomic_coords.py | 38 +++++++++++-------- tests/mappers/test_exon_genomic_coords.py | 19 ++++++---- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index d8a2afe..0e4a842 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -413,7 +413,7 @@ async def genomic_to_tx_segment( transcript: str | None = None, gene: str | None = None, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, - assembly: Assembly = Assembly.GRCH38, + starting_assembly: Assembly = Assembly.GRCH38, ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. @@ -453,8 +453,8 @@ async def genomic_to_tx_segment( value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default - :param assembly: The assembly that the supplied coordinate comes from. Set to - GRCh38 be default + :param starting_assembly: The assembly that the supplied coordinate comes from. Set to + GRCh38 by default :return: Genomic data (inter-residue coordinates) """ errors = [] @@ -480,7 +480,7 @@ async def genomic_to_tx_segment( gene=gene, is_seg_start=True, coordinate_type=coordinate_type, - assembly=assembly, + starting_assembly=starting_assembly, ) if start_tx_seg_data.errors: return _return_service_errors(start_tx_seg_data.errors) @@ -501,7 +501,7 @@ async def genomic_to_tx_segment( gene=gene, is_seg_start=False, coordinate_type=coordinate_type, - assembly=assembly, + starting_assembly=starting_assembly, ) if end_tx_seg_data.errors: return _return_service_errors(end_tx_seg_data.errors) @@ -732,7 +732,7 @@ async def _genomic_to_tx_segment( gene: str | None = None, is_seg_start: bool = True, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, - assembly: Assembly = Assembly.GRCH38, + starting_assembly: Assembly = Assembly.GRCH38, ) -> GenomicTxSeg: """Given genomic data, generate a boundary for a transcript segment. @@ -755,8 +755,8 @@ async def _genomic_to_tx_segment( ``False`` if ``genomic_pos`` is where the transcript segment ends. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default - :param assembly: The assembly that the supplied coordinate comes from. Set to - GRCh38 be default + :param starting_assembly: The assembly that the supplied coordinate comes from. Set to + GRCh38 by default :return: Data for a transcript segment boundary (inter-residue coordinates) """ params = {key: None for key in GenomicTxSeg.model_fields} @@ -779,7 +779,7 @@ async def _genomic_to_tx_segment( genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) if not genomic_ac_validation: return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"]) - if assembly == Assembly.GRCH37: + if starting_assembly == Assembly.GRCH37: grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) genomic_ac = grch38_ac[0] else: @@ -792,8 +792,10 @@ async def _genomic_to_tx_segment( genomic_ac = genomic_acs[0] # Liftover to GRCh38 if the provided assembly is GRCh37 - if assembly == Assembly.GRCH37: - genomic_pos = await self._get_grch38_pos(genomic_pos, genomic_ac) + if starting_assembly == Assembly.GRCH37: + genomic_pos = await self._get_grch38_pos( + genomic_pos, genomic_ac, chromosome=chromosome if chromosome else None + ) if not genomic_pos: return GenomicTxSeg( errors=[ @@ -910,17 +912,21 @@ async def _genomic_to_tx_segment( tx_ac=transcript, ) - async def _get_grch38_pos(self, genomic_pos: int, genomic_ac: str) -> int | None: + async def _get_grch38_pos( + self, genomic_pos: int, genomic_ac: str, chromosome: str | None = None + ) -> int | None: """Liftover a GRCh37 coordinate to GRCh38 :param genomic_pos: A genomic coordinate in GRCh37 :param genomic_ac: The genomic accession in GRCh38 + :param chromosome: The chromosome that genomic_pos occurs on :return The genomic coordinate in GRCh38 """ - chromosome, _ = self.seqrepo_access.translate_identifier( - genomic_ac, target_namespaces=Assembly.GRCH38.value - ) - chromosome = chromosome[-1].split(":")[-1] + if not chromosome: + chromosome, _ = self.seqrepo_access.translate_identifier( + genomic_ac, target_namespaces=Assembly.GRCH38.value + ) + chromosome = chromosome[-1].split(":")[-1] liftover_data = self.liftover.get_liftover( chromosome, genomic_pos, Assembly.GRCH38 ) diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index d259818..4566068 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -710,6 +710,11 @@ async def test_get_grch38_pos(test_egc_mapper): genomic_pos = await test_egc_mapper._get_grch38_pos(9609996, "NC_000011.10") assert genomic_pos == 9588449 + genomic_pos = await test_egc_mapper._get_grch38_pos( + 9609996, "NC_000011.10", "chr11" + ) + assert genomic_pos == 9588449 + genomic_pos = await test_egc_mapper._get_grch38_pos( 9609996999999999, "NC_000011.10" ) @@ -1236,7 +1241,7 @@ async def test_braf(test_egc_mapper, mane_braf): "seg_start_genomic": 140501359, # GRCh38 coords: 140801559 "seg_end_genomic": 140453136, # GRCh38 coords: 140753336 "gene": "BRAF", - "assembly": Assembly.GRCH37.value, + "starting_assembly": Assembly.GRCH37.value, } # MANE g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) @@ -1260,7 +1265,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_start_genomic": 9597639, "seg_end_genomic": 9609996, "transcript": "NM_003390.3", - "assembly": Assembly.GRCH37.value, + "starting_assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1277,7 +1282,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_end_genomic": 9609996, "transcript": "NM_003390.3", "gene": "WEE1", - "assembly": Assembly.GRCH37.value, + "starting_assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1293,7 +1298,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_start_genomic": 9597639, # GRCh38 coords: 9576092 "seg_end_genomic": 9609996, # GRCh38 coords: 9588449 "gene": "WEE1", - "assembly": Assembly.GRCH37.value, + "starting_assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) @@ -1405,7 +1410,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): "gene": "WEE1", "genomic_ac": "NC_000011.9", "seg_end_genomic": 9609996, - "assembly": Assembly.GRCH37.value, + "starting_assembly": Assembly.GRCH37.value, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) @@ -1414,7 +1419,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): "gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9609996, - "assembly": Assembly.GRCH37.value, + "starting_assembly": Assembly.GRCH37.value, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) @@ -1449,7 +1454,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): seg_start_genomic=73442503, seg_end_genomic=73457929, # not on an exon gene="ELN", - assembly=Assembly.GRCH37.value, + starting_assembly=Assembly.GRCH37.value, ) genomic_tx_seg_service_checks(resp, eln_grch38_intronic) From 4b336ba923554a7895e2d0457a53a60556917c0f Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Tue, 7 Jan 2025 13:23:04 -0500 Subject: [PATCH 11/16] Ensure merge conflicts are resolved --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 10 ++++++---- tests/mappers/test_exon_genomic_coords.py | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index cf77003..ddf5e79 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -780,7 +780,9 @@ async def _genomic_to_tx_segment( if genomic_ac: genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) if not genomic_ac_validation: - return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"]) + return GenomicTxSeg( + errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"] + ) if starting_assembly == Assembly.GRCH37: grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) genomic_ac = grch38_ac[0] @@ -796,7 +798,7 @@ async def _genomic_to_tx_segment( # Liftover to GRCh38 if the provided assembly is GRCh37 if starting_assembly == Assembly.GRCH37: genomic_pos = await self._get_grch38_pos( - genomic_pos, genomic_ac, chromosome=chromosome if chromosome else None + genomic_ac, genomic_pos, chromosome=chromosome if chromosome else None ) if not genomic_pos: return GenomicTxSeg( @@ -916,12 +918,12 @@ async def _genomic_to_tx_segment( ), ) - async def _get_grch38_ac_pos( + async def _get_grch38_pos( self, genomic_ac: str, genomic_pos: int, chromosome: str | None = None, - ) -> tuple[str | None, int | None, str | None]: + ) -> int | None: """Get GRCh38 genomic representation for accession and position :param genomic_pos: A genomic coordinate in GRCh37 diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index f46625d..c1d57ba 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -707,16 +707,16 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True): @pytest.mark.asyncio() async def test_get_grch38_pos(test_egc_mapper): """Test that get_grch38_pos works correctly""" - genomic_pos = await test_egc_mapper._get_grch38_pos(9609996, "NC_000011.10") + genomic_pos = await test_egc_mapper._get_grch38_pos("NC_000011.10", 9609996) assert genomic_pos == 9588449 genomic_pos = await test_egc_mapper._get_grch38_pos( - 9609996, "NC_000011.10", "chr11" + "NC_000011.10", 9609996, "chr11" ) assert genomic_pos == 9588449 genomic_pos = await test_egc_mapper._get_grch38_pos( - 9609996999999999, "NC_000011.10" + "NC_000011.10", 9609996999999999 ) assert genomic_pos is None From 44e1f34454f0ccefd5128a1ba866d7af77365c56 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Tue, 7 Jan 2025 17:06:36 -0500 Subject: [PATCH 12/16] Add docstring, change liftover data access --- .../mappers/exon_genomic_coords.py | 8 +++----- tests/mappers/test_exon_genomic_coords.py | 17 ----------------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index ddf5e79..a2d9893 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -454,7 +454,7 @@ async def genomic_to_tx_segment( :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default :param starting_assembly: The assembly that the supplied coordinate comes from. Set to - GRCh38 by default + GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37 :return: Genomic data (inter-residue coordinates) """ errors = [] @@ -759,7 +759,7 @@ async def _genomic_to_tx_segment( :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default :param starting_assembly: The assembly that the supplied coordinate comes from. Set to - GRCh38 by default + GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37 :return: Data for a transcript segment boundary (inter-residue coordinates) """ params = {key: None for key in GenomicTxSeg.model_fields} @@ -939,9 +939,7 @@ async def _get_grch38_pos( liftover_data = self.liftover.get_liftover( chromosome, genomic_pos, Assembly.GRCH38 ) - if liftover_data is None: - return None - return liftover_data[1] + return liftover_data[1] if liftover_data else None async def _validate_gene_coordinates( self, diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index c1d57ba..cb2f8e5 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -704,23 +704,6 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True): assert len(actual.errors) > 0 -@pytest.mark.asyncio() -async def test_get_grch38_pos(test_egc_mapper): - """Test that get_grch38_pos works correctly""" - genomic_pos = await test_egc_mapper._get_grch38_pos("NC_000011.10", 9609996) - assert genomic_pos == 9588449 - - genomic_pos = await test_egc_mapper._get_grch38_pos( - "NC_000011.10", 9609996, "chr11" - ) - assert genomic_pos == 9588449 - - genomic_pos = await test_egc_mapper._get_grch38_pos( - "NC_000011.10", 9609996999999999 - ) - assert genomic_pos is None - - @pytest.mark.asyncio() async def test_get_all_exon_coords( test_egc_mapper, nm_152263_exons, nm_152263_exons_genomic_coords From 0734fbdb2b470422f23ab696d103fbf771e67768 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Wed, 8 Jan 2025 09:19:13 -0500 Subject: [PATCH 13/16] Remove duplicate error statement --- tests/mappers/test_exon_genomic_coords.py | 28 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index cb2f8e5..d5dcfc8 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1221,8 +1221,8 @@ async def test_braf(test_egc_mapper, mane_braf): """ inputs = { "genomic_ac": "NC_000007.13", - "seg_start_genomic": 140501359, # GRCh38 coords: 140801559 - "seg_end_genomic": 140453136, # GRCh38 coords: 140753336 + "seg_start_genomic": 140501359, + "seg_end_genomic": 140453136, "gene": "BRAF", "starting_assembly": Assembly.GRCH37.value, } @@ -1230,6 +1230,16 @@ async def test_braf(test_egc_mapper, mane_braf): g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_braf) + inputs = { + "genomic_ac": "NC_000007.14", + "seg_start_genomic": 140801559, + "seg_end_genomic": 140753336, + "gene": "BRAF", + "starting_assembly": Assembly.GRCH38.value, + } + g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(g_to_t_resp, mane_braf) + expected = mane_braf.model_copy(deep=True) expected.seg_start.genomic_location.end = 140801559 t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( @@ -1278,14 +1288,24 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): # MANE since no transcript provided inputs = { "genomic_ac": "NC_000011.9", - "seg_start_genomic": 9597639, # GRCh38 coords: 9576092 - "seg_end_genomic": 9609996, # GRCh38 coords: 9588449 + "seg_start_genomic": 9597639, + "seg_end_genomic": 9609996, "gene": "WEE1", "starting_assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) + inputs = { + "genomic_ac": "NC_000011.10", + "seg_start_genomic": 9576092, + "seg_end_genomic": 9588449, + "gene": "WEE1", + "starting_assembly": Assembly.GRCH38.value, + } + g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( **get_t_to_g_args(g_to_t_resp) ) From 39b4a75ef3e889181b0d5b39d500946488900ba9 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Wed, 8 Jan 2025 10:50:28 -0500 Subject: [PATCH 14/16] Update logic for genomic accession validation --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 9 ++++----- tests/mappers/test_exon_genomic_coords.py | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index a2d9893..dd4cdb3 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -778,14 +778,13 @@ async def _genomic_to_tx_segment( ) if genomic_ac: - genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) - if not genomic_ac_validation: + grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) + if grch38_ac: + genomic_ac = grch38_ac[0] + else: return GenomicTxSeg( errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"] ) - if starting_assembly == Assembly.GRCH37: - grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) - genomic_ac = grch38_ac[0] else: genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index d5dcfc8..57f277c 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1499,13 +1499,13 @@ async def test_invalid(test_egc_mapper): # Invalid accession resp = await test_egc_mapper.genomic_to_tx_segment( - genomic_ac="NC_000001.200", + genomic_ac="NC_000035.200", seg_start_genomic=154191901, seg_end_genomic=154192135, transcript="NM_152263.3", ) genomic_tx_seg_service_checks(resp, is_valid=False) - assert resp.errors == ["Genomic accession does not exist in UTA: NC_000001.200"] + assert resp.errors == ["Genomic accession does not exist in UTA: NC_000035.200"] # Invalid coordinates resp = await test_egc_mapper.genomic_to_tx_segment( From 06244a91d3e09bc2bc3680d5d10abe00cbd04e22 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Wed, 8 Jan 2025 11:57:59 -0500 Subject: [PATCH 15/16] Update docstring --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index dd4cdb3..46f89bb 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -442,7 +442,8 @@ async def genomic_to_tx_segment( used. :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, must provide ``chromosome. If ``chromosome`` is also provided, - ``genomic_ac`` will be used. + ``genomic_ac`` will be used. If the genomic accession is from GRCh37, it + will be lifted over to GRCh38 :param seg_start_genomic: Genomic position where the transcript segment starts :param seg_end_genomic: Genomic position where the transcript segment ends :param transcript: The transcript to use. If this is not given, we will try the @@ -749,7 +750,8 @@ async def _genomic_to_tx_segment( If ``genomic_ac`` is also provided, ``genomic_ac`` will be used. :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` - will be used. + will be used. If the genomic accession is from GRCh37, it will be lifted + over to GRCh38 :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript From 2f6b224be1121a6c2bd058cea6b8458914a31ba3 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <jarbesfeld@gmail.com> Date: Wed, 8 Jan 2025 12:01:34 -0500 Subject: [PATCH 16/16] Indicate that accession version will be ignored --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 46f89bb..40756fb 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -443,7 +443,8 @@ async def genomic_to_tx_segment( :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` will be used. If the genomic accession is from GRCh37, it - will be lifted over to GRCh38 + will be lifted over to GRCh38 and the original accession version will be + ignored :param seg_start_genomic: Genomic position where the transcript segment starts :param seg_end_genomic: Genomic position where the transcript segment ends :param transcript: The transcript to use. If this is not given, we will try the @@ -751,7 +752,7 @@ async def _genomic_to_tx_segment( :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` will be used. If the genomic accession is from GRCh37, it will be lifted - over to GRCh38 + over to GRCh38 and the original accession version will be ignored :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript