From a9c74229dc0ad828e1d694cc0222a078bc74a7a5 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Tue, 12 Nov 2024 17:49:57 -0500 Subject: [PATCH 1/7] Check if MANE transcript exists in UTA, if not select longest compatible --- src/cool_seq_tool/app.py | 1 + .../mappers/exon_genomic_coords.py | 24 ++++++++++++------- src/cool_seq_tool/sources/uta_database.py | 15 ++++++++++++ tests/mappers/test_exon_genomic_coords.py | 10 ++++++++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py index 1d9004b..6b1b456 100644 --- a/src/cool_seq_tool/app.py +++ b/src/cool_seq_tool/app.py @@ -107,6 +107,7 @@ def __init__( self.ex_g_coords_mapper = ExonGenomicCoordsMapper( self.seqrepo_access, self.uta_db, + self.mane_transcript, self.mane_transcript_mappings, self.liftover, ) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index a03e604..1edbfc2 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -7,7 +7,9 @@ from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.mappers.liftover import LiftOver +from cool_seq_tool.mappers.mane_transcript import ManeTranscript from cool_seq_tool.schemas import ( + AnnotationLayer, Assembly, BaseModelForbidExtra, ServiceMeta, @@ -232,6 +234,7 @@ def __init__( self, seqrepo_access: SeqRepoAccess, uta_db: UtaDatabase, + mane_transcript: ManeTranscript, mane_transcript_mappings: ManeTranscriptMappings, liftover: LiftOver, ) -> None: @@ -261,6 +264,7 @@ def __init__( """ self.seqrepo_access = seqrepo_access self.uta_db = uta_db + self.mane_transcript = mane_transcript self.mane_transcript_mappings = mane_transcript_mappings self.liftover = liftover @@ -796,18 +800,23 @@ async def _genomic_to_tx_segment( mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data( gene ) - - if mane_transcripts: + if mane_transcripts and await self.uta_db.validate_mane_transcript_acc( + mane_transcripts + ): transcript = mane_transcripts[0]["RefSeq_nuc"] else: # Attempt to find a coding transcript if a MANE transcript # cannot be found - results = await self.uta_db.get_transcripts( - gene=gene, alt_ac=genomic_ac + results = ( + await self.mane_transcript.get_longest_compatible_transcript( + start_pos=genomic_pos, + end_pos=genomic_pos, + start_annotation_layer=AnnotationLayer.GENOMIC, + gene=gene, + ) ) - - if not results.is_empty(): - transcript = results[0]["tx_ac"][0] + if results: + transcript = results.refseq else: # Run if gene is for a noncoding transcript query = f""" @@ -826,7 +835,6 @@ async def _genomic_to_tx_segment( f"Could not find a transcript for {gene} on {genomic_ac}" ] ) - tx_exons = await self._get_all_exon_coords( tx_ac=transcript, genomic_ac=genomic_ac ) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index ee0c631..d98012f 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -378,6 +378,21 @@ async def validate_genomic_ac(self, ac: str) -> bool: result = await self.execute_query(query) return result[0][0] + async def validate_mane_transcript_acc(self, transcript_list: list[dict]) -> bool: + """Return whether or not a MANE transcript exists in UTA + + :param transcript_list: A list of MANE transcripts + :return ``True`` if transcript accession exists in UTA. ``False`` otherwise + """ + transcript = transcript_list[0]["RefSeq_nuc"] + query = f""" + SELECT * + FROM {self.schema}.tx_exon_aln_v + WHERE tx_ac = '{transcript}' + """ # noqa: S608 + results = await self.execute_query(query) + return bool(results) + async def get_ac_descr(self, ac: str) -> str | None: """Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds. diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 07ff3ff..4309423 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1044,6 +1044,16 @@ async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): ) genomic_tx_seg_checks(resp, tpm3_exon8) + # Check if transcript exists in UTA. If not, return longest compatible transcript + resp = await test_egc_mapper._genomic_to_tx_segment( + 22513522, + genomic_ac="NC_000016.10", + is_seg_start=False, + gene="NPIPB5", + get_nearest_transcript_junction=True, + ) + assert resp.tx_ac == "NM_001135865.2" + @pytest.mark.asyncio() async def test_tpm3( From b43065caa0f8d67a8ea3c655f30898fe9745d2bd Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Wed, 13 Nov 2024 10:28:22 -0500 Subject: [PATCH 2/7] Add support when get_nearest_transcript_junction is false --- .../mappers/exon_genomic_coords.py | 54 +++++++++++++++---- tests/mappers/test_exon_genomic_coords.py | 1 - 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 1edbfc2..042f588 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -1073,24 +1073,56 @@ async def _get_tx_seg_genomic_metadata( transcript :return: Transcript segment data and associated genomic metadata """ - if tx_ac: - # We should always try to liftover - grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) - if not grch38_ac: - return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"]) - grch38_ac = grch38_ac[0] - else: + # We should always try to liftover + grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) + if not grch38_ac: + return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"]) + grch38_ac = grch38_ac[0] + + non_mane_transcript = None + + if not tx_ac: mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene) if not mane_data: err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}" if gene: err_msg += f" on gene {gene}" _logger.warning(err_msg) - return GenomicTxSeg(errors=[err_msg]) + if not await self.uta_db.validate_mane_transcript_acc(mane_data): + results = await self.mane_transcript.get_longest_compatible_transcript( + start_pos=genomic_pos, + end_pos=genomic_pos, + start_annotation_layer=AnnotationLayer.GENOMIC, + gene=gene, + ) + if results: + non_mane_transcript = results.refseq + else: + # Run if gene is for a noncoding transcript + query = f""" + SELECT DISTINCT tx_ac + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE hgnc = '{gene}' + AND alt_ac = '{genomic_ac}' + """ # noqa: S608 + result = await self.uta_db.execute_query(query) + + if result: + non_mane_transcript = result[0]["tx_ac"] + else: + return GenomicTxSeg( + errors=[ + f"Could not find a transcript for {gene} on {genomic_ac}" + ] + ) + return GenomicTxSeg(errors=[err_msg]) - mane_data = mane_data[0] - tx_ac = mane_data["RefSeq_nuc"] - grch38_ac = mane_data["GRCh38_chr"] + if non_mane_transcript: + tx_ac = non_mane_transcript + else: + mane_data = mane_data[0] + tx_ac = mane_data["RefSeq_nuc"] + grch38_ac = mane_data["GRCh38_chr"] # Always liftover to GRCh38 genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 4309423..3c49bd4 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1050,7 +1050,6 @@ async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): genomic_ac="NC_000016.10", is_seg_start=False, gene="NPIPB5", - get_nearest_transcript_junction=True, ) assert resp.tx_ac == "NM_001135865.2" From fed9ed1c5e648f7f018ff5e94525bc080a7010d1 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Wed, 13 Nov 2024 10:45:52 -0500 Subject: [PATCH 3/7] Refactor transcript selection --- .../mappers/exon_genomic_coords.py | 98 ++++++++----------- 1 file changed, 43 insertions(+), 55 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 042f588..eb10f93 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -805,36 +805,9 @@ async def _genomic_to_tx_segment( ): transcript = mane_transcripts[0]["RefSeq_nuc"] else: - # Attempt to find a coding transcript if a MANE transcript - # cannot be found - results = ( - await self.mane_transcript.get_longest_compatible_transcript( - start_pos=genomic_pos, - end_pos=genomic_pos, - start_annotation_layer=AnnotationLayer.GENOMIC, - gene=gene, - ) + transcript = await self._select_optimal_transcript( + genomic_pos, genomic_ac, gene ) - if results: - transcript = results.refseq - else: - # Run if gene is for a noncoding transcript - query = f""" - SELECT DISTINCT tx_ac - FROM {self.uta_db.schema}.tx_exon_aln_v - WHERE hgnc = '{gene}' - AND alt_ac = '{genomic_ac}' - """ # noqa: S608 - result = await self.uta_db.execute_query(query) - - if result: - transcript = result[0]["tx_ac"] - else: - return GenomicTxSeg( - errors=[ - f"Could not find a transcript for {gene} on {genomic_ac}" - ] - ) tx_exons = await self._get_all_exon_coords( tx_ac=transcript, genomic_ac=genomic_ac ) @@ -942,6 +915,45 @@ async def _genomic_to_tx_segment( genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript ) + async def _select_optimal_transcript( + self, genomic_pos: int, genomic_ac: str, gene: str + ) -> str: + """Select the optimal transcript given a genomic position, accession, and gene. + + :param genomic_pos: Genomic position where the transcript segment starts or ends + (inter-residue based) + :param genomic_ac: Genomic accession + :param gene: Valid, case-sensitive HGNC gene symbol + :return A string representing the optimal transcript given the above data + """ + # Attempt to find a coding transcript if a MANE transcript cannot be found + transcript = None + results = await self.mane_transcript.get_longest_compatible_transcript( + start_pos=genomic_pos, + end_pos=genomic_pos, + start_annotation_layer=AnnotationLayer.GENOMIC, + gene=gene, + ) + if results: + transcript = results.refseq + else: + # Run if gene is for a noncoding transcript + query = f""" + SELECT DISTINCT tx_ac + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE hgnc = '{gene}' + AND alt_ac = '{genomic_ac}' + """ # noqa: S608 + result = await self.uta_db.execute_query(query) + + if result: + transcript = result[0]["tx_ac"] + else: + return GenomicTxSeg( + errors=[f"Could not find a transcript for {gene} on {genomic_ac}"] + ) + return transcript + async def _get_grch38_ac_pos( self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None ) -> tuple[str | None, int | None, str | None]: @@ -1089,33 +1101,9 @@ async def _get_tx_seg_genomic_metadata( err_msg += f" on gene {gene}" _logger.warning(err_msg) if not await self.uta_db.validate_mane_transcript_acc(mane_data): - results = await self.mane_transcript.get_longest_compatible_transcript( - start_pos=genomic_pos, - end_pos=genomic_pos, - start_annotation_layer=AnnotationLayer.GENOMIC, - gene=gene, + non_mane_transcript = await self._select_optimal_transcript( + genomic_pos, genomic_ac, gene ) - if results: - non_mane_transcript = results.refseq - else: - # Run if gene is for a noncoding transcript - query = f""" - SELECT DISTINCT tx_ac - FROM {self.uta_db.schema}.tx_exon_aln_v - WHERE hgnc = '{gene}' - AND alt_ac = '{genomic_ac}' - """ # noqa: S608 - result = await self.uta_db.execute_query(query) - - if result: - non_mane_transcript = result[0]["tx_ac"] - else: - return GenomicTxSeg( - errors=[ - f"Could not find a transcript for {gene} on {genomic_ac}" - ] - ) - return GenomicTxSeg(errors=[err_msg]) if non_mane_transcript: tx_ac = non_mane_transcript From 12d0f7ebd27c89c8d47d568adffa9965d79ab27a Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Wed, 13 Nov 2024 14:41:28 -0500 Subject: [PATCH 4/7] Make edits to function docstrings and variable names --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 8 ++++++-- src/cool_seq_tool/sources/uta_database.py | 8 +++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index eb10f93..6e3c006 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -260,6 +260,7 @@ def __init__( :param seqrepo_access: SeqRepo instance to give access to query SeqRepo database :param uta_db: UtaDatabase instance to give access to query UTA database :param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class + :param mane_transcript: Instance to provide access to the ManeTranscript class :param liftover: Instance to provide mapping between human genome assemblies """ self.seqrepo_access = seqrepo_access @@ -800,7 +801,7 @@ async def _genomic_to_tx_segment( mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data( gene ) - if mane_transcripts and await self.uta_db.validate_mane_transcript_acc( + if mane_transcripts and await self.uta_db.validate_mane_transcript_ac( mane_transcripts ): transcript = mane_transcripts[0]["RefSeq_nuc"] @@ -919,6 +920,9 @@ async def _select_optimal_transcript( self, genomic_pos: int, genomic_ac: str, gene: str ) -> str: """Select the optimal transcript given a genomic position, accession, and gene. + In this case, optimal refers to the transcript that is the best represenative + transcript for this data, given that MANE data does not exist for the + gene or the MANE transcript for the gene does not exist in UTA :param genomic_pos: Genomic position where the transcript segment starts or ends (inter-residue based) @@ -1100,7 +1104,7 @@ async def _get_tx_seg_genomic_metadata( if gene: err_msg += f" on gene {gene}" _logger.warning(err_msg) - if not await self.uta_db.validate_mane_transcript_acc(mane_data): + if not await self.uta_db.validate_mane_transcript_ac(mane_data): non_mane_transcript = await self._select_optimal_transcript( genomic_pos, genomic_ac, gene ) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index d98012f..3e3161e 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -378,13 +378,15 @@ async def validate_genomic_ac(self, ac: str) -> bool: result = await self.execute_query(query) return result[0][0] - async def validate_mane_transcript_acc(self, transcript_list: list[dict]) -> bool: - """Return whether or not a MANE transcript exists in UTA + async def validate_mane_transcript_ac(self, mane_transcripts: list[dict]) -> bool: + """Return whether or not a MANE transcript exists in UTA. This looks at the + first entry in the MANE transcripts list as this is the higher priority + transcript. :param transcript_list: A list of MANE transcripts :return ``True`` if transcript accession exists in UTA. ``False`` otherwise """ - transcript = transcript_list[0]["RefSeq_nuc"] + transcript = mane_transcripts[0]["RefSeq_nuc"] query = f""" SELECT * FROM {self.schema}.tx_exon_aln_v From 10a4357fe112ab0106f2572509de7a074b4096d6 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 14 Nov 2024 09:52:10 -0500 Subject: [PATCH 5/7] Add try_longest_compatible --- .../mappers/exon_genomic_coords.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 6e3c006..fd22ab8 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -416,6 +416,7 @@ async def genomic_to_tx_segment( seg_end_genomic: int | None = None, transcript: str | None = None, get_nearest_transcript_junction: bool = False, + try_longest_compatible: bool = True, gene: str | None = None, ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. @@ -461,6 +462,8 @@ async def genomic_to_tx_segment( following the breakpoint for the 3' end. For the negative strand, adjacent is defined as the exon following the breakpoint for the 5' end and the exon preceding the breakpoint for the 3' end. + :param try_longest_compatible: ``True`` if should try longest compatible remaining + if mane transcript was not compatible. ``False`` otherwise. :param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript`` value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and @@ -489,6 +492,7 @@ async def genomic_to_tx_segment( transcript=transcript, gene=gene, get_nearest_transcript_junction=get_nearest_transcript_junction, + try_longest_compatible=try_longest_compatible, is_seg_start=True, ) if start_tx_seg_data.errors: @@ -509,6 +513,7 @@ async def genomic_to_tx_segment( transcript=transcript, gene=gene, get_nearest_transcript_junction=get_nearest_transcript_junction, + try_longest_compatible=try_longest_compatible, is_seg_start=False, ) if end_tx_seg_data.errors: @@ -739,6 +744,7 @@ async def _genomic_to_tx_segment( transcript: str | None = None, gene: str | None = None, get_nearest_transcript_junction: bool = False, + try_longest_compatible: bool = True, is_seg_start: bool = True, ) -> GenomicTxSeg: """Given genomic data, generate a boundary for a transcript segment. @@ -766,6 +772,8 @@ async def _genomic_to_tx_segment( following the breakpoint for the 3' end. For the negative strand, adjacent is defined as the exon following the breakpoint for the 5' end and the exon preceding the breakpoint for the 3' end. + :param try_longest_compatible: ``True`` if should try longest compatible remaining + if mane transcript was not compatible. ``False`` otherwise. :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. ``False`` if ``genomic_pos`` is where the transcript segment ends. :return: Data for a transcript segment boundary (inter-residue coordinates) @@ -806,9 +814,16 @@ async def _genomic_to_tx_segment( ): transcript = mane_transcripts[0]["RefSeq_nuc"] else: - transcript = await self._select_optimal_transcript( - genomic_pos, genomic_ac, gene - ) + if try_longest_compatible: + transcript = await self._select_optimal_transcript( + genomic_pos, genomic_ac, gene + ) + else: + return GenomicTxSeg( + errors=[ + "A MANE transcript either does not exist or was not found in UTA. Please set `try_longest_compatible` to ``True`` to re-run" + ] + ) tx_exons = await self._get_all_exon_coords( tx_ac=transcript, genomic_ac=genomic_ac ) From 805a1e2af56e343dc84616840ef762383c9604e0 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 14 Nov 2024 11:12:17 -0500 Subject: [PATCH 6/7] Update uta query for non-coding transcripts --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 14 +++++++++++--- src/cool_seq_tool/mappers/mane_transcript.py | 9 +++------ tests/mappers/test_mane_transcript.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index fd22ab8..14ed514 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -2,6 +2,7 @@ import logging +import polars as pl from ga4gh.vrs.models import SequenceLocation, SequenceReference from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator @@ -952,21 +953,28 @@ async def _select_optimal_transcript( end_pos=genomic_pos, start_annotation_layer=AnnotationLayer.GENOMIC, gene=gene, + alt_ac=genomic_ac, ) if results: transcript = results.refseq else: # Run if gene is for a noncoding transcript query = f""" - SELECT DISTINCT tx_ac + SELECT * FROM {self.uta_db.schema}.tx_exon_aln_v WHERE hgnc = '{gene}' AND alt_ac = '{genomic_ac}' """ # noqa: S608 - result = await self.uta_db.execute_query(query) + results = await self.uta_db.execute_query(query) + schema = ["tx_ac", "alt_ac", "hgnc"] + transcripts = [(r["tx_ac"], r["alt_ac"], r["hgnc"]) for r in results] + transcripts = pl.DataFrame(data=transcripts, schema=schema, orient="row") + result = self.mane_transcript.get_prioritized_transcripts_from_gene( + transcripts + ) if result: - transcript = result[0]["tx_ac"] + transcript = result[0] else: return GenomicTxSeg( errors=[f"Could not find a transcript for {gene} on {genomic_ac}"] diff --git a/src/cool_seq_tool/mappers/mane_transcript.py b/src/cool_seq_tool/mappers/mane_transcript.py index 678afc5..4f6e3df 100644 --- a/src/cool_seq_tool/mappers/mane_transcript.py +++ b/src/cool_seq_tool/mappers/mane_transcript.py @@ -640,7 +640,7 @@ def validate_index( )[0] ) - def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list: + def get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list: """Sort and filter transcripts from gene to get priority list :param df: Data frame containing transcripts from gene @@ -655,10 +655,7 @@ def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list: copy_df = copy_df.with_columns( [ pl.col("tx_ac") - .str.split(".") - .list.get(0) - .str.split("NM_") - .list.get(1) + .str.extract(r"(?:NM_|NR_|NG_)(\d+)", 1) .cast(pl.Int64) .alias("ac_no_version_as_int"), pl.col("tx_ac") @@ -796,7 +793,7 @@ def _get_protein_rep( _logger.warning("Unable to get transcripts from gene %s", gene) return lcr_result - prioritized_tx_acs = self._get_prioritized_transcripts_from_gene(df) + prioritized_tx_acs = self.get_prioritized_transcripts_from_gene(df) if mane_transcripts: # Dont check MANE transcripts since we know that are not compatible diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py index fb515e5..5054bfd 100644 --- a/tests/mappers/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -499,7 +499,7 @@ def get_reference_sequence(ac): data, schema=["pro_ac", "tx_ac", "alt_ac", "cds_start_i"], orient="row" ) - resp = test_mane_transcript._get_prioritized_transcripts_from_gene(test_df) + resp = test_mane_transcript.get_prioritized_transcripts_from_gene(test_df) assert resp == ["NM_004333.6", "NM_001374258.2", "NM_001378472.1"] From bb14ea7e148d8b2ee8fb2a3d20865a4e791212e3 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 14 Nov 2024 13:08:31 -0500 Subject: [PATCH 7/7] Get index of tx_ac column --- src/cool_seq_tool/mappers/exon_genomic_coords.py | 8 +++++--- src/cool_seq_tool/mappers/mane_transcript.py | 8 ++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 14ed514..53c9bad 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -966,9 +966,11 @@ async def _select_optimal_transcript( AND alt_ac = '{genomic_ac}' """ # noqa: S608 results = await self.uta_db.execute_query(query) - schema = ["tx_ac", "alt_ac", "hgnc"] - transcripts = [(r["tx_ac"], r["alt_ac"], r["hgnc"]) for r in results] - transcripts = pl.DataFrame(data=transcripts, schema=schema, orient="row") + schema = ["tx_ac"] + transcripts = [(r["tx_ac"]) for r in results] + transcripts = pl.DataFrame( + data=transcripts, schema=schema, orient="row" + ).unique() result = self.mane_transcript.get_prioritized_transcripts_from_gene( transcripts ) diff --git a/src/cool_seq_tool/mappers/mane_transcript.py b/src/cool_seq_tool/mappers/mane_transcript.py index 4f6e3df..c7058f0 100644 --- a/src/cool_seq_tool/mappers/mane_transcript.py +++ b/src/cool_seq_tool/mappers/mane_transcript.py @@ -651,7 +651,8 @@ def get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list: most recent version of a transcript associated with an assembly will be kept """ copy_df = df.clone() - copy_df = copy_df.drop("alt_ac").unique() + if "alt_ac" in copy_df.columns: + copy_df = copy_df.drop("alt_ac").unique() copy_df = copy_df.with_columns( [ pl.col("tx_ac") @@ -670,9 +671,12 @@ def get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list: ) copy_df = copy_df.unique(["ac_no_version_as_int"], keep="first") + tx_ac_index = copy_df.columns.index("tx_ac") copy_df = copy_df.with_columns( copy_df.map_rows( - lambda x: len(self.seqrepo_access.get_reference_sequence(x[1])[0]) + lambda x: len( + self.seqrepo_access.get_reference_sequence(x[tx_ac_index])[0] + ) ) .to_series() .alias("len_of_tx")