Skip to content

Commit

Permalink
Fix liftover issues
Browse files Browse the repository at this point in the history
  • Loading branch information
jarbesfeld committed Jan 2, 2025
1 parent fc54151 commit 6774927
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 183 deletions.
180 changes: 0 additions & 180 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,179 +720,6 @@ def _get_vrs_seq_loc(
end=genomic_pos if not use_start else None,
), None

async def _genomic_to_tx_segment_old(
self,
genomic_pos: int,
chromosome: str | None = None,
genomic_ac: str | None = None,
transcript: str | None = None,
gene: str | None = None,
is_seg_start: bool = True,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
) -> GenomicTxSeg:
"""Given genomic data, generate a boundary for a transcript segment.
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
errors.
:param genomic_pos: Genomic position where the transcript segment starts or ends
:param chromosome: Chromosome. Must give chromosome without a prefix
(i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac``
will be used.
:param transcript: The transcript to use. If this is not given, we will try the
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
Compatible Transcript
:param gene: Valid, case-sensitive HGNC gene symbol
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
``False`` if ``genomic_pos`` is where the transcript segment ends.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:return: Data for a transcript segment boundary (inter-residue coordinates)
"""
params = {key: None for key in GenomicTxSeg.model_fields}

if not gene and not transcript:
return GenomicTxSeg(
errors=[
"`gene` or `transcript` must be provided to select the adjacent transcript junction"
]
)

if not genomic_ac:
for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
_genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
f"{assembly}:chr{chromosome}", "refseq"
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])
genomic_ac = _genomic_acs[0].split(":")[-1]

# Validate gene symbol exists
if gene:
valid_gene = await self.uta_db.validate_gene_symbol(gene=gene)
if not valid_gene:
return GenomicTxSeg(errors=[f"{gene} does not exist in UTA"])

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac, genomic_pos
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])

# Validate coordinate is plausible
coordinate_check = await self._validate_gene_coordinates(
pos=genomic_pos, genomic_ac=genomic_ac, gene=gene
)
if not coordinate_check:
return GenomicTxSeg(
errors=[
f"{genomic_pos} on {genomic_ac} does not occur within the exons for {gene}"
]
)

if not transcript:
# Select a transcript if not provided
mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene)

if mane_transcripts:
transcript = mane_transcripts[0]["RefSeq_nuc"]
else:
# Attempt to find a coding transcript if a MANE transcript
# cannot be found
results = await self.uta_db.get_transcripts(
gene=gene, alt_ac=genomic_ac
)

if not results.is_empty():
transcript = results[0]["tx_ac"][0]
else:
# Run if gene is for a noncoding transcript
query = f"""
SELECT DISTINCT tx_ac
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE hgnc = '{gene}'
AND alt_ac = '{genomic_ac}'
""" # noqa: S608
result = await self.uta_db.execute_query(query)

if result:
transcript = result[0]["tx_ac"]
else:
return GenomicTxSeg(
errors=[
f"Could not find a transcript for {gene} on {genomic_ac}"
]
)

tx_exons = await self._get_all_exon_coords(
tx_ac=transcript, genomic_ac=genomic_ac
)
if not tx_exons:
return GenomicTxSeg(errors=[f"No exons found given {transcript}"])

strand = Strand(tx_exons[0].alt_strand)
params["strand"] = strand
use_alt_start_i = self._use_alt_start_i(
is_seg_start=is_seg_start, strand=strand
)
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue

# gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
if not gene:
_gene, err_msg = await self._get_tx_ac_gene(transcript)
if err_msg:
return GenomicTxSeg(errors=[err_msg])
gene = _gene

# Check if breakpoint occurs on an exon.
# If not, determine the adjacent exon given the selected transcript
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
exon_num = self._get_adjacent_exon(
tx_exons_genomic_coords=tx_exons,
strand=strand,
start=genomic_pos if is_seg_start else None,
end=genomic_pos if not is_seg_start else None,
)

offset = self._get_exon_offset(
genomic_pos=genomic_pos,
exon_boundary=tx_exons[exon_num].alt_start_i
if use_alt_start_i
else tx_exons[exon_num].alt_end_i,
strand=strand,
)

genomic_location, err_msg = self._get_vrs_seq_loc(
genomic_ac, genomic_pos, is_seg_start, strand
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])

return GenomicTxSeg(
gene=gene,
genomic_ac=genomic_ac,
tx_ac=transcript,
seg=TxSegment(
exon_ord=exon_num,
offset=offset,
genomic_location=genomic_location,
),
)

return await self._get_tx_seg_genomic_metadata(
genomic_ac,
genomic_pos,
is_seg_start,
gene,
tx_ac=transcript,
)

async def _genomic_to_tx_segment(
self,
genomic_pos: int,
Expand Down Expand Up @@ -1232,13 +1059,6 @@ async def _get_tx_seg_genomic_metadata(
tx_ac = mane_data["RefSeq_nuc"]
grch38_ac = mane_data["GRCh38_chr"]

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac, genomic_pos, grch38_ac=grch38_ac
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])

tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
if not tx_exons:
return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
Expand Down
6 changes: 3 additions & 3 deletions tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -1437,9 +1437,9 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

# inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9609996}
# resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
# assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))
inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

inputs = {"transcript": "NM_003390.3", "exon_start": 2}
resp = await test_egc_mapper.tx_segment_to_genomic(**inputs)
Expand Down

0 comments on commit 6774927

Please sign in to comment.