Skip to content

Commit

Permalink
Add support for providing assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
jarbesfeld committed Jan 3, 2025
1 parent 9bc2c31 commit 3a7c914
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 100 deletions.
95 changes: 36 additions & 59 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ async def genomic_to_tx_segment(
transcript: str | None = None,
gene: str | None = None,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
assembly: Assembly = Assembly.GRCH38,
) -> GenomicTxSegService:
"""Get transcript segment data for genomic data, lifted over to GRCh38.
Expand Down Expand Up @@ -452,6 +453,8 @@ async def genomic_to_tx_segment(
value is provided.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:param assembly: The assembly that the supplied coordinate comes from. Set to
GRCh38 be default
:return: Genomic data (inter-residue coordinates)
"""
errors = []
Expand All @@ -477,6 +480,7 @@ async def genomic_to_tx_segment(
gene=gene,
is_seg_start=True,
coordinate_type=coordinate_type,
assembly=assembly,
)
if start_tx_seg_data.errors:
return _return_service_errors(start_tx_seg_data.errors)
Expand All @@ -497,6 +501,7 @@ async def genomic_to_tx_segment(
gene=gene,
is_seg_start=False,
coordinate_type=coordinate_type,
assembly=assembly,
)
if end_tx_seg_data.errors:
return _return_service_errors(end_tx_seg_data.errors)
Expand Down Expand Up @@ -727,6 +732,7 @@ async def _genomic_to_tx_segment(
gene: str | None = None,
is_seg_start: bool = True,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
assembly: Assembly = Assembly.GRCH38,
) -> GenomicTxSeg:
"""Given genomic data, generate a boundary for a transcript segment.
Expand All @@ -749,6 +755,8 @@ async def _genomic_to_tx_segment(
``False`` if ``genomic_pos`` is where the transcript segment ends.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:param assembly: The assembly that the supplied coordinate comes from. Set to
GRCh38 be default
:return: Data for a transcript segment boundary (inter-residue coordinates)
"""
params = {key: None for key in GenomicTxSeg.model_fields}
Expand All @@ -771,6 +779,9 @@ async def _genomic_to_tx_segment(
genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac)
if not genomic_ac_validation:
return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"])
if assembly == Assembly.GRCH37:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
genomic_ac = grch38_ac[0]
else:
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)

Expand All @@ -780,13 +791,15 @@ async def _genomic_to_tx_segment(
)
genomic_ac = genomic_acs[0]

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac,
genomic_pos,
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])
# Liftover to GRCh38 if the provided assembly is GRCh37
if assembly == Assembly.GRCH37:
genomic_pos = await self._get_grch38_pos(genomic_pos, genomic_ac)
if not genomic_pos:
return GenomicTxSeg(
errors=[
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful."
]
)

# Select a transcript if not provided
if not transcript:
Expand Down Expand Up @@ -896,59 +909,23 @@ async def _genomic_to_tx_segment(
tx_ac=transcript,
)

async def _get_grch38_ac_pos(
self,
genomic_ac: str,
genomic_pos: int,
grch38_ac: str | None = None,
) -> tuple[str | None, int | None, str | None]:
"""Get GRCh38 genomic representation for accession and position
:param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
:param genomic_pos: Genomic position on ``genomic_ac``
:param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
provided, will attempt to retrieve associated GRCh38 accession from UTA.
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
if unable to get GRCh38 representation
"""
# Validate accession exists
if not grch38_ac:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return None, None, f"Unrecognized genomic accession: {genomic_ac}."

grch38_ac = grch38_ac[0]

if grch38_ac != genomic_ac:
# Ensure genomic_ac is GRCh37
chromosome, _ = self.seqrepo_access.translate_identifier(
genomic_ac, Assembly.GRCH37.value
)
if not chromosome:
_logger.warning(
"SeqRepo could not find associated %s assembly for genomic accession %s.",
Assembly.GRCH37.value,
genomic_ac,
)
return (
None,
None,
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
)
chromosome = chromosome[-1].split(":")[-1]
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return (
None,
None,
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
)
genomic_pos = liftover_data[1]
genomic_ac = grch38_ac
async def _get_grch38_pos(self, genomic_pos: int, genomic_ac: str) -> int | None:
"""Liftover a GRCh37 coordinate to GRCh38
return genomic_ac, genomic_pos, None
:param genomic_pos: A genomic coordinate in GRCh37
:param genomic_ac: The genomic accession in GRCh38
:return The genomic coordinate in GRCh38
"""
chromosome, _ = self.seqrepo_access.translate_identifier(
genomic_ac, target_namespaces=Assembly.GRCH38.value
)
chromosome = chromosome[-1].split(":")[-1]
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return None
return liftover_data[1]

async def _validate_gene_coordinates(
self,
Expand Down
61 changes: 20 additions & 41 deletions tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
_ExonCoord,
)
from cool_seq_tool.schemas import (
Assembly,
CoordinateType,
Strand,
)
Expand Down Expand Up @@ -704,48 +705,15 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True):


@pytest.mark.asyncio()
async def test_get_grch38_ac_pos(test_egc_mapper):
"""Test that _get_grch38_ac_pos works correctly"""
grch38_ac = "NC_000001.11"
grch38_pos = 154192135
expected = grch38_ac, grch38_pos, None

# GRCh37 provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611)
assert grch38_data == expected

# GRCh38 provided, no grch38_ac
grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos)
assert grch38_data == expected

# GRCh38 and grch38_ac provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
grch38_ac, grch38_pos, grch38_ac=grch38_ac
)
assert grch38_data == expected

# Unrecognized accession
invalid_ac = "NC_0000026.10"
grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611)
assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.")

# GRCh36 used
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611)
assert grch38_data == (
None,
None,
"`genomic_ac` must use GRCh37 or GRCh38 assembly.",
)
async def test_get_grch38_pos(test_egc_mapper):
"""Test that get_grch38_pos works correctly"""
genomic_pos = await test_egc_mapper._get_grch38_pos(9609996, "NC_000011.10")
assert genomic_pos == 9588449

# Unsuccessful liftover
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
"NC_000001.10", 9999999999999999999
)
assert grch38_data == (
None,
None,
"Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.",
genomic_pos = await test_egc_mapper._get_grch38_pos(
9609996999999999, "NC_000011.10"
)
assert genomic_pos is None


@pytest.mark.asyncio()
Expand Down Expand Up @@ -1268,6 +1236,7 @@ async def test_braf(test_egc_mapper, mane_braf):
"seg_start_genomic": 140501359, # GRCh38 coords: 140801559
"seg_end_genomic": 140453136, # GRCh38 coords: 140753336
"gene": "BRAF",
"assembly": Assembly.GRCH37.value,
}
# MANE
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
Expand All @@ -1291,6 +1260,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_start_genomic": 9597639,
"seg_end_genomic": 9609996,
"transcript": "NM_003390.3",
"assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
Expand All @@ -1307,6 +1277,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_end_genomic": 9609996,
"transcript": "NM_003390.3",
"gene": "WEE1",
"assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
Expand All @@ -1322,6 +1293,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_start_genomic": 9597639, # GRCh38 coords: 9576092
"seg_end_genomic": 9609996, # GRCh38 coords: 9588449
"gene": "WEE1",
"assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11)
Expand Down Expand Up @@ -1433,11 +1405,17 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
"gene": "WEE1",
"genomic_ac": "NC_000011.9",
"seg_end_genomic": 9609996,
"assembly": Assembly.GRCH37.value,
}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449}
inputs = {
"gene": "WEE1",
"chromosome": "11",
"seg_end_genomic": 9609996,
"assembly": Assembly.GRCH37.value,
}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

Expand Down Expand Up @@ -1471,6 +1449,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
seg_start_genomic=73442503,
seg_end_genomic=73457929, # not on an exon
gene="ELN",
assembly=Assembly.GRCH37.value,
)
genomic_tx_seg_service_checks(resp, eln_grch38_intronic)

Expand Down

0 comments on commit 3a7c914

Please sign in to comment.