Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: add starting_assembly as argument when using genomic_to_tx_segment #391

Merged
merged 21 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 41 additions & 55 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ async def genomic_to_tx_segment(
transcript: str | None = None,
gene: str | None = None,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
starting_assembly: Assembly = Assembly.GRCH38,
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
) -> GenomicTxSegService:
"""Get transcript segment data for genomic data, lifted over to GRCh38.

Expand Down Expand Up @@ -441,7 +442,9 @@ async def genomic_to_tx_segment(
used.
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
must provide ``chromosome. If ``chromosome`` is also provided,
``genomic_ac`` will be used.
``genomic_ac`` will be used. If the genomic accession is from GRCh37, it
will be lifted over to GRCh38 and the original accession version will be
ignored
:param seg_start_genomic: Genomic position where the transcript segment starts
:param seg_end_genomic: Genomic position where the transcript segment ends
:param transcript: The transcript to use. If this is not given, we will try the
Expand All @@ -452,6 +455,8 @@ async def genomic_to_tx_segment(
value is provided.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:param starting_assembly: The assembly that the supplied coordinate comes from. Set to
GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37
:return: Genomic data (inter-residue coordinates)
"""
errors = []
Expand All @@ -477,6 +482,7 @@ async def genomic_to_tx_segment(
gene=gene,
is_seg_start=True,
coordinate_type=coordinate_type,
starting_assembly=starting_assembly,
)
if start_tx_seg_data.errors:
return _return_service_errors(start_tx_seg_data.errors)
Expand All @@ -497,6 +503,7 @@ async def genomic_to_tx_segment(
gene=gene,
is_seg_start=False,
coordinate_type=coordinate_type,
starting_assembly=starting_assembly,
)
if end_tx_seg_data.errors:
return _return_service_errors(end_tx_seg_data.errors)
Expand Down Expand Up @@ -727,6 +734,7 @@ async def _genomic_to_tx_segment(
gene: str | None = None,
is_seg_start: bool = True,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
starting_assembly: Assembly = Assembly.GRCH38,
) -> GenomicTxSeg:
"""Given genomic data, generate a boundary for a transcript segment.

Expand All @@ -743,7 +751,8 @@ async def _genomic_to_tx_segment(
If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac``
will be used.
will be used. If the genomic accession is from GRCh37, it will be lifted
over to GRCh38 and the original accession version will be ignored
:param transcript: The transcript to use. If this is not given, we will try the
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
Compatible Transcript
Expand All @@ -752,6 +761,8 @@ async def _genomic_to_tx_segment(
``False`` if ``genomic_pos`` is where the transcript segment ends.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:param starting_assembly: The assembly that the supplied coordinate comes from. Set to
GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37
:return: Data for a transcript segment boundary (inter-residue coordinates)
"""
params = {key: None for key in GenomicTxSeg.model_fields}
Expand All @@ -770,8 +781,10 @@ async def _genomic_to_tx_segment(
)

if genomic_ac:
genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac)
if not genomic_ac_validation:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if grch38_ac:
genomic_ac = grch38_ac[0]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth adding something in the docstrings (for both private + public method) about version will be ignored for GRCh37?

else:
return GenomicTxSeg(
errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"]
)
Expand All @@ -784,13 +797,17 @@ async def _genomic_to_tx_segment(
)
genomic_ac = genomic_acs[0]

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac,
genomic_pos,
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])
# Liftover to GRCh38 if the provided assembly is GRCh37
if starting_assembly == Assembly.GRCH37:
genomic_pos = await self._get_grch38_pos(
genomic_ac, genomic_pos, chromosome=chromosome if chromosome else None
)
if not genomic_pos:
return GenomicTxSeg(
errors=[
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful."
]
)

# Select a transcript if not provided
if not transcript:
Expand Down Expand Up @@ -903,59 +920,28 @@ async def _genomic_to_tx_segment(
),
)

async def _get_grch38_ac_pos(
async def _get_grch38_pos(
self,
genomic_ac: str,
genomic_pos: int,
grch38_ac: str | None = None,
) -> tuple[str | None, int | None, str | None]:
chromosome: str | None = None,
) -> int | None:
"""Get GRCh38 genomic representation for accession and position

:param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
:param genomic_pos: Genomic position on ``genomic_ac``
:param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
provided, will attempt to retrieve associated GRCh38 accession from UTA.
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
if unable to get GRCh38 representation
:param genomic_pos: A genomic coordinate in GRCh37
:param genomic_ac: The genomic accession in GRCh38
:param chromosome: The chromosome that genomic_pos occurs on
:return The genomic coordinate in GRCh38
"""
# Validate accession exists
if not grch38_ac:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return None, None, f"Unrecognized genomic accession: {genomic_ac}."

grch38_ac = grch38_ac[0]

if grch38_ac != genomic_ac:
# Ensure genomic_ac is GRCh37
if not chromosome:
chromosome, _ = self.seqrepo_access.translate_identifier(
genomic_ac, Assembly.GRCH37.value
genomic_ac, target_namespaces=Assembly.GRCH38.value
)
if not chromosome:
_logger.warning(
"SeqRepo could not find associated %s assembly for genomic accession %s.",
Assembly.GRCH37.value,
genomic_ac,
)
return (
None,
None,
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
)
chromosome = chromosome[-1].split(":")[-1]
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return (
None,
None,
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
)
genomic_pos = liftover_data[1]
genomic_ac = grch38_ac

return genomic_ac, genomic_pos, None
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
return liftover_data[1] if liftover_data else None

async def _validate_gene_coordinates(
self,
Expand Down
110 changes: 56 additions & 54 deletions tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
_ExonCoord,
)
from cool_seq_tool.schemas import (
Assembly,
CoordinateType,
Strand,
)
Expand Down Expand Up @@ -703,51 +704,6 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True):
assert len(actual.errors) > 0


@pytest.mark.asyncio()
async def test_get_grch38_ac_pos(test_egc_mapper):
"""Test that _get_grch38_ac_pos works correctly"""
grch38_ac = "NC_000001.11"
grch38_pos = 154192135
expected = grch38_ac, grch38_pos, None

# GRCh37 provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611)
assert grch38_data == expected

# GRCh38 provided, no grch38_ac
grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos)
assert grch38_data == expected

# GRCh38 and grch38_ac provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
grch38_ac, grch38_pos, grch38_ac=grch38_ac
)
assert grch38_data == expected

# Unrecognized accession
invalid_ac = "NC_0000026.10"
grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611)
assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.")

# GRCh36 used
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611)
assert grch38_data == (
None,
None,
"`genomic_ac` must use GRCh37 or GRCh38 assembly.",
)

# Unsuccessful liftover
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
"NC_000001.10", 9999999999999999999
)
assert grch38_data == (
None,
None,
"Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.",
)


@pytest.mark.asyncio()
async def test_get_all_exon_coords(
test_egc_mapper, nm_152263_exons, nm_152263_exons_genomic_coords
Expand Down Expand Up @@ -893,6 +849,21 @@ def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords
assert resp is True # Breakpoint does occur on an exon


def test_use_alt_start_i(test_egc_mapper):
"""Test when to use alt_start_i or alt_end_i from UTA"""
resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.POSITIVE)
assert resp

resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.NEGATIVE)
assert resp

resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.NEGATIVE)
assert not resp

resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.POSITIVE)
assert not resp


@pytest.mark.asyncio()
async def test_genomic_to_transcript_fusion_context(
test_egc_mapper,
Expand Down Expand Up @@ -1250,14 +1221,25 @@ async def test_braf(test_egc_mapper, mane_braf):
"""
inputs = {
"genomic_ac": "NC_000007.13",
"seg_start_genomic": 140501359, # GRCh38 coords: 140801559
"seg_end_genomic": 140453136, # GRCh38 coords: 140753336
"seg_start_genomic": 140501359,
"seg_end_genomic": 140453136,
"gene": "BRAF",
"starting_assembly": Assembly.GRCH37.value,
}
# MANE
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, mane_braf)

inputs = {
"genomic_ac": "NC_000007.14",
"seg_start_genomic": 140801559,
"seg_end_genomic": 140753336,
"gene": "BRAF",
"starting_assembly": Assembly.GRCH38.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, mane_braf)

expected = mane_braf.model_copy(deep=True)
expected.seg_start.genomic_location.end = 140801559
t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic(
Expand All @@ -1276,6 +1258,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_start_genomic": 9597639,
"seg_end_genomic": 9609996,
"transcript": "NM_003390.3",
"starting_assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
Expand All @@ -1292,6 +1275,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_end_genomic": 9609996,
"transcript": "NM_003390.3",
"gene": "WEE1",
"starting_assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
Expand All @@ -1304,9 +1288,20 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
# MANE since no transcript provided
inputs = {
"genomic_ac": "NC_000011.9",
"seg_start_genomic": 9597639, # GRCh38 coords: 9576092
"seg_end_genomic": 9609996, # GRCh38 coords: 9588449
"seg_start_genomic": 9597639,
"seg_end_genomic": 9609996,
"gene": "WEE1",
"starting_assembly": Assembly.GRCH37.value,
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11)

inputs = {
"genomic_ac": "NC_000011.10",
"seg_start_genomic": 9576092,
"seg_end_genomic": 9588449,
"gene": "WEE1",
"starting_assembly": Assembly.GRCH38.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11)
Expand Down Expand Up @@ -1418,13 +1413,19 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
"gene": "WEE1",
"genomic_ac": "NC_000011.9",
"seg_end_genomic": 9609996,
"starting_assembly": Assembly.GRCH37.value,
}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

# inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449}
# resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
# assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))
inputs = {
"gene": "WEE1",
"chromosome": "11",
"seg_end_genomic": 9609996,
"starting_assembly": Assembly.GRCH37.value,
}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

inputs = {"transcript": "NM_003390.3", "exon_start": 2}
resp = await test_egc_mapper.tx_segment_to_genomic(**inputs)
Expand Down Expand Up @@ -1456,6 +1457,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
seg_start_genomic=73442503,
seg_end_genomic=73457929, # not on an exon
gene="ELN",
starting_assembly=Assembly.GRCH37.value,
)
genomic_tx_seg_service_checks(resp, eln_grch38_intronic)

Expand Down Expand Up @@ -1497,13 +1499,13 @@ async def test_invalid(test_egc_mapper):

# Invalid accession
resp = await test_egc_mapper.genomic_to_tx_segment(
genomic_ac="NC_000001.200",
genomic_ac="NC_000035.200",
seg_start_genomic=154191901,
seg_end_genomic=154192135,
transcript="NM_152263.3",
)
genomic_tx_seg_service_checks(resp, is_valid=False)
assert resp.errors == ["Genomic accession does not exist in UTA: NC_000001.200"]
assert resp.errors == ["Genomic accession does not exist in UTA: NC_000035.200"]

# Invalid coordinates
resp = await test_egc_mapper.genomic_to_tx_segment(
Expand Down
Loading