From a2c0c458aef5cd5bf1f385bc124c93ac5ee0129a Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 5 Dec 2023 07:52:13 -0500 Subject: [PATCH 01/34] add boilerplate --- .gitignore | 2 +- .pre-commit-config.yaml | 2 +- LICENSE | 2 +- docs/Makefile | 20 ++++++++++ docs/make.bat | 35 +++++++++++++++++ docs/source/conf.py | 85 +++++++++++++++++++++++++++++++++++++++++ setup.cfg | 8 ++++ 7 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/conf.py diff --git a/.gitignore b/.gitignore index cb44c63d..11e1e767 100644 --- a/.gitignore +++ b/.gitignore @@ -69,7 +69,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +docs/build/ # PyBuilder target/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23726534..22b3357a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: rev: v0.0.291 hooks: - id: ruff - args: [ --fix, --exit-non-zero-on-fix ] + args: [ --fix, --exit-non-zero-on-fix, --extend-exclude, docs/ ] diff --git a/LICENSE b/LICENSE index 3b9aae3f..a718fa73 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 VICC +Copyright (c) 2021-2023 Wagner Lab Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d0c3cbf1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..7d65db30 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..78dcff0b --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,85 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "cool-seq-tool" +copyright = "2023, Wagner Lab" +author = "Wagner Lab" +html_title = "cool-seq-tool" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.autodoc", + "sphinx_autodoc_typehints", + "sphinx.ext.linkcode", + "sphinx_copybutton", +] + +templates_path = ["_templates"] +exclude_patterns = [] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "furo" +html_static_path = [] +html_css_files = [ + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/fontawesome.min.css", + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/solid.min.css", + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/brands.min.css", +] +html_theme_options = { + "footer_icons": [ + { + "name": "GitHub", + "url": "https://github.com/cancervariants/gene-normalization", + "html": "", + "class": "fa-brands fa-solid fa-github", + }, + { + "name": "Wagner Lab", + "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", + "html": "", + "class": "fa-solid fa-house", + }, + { + "name": "Twitter", + "url": "https://twitter.com/genomicmedlab", + "html": "", + "class": "fa-solid fa-brands fa-twitter", + }, + ], +} +# -- autodoc things ---------------------------------------------------------- +import os # noqa: E402 +import sys # noqa: E402 + +sys.path.insert(0, os.path.abspath("../../cool_seq_tool")) +autodoc_preserve_defaults = True + +# -- get version ------------------------------------------------------------- +from cool_seq_tool.version import __version__ # noqa: E402 + +version = __version__ +release = version + + +# -- linkcode ---------------------------------------------------------------- +def linkcode_resolve(domain, info): + if domain != "py": + return None + if not info["module"]: + return None + filename = info["module"].replace(".", "/") + return f"https://github.com/genomicmedlab/cool-seq-tool/blob/main/{filename}.py" # noqa: E501 + + +# -- code block style -------------------------------------------------------- +pygments_style = "default" +pygements_dark_style = "monokai" diff --git a/setup.cfg b/setup.cfg index e94da141..3103a905 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,5 +44,13 @@ tests = pytest-asyncio == 0.18.3 mock +docs = + sphinx==6.1.3 + sphinx-autodoc-typehints==1.22.0 + sphinx-autobuild==2021.3.14 + sphinx-copybutton==0.5.2 + sphinxext-opengraph==0.8.2 + furo==2023.3.27 + [tool:pytest] addopts = --ignore setup.py --ignore codebuild/ --doctest-modules --cov-report term-missing --disable-warnings --cov . From 5ba11be6538592299199a5d4105159ffb544a097 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 5 Dec 2023 07:56:15 -0500 Subject: [PATCH 02/34] add boilerplate --- .pre-commit-config.yaml | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22b3357a..23726534 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: rev: v0.0.291 hooks: - id: ruff - args: [ --fix, --exit-non-zero-on-fix, --extend-exclude, docs/ ] + args: [ --fix, --exit-non-zero-on-fix ] diff --git a/pyproject.toml b/pyproject.toml index 685e11d0..c50fa99c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,8 @@ build-backend = "setuptools.build_meta:__legacy__" line-length = 88 [tool.ruff] +exclude = ["docs/source/conf.py"] + # pycodestyle (E, W) # Pyflakes (F) # flake8-annotations (ANN) From 94fab1fe3b1c5f6b3ba999fe700930ed36297d98 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 5 Dec 2023 15:48:53 -0500 Subject: [PATCH 03/34] Add progress --- .gitignore | 2 + README.md | 44 ----- cool_seq_tool/handlers/seqrepo_access.py | 17 +- cool_seq_tool/mappers/alignment.py | 69 +++---- cool_seq_tool/mappers/exon_genomic_coords.py | 58 +++--- cool_seq_tool/mappers/mane_transcript.py | 174 +++++++++--------- .../sources/mane_transcript_mappings.py | 8 +- cool_seq_tool/sources/uta_database.py | 52 +++--- .../source/_static/img/biomart.png | Bin docs/source/conf.py | 4 +- docs/source/contributing.rst | 83 +++++++++ docs/source/index.rst | 21 +++ docs/source/install.rst | 90 +++++++++ docs/source/license.rst | 4 + docs/source/reference/handlers.rst | 9 + docs/source/reference/index.rst | 8 + docs/source/reference/mappers.rst | 23 +++ docs/source/reference/sources.rst | 24 +++ docs/source/usage.rst | 11 ++ 19 files changed, 474 insertions(+), 227 deletions(-) rename biomart.png => docs/source/_static/img/biomart.png (100%) create mode 100644 docs/source/contributing.rst create mode 100644 docs/source/index.rst create mode 100644 docs/source/install.rst create mode 100644 docs/source/license.rst create mode 100644 docs/source/reference/handlers.rst create mode 100644 docs/source/reference/index.rst create mode 100644 docs/source/reference/mappers.rst create mode 100644 docs/source/reference/sources.rst create mode 100644 docs/source/usage.rst diff --git a/.gitignore b/.gitignore index 11e1e767..bd0891fd 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,8 @@ dmypy.json Pipfile.lock +.DS_Store + # Data files cool_seq_tool/data/seqrepo/ cool_seq_tool/data/*.txt diff --git a/README.md b/README.md index ef156b89..daf91587 100644 --- a/README.md +++ b/README.md @@ -15,25 +15,6 @@ The **cool-seq-tool** provides: pip install cool-seq-tool[dev,tests] ``` -### Development - -Clone the repo: - -```commandline -git clone https://github.com/GenomicMedLab/cool-seq-tool -cd cool_seq_tool -``` - -[Install Pipenv](https://pipenv-fork.readthedocs.io/en/latest/#install-pipenv-today) if necessary. - -Install backend dependencies and enter Pipenv environment: - -```commandline -pipenv shell -pipenv update -pipenv install --dev -``` - ### UTA Database Installation `cool-seq-tool` uses intalls local UTA database. For other ways to install, visit [biocommons.uta](https://github.com/biocommons/uta). @@ -126,28 +107,3 @@ uvicorn cool_seq_tool.api:app --reload ``` Next, view the FastAPI on your local machine: http://127.0.0.1:8000/cool_seq_tool - -## Init coding style tests - -Code style is managed by [Ruff](https://github.com/astral-sh/ruff) and [Black](https://github.com/psf/black), and should be checked prior to commit. - -We use [pre-commit](https://pre-commit.com/#usage) to run conformance tests. - -This ensures: - -* Check code style -* Check for added large files -* Detect AWS Credentials -* Detect Private Key - -Before first commit run: - -``` -pre-commit install -``` - -## Testing -From the _root_ directory of the repository: -``` -pytest -``` diff --git a/cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py index d226739e..d65fa35d 100644 --- a/cool_seq_tool/handlers/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -25,13 +25,15 @@ def get_reference_sequence( residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Tuple[str, Optional[str]]: """Get reference sequence for an accession given a start and end position. - If `start` and `end` are not given, it will return the entire reference sequence + + If ``start`` and ``end`` are not given, it will return the entire reference + sequence :param ac: Accession :param start: Start pos change - :param end: End pos change. If `None` assumes both `start` and `end` have same - values, if `start` exists. - :param residue_mode: Residue mode for `start` and `end` + :param end: End pos change. If ``None`` assumes both ``start`` and ``end`` have + same values, if ``start`` exists. + :param residue_mode: Residue mode for ``start`` and ``end`` :return: Sequence at position (if accession and positions actually exist, else return empty string), warning if any """ @@ -123,7 +125,7 @@ def chromosome_to_acs( ) -> Tuple[Optional[List[str]], Optional[str]]: """Get accessions for a chromosome - :param str chromosome: Chromosome number. Must be either 1-22, X, or Y + :param chromosome: Chromosome number. Must be either 1-22, X, or Y :return: Accessions for chromosome (ordered by latest assembly) """ acs = [] @@ -160,9 +162,10 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]: def get_fasta_file(self, sequence_id: str, outfile_path: Path) -> None: """Retrieve FASTA file containing sequence for requested sequence ID. - :param sequence_id: accession ID, sans namespace, eg `NM_152263.3` + + :param sequence_id: accession ID, sans namespace, eg ``NM_152263.3`` :param outfile_path: path to save file to - :return: None, but saves sequence data to `outfile_path` if successful + :return: None, but saves sequence data to ``outfile_path`` if successful :raise: KeyError if SeqRepo doesn't have sequence data for the given ID """ sequence = self.get_reference_sequence(sequence_id)[0] diff --git a/cool_seq_tool/mappers/alignment.py b/cool_seq_tool/mappers/alignment.py index 5a94e711..62a50c95 100644 --- a/cool_seq_tool/mappers/alignment.py +++ b/cool_seq_tool/mappers/alignment.py @@ -19,11 +19,10 @@ def __init__( ) -> None: """Initialize the AlignmentMapper class. - :param SeqRepoAccess seqrepo_access: Access to seqrepo queries - :param TranscriptMappings transcript_mappings: Access to transcript - accession mappings and conversions - :param UTADatabase uta_db: UTADatabase instance to give access to query - UTA database + :param seqrepo_access: Access to seqrepo queries + :param transcript_mappings: Access to transcript accession mappings and + conversions + :param uta_db: UTADatabase instance to give access to query UTA database """ self.seqrepo_access = seqrepo_access self.transcript_mappings = transcript_mappings @@ -38,15 +37,16 @@ async def p_to_c( ) -> Tuple[Optional[Dict], Optional[str]]: """Translate protein representation to cDNA representation. - :param str p_ac: Protein RefSeq accession - :param int p_start_pos: Protein start position - :param int p_end_pos: Protein end position - :param ResidueMode residue_mode: Residue mode for `p_start_pos` and `p_end_pos` + :param p_ac: Protein RefSeq accession + :param p_start_pos: Protein start position + :param p_end_pos: Protein end position + :param residue_mode: Residue mode for ``p_start_pos`` and ``p_end_pos`` :return: Tuple containing: - - cDNA representation (accession, codon range positions for corresponding - change, cds start site) if able to translate. Will return positions as - inter-residue coordinates. If unable to translate, returns `None`. - - Warning, if unable to translate to cDNA representation. Else `None` + + * cDNA representation (accession, codon range positions for corresponding + change, cds start site) if able to translate. Will return positions as + inter-residue coordinates. If unable to translate, returns ``None``. + * Warning, if unable to translate to cDNA representation. Else ``None`` """ # Get cDNA accession temp_c_ac = await self.uta_db.p_to_c_ac(p_ac) @@ -86,10 +86,10 @@ async def p_to_c( async def _get_cds_start(self, c_ac: str) -> Tuple[Optional[int], Optional[str]]: """Get CDS start for a given cDNA RefSeq accession - :param str c_ac: cDNA RefSeq accession + :param c_ac: cDNA RefSeq accession :return: Tuple containing: - - CDS start site if found. Else `None` - - Warning, if unable to get CDS start. Else `None` + - CDS start site if found. Else ``None`` + - Warning, if unable to get CDS start. Else ``None`` """ cds_start_end = await self.uta_db.get_cds_start_end(c_ac) if not cds_start_end: @@ -111,16 +111,17 @@ async def c_to_g( ) -> Tuple[Optional[Dict], Optional[str]]: """Translate cDNA representation to genomic representation - :param str c_ac: cDNA RefSeq accession - :param int c_start_pos: cDNA start position for codon - :param int c_end_pos: cDNA end position for codon - :param Optional[int] coding_start_site: Coding start site. If not provided, - this will be computed. - :param Assembly target_genome_assembly: Genome assembly to get genomic data for + :param c_ac: cDNA RefSeq accession + :param c_start_pos: cDNA start position for codon + :param c_end_pos: cDNA end position for codon + :param coding_start_site: Coding start site. If not provided, this will be + computed. + :param target_genome_assembly: Genome assembly to get genomic data for :return: Tuple containing: - - Genomic representation (ac, positions) if able to translate. Will return - positions as inter-residue coordinates. Else `None`. - - Warning, if unable to translate to genomic representation. Else `None` + + * Genomic representation (ac, positions) if able to translate. Will return + positions as inter-residue coordinates. Else ``None``. + * Warning, if unable to translate to genomic representation. Else ``None`` """ if any( ( @@ -214,15 +215,17 @@ async def p_to_g( ) -> Tuple[Optional[Dict], Optional[str]]: """Translate protein representation to genomic representation - :param str p_ac: Protein RefSeq accession - :param int p_start_pos: Protein start position - :param int p_end_pos: Protein end position - :param ResidueMode residue_mode: Residue mode for `p_start_pos` and `p_end_pos`. - :param Assembly target_genome_assembly: Genome assembly to get genomic data for + :param p_ac: Protein RefSeq accession + :param p_start_pos: Protein start position + :param p_end_pos: Protein end position + :param residue_mode: Residue mode for ``p_start_pos`` and ``p_end_pos``. + :param target_genome_assembly: Genome assembly to get genomic data for :return: Tuple containing: - - Genomic representation (ac, positions) if able to translate. Will return - positions as inter-residue coordinates. Else `None`. - and warnings. The genomic data will always return inter-residue coordinates + + * Genomic representation (ac, positions) if able to translate. Will return + positions as inter-residue coordinates. Else ``None``. + * and warnings. The genomic data will always return inter-residue coordinates + TODO something got cut off here """ c_data, warning = await self.p_to_c( p_ac, p_start_pos, p_end_pos, residue_mode=residue_mode diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index b94aca59..fbc099e4 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -43,8 +43,8 @@ def _return_warnings( """Add warnings to response object :param resp: Response object - :param warning_msg: Warning message on why `transcript_exon_data` or - `genomic_data` field is None + :param warning_msg: Warning message on why ``transcript_exon_data`` or + ``genomic_data`` field is ``None`` :return: Response object with warning message """ logger.warning(warning_msg) @@ -168,21 +168,23 @@ async def genomic_to_transcript_exon_coordinates( **kwargs, ) -> GenomicDataResponse: """Get transcript data for genomic data. - MANE Transcript data will be returned iff `transcript` is not supplied. - `gene` must be supplied in order to retrieve MANE Transcript data. + + MANE Transcript data will be returned iff `transcript` is not supplied. ``gene`` + must be supplied in order to retrieve MANE Transcript data. + Liftovers genomic coordinates to GRCh38 - :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or - accession (i.e. `NC_000001.11`). + :param chromosome: Chromosome. Must either give chromosome number (i.e. ``1``) + or accession (i.e. ``NC_000001.11``). :param start: Start genomic position :param end: End genomic position - :param strand: Strand. Must be either `-1` or `1`. + :param strand: Strand. Must be either ``-1`` or ``1``. :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript :param gene: Gene symbol - :param residue_mode: Default is `resiude` (1-based). Must be either `residue` or - `inter-residue` (0-based). + :param residue_mode: Default is ``resiude`` (1-based). Must be either + ``residue`` or ``inter-residue`` (0-based). :return: Genomic data (inter-residue coordinates) """ resp = GenomicDataResponse( @@ -265,26 +267,26 @@ async def _genomic_to_transcript_exon_coordinate( self, chromosome: Union[str, int], pos: int, - strand: int = None, - transcript: str = None, - gene: str = None, + strand: Optional[int] = None, + transcript: Optional[str] = None, + gene: Optional[str] = None, is_start: bool = True, residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> TranscriptExonDataResponse: """Convert individual genomic data to transcript data - :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or - accession (i.e. `NC_000001.11`). + :param chromosome: Chromosome. Must either give chromosome number (i.e. ``1``) + or accession (i.e. ``NC_000001.11``). :param pos: Genomic position - :param strand: Strand. Must be either `-1` or `1`. + :param strand: Strand. Must be either ``-1`` or ``1``. :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript :param gene: Gene symbol - :param is_start: `True` if `pos` is start position. `False` if `pos` is end - position. - :param residue_mode: Default is `resiude` (1-based). Must be either `residue` - or `inter-residue` (0-based). + :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is + end position. + :param residue_mode: Default is ``resiude`` (1-based). Must be either ``residue`` + or ``inter-residue`` (0-based). :return: Transcript data (inter-residue coordinates) """ resp = TranscriptExonDataResponse( @@ -405,9 +407,9 @@ async def _set_mane_genomic_data( :param alt_ac: Genomic accession :param pos: Genomic position :param strand: Strand - :param is_start: `True` if `pos` is start position. `False` if `pos` is end - position. - :param residue_mode: Residue mode for `pos` + :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is + end position. + :param residue_mode: Residue mode for ``pos`` :return: Warnings if found """ mane_data = await self.mane_transcript.get_mane_transcript( @@ -485,12 +487,12 @@ async def _set_mane_genomic_data( async def _set_genomic_data( self, params: Dict, strand: int, is_start: bool ) -> Optional[str]: - """Set genomic data in `params` + """Set genomic data in ``params`` :param params: Parameters for response :param strand: Strand - :param is_start: `True` if `pos` is start position. `False` if `pos` is end - position. + :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is + end position. :return: Warnings if found """ # We should always try to liftover @@ -565,14 +567,14 @@ async def _set_genomic_data( def _set_exon_offset( params: Dict, start: int, end: int, pos: int, is_start: bool, strand: int ) -> None: - """Set `exon_offset` in params. + """Set ``exon_offset`` in params. :param params: Parameters for response :param start: Start exon coord (can be transcript or genomic) :param end: End exon coord (can be transcript or genomic) :param pos: Position change (can be transcript or genomic) - :param is_start: `True` if `pos` is start position. `False` if `pos` is end - position + :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is + end position :param int strand: Strand """ if is_start: diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index c2a21b47..3fb1ba52 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -1,10 +1,13 @@ """Module for retrieving MANE Transcript from variation on p/c/g coordinate. + Steps: 1. Map annotation layer to genome -2. Liftover to preferred genome - We want to liftover to GRCh38. We do not support getting MANE transcripts - for GRCh36 and earlier assemblies. + +2. Liftover to preferred genome. We want to liftover to GRCh38. We do not support + getting MANE transcripts for GRCh36 and earlier assemblies. + 3. Select preferred compatible annotation + 4. Map back to correct annotation layer """ import logging @@ -56,10 +59,9 @@ def __init__( @staticmethod def _get_reading_frame(pos: int) -> int: - """Return reading frame number. - Only used on c. coordinate + """Return reading frame number. Only used on c. coordinate. - :param int pos: cDNA position + :param pos: cDNA position :return: Reading frame """ pos_mod_3 = pos % 3 @@ -71,8 +73,8 @@ def _get_reading_frame(pos: int) -> int: def _p_to_c_pos(start: int, end: int) -> Tuple[int, int]: """Return cDNA position given a protein position. - :param int start: Start protein position - :param int end: End protein position + :param start: Start protein position + :param end: End protein position :return: cDNA position start, cDNA position end """ start_pos = start * 3 - 1 @@ -88,9 +90,9 @@ async def _p_to_c( ) -> Optional[Tuple[str, Tuple[int, int]]]: """Convert protein (p.) annotation to cDNA (c.) annotation. - :param str ac: Protein accession - :param int start_pos: Protein start position - :param int end_pos: Protein end position + :param ac: Protein accession + :param start_pos: Protein start position + :param end_pos: Protein end position :return: [cDNA transcript accession, [cDNA pos start, cDNA pos end]] """ # TODO: Check version mappings 1 to 1 relationship @@ -116,8 +118,8 @@ async def _p_to_c( async def _c_to_g(self, ac: str, pos: Tuple[int, int]) -> Optional[Dict]: """Get g. annotation from c. annotation. - :param str ac: cDNA accession - :param Tuple[int, int] pos: [cDNA pos start, cDNA pos end] + :param ac: cDNA accession + :param pos: [cDNA pos start, cDNA pos end] :return: Gene, Transcript accession and position change, Altered transcript accession and position change, Strand """ @@ -160,12 +162,11 @@ async def _get_and_validate_genomic_tx_data( ) -> Optional[Dict]: """Get and validate genomic_tx_data - :param str tx_ac: Accession on c. coordinate - :param Tuple[int, int] pos: (start pos, end pos) - :param Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] annotation_layer: - Annotation layer for `ac` and `pos` - :param Optional[int] coding_start_site: Coding start site - :param Optional[str] alt_ac: Accession on g. coordinate + :param tx_ac: Accession on c. coordinate + :param pos: (start pos, end pos) + :param annotation_layer: Annotation layer for ``ac`` and ``pos`` + :param coding_start_site: Coding start site + :param alt_ac: Accession on g. coordinate :return: genomic_tx_data if found and validated, else None """ genomic_tx_data = await self.uta_db.get_genomic_tx_data( @@ -210,16 +211,14 @@ def _get_c_data( ) -> Dict: """Return transcript data on c. coordinate. - :param str gene: Gene symbol - :param Tuple[int, int] cds_start_end: Coding start and end site - for transcript - :param Tuple[int, int] c_pos_change: Start and end positions - for change on c. coordinate - :param str strand: Strand - :param TranscriptPriority status: Status of transcript - :param str refseq_c_ac: Refseq transcript - :param Optional[str] ensembl_c_ac: Ensembl transcript - :param Optional[str] alt_ac: Genomic accession + :param gene: Gene symbol + :param cds_start_end: Coding start and end site for transcript + :param c_pos_change: Start and end positions for change on c. coordinate + :param strand: Strand + :param status: Status of transcript + :param refseq_c_ac: Refseq transcript + :param ensembl_c_ac: Ensembl transcript + :param alt_ac: Genomic accession :return: Transcript data on c. coord """ cds_start = cds_start_end[0] @@ -248,11 +247,9 @@ def _get_c_data( def _get_mane_p(mane_data: Dict, mane_c_pos_range: Tuple[int, int]) -> Dict: """Translate MANE Transcript c. annotation to p. annotation - :param Dict mane_data: MANE Transcript data - :param Tuple[int, int] mane_c_pos_range: Position change range - on MANE Transcript c. coordinate - :return: MANE transcripts accessions and position change on - p. coordinate + :param mane_data: MANE Transcript data + :param mane_c_pos_range: Position change range on MANE Transcript c. coordinate + :return: MANE transcripts accessions and position change on p. coordinate """ start = mane_c_pos_range[0] / 3 end = mane_c_pos_range[1] / 3 @@ -281,14 +278,14 @@ async def _g_to_c( ) -> Optional[Dict]: """Get transcript c. annotation data from g. annotation. - :param Dict g: Genomic data - :param str refseq_c_ac: Refseq transcript accession - :param TranscriptPriority status: Status of transcript - :param Optional[str] ensembl_c_ac: Ensembl transcript accession - :param Optional[str] alt_ac: Genomic accession - :param bool found_result: `True` if found result, so do not need to query + :param g: Genomic data + :param refseq_c_ac: Refseq transcript accession + :param status: Status of transcript + :param ensembl_c_ac: Ensembl transcript accession + :param alt_ac: Genomic accession + :param found_result: ``True`` if found result, so do not need to query tx_exon_aln_v table. This is because the user did not need to liftover. - `False` if need to get result from tx_exon_aln_v table. + ``False`` if need to get result from tx_exon_aln_v table. :return: Transcript data """ if found_result: @@ -348,13 +345,13 @@ def _validate_reading_frames( ) -> bool: """Return whether reading frames are the same after translation. - :param str ac: Query accession - :param int start_pos: Original start cDNA position change - :param int end_pos: Original end cDNA position change - :param Dict transcript_data: Ensembl and RefSeq transcripts with - corresponding position change - :return: `True` if reading frames are the same after translation. - `False` otherwise + :param ac: Query accession + :param start_pos: Original start cDNA position change + :param end_pos: Original end cDNA position change + :param transcript_data: Ensembl and RefSeq transcripts with corresponding + position change + :return: ``True`` if reading frames are the same after translation. + ``False`` otherwise """ for pos, pos_index in [(start_pos, 0), (end_pos, 1)]: if pos is not None: @@ -397,8 +394,8 @@ def _validate_references( position change :param expected_ref: Reference at position given during input :param anno: Annotation layer we are starting from - :param residue_mode: Residue mode for `start_pos` and `end_pos` - :return: `True` if reference check passes. `False` otherwise. + :param residue_mode: Residue mode for ``start_pos`` and ``end_pos`` + :return: ``True`` if reference check passes. ``False`` otherwise. """ if anno == AnnotationLayer.CDNA: start_pos += coding_start_site @@ -445,10 +442,10 @@ def _validate_index( ) -> bool: """Validate that positions actually exist on accession - :param str ac: Accession - :param Tuple[int, int] pos: Start position change, End position change - :param int coding_start_site: coding start site for accession - :return: `True` if positions exist on accession. `False` otherwise + :param ac: Accession + :param pos: Start position change, End position change + :param coding_start_site: coding start site for accession + :return: ``True`` if positions exist on accession. ``False`` otherwise """ start_pos = pos[0] + coding_start_site end_pos = pos[1] + coding_start_site @@ -522,7 +519,9 @@ async def get_longest_compatible_transcript( ] = None, ) -> Optional[Dict]: """Get longest compatible transcript from a gene. + Try GRCh38 first, then GRCh37. + Transcript is compatible if it passes validation checks. :param start_pos: Start position change @@ -530,14 +529,13 @@ async def get_longest_compatible_transcript( :param start_annotation_layer: Starting annotation layer :param gene: HGNC gene symbol :param ref: Reference at position given during input - :param residue_mode: Residue mode for `start_pos` and `end_pos` + :param residue_mode: Residue mode for ``start_pos`` and ``end_pos`` :param mane_transcripts: Attempted mane transcripts that were not compatible :param alt_ac: Genomic accession :param end_annotation_layer: The end annotation layer. If not provided, will be - set to the following - `AnnotationLayer.PROTEIN` if - `start_annotation_layer == AnnotationLayer.PROTEIN` - `AnnotationLayer.CDNA` otherwise + set to ``AnnotationLayer.PROTEIN`` if + ``start_annotation_layer == AnnotationLayer.PROTEIN``, + ``AnnotationLayer.CDNA`` otherwise :return: Data for longest compatible transcript """ inter_residue_pos, _ = get_inter_residue_pos( @@ -714,17 +712,16 @@ async def get_mane_transcript( :param ac: Accession :param start_pos: Start position change :param start_annotation_layer: Starting annotation layer. - :param end_pos: End position change. If `None` assumes both `start_pos` and - `end_pos` have same values. + :param end_pos: End position change. If ``None`` assumes both ``start_pos`` and + ``end_pos`` have same values. :param gene: HGNC gene symbol :param ref: Reference at position given during input - :param try_longest_compatible: `True` if should try longest compatible remaining - if mane transcript was not compatible. `False` otherwise. - :param ResidueMode residue_mode: Starting residue mode for `start_pos` - and `end_pos`. Will always return coordinates in inter-residue + :param try_longest_compatible: ``True`` if should try longest compatible remaining + if mane transcript was not compatible. ``False`` otherwise. + :param ResidueMode residue_mode: Starting residue mode for ``start_pos`` and + ``end_pos``. Will always return coordinates in inter-residue :return: MANE data or longest transcript compatible data if validation - checks are correct. Will return inter-residue coordinates. - Else, `None` + checks are correct. Will return inter-residue coordinates. Else, ``None``. """ inter_residue_pos, warning = get_inter_residue_pos( start_pos, residue_mode, end_pos=end_pos @@ -842,9 +839,9 @@ async def g_to_grch38( ) -> Optional[Dict]: """Return genomic coordinate on GRCh38 when not given gene context. - :param str ac: Genomic accession - :param int start_pos: Genomic start position change - :param int end_pos: Genomic end position change + :param ac: Genomic accession + :param start_pos: Genomic start position change + :param end_pos: Genomic end position change :return: NC accession, start and end pos on GRCh38 assembly """ if end_pos is None: @@ -899,8 +896,8 @@ def get_mane_c_pos_change( ) -> Tuple[int, int]: """Get mane c position change - :param Dict mane_tx_genomic_data: MANE transcript and genomic data - :param int coding_start_site: Coding start site + :param mane_tx_genomic_data: MANE transcript and genomic data + :param coding_start_site: Coding start site :return: cDNA pos start, cDNA pos end """ tx_pos_range = mane_tx_genomic_data["tx_pos_range"] @@ -924,17 +921,19 @@ async def g_to_mane_c( residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Dict]: """Return MANE Transcript on the c. coordinate. + If gene is provided, g->GRCh38->MANE c. - If MANE c. cannot be found, we return the genomic coordinate on - GRCh38 - If gene is not provided, g -> GRCh38 - - :param str ac: Transcript accession on g. coordinate - :param int start_pos: genomic change start position - :param int end_pos: genomic change end position - :param str gene: Gene symbol - :param ResidueMode residue_mode: Starting residue mode for `start_pos` - and `end_pos`. Will always return coordinates in inter-residue + + If MANE c. cannot be found, we return the genomic coordinate on GRCh38. + + If gene is not provided, g -> GRCh38. + + :param ac: Transcript accession on g. coordinate + :param start_pos: genomic change start position + :param end_pos: genomic change end position + :param gene: Gene symbol + :param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``. + Will always return coordinates in inter-residue. :return: MANE Transcripts with cDNA change on c. coordinate if gene is provided. Else, GRCh38 data """ @@ -1033,20 +1032,21 @@ async def grch38_to_mane_p( try_longest_compatible: bool = False, ) -> Optional[Dict]: """Given GRCh38 genomic representation, return protein representation. + Will try MANE Select and then MANE Plus Clinical. If neither is found and - `try_longest_compatible` is set to `true`, will also try to find the longest + ``try_longest_compatible`` is set to ``true``, will also try to find the longest compatible remaining representation. :param alt_ac: Genomic RefSeq accession on GRCh38 :param start_pos: Start position :param end_pos: End position :param gene: HGNC gene symbol - :param residue_mode: Starting residue mode for `start_pos` and `end_pos`. Will + :param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``. Will always return coordinates as inter-residue. - :param try_longest_compatible: `True` if should try longest compatible remaining - if mane transcript(s) not compatible. `False` otherwise. + :param try_longest_compatible: ``True`` if should try longest compatible remaining + if mane transcript(s) not compatible. ``False`` otherwise. :return: If successful, return MANE data or longest compatible remaining (if - `try_longest_compatible` set to `True`) protein representation. Will return + ``try_longest_compatible`` set to ``True``) protein representation. Will return inter-residue coordinates. """ # Step 1: Get MANE data to map to diff --git a/cool_seq_tool/sources/mane_transcript_mappings.py b/cool_seq_tool/sources/mane_transcript_mappings.py index 5b429506..14d1d828 100644 --- a/cool_seq_tool/sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -15,6 +15,7 @@ class MANETranscriptMappings: def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None: """Initialize the MANE Transcript mappings class. + :param Path mane_data_path: Path to RefSeq MANE summary data """ self.mane_data_path = mane_data_path @@ -28,10 +29,10 @@ def _load_mane_transcript_data(self) -> pl.DataFrame: def get_gene_mane_data(self, gene_symbol: str) -> List[Dict]: """Return MANE Transcript data for a gene. + :param str gene_symbol: HGNC Gene Symbol - :return: List of MANE Transcript data (Transcript accessions, - gene, and location information). Sorted list: MANE Select and then MANE Plus - Clinical + :return: List of MANE Transcript data (Transcript accessions, gene, and + location information). Sorted list: MANE Select and then MANE Plus Clinical """ data = self.df.filter(pl.col("symbol") == gene_symbol.upper()) @@ -59,6 +60,7 @@ def get_mane_data_from_chr_pos( self, alt_ac: str, start: int, end: int ) -> List[Dict]: """Get MANE data given chromosome, start pos, end end pos. Assumes GRCh38. + :param str alt_ac: NC Accession :param int start: Start genomic position. Assumes residue coordinates. :param int end: End genomic position. Assumes residue coordinates. diff --git a/cool_seq_tool/sources/uta_database.py b/cool_seq_tool/sources/uta_database.py index 8d029b18..435781c6 100644 --- a/cool_seq_tool/sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -39,11 +39,15 @@ def __init__( chain_file_37_to_38: Optional[str] = None, chain_file_38_to_37: Optional[str] = None, ) -> None: - """Initialize DB class. Downstream libraries should use the create() - method to construct a new instance: await UTADatabase.create() + """Initialize DB class. + + Downstream libraries should use the create() method to construct a new instance: + + >>> from cool_seq_tool.sources.uta_database import UTADatabase + >>> uta = await UTADatabase.create() :param db_url: PostgreSQL connection URL - Format: `driver://user:password@host/database/schema` + Format: ``driver://user:password@host/database/schema`` :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly. This is used for pyliftover. If this is not provided, will check to see if LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow @@ -138,9 +142,10 @@ async def create( cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL ) -> UTADatabaseType: """Provide fully-initialized class instance (a la factory pattern) + :param UTADatabaseType cls: supplied implicitly :param str db_url: PostgreSQL connection URL - Format: `driver://user:password@host/database/schema` + Format: ``driver://user:password@host/database/schema`` :return: UTA DB access class instance """ self = cls(db_url) @@ -243,7 +248,7 @@ async def chr_to_gene_and_accessions( :param int chromosome: Chromosome number :param int pos: Genomic position - :param Optional[int] strand: Strand. Must be either `-1` or `1` + :param Optional[int] strand: Strand. Must be either ``-1`` or ``1`` :param Optional[str] alt_ac: Genomic accession :param Optional[str] gene: Gene symbol :return: Dictionary containing genes and genomic accessions and @@ -541,7 +546,7 @@ async def validate_genomic_ac(self, ac: str) -> bool: """Return whether or not genomic accession exists. :param str ac: Genomic accession - :return: `True` if genomic accession exists. `False` otherwise. + :return: ``True`` if genomic accession exists. ``False`` otherwise. """ query = f""" SELECT EXISTS( @@ -590,13 +595,13 @@ async def get_tx_exon_aln_v_data( :param int start_pos: Start position change :param int end_pos: End position change :param str alt_ac: accession on g. coordinate - :param bool use_tx_pos: `True` if querying on transcript position. This means - `start_pos` and `end_pos` are on the c. coordinate - `False` if querying on genomic position. This means `start_pos` and - `end_pos` are on the g. coordinate - :param bool like_tx_ac: `True` if tx_ac condition should be a like statement. + :param bool use_tx_pos: ``True`` if querying on transcript position. This means + ``start_pos`` and ``end_pos`` are on the c. coordinate + ``False`` if querying on genomic position. This means ``start_pos`` and + ``end_pos`` are on the g. coordinate + :param bool like_tx_ac: ``True`` if tx_ac condition should be a like statement. This is used when you want to query an accession regardless of its version - `False` if tx_condition will be exact match + ``False`` if tx_condition will be exact match :return: List of tx_exon_aln_v data """ if end_pos is None: @@ -697,6 +702,7 @@ async def get_mane_c_genomic_data( """Get MANE Transcript and genomic data. Used when going from g -> MANE c + :param str ac: MANE Transcript accession :param str alt_ac: NC Accession :param int start_pos: Genomic start position change @@ -755,10 +761,10 @@ async def get_genomic_tx_data( :param str tx_ac: Accession on c. coordinate :param Tuple pos: (start pos, end pos) :param Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] annotation_layer: - Annotation layer for `ac` and `pos` + Annotation layer for ``ac`` and ``pos`` :param Optional[str] alt_ac: Accession on g. coordinate :param Assembly target_genome_assembly: Genome assembly to get genomic data for. - If `alt_ac` is provided, it will return the associated assembly. + If ``alt_ac`` is provided, it will return the associated assembly. :return: Gene, Transcript accession and position change, Altered transcript accession and position change, Strand """ @@ -871,20 +877,20 @@ async def get_transcripts( use_tx_pos: bool = True, alt_ac: Optional[str] = None, ) -> pl.DataFrame: - """Get transcripts for a given `gene` or `alt_ac` related to optional positions. + """Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions. :param start_pos: Start position change - If not provided and `end_pos` not provided, all transcripts associated with + If not provided and ``end_pos`` not provided, all transcripts associated with the gene and/or accession will be returned :param end_pos: End position change - If not provided and `start_pos` not provided, all transcripts associated + If not provided and ``start_pos`` not provided, all transcripts associated with the gene and/or accession will be returned :param gene: HGNC gene symbol - :param use_tx_pos: `True` if querying on transcript position. This means - `start_pos` and `end_pos` are c. coordinate positions. `False` if querying - on genomic position. This means `start_pos` and `end_pos` are g. coordinate + :param use_tx_pos: ``True`` if querying on transcript position. This means + ``start_pos`` and ``end_pos`` are c. coordinate positions. ``False`` if querying + on genomic position. This means ``start_pos`` and ``end_pos`` are g. coordinate positions - :param alt_ac: Genomic accession. If not provided, must provide `gene` + :param alt_ac: Genomic accession. If not provided, must provide ``gene`` :return: Data Frame containing transcripts associated with a gene. Transcripts are ordered by most recent NC accession, then by descending transcript length @@ -1022,7 +1028,7 @@ def get_liftover( ) -> Optional[Tuple]: """Get new genome assembly data for a position on a chromosome. - :param str chromosome: The chromosome number. Must be prefixed with `chr` + :param str chromosome: The chromosome number. Must be prefixed with ``chr`` :param int pos: Position on the chromosome :param Assembly liftover_to_assembly: Assembly to liftover to :return: [Target chromosome, target position, target strand, @@ -1058,7 +1064,7 @@ def _set_liftover( :param Dict genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and strand :param str key: Key to access coordinate positions - :param str chromosome: Chromosome, must be prefixed with `chr` + :param str chromosome: Chromosome, must be prefixed with ``chr`` :param Assembly liftover_to_assembly: Assembly to liftover to """ liftover_start_i = self.get_liftover( diff --git a/biomart.png b/docs/source/_static/img/biomart.png similarity index 100% rename from biomart.png rename to docs/source/_static/img/biomart.png diff --git a/docs/source/conf.py b/docs/source/conf.py index 78dcff0b..ee3adb9c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,10 +6,10 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = "cool-seq-tool" +project = "Cool-Seq-Tool" copyright = "2023, Wagner Lab" author = "Wagner Lab" -html_title = "cool-seq-tool" +html_title = "Cool-Seq-Tool" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst new file mode 100644 index 00000000..908aa727 --- /dev/null +++ b/docs/source/contributing.rst @@ -0,0 +1,83 @@ +Contributing to Cool-Seq-Tool +============================= + +Bug reports and feature requests +-------------------------------- + +Bugs and new feature requests can be submitted to the Cool-Seq-Tool `issue tracker on GitHub `_. See `this StackOverflow post `_ for tips on how to craft a helpful bug report. + +Development prerequisites +------------------------- +For a development install, we recommend using Pipenv. See the `Pipenv docs `_ for direction on installing Pipenv in your environment. + +Setup +----- +Clone the repository: :: + + git clone https://github.com/genomicmedlab/cool-seq-tool + cd cool-seq-tool + +Then initialize the Pipenv environment: :: + + pipenv update + pipenv install --dev + pipenv shell + +Alternatively, use a virtual environment and install :ref:`all dependency groups`: :: + + python3 -m venv venv + source venv/bin/activate + python3 -m pip install -e ".[dev,tests,docs]" + +We use `pre-commit `_ to run conformance tests before commits. This provides checks for: + +* Code format and style +* Added large files +* AWS credentials +* Private keys + +Before your first commit, run: :: + + pre-commit install + +Style +----- + +Code style is managed by `Ruff `_ and `Black `_, and should be checked via pre-commit hook before commits. Final QC is applied with GitHub Actions to every pull request. + +Tests +----- + +Tests are executed with `pytest `_: :: + + pytest + +Documentation +------------- + +The documentation is built with Sphinx, which is included as part of the Pipenv developer dependencies, or in the ``docs`` dependency group. Navigate to the `docs/` subdirectory and use `make` to build the HTML version: :: + + pipenv shell + cd docs + make html + +See the `Sphinx documentation `_ for more information. + +.. _build_transcript_mappings_tsv: + +transcript_mappings.tsv +----------------------- + +Cool-Seq-Tool uses a static copy of transcript mapping data acquired from `Ensembl BioMart `_. To regenerate this file from the BioMart, select the ``Human Genes (GRCh38.p13)`` dataset and choose the following attributes: + +* Gene stable ID +* Gene stable ID version +* Transcript stable ID +* Transcript stable ID version +* Protein stable ID +* Protein stable ID version +* RefSeq match transcript (MANE Select) +* Gene name + +.. image:: _static/img/biomart.png + :alt: BioMart screenshot diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..a89f0251 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,21 @@ +Cool-Seq-Tool |version| +======================= + +.. image:: https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr + :alt: PyPI - Python Version + +.. image:: https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg + :alt: tests status + :target: https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml + +Description here. + +.. toctree:: + :hidden: + :maxdepth: 2 + + Installation + Usage + API Reference + Contributing + License diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 00000000..0fb4c277 --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,90 @@ +Installation +============ + +Prerequisites +------------- + +* Python 3.8+ +* A UNIX-like environment (e.g. MacOS, WSL, Ubuntu) +* A recent version of PostgreSQL (ideally at least 11+) + +Install Cool-Seq-Tool +--------------------- + +Install Cool-Seq-Tool from `PyPI `_: + + pip install cool-seq-tool + +.. _dependency-groups: + +.. note:: + + Cool-Seq-Tool provides extra dependency groups for development and testing purposes. Most users won't need to install them. + + * ``dev`` includes packages for linting and performing other kinds of quality checks + * ``tests`` includes packages for running tests + * ``docs`` includes packages for writing and building documentation + +Set up UTA +---------- + +Cool-Seq-Tool requires an available instance of the Universal Transcript Archive (UTA) database. Complete installation instructions (via Docker or a local server) are available at the `UTA GitHub repository `_. For local usage, we recommend the following: + +.. code-block:: + + createuser -U postgres uta_admin + createuser -U postgres anonymous + createdb -U postgres -O uta_admin uta + + export UTA_VERSION=uta_20210129b.pgd.gz # most recent as of 2023/12/05 + curl -O https://dl.biocommons.org/uta/$UTA_VERSION + gzip -cdq ${UTA_VERSION} | psql -h localhost -U uta_admin --echo-errors --single-transaction -v ON_ERROR_STOP=1 -d uta -p 5433 + +By default, Cool-Seq-Tool expects to connect to the UTA database via a PostgreSQL connection served local on port 5433, under the PostgreSQL username ``uta_admin`` and the schema ``uta_20210129b``. + +Set up SeqRepo +-------------- + +Cool-Seq-Tool requires access to `SeqRepo `_ data. In general, we recommend the following for local setup: + +.. code-block:: + + pip install seqrepo + export SEQREPO_VERSION=2021-01-29 # or newer if available -- check `seqrepo list-remote-instances` + sudo mkdir /usr/local/share/seqrepo + sudo chown $USER /usr/local/share/seqrepo + seqrepo pull -i $SEQREPO_VERSION + +If you encounter a permission error similar to the one below: + +.. code-block:: + + PermissionError: [Error 13] Permission denied: '/usr/local/share/seqrepo/2021-01-29._fkuefgd' -> '/usr/local/share/seqrepo/2021-01-29' + +Try moving data manually with ``sudo``: + +.. code-block:: + + sudo mv /usr/local/share/seqrepo/$SEQREPO_VERSION.* /usr/local/share/seqrepo/$SEQREPO_VERSION + +See `mirroring documentation `_ on the SeqRepo GitHub repo for instructions and additional troubleshooting. + +Acquire genome and transcript data +---------------------------------- + +Cool-Seq-Tool also makes use of external data extracted from a handful of sources. This data is fetched automatically at runtime if it's not available, so no additional action is necessary by default -- however, if necessary, it can be configured and placed manually. + +LRG_RefSeqGene +++++++++++++++ + +Cool-Seq-Tool fetches the latest version of ``LRG_RefSeqGene`` if the environment variable ``LRG_REFSEQGENE_PATH`` is not set. When ``LRG_REFSEQGENE_PATH`` is set, Cool-Seq-Tool will look at this path and expect the ``LRG_RefSeqGene`` file. This file is found can be found `here `_. + +MANE summary +++++++++++++ + +Cool-Seq-Tool fetches the latest version of ``MANE.GRCh38.*.summary.txt.gz`` if the environment variable ``MANE_SUMMARY_PATH`` is not set. When ``MANE_SUMMARY_PATH`` is set, Cool-Seq-Tool will look at this path and expect the MANE Summary Data file. This file is found can be found `here `_. + +Transcript mappings ++++++++++++++++++++ + +Cool-Seq-Tool is packaged with transcript mapping data acquired from `Ensembl BioMart `_. If the environment variable ``TRANSCRIPT_MAPPINGS_PATH`` is not set, Cool-Seq-Tool will use the built-in file. When ``TRANSCRIPT_MAPPINGS_PATH`` is set, Cool-Seq-Tool will look at this path and expect to find the transcript mapping TSV file. To manually acquire this file, see the :ref:`contributor instructions `. diff --git a/docs/source/license.rst b/docs/source/license.rst new file mode 100644 index 00000000..0a3b8703 --- /dev/null +++ b/docs/source/license.rst @@ -0,0 +1,4 @@ +License +======= + +.. include:: ../../LICENSE diff --git a/docs/source/reference/handlers.rst b/docs/source/reference/handlers.rst new file mode 100644 index 00000000..7f1b8178 --- /dev/null +++ b/docs/source/reference/handlers.rst @@ -0,0 +1,9 @@ +Data Handlers API +================= + +seqrepo_access +-------------- + +.. automodule:: cool_seq_tool.handlers.seqrepo_access + :members: + :special-Members: __init__ diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 00000000..02406559 --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,8 @@ +API Reference +============= + +.. toctree:: + + Data Sources API + Data Handlers API + Data Mappers API diff --git a/docs/source/reference/mappers.rst b/docs/source/reference/mappers.rst new file mode 100644 index 00000000..3b9fbf15 --- /dev/null +++ b/docs/source/reference/mappers.rst @@ -0,0 +1,23 @@ +Data Mappers API +================ + +alignment.py +------------ + +.. automodule:: cool_seq_tool.mappers.alignment + :members: + :special-members: __init__ + +exon_genomic_coords.py +---------------------- + +.. automodule:: cool_seq_tool.mappers.exon_genomic_coords + :members: + :special-members: __init__ + +mane_transcript.py +------------------ + +.. automodule:: cool_seq_tool.mappers.mane_transcript + :members: + :special-members: diff --git a/docs/source/reference/sources.rst b/docs/source/reference/sources.rst new file mode 100644 index 00000000..600d4e05 --- /dev/null +++ b/docs/source/reference/sources.rst @@ -0,0 +1,24 @@ +Data Sources API +================ + +mane_transcript_mappings +------------------------ + +.. automodule:: cool_seq_tool.sources.mane_transcript_mappings + :members: + :special-members: __init__ + + +transcript_mappings +------------------- + +.. automodule:: cool_seq_tool.sources.transcript_mappings + :members: + :special-members: __init__ + +uta_database +------------ + +.. automodule:: cool_seq_tool.sources.uta_database + :members: + :special-members: __init__ diff --git a/docs/source/usage.rst b/docs/source/usage.rst new file mode 100644 index 00000000..d3d62e6a --- /dev/null +++ b/docs/source/usage.rst @@ -0,0 +1,11 @@ +Usage +===== + +Configuration +------------- + +Programmatic access +------------------- + +REST server +----------- From e2df581ea0d582082e299b3fc5a1183ce273a0c0 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Wed, 6 Dec 2023 16:03:56 -0500 Subject: [PATCH 04/34] Add autogenerated doc things --- .gitignore | 3 ++ cool_seq_tool/schemas.py | 10 ++-- docs/source/_templates/module_summary.rst | 6 +++ docs/source/conf.py | 1 + docs/source/index.rst | 1 + docs/source/overview.rst | 28 +++++++++++ docs/source/reference/handlers.rst | 9 ---- docs/source/reference/index.rst | 57 +++++++++++++++++++++-- docs/source/reference/mappers.rst | 23 --------- docs/source/reference/sources.rst | 24 ---------- 10 files changed, 97 insertions(+), 65 deletions(-) create mode 100644 docs/source/_templates/module_summary.rst create mode 100644 docs/source/overview.rst delete mode 100644 docs/source/reference/handlers.rst delete mode 100644 docs/source/reference/mappers.rst delete mode 100644 docs/source/reference/sources.rst diff --git a/.gitignore b/.gitignore index bd0891fd..b676a667 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,6 @@ cool_seq_tool/data/*.txt cool_seq_tool/data/LRG_RefSeqGene* cool_seq_tool/data/MANE* cool_seq_tool/data/notebooks/ + +# Autogenerated docs +docs/source/reference/api diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index c06edb85..19e0f2ee 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -71,7 +71,7 @@ class GenomicRequestBody(BaseModelForbidExtra): @model_validator(mode="after") def check_start_and_end(cls, values): - """Check that at least one of {`start`, `end`} is set""" + """Check that at least one of {``start``, ``end``} is set""" msg = "Must provide either `start` or `end`" start, end = values.start, values.end assert start or end, msg @@ -104,7 +104,7 @@ class TranscriptRequestBody(BaseModelForbidExtra): @model_validator(mode="after") def check_exon_start_and_exon_end(cls, values): - """Check that at least one of {`exon_start`, `exon_end`} is set""" + """Check that at least one of {``exon_start``, ``exon_end``} is set""" msg = "Must provide either `exon_start` or `exon_end`" exon_start, exon_end = values.exon_start, values.exon_end assert exon_start or exon_end, msg @@ -166,9 +166,9 @@ class GenomicData(BaseModelForbidExtra): @model_validator(mode="after") def check_start_end(cls, values): - """Check that at least one of {`start`, `end`} is set. - Check that at least one of {`exon_start`, `exon_end`} is set. - If not set, set corresponding offset to `None` + """Check that at least one of {``start``, ``end``} is set. + Check that at least one of {``exon_start``, ``exon_end``} is set. + If not set, set corresponding offset to ``None`` """ msg = "Missing values for `start` or `end`" start = values.start diff --git a/docs/source/_templates/module_summary.rst b/docs/source/_templates/module_summary.rst new file mode 100644 index 00000000..796f063e --- /dev/null +++ b/docs/source/_templates/module_summary.rst @@ -0,0 +1,6 @@ +{{ fullname | underline }} + +.. automodule:: {{ fullname }} + :members: + :special-members: __init__ + :exclude-members: model_fields diff --git a/docs/source/conf.py b/docs/source/conf.py index ee3adb9c..c9955166 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,6 +18,7 @@ "sphinx.ext.autodoc", "sphinx_autodoc_typehints", "sphinx.ext.linkcode", + "sphinx.ext.autosummary", "sphinx_copybutton", ] diff --git a/docs/source/index.rst b/docs/source/index.rst index a89f0251..3dd3c811 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,6 +15,7 @@ Description here. :maxdepth: 2 Installation + Overview Usage API Reference Contributing diff --git a/docs/source/overview.rst b/docs/source/overview.rst new file mode 100644 index 00000000..3a67553f --- /dev/null +++ b/docs/source/overview.rst @@ -0,0 +1,28 @@ +Overview of Features +==================== + +Cool-Seq-Tool provides easy access to, and useful operations on, a selection of important genomic resources. Modules are divided into three groups: + +* :ref:`Data sources `, for basic acquisition and setup for a data source via Python + +* :ref:`Data handlers `, for additional operations on top of existing sources + +* :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output + +.. _data_sources_description: + +Data sources +------------ + +.. _data_handlers_description: + +Data handlers +------------- + +.. _data_mappers_description: + +Data mappers +------------ + +Schema support +-------------- diff --git a/docs/source/reference/handlers.rst b/docs/source/reference/handlers.rst deleted file mode 100644 index 7f1b8178..00000000 --- a/docs/source/reference/handlers.rst +++ /dev/null @@ -1,9 +0,0 @@ -Data Handlers API -================= - -seqrepo_access --------------- - -.. automodule:: cool_seq_tool.handlers.seqrepo_access - :members: - :special-Members: __init__ diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 02406559..997987ab 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -1,8 +1,57 @@ API Reference ============= -.. toctree:: +.. _core_modules_api_index: - Data Sources API - Data Handlers API - Data Mappers API +Core Modules +------------ + +.. autosummary:: + :nosignatures: + :toctree: api/ + :template: module_summary.rst + + cool_seq_tool.app + cool_seq_tool.schemas + cool_seq_tool.utils + cool_seq_tool.data.data_downloads + +.. _sources_modules_api_index: + +Data Sources +------------ + +.. autosummary:: + :nosignatures: + :toctree: api/sources/ + :template: module_summary.rst + + cool_seq_tool.sources.mane_transcript_mappings + cool_seq_tool.sources.transcript_mappings + cool_seq_tool.sources.uta_database + +.. _handlers_modules_api_index: + +Data Handlers +------------- + +.. autosummary:: + :nosignatures: + :toctree: api/handlers/ + :template: module_summary.rst + + cool_seq_tool.handlers.seqrepo_access + +.. _mappers_module_api_index: + +Data Mappers +------------ + +.. autosummary:: + :nosignatures: + :toctree: api/mappers/ + :template: module_summary.rst + + cool_seq_tool.mappers.alignment + cool_seq_tool.mappers.exon_genomic_coords + cool_seq_tool.mappers.mane_transcript diff --git a/docs/source/reference/mappers.rst b/docs/source/reference/mappers.rst deleted file mode 100644 index 3b9fbf15..00000000 --- a/docs/source/reference/mappers.rst +++ /dev/null @@ -1,23 +0,0 @@ -Data Mappers API -================ - -alignment.py ------------- - -.. automodule:: cool_seq_tool.mappers.alignment - :members: - :special-members: __init__ - -exon_genomic_coords.py ----------------------- - -.. automodule:: cool_seq_tool.mappers.exon_genomic_coords - :members: - :special-members: __init__ - -mane_transcript.py ------------------- - -.. automodule:: cool_seq_tool.mappers.mane_transcript - :members: - :special-members: diff --git a/docs/source/reference/sources.rst b/docs/source/reference/sources.rst deleted file mode 100644 index 600d4e05..00000000 --- a/docs/source/reference/sources.rst +++ /dev/null @@ -1,24 +0,0 @@ -Data Sources API -================ - -mane_transcript_mappings ------------------------- - -.. automodule:: cool_seq_tool.sources.mane_transcript_mappings - :members: - :special-members: __init__ - - -transcript_mappings -------------------- - -.. automodule:: cool_seq_tool.sources.transcript_mappings - :members: - :special-members: __init__ - -uta_database ------------- - -.. automodule:: cool_seq_tool.sources.uta_database - :members: - :special-members: __init__ From 09adaf9dfcb7064a2bf360e6b2df9f6a5aa627e5 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 7 Dec 2023 17:02:31 -0500 Subject: [PATCH 05/34] more work --- cool_seq_tool/app.py | 31 ++- cool_seq_tool/data/data_downloads.py | 6 +- cool_seq_tool/handlers/seqrepo_access.py | 14 +- cool_seq_tool/mappers/__init__.py | 3 + cool_seq_tool/mappers/exon_genomic_coords.py | 36 +++- cool_seq_tool/mappers/mane_transcript.py | 5 + cool_seq_tool/schemas.py | 2 +- cool_seq_tool/sources/__init__.py | 2 + .../sources/mane_transcript_mappings.py | 24 ++- cool_seq_tool/sources/transcript_mappings.py | 39 ++-- cool_seq_tool/sources/uta_database.py | 184 ++++++++++-------- cool_seq_tool/utils.py | 24 ++- docs/source/_templates/module_summary.rst | 3 +- docs/source/conf.py | 2 +- docs/source/contributing.rst | 2 +- docs/source/install.rst | 6 + docs/source/overview.rst | 21 +- docs/source/reference/index.rst | 2 +- docs/source/usage.rst | 4 + 19 files changed, 266 insertions(+), 144 deletions(-) diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index 23c38584..f039b0e6 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -1,4 +1,6 @@ -"""Module for initializing data sources.""" +"""Provides core CoolSeqTool class, which non-redundantly initializes all Cool-Seq-Tool +data handler and mapping resources for straightforward access. +""" import logging from pathlib import Path from typing import Optional @@ -25,7 +27,26 @@ class CoolSeqTool: - """Class to initialize data sources.""" + """Non-redundantly initialize all Cool-Seq-Tool data resources, available under the + following attribute names: + + * ``self.seqrepo_access``: :py:class:`SeqRepoAccess ` + * ``self.transcript_mappings``: :py:class:`TranscriptMappings ` + * ``self.mane_transcript_mappings``: :py:class:`MANETranscriptMappings ` + * ``self.uta_db``: :py:class:`UTADatabase ` + * ``self.alignment_mapper``: :py:class:`AlignmentMapper ` + * ``self.mane_transcript``: :py:class:`MANETranscript ` + * ``self.ex_g_coords_mapper``: :py:class:`ExonGenomicCoordsMapper ` + + Initialization with default resource locations is straightforward: + + .. code-block:: pycon + + >>> from cool_seq_tool.app import CoolSeqTool + >>> cst = CoolSeqTool() + + See the :ref:`configuration ` section for more information. + """ def __init__( self, @@ -37,11 +58,11 @@ def __init__( ) -> None: """Initialize CoolSeqTool class - :param transcript_file_path: The path to transcript_mapping.tsv - :param lrg_refseqgene_path: The path to LRG_RefSeqGene + :param transcript_file_path: The path to ``transcript_mapping.tsv`` + :param lrg_refseqgene_path: The path to the LRG_RefSeqGene file :param mane_data_path: Path to RefSeq MANE summary data :param db_url: PostgreSQL connection URL - Format: `driver://user:password@host/database/schema` + Format: ``driver://user:password@host/database/schema`` :param sr: SeqRepo instance. If this is not provided, will create a new instance """ if not sr: diff --git a/cool_seq_tool/data/data_downloads.py b/cool_seq_tool/data/data_downloads.py index 373636ee..96e3b1dc 100644 --- a/cool_seq_tool/data/data_downloads.py +++ b/cool_seq_tool/data/data_downloads.py @@ -1,4 +1,4 @@ -"""Module for handling downloadable data files.""" +"""Handle acquisition of external data.""" import datetime import gzip import logging @@ -15,8 +15,8 @@ class DataDownload: - """Class for managing downloadable data files. Responsible for checking if files - are available under default locations, and fetching them if not. + """Manage downloadable data files. Responsible for checking if files are available + under expected locations, and fetching them if not. """ def __init__(self) -> None: diff --git a/cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py index d65fa35d..b80d5164 100644 --- a/cool_seq_tool/handlers/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -1,4 +1,6 @@ -"""A module for accessing SeqRepo.""" +"""Wrap SeqRepo to provide additional lookup and identification methods on top of basic +dereferencing functions. +""" import logging from os import environ from pathlib import Path @@ -13,7 +15,9 @@ class SeqRepoAccess(SeqRepoDataProxy): - """The SeqRepoAccess class.""" + """Provide a wrapper around the base SeqRepoDataProxy class from ``VRS-Python`` to + provide additional lookup and identification methods. + """ environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none" @@ -29,6 +33,8 @@ def get_reference_sequence( If ``start`` and ``end`` are not given, it will return the entire reference sequence + # TODO exmaple + :param ac: Accession :param start: Start pos change :param end: End pos change. If ``None`` assumes both ``start`` and ``end`` have @@ -90,6 +96,8 @@ def translate_identifier( ) -> Tuple[List[str], Optional[str]]: """Return list of identifiers for accession. + # TODO example + :param ac: Identifier accession :param target_namespace: The namespace(s) of identifier to return :return: List of identifiers, warning @@ -163,6 +171,8 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]: def get_fasta_file(self, sequence_id: str, outfile_path: Path) -> None: """Retrieve FASTA file containing sequence for requested sequence ID. + # TODO example + :param sequence_id: accession ID, sans namespace, eg ``NM_152263.3`` :param outfile_path: path to save file to :return: None, but saves sequence data to ``outfile_path`` if successful diff --git a/cool_seq_tool/mappers/__init__.py b/cool_seq_tool/mappers/__init__.py index ecad925b..9fbb3306 100644 --- a/cool_seq_tool/mappers/__init__.py +++ b/cool_seq_tool/mappers/__init__.py @@ -2,3 +2,6 @@ from .alignment import AlignmentMapper # noqa: I001 from .mane_transcript import MANETranscript from .exon_genomic_coords import ExonGenomicCoordsMapper + + +__all__ = ["AlignmentMapper", "MANETranscript", "ExonGenomicCoordsMapper"] diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index fbc099e4..57ad00d5 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -1,4 +1,4 @@ -"""Module for mapping transcript exon to and from genomic coordinates""" +"""Provide mapping capabilities between transcript exon and genomic coordinates.""" import logging from typing import Dict, List, Optional, Tuple, TypeVar, Union @@ -23,12 +23,18 @@ class ExonGenomicCoordsMapper: - """Class for mapping transcript exon representation to/from genomic coordinate - representation + """Provide capabilties for mapping transcript exon representation to/from genomic + coordinate representation. """ def __init__(self, uta_db: UTADatabase, mane_transcript: MANETranscript) -> None: - """Initialize ExonGenomicCoordsMapper class + """Initialize ExonGenomicCoordsMapper class. + + A lot of resources are required for initialization, so when defaults are enough, + it's easiest to let the core CoolSeqTool class handle it for you: + + >>> from cool_seq_tool.app import CoolSeqTool + >>> egc = CoolSeqTool().ex_g_coords_mapper :param uta_db: UTADatabase instance to give access to query UTA database :param mane_transcript: Instance to align to MANE or compatible representation @@ -62,7 +68,25 @@ async def transcript_to_genomic_coordinates( **kwargs, ) -> GenomicDataResponse: """Get genomic data given transcript data. - Will use GRCh38 coordinates if possible + + By default, inputs are assumed to be in GRCh38 where they aren't otherwise + precisely defined. # TODO is this correct? + + >>> from cool_seq_tool.app import CoolSeqTool + >>> egc = CoolSeqTool().ex_g_coords_mapper + >>> tpm3 = await egc.transcript_to_genomic_coordinates( + ... gene="TPM3", chr="NC_000001.11", + ... exon_start=1, exon_end=8, + ... transcript="NM_152263.3" + ... ) + >>> (tpm3.genomic_data.chr, tpm3.genomic_data.start, tpm3.genomic_data.end) + ('NC_000001.11', 164192135, 154170399) + >>> ntrk1 = await egc.transcript_to_genomic_coordinates( + ... transcript="NM_002529.3", + ... exon_end=17 + ... ) + >>> (ntrk1.genomic_data.chr, ntrk1.genomic_data.end) + ('NC_000001.11', 156881456) :param gene: Gene symbol :param transcript: Transcript accession @@ -174,6 +198,8 @@ async def genomic_to_transcript_exon_coordinates( Liftovers genomic coordinates to GRCh38 + TODO clean up and provide example + :param chromosome: Chromosome. Must either give chromosome number (i.e. ``1``) or accession (i.e. ``NC_000001.11``). :param start: Start genomic position diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index 3fb1ba52..60d8d36a 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -442,6 +442,9 @@ def _validate_index( ) -> bool: """Validate that positions actually exist on accession + # TODO example + even though it's not public (it should be?) + :param ac: Accession :param pos: Start position change, End position change :param coding_start_site: coding start site for accession @@ -709,6 +712,8 @@ async def get_mane_transcript( ) -> Optional[Dict]: """Return mane transcript. + # TODO example + :param ac: Accession :param start_pos: Start position change :param start_annotation_layer: Starting annotation layer. diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index 19e0f2ee..6a0472f5 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -1,4 +1,4 @@ -"""Module for data models.""" +"""Defines attribute constants, useful object structures, and API response schemas.""" import re from datetime import datetime from enum import Enum diff --git a/cool_seq_tool/sources/__init__.py b/cool_seq_tool/sources/__init__.py index c753f9a3..2a0a9e1e 100644 --- a/cool_seq_tool/sources/__init__.py +++ b/cool_seq_tool/sources/__init__.py @@ -2,3 +2,5 @@ from .mane_transcript_mappings import MANETranscriptMappings from .transcript_mappings import TranscriptMappings from .uta_database import UTADatabase + +__all__ = ["MANETranscriptMappings", "TranscriptMappings", "UTADatabase"] diff --git a/cool_seq_tool/sources/mane_transcript_mappings.py b/cool_seq_tool/sources/mane_transcript_mappings.py index 14d1d828..94366020 100644 --- a/cool_seq_tool/sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -1,4 +1,4 @@ -"""The module for loading MANE Transcript mappings to genes.""" +"""Provide fast tabular access to MANE summary file.""" import logging from pathlib import Path from typing import Dict, List @@ -11,7 +11,14 @@ class MANETranscriptMappings: - """The MANE Transcript mappings class.""" + """Provide fast tabular access to MANE summary file. + + By default, acquires data from `NCBI FTP server `_ + if unavailable locally. The local data location can be passed as an argument or + given under the environment variable ``MANE_SUMMARY_PATH``. + + See the `NCBI MANE page `_ for more information. + """ def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None: """Initialize the MANE Transcript mappings class. @@ -30,9 +37,18 @@ def _load_mane_transcript_data(self) -> pl.DataFrame: def get_gene_mane_data(self, gene_symbol: str) -> List[Dict]: """Return MANE Transcript data for a gene. + >>> from cool_seq_tool.sources import MANETranscriptMappings + >>> m = MANETranscriptMappings() + >>> braf_mane = m.get_gene_mane_data("BRAF") + >>> (braf_mane[0]["RefSeq_nuc"], braf_mane[0]["MANE_status"]) + ('NM_04333.6', 'MANE Select') + >>> (braf_mane[1]["RefSeq_nuc"], braf_mane[1]["MANE_status"]) + ('NM_001374258.1', 'MANE Plus Clinical') + :param str gene_symbol: HGNC Gene Symbol :return: List of MANE Transcript data (Transcript accessions, gene, and - location information). Sorted list: MANE Select and then MANE Plus Clinical + location information). The list is sorted so that a MANE Select entry comes + first, followed by a MANE Plus Clinical entry, if available. """ data = self.df.filter(pl.col("symbol") == gene_symbol.upper()) @@ -59,7 +75,7 @@ def get_mane_from_transcripts(self, transcripts: List[str]) -> List[Dict]: def get_mane_data_from_chr_pos( self, alt_ac: str, start: int, end: int ) -> List[Dict]: - """Get MANE data given chromosome, start pos, end end pos. Assumes GRCh38. + """Get MANE data given a GRCh38 genomic position. :param str alt_ac: NC Accession :param int start: Start genomic position. Assumes residue coordinates. diff --git a/cool_seq_tool/sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py index d39f1097..1b729fb5 100644 --- a/cool_seq_tool/sources/transcript_mappings.py +++ b/cool_seq_tool/sources/transcript_mappings.py @@ -1,4 +1,4 @@ -"""The module for Transcript Mappings.""" +"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions.""" import csv from pathlib import Path from typing import Dict, List, Optional @@ -7,7 +7,16 @@ class TranscriptMappings: - """The transcript mappings class.""" + """Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions. + + Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``. See the + :ref:`installation instructions ` for information about manual + acquisition of data. + + In general, this class's methods expect to receive NCBI gene symbols, so users + should be careful about the sourcing of their input in cases where terms are + conflicted or ambiguous. # TODO is that true? + """ def __init__( self, @@ -16,8 +25,8 @@ def __init__( ) -> None: """Initialize the transcript mappings class. - :param Path transcript_file_path: Path to transcript mappings file - :param Path lrg_refseqgene_path: Path to LRG RefSeqGene file + :param transcript_file_path: Path to transcript mappings file + :param lrg_refseqgene_path: Path to LRG RefSeqGene file """ # ENSP <-> Gene Symbol self.ensembl_protein_version_for_gene_symbol: Dict[str, List[str]] = {} @@ -53,7 +62,7 @@ def __init__( def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None: """Load transcript mappings file to dictionaries. - :param Path transcript_file_path: Path to transcript mappings file + :param transcript_file_path: Path to transcript mappings file """ with open(transcript_file_path) as file: reader = csv.DictReader(file, delimiter="\t") @@ -127,7 +136,15 @@ def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None: def protein_transcripts(self, identifier: str) -> List[str]: """Return a list of protein transcripts for a gene symbol. - :param str identifier: Gene identifier to get protein transcripts for + >>> from cool_seq_tool.sources import TranscriptMappings + >>> TranscriptMappings().protein_transcripts("BRAF") + ['ENSP00000420119.2', + 'ENSP00000288602', + 'NP_004324.2', + ... # etc + 'NP_001365402.1'] + + :param identifier: Gene identifier to get protein transcripts for :return: Protein transcripts for a gene symbol """ protein_transcripts = list() @@ -141,7 +158,7 @@ def protein_transcripts(self, identifier: str) -> List[str]: def coding_dna_transcripts(self, identifier: str) -> List[str]: """Return transcripts from a coding dna refseq for a gene symbol. - :param str identifier: Gene identifier to find transcripts for + :param identifier: Gene identifier to find transcripts for :return: cDNA transcripts for a gene symbol """ genomic_transcripts = list() @@ -159,7 +176,7 @@ def coding_dna_transcripts(self, identifier: str) -> List[str]: def get_gene_symbol_from_ensembl_protein(self, q: str) -> Optional[str]: """Return the gene symbol for a Ensembl Protein. - :param str q: ensembl protein accession + :param q: ensembl protein accession :return: Gene symbol """ gene_symbol = self.ensembl_protein_version_to_gene_symbol.get(q) @@ -172,7 +189,7 @@ def get_gene_symbol_from_ensembl_protein(self, q: str) -> Optional[str]: def get_gene_symbol_from_refeq_protein(self, q: str) -> Optional[str]: """Return the gene symbol for a Refseq Protein. - :param str q: RefSeq protein accession + :param q: RefSeq protein accession :return: Gene symbol """ return self.refseq_protein_to_gene_symbol.get(q) @@ -180,7 +197,7 @@ def get_gene_symbol_from_refeq_protein(self, q: str) -> Optional[str]: def get_gene_symbol_from_refseq_rna(self, q: str) -> Optional[str]: """Return gene symbol for a Refseq RNA Transcript. - :param str q: RefSeq RNA transcript accession + :param q: RefSeq RNA transcript accession :return: Gene symbol """ gene_symbol = self.refseq_rna_version_to_gene_symbol.get(q) @@ -193,7 +210,7 @@ def get_gene_symbol_from_refseq_rna(self, q: str) -> Optional[str]: def get_gene_symbol_from_ensembl_transcript(self, q: str) -> Optional[str]: """Return gene symbol for an Ensembl Transcript. - :param str q: Ensembl transcript accession + :param q: Ensembl transcript accession :return: Gene symbol """ gene_symbol = self.ensembl_transcript_version_to_gene_symbol.get(q) diff --git a/cool_seq_tool/sources/uta_database.py b/cool_seq_tool/sources/uta_database.py index 435781c6..90c7b3d5 100644 --- a/cool_seq_tool/sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -1,4 +1,4 @@ -"""Module for UTA queries.""" +"""Provide transcript lookup and metadata tools via the UTA database.""" import ast import base64 import logging @@ -24,14 +24,21 @@ LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37") UTA_DB_URL = environ.get( - "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129" + "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b" ) logger = logging.getLogger(__name__) class UTADatabase: - """Class for connecting and querying UTA database.""" + """Provide transcript lookup and metadata tools via the Universal Transcript Archive + (UTA) database. + + Downstream libraries should use the create() method to construct a new instance: + + >>> from cool_seq_tool.sources.uta_database import UTADatabase + >>> uta = await UTADatabase.create() + """ def __init__( self, @@ -39,23 +46,19 @@ def __init__( chain_file_37_to_38: Optional[str] = None, chain_file_38_to_37: Optional[str] = None, ) -> None: - """Initialize DB class. - - Downstream libraries should use the create() method to construct a new instance: - - >>> from cool_seq_tool.sources.uta_database import UTADatabase - >>> uta = await UTADatabase.create() + """Initialize DB class. Should only be used by ``create()`` method, and not + be called directly by a user. :param db_url: PostgreSQL connection URL Format: ``driver://user:password@host/database/schema`` :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly. - This is used for pyliftover. If this is not provided, will check to see if - LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow - pyliftover to download a chain file from UCSC + This is used for ``pyliftover``. If this is not provided, will check to see + if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will + allow ``pyliftover`` to download a chain file from UCSC :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly. - This is used for pyliftover. If this is not provided, will check to see if - LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will allow - pyliftover to download a chain file from UCSC + This is used for ``pyliftover``. If this is not provided, will check to see + if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will + allow ``pyliftover`` to download a chain file from UCSC """ self.schema = None self._connection_pool = None @@ -141,10 +144,15 @@ async def create_pool(self) -> None: async def create( cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL ) -> UTADatabaseType: - """Provide fully-initialized class instance (a la factory pattern) + """Manufacture a fully-initialized class instance (a la factory pattern). This + method should be used instead of calling the class directly to create a new + instance. - :param UTADatabaseType cls: supplied implicitly - :param str db_url: PostgreSQL connection URL + >>> from cool_seq_tool.sources.uta_database import UTADatabase + >>> uta = await UTADatabase.create() + + :param cls: supplied implicitly + :param db_url: PostgreSQL connection URL Format: ``driver://user:password@host/database/schema`` :return: UTA DB access class instance """ @@ -156,7 +164,7 @@ async def create( async def execute_query(self, query: str) -> Any: # noqa: ANN401 """Execute a query and return its result. - :param str query: Query to make on database + :param query: Query to make on database :return: Query's result """ @@ -228,7 +236,7 @@ async def _create_genomic_table(self) -> None: def _transform_list(li: List) -> List[List[Any]]: """Transform list to only contain field values - :param List li: List of asyncpg.Record objects + :param li: List of asyncpg.Record objects :return: List of list of objects """ results = list() @@ -246,13 +254,12 @@ async def chr_to_gene_and_accessions( ) -> Tuple[Optional[Dict], Optional[str]]: """Return genes and genomic accessions related to a position on a chr. - :param int chromosome: Chromosome number - :param int pos: Genomic position - :param Optional[int] strand: Strand. Must be either ``-1`` or ``1`` - :param Optional[str] alt_ac: Genomic accession - :param Optional[str] gene: Gene symbol - :return: Dictionary containing genes and genomic accessions and - warnings if found + :param chromosome: Chromosome number + :param pos: Genomic position + :param strand: Strand. Must be either ``-1`` or ``1`` + :param alt_ac: Genomic accession + :param gene: Gene symbol + :return: Dictionary containing genes and genomic accessions and warnings if found """ alt_ac_cond = ( f"WHERE alt_ac = '{alt_ac}'" @@ -298,8 +305,8 @@ async def get_tx_exons( ) -> Tuple[Optional[List[Tuple[int, int]]], Optional[str]]: """Get list of transcript exons start/end coordinates. - :param str tx_ac: Transcript accession - :param Optional[str] alt_ac: Genomic accession + :param tx_ac: Transcript accession + :param alt_ac: Genomic accession :return: List of a transcript's accessions and warnings if found """ if alt_ac: @@ -340,9 +347,9 @@ def _validate_exon( ) -> Tuple[Optional[Tuple[int, int]], Optional[str]]: """Validate that exon number is valid - :param str transcript: Transcript accession - :param List tx_exons: List of transcript's exons - :param Optional[int] exon_number: Exon number to validate + :param transcript: Transcript accession + :param tx_exons: List of transcript's exons + :param exon_number: Exon number to validate :return: Transcript coordinates and warnings if found """ msg = f"Exon {exon_number} does not exist on {transcript}" @@ -399,12 +406,10 @@ async def get_alt_ac_start_and_end( ) -> Tuple[Optional[Tuple[Tuple, Tuple]], Optional[str]]: """Get genomic coordinates for related transcript exon start and end. - :param str tx_ac: Transcript accession - :param Optional[List[str]] tx_exon_start: Transcript's exon start - coordinates - :param Optional[List[str]] tx_exon_end: Transcript's exon end - coordinates - :param str gene: Gene symbol + :param tx_ac: Transcript accession + :param tx_exon_start: Transcript's exon start coordinates + :param tx_exon_end: Transcript's exon end coordinates + :param gene: Gene symbol :return: Alt ac start and end data, and warnings if found """ if tx_exon_start: @@ -448,10 +453,10 @@ async def get_alt_ac_start_or_end( ) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]: """Get genomic data for related transcript exon start or end. - :param str tx_ac: Transcript accession - :param int tx_exon_start: Transcript's exon start coordinate - :param int tx_exon_end: Transcript's exon end coordinate - :param Optional[str] gene: Gene symbol + :param tx_ac: Transcript accession + :param tx_exon_start: Transcript's exon start coordinate + :param tx_exon_end: Transcript's exon end coordinate + :param gene: Gene symbol :return: [hgnc symbol, genomic accession for chromosome, start exon's end coordinate, end exon's start coordinate, strand], and warnings if found @@ -492,7 +497,7 @@ async def get_alt_ac_start_or_end( async def get_cds_start_end(self, tx_ac: str) -> Optional[Tuple[int, int]]: """Get coding start and end site - :param str tx_ac: Transcript accession + :param tx_ac: Transcript accession :return: [Coding start site, Coding end site] """ if tx_ac.startswith("ENS"): @@ -516,7 +521,7 @@ async def get_cds_start_end(self, tx_ac: str) -> Optional[Tuple[int, int]]: async def get_newest_assembly_ac(self, ac: str) -> List[str]: """Find accession associated to latest genomic assembly - :param str ac: Accession + :param ac: Accession :return: List of accessions associated to latest genomic assembly. Order by desc """ @@ -545,7 +550,7 @@ async def get_newest_assembly_ac(self, ac: str) -> List[str]: async def validate_genomic_ac(self, ac: str) -> bool: """Return whether or not genomic accession exists. - :param str ac: Genomic accession + :param ac: Genomic accession :return: ``True`` if genomic accession exists. ``False`` otherwise. """ query = f""" @@ -559,10 +564,15 @@ async def validate_genomic_ac(self, ac: str) -> bool: return result[0][0] async def get_ac_descr(self, ac: str) -> Optional[str]: - """Return accession description. - Typically description exists if not GRCh38 assembly. + """Return accession description. This is typically available only for accessions + from older (pre-GRCh38) builds. + + >>> from cool_seq_tool.sources.uta_database import UTADatabase + >>> u = await UTADatabase.create() + >>> await u.get_ac_descr("NC_000001.10") + 'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly' - :param str ac: Accession + :param ac: chromosome accession, e.g. ``"NC_000001.10"`` :return: Description containing assembly and chromosome """ query = f""" @@ -585,21 +595,21 @@ async def get_tx_exon_aln_v_data( tx_ac: str, start_pos: int, end_pos: int, - alt_ac: str = None, + alt_ac: Optional[str] = None, use_tx_pos: bool = True, like_tx_ac: bool = False, ) -> List: """Return queried data from tx_exon_aln_v table. - :param str tx_ac: accession on c. coordinate - :param int start_pos: Start position change - :param int end_pos: End position change - :param str alt_ac: accession on g. coordinate - :param bool use_tx_pos: ``True`` if querying on transcript position. This means + :param tx_ac: accession on c. coordinate + :param start_pos: Start position change + :param end_pos: End position change + :param alt_ac: accession on g. coordinate + :param use_tx_pos: ``True`` if querying on transcript position. This means ``start_pos`` and ``end_pos`` are on the c. coordinate ``False`` if querying on genomic position. This means ``start_pos`` and ``end_pos`` are on the g. coordinate - :param bool like_tx_ac: ``True`` if tx_ac condition should be a like statement. + :param like_tx_ac: ``True`` if tx_ac condition should be a like statement. This is used when you want to query an accession regardless of its version ``False`` if tx_condition will be exact match :return: List of tx_exon_aln_v data @@ -664,7 +674,7 @@ async def get_tx_exon_aln_v_data( def data_from_result(result: List) -> Optional[Dict]: """Return data found from result. - :param List result: Data from tx_exon_aln_v table + :param result: Data from tx_exon_aln_v table :return: Gene, strand, and position ranges for tx and alt_ac """ gene = result[0] @@ -703,10 +713,13 @@ async def get_mane_c_genomic_data( Used when going from g -> MANE c - :param str ac: MANE Transcript accession - :param str alt_ac: NC Accession - :param int start_pos: Genomic start position change - :param int end_pos: Genomic end position change + # TODO example + + :param ac: MANE Transcript accession + :param alt_ac: NC Accession + :param start_pos: Genomic start position change + :param end_pos: Genomic end position change + :return: MANE transcript results if successful """ results = await self.get_tx_exon_aln_v_data( ac, start_pos, end_pos, alt_ac=alt_ac, use_tx_pos=False @@ -758,12 +771,11 @@ async def get_genomic_tx_data( ) -> Optional[Dict]: """Get transcript mapping to genomic data. - :param str tx_ac: Accession on c. coordinate - :param Tuple pos: (start pos, end pos) - :param Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] annotation_layer: - Annotation layer for ``ac`` and ``pos`` - :param Optional[str] alt_ac: Accession on g. coordinate - :param Assembly target_genome_assembly: Genome assembly to get genomic data for. + :param tx_ac: Accession on c. coordinate + :param pos: (start pos, end pos) + :param annotation_layer: Annotation layer for ``ac`` and ``pos`` + :param alt_ac: Accession on g. coordinate + :param target_genome_assembly: Genome assembly to get genomic data for. If ``alt_ac`` is provided, it will return the associated assembly. :return: Gene, Transcript accession and position change, Altered transcript accession and position change, Strand @@ -816,7 +828,7 @@ async def get_genomic_tx_data( async def get_ac_from_gene(self, gene: str) -> List[str]: """Return genomic accession(s) associated to a gene. - :param str gene: Gene symbol + :param gene: Gene symbol :return: List of genomic accessions, sorted in desc order """ query = f""" @@ -840,9 +852,9 @@ async def get_gene_from_ac( ) -> Optional[List[str]]: """Get transcripts from NC accession and positions. - :param str ac: NC Accession - :param int start_pos: Start position change - :param int end_pos: End position change + :param ac: NC Accession + :param start_pos: Start position change + :param end_pos: End position change :return: List of HGNC gene symbols """ if end_pos is None: @@ -949,7 +961,7 @@ async def get_transcripts( async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]: """Get chromosome and assembly for NC accession if not in GRCh38. - :param str ac: NC accession + :param ac: NC accession :return: Chromosome and Assembly accession is on """ descr = await self.get_ac_descr(ac) @@ -972,8 +984,8 @@ async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]: async def liftover_to_38(self, genomic_tx_data: Dict) -> None: """Liftover genomic_tx_data to hg38 assembly. - :param Dict genomic_tx_data: Dictionary containing gene, nc_accession, - alt_pos, and strand + :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and + strand """ descr = await self.get_chr_assembly(genomic_tx_data["alt_ac"]) if descr is None: @@ -1028,9 +1040,9 @@ def get_liftover( ) -> Optional[Tuple]: """Get new genome assembly data for a position on a chromosome. - :param str chromosome: The chromosome number. Must be prefixed with ``chr`` - :param int pos: Position on the chromosome - :param Assembly liftover_to_assembly: Assembly to liftover to + :param chromosome: The chromosome number. Must be prefixed with ``chr`` + :param pos: Position on the chromosome + :param liftover_to_assembly: Assembly to liftover to :return: [Target chromosome, target position, target strand, conversion_chain_score] for assembly """ @@ -1061,11 +1073,11 @@ def _set_liftover( ) -> None: """Update genomic_tx_data to have coordinates for given assembly. - :param Dict genomic_tx_data: Dictionary containing gene, nc_accession, - alt_pos, and strand - :param str key: Key to access coordinate positions - :param str chromosome: Chromosome, must be prefixed with ``chr`` - :param Assembly liftover_to_assembly: Assembly to liftover to + :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and + strand + :param key: Key to access coordinate positions + :param chromosome: Chromosome, must be prefixed with ``chr`` + :param liftover_to_assembly: Assembly to liftover to """ liftover_start_i = self.get_liftover( chromosome, genomic_tx_data[key][0], liftover_to_assembly @@ -1092,9 +1104,9 @@ def _set_liftover( async def p_to_c_ac(self, p_ac: str) -> List[str]: """Return c. accession from p. accession. - :param str p_ac: Protein accession - :return: List of rows containing c. accessions that are associated - with the given p. accession. In ascending order. + :param p_ac: Protein accession + :return: List of rows containing c. accessions that are associated with the + given p. accession. In ascending order. """ # Ensembl accessions do not have versions if p_ac.startswith("EN"): @@ -1121,8 +1133,8 @@ async def get_transcripts_from_genomic_pos( ) -> List[str]: """Get transcripts associated to a genomic ac and position. - :param str alt_ac: Genomic accession - :param int g_pos: Genomic position + :param alt_ac: Genomic accession + :param g_pos: Genomic position :return: RefSeq transcripts on c. coordinate """ query = f""" diff --git a/cool_seq_tool/utils.py b/cool_seq_tool/utils.py index 15ea546a..4f39ff2a 100644 --- a/cool_seq_tool/utils.py +++ b/cool_seq_tool/utils.py @@ -1,4 +1,4 @@ -"""Module for common utilities used throughout the app""" +"""Provide a small set of general helper functions.""" import logging from datetime import datetime from typing import Optional, Tuple @@ -12,12 +12,26 @@ def get_inter_residue_pos( start_pos: int, residue_mode: ResidueMode, end_pos: Optional[int] = None ) -> Tuple[Optional[Tuple[int, int]], Optional[str]]: - """Return inter-residue position + """Return equivalent inter-residue position. + + Generally, we prefer to work with inter-residue coordinates where possible. Our + rationale is detailed in an appendix to the + `VRS docs `_. + This function is used internally to shift user-provided coordinates accordingly. + + For example: + + .. code-block:: pycon + + >>> from cool_seq_tool.utils import get_inter_residue_pos + >>> from cool_seq_tool.schemas import ResidueMode + >>> get_inter_residue_pos(10, ResidueMode.RESIDUE) + ((9, 9), None) :param start_pos: Start position - :param residue_mode: Residue mode for `start_pos` and `end_pos` - :param end_pos: End position. If `None` assumes both `start` and `end` have same - values. + :param residue_mode: Residue mode for ``start_pos`` and ``end_pos`` + :param end_pos: End position. If ``None`` assumes both ``start`` and ``end`` have + same values. :return: Inter-residue coordinates, warning """ if residue_mode == ResidueMode.RESIDUE: diff --git a/docs/source/_templates/module_summary.rst b/docs/source/_templates/module_summary.rst index 796f063e..3a40b575 100644 --- a/docs/source/_templates/module_summary.rst +++ b/docs/source/_templates/module_summary.rst @@ -2,5 +2,6 @@ .. automodule:: {{ fullname }} :members: + :undoc-members: :special-members: __init__ - :exclude-members: model_fields + :exclude-members: model_fields, model_config diff --git a/docs/source/conf.py b/docs/source/conf.py index c9955166..d745f2a3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -39,7 +39,7 @@ "footer_icons": [ { "name": "GitHub", - "url": "https://github.com/cancervariants/gene-normalization", + "url": "https://github.com/genomicmedlab/cool-seq-tool", "html": "", "class": "fa-brands fa-solid fa-github", }, diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 908aa727..d0feaef9 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -55,7 +55,7 @@ Tests are executed with `pytest `_. For local usage, we recommend the following: +.. long-term, it would be best to move this over to the UTA repo to avoid duplication + .. code-block:: createuser -U postgres uta_admin @@ -47,6 +49,8 @@ Set up SeqRepo Cool-Seq-Tool requires access to `SeqRepo `_ data. In general, we recommend the following for local setup: +.. long-term, it would be best to move this over to seqrepo to avoid duplication + .. code-block:: pip install seqrepo @@ -69,6 +73,8 @@ Try moving data manually with ``sudo``: See `mirroring documentation `_ on the SeqRepo GitHub repo for instructions and additional troubleshooting. +.. _required_files: + Acquire genome and transcript data ---------------------------------- diff --git a/docs/source/overview.rst b/docs/source/overview.rst index 3a67553f..f9b8d30e 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -3,26 +3,11 @@ Overview of Features Cool-Seq-Tool provides easy access to, and useful operations on, a selection of important genomic resources. Modules are divided into three groups: -* :ref:`Data sources `, for basic acquisition and setup for a data source via Python +* :ref:`Data sources `, for basic acquisition and setup for a data source via Python -* :ref:`Data handlers `, for additional operations on top of existing sources +* :ref:`Data handlers `, for additional operations on top of existing sources -* :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output - -.. _data_sources_description: - -Data sources ------------- - -.. _data_handlers_description: - -Data handlers -------------- - -.. _data_mappers_description: - -Data mappers ------------- +* :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output Schema support -------------- diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 997987ab..0b641b22 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -42,7 +42,7 @@ Data Handlers cool_seq_tool.handlers.seqrepo_access -.. _mappers_module_api_index: +.. _mappers_modules_api_index: Data Mappers ------------ diff --git a/docs/source/usage.rst b/docs/source/usage.rst index d3d62e6a..6ac8ab6a 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -1,6 +1,8 @@ Usage ===== +.. _configuration: + Configuration ------------- @@ -9,3 +11,5 @@ Programmatic access REST server ----------- + +Possibly staged for deletion? From bd23e5f0c0982a1cc9d456e5d8f23182a4a2194d Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 8 Dec 2023 14:00:36 -0500 Subject: [PATCH 06/34] Add more work --- cool_seq_tool/data/data_downloads.py | 4 +-- docs/source/index.rst | 1 - docs/source/overview.rst | 13 -------- docs/source/usage.rst | 49 +++++++++++++++++++++++++--- 4 files changed, 46 insertions(+), 21 deletions(-) delete mode 100644 docs/source/overview.rst diff --git a/cool_seq_tool/data/data_downloads.py b/cool_seq_tool/data/data_downloads.py index 96e3b1dc..2ad56863 100644 --- a/cool_seq_tool/data/data_downloads.py +++ b/cool_seq_tool/data/data_downloads.py @@ -25,7 +25,7 @@ def __init__(self) -> None: def get_mane_summary(self) -> Path: """Identify latest MANE summary data. If unavailable locally, download from - source. + `NCBI FTP server `_. :return: path to MANE summary file """ @@ -52,7 +52,7 @@ def get_mane_summary(self) -> Path: def get_lrg_refseq_gene_data(self) -> Path: """Identify latest LRG RefSeq Gene file. If unavailable locally, download from - source. + `NCBI FTP server `_. :return: path to acquired LRG RefSeq Gene data file """ diff --git a/docs/source/index.rst b/docs/source/index.rst index 3dd3c811..a89f0251 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,7 +15,6 @@ Description here. :maxdepth: 2 Installation - Overview Usage API Reference Contributing diff --git a/docs/source/overview.rst b/docs/source/overview.rst deleted file mode 100644 index f9b8d30e..00000000 --- a/docs/source/overview.rst +++ /dev/null @@ -1,13 +0,0 @@ -Overview of Features -==================== - -Cool-Seq-Tool provides easy access to, and useful operations on, a selection of important genomic resources. Modules are divided into three groups: - -* :ref:`Data sources `, for basic acquisition and setup for a data source via Python - -* :ref:`Data handlers `, for additional operations on top of existing sources - -* :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output - -Schema support --------------- diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 6ac8ab6a..28e1c41a 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -1,15 +1,54 @@ Usage ===== -.. _configuration: +Cool-Seq-Tool provides easy access to, and useful operations on, a selection of important genomic resources. Modules are divided into three groups: + +* :ref:`Data sources `, for basic acquisition and setup for a data source via Python + +* :ref:`Data handlers `, for additional operations on top of existing sources + +* :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output -Configuration -------------- +A core :py:class:`CoolSeqTool ` class encapsulates all of their functions and can be used for easy initialization: -Programmatic access -------------------- +.. code-block:: pycon + + >>> from cool_seq_tool.app import CoolSeqTool + >>> cst = CoolSeqTool() + +.. _configuration: REST server ----------- Possibly staged for deletion? + + +Environment configuration +------------------------- + +Individual classes will accept arguments upon initialization to set parameters regarding data sources. In general, these parameters are also configurable via environment variables, e.g. in a cloud deployment. + +.. list-table:: + :widths: 25 100 + :header-rows: 1 + + * - Variable + - Description + * - ``LRG_REFSEQGENE_PATH`` + - Path to LRG_RefSeqGene file. Used in :py:class:`TranscriptMappings ` to provide mappings between gene symbols and RefSeq/Ensembl transcript accessions. If not defined, defaults to the most recent version (formatted as ``data/LRG_RefSeqGene_YYYYMMDD``) within the Cool-Seq-Tool library directory. + * - ``TRANSCRIPT_MAPPINGS_PATH`` + - Path to transcript mapping file generated from `Ensembl BioMart `_. Used in :py:class:`TranscriptMappings `. If not defined, uses a copy of the file that is bundled within the Cool-Seq-Tool installation. See the :ref:`contributor instructions ` for information on manually rebuilding it. + * - ``MANE_SUMMARY_PATH`` + - Path to MANE Summary file. Used in :py:class:`MANETranscriptMappings ` to provide MANE transcript annotations. If not defined, defaults to the most recent version (formatted as ``data/MANE.GRCh38vX.X.summary.txt``) within the Cool-Seq-Tool library directory. + * - ``SEQREPO_ROOT_DIR`` + - Path to SeqRepo directory (i.e. contains ``aliases.sqlite3`` database file, and ``sequences`` directory). Used by :py:class:`SeqRepoAccess `_, i.e. of the form ``postgresql://:@://``, used by the :py:class:`cool_seq_tool.sources.uta_database.UTADatabase` class. By default, it is set to ``postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b``. + * - ``LIFTOVER_CHAIN_37_TO_38`` + - A path to a `chainfile `_ for lifting from GRCh37 to GRCh38. Used by :py:class:`cool_seq_tool.sources.uta_database.UTADatabase` as input to `pyliftover `_. If not provided, pyliftover will fetch it automatically from UCSC. + * - ``LIFTOVER_CHAIN_38_TO_37`` + - A path to a `chainfile `_ for lifting from GRCh38 to GRCh37. Used by :py:class:`cool_seq_tool.sources.uta_database.UTADatabase` as input to `pyliftover `_. If not provided, pyliftover will fetch it automatically from UCSC. + +Schema support +-------------- From 88bf925cbc0ae80457ecb45b275b358ee2284846 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 8 Dec 2023 14:58:27 -0500 Subject: [PATCH 07/34] more work --- .../sources/mane_transcript_mappings.py | 5 ++++- docs/TranscriptSelectionPriority.md | 22 ------------------- docs/source/index.rst | 1 + docs/source/reference/index.rst | 2 ++ docs/source/transcript_selection.rst | 20 +++++++++++++++++ docs/source/usage.rst | 7 ++++-- 6 files changed, 32 insertions(+), 25 deletions(-) delete mode 100644 docs/TranscriptSelectionPriority.md create mode 100644 docs/source/transcript_selection.rst diff --git a/cool_seq_tool/sources/mane_transcript_mappings.py b/cool_seq_tool/sources/mane_transcript_mappings.py index 94366020..7617e682 100644 --- a/cool_seq_tool/sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -1,4 +1,6 @@ -"""Provide fast tabular access to MANE summary file.""" +"""Provide fast tabular access to MANE summary file. Enables retrieval of associated +MANE transcripts for gene symbols, genomic positions, or transcript accessions. +""" import logging from pathlib import Path from typing import Dict, List @@ -30,6 +32,7 @@ def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None: def _load_mane_transcript_data(self) -> pl.DataFrame: """Load RefSeq MANE data file into DataFrame. + :return: DataFrame containing RefSeq MANE Transcript data """ return pl.read_csv(self.mane_data_path, separator="\t") diff --git a/docs/TranscriptSelectionPriority.md b/docs/TranscriptSelectionPriority.md deleted file mode 100644 index 72ef9944..00000000 --- a/docs/TranscriptSelectionPriority.md +++ /dev/null @@ -1,22 +0,0 @@ -# Transcript Selection Policy - -This document contains information on the Transcript Selection Policy. We use this policy for selecting a representative transcript using sequence attributes and MANE annotations from the `mane_transcript` module. - -More information on MANE can be found [here](https://www.ncbi.nlm.nih.gov/refseq/MANE/). - -## Representative transcript priority -We evaluate all compatible transcripts against each of the below criteria, and select the transcript which meets the earliest criterion as representative. - -1. Transcript is annotated as a MANE Select transcript -2. Transcript is annotated as a MANE Plus Clinical transcript -3. Longest Compatible Remaining\ - a. If there is a tie, choose the first-published transcript (lowest-numbered accession for RefSeq/Ensembl) among those transcripts meeting this criterion\ - _Note: We want the most recent version of a transcript associated with an assembly_ - -## Compatible Transcripts - -Compatible transcripts are those that pass validation checks. The checks that we make are: - - - Validating the position exists on an accession - - Validating reference sequences - - Validating exon structure diff --git a/docs/source/index.rst b/docs/source/index.rst index a89f0251..7f0ffd9c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,6 +16,7 @@ Description here. Installation Usage + Transcript Selection API Reference Contributing License diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 0b641b22..490b7d67 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -1,3 +1,5 @@ +.. _api_reference: + API Reference ============= diff --git a/docs/source/transcript_selection.rst b/docs/source/transcript_selection.rst new file mode 100644 index 00000000..1f951b56 --- /dev/null +++ b/docs/source/transcript_selection.rst @@ -0,0 +1,20 @@ +Transcript Selection +==================== + +One of the core uses of Cool-Seq-Tool is to acquire and use consensus, representative transcripts in performing genomic analysis. Here, we describe the selection processes, programmed in the :py:class:`MANETranscript ` class, for choosing the best available transcripts that are compatible with requested data. + +We rely heavily on transcripts annotated under the `Matched Annotation from NCBI and EMBL-EBI (MANE)` Transcripts project. For more information on the MANE project, see the `NCBI MANE page `_. + +Representative transcript priority +---------------------------------- + +All compatible transcripts are evaluated and ordered against the below criteria. The candidate transcript which meets the earliest criterion is chosen as representative. + +#. Transcript is annotated as a `MANE Select` transcript +#. Transcript is annotated as a `MANE Plus Clinical` transcript +#. Transcript is the longest-compatible remaining transcript +#. Transcript is the first-published (lowest-numbered RefSeq/Ensembl accession) remaining transcript + +.. note:: + + We prefer the most recent version of a transcript associated with an assembly. diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 28e1c41a..e9de0096 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -16,13 +16,14 @@ A core :py:class:`CoolSeqTool ` class encapsulate >>> from cool_seq_tool.app import CoolSeqTool >>> cst = CoolSeqTool() -.. _configuration: +Descriptions and examples of functions can be found in the :ref:`API Reference ` section. REST server ----------- -Possibly staged for deletion? +TODO Possibly staged for deletion? +.. _configuration: Environment configuration ------------------------- @@ -52,3 +53,5 @@ Individual classes will accept arguments upon initialization to set parameters r Schema support -------------- + +Many genomic data objects produced by Cool-Seq-Tool are structured in conformance with the `Variation Representation Specification `_, courtesy of the `VRS-Python ` library. From ba7c041db3526c92152f2478cd4641422be7aa68 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 8 Dec 2023 15:15:29 -0500 Subject: [PATCH 08/34] more stuff --- cool_seq_tool/api.py | 4 ++-- cool_seq_tool/sources/transcript_mappings.py | 7 ++++--- docs/source/install.rst | 22 -------------------- docs/source/transcript_selection.rst | 11 ++++++++++ docs/source/usage.rst | 14 +++++++++---- setup.cfg | 2 +- 6 files changed, 28 insertions(+), 32 deletions(-) diff --git a/cool_seq_tool/api.py b/cool_seq_tool/api.py index 89c7efdc..b36604e7 100644 --- a/cool_seq_tool/api.py +++ b/cool_seq_tool/api.py @@ -24,9 +24,9 @@ def custom_openapi() -> Dict: if app.openapi_schema: return app.openapi_schema openapi_schema = get_openapi( - title="The GenomicMedLab Cool Seq Tool", + title="The GenomicMedLab Cool-Seq-Tool", version=__version__, - description="Common Operations On Lots-of Sequences Tool.", + description="Common Operations On Lots of Sequences Tool.", routes=app.routes, ) diff --git a/cool_seq_tool/sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py index 1b729fb5..a6929d21 100644 --- a/cool_seq_tool/sources/transcript_mappings.py +++ b/cool_seq_tool/sources/transcript_mappings.py @@ -9,9 +9,10 @@ class TranscriptMappings: """Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions. - Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``. See the - :ref:`installation instructions ` for information about manual - acquisition of data. + Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``, which will automatically + be acquired if they aren't already available. See the + :ref:`configuration ` section in the documentation for information + about manual acquisition of data. In general, this class's methods expect to receive NCBI gene symbols, so users should be careful about the sourcing of their input in cases where terms are diff --git a/docs/source/install.rst b/docs/source/install.rst index 14556609..03c0a0ab 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -72,25 +72,3 @@ Try moving data manually with ``sudo``: sudo mv /usr/local/share/seqrepo/$SEQREPO_VERSION.* /usr/local/share/seqrepo/$SEQREPO_VERSION See `mirroring documentation `_ on the SeqRepo GitHub repo for instructions and additional troubleshooting. - -.. _required_files: - -Acquire genome and transcript data ----------------------------------- - -Cool-Seq-Tool also makes use of external data extracted from a handful of sources. This data is fetched automatically at runtime if it's not available, so no additional action is necessary by default -- however, if necessary, it can be configured and placed manually. - -LRG_RefSeqGene -++++++++++++++ - -Cool-Seq-Tool fetches the latest version of ``LRG_RefSeqGene`` if the environment variable ``LRG_REFSEQGENE_PATH`` is not set. When ``LRG_REFSEQGENE_PATH`` is set, Cool-Seq-Tool will look at this path and expect the ``LRG_RefSeqGene`` file. This file is found can be found `here `_. - -MANE summary -++++++++++++ - -Cool-Seq-Tool fetches the latest version of ``MANE.GRCh38.*.summary.txt.gz`` if the environment variable ``MANE_SUMMARY_PATH`` is not set. When ``MANE_SUMMARY_PATH`` is set, Cool-Seq-Tool will look at this path and expect the MANE Summary Data file. This file is found can be found `here `_. - -Transcript mappings -+++++++++++++++++++ - -Cool-Seq-Tool is packaged with transcript mapping data acquired from `Ensembl BioMart `_. If the environment variable ``TRANSCRIPT_MAPPINGS_PATH`` is not set, Cool-Seq-Tool will use the built-in file. When ``TRANSCRIPT_MAPPINGS_PATH`` is set, Cool-Seq-Tool will look at this path and expect to find the transcript mapping TSV file. To manually acquire this file, see the :ref:`contributor instructions `. diff --git a/docs/source/transcript_selection.rst b/docs/source/transcript_selection.rst index 1f951b56..3878d3fe 100644 --- a/docs/source/transcript_selection.rst +++ b/docs/source/transcript_selection.rst @@ -5,6 +5,17 @@ One of the core uses of Cool-Seq-Tool is to acquire and use consensus, represent We rely heavily on transcripts annotated under the `Matched Annotation from NCBI and EMBL-EBI (MANE)` Transcripts project. For more information on the MANE project, see the `NCBI MANE page `_. +Transcript compatibility +------------------------ + +The following validation checks are performed to determine compatibility of a transcript position: + +* The position exists on an accession +* "Validating reference sequences" # TODO what does this mean? +* "Validating exon structure" # TODO what does this mean? + +A transcript that fails to pass any of these checks is discarded as incompatible. + Representative transcript priority ---------------------------------- diff --git a/docs/source/usage.rst b/docs/source/usage.rst index e9de0096..43a14f8b 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -21,7 +21,13 @@ Descriptions and examples of functions can be found in the :ref:`API Reference < REST server ----------- -TODO Possibly staged for deletion? +Core Cool-Seq-Tool functions can also be performed via a REST HTTP interface, provided via `FastAPI `_. Use the ``uvicorn`` shell command to start a server instance: + +.. code-block:: shell + + uvicorn cool_seq_tool.api:app + +By default, ``uvicorn`` serves to port 8000. Once initialized, go to ``_ in a web browser for OpenAPI docs describing available endpoints. .. _configuration: @@ -37,9 +43,9 @@ Individual classes will accept arguments upon initialization to set parameters r * - Variable - Description * - ``LRG_REFSEQGENE_PATH`` - - Path to LRG_RefSeqGene file. Used in :py:class:`TranscriptMappings ` to provide mappings between gene symbols and RefSeq/Ensembl transcript accessions. If not defined, defaults to the most recent version (formatted as ``data/LRG_RefSeqGene_YYYYMMDD``) within the Cool-Seq-Tool library directory. + - Path to LRG_RefSeqGene file. Used in :py:class:`TranscriptMappings ` to provide mappings between gene symbols and RefSeq/Ensembl transcript accessions. If not defined, defaults to the most recent version (formatted as ``data/LRG_RefSeqGene_YYYYMMDD``) within the Cool-Seq-Tool library directory. Cool-Seq-Tool will acquire this data manually if no configuration is provided. * - ``TRANSCRIPT_MAPPINGS_PATH`` - - Path to transcript mapping file generated from `Ensembl BioMart `_. Used in :py:class:`TranscriptMappings `. If not defined, uses a copy of the file that is bundled within the Cool-Seq-Tool installation. See the :ref:`contributor instructions ` for information on manually rebuilding it. + - Path to transcript mapping file generated from `Ensembl BioMart `_. Used in :py:class:`TranscriptMappings `. If not defined, uses a copy of the file that is bundled within the Cool-Seq-Tool installation. See the :ref:`contributor instructions ` for information on manually rebuilding it. Cool-Seq-Tool will acquire this data manually if no configuration is provided. * - ``MANE_SUMMARY_PATH`` - Path to MANE Summary file. Used in :py:class:`MANETranscriptMappings ` to provide MANE transcript annotations. If not defined, defaults to the most recent version (formatted as ``data/MANE.GRCh38vX.X.summary.txt``) within the Cool-Seq-Tool library directory. * - ``SEQREPO_ROOT_DIR`` @@ -54,4 +60,4 @@ Individual classes will accept arguments upon initialization to set parameters r Schema support -------------- -Many genomic data objects produced by Cool-Seq-Tool are structured in conformance with the `Variation Representation Specification `_, courtesy of the `VRS-Python ` library. +Many genomic data objects produced by Cool-Seq-Tool are structured in conformance with the `Variation Representation Specification `_, courtesy of the `VRS-Python `_ library. diff --git a/setup.cfg b/setup.cfg index 3103a905..14ba77ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = cool_seq_tool -description = Common Operations On Lots-of Sequences Tool. +description = Common Operations On Lots of Sequences Tool long_description = file:README.md long_description_content_type = text/markdown author = Wagner Lab, Nationwide Childrens Hospital From 01bbdd6cec7867e54387f4d434ba39e1c00dc6fc Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 8 Dec 2023 20:41:14 -0500 Subject: [PATCH 09/34] Wrapping it up for the day --- cool_seq_tool/data/data_downloads.py | 3 ++ cool_seq_tool/handlers/seqrepo_access.py | 42 ++++++++++++--- cool_seq_tool/mappers/exon_genomic_coords.py | 23 ++++++--- cool_seq_tool/sources/uta_database.py | 54 ++++++++++++++------ cool_seq_tool/utils.py | 12 ++--- docs/source/transcript_selection.rst | 4 +- 6 files changed, 100 insertions(+), 38 deletions(-) diff --git a/cool_seq_tool/data/data_downloads.py b/cool_seq_tool/data/data_downloads.py index 2ad56863..dd42e74e 100644 --- a/cool_seq_tool/data/data_downloads.py +++ b/cool_seq_tool/data/data_downloads.py @@ -17,6 +17,9 @@ class DataDownload: """Manage downloadable data files. Responsible for checking if files are available under expected locations, and fetching them if not. + + Relevant methods are called automatically by data classes; users should not have + to interact with this class under normal circumstances. """ def __init__(self) -> None: diff --git a/cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py index b80d5164..520bf2fe 100644 --- a/cool_seq_tool/handlers/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -28,12 +28,16 @@ def get_reference_sequence( end: Optional[int] = None, residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Tuple[str, Optional[str]]: - """Get reference sequence for an accession given a start and end position. + """Get reference sequence for an accession given a start and end position. If + ``start`` and ``end`` are not given, returns the entire reference sequence. - If ``start`` and ``end`` are not given, it will return the entire reference - sequence - - # TODO exmaple + >>> from cool_seq_tool.handlers import SeqRepoAccess + >>> from biocommons.seqrepo import SeqRepo + >>> sr = SeqRepoAccess(SeqRepo("/usr/local/share/seqrepo/latest")) + >>> sr.get_reference_sequence("NM_002529.3", 1, 10)[0] + 'TGCAGCTGG' + >>> sr.get_reference_sequence("NP_001341538.1", 1, 10)[0] + 'MAALSGGGG' :param ac: Accession :param start: Start pos change @@ -96,7 +100,23 @@ def translate_identifier( ) -> Tuple[List[str], Optional[str]]: """Return list of identifiers for accession. - # TODO example + >>> from cool_seq_tool.handlers import SeqRepoAccess + >>> from biocommons.seqrepo import SeqRepo + >>> sr = SeqRepoAccess(SeqRepo("/usr/local/share/seqrepo/latest")) + >>> sr.translate_identifier("NM_002529.3")[0] + ['MD5:18f0a6e3af9e1bbd8fef1948c7156012', + 'NCBI:NM_002529.3', + 'refseq:NM_002529.3', + 'SEGUID:dEJQBkga9d9VeBHTyTbg6JEtTGQ', + 'SHA1:74425006481af5df557811d3c936e0e8912d4c64', + 'VMC:GS_RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA', + 'sha512t24u:RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA', + 'ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA'] + >>> sr.translate_identifier("NM_002529.3", "ga4gh")[0] + ['ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA'] + >>> # SeqRepo doesn't store Ensembl accession IDs + >>> sr.translate_identifier("ENST00000380152.8") + ([], 'SeqRepo unable to get translated identifiers for ENST00000380152.8') :param ac: Identifier accession :param target_namespace: The namespace(s) of identifier to return @@ -171,7 +191,15 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]: def get_fasta_file(self, sequence_id: str, outfile_path: Path) -> None: """Retrieve FASTA file containing sequence for requested sequence ID. - # TODO example + >>> from pathlib import Path + >>> from cool_seq_tool.handlers import SeqRepoAccess + >>> from biocommons.seqrepo import SeqRepo + >>> sr = SeqRepoAccess(SeqRepo("/usr/local/share/seqrepo/latest")) + >>> # write to local file tpm3.fasta: + >>> sr.get_fasta_file("NM_002529.3", Path("tpm3.fasta")) + + FASTA file headers will include GA4GH sequence digest, Ensembl accession ID, + and RefSeq accession ID. :param sequence_id: accession ID, sans namespace, eg ``NM_152263.3`` :param outfile_path: path to save file to diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index 57ad00d5..3573e4b5 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -191,14 +191,22 @@ async def genomic_to_transcript_exon_coordinates( residue_mode: ResidueMode = ResidueMode.RESIDUE, **kwargs, ) -> GenomicDataResponse: - """Get transcript data for genomic data. + """Get transcript data for genomic data, lifted over to GRCh38. - MANE Transcript data will be returned iff `transcript` is not supplied. ``gene`` - must be supplied in order to retrieve MANE Transcript data. + MANE Transcript data will be returned if and only if `transcript` is not + supplied. ``gene`` must be given in order to retrieve MANE Transcript data. - Liftovers genomic coordinates to GRCh38 - - TODO clean up and provide example + >>> from cool_seq_tool.app import CoolSeqTool + >>> egc = CoolSeqTool().ex_g_coords_mapper + >>> result = await egc.genomic_to_transcript_exon_coordinates( + ... chromosome="NC_000001.11", + ... start=154192136, + ... end=154170400, + ... strand=-1, + ... transcript="NM_152263.3" + ... ) + >>> (result.genomic_data.exon_start, result.genomic_data.exon_end) + (1, 8) :param chromosome: Chromosome. Must either give chromosome number (i.e. ``1``) or accession (i.e. ``NC_000001.11``). @@ -207,7 +215,8 @@ async def genomic_to_transcript_exon_coordinates( :param strand: Strand. Must be either ``-1`` or ``1``. :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining - Compatible Transcript + Compatible Transcript. See the :ref:`Transcript Selection policy ` + page. :param gene: Gene symbol :param residue_mode: Default is ``resiude`` (1-based). Must be either ``residue`` or ``inter-residue`` (0-based). diff --git a/cool_seq_tool/sources/uta_database.py b/cool_seq_tool/sources/uta_database.py index 90c7b3d5..d98a4ed1 100644 --- a/cool_seq_tool/sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -34,10 +34,16 @@ class UTADatabase: """Provide transcript lookup and metadata tools via the Universal Transcript Archive (UTA) database. - Downstream libraries should use the create() method to construct a new instance: + Users should use the create() method to construct a new instance: >>> from cool_seq_tool.sources.uta_database import UTADatabase - >>> uta = await UTADatabase.create() + >>> uta_db = await UTADatabase.create() + + This class uses the `asyncpg `_ + library to connect to PostgreSQL. This means that most methods are defined as + coroutines and must be called with ``await``. See the + `Python stdlib docs `_ + for more information. """ def __init__( @@ -149,7 +155,7 @@ async def create( instance. >>> from cool_seq_tool.sources.uta_database import UTADatabase - >>> uta = await UTADatabase.create() + >>> uta_db = await UTADatabase.create() :param cls: supplied implicitly :param db_url: PostgreSQL connection URL @@ -568,8 +574,8 @@ async def get_ac_descr(self, ac: str) -> Optional[str]: from older (pre-GRCh38) builds. >>> from cool_seq_tool.sources.uta_database import UTADatabase - >>> u = await UTADatabase.create() - >>> await u.get_ac_descr("NC_000001.10") + >>> uta_db = await UTADatabase.create() + >>> await uta_db.get_ac_descr("NC_000001.10") 'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly' :param ac: chromosome accession, e.g. ``"NC_000001.10"`` @@ -711,13 +717,25 @@ async def get_mane_c_genomic_data( ) -> Optional[Dict]: """Get MANE Transcript and genomic data. - Used when going from g -> MANE c - - # TODO example - - :param ac: MANE Transcript accession - :param alt_ac: NC Accession - :param start_pos: Genomic start position change + Used when going from g -> MANE c # TODO is this an HGVS reference? + + >>> from cool_seq_tool.sources import UTADatabase + >>> uta_db = await UTADatabase.create() + >>> result = await uta_db.get_mane_c_genomic_data( + ... "NM_004333.6", + ... None, + ... 140753335, + ... 140753335, + ... ) + >>> result["gene"] + 'BRAF' + >>> result["alt_ac"] + 'NC_000007.14' + + :param ac: MANE transcript accession + :param alt_ac: NC accession. Used to triangulate on correct genomic data. Can + be set to ``None`` if unavailable. + :param start_pos: Genomic start position change # TODO do we still want to say "change" here? :param end_pos: Genomic end position change :return: MANE transcript results if successful """ @@ -852,7 +870,12 @@ async def get_gene_from_ac( ) -> Optional[List[str]]: """Get transcripts from NC accession and positions. - :param ac: NC Accession + >>> from cool_seq_tool.sources import UTADatabase + >>> uta_db = await UTADatabase.create() + >>> await uta_db.get_gene_from_ac("NC_000017.11", 43044296, 43045802) + ['BRCA1'] + + :param ac: NC accession, e.g. ``"NC_000001.11"`` :param start_pos: Start position change :param end_pos: End position change :return: List of HGNC gene symbols @@ -1102,7 +1125,8 @@ def _set_liftover( genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1] async def p_to_c_ac(self, p_ac: str) -> List[str]: - """Return c. accession from p. accession. + """Return cDNA reference sequence accession from protein reference sequence + accession (i.e. ``p.`` to ``c.`` in HGVS syntax) :param p_ac: Protein accession :return: List of rows containing c. accessions that are associated with the @@ -1151,7 +1175,7 @@ async def get_transcripts_from_genomic_pos( @staticmethod def get_secret() -> str: - """Get secrets for UTA DB instances.""" + """Get secrets for UTA DB instances. Used for deployment on AWS.""" secret_name = environ["UTA_DB_SECRET"] region_name = "us-east-2" diff --git a/cool_seq_tool/utils.py b/cool_seq_tool/utils.py index 4f39ff2a..3c99d059 100644 --- a/cool_seq_tool/utils.py +++ b/cool_seq_tool/utils.py @@ -19,14 +19,10 @@ def get_inter_residue_pos( `VRS docs `_. This function is used internally to shift user-provided coordinates accordingly. - For example: - - .. code-block:: pycon - - >>> from cool_seq_tool.utils import get_inter_residue_pos - >>> from cool_seq_tool.schemas import ResidueMode - >>> get_inter_residue_pos(10, ResidueMode.RESIDUE) - ((9, 9), None) + >>> from cool_seq_tool.utils import get_inter_residue_pos + >>> from cool_seq_tool.schemas import ResidueMode + >>> get_inter_residue_pos(10, ResidueMode.RESIDUE) + ((9, 9), None) :param start_pos: Start position :param residue_mode: Residue mode for ``start_pos`` and ``end_pos`` diff --git a/docs/source/transcript_selection.rst b/docs/source/transcript_selection.rst index 3878d3fe..61759026 100644 --- a/docs/source/transcript_selection.rst +++ b/docs/source/transcript_selection.rst @@ -1,7 +1,9 @@ +.. _transcript_selection_policy: + Transcript Selection ==================== -One of the core uses of Cool-Seq-Tool is to acquire and use consensus, representative transcripts in performing genomic analysis. Here, we describe the selection processes, programmed in the :py:class:`MANETranscript ` class, for choosing the best available transcripts that are compatible with requested data. +One of the core uses of Cool-Seq-Tool is to acquire and use consensus-based, representative transcripts in performing genomic analysis. Here, we describe the selection processes, programmed in the :py:class:`MANETranscript ` class, for choosing the best available transcripts that are compatible with requested data. We rely heavily on transcripts annotated under the `Matched Annotation from NCBI and EMBL-EBI (MANE)` Transcripts project. For more information on the MANE project, see the `NCBI MANE page `_. From 09e9bc8de5e417d02c67f35b970663c87678389c Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 8 Dec 2023 20:42:32 -0500 Subject: [PATCH 10/34] Add docs action --- .github/workflows/checks.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 65f84f73..7750e8ce 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -30,3 +30,12 @@ jobs: - name: ruff uses: chartboost/ruff-action@v1 + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: ammaraskar/sphinx-action@master + with: + pre-build-command: 'python3 -m pip install --upgrade pip && python3 -m pip install .[docs]' + docs-folder: "docs/" From 07516029a21dfd88447807f93e3b9ea725e34972 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 8 Dec 2023 20:58:16 -0500 Subject: [PATCH 11/34] add note on rest moutning --- docs/source/usage.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 43a14f8b..7501ec64 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -29,6 +29,16 @@ Core Cool-Seq-Tool functions can also be performed via a REST HTTP interface, pr By default, ``uvicorn`` serves to port 8000. Once initialized, go to ``_ in a web browser for OpenAPI docs describing available endpoints. +REST routes are defined using the FastAPI ``APIRouter`` class, meaning that they can also be mounted to other FastAPI applications: + +.. code-block:: python + + from fastapi import FastAPI + from cool_seq_tool.routers import mane + + app = FastAPI() + app.include_router(mane.router) + .. _configuration: Environment configuration From b8beaed3e9dfa3afddd64581f127c67a7ac8766f Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 11:04:08 -0500 Subject: [PATCH 12/34] add mapper descriptions --- cool_seq_tool/mappers/exon_genomic_coords.py | 9 ++ cool_seq_tool/mappers/mane_transcript.py | 122 +++++++++++++++---- cool_seq_tool/sources/transcript_mappings.py | 6 +- cool_seq_tool/utils.py | 3 +- docs/source/transcript_selection.rst | 4 +- docs/source/usage.rst | 2 +- 6 files changed, 114 insertions(+), 32 deletions(-) diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index ea321f67..1e0819b3 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -37,6 +37,15 @@ def __init__(self, uta_db: UTADatabase, mane_transcript: MANETranscript) -> None >>> from cool_seq_tool.app import CoolSeqTool >>> egc = CoolSeqTool().ex_g_coords_mapper + Note that this class's methods are all defined as ``async``, so they will + need to be called with ``await``. + + >>> egc.ex_g_coords_mapper().transcript_to_genomic_coordinates( + ... transcript="NM_002529.3", + ... exon_end=17 + ... ) + + :param uta_db: UTADatabase instance to give access to query UTA database :param mane_transcript: Instance to align to MANE or compatible representation """ diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index 775b9dc1..361c1677 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -1,14 +1,15 @@ -"""Module for retrieving MANE Transcript from variation on p/c/g coordinate. +"""Retrieve MANE transcript from a location on p./c./g. coordinates. Steps: -1. Map annotation layer to genome -2. Liftover to preferred genome. We want to liftover to GRCh38. We do not support - getting MANE transcripts for GRCh36 and earlier assemblies. +#. Map annotation layer to genome +#. Liftover to preferred genome (GRCh38). GRCh36 and earlier assemblies are not supported + for fetching MANE transcripts. +#. Select preferred compatible annotation (see :ref:`transcript compatibility `) +#. Map back to correct annotation layer -3. Select preferred compatible annotation - -4. Map back to correct annotation layer +In addition to a mapper utility class, this module also defines several vocabulary +constraints and data models for coordinate representation. """ import logging import math @@ -47,7 +48,7 @@ class EndAnnotationLayer(StrEnum): class DataRepresentation(BaseModel): - """Create model for final output representation""" + """Define object model for final output representation""" gene: Optional[str] = None refseq: str @@ -58,7 +59,7 @@ class DataRepresentation(BaseModel): class CdnaRepresentation(DataRepresentation): - """Create model for coding dna representation""" + """Define object model for coding DNA representation""" coding_start_site: int coding_end_site: int @@ -66,7 +67,7 @@ class CdnaRepresentation(DataRepresentation): class GenomicRepresentation(BaseModel): - """Create model for genomic representation""" + """Define object model for genomic representation""" refseq: str pos: Tuple[int, int] @@ -75,7 +76,7 @@ class GenomicRepresentation(BaseModel): class ProteinAndCdnaRepresentation(BaseModel): - """Create model for protein and coding dna representation""" + """Define object model for protein and cDNA representation""" protein: DataRepresentation cdna: CdnaRepresentation @@ -93,6 +94,18 @@ def __init__( ) -> None: """Initialize the MANETranscript class. + A handful of resources are required for initialization, so when defaults are + enough, it's easiest to let the core CoolSeqTool class handle it for you: + + >>> from cool_seq_tool.app import CoolSeqTool + >>> mane_mapper = CoolSeqTool().mane_transcript + + Note that most methods are defined as Python coroutines, so they must be called + with ``await``: + + >>> mane_mapper.g_to_grch38("NC_000001.11", 100, 200) + + :param seqrepo_access: Access to seqrepo queries :param transcript_mappings: Access to transcript accession mappings and conversions @@ -577,11 +590,33 @@ async def get_longest_compatible_transcript( ) -> Optional[ Union[DataRepresentation, CdnaRepresentation, ProteinAndCdnaRepresentation] ]: - """Get longest compatible transcript from a gene. + """Get longest compatible transcript from a gene. See the documentation for + the :ref:`transcript compatibility policy ` for more + information. + + >>> from cool_seq_tool.app import CoolSeqTool + >>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode + >>> mane_mapper = CoolSeqTool().mane_transcript + >>> mane_transcripts = { + ... "ENST00000646891.2", + ... "NM_001374258.1", + ... "NM_004333.6", + ... "ENST00000644969.2", + ... } + >>> result = await mane_mapper.get_longest_compatible_transcript( + ... 599, + ... 599, + ... gene="BRAF", + ... start_annotation_layer=AnnotationLayer.PROTEIN, + ... residue_mode=ResidueMode.INTER_RESIDUE, + ... mane_transcripts=mane_transcripts, + ... ) + >>> result.refseq + 'NP_001365396.1' Try GRCh38 first, then GRCh37. - Transcript is compatible if it passes validation checks. + # TODO example for this? :param start_pos: Start position change :param end_pos: End position change @@ -819,9 +854,18 @@ async def get_mane_transcript( try_longest_compatible: bool = False, residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Union[DataRepresentation, CdnaRepresentation]]: - """Return mane transcript. - - # TODO example + """Return MANE transcript. + + >>> from cool_seq_tool.app import CoolSeqTool + >>> mane_mapper = CoolSeqTool().mane_transcript + >>> result = await mane_mapper.get_mane_transcript( + ... "NP_004324.2", + ... 599, + ... AnnotationLayer.PROTEIN, + ... residue_mode=ResidueMode.INTER_RESIDUE, + ... ) + >>> (result.gene, result.refseq, result.status) + ('BRAF', 'NP_004324.2', ) :param ac: Accession :param start_pos: Start position change @@ -956,8 +1000,8 @@ async def g_to_grch38( """Return genomic coordinate on GRCh38 when not given gene context. :param ac: Genomic accession - :param start_pos: Genomic start position change - :param end_pos: Genomic end position change + :param start_pos: Genomic start position + :param end_pos: Genomic end position :return: NC accession, start and end pos on GRCh38 assembly """ if end_pos is None: @@ -1032,22 +1076,50 @@ async def g_to_mane_c( self, ac: str, start_pos: int, - end_pos: int, + end_pos: int, # TODO is this supposed to be marked as optional (see examples)? gene: Optional[str] = None, residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Union[GenomicRepresentation, CdnaRepresentation]]: """Return MANE Transcript on the c. coordinate. - If gene is provided, g->GRCh38->MANE c. + >>> from cool_seq_tool.app import CoolSeqTool + >>> mane_mapper = CoolSeqTool().mane_transcript + + If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA + representation. + + >>> result = await mane_mapper.g_to_mane_c( + ... "NC_000007.13", + ... 55259515, + ... None, + ... gene="EGFR" + ... ) + >>> type(result) + cool_seq_tool.mappers.mane_transcript.CdnaRepresentation + >>> result.status + + + If no matching MANE cDNA representation is found, instead returns genomic + coordinates on GRCh38. - If MANE c. cannot be found, we return the genomic coordinate on GRCh38. + # TODO is this true? code makes it look like it returns None + # otherwise find example - If gene is not provided, g -> GRCh38. + Locating a MANE transcript requires a ``gene`` symbol argument -- if none is + given, only lifts over to genomic coordinates on GRCh38. + + >>> result = await mane_mapper.g_to_mane_c( + ... "NC_000007.13", 55259515, None + ... ) + >>> type(result) + cool_seq_tool.mappers.mane_transcript.GenomicRepresentation + >>> result.refseq, result.pos, result.status + ('NC_000007.14', (55191821, 55191821), ) :param ac: Transcript accession on g. coordinate - :param start_pos: genomic change start position - :param end_pos: genomic change end position - :param gene: Gene symbol + :param start_pos: genomic start position + :param end_pos: genomic end position + :param gene: HGNC gene symbol :param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``. Will always return coordinates in inter-residue. :return: MANE Transcripts with cDNA change on c. coordinate if gene diff --git a/cool_seq_tool/sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py index a6929d21..f935cf41 100644 --- a/cool_seq_tool/sources/transcript_mappings.py +++ b/cool_seq_tool/sources/transcript_mappings.py @@ -138,12 +138,10 @@ def protein_transcripts(self, identifier: str) -> List[str]: """Return a list of protein transcripts for a gene symbol. >>> from cool_seq_tool.sources import TranscriptMappings - >>> TranscriptMappings().protein_transcripts("BRAF") + >>> TranscriptMappings().protein_transcripts("BRAF")[:3] ['ENSP00000420119.2', 'ENSP00000288602', - 'NP_004324.2', - ... # etc - 'NP_001365402.1'] + 'NP_004324.2'] :param identifier: Gene identifier to get protein transcripts for :return: Protein transcripts for a gene symbol diff --git a/cool_seq_tool/utils.py b/cool_seq_tool/utils.py index 3c99d059..bf1e2222 100644 --- a/cool_seq_tool/utils.py +++ b/cool_seq_tool/utils.py @@ -51,7 +51,8 @@ def get_inter_residue_pos( @staticmethod def service_meta() -> ServiceMeta: - """Return ServiceMeta for cool_seq_tool + """Return description of request and service, including parameters like software + version for reproducibility. :return: ServiceMeta object """ diff --git a/docs/source/transcript_selection.rst b/docs/source/transcript_selection.rst index 61759026..32aa39cb 100644 --- a/docs/source/transcript_selection.rst +++ b/docs/source/transcript_selection.rst @@ -7,6 +7,8 @@ One of the core uses of Cool-Seq-Tool is to acquire and use consensus-based, rep We rely heavily on transcripts annotated under the `Matched Annotation from NCBI and EMBL-EBI (MANE)` Transcripts project. For more information on the MANE project, see the `NCBI MANE page `_. +.. _transcript_compatibility: + Transcript compatibility ------------------------ @@ -30,4 +32,4 @@ All compatible transcripts are evaluated and ordered against the below criteria. .. note:: - We prefer the most recent version of a transcript associated with an assembly. + We always prefer the most recent version of a transcript associated with an assembly. diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 7501ec64..d4ffe364 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -9,7 +9,7 @@ Cool-Seq-Tool provides easy access to, and useful operations on, a selection of * :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output -A core :py:class:`CoolSeqTool ` class encapsulates all of their functions and can be used for easy initialization: +The core :py:class:`CoolSeqTool ` class encapsulates all of their functions and can be used for easy initialization: .. code-block:: pycon From 37a89a5256cc40d2a4382cd75011798bf4631909 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 12:52:38 -0500 Subject: [PATCH 13/34] Add more docstring descr --- cool_seq_tool/mappers/alignment.py | 4 ++-- cool_seq_tool/mappers/mane_transcript.py | 10 ++++------ cool_seq_tool/sources/transcript_mappings.py | 2 +- cool_seq_tool/sources/uta_database.py | 7 +++---- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/cool_seq_tool/mappers/alignment.py b/cool_seq_tool/mappers/alignment.py index 62a50c95..ba9e2e9d 100644 --- a/cool_seq_tool/mappers/alignment.py +++ b/cool_seq_tool/mappers/alignment.py @@ -224,8 +224,8 @@ async def p_to_g( * Genomic representation (ac, positions) if able to translate. Will return positions as inter-residue coordinates. Else ``None``. - * and warnings. The genomic data will always return inter-residue coordinates - TODO something got cut off here + * Warnings. The genomic data will always return inter-residue coordinates + (# TODO: something got cut off here?) """ c_data, warning = await self.p_to_c( p_ac, p_start_pos, p_end_pos, residue_mode=residue_mode diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index 361c1677..95f9479c 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -511,9 +511,6 @@ def _validate_index( ) -> bool: """Validate that positions actually exist on accession - # TODO example - even though it's not public (it should be?) - :param ac: Accession :param pos: Start position change, End position change :param coding_start_site: coding start site for accession @@ -614,9 +611,10 @@ async def get_longest_compatible_transcript( >>> result.refseq 'NP_001365396.1' - Try GRCh38 first, then GRCh37. + If unable to find a match on GRCh38, this method will then attempt to drop down + to GRCh37. - # TODO example for this? + # TODO example for inputs that demonstrate this? :param start_pos: Start position change :param end_pos: End position change @@ -1076,7 +1074,7 @@ async def g_to_mane_c( self, ac: str, start_pos: int, - end_pos: int, # TODO is this supposed to be marked as optional (see examples)? + end_pos: int, gene: Optional[str] = None, residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Union[GenomicRepresentation, CdnaRepresentation]]: diff --git a/cool_seq_tool/sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py index f935cf41..333311db 100644 --- a/cool_seq_tool/sources/transcript_mappings.py +++ b/cool_seq_tool/sources/transcript_mappings.py @@ -16,7 +16,7 @@ class TranscriptMappings: In general, this class's methods expect to receive NCBI gene symbols, so users should be careful about the sourcing of their input in cases where terms are - conflicted or ambiguous. # TODO is that true? + conflicted or ambiguous. # TODO double-checking: NCBI not HGNC? """ def __init__( diff --git a/cool_seq_tool/sources/uta_database.py b/cool_seq_tool/sources/uta_database.py index d98a4ed1..2435840e 100644 --- a/cool_seq_tool/sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -715,9 +715,8 @@ def data_from_result(result: List) -> Optional[Dict]: async def get_mane_c_genomic_data( self, ac: str, alt_ac: Optional[str], start_pos: int, end_pos: int ) -> Optional[Dict]: - """Get MANE Transcript and genomic data. - - Used when going from g -> MANE c # TODO is this an HGVS reference? + """Get MANE transcript and genomic data. Used when going from g. to MANE c. + representation. >>> from cool_seq_tool.sources import UTADatabase >>> uta_db = await UTADatabase.create() @@ -735,7 +734,7 @@ async def get_mane_c_genomic_data( :param ac: MANE transcript accession :param alt_ac: NC accession. Used to triangulate on correct genomic data. Can be set to ``None`` if unavailable. - :param start_pos: Genomic start position change # TODO do we still want to say "change" here? + :param start_pos: Genomic start position :param end_pos: Genomic end position change :return: MANE transcript results if successful """ From 43a7f67349f0b5def746212351c261d1fd98619b Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 12:52:46 -0500 Subject: [PATCH 14/34] Update readme --- README.md | 116 +++++++++++------------------------------------------- 1 file changed, 22 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index 52bed28f..a0c6738e 100644 --- a/README.md +++ b/README.md @@ -1,109 +1,37 @@ -# **C**ommon **O**perations **O**n **L**ots-of **Seq**uences Tool +

+**C**ommon **O**perations **O**n **L**ots-of **Seq**uences Tool +

-The **cool-seq-tool** provides: +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr) [![tests](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg)](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml) - - Transcript alignment data from the [UTA](https://github.com/biocommons/uta) database - - Fast access to sequence data using [SeqRepo](https://github.com/biocommons/biocommons.seqrepo) - - Liftover between assemblies (GRCh38 <--> GRCh37) from [PyLiftover](https://github.com/konstantint/pyliftover) - - Lifting over to preferred [MANE](https://www.ncbi.nlm.nih.gov/refseq/MANE/) compatible transcript. See [here](docs/TranscriptSelectionPriority.md) for more information. - -## Installation - -### pip - -```commandline -pip install cool-seq-tool[dev,tests] -``` - -### UTA Database Installation - -`cool-seq-tool` uses intalls local UTA database. For other ways to install, visit [biocommons.uta](https://github.com/biocommons/uta). - -#### Local Installation - -_The following commands will likely need modification appropriate for the installation environment._ -1. Install [PostgreSQL](https://www.postgresql.org/) -2. Create user and database. - - ``` - $ createuser -U postgres uta_admin - $ createuser -U postgres anonymous - $ createdb -U postgres -O uta_admin uta - ``` - -3. To install locally, from the _cool_seq_tool/data_ directory: -``` -export UTA_VERSION=uta_20210129b.pgd.gz -curl -O https://dl.biocommons.org/uta/$UTA_VERSION -gzip -cdq ${UTA_VERSION} | grep -v "^REFRESH MATERIALIZED VIEW" | psql -h localhost -U uta_admin --echo-errors --single-transaction -v ON_ERROR_STOP=1 -d uta -p 5433 -``` - -##### UTA Installation Issues -If you have trouble installing UTA, you can visit [these two READMEs](https://github.com/ga4gh/vrs-python/tree/main/docs/setup_help). - -#### Connecting to the database +**[Documentation](#)** · [Installation](#) · [Usage](#) · [API reference](#) -To connect to the UTA database, you can use the default url (`postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b`). -If you do not wish to use the default, you must set the environment variable `UTA_DB_URL` which has the format of `driver://user:password@host:port/database/schema`. +## Overview -### Data Downloads + +The **CoolSeqTool** provides: -#### SeqRepo -`cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself. + - Transcript alignment data from the [UTA](https://github.com/biocommons/uta) database + - Fast access to sequence data using [SeqRepo](https://github.com/biocommons/biocommons.seqrepo) + - Liftover between assemblies (GRCh38 <--> GRCh37) from [PyLiftover](https://github.com/konstantint/pyliftover) + - Lifting over to preferred [MANE](https://www.ncbi.nlm.nih.gov/refseq/MANE/) compatible transcript. See [here]() for more information. + -Use the `SEQREPO_ROOT_DIR` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`. +--- -From the _root_ directory: -``` -pip install seqrepo -sudo mkdir /usr/local/share/seqrepo -sudo chown $USER /usr/local/share/seqrepo -seqrepo pull -i 2021-01-29 # Replace with latest version using `seqrepo list-remote-instances` if outdated -``` +## Install -If you get an error similar to the one below: -``` -PermissionError: [Error 13] Permission denied: '/usr/local/share/seqrepo/2021-01-29._fkuefgd' -> '/usr/local/share/seqrepo/2021-01-29' -``` +CoolSeqTool is available on [PyPI](https://pypi.org/project/cool-seq-tool) -You will want to do the following:\ -(*Might not be ._fkuefgd, so replace with your error message path*) -```console -sudo mv /usr/local/share/seqrepo/2021-01-29._fkuefgd /usr/local/share/seqrepo/2021-01-29 -exit +```shell +python3 -m pip install cool-seq-tool ``` -#### LRG_RefSeqGene - -`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene` if the environment variable `LRG_REFSEQGENE_PATH` is not set. When `LRG_REFSEQGENE_PATH` is set, `cool-seq-tool` will look at this path and expect the LRG_RefSeqGene file. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene). +See the [installation instructions](#) in the documentation for a description of dependency setup requirements. -#### MANE Summary Data +--- -`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz` if the environment variable `MANE_SUMMARY_PATH` is not set. When `MANE_SUMMARY_PATH` is set, `cool-seq-tool` will look at this path and expect the MANE Summary Data file. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/). - -#### transcript_mapping.tsv -`cool-seq-tool` is packaged with transcript mapping data acquired from [Ensembl BioMart](http://www.ensembl.org/biomart/martview). If the environment variable `TRANSCRIPT_MAPPINGS_PATH` is not set, `cool-seq-tool` will use the built-in file. When `TRANSCRIPT_MAPPINGS_PATH` is set, `cool_seq_tool` will look at this path and expect to find the transcript mapping TSV file. - -To acquire this data manually from the [BioMart](https://www.ensembl.org/biomart/martview), select the `Human Genes (GRCh38.p13)` dataset and choose the following attributes: - -* Gene stable ID -* Gene stable ID version -* Transcript stable ID -* Transcript stable ID version -* Protein stable ID -* Protein stable ID version -* RefSeq match transcript (MANE Select) -* Gene name - -![image](biomart.png) - -## Starting the UTA Tools Service Locally - -To start the service, run the following: - -```commandline -uvicorn cool_seq_tool.api:app --reload -``` +## Feedback and contributing -Next, view the FastAPI on your local machine: http://127.0.0.1:8000/cool_seq_tool +We welcome bug reports, feature requests, and code contributions from users and interested collaborators. The [documentation](#) contains guidance for submitting feedback and contributing new code. From e84f9bc5a2b61c765344eca37e59911896e438c8 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 12:54:09 -0500 Subject: [PATCH 15/34] testing this out --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0c6738e..407aaba4 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@

-**C**ommon **O**perations **O**n **L**ots-of **Seq**uences Tool +CoolSeqTool

-![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr) [![tests](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg)](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml) +

**[Documentation](#)** · [Installation](#) · [Usage](#) · [API reference](#) +

+ +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr) [![tests](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg)](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml) ## Overview From 5e0df5d6da7afb054cdea61ae9798837aee84d13 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 12:54:36 -0500 Subject: [PATCH 16/34] didnt work --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 407aaba4..0d42b670 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,10 @@ CoolSeqTool -

+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr) [![tests](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg)](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml) **[Documentation](#)** · [Installation](#) · [Usage](#) · [API reference](#) -

- -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr) [![tests](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg)](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml) ## Overview From a992e2d8da6f6147db906528050754ca5917ee0d Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 12:56:18 -0500 Subject: [PATCH 17/34] add rtd config --- .readthedocs.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..24bdd851 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,16 @@ +version: 2 + +build: + os: "ubuntu-20.04" + tools: + python: "3.11" + +python: + install: + - method: pip + path: . + extra_requirements: + - docs + +sphinx: + configuration: docs/source/conf.py From 4274b8e9bda41b417c3ed8bb840cfbabe0590686 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 13:29:50 -0500 Subject: [PATCH 18/34] Add dev dependency --- .github/workflows/checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 7750e8ce..78c97d8c 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,5 +37,5 @@ jobs: - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'python3 -m pip install --upgrade pip && python3 -m pip install .[docs]' + pre-build-command: 'python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From 3fe38aca26a7251e911b8b8957f1d16502a7a068 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 13:55:11 -0500 Subject: [PATCH 19/34] test --- .github/workflows/checks.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 78c97d8c..506a5a28 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -35,7 +35,17 @@ jobs: steps: - uses: actions/checkout@v3 + # manually perform setup + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.11 + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install '.[dev,docs]' + - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From d820f0f65c850313c55a35ab2d0a095f1e0d7864 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 13:58:52 -0500 Subject: [PATCH 20/34] try manual install --- .github/workflows/checks.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 506a5a28..88349f8a 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -35,17 +35,7 @@ jobs: steps: - uses: actions/checkout@v3 - # manually perform setup - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - - name: Install dependencies - run: | - python3 -m pip install --upgrade pip - python3 -m pip install '.[dev,docs]' - - uses: ammaraskar/sphinx-action@master with: + pre-build-command: 'apt-get -y install postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From 6dcdffb90234a679a07a2b4929ec2fbb8660e9bb Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:02:53 -0500 Subject: [PATCH 21/34] test --- .github/workflows/checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 88349f8a..278bde27 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,5 +37,5 @@ jobs: - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'apt-get -y install postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' + pre-build-command: 'apt-get update -y && apt-get install -y postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From e5813321d5a03afe0be4831d2e3ceaac112d2b1d Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:18:53 -0500 Subject: [PATCH 22/34] test --- .github/workflows/checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 278bde27..b30ac911 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,5 +37,5 @@ jobs: - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'apt-get update -y && apt-get install -y postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' + pre-build-command: 'apt-get update -y && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From 89854c30af67358b0e3ebdc5a27ed179caa339c3 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:19:48 -0500 Subject: [PATCH 23/34] test remove apt get --- .github/workflows/checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index b30ac911..a9516546 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,5 +37,5 @@ jobs: - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'apt-get update -y && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' + pre-build-command: 'python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From ea42e70b8d725b65b27e3f6b4c8667ff80f4d1fa Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:21:11 -0500 Subject: [PATCH 24/34] test add apt get back --- .github/workflows/checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index a9516546..278bde27 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,5 +37,5 @@ jobs: - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' + pre-build-command: 'apt-get update -y && apt-get install -y postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From 274fa9c590e15437df4f5adba00a0aedfefa8bf8 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:25:19 -0500 Subject: [PATCH 25/34] try sudo --- .github/workflows/checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 278bde27..154d95f9 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,5 +37,5 @@ jobs: - uses: ammaraskar/sphinx-action@master with: - pre-build-command: 'apt-get update -y && apt-get install -y postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' + pre-build-command: 'sudo apt-get update -y && sudo apt-get install -y postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' docs-folder: "docs/" From 40b681e5a49511d224f5d6951b758b58606bbf0e Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:29:57 -0500 Subject: [PATCH 26/34] use manual build --- .github/workflows/checks.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 154d95f9..df9abd0d 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -35,7 +35,16 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ammaraskar/sphinx-action@master + - name: Set up Python + uses: actions/setup-python@v4 with: - pre-build-command: 'sudo apt-get update -y && sudo apt-get install -y postgresql && python3 -m pip install --upgrade pip && python3 -m pip install .[dev,docs]' - docs-folder: "docs/" + python-version: 3.11 + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install '.[docs]' + + - name: Attempt docs build + working-directory: ./docs + run: make html From 22954bde25aa0bd9819b38e09825a81172116290 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:33:03 -0500 Subject: [PATCH 27/34] intentionally introduce an error --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d745f2a3..ae07b8d0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,7 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information - +from sdflkj import sdlfk project = "Cool-Seq-Tool" copyright = "2023, Wagner Lab" author = "Wagner Lab" From b0f91526cab10a92ca443c2a3ea7a05296078a97 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 22 Dec 2023 14:34:19 -0500 Subject: [PATCH 28/34] fixed! --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ae07b8d0..d745f2a3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,7 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -from sdflkj import sdlfk + project = "Cool-Seq-Tool" copyright = "2023, Wagner Lab" author = "Wagner Lab" From f2e25b8ea0aa26dcf8d4eec1085db6953af9e519 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 26 Dec 2023 10:16:37 -0500 Subject: [PATCH 29/34] update description of async --- cool_seq_tool/mappers/exon_genomic_coords.py | 5 +++-- cool_seq_tool/mappers/mane_transcript.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index 1e0819b3..c09ac3cf 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -40,11 +40,12 @@ def __init__(self, uta_db: UTADatabase, mane_transcript: MANETranscript) -> None Note that this class's methods are all defined as ``async``, so they will need to be called with ``await``. - >>> egc.ex_g_coords_mapper().transcript_to_genomic_coordinates( + >>> result = egc.transcript_to_genomic_coordinates( ... transcript="NM_002529.3", ... exon_end=17 ... ) - + >>> result.genomic_data + AttributeError: 'coroutine' object has no attribute 'genomic_data' :param uta_db: UTADatabase instance to give access to query UTA database :param mane_transcript: Instance to align to MANE or compatible representation diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index 95f9479c..f1beeafd 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -103,8 +103,9 @@ def __init__( Note that most methods are defined as Python coroutines, so they must be called with ``await``: - >>> mane_mapper.g_to_grch38("NC_000001.11", 100, 200) - + >>> result = mane_mapper.g_to_grch38("NC_000001.11", 100, 200) + >>> result['ac'] + TypeError: 'coroutine' object is not subscriptable :param seqrepo_access: Access to seqrepo queries :param transcript_mappings: Access to transcript accession mappings and From d26cab76f113055bc643589b0c7cd0cdd1354d85 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 26 Dec 2023 12:37:47 -0500 Subject: [PATCH 30/34] Address some comments --- cool_seq_tool/mappers/alignment.py | 6 +++--- cool_seq_tool/mappers/exon_genomic_coords.py | 8 ++++---- cool_seq_tool/mappers/mane_transcript.py | 9 ++------- cool_seq_tool/sources/transcript_mappings.py | 2 +- docs/source/conf.py | 6 ------ 5 files changed, 10 insertions(+), 21 deletions(-) diff --git a/cool_seq_tool/mappers/alignment.py b/cool_seq_tool/mappers/alignment.py index ba9e2e9d..627cddad 100644 --- a/cool_seq_tool/mappers/alignment.py +++ b/cool_seq_tool/mappers/alignment.py @@ -213,7 +213,8 @@ async def p_to_g( residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE, target_genome_assembly: Assembly = Assembly.GRCH38, ) -> Tuple[Optional[Dict], Optional[str]]: - """Translate protein representation to genomic representation + """Translate protein representation to genomic representation, by way of + intermediary conversion into cDNA coordinates. :param p_ac: Protein RefSeq accession :param p_start_pos: Protein start position @@ -224,8 +225,7 @@ async def p_to_g( * Genomic representation (ac, positions) if able to translate. Will return positions as inter-residue coordinates. Else ``None``. - * Warnings. The genomic data will always return inter-residue coordinates - (# TODO: something got cut off here?) + * Warnings, if conversion to cDNA or genomic coordinates fails. """ c_data, warning = await self.p_to_c( p_ac, p_start_pos, p_end_pos, residue_mode=residue_mode diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index c09ac3cf..468d6c9a 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -81,7 +81,7 @@ async def transcript_to_genomic_coordinates( """Get genomic data given transcript data. By default, inputs are assumed to be in GRCh38 where they aren't otherwise - precisely defined. # TODO is this correct? + precisely defined. >>> from cool_seq_tool.app import CoolSeqTool >>> egc = CoolSeqTool().ex_g_coords_mapper @@ -204,7 +204,7 @@ async def genomic_to_transcript_exon_coordinates( ) -> GenomicDataResponse: """Get transcript data for genomic data, lifted over to GRCh38. - MANE Transcript data will be returned if and only if `transcript` is not + MANE Transcript data will be returned if and only if ``transcript`` is not supplied. ``gene`` must be given in order to retrieve MANE Transcript data. >>> from cool_seq_tool.app import CoolSeqTool @@ -229,7 +229,7 @@ async def genomic_to_transcript_exon_coordinates( Compatible Transcript. See the :ref:`Transcript Selection policy ` page. :param gene: Gene symbol - :param residue_mode: Default is ``resiude`` (1-based). Must be either + :param residue_mode: Default is ``residue`` (1-based). Must be either ``residue`` or ``inter-residue`` (0-based). :return: Genomic data (inter-residue coordinates) """ @@ -331,7 +331,7 @@ async def _genomic_to_transcript_exon_coordinate( :param gene: Gene symbol :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is end position. - :param residue_mode: Default is ``resiude`` (1-based). Must be either ``residue`` + :param residue_mode: Default is ``residue`` (1-based). Must be either ``residue`` or ``inter-residue`` (0-based). :return: Transcript data (inter-residue coordinates) """ diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index f1beeafd..5c8fda09 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -856,6 +856,7 @@ async def get_mane_transcript( """Return MANE transcript. >>> from cool_seq_tool.app import CoolSeqTool + >>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode >>> mane_mapper = CoolSeqTool().mane_transcript >>> result = await mane_mapper.get_mane_transcript( ... "NP_004324.2", @@ -1098,14 +1099,8 @@ async def g_to_mane_c( >>> result.status - If no matching MANE cDNA representation is found, instead returns genomic - coordinates on GRCh38. - - # TODO is this true? code makes it look like it returns None - # otherwise find example - Locating a MANE transcript requires a ``gene`` symbol argument -- if none is - given, only lifts over to genomic coordinates on GRCh38. + given, this method will only lift over to genomic coordinates on GRCh38. >>> result = await mane_mapper.g_to_mane_c( ... "NC_000007.13", 55259515, None diff --git a/cool_seq_tool/sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py index 333311db..f3d4b67a 100644 --- a/cool_seq_tool/sources/transcript_mappings.py +++ b/cool_seq_tool/sources/transcript_mappings.py @@ -16,7 +16,7 @@ class TranscriptMappings: In general, this class's methods expect to receive NCBI gene symbols, so users should be careful about the sourcing of their input in cases where terms are - conflicted or ambiguous. # TODO double-checking: NCBI not HGNC? + conflicted or ambiguous (which, to be fair, should be relatively rare). """ def __init__( diff --git a/docs/source/conf.py b/docs/source/conf.py index d745f2a3..119bea7c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -49,12 +49,6 @@ "html": "", "class": "fa-solid fa-house", }, - { - "name": "Twitter", - "url": "https://twitter.com/genomicmedlab", - "html": "", - "class": "fa-solid fa-brands fa-twitter", - }, ], } # -- autodoc things ---------------------------------------------------------- From 5b44bbe69da59b7d989bdab99a0dec37d1fa7b9f Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 26 Dec 2023 12:37:57 -0500 Subject: [PATCH 31/34] fill in front matter --- README.md | 29 ++++++++++++++++++++++------- docs/source/index.rst | 13 +++++++------ docs/source/install.rst | 2 ++ docs/source/usage.rst | 10 +++++++++- 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0d42b670..1b49d0eb 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,16 @@ CoolSeqTool -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr) [![tests](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg)](https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml) - **[Documentation](#)** · [Installation](#) · [Usage](#) · [API reference](#) - ## Overview The **CoolSeqTool** provides: - - Transcript alignment data from the [UTA](https://github.com/biocommons/uta) database - - Fast access to sequence data using [SeqRepo](https://github.com/biocommons/biocommons.seqrepo) - - Liftover between assemblies (GRCh38 <--> GRCh37) from [PyLiftover](https://github.com/konstantint/pyliftover) - - Lifting over to preferred [MANE](https://www.ncbi.nlm.nih.gov/refseq/MANE/) compatible transcript. See [here]() for more information. + - A Pythonic API on top of sequence data of interest to tertiary analysis tools, including mappings between gene names and transcripts, [MANE transcript](https://www.ncbi.nlm.nih.gov/refseq/MANE/) descriptions, and the [Universal Transcript Archive](https://github.com/biocommons/uta) + - Augmented access to the [SeqRepo](https://github.com/biocommons/biocommons.seqrepo) database, including multiple additional methods and tools + - Mapping tools that combine the above to support translation between references sequences, annotation layers, and MANE transcripts --- @@ -32,6 +28,25 @@ See the [installation instructions](#) in the documentation for a description of --- +## Usage + +All CoolSeqTool resources can be initialized by way of a top-level class instance: + +```pycon +>>> from cool_seq_tool.app import CoolSeqTool +>>> cst = CoolSeqTool() +>>> result = await cst.mane_transcript.get_mane_transcript( +... "NP_004324.2", +... 599, +... AnnotationLayer.PROTEIN, +... residue_mode=ResidueMode.INTER_RESIDUE, +... ) +>>> result.gene, result.refseq, result.status +('EGFR', 'NM_005228.5', ) +``` + +--- + ## Feedback and contributing We welcome bug reports, feature requests, and code contributions from users and interested collaborators. The [documentation](#) contains guidance for submitting feedback and contributing new code. diff --git a/docs/source/index.rst b/docs/source/index.rst index 7f0ffd9c..2011e494 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,14 +1,15 @@ Cool-Seq-Tool |version| ======================= -.. image:: https://img.shields.io/pypi/pyversions/cool-seq-tool?color=gr - :alt: PyPI - Python Version +The **CoolSeqTool** provides: -.. image:: https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml/badge.svg - :alt: tests status - :target: https://github.com/genomicmedlab/cool-seq-tool/actions/workflows/github-actions.yml +* A Pythonic API on top of sequence data of interest to tertiary analysis tools, including mappings between gene names and transcripts, `MANE transcript `_ descriptions, and the `Universal Transcript Archive `_ +* Augmented access to the `SeqRepo `_ database, including multiple additional methods and tools +* Mapping tools that combine the above to support translation between various references sequences and annotation layers, and to MANE-designated transcripts -Description here. +See the :ref:`Installation ` and :ref:`Usage ` pages for information on getting started. Individual classes and methods are documented within the :ref:`API reference `. + +CoolSeqTool was created to support the `Knowledgebase Integration Project `_ of the `Variant Interpretation for Cancer Consortium (VICC) `_. It is developed primarily by the `Wagner Lab `_. Full source code is available on `GitHub `_. .. toctree:: :hidden: diff --git a/docs/source/install.rst b/docs/source/install.rst index 03c0a0ab..c6c66977 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -1,3 +1,5 @@ +.. _installation: + Installation ============ diff --git a/docs/source/usage.rst b/docs/source/usage.rst index d4ffe364..6b859b53 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -1,3 +1,5 @@ +.. _usage: + Usage ===== @@ -9,12 +11,18 @@ Cool-Seq-Tool provides easy access to, and useful operations on, a selection of * :ref:`Data mappers `, for functions that incorporate multiple sources/handlers to produce output -The core :py:class:`CoolSeqTool ` class encapsulates all of their functions and can be used for easy initialization: +The core :py:class:`CoolSeqTool ` class encapsulates all of their functions and can be used for easy initialization and access: .. code-block:: pycon >>> from cool_seq_tool.app import CoolSeqTool >>> cst = CoolSeqTool() + >>> cst.seqrepo_access.translate_alias("NM_002529.3")[0][-1] + 'ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA' + >>> cst.transcript_mappings.ensembl_protein_for_gene_symbol["BRAF"][0] + 'ENSP00000419060' + >>> await cst.uta_db.get_ac_from_gene("BRAF") + ['NC_000007.14', 'NC_000007.13'] Descriptions and examples of functions can be found in the :ref:`API Reference ` section. From ec43fcae351e275996668c17e8ba27ab22b8db55 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 26 Dec 2023 12:42:48 -0500 Subject: [PATCH 32/34] fix --- cool_seq_tool/handlers/seqrepo_access.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py index 520bf2fe..8f34e3a3 100644 --- a/cool_seq_tool/handlers/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -114,9 +114,6 @@ def translate_identifier( 'ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA'] >>> sr.translate_identifier("NM_002529.3", "ga4gh")[0] ['ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA'] - >>> # SeqRepo doesn't store Ensembl accession IDs - >>> sr.translate_identifier("ENST00000380152.8") - ([], 'SeqRepo unable to get translated identifiers for ENST00000380152.8') :param ac: Identifier accession :param target_namespace: The namespace(s) of identifier to return From 639e5e3bddbd40edd62417b05c564d56a716f4eb Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 26 Dec 2023 12:47:40 -0500 Subject: [PATCH 33/34] Update policy description --- docs/source/transcript_selection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/transcript_selection.rst b/docs/source/transcript_selection.rst index 32aa39cb..1d0ea0a2 100644 --- a/docs/source/transcript_selection.rst +++ b/docs/source/transcript_selection.rst @@ -15,8 +15,8 @@ Transcript compatibility The following validation checks are performed to determine compatibility of a transcript position: * The position exists on an accession -* "Validating reference sequences" # TODO what does this mean? -* "Validating exon structure" # TODO what does this mean? +* The sequence matches the expected reference sequence for a given accession or position +* Exon numbering matches known exon structure A transcript that fails to pass any of these checks is discarded as incompatible. From 9407820a6ce219f9e98e956287ae713ee35d20b2 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 26 Dec 2023 13:59:36 -0500 Subject: [PATCH 34/34] bump version --- cool_seq_tool/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cool_seq_tool/version.py b/cool_seq_tool/version.py index 9fa4000e..061f84a6 100644 --- a/cool_seq_tool/version.py +++ b/cool_seq_tool/version.py @@ -1,2 +1,2 @@ """Define package version.""" -__version__ = "0.4.0-dev0" +__version__ = "0.4.0-dev1"