Skip to content

Commit

Permalink
Add PharmGKB sources (#304)
Browse files Browse the repository at this point in the history
Closes #94
  • Loading branch information
cthoyt authored Jan 19, 2025
1 parent 669f730 commit 028cfe3
Show file tree
Hide file tree
Showing 8 changed files with 458 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@
from .omim_ps import OMIMPSGetter
from .pathbank import PathBankGetter
from .pfam import PfamClanGetter, PfamGetter
from .pharmgkb import (
PharmGKBChemicalGetter,
PharmGKBDiseaseGetter,
PharmGKBGeneGetter,
PharmGKBPathwayGetter,
PharmGKBVariantGetter,
)
from .pid import PIDGetter
from .pombase import PomBaseGetter
from .pubchem import PubChemCompoundGetter
Expand Down Expand Up @@ -116,6 +123,11 @@
"PathBankGetter",
"PfamClanGetter",
"PfamGetter",
"PharmGKBChemicalGetter",
"PharmGKBDiseaseGetter",
"PharmGKBGeneGetter",
"PharmGKBPathwayGetter",
"PharmGKBVariantGetter",
"PomBaseGetter",
"PubChemCompoundGetter",
"RGDGetter",
Expand Down
15 changes: 15 additions & 0 deletions src/pyobo/sources/pharmgkb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Sources for PharmGKB."""

from .pharmgkb_chemical import PharmGKBChemicalGetter
from .pharmgkb_disease import PharmGKBDiseaseGetter
from .pharmgkb_gene import PharmGKBGeneGetter
from .pharmgkb_pathway import PharmGKBPathwayGetter
from .pharmgkb_variant import PharmGKBVariantGetter

__all__ = [
"PharmGKBChemicalGetter",
"PharmGKBDiseaseGetter",
"PharmGKBGeneGetter",
"PharmGKBPathwayGetter",
"PharmGKBVariantGetter",
]
80 changes: 80 additions & 0 deletions src/pyobo/sources/pharmgkb/pharmgkb_chemical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""An ontology representation of PharmGKB chemicals."""

from collections.abc import Iterable

import pandas as pd

from pyobo import Obo, Reference, Term, default_reference
from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split
from pyobo.struct.typedef import has_inchi, has_smiles

__all__ = [
"PharmGKBChemicalGetter",
]

PREFIX = "pharmgkb.drug"
URL = "https://api.pharmgkb.org/v1/download/file/data/chemicals.zip"


class PharmGKBChemicalGetter(Obo):
    """Getter that exposes PharmGKB chemicals as an ontology."""

    # The ontology prefix and the bioversions lookup key are both the module-level PREFIX
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True
    # Typedefs referenced by the SMILES/InChI annotations made in the module-level iter_terms
    typedefs = [has_inchi, has_smiles]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms by delegating to the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level function
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms


def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over terms for PharmGKB chemicals.

    First yields one term per distinct value in the ``Type`` column (these are
    used as parents), then yields one term per row of ``chemicals.tsv`` that
    has a PharmGKB accession identifier.

    :param force: Forwarded to :func:`download_pharmgkb_tsv`
    :yields: Type terms, followed by chemical terms
    """
    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="chemicals.tsv", force=force)

    # Missing types are dropped up front, since calling ``.lower()`` on NaN would raise
    type_to_ref = {
        typ: default_reference(PREFIX, typ.lower().replace(" ", "-").replace(",", ""), name=typ)
        for typ in df["Type"].dropna().unique()
    }
    for type_reference in type_to_ref.values():
        yield Term(reference=type_reference)

    # Columns whose values are bare local identifiers, paired with the prefix to attach
    identifier_columns = (
        ("ATC Identifiers", "atc"),
        ("RxNorm Identifiers", "rxnorm"),
        ("PubChem Compound Identifiers", "pubchem.compound"),
    )
    # Columns whose values are parsed as CURIEs
    curie_columns = ("External Vocabulary", "Cross-references")

    for _, row in df.iterrows():
        identifier = row["PharmGKB Accession Id"]
        if pd.isna(identifier):
            # Same guard as the sibling disease and gene sources: no identifier, no term
            continue

        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])
        if pd.notna(row["Type"]):
            term.append_parent(type_to_ref[row["Type"]])
        if pd.notna(row["SMILES"]):
            term.annotate_object(has_smiles, Reference(prefix="smiles", identifier=row["SMILES"]))
        if pd.notna(row["InChI"]):
            term.annotate_object(has_inchi, Reference(prefix="inchi", identifier=row["InChI"]))

        for column, xref_prefix in identifier_columns:
            for local_id in split(row, column):
                term.append_exact_match(Reference(prefix=xref_prefix, identifier=local_id))

        for column in curie_columns:
            for xref_curie in split(row, column):
                try:
                    reference = Reference.from_curie(xref_curie)
                except ValueError:
                    # Best effort: entries that can't be parsed as a CURIE are skipped
                    continue
                term.append_exact_match(reference)

        for trade_name in split(row, "Trade names"):
            # TODO use OMO term for trade name
            term.append_synonym(trade_name)

        # TODO add more

        yield term


# Runs only when this module is executed as a script, not when it is imported
if __name__ == "__main__":
    PharmGKBChemicalGetter.cli()
55 changes: 55 additions & 0 deletions src/pyobo/sources/pharmgkb/pharmgkb_disease.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""An ontology representation of PharmGKB phenotypes."""

from collections.abc import Iterable

import pandas as pd

from pyobo import Obo, Term
from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split

__all__ = [
"PharmGKBDiseaseGetter",
]

PREFIX = "pharmgkb.disease"
URL = "https://api.pharmgkb.org/v1/download/file/data/phenotypes.zip"


class PharmGKBDiseaseGetter(Obo):
    """Getter that exposes PharmGKB phenotypes as an ontology."""

    # The ontology prefix and the bioversions lookup key are both the module-level PREFIX
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms by delegating to the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level function
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms


def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over PharmGKB phenotype terms.

    One term is produced for each row of ``phenotypes.tsv`` that has a
    PharmGKB accession identifier; rows without one are skipped.

    Columns of the source file:

    1. PharmGKB Accession Id = Identifier assigned to this phenotype by PharmGKB
    2. Name = Name PharmGKB uses for this phenotype
    3. Alternate Names = Other known names for this phenotype, comma-separated
    4. Cross-references = References to other resources in the form
       "resource:id", comma-separated
    5. External Vocabulary = Term for this phenotype in another vocabulary in
       the form "vocabulary:id", comma-separated

    :param force: Should the data be re-downloaded
    :yields: Terms
    """
    phenotypes = download_pharmgkb_tsv(PREFIX, url=URL, inner="phenotypes.tsv", force=force)
    for _, record in phenotypes.iterrows():
        accession = record["PharmGKB Accession Id"]
        if pd.notna(accession):
            yield _phenotype_record_to_term(accession, record)


def _phenotype_record_to_term(accession, record) -> Term:
    """Build the term for a single phenotype row with a known accession."""
    term = Term.from_triple(PREFIX, identifier=str(accession), name=record["Name"])
    for alias in split(record, "Alternate Names"):
        term.append_synonym(alias)
    # parse_xrefs is given both the term and the row; whatever it yields is attached as an xref
    for cross_reference in parse_xrefs(term, record):
        term.append_xref(cross_reference)
    return term


# Runs only when this module is executed as a script, not when it is imported
if __name__ == "__main__":
    PharmGKBDiseaseGetter().write_default(force=True, write_obo=True, use_tqdm=True)
84 changes: 84 additions & 0 deletions src/pyobo/sources/pharmgkb/pharmgkb_gene.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""An ontology representation of PharmGKB genes."""

from collections.abc import Iterable

import pandas as pd

from pyobo import Obo, Reference, Term
from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split

__all__ = [
"PharmGKBGeneGetter",
]

PREFIX = "pharmgkb.gene"
URL = "https://api.pharmgkb.org/v1/download/file/data/genes.zip"


class PharmGKBGeneGetter(Obo):
    """Getter that exposes PharmGKB genes as an ontology."""

    # The ontology prefix and the bioversions lookup key are both the module-level PREFIX
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms by delegating to the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level function
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms


def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over PharmGKB gene terms.

    One term is produced for each row of ``genes.tsv`` that has a PharmGKB
    accession identifier; rows without one are skipped.

    Columns of the source file:

    1. PharmGKB Accession Id = Identifier assigned to this gene by PharmGKB
    2. NCBI Gene ID = Identifier assigned to this gene by NCBI
    3. HGNC ID = Identifier assigned to this gene by HGNC
    4. Ensembl Id = Identifier assigned to this gene by Ensembl
    5. Name = Canonical name for this gene (by HGNC)
    6. Symbol = Canonical name for this gene (by HGNC)
    7. Alternate Names = Other known names for this gene, comma-separated
    8. Alternate Symbols = Other known symbols for this gene, comma-separated
    9. Is VIP = "Yes" if PharmGKB has written a VIP annotation for this gene,
       "No" otherwise
    10. Has Variant Annotation = "Yes" if PharmGKB has written at least one
        variant annotation for this gene, "No" otherwise
    11. Cross-references = References to other resources in the form
        "resource:id", comma-separated
    12. Has CPIC Dosing Guideline = "Yes" if PharmGKB has annotated a CPIC
        guideline for this gene, "No" otherwise
    13. Chromosome = The chromosome this gene is on, in the form "chr##"
    14. Chromosomal Start - GRCh37 = Where this gene starts on the chromosomal
        sequence for NCBI GRCh37
    15. Chromosomal Stop - GRCh37 = Where this gene stops on the chromosomal
        sequence for NCBI GRCh37
    16. Chromosomal Start - GRCh38 = Where this gene starts on the chromosomal
        sequence for NCBI GRCh38
    17. Chromosomal Stop - GRCh38 = Where this gene stops on the chromosomal
        sequence for NCBI GRCh38

    :param force: Should the data be re-downloaded
    :yields: Terms
    """
    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="genes.tsv", force=force)

    for _, row in df.iterrows():
        identifier = row["PharmGKB Accession Id"]
        if pd.isna(identifier):
            continue

        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])

        # Every external identifier is guarded the same way, so that a missing
        # value is never stringified into a bogus "nan" exact match
        if pd.notna(ncbigene_id := row["NCBI Gene ID"]):
            term.append_exact_match(Reference(prefix="ncbigene", identifier=str(ncbigene_id)))
        if pd.notna(hgnc_id := row["HGNC ID"]):
            term.append_exact_match(Reference(prefix="hgnc", identifier=str(hgnc_id)))
        if pd.notna(ensembl_id := row["Ensembl Id"]):
            term.append_exact_match(Reference(prefix="ensembl", identifier=str(ensembl_id)))

        for synonym in split(row, "Alternate Names"):
            term.append_synonym(synonym)

        # TODO symbol synonym type
        if pd.notna(row["Symbol"]):
            term.append_synonym(row["Symbol"])
        for synonym in split(row, "Alternate Symbols"):
            term.append_synonym(synonym)

        for xref in parse_xrefs(term, row):
            term.append_xref(xref)

        yield term


# Runs only when this module is executed as a script, not when it is imported
if __name__ == "__main__":
    PharmGKBGeneGetter().write_default(force=True, write_obo=True, use_tqdm=True)
60 changes: 60 additions & 0 deletions src/pyobo/sources/pharmgkb/pharmgkb_pathway.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""An ontology representation of PharmGKB pathways."""

import zipfile
from collections.abc import Iterable

from pyobo import Obo, Term
from pyobo.sources.pharmgkb.utils import download_pharmgkb

__all__ = [
"PharmGKBPathwayGetter",
]

PREFIX = "pharmgkb.pathways"
BIOPAX_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-biopax.zip"
TSV_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-tsv.zip"


class PharmGKBPathwayGetter(Obo):
    """Getter that exposes PharmGKB pathways as an ontology."""

    # The ontology prefix and the bioversions lookup key are both the module-level PREFIX
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms by delegating to the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level function
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms


def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over PharmGKB pathway terms.

    Fetches the BioPAX archive (``BIOPAX_URL``) and yields one term for every
    member whose file name ends in ``.owl``. Each term's identifier and name
    are derived from the member's file name by :func:`_process_biopax`; the
    contents of the OWL files are not parsed yet.

    :param force: Forwarded to :func:`download_pharmgkb`
    :yields: One term per ``.owl`` member of the archive
    """
    path = download_pharmgkb(PREFIX, url=BIOPAX_URL, force=force)
    with zipfile.ZipFile(path) as zf:
        # infolist() is the documented accessor for the archive's ZipInfo entries
        for zip_info in zf.infolist():
            if not zip_info.filename.endswith(".owl"):
                continue
            # The member is opened so the processor can read it once parsing is implemented
            with zf.open(zip_info) as file:
                yield _process_biopax(zip_info, file)


def _process_biopax(path: zipfile.ZipInfo, file) -> Term:
    """Build a pathway term from the file name of one archive member.

    The text before the first ``-`` in the file name is the identifier. The
    remainder, cut at its last ``.`` and with underscores replaced by spaces,
    is the name. The ``file`` handle is accepted but not read yet.

    :param path: Archive entry whose ``filename`` is parsed
    :param file: Open handle to the entry (currently unused)
    :returns: A term in the ``PREFIX`` namespace
    """
    pieces = path.filename.partition("-")
    identifier = pieces[0]
    stem = pieces[2].rpartition(".")[0]
    label = stem.replace("_", " ")
    # TODO parse file with pybiopax to include members and provenance
    return Term.from_triple(PREFIX, identifier, label)


# Runs only when this module is executed as a script, not when it is imported
if __name__ == "__main__":
    PharmGKBPathwayGetter().write_default(force=True, write_obo=True, use_tqdm=True)
Loading

0 comments on commit 028cfe3

Please sign in to comment.