Add PharmGKB sources (#304)

Closes #94
biopragmatics · Jan 19, 2025 · 028cfe3 · 028cfe3
1 parent 669f730
commit 028cfe3
Show file tree

Hide file tree

Showing 8 changed files with 458 additions and 0 deletions.
diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py
@@ -41,6 +41,13 @@
 from .omim_ps import OMIMPSGetter
 from .pathbank import PathBankGetter
 from .pfam import PfamClanGetter, PfamGetter
+from .pharmgkb import (
+    PharmGKBChemicalGetter,
+    PharmGKBDiseaseGetter,
+    PharmGKBGeneGetter,
+    PharmGKBPathwayGetter,
+    PharmGKBVariantGetter,
+)
 from .pid import PIDGetter
 from .pombase import PomBaseGetter
 from .pubchem import PubChemCompoundGetter
@@ -116,6 +123,11 @@
     "PathBankGetter",
     "PfamClanGetter",
     "PfamGetter",
+    "PharmGKBChemicalGetter",
+    "PharmGKBDiseaseGetter",
+    "PharmGKBGeneGetter",
+    "PharmGKBPathwayGetter",
+    "PharmGKBVariantGetter",
     "PomBaseGetter",
     "PubChemCompoundGetter",
     "RGDGetter",

diff --git a/src/pyobo/sources/pharmgkb/__init__.py b/src/pyobo/sources/pharmgkb/__init__.py
@@ -0,0 +1,15 @@
+"""Sources for PharmGKB."""
+
+from .pharmgkb_chemical import PharmGKBChemicalGetter
+from .pharmgkb_disease import PharmGKBDiseaseGetter
+from .pharmgkb_gene import PharmGKBGeneGetter
+from .pharmgkb_pathway import PharmGKBPathwayGetter
+from .pharmgkb_variant import PharmGKBVariantGetter
+
+# Names re-exported by this package; they match the getter classes imported above.
+__all__ = [
+    "PharmGKBChemicalGetter",
+    "PharmGKBDiseaseGetter",
+    "PharmGKBGeneGetter",
+    "PharmGKBPathwayGetter",
+    "PharmGKBVariantGetter",
+]
diff --git a/src/pyobo/sources/pharmgkb/pharmgkb_chemical.py b/src/pyobo/sources/pharmgkb/pharmgkb_chemical.py
@@ -0,0 +1,80 @@
+"""An ontology representation of PharmGKB chemicals."""
+
+from collections.abc import Iterable
+
+import pandas as pd
+
+from pyobo import Obo, Reference, Term, default_reference
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split
+from pyobo.struct.typedef import has_inchi, has_smiles
+
+# Only the getter class is exported from this module.
+__all__ = [
+    "PharmGKBChemicalGetter",
+]
+
+# Used below as the ontology name, the ``bioversions_key``, and the prefix of every term.
+PREFIX = "pharmgkb.drug"
+# Archive passed to download_pharmgkb_tsv() together with inner="chemicals.tsv".
+URL = "https://api.pharmgkb.org/v1/download/file/data/chemicals.zip"
+
+
+class PharmGKBChemicalGetter(Obo):
+    """Getter that exposes PharmGKB chemicals as an ontology."""
+
+    ontology = PREFIX
+    bioversions_key = PREFIX
+    dynamic_version = True
+    typedefs = [has_inchi, has_smiles]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Return the terms produced by the module-level :func:`iter_terms`."""
+        return iter_terms(force=force)
+
+
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Iterate over terms for PharmGKB chemicals.
+
+    :param force: Should the data be re-downloaded
+    :yields: One term per distinct non-empty value of the ``Type`` column,
+        followed by one term per row that has a PharmGKB accession identifier
+    """
+    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="chemicals.tsv", force=force)
+
+    # A missing type shows up as NaN (a float), which has no ``lower()``, so
+    # missing values are dropped before the type terms are built.
+    type_to_ref = {
+        typ: default_reference(PREFIX, typ.lower().replace(" ", "-").replace(",", ""), name=typ)
+        for typ in df["Type"].dropna().unique()
+    }
+    for type_ref in type_to_ref.values():
+        yield Term(reference=type_ref)
+
+    for _, row in df.iterrows():
+        # Same guard as the sibling disease/gene modules: a row without an
+        # accession identifier cannot be turned into a term.
+        identifier = row["PharmGKB Accession Id"]
+        if pd.isna(identifier):
+            continue
+        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])
+        if pd.notna(row["Type"]):
+            term.append_parent(type_to_ref[row["Type"]])
+        if pd.notna(row["SMILES"]):
+            term.annotate_object(has_smiles, Reference(prefix="smiles", identifier=row["SMILES"]))
+        if pd.notna(row["InChI"]):
+            term.annotate_object(has_inchi, Reference(prefix="inchi", identifier=row["InChI"]))
+        for atc_id in split(row, "ATC Identifiers"):
+            term.append_exact_match(Reference(prefix="atc", identifier=atc_id))
+        for rxnorm_id in split(row, "RxNorm Identifiers"):
+            term.append_exact_match(Reference(prefix="rxnorm", identifier=rxnorm_id))
+        for pubchem_id in split(row, "PubChem Compound Identifiers"):
+            term.append_exact_match(Reference(prefix="pubchem.compound", identifier=pubchem_id))
+
+        # Both columns hold CURIE-like strings and are handled identically;
+        # the order (external vocabulary first) is kept from the original code.
+        for column in ("External Vocabulary", "Cross-references"):
+            for xref_curie in split(row, column):
+                try:
+                    reference = Reference.from_curie(xref_curie)
+                except ValueError:
+                    # Best effort: entries that cannot be parsed as a CURIE are skipped.
+                    continue
+                term.append_exact_match(reference)
+
+        for trade_name in split(row, "Trade names"):
+            # TODO use OMO term for trade name
+            term.append_synonym(trade_name)
+
+        # TODO add more
+
+        yield term
+
+
+# Script entry point; hands off to the ``cli`` attribute of the getter class
+# (not defined in this module).
+if __name__ == "__main__":
+    PharmGKBChemicalGetter.cli()
diff --git a/src/pyobo/sources/pharmgkb/pharmgkb_disease.py b/src/pyobo/sources/pharmgkb/pharmgkb_disease.py
@@ -0,0 +1,55 @@
+"""An ontology representation of PharmGKB phenotypes."""
+
+from collections.abc import Iterable
+
+import pandas as pd
+
+from pyobo import Obo, Term
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split
+
+# Only the getter class is exported from this module.
+__all__ = [
+    "PharmGKBDiseaseGetter",
+]
+
+# Used below as the ontology name, the ``bioversions_key``, and the prefix of every term.
+PREFIX = "pharmgkb.disease"
+# Archive passed to download_pharmgkb_tsv() together with inner="phenotypes.tsv".
+URL = "https://api.pharmgkb.org/v1/download/file/data/phenotypes.zip"
+
+
+class PharmGKBDiseaseGetter(Obo):
+    """Getter that exposes PharmGKB phenotypes as an ontology."""
+
+    ontology = PREFIX
+    bioversions_key = PREFIX
+    dynamic_version = True
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Return the terms produced by the module-level :func:`iter_terms`."""
+        return iter_terms(force=force)
+
+
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Build one term for each phenotype row that carries an accession identifier.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+
+    Columns of the phenotypes table, as described by PharmGKB:
+
+    1. PharmGKB Accession Id = Identifier assigned to this phenotype by PharmGKB
+    2. Name = Name PharmGKB uses for this phenotype
+    3. Alternate Names = Other known names for this phenotype, comma-separated
+    4. Cross-references = References to other resources in the form "resource:id", comma-separated
+    5. External Vocabulary = Term for this phenotype in another vocabulary in the form "vocabulary:id", comma-separated
+    """
+    table = download_pharmgkb_tsv(PREFIX, url=URL, inner="phenotypes.tsv", force=force)
+    for _, record in table.iterrows():
+        accession = record["PharmGKB Accession Id"]
+        # rows without an accession identifier are skipped
+        if pd.isna(accession):
+            continue
+        phenotype = Term.from_triple(PREFIX, identifier=str(accession), name=record["Name"])
+        for alternate_name in split(record, "Alternate Names"):
+            phenotype.append_synonym(alternate_name)
+        for reference in parse_xrefs(phenotype, record):
+            phenotype.append_xref(reference)
+        yield phenotype
+
+
+# Script entry point: instantiate the getter and call write_default() with the
+# force, write_obo, and use_tqdm flags all switched on.
+if __name__ == "__main__":
+    PharmGKBDiseaseGetter().write_default(force=True, write_obo=True, use_tqdm=True)
diff --git a/src/pyobo/sources/pharmgkb/pharmgkb_gene.py b/src/pyobo/sources/pharmgkb/pharmgkb_gene.py
@@ -0,0 +1,84 @@
+"""An ontology representation of PharmGKB genes."""
+
+from collections.abc import Iterable
+
+import pandas as pd
+
+from pyobo import Obo, Reference, Term
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split
+
+# Only the getter class is exported from this module.
+__all__ = [
+    "PharmGKBGeneGetter",
+]
+
+# Used below as the ontology name, the ``bioversions_key``, and the prefix of every term.
+PREFIX = "pharmgkb.gene"
+# Archive passed to download_pharmgkb_tsv() together with inner="genes.tsv".
+URL = "https://api.pharmgkb.org/v1/download/file/data/genes.zip"
+
+
+class PharmGKBGeneGetter(Obo):
+    """Getter that exposes PharmGKB genes as an ontology."""
+
+    ontology = PREFIX
+    bioversions_key = PREFIX
+    dynamic_version = True
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Return the terms produced by the module-level :func:`iter_terms`."""
+        return iter_terms(force=force)
+
+
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Iterate over terms.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms, one per row that has a PharmGKB accession identifier
+
+    The NCBI Gene, HGNC, and Ensembl identifiers are each added as an exact
+    match only when the respective cell is not empty.
+
+    1. PharmGKB Accession Id = Identifier assigned to this gene by PharmGKB
+    2. NCBI Gene ID = Identifier assigned to this gene by NCBI
+    3. HGNC ID = Identifier assigned to this gene by HGNC
+    4. Ensembl Id = Identifier assigned to this gene by Ensembl
+    5. Name = Canonical name for this gene (by HGNC)
+    6. Symbol = Canonical name for this gene (by HGNC)
+    7. Alternate Names = Other known names for this gene, comma-separated
+    8. Alternate Symbols = Other known symbols for this gene, comma-separated
+    9. Is VIP = "Yes" if PharmGKB has written a VIP annotation for this gene, "No" otherwise
+    10. Has Variant Annotation = "Yes" if PharmGKB has written at least one variant annotation for this gene, "No" otherwise
+    11. Cross-references = References to other resources in the form "resource:id", comma-separated
+    12. Has CPIC Dosing Guideline = "Yes" if PharmGKB has annotated a CPIC guideline for this gene, "No" otherwise
+    13. Chromosome = The chromosome this gene is on, in the form "chr##"
+    14. Chromosomal Start - GRCh37 = Where this gene starts on the chromosomal sequence for NCBI GRCh37
+    15. Chromosomal Stop - GRCh37 = Where this gene stops on the chromosomal sequence for NCBI GRCh37
+    16. Chromosomal Start - GRCh38 = Where this gene starts on the chromosomal sequence for NCBI GRCh38
+    17. Chromosomal Stop - GRCh38 = Where this gene stops on the chromosomal sequence for NCBI GRCh38
+    """
+    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="genes.tsv", force=force)
+
+    for _, row in df.iterrows():
+        identifier = row["PharmGKB Accession Id"]
+        if pd.isna(identifier):
+            continue
+
+        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])
+
+        # Guarded like HGNC/Ensembl below: without the check, a missing cell
+        # would be stringified and emitted as the bogus match "ncbigene:nan".
+        if pd.notna(ncbigene_id := row["NCBI Gene ID"]):
+            term.append_exact_match(Reference(prefix="ncbigene", identifier=str(ncbigene_id)))
+        if pd.notna(hgnc_id := row["HGNC ID"]):
+            term.append_exact_match(Reference(prefix="hgnc", identifier=str(hgnc_id)))
+        if pd.notna(ensembl_id := row["Ensembl Id"]):
+            term.append_exact_match(Reference(prefix="ensembl", identifier=str(ensembl_id)))
+
+        for synonym in split(row, "Alternate Names"):
+            term.append_synonym(synonym)
+
+        # TODO symbol synonym type
+        if pd.notna(row["Symbol"]):
+            term.append_synonym(row["Symbol"])
+        for synonym in split(row, "Alternate Symbols"):
+            term.append_synonym(synonym)
+
+        for xref in parse_xrefs(term, row):
+            term.append_xref(xref)
+
+        yield term
+
+
+# Script entry point: instantiate the getter and call write_default() with the
+# force, write_obo, and use_tqdm flags all switched on.
+if __name__ == "__main__":
+    PharmGKBGeneGetter().write_default(force=True, write_obo=True, use_tqdm=True)
diff --git a/src/pyobo/sources/pharmgkb/pharmgkb_pathway.py b/src/pyobo/sources/pharmgkb/pharmgkb_pathway.py
@@ -0,0 +1,60 @@
+"""An ontology representation of PharmGKB pathways."""
+
+import zipfile
+from collections.abc import Iterable
+
+from pyobo import Obo, Term
+from pyobo.sources.pharmgkb.utils import download_pharmgkb
+
+# Only the getter class is exported from this module.
+__all__ = [
+    "PharmGKBPathwayGetter",
+]
+
+# Used below as the ontology name, the ``bioversions_key``, and the prefix of every term.
+PREFIX = "pharmgkb.pathways"
+# BioPAX archive; this is the URL handed to download_pharmgkb() in iter_terms().
+BIOPAX_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-biopax.zip"
+# TSV archive; defined here but not referenced by the code visible in this module.
+TSV_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-tsv.zip"
+
+
+class PharmGKBPathwayGetter(Obo):
+    """Getter that exposes PharmGKB pathways as an ontology."""
+
+    ontology = PREFIX
+    bioversions_key = PREFIX
+    dynamic_version = True
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Return the terms produced by the module-level :func:`iter_terms`."""
+        return iter_terms(force=force)
+
+
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Iterate over terms, one per BioPAX file in the PharmGKB pathways archive.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+
+    Only members of the archive whose file name ends with ``.owl`` are used.
+    Each member is opened and handed to :func:`_process_biopax` together with
+    its archive entry.
+    """
+    path = download_pharmgkb(PREFIX, url=BIOPAX_URL, force=force)
+    with zipfile.ZipFile(path) as zf:
+        # infolist() is the documented accessor for the archive's members
+        for zip_info in zf.infolist():
+            if not zip_info.filename.endswith(".owl"):
+                continue
+            with zf.open(zip_info) as file:
+                yield _process_biopax(zip_info, file)
+
+
+def _process_biopax(path: zipfile.ZipInfo, file) -> Term:
+    """Create a term from the file name of a BioPAX archive member.
+
+    The text before the first ``-`` becomes the identifier. The remainder, cut
+    at its last ``.`` and with underscores turned into spaces, becomes the name.
+    The ``file`` handle is accepted but not read yet.
+    """
+    identifier, _dash, remainder = path.filename.partition("-")
+    label = remainder.rpartition(".")[0].replace("_", " ")
+    # TODO parse file with pybiopax to include members and provenance
+    return Term.from_triple(PREFIX, identifier, label)
+
+
+# Script entry point: instantiate the getter and call write_default() with the
+# force, write_obo, and use_tqdm flags all switched on.
+if __name__ == "__main__":
+    PharmGKBPathwayGetter().write_default(force=True, write_obo=True, use_tqdm=True)