-
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Closes #94
- Loading branch information
Showing
8 changed files
with
458 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
"""Sources for PharmGKB.""" | ||
|
||
from .pharmgkb_chemical import PharmGKBChemicalGetter | ||
from .pharmgkb_disease import PharmGKBDiseaseGetter | ||
from .pharmgkb_gene import PharmGKBGeneGetter | ||
from .pharmgkb_pathway import PharmGKBPathwayGetter | ||
from .pharmgkb_variant import PharmGKBVariantGetter | ||
|
||
# Public API of the package: one getter class per PharmGKB resource.
__all__ = [
    "PharmGKBChemicalGetter",
    "PharmGKBDiseaseGetter",
    "PharmGKBGeneGetter",
    "PharmGKBPathwayGetter",
    "PharmGKBVariantGetter",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
"""An ontology representation of PharmGKB chemicals.""" | ||
|
||
from collections.abc import Iterable | ||
|
||
import pandas as pd | ||
|
||
from pyobo import Obo, Reference, Term, default_reference | ||
from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split | ||
from pyobo.struct.typedef import has_inchi, has_smiles | ||
|
||
__all__ = [ | ||
"PharmGKBChemicalGetter", | ||
] | ||
|
||
PREFIX = "pharmgkb.drug" | ||
URL = "https://api.pharmgkb.org/v1/download/file/data/chemicals.zip" | ||
|
||
|
||
class PharmGKBChemicalGetter(Obo):
    """Getter that exposes PharmGKB chemicals as an ontology.

    Term generation is delegated to the module-level :func:`iter_terms`.
    """

    # Both the ontology prefix and the bioversions key use the same prefix.
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True
    typedefs = [has_inchi, has_smiles]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms
|
||
|
||
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over terms for PharmGKB chemicals.

    One term is yielded for every distinct value of the ``Type`` column first,
    followed by one term for each row of ``chemicals.tsv`` that has an accession.

    :param force: Passed through to :func:`download_pharmgkb_tsv`
    :yields: Type terms, then chemical terms
    """
    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="chemicals.tsv", force=force)

    # Missing types are dropped up front: a NaN can't be lower-cased into a
    # local identifier and would otherwise crash the comprehension.
    type_to_ref = {
        typ: default_reference(PREFIX, typ.lower().replace(" ", "-").replace(",", ""), name=typ)
        for typ in df["Type"].dropna().unique()
    }
    for type_reference in type_to_ref.values():
        yield Term(reference=type_reference)

    for _, row in df.iterrows():
        identifier = row["PharmGKB Accession Id"]
        # Skip rows without an accession, as the sibling PharmGKB sources do
        if pd.isna(identifier):
            continue

        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])

        chemical_type = row["Type"]
        if pd.notna(chemical_type):
            term.append_parent(type_to_ref[chemical_type])

        if pd.notna(row["SMILES"]):
            term.annotate_object(has_smiles, Reference(prefix="smiles", identifier=row["SMILES"]))
        if pd.notna(row["InChI"]):
            term.annotate_object(has_inchi, Reference(prefix="inchi", identifier=row["InChI"]))

        for atc_id in split(row, "ATC Identifiers"):
            term.append_exact_match(Reference(prefix="atc", identifier=atc_id))
        for rxnorm_id in split(row, "RxNorm Identifiers"):
            term.append_exact_match(Reference(prefix="rxnorm", identifier=rxnorm_id))
        for pubchem_id in split(row, "PubChem Compound Identifiers"):
            term.append_exact_match(Reference(prefix="pubchem.compound", identifier=pubchem_id))

        # Both columns hold CURIE-like strings and are handled identically,
        # in the same order as before (external vocabulary, then cross-references).
        for column in ("External Vocabulary", "Cross-references"):
            for xref_curie in split(row, column):
                try:
                    reference = Reference.from_curie(xref_curie)
                except ValueError:
                    # Best effort: entries that can't be parsed as a CURIE are skipped
                    continue
                term.append_exact_match(reference)

        for trade_name in split(row, "Trade names"):
            # TODO use OMO term for trade name
            term.append_synonym(trade_name)

        # TODO add more

        yield term
|
||
|
||
if __name__ == "__main__":
    # Script entry point: hand control to the getter's ``cli`` entry point.
    PharmGKBChemicalGetter.cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
"""An ontology representation of PharmGKB phenotypes.""" | ||
|
||
from collections.abc import Iterable | ||
|
||
import pandas as pd | ||
|
||
from pyobo import Obo, Term | ||
from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split | ||
|
||
__all__ = [ | ||
"PharmGKBDiseaseGetter", | ||
] | ||
|
||
PREFIX = "pharmgkb.disease" | ||
URL = "https://api.pharmgkb.org/v1/download/file/data/phenotypes.zip" | ||
|
||
|
||
class PharmGKBDiseaseGetter(Obo):
    """Getter that exposes PharmGKB phenotypes as an ontology.

    Term generation is delegated to the module-level :func:`iter_terms`.
    """

    # Both the ontology prefix and the bioversions key use the same prefix.
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms
|
||
|
||
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over terms built from the rows of ``phenotypes.tsv``.

    Columns documented for the file:

    1. PharmGKB Accession Id = Identifier assigned to this phenotype by PharmGKB
    2. Name = Name PharmGKB uses for this phenotype
    3. Alternate Names = Other known names for this phenotype, comma-separated
    4. Cross-references = References to other resources in the form "resource:id", comma-separated
    5. External Vocabulary = Term for this phenotype in another vocabulary in the form "vocabulary:id", comma-separated

    :param force: Should the data be re-downloaded
    :yields: Terms
    """
    table = download_pharmgkb_tsv(PREFIX, url=URL, inner="phenotypes.tsv", force=force)
    for _, record in table.iterrows():
        accession = record["PharmGKB Accession Id"]
        # Only rows that carry an accession become terms
        if pd.notna(accession):
            phenotype = Term.from_triple(PREFIX, identifier=str(accession), name=record["Name"])
            for alternate_name in split(record, "Alternate Names"):
                phenotype.append_synonym(alternate_name)
            for reference in parse_xrefs(phenotype, record):
                phenotype.append_xref(reference)
            yield phenotype
|
||
|
||
if __name__ == "__main__":
    # Script entry point: build the getter, then write its default outputs.
    getter = PharmGKBDiseaseGetter()
    getter.write_default(force=True, write_obo=True, use_tqdm=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
"""An ontology representation of PharmGKB genes.""" | ||
|
||
from collections.abc import Iterable | ||
|
||
import pandas as pd | ||
|
||
from pyobo import Obo, Reference, Term | ||
from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split | ||
|
||
__all__ = [ | ||
"PharmGKBGeneGetter", | ||
] | ||
|
||
PREFIX = "pharmgkb.gene" | ||
URL = "https://api.pharmgkb.org/v1/download/file/data/genes.zip" | ||
|
||
|
||
class PharmGKBGeneGetter(Obo):
    """Getter that exposes PharmGKB genes as an ontology.

    Term generation is delegated to the module-level :func:`iter_terms`.
    """

    # Both the ontology prefix and the bioversions key use the same prefix.
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms
|
||
|
||
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over terms built from the rows of ``genes.tsv``.

    Columns documented for the file:

    1. PharmGKB Accession Id = Identifier assigned to this gene by PharmGKB
    2. NCBI Gene ID = Identifier assigned to this gene by NCBI
    3. HGNC ID = Identifier assigned to this gene by HGNC
    4. Ensembl Id = Identifier assigned to this gene by Ensembl
    5. Name = Canonical name for this gene (by HGNC)
    6. Symbol = Canonical name for this gene (by HGNC)
    7. Alternate Names = Other known names for this gene, comma-separated
    8. Alternate Symbols = Other known symbols for this gene, comma-separated
    9. Is VIP = "Yes" if PharmGKB has written a VIP annotation for this gene, "No" otherwise
    10. Has Variant Annotation = "Yes" if PharmGKB has written at least one variant annotation for this gene, "No" otherwise
    11. Cross-references = References to other resources in the form "resource:id", comma-separated
    12. Has CPIC Dosing Guideline = "Yes" if PharmGKB has annotated a CPIC guideline for this gene, "No" otherwise
    13. Chromosome = The chromosome this gene is on, in the form "chr##"
    14. Chromosomal Start - GRCh37 = Where this gene starts on the chromosomal sequence for NCBI GRCh37
    15. Chromosomal Stop - GRCh37 = Where this gene stops on the chromosomal sequence for NCBI GRCh37
    16. Chromosomal Start - GRCh38 = Where this gene starts on the chromosomal sequence for NCBI GRCh38
    17. Chromosomal Stop - GRCh38 = Where this gene stops on the chromosomal sequence for NCBI GRCh38

    :param force: Should the data be re-downloaded
    :yields: Terms
    """
    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="genes.tsv", force=force)

    for _, row in df.iterrows():
        identifier = row["PharmGKB Accession Id"]
        if pd.isna(identifier):
            continue

        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])

        # Guard every external identifier the same way: stringifying a missing
        # value would otherwise emit a bogus exact match such as ``ncbigene:nan``.
        if pd.notna(ncbigene_id := row["NCBI Gene ID"]):
            term.append_exact_match(Reference(prefix="ncbigene", identifier=str(ncbigene_id)))
        if pd.notna(hgnc_id := row["HGNC ID"]):
            term.append_exact_match(Reference(prefix="hgnc", identifier=str(hgnc_id)))
        if pd.notna(ensembl_id := row["Ensembl Id"]):
            term.append_exact_match(Reference(prefix="ensembl", identifier=str(ensembl_id)))

        for synonym in split(row, "Alternate Names"):
            term.append_synonym(synonym)

        # TODO symbol synonym type
        if pd.notna(row["Symbol"]):
            term.append_synonym(row["Symbol"])
        for synonym in split(row, "Alternate Symbols"):
            term.append_synonym(synonym)

        for xref in parse_xrefs(term, row):
            term.append_xref(xref)

        yield term
|
||
|
||
if __name__ == "__main__":
    # Script entry point: build the getter, then write its default outputs.
    getter = PharmGKBGeneGetter()
    getter.write_default(force=True, write_obo=True, use_tqdm=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
"""An ontology representation of PharmGKB pathways.""" | ||
|
||
import zipfile | ||
from collections.abc import Iterable | ||
|
||
from pyobo import Obo, Term | ||
from pyobo.sources.pharmgkb.utils import download_pharmgkb | ||
|
||
__all__ = [ | ||
"PharmGKBPathwayGetter", | ||
] | ||
|
||
PREFIX = "pharmgkb.pathways" | ||
BIOPAX_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-biopax.zip" | ||
TSV_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-tsv.zip" | ||
|
||
|
||
class PharmGKBPathwayGetter(Obo):
    """Getter that exposes PharmGKB pathways as an ontology.

    Term generation is delegated to the module-level :func:`iter_terms`.
    """

    # Both the ontology prefix and the bioversions key use the same prefix.
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`
        :returns: The iterable of terms produced by the module-level function
        """
        terms = iter_terms(force=force)
        return terms
|
||
|
||
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over pathway terms.

    The BioPAX archive is downloaded and one term is yielded for each archive
    member whose file name ends with ``.owl``; all other members are ignored.

    :param force: Should the data be re-downloaded
    :yields: Terms
    """
    path = download_pharmgkb(PREFIX, url=BIOPAX_URL, force=force)
    with zipfile.ZipFile(path) as zf:
        for zip_info in zf.infolist():
            if not zip_info.filename.endswith(".owl"):
                continue
            # Build the term while the member is open, but close it before
            # yielding so no handle is held while the consumer works.
            with zf.open(zip_info) as file:
                term = _process_biopax(zip_info, file)
            yield term
|
||
|
||
def _process_biopax(path: zipfile.ZipInfo, file) -> Term:
    """Build a pathway term from the file name of an archive member.

    The text before the first ``-`` in the file name is the identifier. The
    remainder, with everything from the last ``.`` onward removed and
    underscores turned into spaces, is the name.

    :param path: Archive entry whose ``filename`` is parsed
    :param file: Opened archive member; currently unused
    :returns: The term for this pathway
    """
    identifier, _, remainder = path.filename.partition("-")
    stem = remainder.rpartition(".")[0]
    # TODO parse file with pybiopax to include members and provenance
    return Term.from_triple(PREFIX, identifier, stem.replace("_", " "))
|
||
|
||
if __name__ == "__main__":
    # Script entry point: build the getter, then write its default outputs.
    getter = PharmGKBPathwayGetter()
    getter.write_default(force=True, write_obo=True, use_tqdm=True)
Oops, something went wrong.