Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Incorporate UMLS_UniProtKB mappings #361

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
"rxnorm_version": "09032024",
"drugbank_version": "5-1-12",

"UMLS_UniProtKB_download_raw_url": "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
"mods": ["WormBase","FB","MGI","ZFIN","RGD","SGD"],
Expand All @@ -20,13 +22,13 @@

"gene_labels": ["HGNC","NCBIGene","UMLS"],
"gene_ids": ["ENSEMBL","HGNC","NCBIGene","UMLS","OMIM","ZFIN","WormBase","FB","MGI","RGD","SGD"],
"gene_concords": ["NCBIGene","NCBIGeneENSEMBL","medgen","UMLS"],
"gene_concords": ["NCBIGene","NCBIGeneENSEMBL","medgen","UMLS", "UMLS_NCBIGene"],
"gene_outputs": ["Gene.txt"],

"protein_labels": ["UniProtKB","PR","UMLS"],
"protein_synonyms": ["PR", "UMLS"],
"protein_ids": ["ENSEMBL","UniProtKB","PR","UMLS"],
"protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS"],
"protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS", "UMLS_UniProtKB"],
"protein_outputs": ["Protein.txt"],

"disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],
Expand Down
50 changes: 49 additions & 1 deletion src/datahandlers/uniprotkb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from src.babel_utils import pull_via_urllib, make_local_name
import csv
import logging
import os

import requests
from requests import request

from src.babel_utils import pull_via_urllib, make_local_name, pull_via_wget


def readlabels(which):
Expand All @@ -23,3 +30,44 @@ def pull_uniprot_labels(sprotfile,tremblfile,fname):
labelfile.write(f'{k}\t{v}\n')
for k,v in tlabels.items():
labelfile.write(f'{k}\t{v}\n')


def download_umls_gene_protein_mappings(umls_uniprotkb_raw_url, umls_uniprotkb_filename, umls_gene_concords, umls_protein_concords):
    """
    Download the UMLS to NCBIGene/UniProtKB mapping table and convert it into concord files.

    Chris Bizon generated a list of UMLS to NCBIGene/UniProtKB mappings in
    https://github.com/cbizon/UMLS_UniProtKB. This function downloads this file
    from that GitHub repository and generates concord files so that they can be
    incorporated into our gene and protein cliques.

    Every data row of the downloaded table produces one line in each concord
    file: ``UMLS_gene <TAB> oio:closeMatch <TAB> NCBI_gene`` in the gene concord
    and ``UMLS_protein <TAB> oio:closeMatch <TAB> UniProtKB`` in the protein concord.

    :param umls_uniprotkb_raw_url: The URL to download the UMLS/UniProtKB mapping file from.
    :param umls_uniprotkb_filename: The path at which to save the downloaded UMLS/UniProtKB mapping file.
    :param umls_gene_concords: The file to write UMLS/NCBIGene gene concords to.
    :param umls_protein_concords: The file to write UMLS/UniProtKB protein concords to.
    :raises requests.HTTPError: If the download does not return a success status.
    :raises RuntimeError: If the header of the downloaded table is not exactly the four expected
        columns (this includes an empty download), or if a data row has more or fewer cells than
        the header.
    """

    RELATION = 'oio:closeMatch'
    EXPECTED_COLUMNS = {'UMLS_protein', 'UMLS_gene', 'NCBI_gene', 'UniProtKB'}
    # (connect, read) timeouts in seconds. Without a timeout requests waits forever
    # on a stalled connection, which would hang the whole pipeline.
    DOWNLOAD_TIMEOUT = (30, 300)

    # Make sure every file we are about to write has a directory to live in.
    # A bare filename has an empty dirname, which os.makedirs() rejects, so skip those.
    for path in (umls_uniprotkb_filename, umls_gene_concords, umls_protein_concords):
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Step 1. Download the file.
    response = requests.get(umls_uniprotkb_raw_url, timeout=DOWNLOAD_TIMEOUT)
    response.raise_for_status()
    # Pin the encoding so that the file we write here is read back identically below,
    # whatever the locale of the machine running the pipeline.
    with open(umls_uniprotkb_filename, 'w', encoding='utf-8', newline='') as f:
        f.write(response.text)

    # Step 2. Stream the downloaded table row by row into the two concord files.
    count_rows = 0
    with open(umls_uniprotkb_filename, 'r', encoding='utf-8', newline='') as f, \
            open(umls_gene_concords, 'w', encoding='utf-8') as genef, \
            open(umls_protein_concords, 'w', encoding='utf-8') as proteinf:
        csv_reader = csv.DictReader(f, dialect='excel-tab')

        # Validate the header once, before writing anything. fieldnames is None
        # when the file is empty, which also counts as a format change.
        if set(csv_reader.fieldnames or ()) != EXPECTED_COLUMNS:
            raise RuntimeError(f"Format of the UniProtKB download from {umls_uniprotkb_raw_url} has changed: {csv_reader.fieldnames}.")

        for row in csv_reader:
            count_rows += 1

            # DictReader stores surplus cells under the key None and pads short rows
            # with None values; either would put garbage into the concords.
            if None in row or None in row.values():
                raise RuntimeError(
                    f"Malformed row at line {csv_reader.line_num} of the UniProtKB download "
                    f"from {umls_uniprotkb_raw_url}: {row}."
                )

            genef.write(f"{row['UMLS_gene']}\t{RELATION}\t{row['NCBI_gene']}\n")
            proteinf.write(f"{row['UMLS_protein']}\t{RELATION}\t{row['UniProtKB']}\n")

    logging.info(f"Downloaded UMLS file from {umls_uniprotkb_raw_url} and added {count_rows} to gene and protein concords.")
13 changes: 13 additions & 0 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,19 @@ rule get_uniprotkb_labels:
run:
uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile)

# Download the UMLS/UniProtKB mapping table (URL taken from the
# 'UMLS_UniProtKB_download_raw_url' config entry) and derive the
# UMLS_NCBIGene gene concord and the UMLS_UniProtKB protein concord from it.
rule get_umls_gene_protein_mappings:
    output:
        umls_uniprotkb_filename = config['download_directory'] + '/UMLS_UniProtKB/UMLS_UniProtKB.tsv',
        umls_gene_concords = config['output_directory'] + '/intermediate/gene/concords/UMLS_NCBIGene',
        umls_protein_concords = config['output_directory'] + '/intermediate/protein/concords/UMLS_UniProtKB',
    run:
        uniprotkb.download_umls_gene_protein_mappings(
            umls_uniprotkb_raw_url=config['UMLS_UniProtKB_download_raw_url'],
            umls_uniprotkb_filename=output.umls_uniprotkb_filename,
            umls_gene_concords=output.umls_gene_concords,
            umls_protein_concords=output.umls_protein_concords,
        )

### MESH

rule get_mesh:
Expand Down