TranslatorSRI · gaurav · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/config.json b/config.json
@@ -9,6 +9,8 @@
   "rxnorm_version": "09032024",
   "drugbank_version": "5-1-12",
 
+  "UMLS_UniProtKB_download_raw_url": "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv",
+
   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
   "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
   "mods": ["WormBase","FB","MGI","ZFIN","RGD","SGD"],
@@ -20,13 +22,13 @@
 
   "gene_labels": ["HGNC","NCBIGene","UMLS"],
   "gene_ids": ["ENSEMBL","HGNC","NCBIGene","UMLS","OMIM","ZFIN","WormBase","FB","MGI","RGD","SGD"],
-  "gene_concords": ["NCBIGene","NCBIGeneENSEMBL","medgen","UMLS"],
+  "gene_concords": ["NCBIGene","NCBIGeneENSEMBL","medgen","UMLS", "UMLS_NCBIGene"],
   "gene_outputs": ["Gene.txt"],
 
   "protein_labels": ["UniProtKB","PR","UMLS"],
   "protein_synonyms": ["PR", "UMLS"],
   "protein_ids": ["ENSEMBL","UniProtKB","PR","UMLS"],
-  "protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS"],
+  "protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS", "UMLS_UniProtKB"],
   "protein_outputs": ["Protein.txt"],
 
   "disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],

diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py
@@ -1,4 +1,11 @@
-from src.babel_utils import pull_via_urllib, make_local_name
+import csv
+import logging
+import os
+
+import requests
+from requests import request
+
+from src.babel_utils import pull_via_urllib, make_local_name, pull_via_wget
 
 
 def readlabels(which):
@@ -23,3 +30,44 @@ def pull_uniprot_labels(sprotfile,tremblfile,fname):
             labelfile.write(f'{k}\t{v}\n')
         for k,v in tlabels.items():
             labelfile.write(f'{k}\t{v}\n')
+
+
+def download_umls_gene_protein_mappings(umls_uniprotkb_raw_url, umls_uniprotkb_filename, umls_gene_concords, umls_protein_concords):
+    """
+    Download Chris Bizon's UMLS to NCBIGene/UniProtKB mappings from
+    https://github.com/cbizon/UMLS_UniProtKB and generate concord files so that
+    they can be incorporated into our gene and protein cliques.
+
+    :param umls_uniprotkb_raw_url: The URL to download the UMLS/UniProtKB mapping file from.
+    :param umls_uniprotkb_filename: The local file to save the downloaded mapping file to.
+    :param umls_gene_concords: The file to write UMLS/NCBIGene gene concords to.
+    :param umls_protein_concords: The file to write UMLS/UniProtKB protein concords to.
+    :raises RuntimeError: If the downloaded file does not have exactly the expected columns.
+    """
+    RELATION = 'oio:closeMatch'
+    EXPECTED_COLUMNS = {'UMLS_protein', 'UMLS_gene', 'NCBI_gene', 'UniProtKB'}
+
+    # Step 1. Download the file; the timeout stops a stalled connection from hanging the pipeline.
+    response = requests.get(umls_uniprotkb_raw_url, timeout=60)
+    response.raise_for_status()
+    with open(umls_uniprotkb_filename, 'w', encoding='utf-8') as f:
+        f.write(response.text)
+
+    # Step 2. Stream the downloaded rows into the gene and protein concord files.
+    os.makedirs(os.path.dirname(umls_gene_concords), exist_ok=True)
+    os.makedirs(os.path.dirname(umls_protein_concords), exist_ok=True)
+    count_rows = 0
+    with open(umls_uniprotkb_filename, 'r', encoding='utf-8') as f, \
+        open(umls_gene_concords, 'w', encoding='utf-8') as genef, \
+        open(umls_protein_concords, 'w', encoding='utf-8') as proteinf:
+        csv_reader = csv.DictReader(f, dialect='excel-tab')
+        if set(csv_reader.fieldnames or ()) != EXPECTED_COLUMNS:  # also catches empty and header-only downloads
+            raise RuntimeError(f"Format of the UniProtKB download from {umls_uniprotkb_raw_url} has changed: {csv_reader.fieldnames}.")
+        for row in csv_reader:
+            count_rows += 1
+            if row.keys() != EXPECTED_COLUMNS:  # an over-long row gains an extra None key from DictReader
+                raise RuntimeError(f"Format of the UniProtKB download from {umls_uniprotkb_raw_url} has changed: {csv_reader.fieldnames}.")
+            genef.write(f"{row['UMLS_gene']}\t{RELATION}\t{row['NCBI_gene']}\n")
+            proteinf.write(f"{row['UMLS_protein']}\t{RELATION}\t{row['UniProtKB']}\n")
+
+    logging.info(f"Downloaded UMLS file from {umls_uniprotkb_raw_url} and added {count_rows} to gene and protein concords.")
diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
@@ -121,6 +121,19 @@ rule get_uniprotkb_labels:
     run:
         uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile)
 
+rule get_umls_gene_protein_mappings:  # Download the UMLS/UniProtKB mapping file and derive gene and protein concords from it.
+    output:
+        umls_uniprotkb_filename=config['download_directory']+'/UMLS_UniProtKB/UMLS_UniProtKB.tsv',  # the mapping file as downloaded
+        umls_gene_concords=config['output_directory']+'/intermediate/gene/concords/UMLS_NCBIGene',  # UMLS gene / NCBI gene concords
+        umls_protein_concords=config['output_directory']+'/intermediate/protein/concords/UMLS_UniProtKB',  # UMLS protein / UniProtKB concords
+    run:
+        uniprotkb.download_umls_gene_protein_mappings(
+            config['UMLS_UniProtKB_download_raw_url'],  # source URL, set in config.json
+            output.umls_uniprotkb_filename,
+            output.umls_gene_concords,
+            output.umls_protein_concords,
+        )
+
 ### MESH
 
 rule get_mesh: