Merge pull request #201 from TranslatorSRI/babel-1.3
This PR includes several minor fixes needed to build Babel:
- Upgrade:
  - UMLS to 2023AB
  - RxNorm to 01022024
  - Biolink Model to 3.6.0
- Reduced the log level for empty synonym lists from warning to debug to reduce the overall log size.
- I accidentally merged #206 into this branch instead of `master`, so it brings in a number of additional changes, including:
  - Adding `genefamily_outputs`, `umls_outputs` and `macromolecularcomplex_outputs`.
  - Adding the KGX export to the `all` target.
  - Expanding the `babel_outputs` volume to 500Gi to accommodate the KGX files.
- A bug fix for #197.
- KGX export now produces gzipped files.
- #207
- #202
- #218
- Includes some code from PR #217, but only produces a warning instead of skipping mappings.
gaurav authored Jan 24, 2024
2 parents 91bb1a2 + a8c4bbe commit 0735fe0
Showing 12 changed files with 333 additions and 15 deletions.
3 changes: 3 additions & 0 deletions Snakefile
@@ -13,6 +13,7 @@ include: "src/snakefiles/taxon.snakefile"
include: "src/snakefiles/genefamily.snakefile"
include: "src/snakefiles/leftover_umls.snakefile"
include: "src/snakefiles/macromolecular_complex.snakefile"
include: "src/snakefiles/exports.snakefile"

rule all:
input:
@@ -28,6 +29,8 @@ rule all:
config['output_directory'] + '/reports/umls_done',
config['output_directory'] + '/reports/macromolecular_complex_done',
config['output_directory'] + '/reports/drugchemical_done',
# Check if we have exported the compendia as KGX.
config['output_directory'] + '/kgx/done',
output:
x = config['output_directory'] + '/reports/all_done'
shell:
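(Illustrative aside, not part of this commit: the `kgx/done` target above is produced by `src/snakefiles/exports.snakefile`, which is not shown in this excerpt. As a rough sketch of the gzipped KGX output mentioned in the PR description — the file name and node fields below are hypothetical, and KGX also supports a TSV serialization — writing a node file as gzipped JSON lines might look like this:)

```python
# Hypothetical sketch of writing a gzipped KGX node file as JSON lines.
# The real export lives in src/snakefiles/exports.snakefile (not shown in this
# diff); the file name and node fields here are made up.
import gzip
import json

nodes = [
    {"id": "CHEBI:15365", "name": "acetylsalicylic acid",
     "category": ["biolink:SmallMolecule", "biolink:ChemicalEntity"]},
]

with gzip.open("ChemicalEntity.nodes.jsonl.gz", "wt", encoding="utf-8") as out:
    for node in nodes:
        out.write(json.dumps(node) + "\n")
```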
21 changes: 18 additions & 3 deletions config.json
@@ -4,9 +4,9 @@
"intermediate_directory": "babel_outputs/intermediate",
"output_directory": "babel_outputs",

"biolink_version": "3.5.4",
"umls_version": "2023AA",
"rxnorm_version": "08072023",
"biolink_version": "3.6.0",
"umls_version": "2023AB",
"rxnorm_version": "01022024",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
@@ -57,10 +57,25 @@
"genefamily_ids": ["PANTHER.FAMILY","HGNC.FAMILY"],
"genefamily_outputs": ["GeneFamily.txt"],

"umls_outputs": ["umls.txt"],
"macromolecularcomplex_outputs": ["MacromolecularComplex.txt"],
"ubergraph_iri_stem_to_prefix_map": {
"https://identifiers.org/ncbigene/": "NCBIGene",
"http://www.ncbi.nlm.nih.gov/gene/": "NCBIGene",
"http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=": "HGNC",
"http://www.informatics.jax.org/marker/MGI:": "MGI"
},

"preferred_name_boost_prefixes": {
"biolink:ChemicalEntity": [
"DRUGBANK",
"GTOPDB",
"DrugCentral",
"CHEMBL.COMPOUND",
"RXCUI",
"CHEBI",
"HMDB",
"PUBCHEM.COMPOUND"
]
}
}
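(Illustrative aside, not part of this commit: a minimal sketch of how an IRI-stem-to-prefix map like the `ubergraph_iri_stem_to_prefix_map` entry above can be used to turn an Ubergraph IRI into a CURIE. The consuming code is not shown in this diff, and the helper function below is hypothetical.)

```python
# Hypothetical helper, for illustration only -- the code that actually consumes
# ubergraph_iri_stem_to_prefix_map is not part of this diff.
IRI_STEM_TO_PREFIX = {
    "https://identifiers.org/ncbigene/": "NCBIGene",
    "http://www.ncbi.nlm.nih.gov/gene/": "NCBIGene",
    "http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=": "HGNC",
    "http://www.informatics.jax.org/marker/MGI:": "MGI",
}

def iri_to_curie(iri):
    """Map an IRI to a CURIE using the first matching stem, or return None.

    A production version might also need to normalize suffixes that already
    carry a prefix of their own.
    """
    for stem, prefix in IRI_STEM_TO_PREFIX.items():
        if iri.startswith(stem):
            return f"{prefix}:{iri[len(stem):]}"
    return None

print(iri_to_curie("https://identifiers.org/ncbigene/1017"))  # NCBIGene:1017
```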
2 changes: 1 addition & 1 deletion kubernetes/babel-outputs.k8s.yaml
@@ -15,5 +15,5 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 400Gi
storage: 500Gi
storageClassName: basic
76 changes: 70 additions & 6 deletions src/babel_utils.py
@@ -1,5 +1,6 @@
import logging
import subprocess
import traceback
from ftplib import FTP
from io import BytesIO
import gzip
@@ -273,6 +274,27 @@ def pull_via_wget(
raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.')


def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
"""
Given a list of identifiers (with `identifier` and `label` keys), sort them using
the following rules:
- Any identifier that has a prefix in prefixes is sorted based on its order in prefixes.
- Any identifier that does not have a prefix in prefixes is left in place.
:param identifiers: A list of identifiers to sort. This is a list of dictionaries
containing `identifier` and `label` keys, and possible others that we ignore.
:param prefixes: A list of prefixes, in the order in which they should be boosted.
We assume that CURIEs match these prefixes if they are in the form `{prefix}:...`.
:return: The list of identifiers sorted as described above.
"""

# Thanks to JetBrains AI.
return sorted(
identifiers,
key=lambda identifier: prefixes.index(identifier['identifier'].split(':', 1)[0]) if identifier['identifier'].split(':', 1)[0] in prefixes else len(prefixes)
)


def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
"""
:param synonym_list:
@@ -294,6 +316,10 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
node_factory = NodeFactory(make_local_name(''),biolink_version)
synonym_factory = SynonymFactory(make_local_name(''))

# Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when
# coming up with a preferred label for a particular Biolink class.
preferred_name_boost_prefixes = config['preferred_name_boost_prefixes']

# Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download
# directory.
if not icrdf_filename:
@@ -334,14 +360,51 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
# Why are we running the synonym list through set() again? Because get_synonyms returns unique pairs of (relation, synonym).
# So multiple identical synonyms may be returned as long they have a different relation. But since we don't care about the
# relation, we should get rid of any duplicated synonyms here.
synonyms_list = sorted(set(synonyms), key=lambda x:len(x))
synonyms_list = sorted(set(synonyms), key=lambda x: len(x))
try:
types = node_factory.get_ancestors(node["type"])
document = {"curie": curie,
"names": synonyms_list,
"types": [ t[8:] for t in node_factory.get_ancestors(node["type"])]} #remove biolink:

if "label" in node["identifiers"][0]:
document["preferred_name"] = node["identifiers"][0]["label"]
"types": [t[8:] for t in types]} # remove biolink:

# To pick a preferred label for this clique, we need to do three things:
# 1. We sort all labels in the preferred-name order. By default, this should be
# the preferred CURIE order, but if this clique is in one of the Biolink classes in
# preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list.
# 2. We filter out any suspicious labels.
# (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the
# algorithm proposed by Jeff at
# https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850)
# 3. We choose the first label that isn't blank. If no labels remain, we generate a warning.

# Step 1.1. Sort labels in boosted prefix order if possible.
possible_labels = []
for typ in types:
if typ in preferred_name_boost_prefixes:
# This is the most specific matching type, so we use this.
possible_labels = map(lambda identifier: identifier.get('label', ''),
sort_identifiers_with_boosted_prefixes(
node["identifiers"],
preferred_name_boost_prefixes[typ]
))
break

# Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their
# Biolink prefix order.
if not possible_labels:
possible_labels = map(lambda identifier: identifier.get('label', ''), node["identifiers"])

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [l for l in possible_labels if
l and # Ignore blank or empty names.
not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again.
]

# Step 3. Pick the first label that isn't blank.
if filtered_possible_labels:
document["preferred_name"] = filtered_possible_labels[0]
else:
logging.debug(f"No preferred name for {node}")

# We previously used the shortest length of a name as a proxy for how good a match it is, i.e. given
# two concepts that both have the word "acetaminophen" in them, we assume that the shorter one is the
@@ -351,7 +414,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i

# Since synonyms_list is sorted,
if len(synonyms_list) == 0:
logging.warning(f"Synonym list for {node} is empty: no valid name. Skipping.")
logging.debug(f"Synonym list for {node} is empty: no valid name. Skipping.")
continue
else:
document["shortest_name_length"] = len(synonyms_list[0])
@@ -371,6 +434,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
print(f"Exception thrown while write_compendium() was generating {ofname}: {ex}")
print(node["type"])
print(node_factory.get_ancestors(node["type"]))
traceback.print_exc()
exit()

def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}):
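(Illustrative aside, not part of this commit: a self-contained sketch of the three-step preferred-name selection added above — boosted-prefix sort, suspicious-label filter, first non-blank pick. The identifiers and labels below are made up, and the boost list copies the `biolink:ChemicalEntity` entry added to config.json.)

```python
# Illustrative sketch of the preferred-name selection added in this commit.
# The identifiers and labels are made up for demonstration purposes.
def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
    # Boosted prefixes sort to the front in the order given; everything else
    # keeps its relative position (sorted() is stable).
    return sorted(
        identifiers,
        key=lambda ident: prefixes.index(ident['identifier'].split(':', 1)[0])
        if ident['identifier'].split(':', 1)[0] in prefixes
        else len(prefixes)
    )

identifiers = [
    {'identifier': 'PUBCHEM.COMPOUND:2244', 'label': 'acetylsalicylic acid'},
    {'identifier': 'CHEMBL.COMPOUND:CHEMBL25', 'label': 'CHEMBL25'},
    {'identifier': 'DRUGBANK:DB00945', 'label': 'Aspirin'},
    {'identifier': 'CHEBI:15365', 'label': ''},
]
boost = ['DRUGBANK', 'GTOPDB', 'DrugCentral', 'CHEMBL.COMPOUND', 'RXCUI',
         'CHEBI', 'HMDB', 'PUBCHEM.COMPOUND']

# Step 1: sort labels in boosted-prefix order.
possible_labels = [i.get('label', '')
                   for i in sort_identifiers_with_boosted_prefixes(identifiers, boost)]
# -> ['Aspirin', 'CHEMBL25', '', 'acetylsalicylic acid']

# Step 2: drop blank labels and labels that are just a CHEMBL identifier again.
filtered = [l for l in possible_labels if l and not l.startswith('CHEMBL')]

# Step 3: the first remaining label becomes the preferred name.
print(filtered[0])  # Aspirin
```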
15 changes: 14 additions & 1 deletion src/createcompendia/drugchemical.py
@@ -273,11 +273,24 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
x = line.strip().split('\t')
subject = x[0]
object = x[2]
#object is a PUBCHEM. It's by definition a clique_leader.

if subject in drug_rxcui_to_clique:
subject = drug_rxcui_to_clique[subject]
elif subject in chemical_rxcui_to_clique:
subject = chemical_rxcui_to_clique[subject]
else:
raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")

if object in drug_rxcui_to_clique:
object = drug_rxcui_to_clique[object]
elif object in chemical_rxcui_to_clique:
object = chemical_rxcui_to_clique[object]
else:
logging.warning(
f"Object in subject-object pair ({subject}, {object}) isn't mapped to a RxCUI"
)
# raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")

pairs.append((subject, object))
print("glom")
gloms = {}
1 change: 1 addition & 0 deletions src/createcompendia/protein.py
@@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
dlpath = os.path.join(ensembl_dir, dl)
if os.path.isdir(dlpath):
infname = os.path.join(dlpath, 'BioMart.tsv')
print(f'write_ensembl_ids for input filename {infname}')
if os.path.exists(infname):
# open each ensembl file, find the id column, and put it in the output
with open(infname, 'r') as inf:
6 changes: 6 additions & 0 deletions src/datahandlers/ensembl.py
@@ -12,11 +12,17 @@
# just what we need.
def pull_ensembl(complete_file):
f = find_datasets()

skip_dataset_ids = {'hgfemale_gene_ensembl'}

cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
"external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
for ds in f['Dataset_ID']:
print(ds)
if ds in skip_dataset_ids:
print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
continue
outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
# Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
# config, and keep it up to date. Maybe you could have a job that gets the datasets and writes a dataset file,
12 changes: 9 additions & 3 deletions src/datahandlers/hgnc.py
@@ -1,9 +1,15 @@
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_urllib
import json

def pull_hgnc():
outfile='HGNC/hgnc_complete_set.json'
pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
# On 2023nov26, I would get an error trying to download this file using FTP on Python (although
# weirdly enough, I could download the file without any problem using macOS Finder). So I changed
# it to use HTTP instead.
pull_via_urllib(
'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
'hgnc_complete_set.json',
decompress=False,
subpath="HGNC")

def pull_hgnc_labels_and_synonyms(infile):
with open(infile,'r') as data:
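(Illustrative aside, not part of this commit: the FTP-to-HTTPS switch above goes through `pull_via_urllib` in `src/babel_utils.py`. A rough standard-library equivalent of the same download — the local output path here is hypothetical — would be:)

```python
# Rough standard-library equivalent of the FTP-to-HTTPS switch; the repository
# itself uses pull_via_urllib() from src/babel_utils.py. The local output path
# below is hypothetical.
import os
import urllib.request

url = "https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json"
outfile = "babel_downloads/HGNC/hgnc_complete_set.json"
os.makedirs(os.path.dirname(outfile), exist_ok=True)
urllib.request.urlretrieve(url, outfile)
```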