Merge pull request #201 from TranslatorSRI/babel-1.3
This PR includes several minor fixes needed to build Babel:
- Upgrade:
  - UMLS to 2023AB
  - RxNorm to 01022024
  - Biolink Model to 3.6.0
- Reduced the log level for empty synonym lists from warning to debug to reduce the overall log size.
- I accidentally merged #206 into this branch instead of `master`, so it brings in a number of additional changes, including:
  - Adding `genefamily_outputs`, `umls_outputs` and `macromolecularcomplex_outputs`.
  - Adding the KGX export to the `all` target.
  - Expanding the `babel_outputs` volume to 500Gi to accommodate the KGX files.
- A bug fix for #197.
- KGX export now produces gzipped files.
- #207
- #202
- #218
- Includes some code from PR #217, but only produces a warning instead of skipping mappings.
gaurav authored Jan 24, 2024
2 parents 91bb1a2 + a8c4bbe commit 0735fe0
Showing 12 changed files with 333 additions and 15 deletions.
3 changes: 3 additions & 0 deletions Snakefile
@@ -13,6 +13,7 @@ include: "src/snakefiles/taxon.snakefile"
include: "src/snakefiles/genefamily.snakefile"
include: "src/snakefiles/leftover_umls.snakefile"
include: "src/snakefiles/macromolecular_complex.snakefile"
include: "src/snakefiles/exports.snakefile"

rule all:
input:
@@ -28,6 +29,8 @@ rule all:
config['output_directory'] + '/reports/umls_done',
config['output_directory'] + '/reports/macromolecular_complex_done',
config['output_directory'] + '/reports/drugchemical_done',
# Check if we have exported the compendia as KGX.
config['output_directory'] + '/kgx/done',
output:
x = config['output_directory'] + '/reports/all_done'
shell:
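(Illustrative aside, not part of this commit: the `kgx/done` target above is produced by `src/snakefiles/exports.snakefile`, which is not shown in this excerpt. As a rough sketch of the gzipped KGX output mentioned in the PR description — the file name and node fields below are hypothetical, and KGX also supports a TSV serialization — writing a node file as gzipped JSON lines might look like this:)

```python
# Hypothetical sketch of writing a gzipped KGX node file as JSON lines.
# The real export lives in src/snakefiles/exports.snakefile (not shown in this
# diff); the file name and node fields here are made up.
import gzip
import json

nodes = [
    {"id": "CHEBI:15365", "name": "acetylsalicylic acid",
     "category": ["biolink:SmallMolecule", "biolink:ChemicalEntity"]},
]

with gzip.open("ChemicalEntity.nodes.jsonl.gz", "wt", encoding="utf-8") as out:
    for node in nodes:
        out.write(json.dumps(node) + "\n")
```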
21 changes: 18 additions & 3 deletions config.json
@@ -4,9 +4,9 @@
"intermediate_directory": "babel_outputs/intermediate",
"output_directory": "babel_outputs",

"biolink_version": "3.5.4",
"umls_version": "2023AA",
"rxnorm_version": "08072023",
"biolink_version": "3.6.0",
"umls_version": "2023AB",
"rxnorm_version": "01022024",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
@@ -57,10 +57,25 @@
"genefamily_ids": ["PANTHER.FAMILY","HGNC.FAMILY"],
"genefamily_outputs": ["GeneFamily.txt"],

"umls_outputs": ["umls.txt"],
"macromolecularcomplex_outputs": ["MacromolecularComplex.txt"],
"ubergraph_iri_stem_to_prefix_map": {
"https://identifiers.org/ncbigene/": "NCBIGene",
"http://www.ncbi.nlm.nih.gov/gene/": "NCBIGene",
"http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=": "HGNC",
"http://www.informatics.jax.org/marker/MGI:": "MGI"
},

"preferred_name_boost_prefixes": {
"biolink:ChemicalEntity": [
"DRUGBANK",
"GTOPDB",
"DrugCentral",
"CHEMBL.COMPOUND",
"RXCUI",
"CHEBI",
"HMDB",
"PUBCHEM.COMPOUND"
]
}
}
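(Illustrative aside, not part of this commit: a minimal sketch of how an IRI-stem-to-prefix map like the `ubergraph_iri_stem_to_prefix_map` entry above can be used to turn an Ubergraph IRI into a CURIE. The consuming code is not shown in this diff, and the helper function below is hypothetical.)

```python
# Hypothetical helper, for illustration only -- the code that actually consumes
# ubergraph_iri_stem_to_prefix_map is not part of this diff.
IRI_STEM_TO_PREFIX = {
    "https://identifiers.org/ncbigene/": "NCBIGene",
    "http://www.ncbi.nlm.nih.gov/gene/": "NCBIGene",
    "http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=": "HGNC",
    "http://www.informatics.jax.org/marker/MGI:": "MGI",
}

def iri_to_curie(iri):
    """Map an IRI to a CURIE using the first matching stem, or return None.

    A production version might also need to normalize suffixes that already
    carry a prefix of their own.
    """
    for stem, prefix in IRI_STEM_TO_PREFIX.items():
        if iri.startswith(stem):
            return f"{prefix}:{iri[len(stem):]}"
    return None

print(iri_to_curie("https://identifiers.org/ncbigene/1017"))  # NCBIGene:1017
```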
2 changes: 1 addition & 1 deletion kubernetes/babel-outputs.k8s.yaml
@@ -15,5 +15,5 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 400Gi
storage: 500Gi
storageClassName: basic
76 changes: 70 additions & 6 deletions src/babel_utils.py
@@ -1,5 +1,6 @@
import logging
import subprocess
import traceback
from ftplib import FTP
from io import BytesIO
import gzip
@@ -273,6 +274,27 @@ def pull_via_wget(
raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.')


def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
"""
Given a list of identifiers (with `identifier` and `label` keys), sort them using
the following rules:
- Any identifier that has a prefix in prefixes is sorted based on its order in prefixes.
- Any identifier that does not have a prefix in prefixes is left in place.
:param identifiers: A list of identifiers to sort. This is a list of dictionaries
containing `identifier` and `label` keys, and possible others that we ignore.
:param prefixes: A list of prefixes, in the order in which they should be boosted.
We assume that CURIEs match these prefixes if they are in the form `{prefix}:...`.
:return: The list of identifiers sorted as described above.
"""

# Thanks to JetBrains AI.
return sorted(
identifiers,
key=lambda identifier: prefixes.index(identifier['identifier'].split(':', 1)[0]) if identifier['identifier'].split(':', 1)[0] in prefixes else len(prefixes)
)


def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
"""
:param synonym_list:
@@ -294,6 +316,10 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
node_factory = NodeFactory(make_local_name(''),biolink_version)
synonym_factory = SynonymFactory(make_local_name(''))

# Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when
# coming up with a preferred label for a particular Biolink class.
preferred_name_boost_prefixes = config['preferred_name_boost_prefixes']

# Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download
# directory.
if not icrdf_filename:
@@ -334,14 +360,51 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
# Why are we running the synonym list through set() again? Because get_synonyms returns unique pairs of (relation, synonym).
# So multiple identical synonyms may be returned as long they have a different relation. But since we don't care about the
# relation, we should get rid of any duplicated synonyms here.
synonyms_list = sorted(set(synonyms), key=lambda x:len(x))
synonyms_list = sorted(set(synonyms), key=lambda x: len(x))
try:
types = node_factory.get_ancestors(node["type"])
document = {"curie": curie,
"names": synonyms_list,
"types": [ t[8:] for t in node_factory.get_ancestors(node["type"])]} #remove biolink:

if "label" in node["identifiers"][0]:
document["preferred_name"] = node["identifiers"][0]["label"]
"types": [t[8:] for t in types]} # remove biolink:

# To pick a preferred label for this clique, we need to do three things:
# 1. We sort all labels in the preferred-name order. By default, this should be
# the preferred CURIE order, but if this clique is in one of the Biolink classes in
# preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list.
# 2. We filter out any suspicious labels.
# (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the
# algorithm proposed by Jeff at
# https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850)
# 3. We choose the first label that isn't blank. If no labels remain, we generate a warning.

# Step 1.1. Sort labels in boosted prefix order if possible.
possible_labels = []
for typ in types:
if typ in preferred_name_boost_prefixes:
# This is the most specific matching type, so we use this.
possible_labels = map(lambda identifier: identifier.get('label', ''),
sort_identifiers_with_boosted_prefixes(
node["identifiers"],
preferred_name_boost_prefixes[typ]
))
break

# Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their
# Biolink prefix order.
if not possible_labels:
possible_labels = map(lambda identifier: identifier.get('label', ''), node["identifiers"])

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [l for l in possible_labels if
l and # Ignore blank or empty names.
not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again.
]

# Step 3. Pick the first label that isn't blank.
if filtered_possible_labels:
document["preferred_name"] = filtered_possible_labels[0]
else:
logging.debug(f"No preferred name for {node}")

# We previously used the shortest length of a name as a proxy for how good a match it is, i.e. given
# two concepts that both have the word "acetaminophen" in them, we assume that the shorter one is the
@@ -351,7 +414,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i

# Since synonyms_list is sorted,
if len(synonyms_list) == 0:
logging.warning(f"Synonym list for {node} is empty: no valid name. Skipping.")
logging.debug(f"Synonym list for {node} is empty: no valid name. Skipping.")
continue
else:
document["shortest_name_length"] = len(synonyms_list[0])
@@ -371,6 +434,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
print(f"Exception thrown while write_compendium() was generating {ofname}: {ex}")
print(node["type"])
print(node_factory.get_ancestors(node["type"]))
traceback.print_exc()
exit()

def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}):
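(Illustrative aside, not part of this commit: a self-contained sketch of the three-step preferred-name selection added above — boosted-prefix sort, suspicious-label filter, first non-blank pick. The identifiers and labels below are made up, and the boost list copies the `biolink:ChemicalEntity` entry added to config.json.)

```python
# Illustrative sketch of the preferred-name selection added in this commit.
# The identifiers and labels are made up for demonstration purposes.
def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
    # Boosted prefixes sort to the front in the order given; everything else
    # keeps its relative position (sorted() is stable).
    return sorted(
        identifiers,
        key=lambda ident: prefixes.index(ident['identifier'].split(':', 1)[0])
        if ident['identifier'].split(':', 1)[0] in prefixes
        else len(prefixes)
    )

identifiers = [
    {'identifier': 'PUBCHEM.COMPOUND:2244', 'label': 'acetylsalicylic acid'},
    {'identifier': 'CHEMBL.COMPOUND:CHEMBL25', 'label': 'CHEMBL25'},
    {'identifier': 'DRUGBANK:DB00945', 'label': 'Aspirin'},
    {'identifier': 'CHEBI:15365', 'label': ''},
]
boost = ['DRUGBANK', 'GTOPDB', 'DrugCentral', 'CHEMBL.COMPOUND', 'RXCUI',
         'CHEBI', 'HMDB', 'PUBCHEM.COMPOUND']

# Step 1: sort labels in boosted-prefix order.
possible_labels = [i.get('label', '')
                   for i in sort_identifiers_with_boosted_prefixes(identifiers, boost)]
# -> ['Aspirin', 'CHEMBL25', '', 'acetylsalicylic acid']

# Step 2: drop blank labels and labels that are just a CHEMBL identifier again.
filtered = [l for l in possible_labels if l and not l.startswith('CHEMBL')]

# Step 3: the first remaining label becomes the preferred name.
print(filtered[0])  # Aspirin
```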
15 changes: 14 additions & 1 deletion src/createcompendia/drugchemical.py
@@ -273,11 +273,24 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
x = line.strip().split('\t')
subject = x[0]
object = x[2]
#object is a PUBCHEM. It's by definition a clique_leader.

if subject in drug_rxcui_to_clique:
subject = drug_rxcui_to_clique[subject]
elif subject in chemical_rxcui_to_clique:
subject = chemical_rxcui_to_clique[subject]
else:
raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")

if object in drug_rxcui_to_clique:
object = drug_rxcui_to_clique[object]
elif object in chemical_rxcui_to_clique:
object = chemical_rxcui_to_clique[object]
else:
logging.warning(
f"Object in subject-object pair ({subject}, {object}) isn't mapped to a RxCUI"
)
# raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")

pairs.append((subject, object))
print("glom")
gloms = {}
1 change: 1 addition & 0 deletions src/createcompendia/protein.py
@@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
dlpath = os.path.join(ensembl_dir, dl)
if os.path.isdir(dlpath):
infname = os.path.join(dlpath, 'BioMart.tsv')
print(f'write_ensembl_ids for input filename {infname}')
if os.path.exists(infname):
# open each ensembl file, find the id column, and put it in the output
with open(infname, 'r') as inf:
6 changes: 6 additions & 0 deletions src/datahandlers/ensembl.py
@@ -12,11 +12,17 @@
# just what we need.
def pull_ensembl(complete_file):
f = find_datasets()

skip_dataset_ids = {'hgfemale_gene_ensembl'}

cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
"external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
for ds in f['Dataset_ID']:
print(ds)
if ds in skip_dataset_ids:
print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
continue
outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
# Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
# config, and keep it up to date. Maybe you could have a job that gets the datasets and writes a dataset file,
12 changes: 9 additions & 3 deletions src/datahandlers/hgnc.py
@@ -1,9 +1,15 @@
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_urllib
import json

def pull_hgnc():
outfile='HGNC/hgnc_complete_set.json'
pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
# On 2023nov26, I would get an error trying to download this file using FTP on Python (although
# weirdly enough, I could download the file without any problem using macOS Finder). So I changed
# it to use HTTP instead.
pull_via_urllib(
'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
'hgnc_complete_set.json',
decompress=False,
subpath="HGNC")

def pull_hgnc_labels_and_synonyms(infile):
with open(infile,'r') as data:
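(Illustrative aside, not part of this commit: the FTP-to-HTTPS switch above goes through `pull_via_urllib` in `src/babel_utils.py`. A rough standard-library equivalent of the same download — the local output path here is hypothetical — would be:)

```python
# Rough standard-library equivalent of the FTP-to-HTTPS switch; the repository
# itself uses pull_via_urllib() from src/babel_utils.py. The local output path
# below is hypothetical.
import os
import urllib.request

url = "https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json"
outfile = "babel_downloads/HGNC/hgnc_complete_set.json"
os.makedirs(os.path.dirname(outfile), exist_ok=True)
urllib.request.urlretrieve(url, outfile)
```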