Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve DrugChemical conflation leader #362

Draft
wants to merge 10 commits into
base: babel-1.9
Choose a base branch
from
82 changes: 73 additions & 9 deletions src/createcompendia/drugchemical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv

from src.node import NodeFactory, get_config
from src.node import NodeFactory, get_config, InformationContentFactory
from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS
from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE,
Expand Down Expand Up @@ -237,14 +237,17 @@ def build_pubchem_relationships(infile,outfile):
for cid in cids:
outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n")

def build_conflation(manual_concord_filename, rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendium,chemical_compendia,outfilename):
def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename):
"""RXN_concord contains relationships between rxcuis that can be used to conflate
Now we don't want all of them. We want the ones that are between drugs and chemicals,
and the ones between drugs and drugs.
To determine which those are, we're going to have to dig around in all the compendia.
We also want to get all the clique leaders. For those, we only need to worry if there are RXCUIs
in the clique."""

print("Loading information content values...")
ic_factory = InformationContentFactory(icrdf_filename)

print("Loading manual concords ...")
manual_concords = []
with open(manual_concord_filename,"r") as manualf:
Expand All @@ -261,12 +264,14 @@ def build_conflation(manual_concord_filename, rxn_concord,umls_concord,pubchem_r
print("load all chemical conflations so we can normalize identifiers")
preferred_curie_for_curie = {}
type_for_preferred_curie = {}
clique_for_preferred_curie = {}
for chemical_compendium in chemical_compendia:
with open(chemical_compendium, 'r') as compendiumf:
logger.info(f"Loading {chemical_compendium}")
for line in compendiumf:
clique = json.loads(line)
preferred_id = clique['identifiers'][0]['i']
clique_for_preferred_curie[preferred_id] = list(map(lambda ident: ident['i'], clique['identifiers']))
type_for_preferred_curie[preferred_id] = clique['type']
for ident in clique['identifiers']:
id = ident['i']
Expand Down Expand Up @@ -466,26 +471,85 @@ def build_conflation(manual_concord_filename, rxn_concord,umls_concord,pubchem_r
ids_already_added = set()
for prefix in prefixes_for_type:
if prefix in prefix_map:
prefixes_to_add = []
ids_to_add = []
for id in prefix_map[prefix]:
ids_already_added.add(id)
prefixes_to_add.append(id)
ids_to_add.append(id)

# Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with
# non-numerical CURIE suffixes sorted to the end.
final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
final_conflation_id_list.extend(list(sorted(ids_to_add, key=sort_by_curie_suffix)))

# Add any identifiers that weren't in the prefix_map in the original order (which is not significant).
prefixes_to_add = []
ids_to_add = []
for id in normalized_conflation_id_list:
if id not in ids_already_added:
prefixes_to_add.append(id)
ids_to_add.append(id)

# Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with
# non-numerical CURIE suffixes sorted to the end.
final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
final_conflation_id_list.extend(list(sorted(ids_to_add, key=sort_by_curie_suffix)))

# At this point, final_conflation_id_list is a list of all the identifiers for this conflation
# arranged in two ways:
# - This is sorted by prefix in the prefix order specified for the type we've come up with for this
# conflation (conflation_type).
# - Within each prefix, we've sorted identifiers by CURIE suffix, so that the smallest identifier goes
# first.
# This generally gives us the right identifier for the conflation, but there are a few cases where we can
# improve this:
# - We might end up with a conflation clique leader that's not the right type.
# - We might end up with a conflation clique leader that's a more complex chemical than the simplest
# one (e.g. the conflated clique for CHEBI:45783 "imatinib" is currently led by
# CHEBI:31690 "imatinib methanesulfonate", just because it's numerically smaller).
# - See https://github.com/TranslatorSRI/Babel/issues/341 for examples.
# - We might end up with a conflation clique leader that has a higher information content value
# than another candidate clique leader in the same conflation.
# To work around this, we take this chance to pick an alternate conflation clique leader.
conflation_clique_leader = final_conflation_id_list[0]
conflation_clique_leader_prefix = conflation_clique_leader.split(':')[0]
conflation_clique_leader_ic = ic_factory.get_ic({
'identifiers': list(map(lambda curie: {'identifier': curie}, clique_for_preferred_curie[conflation_clique_leader]))
})
if conflation_clique_leader_ic is None:
conflation_clique_leader_ic = float(100.0)
else:
conflation_clique_leader_ic = float(conflation_clique_leader_ic)

for curie in final_conflation_id_list:
curie_prefix = curie.split(':')[0]
if curie_prefix != conflation_clique_leader_prefix:
# Let's stick with the same prefix as the first entry.
continue

# Note that this works because curie is always a clique leader here.
curie_type = type_for_preferred_curie[curie]
if curie_type != conflation_type:
# Only consider clique leaders that are of the calculated type.
continue

# Let's normalize all the identifiers.
# Is this a lower information content value? If so, prefer this CURIE.
curie_ic = ic_factory.get_ic({
'identifiers': list(map(lambda curie: {'identifier': curie}, clique_for_preferred_curie[curie]))
})
if curie_ic is not None and float(curie_ic) < float(conflation_clique_leader_ic):
logging.info(f"Found better IC with CURIE {curie} (IC {curie_ic}) than previous conflation clique "
f"leader {final_conflation_id_list[0]} (IC {conflation_clique_leader_ic}).")
conflation_clique_leader = curie
conflation_clique_leader_ic = float(curie_ic)

# Is this a shorter label? If so, we would like to prefer this
# CURIE, but loading all the labels into memory would take a
# lot of memory. So let's see how well we can do with just the
# information content values.

# If we've picked a new clique leader, move it to the front of the list.
if conflation_clique_leader != final_conflation_id_list[0]:
logging.info(f"Replacing conflation clique leader {final_conflation_id_list[0]} with improved "
f"conflation clique leader {conflation_clique_leader}")
final_conflation_id_list.remove(conflation_clique_leader)
final_conflation_id_list.insert(0, conflation_clique_leader)

# Write out all the identifiers.
logger.info(f"Ordered DrugChemical conflation {final_conflation_id_list}")

outfile.write(f"{json.dumps(final_conflation_id_list)}\n")
Expand Down
11 changes: 10 additions & 1 deletion src/snakefiles/drugchemical.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,19 @@ rule drugchemical_conflation:
umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS',
pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM',
drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv',
icrdf_filename=config['download_directory']+'/icRDF.tsv',
output:
outfile=config['output_directory']+'/conflation/DrugChemical.txt'
run:
drugchemical.build_conflation(input.drugchemical_manual_concord,input.rxnorm_concord,input.umls_concord,input.pubchem_concord,input.drug_compendium,input.chemical_compendia,output.outfile)
drugchemical.build_conflation(
input.drugchemical_manual_concord,
input.rxnorm_concord,
input.umls_concord,
input.pubchem_concord,
input.drug_compendium,
input.chemical_compendia,
input.icrdf_filename,
output.outfile)

rule drugchemical_conflated_synonyms:
input:
Expand Down