Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix DrugChemical conflation typing #266

Merged
merged 7 commits into from
Apr 22, 2024
Merged
10 changes: 9 additions & 1 deletion src/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
CELLULAR_COMPONENT = 'biolink:CellularComponent'
CHEMICAL_ENTITY = 'biolink:ChemicalEntity'
CHEMICAL_MIXTURE = 'biolink:ChemicalMixture'
COMPLEX_CHEMICAL_MIXTURE = 'biolink:ComplexMolecularMixture'
COMPLEX_MOLECULAR_MIXTURE = 'biolink:ComplexMolecularMixture'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is admittedly not exactly the envisioned use case, but the idea behind these constants was that when a 1:1 change occurs here, you can just change the string and save yourself many edits throughout the code.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I know! But I took advantage of the centralized constants to bulk-rename one that had fallen out of sync with the corresponding Biolink type. If it had been a large change, I would have proposed it in its own PR, but since it was only used in a handful of places and was related to the overall PR changes, I decided to incorporate it here. For future changes, I'll update the string first and only update the constant name once it's pretty clear that Biolink isn't going to change it again any time soon.

DEVICE = 'biolink:Device'
DISEASE = 'biolink:Disease'
DRUG = 'biolink:Drug'
Expand All @@ -31,3 +31,11 @@
PUBLICATION = 'biolink:Publication'
JOURNAL_ARTICLE = 'biolink:JournalArticle'
SMALL_MOLECULE = 'biolink:SmallMolecule'

# Added by Gaurav on April 16, 2024 based on ChemicalEntity children from
# https://biolink.github.io/biolink-model/categories.html
NUCLEIC_ACID_ENTITY = 'biolink:NucleicAcidEntity'
MOLECULAR_ENTITY = 'biolink:MolecularEntity'
FOOD_ADDITIVE = 'biolink:FoodAdditive'
ENVIRONMENTAL_FOOD_CONTAMINANT = 'biolink:EnvironmentalFoodContaminant'
PROCESSED_MATERIAL = 'biolink:ProcessedMaterial'
8 changes: 4 additions & 4 deletions src/createcompendia/chemicals.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from src.ubergraph import UberGraph
from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI
from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_CHEMICAL_MIXTURE, CHEMICAL_MIXTURE, DRUG
from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG
from src.sdfreader import read_sdf

from src.datahandlers.unichem import data_sources as unichem_data_sources
Expand Down Expand Up @@ -123,9 +123,9 @@ def write_mesh_ids(outfile):
meshmap['D12.125'] = POLYPEPTIDE
meshmap['D12.644'] = POLYPEPTIDE
meshmap['D13'] = POLYPEPTIDE
meshmap['D20'] = COMPLEX_CHEMICAL_MIXTURE
meshmap['D20'] = COMPLEX_MOLECULAR_MIXTURE
#Also add anything from SCR_Chemical, if it doesn't have a tree map
mesh.write_ids(meshmap,outfile,order=['EXCLUDE',POLYPEPTIDE,COMPLEX_CHEMICAL_MIXTURE,CHEMICAL_ENTITY],extra_vocab={'SCR_Chemical':CHEMICAL_ENTITY})
mesh.write_ids(meshmap, outfile, order=['EXCLUDE', POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], extra_vocab={'SCR_Chemical':CHEMICAL_ENTITY})

#def write_obo_ids(irisandtypes,outfile,exclude=[]):
# order = [CHEMICAL_SUBSTANCE]
Expand Down Expand Up @@ -567,7 +567,7 @@ def create_typed_sets(eqsets, types):
:param eqsets: A list of lists of identifiers (should NOT be a list of LabeledIDs, but a list of strings).
:param types: A dictionary of known types for each identifier. (Some identifiers don't have known types.)
"""
order = [DRUG, MOLECULAR_MIXTURE, SMALL_MOLECULE, POLYPEPTIDE, COMPLEX_CHEMICAL_MIXTURE, CHEMICAL_MIXTURE, CHEMICAL_ENTITY]
order = [DRUG, MOLECULAR_MIXTURE, SMALL_MOLECULE, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, CHEMICAL_ENTITY]
typed_sets = defaultdict(set)
# logging.warning(f"create_typed_sets: eqsets={eqsets}, types=...")
for equivalent_ids in eqsets:
Expand Down
178 changes: 105 additions & 73 deletions src/createcompendia/drugchemical.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from src.node import NodeFactory, get_config
from src.prefixes import RXCUI, PUBCHEMCOMPOUND, CHEMBLCOMPOUND, UNII, DRUGBANK, MESH, UMLS, CHEBI
from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS
from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE,
ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE)
from src.babel_utils import glom, get_numerical_curie_suffix
from collections import defaultdict
import os,json
Expand Down Expand Up @@ -344,83 +347,112 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
continue
conflation_id_list = list(clique)

# Normalize the leading ID.
leading_id = conflation_id_list[0]
if leading_id not in preferred_curie_for_curie:
logger.error(f"Unable to normalize leading CURIE {leading_id}, skipping conflation list: {conflation_id_list}")
# Now we need to figure out the type of this conflation. One possibility would be to use the
# clique size (number of IDs in each clique) to determine this, but this approach might fail
# if a conflation has one oversized clique that pulls us away from the right path. Instead,
# we determine a preference order of Biolink types and follow that to choose a type for each
# conflation.
#
# To do this is a two-step process:
# 1. Figure out all the possible types (of the remaining IDs).
conflation_possible_types = map(
lambda id: type_for_preferred_curie[preferred_curie_for_curie[id]],
conflation_id_list
)
# 2. Sort possible types in our preferred order of types.
# I've also listed the number of entities as of 2024mar24 to give an idea of how common these are.
PREFERRED_CONFLATION_TYPE_ORDER = {
SMALL_MOLECULE: 1, # 107,459,280 cliques
POLYPEPTIDE: 2, # 622 cliques
NUCLEIC_ACID_ENTITY: 3, # N/A
MOLECULAR_ENTITY: 4, # N/A
COMPLEX_MOLECULAR_MIXTURE: 5, # 177 cliques
CHEMICAL_MIXTURE: 6, # 498 cliques
MOLECULAR_MIXTURE: 7, # 10,371,847 cliques
PROCESSED_MATERIAL: 8, # N/A
DRUG: 9, # 145,677 cliques
FOOD_ADDITIVE: 10, # N/A
FOOD: 11, # N/A
ENVIRONMENTAL_FOOD_CONTAMINANT: 12, # N/A
CHEMICAL_ENTITY: 13, # 7,398,124 cliques
}
sorted_possible_types = sorted(conflation_possible_types,
key=lambda typ: PREFERRED_CONFLATION_TYPE_ORDER.get(typ, 100))
if len(sorted_possible_types) > 0:
conflation_type = sorted_possible_types[0]
else:
# Determine a type for this conflation list.
normalized_leading_id = preferred_curie_for_curie[leading_id]
type_for_leading_id = type_for_preferred_curie[normalized_leading_id]

# Normalize all the identifiers. Any IDs that couldn't be normalized will show up as None.
normalized_conflation_id_list = [preferred_curie_for_curie.get(id) for id in conflation_id_list]

# Turn the conflation CURIE list into a prefix map, which maps prefixes to lists of CURIEs.
# This allows us to sort each prefix separately.
# Skip CURIE prefixes that aren't good conflation list leaders and ignore duplicates.
prefix_map = defaultdict(list)
ids_already_added = set()
for index, curie in enumerate(normalized_conflation_id_list):
# Remove Nones, which are IDs that could not be normalized.
if curie is None:
logger.warning(f"Could not normalize CURIE {conflation_id_list[index]} in conflation {conflation_id_list}, skipping.")
continue

# Remove duplicates
if curie in ids_already_added:
continue

# Group by prefix.
curie_prefix = curie.split(':')[0]
if curie_prefix == RXCUI:
# Drug has RXCUI rated highly as a prefix, but that's not a good ID for Babel, so let's skip
# this for now.
continue

if curie_prefix == UMLS:
# UMLS is a particularly bad identifier for us because we tend not to conflate on it, so let's
# skip this for now.
continue

prefix_map[curie_prefix].append(curie)
ids_already_added.add(curie)

# Determine the prefixes to be used for this conflation list based on the prefixes from the NodeFactory
# (which gets them from Biolink Model).
prefixes_for_type = nodefactory.get_prefixes(type_for_leading_id)
logger.info(f"Leading ID {leading_id} normalized to {normalized_leading_id} " +
f"(type {type_for_leading_id}) with prefixes: {prefixes_for_type}")

# Produce a final conflation list in the prefix order specified for the type of the conflation leader.
final_conflation_id_list = []
ids_already_added = set()
for prefix in prefixes_for_type:
if prefix in prefix_map:
prefixes_to_add = []
for id in prefix_map[prefix]:
ids_already_added.add(id)
prefixes_to_add.append(id)

# Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with
# non-numerical CURIE suffixes sorted to the end.
final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))

# Add any identifiers that weren't in the prefix_map in the original order (which is not significant).
prefixes_to_add = []
for id in normalized_conflation_id_list:
if id not in ids_already_added:
logger.warning(f"Could not determine type for {conflation_id_list} with " +
f"conflation possible types: {conflation_possible_types}, defaulting to {CHEMICAL_ENTITY}.")
conflation_type = CHEMICAL_ENTITY

# Determine the prefixes to be used for this conflation list based on the prefixes from the NodeFactory
# (which gets them from Biolink Model).
prefixes_for_type = nodefactory.get_prefixes(conflation_type)
logger.info(f"Conflation {conflation_id_list} determined to have conflation type {conflation_type} " +
f"with prefixes: {prefixes_for_type}")

# Normalize all the identifiers. Any IDs that couldn't be normalized will show up as None.
normalized_conflation_id_list = [preferred_curie_for_curie.get(id) for id in conflation_id_list]

# Turn the conflation CURIE list into a prefix map, which maps prefixes to lists of CURIEs.
# This allows us to sort each prefix separately.
# Skip CURIE prefixes that aren't good conflation list leaders and ignore duplicates.
prefix_map = defaultdict(list)
ids_already_added = set()
for index, curie in enumerate(normalized_conflation_id_list):
# Remove Nones, which are IDs that could not be normalized.
if curie is None:
logger.warning(f"Could not normalize CURIE {conflation_id_list[index]} in conflation {conflation_id_list}, skipping.")
continue

# Remove duplicates
if curie in ids_already_added:
continue

# Group by prefix.
curie_prefix = curie.split(':')[0]
if curie_prefix == RXCUI:
# Drug has RXCUI rated highly as a prefix, but that's not a good ID for Babel, so let's skip
# this for now.
continue

if curie_prefix == UMLS:
# UMLS is a particularly bad identifier for us because we tend not to conflate on it, so let's
# skip this for now.
continue

prefix_map[curie_prefix].append(curie)
ids_already_added.add(curie)

# Produce a final conflation list in the prefix order specified for the type of the conflation leader.
final_conflation_id_list = []
ids_already_added = set()
for prefix in prefixes_for_type:
if prefix in prefix_map:
prefixes_to_add = []
for id in prefix_map[prefix]:
ids_already_added.add(id)
prefixes_to_add.append(id)

# Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with
# non-numerical CURIE suffixes sorted to the end.
final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
# Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with
# non-numerical CURIE suffixes sorted to the end.
final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))

# Add any identifiers that weren't in the prefix_map in the original order (which is not significant).
prefixes_to_add = []
for id in normalized_conflation_id_list:
if id not in ids_already_added:
prefixes_to_add.append(id)

# Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with
# non-numerical CURIE suffixes sorted to the end.
final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))

# Let's normalize all the identifiers.
logger.info(f"Ordered DrugChemical conflation leading with {leading_id}: {final_conflation_id_list}")
# Let's normalize all the identifiers.
logger.info(f"Ordered DrugChemical conflation {final_conflation_id_list}")

outfile.write(f"{json.dumps(final_conflation_id_list)}\n")
written.add(fs)
outfile.write(f"{json.dumps(final_conflation_id_list)}\n")
written.add(fs)


def sort_by_curie_suffix(curie):
Expand Down