diff --git a/src/categories.py b/src/categories.py index 75f57bf6..d6217d3e 100644 --- a/src/categories.py +++ b/src/categories.py @@ -9,7 +9,7 @@ CELLULAR_COMPONENT = 'biolink:CellularComponent' CHEMICAL_ENTITY = 'biolink:ChemicalEntity' CHEMICAL_MIXTURE = 'biolink:ChemicalMixture' -COMPLEX_CHEMICAL_MIXTURE = 'biolink:ComplexMolecularMixture' +COMPLEX_MOLECULAR_MIXTURE = 'biolink:ComplexMolecularMixture' DEVICE = 'biolink:Device' DISEASE = 'biolink:Disease' DRUG = 'biolink:Drug' @@ -31,3 +31,11 @@ PUBLICATION = 'biolink:Publication' JOURNAL_ARTICLE = 'biolink:JournalArticle' SMALL_MOLECULE = 'biolink:SmallMolecule' + +# Added by Gaurav on April 16, 2024 based on ChemicalEntity children from +# https://biolink.github.io/biolink-model/categories.html +NUCLEIC_ACID_ENTITY = 'biolink:NucleicAcidEntity' +MOLECULAR_ENTITY = 'biolink:MolecularEntity' +FOOD_ADDITIVE = 'biolink:FoodAdditive' +ENVIRONMENTAL_FOOD_CONTAMINANT = 'biolink:EnvironmentalFoodContaminant' +PROCESSED_MATERIAL = 'biolink:ProcessedMaterial' diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 0f16157b..180cbb41 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -8,7 +8,7 @@ from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI -from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_CHEMICAL_MIXTURE, CHEMICAL_MIXTURE, DRUG +from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG from src.sdfreader import read_sdf from src.datahandlers.unichem import data_sources as unichem_data_sources @@ -123,9 +123,9 @@ def write_mesh_ids(outfile): meshmap['D12.125'] = POLYPEPTIDE meshmap['D12.644'] = POLYPEPTIDE meshmap['D13'] = POLYPEPTIDE - meshmap['D20'] = COMPLEX_CHEMICAL_MIXTURE + 
meshmap['D20'] = COMPLEX_MOLECULAR_MIXTURE #Also add anything from SCR_Chemical, if it doesn't have a tree map - mesh.write_ids(meshmap,outfile,order=['EXCLUDE',POLYPEPTIDE,COMPLEX_CHEMICAL_MIXTURE,CHEMICAL_ENTITY],extra_vocab={'SCR_Chemical':CHEMICAL_ENTITY}) + mesh.write_ids(meshmap, outfile, order=['EXCLUDE', POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], extra_vocab={'SCR_Chemical':CHEMICAL_ENTITY}) #def write_obo_ids(irisandtypes,outfile,exclude=[]): # order = [CHEMICAL_SUBSTANCE] @@ -567,7 +567,7 @@ def create_typed_sets(eqsets, types): :param eqsets: A list of lists of identifiers (should NOT be a list of LabeledIDs, but a list of strings). :param types: A dictionary of known types for each identifier. (Some identifiers don't have known types.) """ - order = [DRUG, MOLECULAR_MIXTURE, SMALL_MOLECULE, POLYPEPTIDE, COMPLEX_CHEMICAL_MIXTURE, CHEMICAL_MIXTURE, CHEMICAL_ENTITY] + order = [DRUG, MOLECULAR_MIXTURE, SMALL_MOLECULE, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, CHEMICAL_ENTITY] typed_sets = defaultdict(set) # logging.warning(f"create_typed_sets: eqsets={eqsets}, types=...") for equivalent_ids in eqsets: diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 4c5719c4..78bfdbe0 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -1,5 +1,8 @@ from src.node import NodeFactory, get_config -from src.prefixes import RXCUI, PUBCHEMCOMPOUND, CHEMBLCOMPOUND, UNII, DRUGBANK, MESH, UMLS, CHEBI +from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS +from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE, + SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE, + ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE) from src.babel_utils import glom, get_numerical_curie_suffix from collections import defaultdict import os,json @@ -344,83 +347,112 @@ def 
build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu continue conflation_id_list = list(clique) - # Normalize the leading ID. - leading_id = conflation_id_list[0] - if leading_id not in preferred_curie_for_curie: - logger.error(f"Unable to normalize leading CURIE {leading_id}, skipping conflation list: {conflation_id_list}") + # Now we need to figure out the type of this conflation. One possibility would be to use the + # clique size (number of IDs in each clique) to determine this, but this approach might fail + # if a conflation has one oversized clique that pulls us away from the right path. Instead, + # we determine a preference order of Biolink types and follow that to choose a type for each + # conflation. + # + # To do this is a two-step process: + # 1. Figure out all the possible types, skipping IDs that could not be normalized (a list, so it can be sorted AND logged). + conflation_possible_types = [ + type_for_preferred_curie[preferred_curie_for_curie[curie]] + for curie in conflation_id_list if curie in preferred_curie_for_curie + ] + # 2. Sort possible types in our preferred order of types. + # I've also listed the number of entities as of 2024mar24 to give an idea of how common these are. + PREFERRED_CONFLATION_TYPE_ORDER = { + SMALL_MOLECULE: 1, # 107,459,280 cliques + POLYPEPTIDE: 2, # 622 cliques + NUCLEIC_ACID_ENTITY: 3, # N/A + MOLECULAR_ENTITY: 4, # N/A + COMPLEX_MOLECULAR_MIXTURE: 5, # 177 cliques + CHEMICAL_MIXTURE: 6, # 498 cliques + MOLECULAR_MIXTURE: 7, # 10,371,847 cliques + PROCESSED_MATERIAL: 8, # N/A + DRUG: 9, # 145,677 cliques + FOOD_ADDITIVE: 10, # N/A + FOOD: 11, # N/A + ENVIRONMENTAL_FOOD_CONTAMINANT: 12, # N/A + CHEMICAL_ENTITY: 13, # 7,398,124 cliques + } + sorted_possible_types = sorted(conflation_possible_types, + key=lambda typ: PREFERRED_CONFLATION_TYPE_ORDER.get(typ, 100)) + if len(sorted_possible_types) > 0: + conflation_type = sorted_possible_types[0] else: - # Determine a type for this conflation list. 
- normalized_leading_id = preferred_curie_for_curie[leading_id] - type_for_leading_id = type_for_preferred_curie[normalized_leading_id] - - # Normalize all the identifiers. Any IDs that couldn't be normalized will show up as None. - normalized_conflation_id_list = [preferred_curie_for_curie.get(id) for id in conflation_id_list] - - # Turn the conflation CURIE list into a prefix map, which maps prefixes to lists of CURIEs. - # This allows us to sort each prefix separately. - # Skip CURIE prefixes that aren't good conflation list leaders and ignore duplicates. - prefix_map = defaultdict(list) - ids_already_added = set() - for index, curie in enumerate(normalized_conflation_id_list): - # Remove Nones, which are IDs that could not be normalized. - if curie is None: - logger.warning(f"Could not normalize CURIE {conflation_id_list[index]} in conflation {conflation_id_list}, skipping.") - continue - - # Remove duplicates - if curie in ids_already_added: - continue - - # Group by prefix. - curie_prefix = curie.split(':')[0] - if curie_prefix == RXCUI: - # Drug has RXCUI rated highly as a prefix, but that's not a good ID for Babel, so let's skip - # this for now. - continue - - if curie_prefix == UMLS: - # UMLS is a particularly bad identifier for us because we tend not to conflate on it, so let's - # skip this for now. - continue - - prefix_map[curie_prefix].append(curie) - ids_already_added.add(curie) - - # Determine the prefixes to be used for this conflation list based on the prefixes from the NodeFactory - # (which gets them from Biolink Model). - prefixes_for_type = nodefactory.get_prefixes(type_for_leading_id) - logger.info(f"Leading ID {leading_id} normalized to {normalized_leading_id} " + - f"(type {type_for_leading_id}) with prefixes: {prefixes_for_type}") - - # Produce a final conflation list in the prefix order specified for the type of the conflation leader. 
- final_conflation_id_list = [] - ids_already_added = set() - for prefix in prefixes_for_type: - if prefix in prefix_map: - prefixes_to_add = [] - for id in prefix_map[prefix]: - ids_already_added.add(id) - prefixes_to_add.append(id) - - # Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with - # non-numerical CURIE suffixes sorted to the end. - final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix))) - - # Add any identifiers that weren't in the prefix_map in the original order (which is not significant). - prefixes_to_add = [] - for id in normalized_conflation_id_list: - if id not in ids_already_added: + logger.warning(f"Could not determine type for {conflation_id_list} with " + + f"conflation possible types: {conflation_possible_types}, defaulting to {CHEMICAL_ENTITY}.") + conflation_type = CHEMICAL_ENTITY + + # Determine the prefixes to be used for this conflation list based on the prefixes from the NodeFactory + # (which gets them from Biolink Model). + prefixes_for_type = nodefactory.get_prefixes(conflation_type) + logger.info(f"Conflation {conflation_id_list} determined to have conflation type {conflation_type} " + + f"with prefixes: {prefixes_for_type}") + + # Normalize all the identifiers. Any IDs that couldn't be normalized will show up as None. + normalized_conflation_id_list = [preferred_curie_for_curie.get(id) for id in conflation_id_list] + + # Turn the conflation CURIE list into a prefix map, which maps prefixes to lists of CURIEs. + # This allows us to sort each prefix separately. + # Skip CURIE prefixes that aren't good conflation list leaders and ignore duplicates. + prefix_map = defaultdict(list) + ids_already_added = set() + for index, curie in enumerate(normalized_conflation_id_list): + # Remove Nones, which are IDs that could not be normalized. 
+ if curie is None: + logger.warning(f"Could not normalize CURIE {conflation_id_list[index]} in conflation {conflation_id_list}, skipping.") + continue + + # Remove duplicates + if curie in ids_already_added: + continue + + # Group by prefix. + curie_prefix = curie.split(':')[0] + if curie_prefix == RXCUI: + # Drug has RXCUI rated highly as a prefix, but that's not a good ID for Babel, so let's skip + # this for now. + continue + + if curie_prefix == UMLS: + # UMLS is a particularly bad identifier for us because we tend not to conflate on it, so let's + # skip this for now. + continue + + prefix_map[curie_prefix].append(curie) + ids_already_added.add(curie) + + # Produce a final conflation list in the prefix order specified for the type of the conflation leader. + final_conflation_id_list = [] + ids_already_added = set() + for prefix in prefixes_for_type: + if prefix in prefix_map: + prefixes_to_add = [] + for id in prefix_map[prefix]: + ids_already_added.add(id) prefixes_to_add.append(id) - # Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with - # non-numerical CURIE suffixes sorted to the end. - final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix))) + # Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with + # non-numerical CURIE suffixes sorted to the end. + final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix))) + + # Add any identifiers that weren't in the prefix_map in the original order (which is not significant). + prefixes_to_add = [] + for id in normalized_conflation_id_list: + if id not in ids_already_added: + prefixes_to_add.append(id) + + # Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with + # non-numerical CURIE suffixes sorted to the end. 
+ final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix))) - # Let's normalize all the identifiers. - logger.info(f"Ordered DrugChemical conflation leading with {leading_id}: {final_conflation_id_list}") + # Let's normalize all the identifiers. + logger.info(f"Ordered DrugChemical conflation {final_conflation_id_list}") - outfile.write(f"{json.dumps(final_conflation_id_list)}\n") - written.add(fs) + outfile.write(f"{json.dumps(final_conflation_id_list)}\n") + written.add(fs) def sort_by_curie_suffix(curie):