TranslatorSRI · gaurav · Apr 22, 2024 · Apr 16, 2024 · Apr 16, 2024 · Apr 16, 2024
diff --git a/src/categories.py b/src/categories.py
@@ -9,7 +9,7 @@
 CELLULAR_COMPONENT = 'biolink:CellularComponent'
 CHEMICAL_ENTITY = 'biolink:ChemicalEntity'
 CHEMICAL_MIXTURE = 'biolink:ChemicalMixture'
-COMPLEX_CHEMICAL_MIXTURE = 'biolink:ComplexMolecularMixture'
+COMPLEX_MOLECULAR_MIXTURE = 'biolink:ComplexMolecularMixture'
 DEVICE = 'biolink:Device'
 DISEASE = 'biolink:Disease'
 DRUG = 'biolink:Drug'
@@ -31,3 +31,11 @@
 PUBLICATION = 'biolink:Publication'
 JOURNAL_ARTICLE = 'biolink:JournalArticle'
 SMALL_MOLECULE = 'biolink:SmallMolecule'
+
+# Added by Gaurav on April 16, 2024 based on ChemicalEntity children from
+# https://biolink.github.io/biolink-model/categories.html
+NUCLEIC_ACID_ENTITY = 'biolink:NucleicAcidEntity'
+MOLECULAR_ENTITY = 'biolink:MolecularEntity'
+FOOD_ADDITIVE = 'biolink:FoodAdditive'
+ENVIRONMENTAL_FOOD_CONTAMINANT = 'biolink:EnvironmentalFoodContaminant'
+PROCESSED_MATERIAL = 'biolink:ProcessedMaterial'
diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
@@ -8,7 +8,7 @@
 
 from src.ubergraph import UberGraph
 from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI
-from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_CHEMICAL_MIXTURE, CHEMICAL_MIXTURE, DRUG
+from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG
 from src.sdfreader import read_sdf
 
 from src.datahandlers.unichem import data_sources as unichem_data_sources
@@ -123,9 +123,9 @@ def write_mesh_ids(outfile):
     meshmap['D12.125'] = POLYPEPTIDE
     meshmap['D12.644'] = POLYPEPTIDE
     meshmap['D13'] = POLYPEPTIDE
-    meshmap['D20'] = COMPLEX_CHEMICAL_MIXTURE
+    meshmap['D20'] = COMPLEX_MOLECULAR_MIXTURE
     #Also add anything from SCR_Chemical, if it doesn't have a tree map
-    mesh.write_ids(meshmap,outfile,order=['EXCLUDE',POLYPEPTIDE,COMPLEX_CHEMICAL_MIXTURE,CHEMICAL_ENTITY],extra_vocab={'SCR_Chemical':CHEMICAL_ENTITY})
+    mesh.write_ids(meshmap, outfile, order=['EXCLUDE', POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], extra_vocab={'SCR_Chemical':CHEMICAL_ENTITY})
 
 #def write_obo_ids(irisandtypes,outfile,exclude=[]):
 #    order = [CHEMICAL_SUBSTANCE]
@@ -567,7 +567,7 @@ def create_typed_sets(eqsets, types):
     :param eqsets: A list of lists of identifiers (should NOT be a list of LabeledIDs, but a list of strings).
     :param types: A dictionary of known types for each identifier. (Some identifiers don't have known types.)
     """
-    order = [DRUG, MOLECULAR_MIXTURE, SMALL_MOLECULE, POLYPEPTIDE,  COMPLEX_CHEMICAL_MIXTURE, CHEMICAL_MIXTURE, CHEMICAL_ENTITY]
+    order = [DRUG, MOLECULAR_MIXTURE, SMALL_MOLECULE, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, CHEMICAL_ENTITY]
     typed_sets = defaultdict(set)
     # logging.warning(f"create_typed_sets: eqsets={eqsets}, types=...")
     for equivalent_ids in eqsets:

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
@@ -1,5 +1,8 @@
 from src.node import NodeFactory, get_config
-from src.prefixes import RXCUI, PUBCHEMCOMPOUND, CHEMBLCOMPOUND, UNII, DRUGBANK, MESH, UMLS, CHEBI
+from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS
+from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
+                            SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE,
+                            ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE)
 from src.babel_utils import glom, get_numerical_curie_suffix
 from collections import defaultdict
 import os,json
@@ -344,83 +347,112 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
                 continue
             conflation_id_list = list(clique)
 
-            # Normalize the leading ID.
-            leading_id = conflation_id_list[0]
-            if leading_id not in preferred_curie_for_curie:
-                logger.error(f"Unable to normalize leading CURIE {leading_id}, skipping conflation list: {conflation_id_list}")
+            # Now we need to figure out the type of this conflation. One possibility would be to use the
+            # clique size (number of IDs in each clique) to determine this, but this approach might fail
+            # if a conflation has one oversized clique that pulls us away from the right path. Instead,
+            # we determine a preference order of Biolink types and follow that to choose a type for each
+            # conflation.
+            #
+            # This is a two-step process:
+            # 1. Figure out all the possible types (of the remaining IDs).
+            conflation_possible_types = map(
+                lambda id: type_for_preferred_curie[preferred_curie_for_curie[id]],
+                conflation_id_list
+            )
+            # 2. Sort possible types in our preferred order of types.
+            # I've also listed the number of entities as of 2024mar24 to give an idea of how common these are.
+            PREFERRED_CONFLATION_TYPE_ORDER = {
+                SMALL_MOLECULE: 1,                      # 107,459,280 cliques
+                POLYPEPTIDE: 2,                         # 622 cliques
+                NUCLEIC_ACID_ENTITY: 3,                 # N/A
+                MOLECULAR_ENTITY: 4,                    # N/A
+                COMPLEX_MOLECULAR_MIXTURE: 5,           # 177 cliques
+                CHEMICAL_MIXTURE: 6,                    # 498 cliques
+                MOLECULAR_MIXTURE: 7,                   # 10,371,847 cliques
+                PROCESSED_MATERIAL: 8,                  # N/A
+                DRUG: 9,                                # 145,677 cliques
+                FOOD_ADDITIVE: 10,                      # N/A
+                FOOD: 11,                               # N/A
+                ENVIRONMENTAL_FOOD_CONTAMINANT: 12,     # N/A
+                CHEMICAL_ENTITY: 13,                    # 7,398,124 cliques
+            }
+            sorted_possible_types = sorted(conflation_possible_types,
+                                           key=lambda typ: PREFERRED_CONFLATION_TYPE_ORDER.get(typ, 100))
+            if len(sorted_possible_types) > 0:
+                conflation_type = sorted_possible_types[0]
             else:
-                # Determine a type for this conflation list.
-                normalized_leading_id = preferred_curie_for_curie[leading_id]
-                type_for_leading_id = type_for_preferred_curie[normalized_leading_id]
-
-                # Normalize all the identifiers. Any IDs that couldn't be normalized will show up as None.
-                normalized_conflation_id_list = [preferred_curie_for_curie.get(id) for id in conflation_id_list]
-
-                # Turn the conflation CURIE list into a prefix map, which maps prefixes to lists of CURIEs.
-                # This allows us to sort each prefix separately.
-                # Skip CURIE prefixes that aren't good conflation list leaders and ignore duplicates.
-                prefix_map = defaultdict(list)
-                ids_already_added = set()
-                for index, curie in enumerate(normalized_conflation_id_list):
-                    # Remove Nones, which are IDs that could not be normalized.
-                    if curie is None:
-                        logger.warning(f"Could not normalize CURIE {conflation_id_list[index]} in conflation {conflation_id_list}, skipping.")
-                        continue
-
-                    # Remove duplicates
-                    if curie in ids_already_added:
-                        continue
-
-                    # Group by prefix.
-                    curie_prefix = curie.split(':')[0]
-                    if curie_prefix == RXCUI:
-                        # Drug has RXCUI rated highly as a prefix, but that's not a good ID for Babel, so let's skip
-                        # this for now.
-                        continue
-
-                    if curie_prefix == UMLS:
-                        # UMLS is a particularly bad identifier for us because we tend not to conflate on it, so let's
-                        # skip this for now.
-                        continue
-
-                    prefix_map[curie_prefix].append(curie)
-                    ids_already_added.add(curie)
-
-                # Determine the prefixes to be used for this conflation list based on the prefixes from the NodeFactory
-                # (which gets them from Biolink Model).
-                prefixes_for_type = nodefactory.get_prefixes(type_for_leading_id)
-                logger.info(f"Leading ID {leading_id} normalized to {normalized_leading_id} " +
-                            f"(type {type_for_leading_id}) with prefixes: {prefixes_for_type}")
-
-                # Produce a final conflation list in the prefix order specified for the type of the conflation leader.
-                final_conflation_id_list = []
-                ids_already_added = set()
-                for prefix in prefixes_for_type:
-                    if prefix in prefix_map:
-                        prefixes_to_add = []
-                        for id in prefix_map[prefix]:
-                            ids_already_added.add(id)
-                            prefixes_to_add.append(id)
-
-                        # Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with
-                        # non-numerical CURIE suffixes sorted to the end.
-                        final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
-
-                # Add any identifiers that weren't in the prefix_map in the original order (which is not significant).
-                prefixes_to_add = []
-                for id in normalized_conflation_id_list:
-                    if id not in ids_already_added:
+                logger.warning(f"Could not determine type for {conflation_id_list} with " +
+                               f"conflation possible types: {conflation_possible_types}, defaulting to {CHEMICAL_ENTITY}.")
+                conflation_type = CHEMICAL_ENTITY
+
+            # Determine the prefixes to be used for this conflation list based on the prefixes from the NodeFactory
+            # (which gets them from Biolink Model).
+            prefixes_for_type = nodefactory.get_prefixes(conflation_type)
+            logger.info(f"Conflation {conflation_id_list} determined to have conflation type {conflation_type} " +
+                        f"with prefixes: {prefixes_for_type}")
+
+            # Normalize all the identifiers. Any IDs that couldn't be normalized will show up as None.
+            normalized_conflation_id_list = [preferred_curie_for_curie.get(id) for id in conflation_id_list]
+
+            # Turn the conflation CURIE list into a prefix map, which maps prefixes to lists of CURIEs.
+            # This allows us to sort each prefix separately.
+            # Skip CURIE prefixes that aren't good conflation list leaders and ignore duplicates.
+            prefix_map = defaultdict(list)
+            ids_already_added = set()
+            for index, curie in enumerate(normalized_conflation_id_list):
+                # Remove Nones, which are IDs that could not be normalized.
+                if curie is None:
+                    logger.warning(f"Could not normalize CURIE {conflation_id_list[index]} in conflation {conflation_id_list}, skipping.")
+                    continue
+
+                # Remove duplicates
+                if curie in ids_already_added:
+                    continue
+
+                # Group by prefix.
+                curie_prefix = curie.split(':')[0]
+                if curie_prefix == RXCUI:
+                    # Drug has RXCUI rated highly as a prefix, but that's not a good ID for Babel, so let's skip
+                    # this for now.
+                    continue
+
+                if curie_prefix == UMLS:
+                    # UMLS is a particularly bad identifier for us because we tend not to conflate on it, so let's
+                    # skip this for now.
+                    continue
+
+                prefix_map[curie_prefix].append(curie)
+                ids_already_added.add(curie)
+
+            # Produce a final conflation list in the prefix order specified for the conflation type chosen above.
+            final_conflation_id_list = []
+            ids_already_added = set()
+            for prefix in prefixes_for_type:
+                if prefix in prefix_map:
+                    prefixes_to_add = []
+                    for id in prefix_map[prefix]:
+                        ids_already_added.add(id)
                         prefixes_to_add.append(id)
 
-                # Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with
-                # non-numerical CURIE suffixes sorted to the end.
-                final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
+                    # Sort this set of CURIEs from the numerically smallest CURIE suffix to the largest, with
+                    # non-numerical CURIE suffixes sorted to the end.
+                    final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
+
+            # Collect any identifiers that were not added in the prefix-ordered pass above (they are sorted below).
+            prefixes_to_add = []
+            for id in normalized_conflation_id_list:
+                if id not in ids_already_added:
+                    prefixes_to_add.append(id)
+
+            # Sort this final set of CURIEs from the numerically smallest CURIE suffix to the largest, with
+            # non-numerical CURIE suffixes sorted to the end.
+            final_conflation_id_list.extend(list(sorted(prefixes_to_add, key=sort_by_curie_suffix)))
 
-                # Let's normalize all the identifiers.
-                logger.info(f"Ordered DrugChemical conflation leading with {leading_id}: {final_conflation_id_list}")
+            # Log the final, ordered conflation list before writing it out.
+            logger.info(f"Ordered DrugChemical conflation {final_conflation_id_list}")
 
-                outfile.write(f"{json.dumps(final_conflation_id_list)}\n")
-                written.add(fs)
+            outfile.write(f"{json.dumps(final_conflation_id_list)}\n")
+            written.add(fs)
 
 
 def sort_by_curie_suffix(curie):