Merge pull request #235 from TranslatorSRI/add-sapbert-export
This PR moves the SAPBERT training data exporter into Babel from [Babel Validation](https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207). It has been added as a new `exporter` and, as with the KGX files, the SAPBERT training data it generates is gzipped on write to save space on the Babel instance.

It also includes some minor changes to babel_utils.py and moves the lists of all Snakemake synonym files into src/snakefiles/util.py.
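
For reference, a minimal sketch of invoking the new exporter directly (the file paths here are hypothetical; in practice the Snakemake rules below derive the real ones from config['output_directory']):

    from src.exporters.sapbert import convert_synonyms_to_sapbert

    # Hypothetical paths -- the Snakemake rules supply the real ones from the pipeline config.
    convert_synonyms_to_sapbert(
        "babel_outputs/synonyms/Disease.txt",
        "babel_outputs/sapbert-training-data/Disease.txt.gz",
    )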
gaurav authored Feb 16, 2024
2 parents 5403314 + 330e9b3 commit 5def2b8
Showing 6 changed files with 175 additions and 14 deletions.
1 change: 1 addition & 0 deletions Snakefile
@@ -26,6 +26,7 @@ rule all:

# Build all the exports.
config['output_directory'] + '/kgx/done',
config['output_directory'] + '/sapbert-training-data/done',
output:
x = config['output_directory'] + '/reports/all_done'
shell:
9 changes: 6 additions & 3 deletions src/babel_utils.py
@@ -424,7 +424,10 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
if filtered_possible_labels:
document["preferred_name"] = filtered_possible_labels[0]
else:
logging.debug(f"No preferred name for {node}")
logging.debug(
f"No preferred name for {node}, probably because all names were filtered out. Skipping."
)
continue

# We previously used the shortest length of a name as a proxy for how good a match it is, i.e. given
# two concepts that both have the word "acetaminophen" in them, we assume that the shorter one is the
@@ -618,7 +621,7 @@ def clean_sets(result_dict):
"""The keys for this are unique and unmergable: Don't merge GO!
But there are values that are showing up in multiple GOs (could be
MetaCycs or RHEAs or Reactomes). It's just how GO is mapping. Now,
the right answer here is probably to kboom this whole mess. But
the right answer here is probably to kboom this whole mess. But
for prototype, we're just going to filter out garbage merge values).
Note that this isn't limited to GO. Even MONDO include some #exactMatch
to the same MESH from two different MONDO ids"""
@@ -705,4 +708,4 @@ def norm(x,op):
pref = Text.get_curie(x)
if pref in op:
return Text.recurie(x,op[pref])
return x
104 changes: 104 additions & 0 deletions src/exporters/sapbert.py
@@ -0,0 +1,104 @@
# Sapbert (https://github.com/RENCI-NER/sapbert) requires input files
# in a particular pipe-delimited format:
# biolink:Gene||NCBIGene:10554||AGPAT1||1-acylglycerol-3-phosphate o-acyltransferase 1||lysophosphatidic acid acyltransferase, alpha
# i.e. the format we need is:
# biolink-type||preferred ID||preferred label||synonym 1||synonym 2
# Also, we include at most fifty synonym pairs for each preferred ID.
#
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
import gzip
import itertools
import json
import os
import random
import re

import logging
from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)

# Configuration options
# Include up to 50 synonym pairs for each preferred ID.
MAX_SYNONYM_PAIRS = 50
# Should we lowercase all the names?
LOWERCASE_ALL_NAMES = True


def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
"""
Convert a synonyms file to the training format for SAPBERT (https://github.com/RENCI-NER/sapbert).
Based on the code in https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
:param synonym_filename: The compendium file to convert.
:param sapbert_filename_gzipped: The SAPBERT training file to generate.
"""

logger.info(f"convert_synonyms_to_sapbert({synonym_filename}, {sapbert_filename_gzipped})")

# Make the output directories if they don't exist.
os.makedirs(os.path.dirname(sapbert_filename_gzipped), exist_ok=True)

# Go through all the synonyms in the input file.
count_entry = 0
count_training_text = 0
with open(synonym_filename, "r", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
for line in synonymf:
count_entry += 1
entry = json.loads(line)

# Read fields from the synonym.
curie = entry['curie']
preferred_name = entry.get('preferred_name', '')
if not preferred_name:
logging.warning(f"Unable to convert synonym entry for curie {curie}, skipping: {entry}")
continue

# Collect and process the list of names.
names = entry['names']
if LOWERCASE_ALL_NAMES:
names = [name.lower() for name in names]

# We use '||' as a delimiter, so any run of two or more pipe characters
# in a name is collapsed to a single pipe character in the SAPBERT output,
# so that it can't be confused with our delimiter.
names = [re.sub(r'\|\|+', '|', name) for name in names]

# Figure out the Biolink type to report.
types = entry['types']
if len(types) == 0:
biolink_type = 'NamedThing'
else:
biolink_type = types[0]

# How many names do we have?
if len(names) == 0:
# This shouldn't happen, but handle it gracefully anyway.
sapbertf.write(
f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
)
count_training_text += 1
elif len(names) == 1:
# With only one name there are no pairs to sample, so pair it with the preferred name.
sapbertf.write(
f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{names[0]}\n"
)
count_training_text += 1
else:
name_pairs = list(itertools.combinations(set(names), 2))

if len(name_pairs) > MAX_SYNONYM_PAIRS:
# Randomly select MAX_SYNONYM_PAIRS pairs.
name_pairs = random.sample(name_pairs, MAX_SYNONYM_PAIRS)

for name_pair in name_pairs:
sapbertf.write(f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n")
count_training_text += 1

logger.info(f"Converted {synonym_filename} to SAPBERT training file {synonym_filename}: " +
f"read {count_entry} entries and wrote out {count_training_text} training rows.")
26 changes: 25 additions & 1 deletion src/snakefiles/exports.snakefile
@@ -1,5 +1,6 @@
from src.snakefiles.util import get_all_compendia
from src.snakefiles.util import get_all_compendia, get_all_synonyms_with_drugchemicalconflated
import src.exporters.kgx as kgx
import src.exporters.sapbert as sapbert
import os

### Export compendia/synonyms into downstream outputs
@@ -30,3 +31,26 @@ rule generate_kgx:
edges_file=config['output_directory'] + "/kgx/{filename}_edges.jsonl.gz",
run:
kgx.convert_compendium_to_kgx(input.compendium_file, output.nodes_file, output.edges_file)


# Export all synonym files to SAPBERT export, then create `babel_outputs/sapbert-training-data/done` to signal that we're done.
rule export_all_to_sapbert_training:
input:
sapbert_training_file=expand("{od}/sapbert-training-data/{fn}.gz",
od=config['output_directory'],
fn=get_all_synonyms_with_drugchemicalconflated(config)
)
output:
x = config['output_directory'] + '/sapbert-training-data/done',
shell:
"echo 'done' >> {output.x}"


# Generic rule for generating the SAPBERT training data for a particular synonym file.
rule generate_sapbert_training_data:
input:
synonym_file=config['output_directory'] + "/synonyms/{filename}",
output:
sapbert_training_data_file=config['output_directory'] + "/sapbert-training-data/{filename}.gz",
run:
sapbert.convert_synonyms_to_sapbert(input.synonym_file, output.sapbert_training_data_file)
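
Each file the rule above generates is gzipped text with one '||'-delimited training row per line, so a downstream consumer might read it back like this (a sketch; the path is hypothetical):

    import gzip

    with gzip.open("babel_outputs/sapbert-training-data/Disease.txt.gz", "rt", encoding="utf-8") as f:
        for line in f:
            # Five fields: biolink type, preferred CURIE, preferred label, and a synonym pair.
            biolink_type, curie, preferred_name, name1, name2 = line.rstrip("\n").split("||")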
12 changes: 3 additions & 9 deletions src/snakefiles/reports.snakefile
@@ -1,3 +1,4 @@
from src.snakefiles.util import get_all_compendia, get_all_synonyms_with_drugchemicalconflated
import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
@@ -9,17 +10,10 @@ synonyms_path = config['output_directory'] + '/synonyms'
conflations_path = config['output_directory'] + '/conflation'

# Expected compendia files.
compendia_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
config['disease_outputs'] + config['process_outputs'] + \
config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
config['umls_outputs'] + config['macromolecularcomplex_outputs']
compendia_files = get_all_compendia(config)

# Expected synonym files.
synonyms_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
config['disease_outputs'] + config['process_outputs'] + \
config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
config['drugchemicalconflated_synonym_outputs'] + \
config['umls_outputs'] + config['macromolecularcomplex_outputs']
synonyms_files = get_all_synonyms_with_drugchemicalconflated(config)

# Expected conflation files.
conflation_files = config['geneprotein_outputs'] + config['drugchemical_outputs']
37 changes: 36 additions & 1 deletion src/snakefiles/util.py
@@ -11,4 +11,39 @@ def get_all_compendia(config):
config['protein_outputs'] +
config['taxon_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs'])


# List of all the synonym files, except DrugChemicalConflated.
def get_synonyms(config):
return (
config['anatomy_outputs'] +
config['gene_outputs'] +
config['protein_outputs'] +
config['disease_outputs'] +
config['process_outputs'] +
config['chemical_outputs'] +
config['taxon_outputs'] +
config['genefamily_outputs'] +
# config['drugchemicalconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs']
)


# List of all the synonym files including DrugChemicalConflated instead of the files it
# duplicates.
def get_all_synonyms_with_drugchemicalconflated(config):
return (
config['anatomy_outputs'] +
config['gene_outputs'] +
config['protein_outputs'] +
config['disease_outputs'] +
config['process_outputs'] +
# config['chemical_outputs'] +
config['taxon_outputs'] +
config['genefamily_outputs'] +
config['drugchemicalconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs']
)
