Merge pull request #235 from TranslatorSRI/add-sapbert-export
This PR moves the SAPBERT training data exporter into Babel from [Babel Validation](https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207). It has been added as a new `exporter` and, as with the KGX files, the SAPBERT training data it generates is gzipped on write to save space on the Babel instance.

It also includes some minor changes to babel_utils.py and moves the lists of all Snakemake synonym files into src/snakefiles/util.py.
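
For reference, a minimal sketch of invoking the new exporter directly (the file paths here are hypothetical; in practice the Snakemake rules below derive the real ones from config['output_directory']):

    from src.exporters.sapbert import convert_synonyms_to_sapbert

    # Hypothetical paths -- the Snakemake rules supply the real ones from the pipeline config.
    convert_synonyms_to_sapbert(
        "babel_outputs/synonyms/Disease.txt",
        "babel_outputs/sapbert-training-data/Disease.txt.gz",
    )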
gaurav authored Feb 16, 2024
2 parents 5403314 + 330e9b3 commit 5def2b8
Showing 6 changed files with 175 additions and 14 deletions.
1 change: 1 addition & 0 deletions Snakefile
@@ -26,6 +26,7 @@ rule all:

# Build all the exports.
config['output_directory'] + '/kgx/done',
config['output_directory'] + '/sapbert-training-data/done',
output:
x = config['output_directory'] + '/reports/all_done'
shell:
9 changes: 6 additions & 3 deletions src/babel_utils.py
@@ -424,7 +424,10 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
if filtered_possible_labels:
document["preferred_name"] = filtered_possible_labels[0]
else:
logging.debug(f"No preferred name for {node}")
logging.debug(
f"No preferred name for {node}, probably because all names were filtered out. Skipping."
)
continue

# We previously used the shortest length of a name as a proxy for how good a match it is, i.e. given
# two concepts that both have the word "acetaminophen" in them, we assume that the shorter one is the
@@ -618,7 +621,7 @@ def clean_sets(result_dict):
"""The keys for this are unique and unmergable: Don't merge GO!
But there are values that are showing up in multiple GOs (could be
MetaCycs or RHEAs or Reactomes). It's just how GO is mapping. Now,
the right answer here is probably to kboom this whole mess. But
the right answer here is probably to kboom this whole mess. But
for prototype, we're just going to filter out garbage merge values).
Note that this isn't limited to GO. Even MONDO include some #exactMatch
to the same MESH from two different MONDO ids"""
@@ -705,4 +708,4 @@ def norm(x,op):
pref = Text.get_curie(x)
if pref in op:
return Text.recurie(x,op[pref])
return x
104 changes: 104 additions & 0 deletions src/exporters/sapbert.py
@@ -0,0 +1,104 @@
# Sapbert (https://github.com/RENCI-NER/sapbert) requires input files
# in a particular pipe-delimited format:
# biolink:Gene||NCBIGene:10554||AGPAT1||1-acylglycerol-3-phosphate o-acyltransferase 1||lysophosphatidic acid acyltransferase, alpha
# i.e. the format we need is:
# biolink-type||preferred ID||preferred label||synonym 1||synonym 2
# Also, we include at most fifty synonym pairs for each preferred ID.
#
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
import gzip
import itertools
import json
import os
import random
import re

import logging
from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)

# Configuration options
# Include up to 50 synonym pairs for each preferred ID.
MAX_SYNONYM_PAIRS = 50
# Should we lowercase all the names?
LOWERCASE_ALL_NAMES = True


def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
"""
Convert a synonyms file to the training format for SAPBERT (https://github.com/RENCI-NER/sapbert).
Based on the code in https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
:param synonym_filename: The compendium file to convert.
:param sapbert_filename_gzipped: The SAPBERT training file to generate.
"""

logger.info(f"convert_synonyms_to_sapbert({synonym_filename}, {sapbert_filename_gzipped})")

# Make the output directories if they don't exist.
os.makedirs(os.path.dirname(sapbert_filename_gzipped), exist_ok=True)

# Go through all the synonyms in the input file.
count_entry = 0
count_training_text = 0
with open(synonym_filename, "r", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
for line in synonymf:
count_entry += 1
entry = json.loads(line)

# Read fields from the synonym.
curie = entry['curie']
preferred_name = entry.get('preferred_name', '')
if not preferred_name:
logging.warning(f"Unable to convert synonym entry for curie {curie}, skipping: {entry}")
continue

# Collect and process the list of names.
names = entry['names']
if LOWERCASE_ALL_NAMES:
names = [name.lower() for name in names]

# We use '||' as a delimiter, so any run of two or more pipe characters
# in a name is collapsed to a single pipe character in the SAPBERT output,
# so that it can't be confused with our delimiter.
names = [re.sub(r'\|\|+', '|', name) for name in names]

# Figure out the Biolink type to report.
types = entry['types']
if len(types) == 0:
biolink_type = 'NamedThing'
else:
biolink_type = types[0]

# How many names do we have?
if len(names) == 0:
# This shouldn't happen, but handle it gracefully anyway.
sapbertf.write(
f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
)
count_training_text += 1
elif len(names) == 1:
# With only one name there are no pairs to sample, so pair it with the preferred name.
sapbertf.write(
f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{names[0]}\n"
)
count_training_text += 1
else:
name_pairs = list(itertools.combinations(set(names), 2))

if len(name_pairs) > MAX_SYNONYM_PAIRS:
# Randomly select MAX_SYNONYM_PAIRS pairs.
name_pairs = random.sample(name_pairs, MAX_SYNONYM_PAIRS)

for name_pair in name_pairs:
sapbertf.write(f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n")
count_training_text += 1

logger.info(f"Converted {synonym_filename} to SAPBERT training file {synonym_filename}: " +
f"read {count_entry} entries and wrote out {count_training_text} training rows.")
26 changes: 25 additions & 1 deletion src/snakefiles/exports.snakefile
@@ -1,5 +1,6 @@
from src.snakefiles.util import get_all_compendia
from src.snakefiles.util import get_all_compendia, get_all_synonyms_with_drugchemicalconflated
import src.exporters.kgx as kgx
import src.exporters.sapbert as sapbert
import os

### Export compendia/synonyms into downstream outputs
@@ -30,3 +31,26 @@ rule generate_kgx:
edges_file=config['output_directory'] + "/kgx/{filename}_edges.jsonl.gz",
run:
kgx.convert_compendium_to_kgx(input.compendium_file, output.nodes_file, output.edges_file)


# Export all synonym files to SAPBERT export, then create `babel_outputs/sapbert-training-data/done` to signal that we're done.
rule export_all_to_sapbert_training:
input:
sapbert_training_file=expand("{od}/sapbert-training-data/{fn}.gz",
od=config['output_directory'],
fn=get_all_synonyms_with_drugchemicalconflated(config)
)
output:
x = config['output_directory'] + '/sapbert-training-data/done',
shell:
"echo 'done' >> {output.x}"


# Generic rule for generating the SAPBERT training data for a particular synonym file.
rule generate_sapbert_training_data:
input:
synonym_file=config['output_directory'] + "/synonyms/{filename}",
output:
sapbert_training_data_file=config['output_directory'] + "/sapbert-training-data/{filename}.gz",
run:
sapbert.convert_synonyms_to_sapbert(input.synonym_file, output.sapbert_training_data_file)
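
Each file the rule above generates is gzipped text with one '||'-delimited training row per line, so a downstream consumer might read it back like this (a sketch; the path is hypothetical):

    import gzip

    with gzip.open("babel_outputs/sapbert-training-data/Disease.txt.gz", "rt", encoding="utf-8") as f:
        for line in f:
            # Five fields: biolink type, preferred CURIE, preferred label, and a synonym pair.
            biolink_type, curie, preferred_name, name1, name2 = line.rstrip("\n").split("||")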
12 changes: 3 additions & 9 deletions src/snakefiles/reports.snakefile
@@ -1,3 +1,4 @@
from src.snakefiles.util import get_all_compendia, get_all_synonyms_with_drugchemicalconflated
import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
@@ -9,17 +10,10 @@ synonyms_path = config['output_directory'] + '/synonyms'
conflations_path = config['output_directory'] + '/conflation'

# Expected compendia files.
compendia_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
config['disease_outputs'] + config['process_outputs'] + \
config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
config['umls_outputs'] + config['macromolecularcomplex_outputs']
compendia_files = get_all_compendia(config)

# Expected synonym files.
synonyms_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
config['disease_outputs'] + config['process_outputs'] + \
config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
config['drugchemicalconflated_synonym_outputs'] + \
config['umls_outputs'] + config['macromolecularcomplex_outputs']
synonyms_files = get_all_synonyms_with_drugchemicalconflated(config)

# Expected conflation files.
conflation_files = config['geneprotein_outputs'] + config['drugchemical_outputs']
37 changes: 36 additions & 1 deletion src/snakefiles/util.py
@@ -11,4 +11,39 @@ def get_all_compendia(config):
config['protein_outputs'] +
config['taxon_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs'])


# List of all the synonym files, except DrugChemicalConflated.
def get_synonyms(config):
return (
config['anatomy_outputs'] +
config['gene_outputs'] +
config['protein_outputs'] +
config['disease_outputs'] +
config['process_outputs'] +
config['chemical_outputs'] +
config['taxon_outputs'] +
config['genefamily_outputs'] +
# config['drugchemicalconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs']
)


# List of all the synonym files including DrugChemicalConflated instead of the files it
# duplicates.
def get_all_synonyms_with_drugchemicalconflated(config):
return (
config['anatomy_outputs'] +
config['gene_outputs'] +
config['protein_outputs'] +
config['disease_outputs'] +
config['process_outputs'] +
# config['chemical_outputs'] +
config['taxon_outputs'] +
config['genefamily_outputs'] +
config['drugchemicalconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs']
)
