Merge pull request #235 from TranslatorSRI/add-sapbert-export
This PR moves the SAPBERT training data exporter into Babel from [Babel Validation](https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207). It has been added as a new `exporter` and, as with the KGX file, generates SAPBERT training data that is already gzipped to save space on the Babel instance. The PR also includes some minor changes to babel_utils.py and moves the lists of all Snakemake synonym files into snakefiles/util.py.
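For reference, each row of the generated training file is a single pipe-delimited line pairing two synonyms for one preferred CURIE (`biolink-type||preferred ID||preferred label||synonym 1||synonym 2`). Below is a minimal sketch of inspecting a few rows of the gzipped export; the output path is hypothetical, since the actual location is determined by the Snakemake rules:

import gzip

# Hypothetical output path; the real location is set by the Snakemake rules.
with gzip.open("babel_outputs/sapbert-training/GeneCompendium.txt.gz", "rt", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        # Five fields; pipes inside names were collapsed to a single '|' by the exporter,
        # so splitting on the '||' delimiter is safe.
        biolink_type, curie, preferred_label, synonym1, synonym2 = line.rstrip("\n").split("||")
        print(biolink_type, curie, preferred_label, synonym1, synonym2)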
Showing 6 changed files with 175 additions and 14 deletions.
# Sapbert (https://github.com/RENCI-NER/sapbert) requires input files
# in a particular pipe-delimited format:
# biolink:Gene||NCBIGene:10554||AGPAT1||1-acylglycerol-3-phosphate o-acyltransferase 1||lysophosphatidic acid acyltransferase, alpha
# i.e. the format we need is:
# biolink-type||preferred ID||preferred label||synonym 1||synonym 2
# Also, we can't include more than fifty synonym pairs for each preferred ID.
#
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
import gzip
import itertools
import json
import os
import random
import re

import logging
from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)

# Configuration options
# Include up to this many synonym pairs for each preferred ID.
MAX_SYNONYM_PAIRS = 50
# Should we lowercase all the names?
LOWERCASE_ALL_NAMES = True


def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
    """
    Convert a synonyms file to the training format for SAPBERT (https://github.com/RENCI-NER/sapbert).
    Based on the code in https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207

    :param synonym_filename: The synonyms file to convert.
    :param sapbert_filename_gzipped: The gzipped SAPBERT training file to generate.
    """

    logger.info(f"convert_synonyms_to_sapbert({synonym_filename}, {sapbert_filename_gzipped})")

    # Make the output directories if they don't exist.
    os.makedirs(os.path.dirname(sapbert_filename_gzipped), exist_ok=True)

    # Go through all the synonyms in the input file.
    count_entry = 0
    count_training_text = 0
    with open(synonym_filename, "r", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
        for line in synonymf:
            count_entry += 1
            entry = json.loads(line)

            # Read fields from the synonym entry.
            curie = entry['curie']
            preferred_name = entry.get('preferred_name', '')
            if not preferred_name:
                logger.warning(f"Unable to convert synonym entry for curie {curie}, skipping: {entry}")
                continue

            # Collect and process the list of names.
            names = entry['names']
            if LOWERCASE_ALL_NAMES:
                names = [name.lower() for name in names]

            # We use '||' as a delimiter, so any run of two or more pipe characters
            # in a name is collapsed to a single pipe character in the SAPBERT output,
            # so that it can't be confused with our delimiter.
            names = [re.sub(r'\|\|+', '|', name) for name in names]

            # Figure out the Biolink type to report: the first listed type, or
            # NamedThing if no types are listed.
            types = entry['types']
            if len(types) == 0:
                biolink_type = 'NamedThing'
            else:
                biolink_type = types[0]

            # How many names do we have?
            if len(names) == 0:
                # This shouldn't happen, but let's anticipate it anyway by pairing
                # the lowercased preferred name with itself.
                sapbertf.write(
                    f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
                )
                count_training_text += 1
            elif len(names) == 1:
                # If we have fewer than two names, we don't have anything to randomize.
                sapbertf.write(
                    f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{names[0]}\n"
                )
                count_training_text += 1
            else:
                # A set of n distinct names yields n*(n-1)/2 candidate pairs, so cap
                # the number of pairs emitted for any single preferred ID.
                name_pairs = list(itertools.combinations(set(names), 2))

                if len(name_pairs) > MAX_SYNONYM_PAIRS:
                    # Randomly select MAX_SYNONYM_PAIRS pairs.
                    name_pairs = random.sample(name_pairs, MAX_SYNONYM_PAIRS)

                for name_pair in name_pairs:
                    sapbertf.write(f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n")
                    count_training_text += 1

    logger.info(f"Converted {synonym_filename} to SAPBERT training file {sapbert_filename_gzipped}: " +
                f"read {count_entry} entries and wrote out {count_training_text} training rows.")