Merge pull request #206 from TranslatorSRI/add-kgx-conversion
This PR moves the KGX exporter from NodeNorm (specifically, https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208) into this repo and sets up Snakemake to generate the KGX files with every Babel run. Closes #61. Closes TranslatorSRI/NodeNormalization#95.
7 changed files with 221 additions and 3 deletions.
Changed file: a Kubernetes PersistentVolumeClaim spec, bumping the storage request from 400Gi to 500Gi.
@@ -15,5 +15,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 400Gi
+      storage: 500Gi
   storageClassName: basic
New file: src/exporters/kgx.py, the KGX converter moved over from NodeNorm.
@@ -0,0 +1,164 @@
# Once we generate the compendium files, we need to convert them into the
# Knowledge Graph Exchange (KGX, https://github.com/biolink/kgx) format.
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

import hashlib
import json
import os
from itertools import combinations

import logging
from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)


def convert_compendium_to_kgx(compendium_filename, kgx_nodes_filename, kgx_edges_filename):
    """
    Convert a compendium file to KGX (https://github.com/biolink/kgx) format.
    Based on the code in https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

    :param compendium_filename: The compendium file to convert.
    :param kgx_nodes_filename: The KGX nodes file to write out.
    :param kgx_edges_filename: The KGX edges file to write out.
    """

    logger.info(f"convert_compendium_to_kgx({compendium_filename}, {kgx_nodes_filename}, {kgx_edges_filename})")

    # Set up data structures.
    nodes: list = []
    edges: list = []
    pass_nodes: list = []

    count_lines = 0
    count_nodes = 0
    count_edges = 0

    # Used to count batches of 10000 lines to process together.
    batch_size = 10000
    line_counter = 0

    # Make the output directories if they don't exist.
    os.makedirs(os.path.dirname(kgx_nodes_filename), exist_ok=True)
    os.makedirs(os.path.dirname(kgx_edges_filename), exist_ok=True)

    # Open the compendium file for reading.
    with open(compendium_filename, "r", encoding="utf-8") as compendium:
        # Open the nodes and edges files for writing.
        with \
                open(kgx_nodes_filename, "w", encoding="utf-8") as node_file, \
                open(kgx_edges_filename, "w", encoding="utf-8") as edge_file:

            # Set the flag for suppressing the leading "\n" before the first batch of written data.
            first = True

            # At this point we should validate the compendium file, but the report
            # has already run, so hopefully it's already validated?

            # For each line (one clique) in the file:
            for line in compendium:
                # Increment the record counter.
                line_counter += 1

                # Clear storage for this pass.
                pass_nodes.clear()

                # Load the line into memory.
                instance: dict = json.loads(line)

                # All IDs (even the root one) are in the equivalent identifiers.
                if len(instance["identifiers"]) > 0:
                    # Loop through each identifier and create a node.
                    for equiv_id in instance["identifiers"]:
                        # Check to see if there is a label; if there is, use it.
                        if "l" in equiv_id:
                            name = equiv_id["l"]
                        else:
                            name = ""

                        # Add the node to the ones in this pass.
                        pass_nodes.append(
                            {
                                "id": equiv_id["i"],
                                "name": name,
                                "category": instance["type"],
                                "equivalent_identifiers": list(x["i"] for x in instance["identifiers"]),
                            }
                        )

                    # Get the pairwise combinations of the nodes in this pass.
                    combos = combinations(pass_nodes, 2)

                    # For all the node combinations, create an edge between them.
                    for c in combos:
                        # Create a unique ID.
                        record_id: str = c[0]["id"] + c[1]["id"] + f"{compendium_filename}"

                        # Save the edge.
                        edges.append(
                            {
                                "id": f'{hashlib.md5(record_id.encode("utf-8")).hexdigest()}',
                                "subject": c[0]["id"],
                                "predicate": "biolink:same_as",
                                "object": c[1]["id"],
                            }
                        )

                # Save the nodes in this pass to the big list.
                nodes.extend(pass_nodes)

                # Did we reach the write threshold?
                if line_counter == batch_size:
                    # The first batch written out doesn't get a leading newline.
                    if first:
                        prefix = ""
                    else:
                        prefix = "\n"

                    # Reset the first-batch flag.
                    first = False

                    # Get all the nodes into a string and write them out.
                    nodes_to_write = prefix + "\n".join([json.dumps(node) for node in nodes])
                    node_file.write(nodes_to_write)
                    count_nodes += len(nodes)

                    # Are there any edges to output?
                    if len(edges) > 0:
                        # Get all the edges into a string and write them out.
                        edges_to_write = prefix + "\n".join([json.dumps(edge) for edge in edges])
                        edge_file.write(edges_to_write)
                        count_edges += len(edges)

                    # Reset for the next group.
                    nodes.clear()
                    edges.clear()

                    # Count total lines.
                    count_lines += line_counter
                    logger.info(f"Processed {count_lines} lines from {compendium_filename}")

                    # Reset the line counter for the next group.
                    line_counter = 0

            # Pick up any remainders in the file. As with the batches above, only
            # prefix a newline if an earlier batch has already been written out.
            if len(nodes) > 0:
                nodes_to_write = ("" if first else "\n") + "\n".join([json.dumps(node) for node in nodes])
                node_file.write(nodes_to_write)
                count_nodes += len(nodes)

            if len(edges) > 0:
                edges_to_write = ("" if first else "\n") + "\n".join([json.dumps(edge) for edge in edges])
                edge_file.write(edges_to_write)
                count_edges += len(edges)

    # Count total lines.
    count_lines += line_counter
    logger.info(f"Processed a total of {count_lines} lines from {compendium_filename}")

    logger.info(f"Converted {compendium_filename} to KGX: " +
                f"wrote {count_nodes} nodes to {kgx_nodes_filename} and " +
                f"wrote {count_edges} edges to {kgx_edges_filename}.")
New file: the Snakemake rules driving the KGX export.

@@ -0,0 +1,32 @@
from src.snakefiles.util import get_all_compendia
import src.exporters.kgx as kgx
import os

### Export compendia/synonyms into downstream outputs

# Export all compendia to KGX, then create `babel_outputs/kgx/done` to signal that we're done.
rule export_all_to_kgx:
    input:
        nodes_files=expand("{od}/kgx/{fn}",
            od=config['output_directory'],
            fn=map(lambda fn: os.path.splitext(fn)[0] + '_nodes.jsonl', get_all_compendia(config))
        ),
        edges_files=expand("{od}/kgx/{fn}",
            od=config['output_directory'],
            fn=map(lambda fn: os.path.splitext(fn)[0] + '_edges.jsonl', get_all_compendia(config))
        )
    output:
        x = config['output_directory'] + '/kgx/done',
    shell:
        "echo 'done' >> {output.x}"


# Generic rule for generating the KGX files for a particular compendium file.
rule generate_kgx:
    input:
        compendium_file=config['output_directory'] + "/compendia/{filename}.txt",
    output:
        nodes_file=config['output_directory'] + "/kgx/{filename}_nodes.jsonl",
        edges_file=config['output_directory'] + "/kgx/{filename}_edges.jsonl",
    run:
        kgx.convert_compendium_to_kgx(input.compendium_file, output.nodes_file, output.edges_file)
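Assuming a standard Snakemake invocation for this repo (the target is the rule name above; the core count is arbitrary), the whole export can presumably be kicked off with:

snakemake --cores 4 export_all_to_kgx

Snakemake expands the rule's inputs into one {filename}_nodes.jsonl / {filename}_edges.jsonl pair per compendium, and builds each missing pair via the generate_kgx wildcard rule, which calls convert_compendium_to_kgx on the matching compendia/{filename}.txt.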
New file: src/snakefiles/util.py.

@@ -0,0 +1,14 @@
# Shared code used by Snakemake files

# Returns a list of all the compendia files that need to be converted.
def get_all_compendia(config):
    return (config['anatomy_outputs'] +
            config['chemical_outputs'] +
            config['disease_outputs'] +
            config['gene_outputs'] +
            config['genefamily_outputs'] +
            config['process_outputs'] +
            config['protein_outputs'] +
            config['taxon_outputs'] +
            config['umls_outputs'] +
            config['macromolecularcomplex_outputs'])
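get_all_compendia simply concatenates the ten per-category filename lists from the Babel config, which is what feeds the expand() calls in export_all_to_kgx above. A quick sketch with a made-up config (only two lists populated, and the filenames are invented; a real config fills in all ten keys), with get_all_compendia defined as above:

# Hypothetical, trimmed-down config: every *_outputs key present, most left empty.
config = {key: [] for key in (
    'anatomy_outputs', 'chemical_outputs', 'disease_outputs', 'gene_outputs',
    'genefamily_outputs', 'process_outputs', 'protein_outputs', 'taxon_outputs',
    'umls_outputs', 'macromolecularcomplex_outputs')}
config['anatomy_outputs'] = ['AnatomicalEntity.txt']  # hypothetical filename
config['disease_outputs'] = ['Disease.txt']           # hypothetical filename

print(get_all_compendia(config))
# ['AnatomicalEntity.txt', 'Disease.txt']

With that config, export_all_to_kgx would request kgx/AnatomicalEntity_nodes.jsonl, kgx/AnatomicalEntity_edges.jsonl, kgx/Disease_nodes.jsonl, and kgx/Disease_edges.jsonl under the configured output directory.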