-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from Knowledge-Graph-Hub/duckdb
Duckdb
- Loading branch information
Showing
48 changed files
with
1,871 additions
and
1,758 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,3 +127,8 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
.DS_Store | ||
data/raw/* | ||
tests/data/output/*.tsv | ||
data/merged/* | ||
.tmp/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
# download-transforms: | ||
|
||
# wget "XXX" -O data/raw/merged-kg_nodes.tsv | ||
|
||
# merge-kg-microbe-function: | ||
# PWD=$(pwd) | ||
# poetry run kg merge -y $(PWD)/merged_yamls/kg_base_merge.yaml | ||
# poetry run kg duckdb_merge -base-n $(PWD)/data/merged/merged-kg_nodes.tsv -subset-n $(PWD)/data/transformed/nodes.tsv -base-e $(PWD)/data/merged/merged-kg_edges.tsv -subset-e $(PWD)/data/transformed/edges.tsv | ||
|
||
# merge-kg-microbe-biomedical: | ||
# PWD=$(pwd) | ||
# poetry run kg merge -y $(PWD)/merged_yamls/kg_biomedical_merge.yaml | ||
|
||
# merge-kg-microbe-biomedical-function: | ||
# PWD=$(pwd) | ||
# poetry run kg merge -y $(PWD)/merged_yamls/kg_biomedical_merge.yaml | ||
# poetry run kg duckdb_merge -base-n $(PWD)/data/merged/merged-kg_nodes.tsv -subset-n $(PWD)/data/transformed/nodes.tsv -base-e $(PWD)/data/merged/merged-kg_edges.tsv -subset-e $(PWD)/data/transformed/edges.tsv | ||
|
||
# !For testing | ||
# merge-kg-microbe-biomedical-function: | ||
# poetry run kg merge -y merge_yamls/merge.yaml -m duckdb -base-n '/Users/brooksantangelo/Documents/LozuponeLab/FRMS_2024/duckdb/merged-kg_kg-microbe-base/merged-kg_nodes.tsv' -base-e '/Users/brooksantangelo/Documents/LozuponeLab/FRMS_2024/duckdb/merged-kg_kg-microbe-base/merged-kg_edges.tsv' -subset-n '/Users/brooksantangelo/Documents/Repositories/kg-microbe/data/transformed/uniprot_genome_features/nodes.tsv' -subset-e '/Users/brooksantangelo/Documents/Repositories/kg-microbe/data/transformed/uniprot_genome_features/edges.tsv' | ||
|
||
# Regenerate the Python datamodel from the merge schema.
# Declared phony because the target name is not a file this rule produces.
.PHONY: datamodel
datamodel:
	poetry run gen-python kg_microbe_merge/schema/merge_schema.yaml > kg_microbe_merge/schema/merge_datamodel.py
|
||
|
||
# Run the DuckDB-based merge over the listed subset of sources.
# Declared phony because the target name is not a file this rule produces.
.PHONY: subset-merge
subset-merge:
	poetry run kg merge -m duckdb -s "bacdive, bactotraits, chebi, ncbitaxon"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,12 +8,13 @@ | |
import sys | ||
from datetime import date | ||
from kg_microbe_merge import __version__ | ||
|
||
# -- Project information ----------------------------------------------------- | ||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information | ||
|
||
project = 'kg-microbe-merge' | ||
project = "kg-microbe-merge" | ||
copyright = f"{date.today().year}, Harshad Hegde <[email protected]>" | ||
author = 'Harshad Hegde <[email protected]>' | ||
author = "Harshad Hegde <[email protected]>" | ||
release = __version__ | ||
|
||
# -- General configuration --------------------------------------------------- | ||
|
@@ -25,7 +26,7 @@ | |
"sphinx_rtd_theme", | ||
"sphinx_click", | ||
"sphinx_autodoc_typehints", | ||
"myst_parser" | ||
"myst_parser", | ||
] | ||
|
||
# generate autosummary pages | ||
|
@@ -46,13 +47,13 @@ | |
# This pattern also affects html_static_path and html_extra_path. | ||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] | ||
|
||
templates_path = ['_templates'] | ||
templates_path = ["_templates"] | ||
|
||
# -- Options for HTML output ------------------------------------------------- | ||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output | ||
|
||
html_theme = 'sphinx_rtd_theme' | ||
html_static_path = ['_static'] | ||
html_theme = "sphinx_rtd_theme" | ||
html_static_path = ["_static"] | ||
|
||
# The name of an image file (relative to this directory) to place at the top | ||
# of the sidebar. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
"""kg-microbe-merge package.""" | ||
|
||
from importlib import metadata | ||
|
||
from .download import download | ||
from .transform_utils import transform | ||
|
||
# Resolve the installed distribution's version; fall back to a placeholder
# when the package metadata cannot be found (e.g. running from a checkout).
try:
    __version__ = metadata.version(__name__)
except metadata.PackageNotFoundError:
    # package is not installed
    __version__ = "0.0.0"  # pragma: no cover
|
||
__all__ = ["download", "transform"] | ||
__all__ = ["download"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
"""Constants for merge_utilities.""" | ||
|
||
from pathlib import Path

# Table names used for the base and subset node/edge tables.
BASE_NODES_TABLE_NAME = "base_kg_nodes"
SUBSET_NODES_TABLE_NAME = "subset_kg_nodes"
BASE_EDGES_TABLE_NAME = "base_kg_edges"
SUBSET_EDGES_TABLE_NAME = "subset_kg_edges"

# Column names for node tables, in order.
NODES_COLUMNS = [
    "id",
    "name",
    "description",
    "category",
    "xref",
    "provided_by",
    "synonym",
    "object",
    "predicate",
    "relation",
    "same_as",
    "subject",
    "subsets",
]

# Column names for edge tables, in order.
EDGES_COLUMNS = ["subject", "predicate", "object", "relation", "primary_knowledge_source"]

# All paths below are anchored to the working directory at import time,
# so they depend on where the process was started.
PWD = Path.cwd().resolve()
DATA_DIR = PWD / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
MERGED_DATA_DIR = DATA_DIR / "merged"
MERGED_GRAPH_STATS_FILE = MERGED_DATA_DIR / "merged_graph_stats.yaml"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
"""Merging module.""" | ||
|
||
import csv | ||
from pathlib import Path | ||
from typing import Dict, List, Union | ||
|
||
import networkx as nx # type: ignore | ||
import yaml | ||
from kgx.cli.cli_utils import merge # type: ignore | ||
|
||
from kg_microbe_merge.constants import MERGED_DATA_DIR | ||
from kg_microbe_merge.utils.duckdb_utils import ( | ||
duckdb_edges_merge, | ||
duckdb_nodes_merge, | ||
) | ||
from kg_microbe_merge.utils.file_utils import tarball_files_in_dir | ||
|
||
|
||
def parse_load_config(yaml_file: str) -> Dict:
    """
    Parse load config YAML.

    :param yaml_file: A string pointing to a KGX compatible config YAML.
    :return: Dict: The config as a dictionary.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding.
    with open(yaml_file, encoding="utf-8") as yamlf:
        # safe_load is used deliberately: it only constructs plain data types.
        config = yaml.safe_load(yamlf)
    return config
|
||
|
||
def load_and_merge(yaml_file: str, processes: int = 1) -> nx.MultiDiGraph:
    """
    Load and merge sources defined in the config YAML.

    Thin wrapper that forwards both arguments to the KGX ``merge`` helper.

    :param yaml_file: A string pointing to a KGX compatible config YAML.
    :param processes: Number of processes to use.
    :return: networkx.MultiDiGraph: The merged graph.
    """
    return merge(yaml_file, processes=processes)
|
||
|
||
def duckdb_merge(
    nodes_files_path: List[Union[str, Path]],
    edges_files_path: List[Union[str, Path]],
    merge_nodes_output_path: Union[str, Path],
    merged_edges_output_path: Union[str, Path],
    nodes_batch_size: int = 100000,
    edges_batch_size: int = 2000000,
) -> None:
    """
    Merge nodes and edges tables using DuckDB.

    :param nodes_files_path: List of paths to nodes files.
    :param edges_files_path: List of paths to edges files.
    :param merge_nodes_output_path: Output path passed to ``duckdb_nodes_merge``.
    :param merged_edges_output_path: Output path passed to ``duckdb_edges_merge``.
    :param nodes_batch_size: Batch size forwarded to ``duckdb_nodes_merge``.
    :param edges_batch_size: Batch size forwarded to ``duckdb_edges_merge``.
    :return: None
    """
    # For every nodes file whose path contains 'ontologies', take one
    # `provided_by` value from the TSV and record it as a priority source.
    priority_sources = []
    ontology_nodes_paths = [
        Path(file_path) for file_path in nodes_files_path if "ontologies" in str(file_path)
    ]
    for file_path in ontology_nodes_paths:
        with file_path.open(newline="") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t")
            for row in reader:
                provided_by_value = row.get("provided_by")
                if provided_by_value:
                    priority_sources.append(provided_by_value)
                    break  # We only need the value from one row

    # Merge nodes
    duckdb_nodes_merge(
        nodes_files_path, merge_nodes_output_path, priority_sources, nodes_batch_size
    )

    # Merge edges
    duckdb_edges_merge(edges_files_path, merged_edges_output_path, edges_batch_size)

    # Tarball all files in the merged-data directory.
    # NOTE(review): this always targets MERGED_DATA_DIR, regardless of the
    # output paths passed in above — confirm that is intended.
    tarball_files_in_dir(MERGED_DATA_DIR, "merged_kg")
|
||
|
||
# def duckdb_merge( | ||
# base_kg_nodes_file, subset_kg_nodes_file, base_kg_edges_file, subset_kg_edges_file | ||
# ): | ||
|
||
# # Connect to DuckDB | ||
# con = duckdb.connect() | ||
# Merge nodes | ||
# duckdb_prepare_tables( | ||
# con, | ||
# base_kg_nodes_file, | ||
# subset_kg_nodes_file, | ||
# BASE_NODES_TABLE_NAME, | ||
# SUBSET_NODES_TABLE_NAME, | ||
# NODES_COLUMNS, | ||
# ) | ||
# merge_kg_nodes,duplicate_nodes = merge_kg_tables( | ||
# con, NODES_COLUMNS, BASE_NODES_TABLE_NAME, SUBSET_NODES_TABLE_NAME, "nodes" | ||
# ) | ||
# write_file(con, NODES_COLUMNS, "merge_kg_nodes.tsv", merge_kg_nodes) | ||
# write_file(con, NODES_COLUMNS, "duplicate_kg_nodes.tsv", duplicate_nodes) | ||
|
||
# # Merge edges | ||
# duckdb_prepare_tables( | ||
# con, | ||
# base_kg_edges_file, | ||
# subset_kg_edges_file, | ||
# BASE_EDGES_TABLE_NAME, | ||
# SUBSET_EDGES_TABLE_NAME, | ||
# EDGES_COLUMNS, | ||
# ) | ||
# merge_kg_edges, duplicate_edges = merge_kg_tables( | ||
# con, EDGES_COLUMNS, BASE_EDGES_TABLE_NAME, SUBSET_EDGES_TABLE_NAME, "edges" | ||
# ) | ||
# write_file(con, EDGES_COLUMNS, "merge_kg_edges.tsv", merge_kg_edges) | ||
# write_file(con, EDGES_COLUMNS, "duplicate_kg_edges.tsv", duplicate_edges) |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.