Skip to content

Commit

Permalink
Merge pull request #1 from Knowledge-Graph-Hub/duckdb
Browse files Browse the repository at this point in the history
Duckdb
  • Loading branch information
hrshdhgd authored Aug 8, 2024
2 parents 203c427 + 9f7e568 commit c899f0e
Show file tree
Hide file tree
Showing 48 changed files with 1,871 additions and 1,758 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,8 @@ dmypy.json

# Pyre type checker
.pyre/
.DS_Store
data/raw/*
tests/data/output/*.tsv
data/merged/*
.tmp/*
29 changes: 29 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

# download-transforms:

# wget "XXX" -O data/raw/merged-kg_nodes.tsv

# merge-kg-microbe-function:
# PWD=$(pwd)
# poetry run kg merge -y $(PWD)/merged_yamls/kg_base_merge.yaml
# poetry run kg duckdb_merge -base-n $(PWD)/data/merged/merged-kg_nodes.tsv -subset-n $(PWD)/data/transformed/nodes.tsv -base-e $(PWD)/data/merged/merged-kg_edges.tsv -subset-e $(PWD)/data/transformed/edges.tsv

# merge-kg-microbe-biomedical:
# PWD=$(pwd)
# poetry run kg merge -y $(PWD)/merged_yamls/kg_biomedical_merge.yaml

# merge-kg-microbe-biomedical-function:
# PWD=$(pwd)
# poetry run kg merge -y $(PWD)/merged_yamls/kg_biomedical_merge.yaml
# poetry run kg duckdb_merge -base-n $(PWD)/data/merged/merged-kg_nodes.tsv -subset-n $(PWD)/data/transformed/nodes.tsv -base-e $(PWD)/data/merged/merged-kg_edges.tsv -subset-e $(PWD)/data/transformed/edges.tsv

# !For testing
# merge-kg-microbe-biomedical-function:
# poetry run kg merge -y merge_yamls/merge.yaml -m duckdb -base-n '/Users/brooksantangelo/Documents/LozuponeLab/FRMS_2024/duckdb/merged-kg_kg-microbe-base/merged-kg_nodes.tsv' -base-e '/Users/brooksantangelo/Documents/LozuponeLab/FRMS_2024/duckdb/merged-kg_kg-microbe-base/merged-kg_edges.tsv' -subset-n '/Users/brooksantangelo/Documents/Repositories/kg-microbe/data/transformed/uniprot_genome_features/nodes.tsv' -subset-e '/Users/brooksantangelo/Documents/Repositories/kg-microbe/data/transformed/uniprot_genome_features/edges.tsv'

# Neither target produces a file named after itself, so declare them phony;
# otherwise a stray file called `datamodel` or `subset-merge` would make the
# target look up to date and silently skip the recipe.
.PHONY: datamodel subset-merge

# Regenerate merge_datamodel.py from merge_schema.yaml using gen-python.
datamodel:
	poetry run gen-python kg_microbe_merge/schema/merge_schema.yaml > kg_microbe_merge/schema/merge_datamodel.py


# Run `kg merge` in duckdb mode (-m duckdb) for the listed sources (-s).
subset-merge:
	poetry run kg merge -m duckdb -s "bacdive, bactotraits, chebi, ncbitaxon"
13 changes: 7 additions & 6 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import sys
from datetime import date
from kg_microbe_merge import __version__

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'kg-microbe-merge'
project = "kg-microbe-merge"
copyright = f"{date.today().year}, Harshad Hegde <[email protected]>"
author = 'Harshad Hegde <[email protected]>'
author = "Harshad Hegde <[email protected]>"
release = __version__

# -- General configuration ---------------------------------------------------
Expand All @@ -25,7 +26,7 @@
"sphinx_rtd_theme",
"sphinx_click",
"sphinx_autodoc_typehints",
"myst_parser"
"myst_parser",
]

# generate autosummary pages
Expand All @@ -46,13 +47,13 @@
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

templates_path = ['_templates']
templates_path = ["_templates"]

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
Expand Down
37 changes: 20 additions & 17 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,28 @@
---

# **** Ontology files ****
#
# ENVO
#
-
url: http://purl.obolibrary.org/obo/envo.json
local_name: envo.json
#
# HP
#
-
url: http://purl.obolibrary.org/obo/hp.json
local_name: hp.json
-
url: git://Knowledge-Graph-Hub/kg-microbe/ontologies.tar.gz
local_name: ontologies.tar.gz

# **** Data sources ****
#
# KG-Microbe [BacDive]
# -
# url: https://github.com/Knowledge-Graph-Hub/kg-microbe/releases/download/2024-07-26-rc2/BacDive.tar.gz
# local_name: kg_microbe_2021-10-01.ttl
#
-
url: git://Knowledge-Graph-Hub/kg-microbe/BacDive.tar.gz
local_name: BacDive.tar.gz

# -
# url:
#
# KG-Microbe [BactoTraits]
#
-
url: git://Knowledge-Graph-Hub/kg-microbe/BactoTraits.tar.gz
local_name: BactoTraits.tar.gz

#
# KG-Microbe [MediaDive]
#
-
url: git://Knowledge-Graph-Hub/kg-microbe/MediaDive.tar.gz
local_name: MediaDive.tar.gz
4 changes: 2 additions & 2 deletions kg_microbe_merge/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""kg-microbe-merge package."""

from importlib import metadata

from .download import download
from .transform_utils import transform

try:
__version__ = metadata.version(__name__)
except metadata.PackageNotFoundError:
# package is not installed
__version__ = "0.0.0" # pragma: no cover

__all__ = ["download", "transform"]
__all__ = ["download"]
30 changes: 30 additions & 0 deletions kg_microbe_merge/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Constants shared by the merge code: table names, column lists and data paths."""

from pathlib import Path

# Table names for the base and subset knowledge graphs.
BASE_NODES_TABLE_NAME = "base_kg_nodes"
SUBSET_NODES_TABLE_NAME = "subset_kg_nodes"
BASE_EDGES_TABLE_NAME = "base_kg_edges"
SUBSET_EDGES_TABLE_NAME = "subset_kg_edges"

# Column names of the nodes table, in order.
NODES_COLUMNS = [
    "id", "name", "description", "category", "xref", "provided_by", "synonym",
    "object", "predicate", "relation", "same_as", "subject", "subsets",
]

# Column names of the edges table, in order.
EDGES_COLUMNS = [
    "subject",
    "predicate",
    "object",
    "relation",
    "primary_knowledge_source",
]

# All data paths hang off the working directory captured when this module is
# first imported, so they depend on where the process was started.
PWD = Path.cwd().resolve()
DATA_DIR = PWD.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
MERGED_DATA_DIR = DATA_DIR.joinpath("merged")
MERGED_GRAPH_STATS_FILE = MERGED_DATA_DIR.joinpath("merged_graph_stats.yaml")
1 change: 1 addition & 0 deletions kg_microbe_merge/download.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Download resources from YAML file."""

from kghub_downloader.download_utils import download_from_yaml # type: ignore


Expand Down
121 changes: 121 additions & 0 deletions kg_microbe_merge/merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Merging module."""

import csv
from pathlib import Path
from typing import Dict, List, Union

import networkx as nx # type: ignore
import yaml
from kgx.cli.cli_utils import merge # type: ignore

from kg_microbe_merge.constants import MERGED_DATA_DIR
from kg_microbe_merge.utils.duckdb_utils import (
duckdb_edges_merge,
duckdb_nodes_merge,
)
from kg_microbe_merge.utils.file_utils import tarball_files_in_dir


def parse_load_config(yaml_file: str) -> Dict:
    """
    Parse load config YAML.

    :param yaml_file: A string pointing to a KGX compatible config YAML.
    :return: Dict: The config as a dictionary (whatever ``yaml.safe_load``
        produces for the document).
    """
    # Decode explicitly as UTF-8 so the parsed config does not depend on the
    # platform's locale encoding (the default when `encoding` is omitted).
    with open(yaml_file, encoding="utf-8") as yamlf:
        return yaml.safe_load(yamlf)


def load_and_merge(yaml_file: str, processes: int = 1) -> nx.MultiDiGraph:
    """
    Merge the sources described by a config YAML and return the result.

    Delegates to the ``merge`` function imported from ``kgx.cli.cli_utils``.

    :param yaml_file: A string pointing to a KGX compatible config YAML.
    :param processes: Number of processes to use.
    :return: networkx.MultiDiGraph: The merged graph.
    """
    return merge(yaml_file, processes=processes)


def duckdb_merge(
nodes_files_path: List[Union[str, Path]],
edges_files_path: List[Union[str, Path]],
merge_nodes_output_path: Union[str, Path],
merged_edges_output_path: Union[str, Path],
nodes_batch_size: int = 100000,
edges_batch_size: int = 2000000,
) -> None:
"""
Merge nodes and edges tables using DuckDB.

Steps, in order:

1. Build a list of priority sources: for every nodes file whose path contains
   the substring "ontologies", at most one ``provided_by`` value is read from
   the TSV and appended to the list.
2. Call ``duckdb_nodes_merge`` with the nodes files, the nodes output path,
   the priority sources and ``nodes_batch_size``.
3. Call ``duckdb_edges_merge`` with the edges files, the edges output path
   and ``edges_batch_size``.
4. Call ``tarball_files_in_dir(MERGED_DATA_DIR, "merged_kg")``. This always
   targets ``MERGED_DATA_DIR``, independent of the output paths passed in.

:param nodes_files_path: List of paths to nodes files.
:param edges_files_path: List of paths to edges files.
:param merge_nodes_output_path: Output path forwarded to ``duckdb_nodes_merge``
    (presumably where the merged nodes are written -- confirm in duckdb_utils).
:param merged_edges_output_path: Output path forwarded to ``duckdb_edges_merge``
    (presumably where the merged edges are written -- confirm in duckdb_utils).
:param nodes_batch_size: Batch size forwarded to ``duckdb_nodes_merge``.
:param edges_batch_size: Batch size forwarded to ``duckdb_edges_merge``.
:return: None
"""
# For all files in the nodes_files_path which has 'ontologies' dir in path,
# get the value of the `provided_by` column in the tsv file and add it to the priority_sources list

priority_sources = []
# The match is a plain substring test on the stringified path, so any path
# containing "ontologies" anywhere qualifies.
ontology_nodes_paths = [
Path(file_path) for file_path in nodes_files_path if "ontologies" in str(file_path)
]
for file_path in ontology_nodes_paths:
# newline="" is the csv-module convention for files handed to a csv reader.
with file_path.open(newline="") as tsvfile:
reader = csv.DictReader(tsvfile, delimiter="\t")
for row in reader:
# row.get() yields None when the file has no `provided_by` column.
provided_by_value = row.get("provided_by")
if provided_by_value:
priority_sources.append(provided_by_value)
break # We only need the value from one row

# Merge nodes
duckdb_nodes_merge(
nodes_files_path, merge_nodes_output_path, priority_sources, nodes_batch_size
)

# Merge edges
duckdb_edges_merge(edges_files_path, merged_edges_output_path, edges_batch_size)

# Tarball all files in a directory
tarball_files_in_dir(MERGED_DATA_DIR, "merged_kg")


# def duckdb_merge(
# base_kg_nodes_file, subset_kg_nodes_file, base_kg_edges_file, subset_kg_edges_file
# ):

# # Connect to DuckDB
# con = duckdb.connect()
# # Merge nodes
# duckdb_prepare_tables(
# con,
# base_kg_nodes_file,
# subset_kg_nodes_file,
# BASE_NODES_TABLE_NAME,
# SUBSET_NODES_TABLE_NAME,
# NODES_COLUMNS,
# )
# merge_kg_nodes,duplicate_nodes = merge_kg_tables(
# con, NODES_COLUMNS, BASE_NODES_TABLE_NAME, SUBSET_NODES_TABLE_NAME, "nodes"
# )
# write_file(con, NODES_COLUMNS, "merge_kg_nodes.tsv", merge_kg_nodes)
# write_file(con, NODES_COLUMNS, "duplicate_kg_nodes.tsv", duplicate_nodes)

# # Merge edges
# duckdb_prepare_tables(
# con,
# base_kg_edges_file,
# subset_kg_edges_file,
# BASE_EDGES_TABLE_NAME,
# SUBSET_EDGES_TABLE_NAME,
# EDGES_COLUMNS,
# )
# merge_kg_edges, duplicate_edges = merge_kg_tables(
# con, EDGES_COLUMNS, BASE_EDGES_TABLE_NAME, SUBSET_EDGES_TABLE_NAME, "edges"
# )
# write_file(con, EDGES_COLUMNS, "merge_kg_edges.tsv", merge_kg_edges)
# write_file(con, EDGES_COLUMNS, "duplicate_kg_edges.tsv", duplicate_edges)
4 changes: 0 additions & 4 deletions kg_microbe_merge/merge_utils/__init__.py

This file was deleted.

31 changes: 0 additions & 31 deletions kg_microbe_merge/merge_utils/merge_kg.py

This file was deleted.

4 changes: 2 additions & 2 deletions kg_microbe_merge/query.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Query module."""

import logging

import yaml
Expand Down Expand Up @@ -49,8 +50,7 @@ def result_dict_to_tsv(result_dict: dict, outfile: str) -> None:
row_items.append(row[col]["value"])
except KeyError:
logging.error(
"Problem retrieving result for col %s in row %s"
% (col, "\t".join(row))
"Problem retrieving result for col %s in row %s" % (col, "\t".join(row))
)
row_items.append("ERROR")
try:
Expand Down
Loading

0 comments on commit c899f0e

Please sign in to comment.