From c9d4265acdfed24b1c02f55c477414651e4cb1b5 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Mon, 26 Aug 2024 18:08:35 -0500 Subject: [PATCH] Added `-l`/`--merge-label` to distinguish subset merges --- kg_microbe_merge/merge.py | 9 ++++++--- kg_microbe_merge/run.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/kg_microbe_merge/merge.py b/kg_microbe_merge/merge.py index bfc492e..167bb13 100644 --- a/kg_microbe_merge/merge.py +++ b/kg_microbe_merge/merge.py @@ -63,7 +63,7 @@ def load_and_merge(yaml_file: str, processes: int = 1) -> nx.MultiDiGraph: def duckdb_merge( nodes_files_path: List[Union[str, Path]], edges_files_path: List[Union[str, Path]], - merge_nodes_output_path: Union[str, Path], + merged_nodes_output_path: Union[str, Path], merged_edges_output_path: Union[str, Path], nodes_batch_size: int = 100000, edges_batch_size: int = 2000000, @@ -91,13 +91,16 @@ def duckdb_merge( priority_sources.append(provided_by_value) break # We only need the value from one row + os.makedirs(os.path.dirname(merged_nodes_output_path), exist_ok=True) + # Merge nodes duckdb_nodes_merge( - nodes_files_path, merge_nodes_output_path, priority_sources, nodes_batch_size + nodes_files_path, merged_nodes_output_path, priority_sources, nodes_batch_size ) # Merge edges duckdb_edges_merge(edges_files_path, merged_edges_output_path, edges_batch_size) # Tarball all files in a directory - tarball_files_in_dir(MERGED_DATA_DIR, "merged_kg") + tarball_name = str(merged_nodes_output_path).split("/")[-2] + tarball_files_in_dir(MERGED_DATA_DIR / tarball_name, tarball_name) diff --git a/kg_microbe_merge/run.py b/kg_microbe_merge/run.py index 4dba149..0e6eafb 100644 --- a/kg_microbe_merge/run.py +++ b/kg_microbe_merge/run.py @@ -95,6 +95,7 @@ def download(*args, **kwargs) -> None: @click.option("--merge-tool", "-m", default="kgx", type=click.Choice(["kgx", "duckdb"])) @click.option("--data-dir", "-d", type=click.Path(exists=True), default=RAW_DATA_DIR) @click.option("--subset-transforms", "-s", multiple=True) +@click.option("--merge-label", "-l", default="merged-kg") @click.option("--nodes-batch-size", "-n", type=int, default=100000) @click.option("--edges-batch-size", "-e", type=int, default=2000000) def merge( @@ -103,6 +104,7 @@ def merge( merge_tool: str, data_dir: str, subset_transforms: tuple, + merge_label: str, nodes_batch_size: int, edges_batch_size: int, ) -> None: @@ -136,12 +138,18 @@ def merge( merge_kg_object.merged_graph = merged_graph_object if merge_tool == "duckdb": + if merge_label: + merged_nodes_output_path = MERGED_DATA_DIR / merge_label / "nodes.tsv" + merged_edges_output_path = MERGED_DATA_DIR / merge_label / "edges.tsv" + else: + merged_nodes_output_path = MERGED_DATA_DIR / "nodes.tsv" + merged_edges_output_path = MERGED_DATA_DIR / "edges.tsv" duckdb_merge( node_paths, edge_paths, - MERGED_DATA_DIR / "nodes.tsv", - MERGED_DATA_DIR / "edges.tsv", + merged_nodes_output_path, + merged_edges_output_path, nodes_batch_size, edges_batch_size, )