Skip to content
This repository has been archived by the owner on Oct 27, 2023. It is now read-only.

Add transitivity function in pandas_utils #7

Merged
merged 4 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24,392 changes: 24,392 additions & 0 deletions data/transformed/MediaDive/edges.tsv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions kg_bacdive/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List, Optional

from kg_bacdive.transform_utils.bacdive.bacdive import BacDiveTransform
from kg_bacdive.transform_utils.mediadive.mediadive import MediaDiveDiveTransform
from kg_bacdive.transform_utils.mediadive.mediadive import MediaDiveTransform
from kg_bacdive.transform_utils.ontology.ontology_transform import ONTOLOGIES, OntologyTransform
from kg_bacdive.transform_utils.traits.traits import TraitsTransform

Expand All @@ -19,7 +19,7 @@
# "ProteinAtlasTransform": ProteinAtlasTransform,
# "STRINGTransform": STRINGTransform,
"BacDiveTransform": BacDiveTransform,
"MediaDiveDiveTransform": MediaDiveDiveTransform,
"MediaDiveTransform": MediaDiveTransform,
"TraitsTransform": TraitsTransform,
}

Expand Down
2 changes: 1 addition & 1 deletion kg_bacdive/transform_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
CHEBI_PREFIX = "CHEBI:"
CAS_RN_PREFIX = "CAS-RN:"
PUBCHEM_PREFIX = "PubChem:"
MEDIADIVE_COMPOUND_PREFIX = "mediadive.ingredient:"
MEDIADIVE_INGREDIENT_PREFIX = "mediadive.ingredient:"
MEDIADIVE_SOLUTION_PREFIX = "mediadive.solution:"
MEDIADIVE_MEDIUM_PREFIX = "mediadive.medium:"
GO_PREFIX = "GO:"
Expand Down
4 changes: 2 additions & 2 deletions kg_bacdive/transform_utils/mediadive/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""BacDive transform."""
from .mediadive import MediaDiveDiveTransform
from .mediadive import MediaDiveTransform

__all__ = ["MediaDiveDiveTransform"]
__all__ = ["MediaDiveTransform"]
16 changes: 11 additions & 5 deletions kg_bacdive/transform_utils/mediadive/mediadive.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@
KEGG_KEY,
KEGG_PREFIX,
MEDIADIVE_COMPLEX_MEDIUM_COLUMN,
MEDIADIVE_COMPOUND_PREFIX,
MEDIADIVE_DESC_COLUMN,
MEDIADIVE_ID_COLUMN,
MEDIADIVE_INGREDIENT_PREFIX,
MEDIADIVE_LINK_COLUMN,
MEDIADIVE_MAX_PH_COLUMN,
MEDIADIVE_MEDIUM_PREFIX,
Expand All @@ -72,10 +72,10 @@
SOLUTIONS_KEY,
)
from kg_bacdive.transform_utils.transform import Transform
from kg_bacdive.utils.pandas_utils import drop_duplicates
from kg_bacdive.utils.pandas_utils import drop_duplicates, establish_transitive_relationship


class MediaDiveDiveTransform(Transform):
class MediaDiveTransform(Transform):

"""Template for how the transform class would be designed."""

Expand Down Expand Up @@ -144,7 +144,7 @@ def standardize_compound_id(self, id: str):
elif data[CAS_RN_KEY] is not None:
return CAS_RN_PREFIX + str(data[CAS_RN_KEY])
else:
return MEDIADIVE_COMPOUND_PREFIX + id
return MEDIADIVE_INGREDIENT_PREFIX + id

def download_yaml_and_get_json(
self,
Expand Down Expand Up @@ -296,4 +296,10 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
progress.update()

drop_duplicates(self.output_node_file)
drop_duplicates(self.output_edge_file)
establish_transitive_relationship(
self.output_edge_file,
MEDIADIVE_MEDIUM_PREFIX,
MEDIADIVE_SOLUTION_PREFIX,
MEDIUM_TO_INGREDIENT_EDGE,
MEDIADIVE_INGREDIENT_PREFIX,
)
54 changes: 54 additions & 0 deletions kg_bacdive/utils/pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import pandas as pd

from kg_bacdive.transform_utils.constants import OBJECT_COLUMN, PREDICATE_COLUMN, SUBJECT_COLUMN


def drop_duplicates(file_path: Path):
"""
Expand All @@ -14,3 +16,55 @@ def drop_duplicates(file_path: Path):
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.drop_duplicates()
df.to_csv(file_path, sep="\t", index=False)
return df


def establish_transitive_relationship(
    file_path: Path,
    subject_prefix: str,
    intermediate_prefix: str,
    predicate: str,
    object_prefix: str,
) -> pd.DataFrame:
    """
    Establish transitive relationship given the predicate is the same.

    e.g.: Existent relations:
        1. A => predicate => B
        2. B => predicate => C

    This function adds the relation A => predicate => C

    Only one hop is resolved (A => B => C); longer chains are not followed.
    Each deduced row is a copy of the matching ``B => predicate => C`` row with
    its subject replaced by ``A``, so every other column keeps the value it had
    on the ``B => C`` edge. The edge file is rewritten in place with the
    deduced rows appended, and exact duplicate rows are removed.

    :param file_path: Filepath of the edge file.
    :param subject_prefix: Subject prefix (A in the example)
    :param intermediate_prefix: Intermediate prefix that connects the subject to object (B in the example).
    :param predicate: The common predicate between all relations.
    :param object_prefix: Object prefix (C in the example)
    :return: Core dataframe with additional deduced rows.
    """
    df = drop_duplicates(file_path)
    df_relations = df.loc[df[PREDICATE_COLUMN] == predicate]

    subjects = df_relations[SUBJECT_COLUMN]
    objects = df_relations[OBJECT_COLUMN]

    # na=False: a row with a missing subject/object simply does not match,
    # instead of putting NaN into the boolean masks.
    subject_intermediate_df = df_relations[
        subjects.str.startswith(subject_prefix, na=False)
        & objects.str.startswith(intermediate_prefix, na=False)
    ]
    intermediate_object_df = df_relations[
        subjects.str.startswith(intermediate_prefix, na=False)
        & objects.str.startswith(object_prefix, na=False)
    ]

    # For every A => B edge, take all B => C edges and re-point their subject
    # to A. `assign` returns a new frame, so no slice of `df` is written to
    # (the original in-place `.loc` assignment raised SettingWithCopyWarning).
    deduced_frames = [
        intermediate_object_df.loc[
            intermediate_object_df[SUBJECT_COLUMN] == intermediate
        ].assign(**{SUBJECT_COLUMN: subject})
        for subject, intermediate in zip(
            subject_intermediate_df[SUBJECT_COLUMN],
            subject_intermediate_df[OBJECT_COLUMN],
        )
    ]

    # The same A => C edge may already exist or be reachable through several
    # intermediates; drop exact duplicates so the edge file stays unique and
    # re-running the function does not grow it.
    df = pd.concat([df, *deduced_frames], ignore_index=True).drop_duplicates()
    df.to_csv(file_path, sep="\t", index=False)
    return df
14 changes: 7 additions & 7 deletions merged_graph_stats.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ edge_stats:
unknown:
count: 36244
biolink:has_part:
count: 84816
count: 102140
provided_by:
unknown:
count: 84816
count: 102140
source:
unknown:
count: 84816
count: 102140
biolink:has_phenotype:
count: 19727
provided_by:
Expand Down Expand Up @@ -124,13 +124,13 @@ edge_stats:
unknown:
count: 2027
biolink:ChemicalEntity-biolink:has_part-biolink:ChemicalEntity:
count: 84816
count: 102140
provided_by:
unknown:
count: 84816
count: 102140
source:
unknown:
count: 84816
count: 102140
biolink:ChemicalEntity-biolink:has_part-biolink:ChemicalSubstance:
count: 50254
provided_by:
Expand Down Expand Up @@ -464,7 +464,7 @@ edge_stats:
- unknown
source:
- unknown
total_edges: 1324137
total_edges: 1341461
graph_name: kg-bacdive graph
node_stats:
count_by_category:
Expand Down
Loading