Skip to content
This repository has been archived by the owner on Oct 27, 2023. It is now read-only.

Commit

Permalink
Add transitivity function in panda_utils
Browse files Browse the repository at this point in the history
  • Loading branch information
hrshdhgd committed Oct 27, 2023
1 parent 9a09d3c commit 2662928
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 3 deletions.
7 changes: 4 additions & 3 deletions kg_bacdive/transform_utils/mediadive/mediadive.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@
SOLUTIONS_KEY,
)
from kg_bacdive.transform_utils.transform import Transform
from kg_bacdive.utils.pandas_utils import drop_duplicates
from kg_bacdive.utils.pandas_utils import drop_duplicates, establish_transitive_relationship


class MediaDiveDiveTransform(Transform):
class MediaDiveTransform(Transform):

"""Template for how the transform class would be designed."""

Expand Down Expand Up @@ -296,4 +296,5 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
progress.update()

drop_duplicates(self.output_node_file)
drop_duplicates(self.output_edge_file)
establish_transitive_relationship(self.output_edge_file)
# drop_duplicates(self.output_edge_file)
39 changes: 39 additions & 0 deletions kg_bacdive/utils/pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import pandas as pd

from kg_bacdive.transform_utils.constants import OBJECT_COLUMN, PREDICATE_COLUMN, SUBJECT_COLUMN


def drop_duplicates(file_path: Path):
"""
Expand All @@ -14,3 +16,40 @@ def drop_duplicates(file_path: Path):
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.drop_duplicates()
df.to_csv(file_path, sep="\t", index=False)
return df

def establish_transitive_relationship(
    file_path: Path,
    subject_prefix: str,
    intermediate_prefix: str,
    predicate: str,
    object_prefix: str,
) -> pd.DataFrame:
    """
    Establish transitive relationships between edges that share a predicate.

    e.g.: Existing relations:
        1. A => predicate => B
        2. B => predicate => C
    This function adds the relation A => predicate => C.

    The edge file is first de-duplicated via ``drop_duplicates``. Every
    deduced row is a copy of the matching ``B => predicate => C`` row with its
    subject replaced by ``A``, so all remaining columns keep the values of the
    ``B => C`` edge. The extended table is written back to ``file_path`` as a
    tab-separated file, mirroring what ``drop_duplicates`` does.

    :param file_path: Filepath of the edge file.
    :param subject_prefix: Subject prefix (A in the example).
    :param intermediate_prefix: Intermediate prefix that connects the subject
        to the object (B in the example).
    :param predicate: The common predicate between all relations.
    :param object_prefix: Object prefix (C in the example).
    :return: Core dataframe with additional deduced rows.
    """
    df = drop_duplicates(file_path)
    df_relations = df.loc[df[PREDICATE_COLUMN] == predicate]

    # Build the masks on the filtered frame itself so that they are aligned
    # with the rows being selected; na=False treats missing ids as "no match"
    # instead of producing NaN entries in the boolean mask.
    subjects = df_relations[SUBJECT_COLUMN].str
    objects = df_relations[OBJECT_COLUMN].str
    subject_intermediate_df = df_relations[
        subjects.startswith(subject_prefix, na=False)
        & objects.startswith(intermediate_prefix, na=False)
    ]
    intermediate_object_df = df_relations[
        subjects.startswith(intermediate_prefix, na=False)
        & objects.startswith(object_prefix, na=False)
    ]

    # (A, B) pairs under temporary column names that cannot clash with the
    # edge-file columns during the join.
    new_subject_key = "__transitive_subject__"
    intermediate_key = "__transitive_intermediate__"
    subject_to_intermediate = (
        subject_intermediate_df[[SUBJECT_COLUMN, OBJECT_COLUMN]]
        .drop_duplicates()
        .rename(columns={SUBJECT_COLUMN: new_subject_key, OBJECT_COLUMN: intermediate_key})
    )

    # Join every B => C edge with every A that points at the same B.
    deduced = intermediate_object_df.merge(
        subject_to_intermediate,
        left_on=SUBJECT_COLUMN,
        right_on=intermediate_key,
        how="inner",
    )
    if deduced.empty:
        # Nothing to add; the file already holds the de-duplicated edges.
        return df

    # Re-point the copied B => C rows at A and drop the helper columns.
    deduced[SUBJECT_COLUMN] = deduced[new_subject_key]
    deduced = deduced[df.columns]

    df = pd.concat([df, deduced], ignore_index=True).drop_duplicates()
    df.to_csv(file_path, sep="\t", index=False)
    return df

0 comments on commit 2662928

Please sign in to comment.