Skip to content
This repository has been archived by the owner on Oct 27, 2023. It is now read-only.

Add transitivity function in pandas_utils #7

Merged
merged 4 commits into from
Oct 27, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add transitivity function in panda_utils
hrshdhgd committed Oct 27, 2023
commit 2662928c8ef61ef2bc8a5f010996eca2d8758daf
7 changes: 4 additions & 3 deletions kg_bacdive/transform_utils/mediadive/mediadive.py
Original file line number Diff line number Diff line change
@@ -72,10 +72,10 @@
SOLUTIONS_KEY,
)
from kg_bacdive.transform_utils.transform import Transform
from kg_bacdive.utils.pandas_utils import drop_duplicates
from kg_bacdive.utils.pandas_utils import drop_duplicates, establish_transitive_relationship


class MediaDiveDiveTransform(Transform):
class MediaDiveTransform(Transform):

"""Template for how the transform class would be designed."""

@@ -296,4 +296,5 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
progress.update()

drop_duplicates(self.output_node_file)
drop_duplicates(self.output_edge_file)
establish_transitive_relationship(self.output_edge_file)
# drop_duplicates(self.output_edge_file)
39 changes: 39 additions & 0 deletions kg_bacdive/utils/pandas_utils.py
Original file line number Diff line number Diff line change
@@ -3,6 +3,8 @@

import pandas as pd

from kg_bacdive.transform_utils.constants import OBJECT_COLUMN, PREDICATE_COLUMN, SUBJECT_COLUMN


def drop_duplicates(file_path: Path):
"""
@@ -14,3 +16,40 @@ def drop_duplicates(file_path: Path):
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.drop_duplicates()
df.to_csv(file_path, sep="\t", index=False)
return df

def establish_transitive_relationship(
    file_path: Path,
    subject_prefix: str,
    intermediate_prefix: str,
    predicate: str,
    object_prefix: str,
) -> pd.DataFrame:
    """
    Establish transitive relationship given the predicate is the same.

    e.g.: Existing relations:
        1. A => predicate => B
        2. B => predicate => C

    This function adds the relation A => predicate => C.

    The edge file is first de-duplicated in place via ``drop_duplicates``.
    When at least one relation can be deduced, the file is rewritten with
    the additional rows. Each deduced row is a copy of the matching
    ``B => C`` row with its subject replaced by ``A``, so every other column
    keeps the value it had on the ``B => C`` row.

    :param file_path: Filepath of the edge file (tab-separated).
    :param subject_prefix: Subject prefix (A in the example).
    :param intermediate_prefix: Intermediate prefix that connects the subject
        to object (B in the example).
    :param predicate: The common predicate between all relations.
    :param object_prefix: Object prefix (C in the example).
    :return: Core dataframe with additional deduced rows.
    """
    df = drop_duplicates(file_path)

    relations = df.loc[df[PREDICATE_COLUMN] == predicate]
    # Build the masks from the filtered frame itself so they share its index
    # (masks built on the full frame force pandas to re-align and warn).
    # Casting to str keeps missing values from leaking NaN into the masks.
    subjects = relations[SUBJECT_COLUMN].astype(str)
    objects = relations[OBJECT_COLUMN].astype(str)
    subject_to_intermediate = relations[
        subjects.str.startswith(subject_prefix) & objects.str.startswith(intermediate_prefix)
    ]
    intermediate_to_object = relations[
        subjects.str.startswith(intermediate_prefix) & objects.str.startswith(object_prefix)
    ]
    if subject_to_intermediate.empty or intermediate_to_object.empty:
        # Nothing to chain together; the file is already de-duplicated.
        return df

    # Temporary column names used only while joining the two hops.
    origin_column = "_transitive_origin"
    via_column = "_transitive_via"
    hops = subject_to_intermediate[[SUBJECT_COLUMN, OBJECT_COLUMN]].rename(
        columns={SUBJECT_COLUMN: origin_column, OBJECT_COLUMN: via_column}
    )
    # Pair every A => B row with every B => C row sharing the same B.
    deduced = intermediate_to_object.merge(
        hops, left_on=SUBJECT_COLUMN, right_on=via_column, how="inner"
    )
    if deduced.empty:
        return df

    # Re-point the copied B => C rows at the original subject A and drop the
    # helper columns by selecting only the edge file's own columns.
    deduced[SUBJECT_COLUMN] = deduced[origin_column]
    deduced = deduced[df.columns]

    df = pd.concat([df, deduced], ignore_index=True).drop_duplicates()
    df.to_csv(file_path, sep="\t", index=False)
    return df