Add transitivity function in pandas_utils

Knowledge-Graph-Hub · hrshdhgd · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023
commit 2662928c8ef61ef2bc8a5f010996eca2d8758daf
diff --git a/kg_bacdive/transform_utils/mediadive/mediadive.py b/kg_bacdive/transform_utils/mediadive/mediadive.py
@@ -72,10 +72,10 @@
     SOLUTIONS_KEY,
 )
 from kg_bacdive.transform_utils.transform import Transform
-from kg_bacdive.utils.pandas_utils import drop_duplicates
+from kg_bacdive.utils.pandas_utils import drop_duplicates, establish_transitive_relationship
 
 
-class MediaDiveDiveTransform(Transform):
+class MediaDiveTransform(Transform):
 
     """Template for how the transform class would be designed."""
 
@@ -296,4 +296,5 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
                     progress.update()
 
         drop_duplicates(self.output_node_file)
-        drop_duplicates(self.output_edge_file)
+        establish_transitive_relationship(self.output_edge_file)
+        # No separate drop_duplicates() here: establish_transitive_relationship() de-duplicates the edge file first.
diff --git a/kg_bacdive/utils/pandas_utils.py b/kg_bacdive/utils/pandas_utils.py
@@ -3,6 +3,8 @@
 
 import pandas as pd
 
+from kg_bacdive.transform_utils.constants import OBJECT_COLUMN, PREDICATE_COLUMN, SUBJECT_COLUMN
+
 
 def drop_duplicates(file_path: Path):
     """
@@ -14,3 +16,40 @@ def drop_duplicates(file_path: Path):
     df = pd.read_csv(file_path, sep="\t", low_memory=False)
     df = df.drop_duplicates()
     df.to_csv(file_path, sep="\t", index=False)
+    return df
+
+def establish_transitive_relationship(
+        file_path: Path,
+        subject_prefix:str,
+        intermediate_prefix: str,
+        predicate: str,
+        object_prefix:str
+    ) -> pd.DataFrame:
+    """Establish transitive relationship given the predicate is the same.
+
+    e.g.: Existing relations:
+        1. A => predicate => B
+        2. B => predicate => C
+
+    This function adds the relation A => predicate => C
+
+
+    :param file_path: Filepath of the edge file.
+    :param subject_prefix: Subject prefix (A in the example)
+    :param intermediate_prefix: Intermediate prefix that connects the subject to object (B in the example).
+    :param predicate: The common predicate between all relations.
+    :param object_prefix: Object prefix (C in the example)
+    :return: Core dataframe with additional deduced rows.
+    """
+    df = drop_duplicates(file_path)
+    df_relations = df.loc[df[PREDICATE_COLUMN] == predicate]
+    subject_condition = df[SUBJECT_COLUMN].str.startswith(subject_prefix)
+    intermediate_subject_condition = df[SUBJECT_COLUMN].str.startswith(intermediate_prefix)
+    object_condition = df[OBJECT_COLUMN].str.startswith(object_prefix)
+    intermediate_object_condition = df[OBJECT_COLUMN].str.startswith(intermediate_prefix)
+    subject_intermediate_df = df_relations[subject_condition & intermediate_object_condition]
+    intermediate_object_df = df_relations[intermediate_subject_condition & object_condition]
+
+    for row in subject_intermediate_df.iterrows():
+        import pdb; pdb.set_trace()
+