Commit
Refactoring DFG discovery for Pandas (support for target activity key different from source activity key)
fit-alessandro-berti committed Jan 13, 2022
1 parent 520bdc8 commit 52ddbf7
Showing 2 changed files with 17 additions and 9 deletions.
19 changes: 12 additions & 7 deletions pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py
@@ -6,7 +6,7 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
- business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR):
+ business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None):
"""
Get DFG graph from Pandas dataframe
@@ -42,6 +42,11 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
"""
import pandas as pd

+ # added support to specify an activity key for the target event which is different
+ # from the activity key of the source event.
+ if target_activity_key is None:
+     target_activity_key = activity_key

# if not differently specified, set the start timestamp key to the timestamp key
# to avoid retro-compatibility problems
if start_timestamp_key is None:
@@ -58,9 +63,9 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
# to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
# columns
if measure == "frequency":
- df_reduced = df[[case_id_glue, activity_key]]
+ df_reduced = df[{case_id_glue, activity_key, target_activity_key}]
else:
- df_reduced = df[[case_id_glue, activity_key, start_timestamp_key, timestamp_key]]
+ df_reduced = df[{case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key}]
# shift the dataframe by 1, in order to couple successive rows
df_reduced_shifted = df_reduced.shift(-window)
# change column names to shifted dataframe
@@ -72,10 +77,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
df_successive_rows = df_successive_rows[df_successive_rows[case_id_glue] == df_successive_rows[case_id_glue + '_2']]
if keep_once_per_case:
df_successive_rows = df_successive_rows.groupby(
- [case_id_glue, activity_key, activity_key + "_2"]).first().reset_index()
+ [case_id_glue, activity_key, target_activity_key + "_2"]).first().reset_index()

all_columns = set(df_successive_rows.columns)
- all_columns = list(all_columns - set([activity_key, activity_key + '_2']))
+ all_columns = list(all_columns - set([activity_key, target_activity_key + '_2']))

if measure == "performance" or measure == "both":
# in the arc performance calculation, make sure to consider positive or null values
@@ -94,10 +99,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).astype(
'timedelta64[s]')
# groups couple of attributes (directly follows relation, we can measure the frequency and the performance)
- directly_follows_grouping = df_successive_rows.groupby([activity_key, activity_key + '_2'])[
+ directly_follows_grouping = df_successive_rows.groupby([activity_key, target_activity_key + '_2'])[
constants.DEFAULT_FLOW_TIME]
else:
- directly_follows_grouping = df_successive_rows.groupby([activity_key, activity_key + '_2'])
+ directly_follows_grouping = df_successive_rows.groupby([activity_key, target_activity_key + '_2'])
if all_columns:
directly_follows_grouping = directly_follows_grouping[all_columns[0]]

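The change above threads the new target_activity_key parameter through the reduced column selection, the groupby keys, and the shifted ("_2") column names, so the source label of each directly-follows pair can be read from one column and the target label from another; when the parameter is left at None it falls back to activity_key, keeping existing callers unchanged. Below is a minimal usage sketch, assuming a pm4py checkout that includes this commit; the extra column "concept:name_alt" and the in-memory example dataframe are hypothetical and not part of the commit.

# Hypothetical example: source labels from "concept:name", target labels from "concept:name_alt"
import pandas as pd

from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "1"],
    "concept:name": ["register", "check", "pay"],
    "concept:name_alt": ["REGISTER", "CHECK", "PAY"],
    "time:timestamp": pd.to_datetime(
        ["2022-01-13 08:00", "2022-01-13 09:00", "2022-01-13 10:00"]),
})

dfg = df_statistics.get_dfg_graph(
    df,
    measure="frequency",
    activity_key="concept:name",             # label taken from the source event
    case_id_glue="case:concept:name",
    timestamp_key="time:timestamp",
    target_activity_key="concept:name_alt",  # label taken from the target event
)
print(dfg)  # expected to look like {("register", "CHECK"): 1, ("check", "PAY"): 1}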
7 changes: 5 additions & 2 deletions pm4py/algo/filtering/pandas/paths/paths_filter.py
@@ -20,6 +20,7 @@ class Parameters(Enum):
CASE_ID_KEY = PARAMETER_CONSTANT_CASEID_KEY
ATTRIBUTE_KEY = PARAMETER_CONSTANT_ATTRIBUTE_KEY
TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
+ TARGET_ATTRIBUTE_KEY = "target_attribute_key"
DECREASING_FACTOR = "decreasingFactor"
POSITIVE = "positive"
MIN_PERFORMANCE = "min_performance"
@@ -52,15 +53,17 @@ def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[D
case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
+ target_attribute_key = exec_utils.get_param_value(Parameters.TARGET_ATTRIBUTE_KEY, parameters, attribute_key)

positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
paths = [path[0] + DEFAULT_VARIANT_SEP + path[1] for path in paths]
df = df.sort_values([case_id_glue, timestamp_key])
- filt_df = df[[case_id_glue, attribute_key]]
+ filt_df = df[{case_id_glue, attribute_key, target_attribute_key}]
filt_dif_shifted = filt_df.shift(-1)
filt_dif_shifted.columns = [str(col) + '_2' for col in filt_dif_shifted.columns]
stacked_df = pd.concat([filt_df, filt_dif_shifted], axis=1)
stacked_df = stacked_df[stacked_df[case_id_glue] == stacked_df[case_id_glue + '_2']]
stacked_df["@@path"] = stacked_df[attribute_key] + DEFAULT_VARIANT_SEP + stacked_df[attribute_key + "_2"]
stacked_df["@@path"] = stacked_df[attribute_key] + DEFAULT_VARIANT_SEP + stacked_df[target_attribute_key + "_2"]
stacked_df = stacked_df[stacked_df["@@path"].isin(paths)]
i1 = df.set_index(case_id_glue).index
i2 = stacked_df.set_index(case_id_glue).index
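The paths filter exposes the same option through the new TARGET_ATTRIBUTE_KEY parameter, which defaults to ATTRIBUTE_KEY when not supplied. A minimal sketch of how it might be called, reusing the hypothetical dataframe and "concept:name_alt" column from the previous example:

from pm4py.algo.filtering.pandas.paths import paths_filter

# keep the cases that contain at least one ("register", "CHECK") step, where the
# target side of each path is matched against the hypothetical "concept:name_alt" column
filtered_df = paths_filter.apply(
    df,
    [("register", "CHECK")],
    parameters={
        paths_filter.Parameters.CASE_ID_KEY: "case:concept:name",
        paths_filter.Parameters.ATTRIBUTE_KEY: "concept:name",
        paths_filter.Parameters.TARGET_ATTRIBUTE_KEY: "concept:name_alt",
        paths_filter.Parameters.TIMESTAMP_KEY: "time:timestamp",
    },
)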
