Commit
Refactoring DFG discovery for Pandas (support for target activity key different from source activity key)
fit-alessandro-berti committed Jan 13, 2022
1 parent 520bdc8 commit 52ddbf7
Showing 2 changed files with 17 additions and 9 deletions.
19 changes: 12 additions & 7 deletions pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py
@@ -6,7 +6,7 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
- business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR):
+ business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None):
"""
Get DFG graph from Pandas dataframe
@@ -42,6 +42,11 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
"""
import pandas as pd

+ # added support to specify an activity key for the target event which is different
+ # from the activity key of the source event.
+ if target_activity_key is None:
+     target_activity_key = activity_key

# if not differently specified, set the start timestamp key to the timestamp key
# to avoid retro-compatibility problems
if start_timestamp_key is None:
@@ -58,9 +63,9 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
# to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
# columns
if measure == "frequency":
- df_reduced = df[[case_id_glue, activity_key]]
+ df_reduced = df[{case_id_glue, activity_key, target_activity_key}]
else:
- df_reduced = df[[case_id_glue, activity_key, start_timestamp_key, timestamp_key]]
+ df_reduced = df[{case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key}]
# shift the dataframe by 1, in order to couple successive rows
df_reduced_shifted = df_reduced.shift(-window)
# change column names to shifted dataframe
@@ -72,10 +77,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
df_successive_rows = df_successive_rows[df_successive_rows[case_id_glue] == df_successive_rows[case_id_glue + '_2']]
if keep_once_per_case:
df_successive_rows = df_successive_rows.groupby(
- [case_id_glue, activity_key, activity_key + "_2"]).first().reset_index()
+ [case_id_glue, activity_key, target_activity_key + "_2"]).first().reset_index()

all_columns = set(df_successive_rows.columns)
- all_columns = list(all_columns - set([activity_key, activity_key + '_2']))
+ all_columns = list(all_columns - set([activity_key, target_activity_key + '_2']))

if measure == "performance" or measure == "both":
# in the arc performance calculation, make sure to consider positive or null values
@@ -94,10 +99,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).astype(
'timedelta64[s]')
# groups couple of attributes (directly follows relation, we can measure the frequency and the performance)
- directly_follows_grouping = df_successive_rows.groupby([activity_key, activity_key + '_2'])[
+ directly_follows_grouping = df_successive_rows.groupby([activity_key, target_activity_key + '_2'])[
constants.DEFAULT_FLOW_TIME]
else:
- directly_follows_grouping = df_successive_rows.groupby([activity_key, activity_key + '_2'])
+ directly_follows_grouping = df_successive_rows.groupby([activity_key, target_activity_key + '_2'])
if all_columns:
directly_follows_grouping = directly_follows_grouping[all_columns[0]]

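The change above threads the new target_activity_key parameter through the reduced column selection, the groupby keys, and the shifted ("_2") column names, so the source label of each directly-follows pair can be read from one column and the target label from another; when the parameter is left at None it falls back to activity_key, keeping existing callers unchanged. Below is a minimal usage sketch, assuming a pm4py checkout that includes this commit; the extra column "concept:name_alt" and the in-memory example dataframe are hypothetical and not part of the commit.

# Hypothetical example: source labels from "concept:name", target labels from "concept:name_alt"
import pandas as pd

from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "1"],
    "concept:name": ["register", "check", "pay"],
    "concept:name_alt": ["REGISTER", "CHECK", "PAY"],
    "time:timestamp": pd.to_datetime(
        ["2022-01-13 08:00", "2022-01-13 09:00", "2022-01-13 10:00"]),
})

dfg = df_statistics.get_dfg_graph(
    df,
    measure="frequency",
    activity_key="concept:name",             # label taken from the source event
    case_id_glue="case:concept:name",
    timestamp_key="time:timestamp",
    target_activity_key="concept:name_alt",  # label taken from the target event
)
print(dfg)  # expected to look like {("register", "CHECK"): 1, ("check", "PAY"): 1}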
7 changes: 5 additions & 2 deletions pm4py/algo/filtering/pandas/paths/paths_filter.py
@@ -20,6 +20,7 @@ class Parameters(Enum):
CASE_ID_KEY = PARAMETER_CONSTANT_CASEID_KEY
ATTRIBUTE_KEY = PARAMETER_CONSTANT_ATTRIBUTE_KEY
TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
+ TARGET_ATTRIBUTE_KEY = "target_attribute_key"
DECREASING_FACTOR = "decreasingFactor"
POSITIVE = "positive"
MIN_PERFORMANCE = "min_performance"
@@ -52,15 +53,17 @@ def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[D
case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
+ target_attribute_key = exec_utils.get_param_value(Parameters.TARGET_ATTRIBUTE_KEY, parameters, attribute_key)

positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
paths = [path[0] + DEFAULT_VARIANT_SEP + path[1] for path in paths]
df = df.sort_values([case_id_glue, timestamp_key])
- filt_df = df[[case_id_glue, attribute_key]]
+ filt_df = df[{case_id_glue, attribute_key, target_attribute_key}]
filt_dif_shifted = filt_df.shift(-1)
filt_dif_shifted.columns = [str(col) + '_2' for col in filt_dif_shifted.columns]
stacked_df = pd.concat([filt_df, filt_dif_shifted], axis=1)
stacked_df = stacked_df[stacked_df[case_id_glue] == stacked_df[case_id_glue + '_2']]
stacked_df["@@path"] = stacked_df[attribute_key] + DEFAULT_VARIANT_SEP + stacked_df[attribute_key + "_2"]
stacked_df["@@path"] = stacked_df[attribute_key] + DEFAULT_VARIANT_SEP + stacked_df[target_attribute_key + "_2"]
stacked_df = stacked_df[stacked_df["@@path"].isin(paths)]
i1 = df.set_index(case_id_glue).index
i2 = stacked_df.set_index(case_id_glue).index
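The paths filter exposes the same option through the new TARGET_ATTRIBUTE_KEY parameter, which defaults to ATTRIBUTE_KEY when not supplied. A minimal sketch of how it might be called, reusing the hypothetical dataframe and "concept:name_alt" column from the previous example:

from pm4py.algo.filtering.pandas.paths import paths_filter

# keep the cases that contain at least one ("register", "CHECK") step, where the
# target side of each path is matched against the hypothetical "concept:name_alt" column
filtered_df = paths_filter.apply(
    df,
    [("register", "CHECK")],
    parameters={
        paths_filter.Parameters.CASE_ID_KEY: "case:concept:name",
        paths_filter.Parameters.ATTRIBUTE_KEY: "concept:name",
        paths_filter.Parameters.TARGET_ATTRIBUTE_KEY: "concept:name_alt",
        paths_filter.Parameters.TIMESTAMP_KEY: "time:timestamp",
    },
)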
