Merge branch 'ft-1261-updates-related-to-business-hours' into 'integration'

FT 1261 Various updates related to business hours

See merge request pm4py/pm4py-core!499
fit-sebastiaan-van-zelst committed Oct 15, 2021
2 parents d22dd49 + 0c789ec commit 829f091
Showing 7 changed files with 151 additions and 34 deletions.
10 changes: 9 additions & 1 deletion pm4py/algo/conformance/temporal_profile/variants/dataframe.py
@@ -15,6 +15,9 @@ class Parameters(Enum):
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
ZETA = "zeta"
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"


def apply(df: pd.DataFrame, temporal_profile: typing.TemporalProfile,
@@ -61,6 +64,10 @@ def apply(df: pd.DataFrame, temporal_profile: typing.TemporalProfile,
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
zeta = exec_utils.get_param_value(Parameters.ZETA, parameters, 6.0)

business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

temporal_profile = pd.DataFrame([{activity_key: x[0], activity_key + "_2": x[1], "@@min": y[0] - zeta * y[1],
"@@max": y[0] + zeta * y[1], "@@mean": y[0], "@@std": y[1]} for x, y in
temporal_profile.items()])
@@ -69,7 +76,8 @@ def apply(df: pd.DataFrame, temporal_profile: typing.TemporalProfile,
ret = [[] for c in cases]
efg = get_partial_order_dataframe(df, activity_key=activity_key, timestamp_key=timestamp_key,
start_timestamp_key=start_timestamp_key, case_id_glue=case_id_key,
keep_first_following=False)
keep_first_following=False, business_hours=business_hours, worktiming=worktiming,
weekends=weekends)
efg = efg[[case_id_key, activity_key, activity_key + "_2", "@@flow_time"]]
efg = efg.merge(temporal_profile, on=[activity_key, activity_key + "_2"])
efg = efg[(efg["@@flow_time"] < efg["@@min"]) | (efg["@@flow_time"] > efg["@@max"])][
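
For context, a minimal usage sketch of the new conformance parameters (not part of the commit; it assumes a pm4py-formatted dataframe df and a previously discovered temporal_profile):

    from pm4py.algo.conformance.temporal_profile.variants import dataframe as tp_conformance

    parameters = {
        tp_conformance.Parameters.ZETA: 6.0,             # tolerated deviation, in standard deviations
        tp_conformance.Parameters.BUSINESS_HOURS: True,  # measure flow time on working hours only
        tp_conformance.Parameters.WORKTIMING: [7, 17],   # work shift from 07:00 to 17:00
        tp_conformance.Parameters.WEEKENDS: [6, 7],      # Saturday and Sunday are non-working days
    }
    # returns one list of deviations per case (cf. `ret = [[] for c in cases]` above)
    deviations = tp_conformance.apply(df, temporal_profile, parameters=parameters)
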
41 changes: 31 additions & 10 deletions pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py
@@ -1,10 +1,12 @@
from pm4py.util import xes_constants, pandas_utils, constants
from pm4py.util.business_hours import soj_time_business_hours_diff


def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1):
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
business_hours=False, worktiming=None, weekends=None):
"""
Get DFG graph from Pandas dataframe
@@ -76,13 +78,21 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
all_columns = list(all_columns - set([activity_key, activity_key + '_2']))

if measure == "performance" or measure == "both":
# calculate the difference between the timestamps of two successive events
df_successive_rows[constants.DEFAULT_FLOW_TIME] = (
df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).astype(
'timedelta64[s]')
# in the arc performance calculation, make sure to consider positive or null values
df_successive_rows[constants.DEFAULT_FLOW_TIME] = df_successive_rows[constants.DEFAULT_FLOW_TIME].apply(
lambda x: max(x, 0))
df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
# calculate the difference between the timestamps of two successive events
if business_hours:
if worktiming is None:
worktiming = [7, 17]
if weekends is None:
weekends = [6, 7]
df_successive_rows[constants.DEFAULT_FLOW_TIME] = df_successive_rows.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[start_timestamp_key + '_2'], worktiming,
weekends), axis=1)
else:
df_successive_rows[constants.DEFAULT_FLOW_TIME] = (
df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).astype(
'timedelta64[s]')
# groups couple of attributes (directly follows relation, we can measure the frequency and the performance)
directly_follows_grouping = df_successive_rows.groupby([activity_key, activity_key + '_2'])[
constants.DEFAULT_FLOW_TIME]
@@ -126,7 +136,8 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="time:timestamp",
case_id_glue="case:concept:name", activity_key="concept:name",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True):
sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True,
business_hours=False, worktiming=None, weekends=None):
"""
Gets the partial order between events (of the same case) in a Pandas dataframe
@@ -179,10 +190,20 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim

df = df.join(df_copy, rsuffix="_2").dropna()
df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]]
df = df[df[timestamp_key] <= df[start_timestamp_key + "_2"]]
df[start_timestamp_key + '_2'] = df[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)

df = df.reset_index()

df[constants.DEFAULT_FLOW_TIME] = (df[start_timestamp_key + "_2"] - df[timestamp_key]).astype('timedelta64[s]')
if business_hours:
if worktiming is None:
worktiming = [7, 17]
if weekends is None:
weekends = [6, 7]
df[constants.DEFAULT_FLOW_TIME] = df.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[start_timestamp_key + '_2'], worktiming,
weekends), axis=1)
else:
df[constants.DEFAULT_FLOW_TIME] = (df[start_timestamp_key + "_2"] - df[timestamp_key]).astype('timedelta64[s]')

if keep_first_following:
df = df.groupby(constants.DEFAULT_INDEX_KEY).first().reset_index()
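
Both get_dfg_graph and get_partial_order_dataframe now accept the same three keyword arguments. A sketch of computing a performance DFG over working time only (df is an assumed pm4py-formatted dataframe using the default column names from the signature above):

    from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph

    performance_dfg = get_dfg_graph(
        df,
        measure="performance",  # (activity_a, activity_b) -> aggregated flow time in seconds
        business_hours=True,    # route duration computation through soj_time_business_hours_diff
        worktiming=[7, 17],     # working day from 07:00 to 17:00
        weekends=[6, 7],        # weekday indexes of Saturday and Sunday
    )

Note the None defaults for worktiming and weekends, resolved to [7, 17] and [6, 7] inside the function body: this sidesteps Python's mutable-default-argument pitfall.
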
10 changes: 9 additions & 1 deletion pm4py/algo/discovery/temporal_profile/variants/dataframe.py
@@ -13,6 +13,9 @@ class Parameters(Enum):
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"


def apply(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> typing.TemporalProfile:
@@ -48,9 +51,14 @@ def apply(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> typi
start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)

business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

efg = get_partial_order_dataframe(df, activity_key=activity_key, timestamp_key=timestamp_key,
start_timestamp_key=start_timestamp_key, case_id_glue=case_id_key,
keep_first_following=False)
keep_first_following=False, business_hours=business_hours, worktiming=worktiming,
weekends=weekends)
efg = efg[[activity_key, activity_key + "_2", "@@flow_time"]]
temporal_profile = efg.groupby([activity_key, activity_key + "_2"]).agg(["mean", "std"]).reset_index().fillna(
0).to_dict("records")
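
Discovery mirrors conformance: the same flags are forwarded to get_partial_order_dataframe. A hedged sketch (df assumed as above):

    from pm4py.algo.discovery.temporal_profile.variants import dataframe as tp_discovery

    temporal_profile = tp_discovery.apply(df, parameters={
        tp_discovery.Parameters.BUSINESS_HOURS: True,
        tp_discovery.Parameters.WORKTIMING: [7, 17],
        tp_discovery.Parameters.WEEKENDS: [6, 7],
    })
    # temporal_profile maps (activity_a, activity_b) -> (mean, std) of the observed flow times
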
24 changes: 20 additions & 4 deletions pm4py/algo/filtering/pandas/cases/case_filter.py
@@ -6,12 +6,17 @@
import deprecation
from typing import Optional, Dict, Any, Union, Tuple, List
import pandas as pd
from pm4py.util.business_hours import soj_time_business_hours_diff


class Parameters(Enum):
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY

BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"


def filter_on_ncases(df: pd.DataFrame, case_id_glue: str = constants.CASE_CONCEPT_NAME, max_no_cases: int = 1000):
"""
@@ -72,7 +77,8 @@ def filter_on_case_size(df0: pd.DataFrame, case_id_glue: str = "case:concept:nam

def filter_on_case_performance(df: pd.DataFrame, case_id_glue: str = constants.CASE_CONCEPT_NAME,
timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
min_case_performance: float = 0, max_case_performance: float = 10000000000) -> pd.DataFrame:
min_case_performance: float = 0, max_case_performance: float = 10000000000,
business_hours=False, worktiming=[7, 17], weekends=[6, 7]) -> pd.DataFrame:
"""
Filter a dataframe on case performance
@@ -99,8 +105,13 @@ def filter_on_case_performance(df: pd.DataFrame, case_id_glue: str = constants.C
end_events = grouped_df.last()
end_events.columns = [str(col) + '_2' for col in end_events.columns]
stacked_df = pd.concat([start_events, end_events], axis=1)
stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
stacked_df['caseDuration'] = stacked_df['caseDuration'].astype('timedelta64[s]')
if business_hours:
stacked_df['caseDuration'] = stacked_df.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[timestamp_key + "_2"], worktiming,
weekends), axis=1)
else:
stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
stacked_df['caseDuration'] = stacked_df['caseDuration'].astype('timedelta64[s]')
stacked_df = stacked_df[stacked_df['caseDuration'] <= max_case_performance]
stacked_df = stacked_df[stacked_df['caseDuration'] >= min_case_performance]
i1 = df.set_index(case_id_glue).index
@@ -116,9 +127,14 @@ def filter_case_performance(df: pd.DataFrame, min_case_performance: float = 0, m
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
case_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

return filter_on_case_performance(df, min_case_performance=min_case_performance,
max_case_performance=max_case_performance, timestamp_key=timestamp_key,
case_id_glue=case_glue)
case_id_glue=case_glue, business_hours=business_hours, worktiming=worktiming,
weekends=weekends)


def apply(df, parameters=None):
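
A sketch of the filter with the new parameters (df assumed as above; the thresholds are illustrative and expressed in seconds of working time):

    from pm4py.algo.filtering.pandas.cases import case_filter

    filtered_df = case_filter.filter_case_performance(
        df,
        min_case_performance=3600,       # keep cases that take at least one working hour...
        max_case_performance=5 * 86400,  # ...and at most five days' worth of seconds
        parameters={
            case_filter.Parameters.BUSINESS_HOURS: True,
            case_filter.Parameters.WORKTIMING: [7, 17],
            case_filter.Parameters.WEEKENDS: [6, 7],
        },
    )

Unlike df_statistics.py, filter_on_case_performance takes [7, 17] and [6, 7] directly as keyword defaults; since the lists are only read, the shared mutable default is harmless here, though the None-sentinel pattern is the safer idiom.
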
19 changes: 16 additions & 3 deletions pm4py/statistics/sojourn_time/log/get.py
@@ -1,17 +1,18 @@
from enum import Enum
from statistics import mean
from statistics import mean, median

from pm4py.util import exec_utils, constants, xes_constants
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.util.business_hours import BusinessHours
from typing import Optional, Dict, Any, Union, Tuple, List, Set
from typing import Optional, Dict, Any, Union
from pm4py.objects.log.obj import EventLog


class Parameters(Enum):
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
AGGREGATION_MEASURE = "aggregationMeasure"
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"
@@ -40,6 +41,7 @@ def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]]
Default: [7, 17] (work shift from 07:00 to 17:00)
- Parameters.WEEKENDS => indexes of the days of the week that are weekend
Default: [6, 7] (weekends are Saturday and Sunday)
- Parameters.AGGREGATION_MEASURE => performance aggregation measure (sum, min, max, mean, median)
Returns
--------------
@@ -60,6 +62,8 @@ def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]]
xes_constants.DEFAULT_TIMESTAMP_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
aggregation_measure = exec_utils.get_param_value(Parameters.AGGREGATION_MEASURE,
parameters, "mean")

durations_dict = {}
activities = [ev[activity_key] for trace in log for ev in trace]
@@ -80,6 +84,15 @@ def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]]
durations_dict[activity].append(complete_time - start_time)

for act in durations_dict:
durations_dict[act] = mean(durations_dict[act])
if aggregation_measure == "median":
durations_dict[act] = median(durations_dict[act])
elif aggregation_measure == "min":
durations_dict[act] = min(durations_dict[act])
elif aggregation_measure == "max":
durations_dict[act] = max(durations_dict[act])
elif aggregation_measure == "sum":
durations_dict[act] = sum(durations_dict[act])
else:
durations_dict[act] = mean(durations_dict[act])

return durations_dict
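
A sketch of the new aggregation switch (not part of the commit; log is an assumed pm4py EventLog, and "start_timestamp" is a placeholder for whatever start-timestamp attribute the log actually uses):

    from pm4py.statistics.sojourn_time.log import get as soj_time_get

    median_sojourn = soj_time_get.apply(log, parameters={
        soj_time_get.Parameters.START_TIMESTAMP_KEY: "start_timestamp",
        soj_time_get.Parameters.AGGREGATION_MEASURE: "median",  # unknown values fall back to mean
        soj_time_get.Parameters.BUSINESS_HOURS: True,
    })
    # median_sojourn: {activity: median sojourn time in seconds}
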
24 changes: 20 additions & 4 deletions pm4py/statistics/sojourn_time/pandas/get.py
@@ -1,15 +1,16 @@
import pandas as pd
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants
from pm4py.util.business_hours import soj_time_business_hours_diff
import pandas as pd
from typing import Optional, Dict, Any, Union, Tuple, List, Set
from typing import Optional, Dict, Any, Union


class Parameters(Enum):
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
AGGREGATION_MEASURE = "aggregationMeasure"
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"
@@ -38,6 +39,7 @@ def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Union[str, Paramete
Default: [7, 17] (work shift from 07:00 to 17:00)
- Parameters.WEEKENDS => indexes of the days of the week that are weekend
Default: [6, 7] (weekends are Saturday and Sunday)
- Parameters.AGGREGATION_MEASURE => performance aggregation measure (sum, min, max, mean, median)
Returns
--------------
@@ -56,18 +58,32 @@ def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Union[str, Paramete
xes_constants.DEFAULT_TIMESTAMP_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
aggregation_measure = exec_utils.get_param_value(Parameters.AGGREGATION_MEASURE,
parameters, "mean")

if business_hours:
dataframe[DIFF_KEY] = dataframe.apply(
lambda x: soj_time_business_hours_diff(x[start_timestamp_key], x[timestamp_key], worktiming,
weekends), axis=1)
else:
dataframe[DIFF_KEY] = (
dataframe[timestamp_key] - dataframe[start_timestamp_key]).astype('timedelta64[s]')
dataframe[timestamp_key] - dataframe[start_timestamp_key]
).astype('timedelta64[s]')

dataframe = dataframe.reset_index()

ret_dict = dataframe.groupby(activity_key)[DIFF_KEY].mean().to_dict()
column = dataframe.groupby(activity_key)[DIFF_KEY]
if aggregation_measure == "median":
ret_dict = column.median().to_dict()
elif aggregation_measure == "min":
ret_dict = column.min().to_dict()
elif aggregation_measure == "max":
ret_dict = column.max().to_dict()
elif aggregation_measure == "sum":
ret_dict = column.sum().to_dict()
else:
ret_dict = column.mean().to_dict()

# assure to avoid problems with np.float64, by using the Python float type
for el in ret_dict:
ret_dict[el] = float(ret_dict[el])
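
The pandas variant gains the same switch; a sketch (df assumed as above, with "start_timestamp" again a placeholder column name):

    from pm4py.statistics.sojourn_time.pandas import get as soj_time_get_pd

    total_sojourn = soj_time_get_pd.apply(df, parameters={
        soj_time_get_pd.Parameters.START_TIMESTAMP_KEY: "start_timestamp",
        soj_time_get_pd.Parameters.AGGREGATION_MEASURE: "sum",
        soj_time_get_pd.Parameters.BUSINESS_HOURS: True,
    })
    # values come back as plain Python floats, thanks to the float() cast in the loop above
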