Merge branch 'ft-1261-updates-related-to-business-hours' into 'integration'

FT 1261 Various updates related to business hours

See merge request pm4py/pm4py-core!499
fit-sebastiaan-van-zelst committed Oct 15, 2021
2 parents d22dd49 + 0c789ec commit 829f091
Showing 7 changed files with 151 additions and 34 deletions.
10 changes: 9 additions & 1 deletion pm4py/algo/conformance/temporal_profile/variants/dataframe.py
@@ -15,6 +15,9 @@ class Parameters(Enum):
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
ZETA = "zeta"
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"


def apply(df: pd.DataFrame, temporal_profile: typing.TemporalProfile,
@@ -61,6 +64,10 @@ def apply(df: pd.DataFrame, temporal_profile: typing.TemporalProfile,
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
zeta = exec_utils.get_param_value(Parameters.ZETA, parameters, 6.0)

business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

temporal_profile = pd.DataFrame([{activity_key: x[0], activity_key + "_2": x[1], "@@min": y[0] - zeta * y[1],
"@@max": y[0] + zeta * y[1], "@@mean": y[0], "@@std": y[1]} for x, y in
temporal_profile.items()])
@@ -69,7 +76,8 @@ def apply(df: pd.DataFrame, temporal_profile: typing.TemporalProfile,
ret = [[] for c in cases]
efg = get_partial_order_dataframe(df, activity_key=activity_key, timestamp_key=timestamp_key,
start_timestamp_key=start_timestamp_key, case_id_glue=case_id_key,
keep_first_following=False)
keep_first_following=False, business_hours=business_hours, worktiming=worktiming,
weekends=weekends)
efg = efg[[case_id_key, activity_key, activity_key + "_2", "@@flow_time"]]
efg = efg.merge(temporal_profile, on=[activity_key, activity_key + "_2"])
efg = efg[(efg["@@flow_time"] < efg["@@min"]) | (efg["@@flow_time"] > efg["@@max"])][
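
For context, a minimal usage sketch of the new conformance parameters (not part of the commit; it assumes a pm4py-formatted dataframe df and a previously discovered temporal_profile):

    from pm4py.algo.conformance.temporal_profile.variants import dataframe as tp_conformance

    parameters = {
        tp_conformance.Parameters.ZETA: 6.0,             # tolerated deviation, in standard deviations
        tp_conformance.Parameters.BUSINESS_HOURS: True,  # measure flow time on working hours only
        tp_conformance.Parameters.WORKTIMING: [7, 17],   # work shift from 07:00 to 17:00
        tp_conformance.Parameters.WEEKENDS: [6, 7],      # Saturday and Sunday are non-working days
    }
    # returns one list of deviations per case (cf. `ret = [[] for c in cases]` above)
    deviations = tp_conformance.apply(df, temporal_profile, parameters=parameters)
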
41 changes: 31 additions & 10 deletions pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py
@@ -1,10 +1,12 @@
from pm4py.util import xes_constants, pandas_utils, constants
from pm4py.util.business_hours import soj_time_business_hours_diff


def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1):
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
business_hours=False, worktiming=None, weekends=None):
"""
Get DFG graph from Pandas dataframe
@@ -76,13 +78,21 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
all_columns = list(all_columns - set([activity_key, activity_key + '_2']))

if measure == "performance" or measure == "both":
# calculate the difference between the timestamps of two successive events
df_successive_rows[constants.DEFAULT_FLOW_TIME] = (
df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).astype(
'timedelta64[s]')
# in the arc performance calculation, make sure to consider positive or null values
df_successive_rows[constants.DEFAULT_FLOW_TIME] = df_successive_rows[constants.DEFAULT_FLOW_TIME].apply(
lambda x: max(x, 0))
df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
# calculate the difference between the timestamps of two successive events
if business_hours:
if worktiming is None:
worktiming = [7, 17]
if weekends is None:
weekends = [6, 7]
df_successive_rows[constants.DEFAULT_FLOW_TIME] = df_successive_rows.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[start_timestamp_key + '_2'], worktiming,
weekends), axis=1)
else:
df_successive_rows[constants.DEFAULT_FLOW_TIME] = (
df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).astype(
'timedelta64[s]')
# groups couple of attributes (directly follows relation, we can measure the frequency and the performance)
directly_follows_grouping = df_successive_rows.groupby([activity_key, activity_key + '_2'])[
constants.DEFAULT_FLOW_TIME]
@@ -126,7 +136,8 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="time:timestamp",
case_id_glue="case:concept:name", activity_key="concept:name",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True):
sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True,
business_hours=False, worktiming=None, weekends=None):
"""
Gets the partial order between events (of the same case) in a Pandas dataframe
@@ -179,10 +190,20 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim

df = df.join(df_copy, rsuffix="_2").dropna()
df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]]
df = df[df[timestamp_key] <= df[start_timestamp_key + "_2"]]
df[start_timestamp_key + '_2'] = df[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)

df = df.reset_index()

df[constants.DEFAULT_FLOW_TIME] = (df[start_timestamp_key + "_2"] - df[timestamp_key]).astype('timedelta64[s]')
if business_hours:
if worktiming is None:
worktiming = [7, 17]
if weekends is None:
weekends = [6, 7]
df[constants.DEFAULT_FLOW_TIME] = df.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[start_timestamp_key + '_2'], worktiming,
weekends), axis=1)
else:
df[constants.DEFAULT_FLOW_TIME] = (df[start_timestamp_key + "_2"] - df[timestamp_key]).astype('timedelta64[s]')

if keep_first_following:
df = df.groupby(constants.DEFAULT_INDEX_KEY).first().reset_index()
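
Both get_dfg_graph and get_partial_order_dataframe now accept the same three keyword arguments. A sketch of computing a performance DFG over working time only (df is an assumed pm4py-formatted dataframe using the default column names from the signature above):

    from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph

    performance_dfg = get_dfg_graph(
        df,
        measure="performance",  # (activity_a, activity_b) -> aggregated flow time in seconds
        business_hours=True,    # route duration computation through soj_time_business_hours_diff
        worktiming=[7, 17],     # working day from 07:00 to 17:00
        weekends=[6, 7],        # weekday indexes of Saturday and Sunday
    )

Note the None defaults for worktiming and weekends, resolved to [7, 17] and [6, 7] inside the function body: this sidesteps Python's mutable-default-argument pitfall.
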
10 changes: 9 additions & 1 deletion pm4py/algo/discovery/temporal_profile/variants/dataframe.py
@@ -13,6 +13,9 @@ class Parameters(Enum):
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"


def apply(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> typing.TemporalProfile:
@@ -48,9 +51,14 @@ def apply(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> typi
start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)

business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

efg = get_partial_order_dataframe(df, activity_key=activity_key, timestamp_key=timestamp_key,
start_timestamp_key=start_timestamp_key, case_id_glue=case_id_key,
keep_first_following=False)
keep_first_following=False, business_hours=business_hours, worktiming=worktiming,
weekends=weekends)
efg = efg[[activity_key, activity_key + "_2", "@@flow_time"]]
temporal_profile = efg.groupby([activity_key, activity_key + "_2"]).agg(["mean", "std"]).reset_index().fillna(
0).to_dict("records")
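
Discovery mirrors conformance: the same flags are forwarded to get_partial_order_dataframe. A hedged sketch (df assumed as above):

    from pm4py.algo.discovery.temporal_profile.variants import dataframe as tp_discovery

    temporal_profile = tp_discovery.apply(df, parameters={
        tp_discovery.Parameters.BUSINESS_HOURS: True,
        tp_discovery.Parameters.WORKTIMING: [7, 17],
        tp_discovery.Parameters.WEEKENDS: [6, 7],
    })
    # temporal_profile maps (activity_a, activity_b) -> (mean, std) of the observed flow times
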
24 changes: 20 additions & 4 deletions pm4py/algo/filtering/pandas/cases/case_filter.py
@@ -6,12 +6,17 @@
import deprecation
from typing import Optional, Dict, Any, Union, Tuple, List
import pandas as pd
from pm4py.util.business_hours import soj_time_business_hours_diff


class Parameters(Enum):
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY

BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"


def filter_on_ncases(df: pd.DataFrame, case_id_glue: str = constants.CASE_CONCEPT_NAME, max_no_cases: int = 1000):
"""
@@ -72,7 +77,8 @@ def filter_on_case_size(df0: pd.DataFrame, case_id_glue: str = "case:concept:nam

def filter_on_case_performance(df: pd.DataFrame, case_id_glue: str = constants.CASE_CONCEPT_NAME,
timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
min_case_performance: float = 0, max_case_performance: float = 10000000000) -> pd.DataFrame:
min_case_performance: float = 0, max_case_performance: float = 10000000000,
business_hours=False, worktiming=[7, 17], weekends=[6, 7]) -> pd.DataFrame:
"""
Filter a dataframe on case performance
@@ -99,8 +105,13 @@ def filter_on_case_performance(df: pd.DataFrame, case_id_glue: str = constants.C
end_events = grouped_df.last()
end_events.columns = [str(col) + '_2' for col in end_events.columns]
stacked_df = pd.concat([start_events, end_events], axis=1)
stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
stacked_df['caseDuration'] = stacked_df['caseDuration'].astype('timedelta64[s]')
if business_hours:
stacked_df['caseDuration'] = stacked_df.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[timestamp_key + "_2"], worktiming,
weekends), axis=1)
else:
stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
stacked_df['caseDuration'] = stacked_df['caseDuration'].astype('timedelta64[s]')
stacked_df = stacked_df[stacked_df['caseDuration'] <= max_case_performance]
stacked_df = stacked_df[stacked_df['caseDuration'] >= min_case_performance]
i1 = df.set_index(case_id_glue).index
@@ -116,9 +127,14 @@ def filter_case_performance(df: pd.DataFrame, min_case_performance: float = 0, m
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
case_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])

return filter_on_case_performance(df, min_case_performance=min_case_performance,
max_case_performance=max_case_performance, timestamp_key=timestamp_key,
case_id_glue=case_glue)
case_id_glue=case_glue, business_hours=business_hours, worktiming=worktiming,
weekends=weekends)


def apply(df, parameters=None):
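
A sketch of the filter with the new parameters (df assumed as above; the thresholds are illustrative and expressed in seconds of working time):

    from pm4py.algo.filtering.pandas.cases import case_filter

    filtered_df = case_filter.filter_case_performance(
        df,
        min_case_performance=3600,       # keep cases that take at least one working hour...
        max_case_performance=5 * 86400,  # ...and at most five days' worth of seconds
        parameters={
            case_filter.Parameters.BUSINESS_HOURS: True,
            case_filter.Parameters.WORKTIMING: [7, 17],
            case_filter.Parameters.WEEKENDS: [6, 7],
        },
    )

Unlike df_statistics.py, filter_on_case_performance takes [7, 17] and [6, 7] directly as keyword defaults; since the lists are only read, the shared mutable default is harmless here, though the None-sentinel pattern is the safer idiom.
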
19 changes: 16 additions & 3 deletions pm4py/statistics/sojourn_time/log/get.py
@@ -1,17 +1,18 @@
from enum import Enum
from statistics import mean
from statistics import mean, median

from pm4py.util import exec_utils, constants, xes_constants
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.util.business_hours import BusinessHours
from typing import Optional, Dict, Any, Union, Tuple, List, Set
from typing import Optional, Dict, Any, Union
from pm4py.objects.log.obj import EventLog


class Parameters(Enum):
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
AGGREGATION_MEASURE = "aggregationMeasure"
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"
@@ -40,6 +41,7 @@ def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]]
Default: [7, 17] (work shift from 07:00 to 17:00)
- Parameters.WEEKENDS => indexes of the days of the week that are weekend
Default: [6, 7] (weekends are Saturday and Sunday)
- Parameters.AGGREGATION_MEASURE => performance aggregation measure (sum, min, max, mean, median)
Returns
--------------
@@ -60,6 +62,8 @@ def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]]
xes_constants.DEFAULT_TIMESTAMP_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
aggregation_measure = exec_utils.get_param_value(Parameters.AGGREGATION_MEASURE,
parameters, "mean")

durations_dict = {}
activities = [ev[activity_key] for trace in log for ev in trace]
@@ -80,6 +84,15 @@ def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]]
durations_dict[activity].append(complete_time - start_time)

for act in durations_dict:
durations_dict[act] = mean(durations_dict[act])
if aggregation_measure == "median":
durations_dict[act] = median(durations_dict[act])
elif aggregation_measure == "min":
durations_dict[act] = min(durations_dict[act])
elif aggregation_measure == "max":
durations_dict[act] = max(durations_dict[act])
elif aggregation_measure == "sum":
durations_dict[act] = sum(durations_dict[act])
else:
durations_dict[act] = mean(durations_dict[act])

return durations_dict
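
A sketch of the new aggregation switch (not part of the commit; log is an assumed pm4py EventLog, and "start_timestamp" is a placeholder for whatever start-timestamp attribute the log actually uses):

    from pm4py.statistics.sojourn_time.log import get as soj_time_get

    median_sojourn = soj_time_get.apply(log, parameters={
        soj_time_get.Parameters.START_TIMESTAMP_KEY: "start_timestamp",
        soj_time_get.Parameters.AGGREGATION_MEASURE: "median",  # unknown values fall back to mean
        soj_time_get.Parameters.BUSINESS_HOURS: True,
    })
    # median_sojourn: {activity: median sojourn time in seconds}
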
24 changes: 20 additions & 4 deletions pm4py/statistics/sojourn_time/pandas/get.py
@@ -1,15 +1,16 @@
import pandas as pd
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants
from pm4py.util.business_hours import soj_time_business_hours_diff
import pandas as pd
from typing import Optional, Dict, Any, Union, Tuple, List, Set
from typing import Optional, Dict, Any, Union


class Parameters(Enum):
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
AGGREGATION_MEASURE = "aggregationMeasure"
BUSINESS_HOURS = "business_hours"
WORKTIMING = "worktiming"
WEEKENDS = "weekends"
@@ -38,6 +39,7 @@ def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Union[str, Paramete
Default: [7, 17] (work shift from 07:00 to 17:00)
- Parameters.WEEKENDS => indexes of the days of the week that are weekend
Default: [6, 7] (weekends are Saturday and Sunday)
- Parameters.AGGREGATION_MEASURE => performance aggregation measure (sum, min, max, mean, median)
Returns
--------------
@@ -56,18 +58,32 @@ def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Union[str, Paramete
xes_constants.DEFAULT_TIMESTAMP_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
aggregation_measure = exec_utils.get_param_value(Parameters.AGGREGATION_MEASURE,
parameters, "mean")

if business_hours:
dataframe[DIFF_KEY] = dataframe.apply(
lambda x: soj_time_business_hours_diff(x[start_timestamp_key], x[timestamp_key], worktiming,
weekends), axis=1)
else:
dataframe[DIFF_KEY] = (
dataframe[timestamp_key] - dataframe[start_timestamp_key]).astype('timedelta64[s]')
dataframe[timestamp_key] - dataframe[start_timestamp_key]
).astype('timedelta64[s]')

dataframe = dataframe.reset_index()

ret_dict = dataframe.groupby(activity_key)[DIFF_KEY].mean().to_dict()
column = dataframe.groupby(activity_key)[DIFF_KEY]
if aggregation_measure == "median":
ret_dict = column.median().to_dict()
elif aggregation_measure == "min":
ret_dict = column.min().to_dict()
elif aggregation_measure == "max":
ret_dict = column.max().to_dict()
elif aggregation_measure == "sum":
ret_dict = column.sum().to_dict()
else:
ret_dict = column.mean().to_dict()

# assure to avoid problems with np.float64, by using the Python float type
for el in ret_dict:
ret_dict[el] = float(ret_dict[el])
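
The pandas variant gains the same switch; a sketch (df assumed as above, with "start_timestamp" again a placeholder column name):

    from pm4py.statistics.sojourn_time.pandas import get as soj_time_get_pd

    total_sojourn = soj_time_get_pd.apply(df, parameters={
        soj_time_get_pd.Parameters.START_TIMESTAMP_KEY: "start_timestamp",
        soj_time_get_pd.Parameters.AGGREGATION_MEASURE: "sum",
        soj_time_get_pd.Parameters.BUSINESS_HOURS: True,
    })
    # values come back as plain Python floats, thanks to the float() cast in the loop above
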