diff --git a/pm4py/__init__.py b/pm4py/__init__.py
index e4ba0b01b..1ac56182f 100644
--- a/pm4py/__init__.py
+++ b/pm4py/__init__.py
@@ -3,7 +3,7 @@
 
 from pm4py import util, objects, statistics, algo, visualization
 from pm4py.analysis import check_soundness, solve_marking_equation, solve_extended_marking_equation, \
-    construct_synchronous_product_net
+    construct_synchronous_product_net, insert_artificial_start_end
 from pm4py.conformance import conformance_diagnostics_token_based_replay, conformance_diagnostics_alignments, \
     fitness_token_based_replay, \
     fitness_alignments, precision_token_based_replay, \
diff --git a/pm4py/analysis.py b/pm4py/analysis.py
index 27caf7b5c..6a4f07583 100644
--- a/pm4py/analysis.py
+++ b/pm4py/analysis.py
@@ -1,7 +1,11 @@
-from typing import List, Optional, Tuple, Dict
+from typing import List, Optional, Tuple, Dict, Union
 
-from pm4py.objects.log.obj import Trace
+from pm4py.objects.log.obj import Trace, EventLog
 from pm4py.objects.petri_net.obj import PetriNet, Marking
+from pm4py.utils import get_properties
+from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns
+
+import pandas as pd
 
 
 def construct_synchronous_product_net(trace: Trace, petri_net: PetriNet, initial_marking: Marking,
@@ -139,3 +143,31 @@ def check_soundness(petri_net: PetriNet, initial_marking: Marking,
     """
     from pm4py.algo.analysis.woflan import algorithm as woflan
     return woflan.apply(petri_net, initial_marking, final_marking)
+
+
+def insert_artificial_start_end(log: Union[EventLog, pd.DataFrame]) -> Union[EventLog, pd.DataFrame]:
+    """
+    Inserts the artificial start/end activities in an event log / Pandas dataframe
+
+    The type of the input selects the implementation: Pandas dataframes are delegated to
+    dataframe_utils.insert_artificial_start_end, any other object to
+    artificial.insert_artificial_start_end (both modules of pm4py.objects.log.util).
+
+    Parameters
+    ------------------
+    log
+        Event log / Pandas dataframe
+
+    Returns
+    ------------------
+    log
+        Event log / Pandas dataframe with artificial start / end activities
+    """
+    properties = get_properties(log)
+    if check_is_pandas_dataframe(log):
+        check_pandas_dataframe_columns(log)
+        from pm4py.objects.log.util import dataframe_utils
+        return dataframe_utils.insert_artificial_start_end(log, parameters=properties)
+    else:
+        from pm4py.objects.log.util import artificial
+        return artificial.insert_artificial_start_end(log, parameters=properties)
diff --git a/pm4py/objects/log/util/__init__.py b/pm4py/objects/log/util/__init__.py
index 47ca6f2c5..f4e094473 100644
--- a/pm4py/objects/log/util/__init__.py
+++ b/pm4py/objects/log/util/__init__.py
@@ -1,7 +1,7 @@
 from pm4py.objects.log.util import insert_classifier, log, sampling, \
     sorting, index_attribute, get_class_representation, get_prefixes, \
     get_log_encoded, interval_lifecycle, log_regex, basic_filter, \
-    filtering_utils, split_train_test, xes
+    filtering_utils, split_train_test, xes, artificial
 
 import pkgutil
 if pkgutil.find_loader("pandas"):
diff --git a/pm4py/objects/log/util/artificial.py b/pm4py/objects/log/util/artificial.py
new file mode 100644
index 000000000..99cfbbe0c
--- /dev/null
+++ b/pm4py/objects/log/util/artificial.py
@@ -0,0 +1,53 @@
+from enum import Enum
+from typing import Optional, Dict, Any
+
+from pm4py.objects.log.obj import Event
+from pm4py.objects.log.obj import EventLog
+from pm4py.util import constants
+from pm4py.util import exec_utils
+from pm4py.util import xes_constants
+
+
+class Parameters(Enum):
+    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
+    PARAM_ARTIFICIAL_START_ACTIVITY = constants.PARAM_ARTIFICIAL_START_ACTIVITY
+    PARAM_ARTIFICIAL_END_ACTIVITY = constants.PARAM_ARTIFICIAL_END_ACTIVITY
+
+
+def insert_artificial_start_end(log: EventLog, parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
+    """
+    Inserts the artificial start/end activities in an event log
+
+    The traces of the provided log are modified in place: one event is inserted at the
+    beginning and one is appended at the end of each trace. The artificial events carry
+    only the activity attribute.
+
+    Parameters
+    -------------------
+    log
+        Event log
+    parameters
+        Parameters of the algorithm, including:
+        - Parameters.ACTIVITY_KEY: the activity
+        - Parameters.PARAM_ARTIFICIAL_START_ACTIVITY: the name of the artificial start activity
+        - Parameters.PARAM_ARTIFICIAL_END_ACTIVITY: the name of the artificial end activity
+
+    Returns
+    ------------------
+    log
+        Enriched log (the same object provided as input)
+    """
+    if parameters is None:
+        parameters = {}
+
+    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
+    artificial_start_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_START_ACTIVITY, parameters,
+                                                           constants.DEFAULT_ARTIFICIAL_START_ACTIVITY)
+    artificial_end_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_END_ACTIVITY, parameters,
+                                                         constants.DEFAULT_ARTIFICIAL_END_ACTIVITY)
+
+    for trace in log:
+        trace.insert(0, Event({activity_key: artificial_start_activity}))
+        trace.append(Event({activity_key: artificial_end_activity}))
+
+    return log
diff --git a/pm4py/objects/log/util/dataframe_utils.py b/pm4py/objects/log/util/dataframe_utils.py
index 0582de460..24842be47 100644
--- a/pm4py/objects/log/util/dataframe_utils.py
+++ b/pm4py/objects/log/util/dataframe_utils.py
@@ -8,7 +8,7 @@
 from pm4py.util import constants
 from pm4py.util import exec_utils
 from pm4py.util import points_subset
-from pm4py.util import xes_constants
+from pm4py.util import xes_constants, pandas_utils
 
 LEGACY_PARQUET_TP_REPLACER = "AAA"
 LEGACY_PARQUET_CASECONCEPTNAME = "caseAAAconceptAAAname"
@@ -23,6 +23,9 @@ class Parameters(Enum):
     MAX_DIFFERENT_OCC_STR_ATTR = 50
     TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
     ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
+    PARAM_ARTIFICIAL_START_ACTIVITY = constants.PARAM_ARTIFICIAL_START_ACTIVITY
+    PARAM_ARTIFICIAL_END_ACTIVITY = constants.PARAM_ARTIFICIAL_END_ACTIVITY
+    INDEX_KEY = "index_key"
 
 
 def insert_partitioning(df, num_partitions, parameters=None):
@@ -371,3 +374,64 @@ def automatic_feature_extraction_df(df: pd.DataFrame, parameters: Optional[Dict[
         columns.remove(timestamp_key)
 
     return get_features_df(fea_sel_df, list(columns), parameters=parameters)
+
+
+def insert_artificial_start_end(df0: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
+    """
+    Inserts the artificial start/end activities in a Pandas dataframe
+
+    The provided dataframe is not modified, since the rows are added to a copy. The artificial
+    rows are built only from the case identifier, timestamp and activity columns.
+
+    Parameters
+    ------------------
+    df0
+        Dataframe
+    parameters
+        Parameters of the algorithm, including:
+        - Parameters.CASE_ID_KEY: the case identifier
+        - Parameters.TIMESTAMP_KEY: the timestamp
+        - Parameters.ACTIVITY_KEY: the activity
+        - Parameters.PARAM_ARTIFICIAL_START_ACTIVITY: the name of the artificial start activity
+        - Parameters.PARAM_ARTIFICIAL_END_ACTIVITY: the name of the artificial end activity
+        - Parameters.INDEX_KEY: the column used, after the case identifier and the timestamp, to sort the rows
+
+    Returns
+    -----------------
+    enriched_df
+        Dataframe with artificial start/end activities
+    """
+    if parameters is None:
+        parameters = {}
+
+    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
+    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
+    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
+
+    artificial_start_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_START_ACTIVITY, parameters, constants.DEFAULT_ARTIFICIAL_START_ACTIVITY)
+    artificial_end_activity = exec_utils.get_param_value(Parameters.PARAM_ARTIFICIAL_END_ACTIVITY, parameters, constants.DEFAULT_ARTIFICIAL_END_ACTIVITY)
+
+    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY)
+
+    df = df0.copy()
+    df = pandas_utils.insert_index(df, index_key)
+    df = df.sort_values([case_id_key, timestamp_key, index_key])
+
+    # the cases are grouped once; the same grouping gives both the first and the last timestamp of each case
+    grouped_df = df[[case_id_key, timestamp_key]].groupby(case_id_key)
+    start_df = grouped_df.first().reset_index()
+    end_df = grouped_df.last().reset_index()
+    # stability trick: remove 1ms from the artificial start activity timestamp, add 1ms to the artificial end activity timestamp
+    start_df[timestamp_key] = start_df[timestamp_key] - pd.Timedelta("1 ms")
+    end_df[timestamp_key] = end_df[timestamp_key] + pd.Timedelta("1 ms")
+
+    start_df[activity_key] = artificial_start_activity
+    end_df[activity_key] = artificial_end_activity
+
+    df = pd.concat([start_df, df, end_df])
+    df = pandas_utils.insert_index(df, index_key)
+    df = df.sort_values([case_id_key, timestamp_key, index_key])
+
+    df.attrs = df0.attrs
+
+    return df