Skip to content

Commit

Permalink
Merge branch 'PM4PY-1556' into 'integration'
Browse files Browse the repository at this point in the history
PM4PY-1556 Prefixes and suffixes filters for Pandas dataframes

See merge request process-mining/pm4py/pm4py-core!595
  • Loading branch information
fit-sebastiaan-van-zelst committed Jan 25, 2022
2 parents c7c7ed5 + 07105e7 commit 5b5c048
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 0 deletions.
1 change: 1 addition & 0 deletions pm4py/algo/filtering/pandas/prefixes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.filtering.pandas.prefixes import prefix_filter
72 changes: 72 additions & 0 deletions pm4py/algo/filtering/pandas/prefixes/prefix_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pandas as pd

from enum import Enum
from typing import Optional, Dict, Any

from pm4py.util import constants
from pm4py.util import exec_utils
from pm4py.util import xes_constants, pandas_utils


class Parameters(Enum):
    """
    Keys accepted in the ``parameters`` dictionary of the prefix filter.
    """
    # Column names; the values come from the shared pm4py constants module.
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    # INDEX_KEY is declared here but is not read by apply() in this module.
    INDEX_KEY = "index_key"
    # Column holding the position of each event inside its case.
    INDEX_IN_TRACE_KEY = "index_in_trace_key"
    # USE_EXTREMES_TIMESTAMP is declared here but is not read by apply() in this module.
    USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"
    # Name of the helper column used internally by apply().
    TEMP_COLUMN = "temp_column"
    # "first" or "last": which occurrence of the activity acts as the cut point.
    FIRST_OR_LAST = "first_or_last"
    # Boolean: whether the cut-point event itself is excluded from the result.
    STRICT = "strict"


def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None):
    """
    Filter all the prefixes to a given activity (first or last occurrence of the activity in the case).

    Cases that do not contain the activity are removed entirely from the result.
    The dataframe passed by the caller is not modified by the filtering step.

    Parameters
    ----------------
    df
        Dataframe
    activity
        Activity whose (first or last) occurrence marks the end of the prefix.
    parameters
        Parameters of the algorithm:
        - Parameters.CASE_ID_KEY => the case identifier column.
        - Parameters.ACTIVITY_KEY => the activity column.
        - Parameters.INDEX_IN_TRACE_KEY => attribute that should act as container of the index of the event inside
        the case. If the column is missing it is created through pandas_utils.insert_ev_in_tr_index.
        - Parameters.TEMP_COLUMN => accepted for backward compatibility; the per-event cut position is now kept
        in a standalone series, so no temporary column is written to the dataframe.
        - Parameters.FIRST_OR_LAST => filter on the first or last occurrence of an activity in the dataframe
        ("first" or "last").
        - Parameters.STRICT => applies the filter in a strict (<) or lean (<=) way (boolean).

    Returns
    ----------------
    df
        Dataframe filtered keeping the prefixes to a given activity (first or last occurrence of the activity in the case).

    Raises
    ----------------
    ValueError
        If Parameters.FIRST_OR_LAST is neither "first" nor "last".
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    index_in_trace_key = exec_utils.get_param_value(Parameters.INDEX_IN_TRACE_KEY, parameters, constants.DEFAULT_INDEX_IN_TRACE_KEY)
    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)

    # Fail early with a clear message instead of an AttributeError on a GroupBy object later on.
    if first_or_last not in ("first", "last"):
        raise ValueError(
            f"unsupported value for Parameters.FIRST_OR_LAST: {first_or_last!r} (expected 'first' or 'last')"
        )

    if index_in_trace_key not in df.columns:
        df = pandas_utils.insert_ev_in_tr_index(df, column_name=index_in_trace_key, case_id=case_id_key)

    # Position of the first/last occurrence of the activity, per case.
    occurrences = df[df[activity_key] == activity].groupby(case_id_key)[index_in_trace_key]
    if first_or_last == "first":
        position_activity = occurrences.first()
    else:
        position_activity = occurrences.last()
    # Series.to_dict() yields {case id: position}; this replaces to_dict("r"), whose abbreviated
    # orient is rejected by pandas >= 2.0.
    position_activity = position_activity.to_dict()

    # Cut position aligned to every event. Cases without the activity map to NaN, and every
    # comparison against NaN is False, so those cases are dropped. Keeping this as a separate
    # series avoids writing a helper column into the caller's dataframe.
    cut_position = df[case_id_key].map(position_activity)
    if strict:
        df = df[df[index_in_trace_key] < cut_position]
    else:
        df = df[df[index_in_trace_key] <= cut_position]

    return df
Empty file.
72 changes: 72 additions & 0 deletions pm4py/algo/filtering/pandas/suffixes/suffix_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pandas as pd

from enum import Enum
from typing import Optional, Dict, Any

from pm4py.util import constants
from pm4py.util import exec_utils
from pm4py.util import xes_constants, pandas_utils


class Parameters(Enum):
    """
    Keys accepted in the ``parameters`` dictionary of the suffix filter.
    """
    # Column names; the values come from the shared pm4py constants module.
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    # INDEX_KEY is declared here but is not read by apply() in this module.
    INDEX_KEY = "index_key"
    # Column holding the position of each event inside its case.
    INDEX_IN_TRACE_KEY = "index_in_trace_key"
    # USE_EXTREMES_TIMESTAMP is declared here but is not read by apply() in this module.
    USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"
    # Name of the helper column used internally by apply().
    TEMP_COLUMN = "temp_column"
    # "first" or "last": which occurrence of the activity acts as the cut point.
    FIRST_OR_LAST = "first_or_last"
    # Boolean: whether the cut-point event itself is excluded from the result.
    STRICT = "strict"


def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None):
    """
    Filter all the suffixes to a given activity (first or last occurrence of the activity in the case).

    Cases that do not contain the activity are removed entirely from the result.
    The dataframe passed by the caller is not modified by the filtering step.

    Parameters
    ----------------
    df
        Dataframe
    activity
        Activity whose (first or last) occurrence marks the start of the suffix.
    parameters
        Parameters of the algorithm:
        - Parameters.CASE_ID_KEY => the case identifier column.
        - Parameters.ACTIVITY_KEY => the activity column.
        - Parameters.INDEX_IN_TRACE_KEY => attribute that should act as container of the index of the event inside
        the case. If the column is missing it is created through pandas_utils.insert_ev_in_tr_index.
        - Parameters.TEMP_COLUMN => accepted for backward compatibility; the per-event cut position is now kept
        in a standalone series, so no temporary column is written to the dataframe.
        - Parameters.FIRST_OR_LAST => filter on the first or last occurrence of an activity in the dataframe
        ("first" or "last").
        - Parameters.STRICT => applies the filter in a strict (>) or lean (>=) way (boolean).

    Returns
    ----------------
    df
        Dataframe filtered keeping the suffixes from a given activity (first or last occurrence of the activity in the case).

    Raises
    ----------------
    ValueError
        If Parameters.FIRST_OR_LAST is neither "first" nor "last".
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    index_in_trace_key = exec_utils.get_param_value(Parameters.INDEX_IN_TRACE_KEY, parameters, constants.DEFAULT_INDEX_IN_TRACE_KEY)
    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)

    # Fail early with a clear message instead of an AttributeError on a GroupBy object later on.
    if first_or_last not in ("first", "last"):
        raise ValueError(
            f"unsupported value for Parameters.FIRST_OR_LAST: {first_or_last!r} (expected 'first' or 'last')"
        )

    if index_in_trace_key not in df.columns:
        df = pandas_utils.insert_ev_in_tr_index(df, column_name=index_in_trace_key, case_id=case_id_key)

    # Position of the first/last occurrence of the activity, per case.
    occurrences = df[df[activity_key] == activity].groupby(case_id_key)[index_in_trace_key]
    if first_or_last == "first":
        position_activity = occurrences.first()
    else:
        position_activity = occurrences.last()
    # Series.to_dict() yields {case id: position}; this replaces to_dict("r"), whose abbreviated
    # orient is rejected by pandas >= 2.0.
    position_activity = position_activity.to_dict()

    # Cut position aligned to every event. Cases without the activity map to NaN, and every
    # comparison against NaN is False, so those cases are dropped. Keeping this as a separate
    # series avoids writing a helper column into the caller's dataframe.
    cut_position = df[case_id_key].map(position_activity)
    if strict:
        df = df[df[index_in_trace_key] > cut_position]
    else:
        df = df[df[index_in_trace_key] >= cut_position]

    return df

0 comments on commit 5b5c048

Please sign in to comment.