-
Notifications
You must be signed in to change notification settings - Fork 300
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'PM4PY-1556' into 'integration'
PM4PY-1556 Prefixes and suffixes filters for Pandas dataframes See merge request process-mining/pm4py/pm4py-core!595
- Loading branch information
Showing
4 changed files
with
145 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from pm4py.algo.filtering.pandas.prefixes import prefix_filter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import pandas as pd | ||
|
||
from enum import Enum | ||
from typing import Optional, Dict, Any | ||
|
||
from pm4py.util import constants | ||
from pm4py.util import exec_utils | ||
from pm4py.util import xes_constants, pandas_utils | ||
|
||
|
||
class Parameters(Enum):
    """Keys accepted in the ``parameters`` dictionary of the prefixes filter."""

    # Column names, resolved through the shared pm4py parameter constants.
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY

    # Declared for callers; not read by apply() in this module.
    INDEX_KEY = "index_key"

    # Column holding the position of each event inside its case.
    INDEX_IN_TRACE_KEY = "index_in_trace_key"

    # Declared for callers; not read by apply() in this module.
    USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"

    # Name of the helper column that stores the per-case reference position.
    TEMP_COLUMN = "temp_column"

    # "first" or "last": which occurrence of the activity is the reference.
    FIRST_OR_LAST = "first_or_last"

    # Boolean: exclude (True) or include (False) the reference event itself.
    STRICT = "strict"
|
||
|
||
def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Filter all the prefixes to a given activity (first or last occurrence of the activity in the case).

    Parameters
    ----------------
    df
        Dataframe
    activity
        Activity whose (first or last) occurrence in each case delimits the prefix.
    parameters
        Parameters of the algorithm:
        - Parameters.CASE_ID_KEY => the case identifier column.
        - Parameters.ACTIVITY_KEY => the activity column.
        - Parameters.INDEX_IN_TRACE_KEY => attribute that should act as container of the index of the event inside
         the case (computed with pandas_utils.insert_ev_in_tr_index when the column is missing).
        - Parameters.TEMP_COLUMN => temporary column which is used for internal purposes
         (default: "@@temp_column"); it is present in the returned dataframe.
        - Parameters.FIRST_OR_LAST => filter on the "first" or "last" occurrence of an activity in each case
         (default: "first").
        - Parameters.STRICT => applies the filter in a strict (<) or lean (<=) way (boolean, default: True).

    Returns
    ----------------
    df
        Dataframe filtered keeping the prefixes to a given activity (first or last occurrence of the activity in
        the case). Cases in which the activity never occurs are dropped entirely.

    Raises
    ----------------
    ValueError
        If Parameters.FIRST_OR_LAST is neither "first" nor "last".
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    index_in_trace_key = exec_utils.get_param_value(Parameters.INDEX_IN_TRACE_KEY, parameters,
                                                    constants.DEFAULT_INDEX_IN_TRACE_KEY)
    temp_column = exec_utils.get_param_value(Parameters.TEMP_COLUMN, parameters, "@@temp_column")
    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)

    # Fail early with a clear message instead of an obscure error on the groupby object further down.
    if first_or_last not in ("first", "last"):
        raise ValueError("first_or_last must be either 'first' or 'last', got %r" % (first_or_last,))

    if index_in_trace_key not in df.columns:
        df = pandas_utils.insert_ev_in_tr_index(df, column_name=index_in_trace_key, case_id=case_id_key)

    # Per case, the in-trace index of the first/last event carrying the requested activity.
    # The result is a Series indexed by case id, so it can be used directly as a lookup table
    # (this avoids DataFrame.to_dict("r"), whose abbreviated orient was removed in pandas 2.0).
    occurrences = df[df[activity_key] == activity].groupby(case_id_key)[index_in_trace_key]
    position_activity = occurrences.first() if first_or_last == "first" else occurrences.last()

    # assign() returns a new dataframe, so the caller's dataframe is not modified.
    # Cases without the activity get NaN here and are removed by the comparison below.
    df = df.assign(**{temp_column: df[case_id_key].map(position_activity)})

    if strict:
        df = df[df[index_in_trace_key] < df[temp_column]]
    else:
        df = df[df[index_in_trace_key] <= df[temp_column]]

    return df
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import pandas as pd | ||
|
||
from enum import Enum | ||
from typing import Optional, Dict, Any | ||
|
||
from pm4py.util import constants | ||
from pm4py.util import exec_utils | ||
from pm4py.util import xes_constants, pandas_utils | ||
|
||
|
||
class Parameters(Enum):
    """Keys accepted in the ``parameters`` dictionary of the suffixes filter."""

    # Column names, resolved through the shared pm4py parameter constants.
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY

    # Declared for callers; not read by apply() in this module.
    INDEX_KEY = "index_key"

    # Column holding the position of each event inside its case.
    INDEX_IN_TRACE_KEY = "index_in_trace_key"

    # Declared for callers; not read by apply() in this module.
    USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"

    # Name of the helper column that stores the per-case reference position.
    TEMP_COLUMN = "temp_column"

    # "first" or "last": which occurrence of the activity is the reference.
    FIRST_OR_LAST = "first_or_last"

    # Boolean: exclude (True) or include (False) the reference event itself.
    STRICT = "strict"
|
||
|
||
def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Filter all the suffixes to a given activity (first or last occurrence of the activity in the case).

    Parameters
    ----------------
    df
        Dataframe
    activity
        Activity whose (first or last) occurrence in each case delimits the suffix.
    parameters
        Parameters of the algorithm:
        - Parameters.CASE_ID_KEY => the case identifier column.
        - Parameters.ACTIVITY_KEY => the activity column.
        - Parameters.INDEX_IN_TRACE_KEY => attribute that should act as container of the index of the event inside
         the case (computed with pandas_utils.insert_ev_in_tr_index when the column is missing).
        - Parameters.TEMP_COLUMN => temporary column which is used for internal purposes
         (default: "@@temp_column"); it is present in the returned dataframe.
        - Parameters.FIRST_OR_LAST => filter on the "first" or "last" occurrence of an activity in each case
         (default: "first").
        - Parameters.STRICT => applies the filter in a strict (>) or lean (>=) way (boolean, default: True).

    Returns
    ----------------
    df
        Dataframe filtered keeping the suffixes to a given activity (first or last occurrence of the activity in
        the case). Cases in which the activity never occurs are dropped entirely.

    Raises
    ----------------
    ValueError
        If Parameters.FIRST_OR_LAST is neither "first" nor "last".
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    index_in_trace_key = exec_utils.get_param_value(Parameters.INDEX_IN_TRACE_KEY, parameters,
                                                    constants.DEFAULT_INDEX_IN_TRACE_KEY)
    temp_column = exec_utils.get_param_value(Parameters.TEMP_COLUMN, parameters, "@@temp_column")
    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)

    # Fail early with a clear message instead of an obscure error on the groupby object further down.
    if first_or_last not in ("first", "last"):
        raise ValueError("first_or_last must be either 'first' or 'last', got %r" % (first_or_last,))

    if index_in_trace_key not in df.columns:
        df = pandas_utils.insert_ev_in_tr_index(df, column_name=index_in_trace_key, case_id=case_id_key)

    # Per case, the in-trace index of the first/last event carrying the requested activity.
    # The result is a Series indexed by case id, so it can be used directly as a lookup table
    # (this avoids DataFrame.to_dict("r"), whose abbreviated orient was removed in pandas 2.0).
    occurrences = df[df[activity_key] == activity].groupby(case_id_key)[index_in_trace_key]
    position_activity = occurrences.first() if first_or_last == "first" else occurrences.last()

    # assign() returns a new dataframe, so the caller's dataframe is not modified.
    # Cases without the activity get NaN here and are removed by the comparison below.
    df = df.assign(**{temp_column: df[case_id_key].map(position_activity)})

    if strict:
        df = df[df[index_in_trace_key] > df[temp_column]]
    else:
        df = df[df[index_in_trace_key] >= df[temp_column]]

    return df