Merge branch 'PM4PY-1556' into 'integration'

PM4PY-1556 Prefixes and suffixes filters for Pandas dataframes See merge request process-mining/pm4py/pm4py-core!595
process-intelligence-solutions · Jan 25, 2022 · 5b5c048 · 5b5c048
2 parents c7c7ed5 + 07105e7
commit 5b5c048
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 0 deletions.
diff --git a/pm4py/algo/filtering/pandas/prefixes/__init__.py b/pm4py/algo/filtering/pandas/prefixes/__init__.py
@@ -0,0 +1 @@
+from pm4py.algo.filtering.pandas.prefixes import prefix_filter
diff --git a/pm4py/algo/filtering/pandas/prefixes/prefix_filter.py b/pm4py/algo/filtering/pandas/prefixes/prefix_filter.py
@@ -0,0 +1,72 @@
+import pandas as pd
+
+from enum import Enum
+from typing import Optional, Dict, Any
+
+from pm4py.util import constants
+from pm4py.util import exec_utils
+from pm4py.util import xes_constants, pandas_utils
+
+
+class Parameters(Enum):  # keys recognised in the ``parameters`` dict passed to apply()
+    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY  # column holding the case identifier
+    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY  # column holding the activity name
+    INDEX_KEY = "index_key"  # declared here, but apply() in this module never reads it
+    INDEX_IN_TRACE_KEY = "index_in_trace_key"  # column holding the position of the event inside its case
+    USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"  # declared here, but apply() in this module never reads it
+    TEMP_COLUMN = "temp_column"  # name for the temporary helper column
+    FIRST_OR_LAST = "first_or_last"  # "first" or "last": which occurrence of the activity delimits the prefix
+    STRICT = "strict"  # boolean: True compares with <, False compares with <=
+
+
+def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
+    """
+    Keep, for every case, the events preceding the first or last occurrence of an activity.
+
+    Cases that never contain the activity are removed from the result.
+
+    Parameters
+    ----------------
+    df
+        Dataframe
+    activity
+        Activity that delimits the prefix.
+    parameters
+        Parameters of the algorithm:
+        - Parameters.CASE_ID_KEY => the case identifier column.
+        - Parameters.ACTIVITY_KEY => the activity column.
+        - Parameters.INDEX_IN_TRACE_KEY => column holding the index of the event inside the case (added through
+                                            pandas_utils.insert_ev_in_tr_index when the dataframe lacks it).
+        - Parameters.TEMP_COLUMN => accepted for backward compatibility; no helper column is written anymore.
+        - Parameters.FIRST_OR_LAST => "first" (default) or "last" occurrence of the activity in the case.
+        - Parameters.STRICT => True (default) excludes the delimiting event (<), False keeps it (<=).
+
+    Returns
+    ----------------
+    df
+        Dataframe restricted to the prefixes; this function adds no helper column to the dataframe it receives.
+
+    Raises
+    ----------------
+    ValueError
+        If Parameters.FIRST_OR_LAST is neither "first" nor "last".
+    """
+    if parameters is None:
+        parameters = {}
+
+    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
+    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
+    index_in_trace_key = exec_utils.get_param_value(Parameters.INDEX_IN_TRACE_KEY, parameters, constants.DEFAULT_INDEX_IN_TRACE_KEY)
+    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
+    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)
+    if first_or_last not in ("first", "last"):
+        raise ValueError(f"first_or_last must be 'first' or 'last', got {first_or_last!r}")
+
+    if index_in_trace_key not in df.columns:
+        df = pandas_utils.insert_ev_in_tr_index(df, column_name=index_in_trace_key, case_id=case_id_key)
+
+    # Series indexed by case id with the position of the delimiting event (avoids to_dict("r"), gone in pandas 2.0).
+    occurrences = df.loc[df[activity_key] == activity].groupby(case_id_key)[index_in_trace_key]
+    boundary = df[case_id_key].map(occurrences.first() if first_or_last == "first" else occurrences.last())
+    # Cases without the activity map to NaN; every comparison with NaN is False, so their rows are dropped.
+    return df[df[index_in_trace_key] < boundary] if strict else df[df[index_in_trace_key] <= boundary]
diff --git a/pm4py/algo/filtering/pandas/suffixes/__init__.py b/pm4py/algo/filtering/pandas/suffixes/__init__.py
diff --git a/pm4py/algo/filtering/pandas/suffixes/suffix_filter.py b/pm4py/algo/filtering/pandas/suffixes/suffix_filter.py
@@ -0,0 +1,72 @@
+import pandas as pd
+
+from enum import Enum
+from typing import Optional, Dict, Any
+
+from pm4py.util import constants
+from pm4py.util import exec_utils
+from pm4py.util import xes_constants, pandas_utils
+
+
+class Parameters(Enum):  # keys recognised in the ``parameters`` dict passed to apply()
+    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY  # column holding the case identifier
+    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY  # column holding the activity name
+    INDEX_KEY = "index_key"  # declared here, but apply() in this module never reads it
+    INDEX_IN_TRACE_KEY = "index_in_trace_key"  # column holding the position of the event inside its case
+    USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"  # declared here, but apply() in this module never reads it
+    TEMP_COLUMN = "temp_column"  # name for the temporary helper column
+    FIRST_OR_LAST = "first_or_last"  # "first" or "last": which occurrence of the activity delimits the suffix
+    STRICT = "strict"  # boolean: True compares with >, False compares with >=
+
+
+def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
+    """
+    Keep, for every case, the events following the first or last occurrence of an activity.
+
+    Cases that never contain the activity are removed from the result.
+
+    Parameters
+    ----------------
+    df
+        Dataframe
+    activity
+        Activity that delimits the suffix.
+    parameters
+        Parameters of the algorithm:
+        - Parameters.CASE_ID_KEY => the case identifier column.
+        - Parameters.ACTIVITY_KEY => the activity column.
+        - Parameters.INDEX_IN_TRACE_KEY => column holding the index of the event inside the case (added through
+                                            pandas_utils.insert_ev_in_tr_index when the dataframe lacks it).
+        - Parameters.TEMP_COLUMN => accepted for backward compatibility; no helper column is written anymore.
+        - Parameters.FIRST_OR_LAST => "first" (default) or "last" occurrence of the activity in the case.
+        - Parameters.STRICT => True (default) excludes the delimiting event (>), False keeps it (>=).
+
+    Returns
+    ----------------
+    df
+        Dataframe restricted to the suffixes; this function adds no helper column to the dataframe it receives.
+
+    Raises
+    ----------------
+    ValueError
+        If Parameters.FIRST_OR_LAST is neither "first" nor "last".
+    """
+    if parameters is None:
+        parameters = {}
+
+    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
+    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
+    index_in_trace_key = exec_utils.get_param_value(Parameters.INDEX_IN_TRACE_KEY, parameters, constants.DEFAULT_INDEX_IN_TRACE_KEY)
+    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST, parameters, "first")
+    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)
+    if first_or_last not in ("first", "last"):
+        raise ValueError(f"first_or_last must be 'first' or 'last', got {first_or_last!r}")
+
+    if index_in_trace_key not in df.columns:
+        df = pandas_utils.insert_ev_in_tr_index(df, column_name=index_in_trace_key, case_id=case_id_key)
+
+    # Series indexed by case id with the position of the delimiting event (avoids to_dict("r"), gone in pandas 2.0).
+    occurrences = df.loc[df[activity_key] == activity].groupby(case_id_key)[index_in_trace_key]
+    boundary = df[case_id_key].map(occurrences.first() if first_or_last == "first" else occurrences.last())
+    # Cases without the activity map to NaN; every comparison with NaN is False, so their rows are dropped.
+    return df[df[index_in_trace_key] > boundary] if strict else df[df[index_in_trace_key] >= boundary]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from pm4py.algo.filtering.pandas.prefixes import prefix_filter