Skip to content

Commit

Permalink
Merge branch 'ft-1333-filter-log-global-occurrences-attribute' into '…
Browse files Browse the repository at this point in the history
…integration'

PM4PY-1333 Filter log on the global occurrences of an attribute

See merge request process-mining/pm4py/pm4py-core!516
  • Loading branch information
fit-sebastiaan-van-zelst committed Jan 6, 2022
2 parents c0132b6 + a20f918 commit 5f5ff57
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pm4py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
discover_petri_net_inductive, discover_tree_inductive, discover_process_tree_inductive, discover_heuristics_net, \
discover_dfg, discover_footprints, discover_eventually_follows_graph, discover_directly_follows_graph, discover_bpmn_inductive, \
discover_performance_dfg, discover_ocdfg, discover_oc_petri_net
from pm4py.filtering import filter_start_activities, filter_end_activities, filter_attribute_values, filter_variants, \
from pm4py.filtering import filter_log_relative_occurrence_event_attribute, filter_start_activities, filter_end_activities, filter_attribute_values, filter_variants, \
filter_variants_percentage, filter_directly_follows_relation, filter_time_range, filter_trace_attribute, \
filter_eventually_follows_relation, filter_event_attribute_values, filter_trace_attribute_values, \
filter_between, filter_case_size, filter_case_performance, filter_activities_rework, filter_paths_performance, \
Expand Down
44 changes: 44 additions & 0 deletions pm4py/algo/filtering/log/attributes/attributes_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class Parameters(Enum):
STREAM_FILTER_VALUE1 = "stream_filter_value1"
STREAM_FILTER_KEY2 = "stream_filter_key2"
STREAM_FILTER_VALUE2 = "stream_filter_value2"
KEEP_ONCE_PER_CASE = "keep_once_per_case"


def apply_numeric(log: EventLog, int1: float, int2: float, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
Expand Down Expand Up @@ -357,6 +358,49 @@ def filter_log_by_attributes_threshold(log, attributes, variants, vc, threshold,
return filtered_log


def filter_log_relative_occurrence_event_attribute(log: EventLog, min_relative_stake: float, parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
    """
    Filters the event log keeping only the events having an attribute value which occurs:
    - in at least the specified (min_relative_stake) percentage of events, when Parameters.KEEP_ONCE_PER_CASE = False
    - in at least the specified (min_relative_stake) percentage of cases, when Parameters.KEEP_ONCE_PER_CASE = True

    Parameters
    -------------------
    log
        Event log
    min_relative_stake
        Minimum share (expressed as a number between 0 and 1) of the cases, or of the events
        when Parameters.KEEP_ONCE_PER_CASE is False, in which the attribute value should occur.
    parameters
        Parameters of the algorithm, including:
        - Parameters.ATTRIBUTE_KEY => the attribute to use (default: xes.DEFAULT_NAME_KEY)
        - Parameters.KEEP_ONCE_PER_CASE => decides the level of the filter to apply
        (if the filter should be applied on the cases, set it to True; default: True).

    Returns
    ------------------
    filtered_log
        Filtered event log
    """
    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, xes.DEFAULT_NAME_KEY)
    keep_once_per_case = exec_utils.get_param_value(Parameters.KEEP_ONCE_PER_CASE, parameters, True)

    # the counting helper works on a copy of the parameters
    # (presumably so it cannot alter the dictionary reused by apply_events below)
    parameters_cp = copy(parameters)

    # NOTE(review): whether the counts are per case or per event is decided inside
    # get_attribute_values from the forwarded parameters - confirm it honours KEEP_ONCE_PER_CASE
    attribute_occurrences = get_attribute_values(log, attribute_key, parameters=parameters_cp)

    # size of the reference population, computed once instead of once per attribute value
    if keep_once_per_case:
        # filter on cases
        population_size = len(log)
    else:
        # filter on events
        population_size = sum(len(trace) for trace in log)
    min_occurrences = min_relative_stake * population_size

    filtered_attributes = {value for value, count in attribute_occurrences.items() if count >= min_occurrences}

    return apply_events(log, filtered_attributes, parameters=parameters)


@deprecation.deprecated("2.2.11", "3.0.0", details="Removed")
def apply_auto_filter(log, variants=None, parameters=None):
"""
Expand Down
45 changes: 45 additions & 0 deletions pm4py/algo/filtering/pandas/attributes/attributes_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class Parameters(Enum):
STREAM_FILTER_VALUE1 = "stream_filter_value1"
STREAM_FILTER_KEY2 = "stream_filter_key2"
STREAM_FILTER_VALUE2 = "stream_filter_value2"
KEEP_ONCE_PER_CASE = "keep_once_per_case"


def apply_numeric_events(df: pd.DataFrame, int1: float, int2: float, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> pd.DataFrame:
Expand Down Expand Up @@ -333,3 +334,47 @@ def filter_df_keeping_spno_activities(df: pd.DataFrame, activity_key: str = "con

ret.attrs = copy(df.attrs) if hasattr(df, 'attrs') else {}
return df


def filter_df_relative_occurrence_event_attribute(df: pd.DataFrame, min_relative_stake: float, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Filters the dataframe keeping only the events having an attribute value which occurs:
    - in at least the specified (min_relative_stake) percentage of events, when Parameters.KEEP_ONCE_PER_CASE = False
    - in at least the specified (min_relative_stake) percentage of cases, when Parameters.KEEP_ONCE_PER_CASE = True

    Parameters
    -------------------
    df
        Pandas dataframe
    min_relative_stake
        Minimum share (expressed as a number between 0 and 1) of the cases, or of the events
        when Parameters.KEEP_ONCE_PER_CASE is False, in which the attribute value should occur.
    parameters
        Parameters of the algorithm, including:
        - Parameters.ATTRIBUTE_KEY => the attribute to use (default: DEFAULT_NAME_KEY);
        the raw PARAMETER_CONSTANT_ATTRIBUTE_KEY key is still accepted as a fallback
        - Parameters.CASE_ID_KEY => the column holding the case identifier (default: CASE_CONCEPT_NAME)
        - Parameters.KEEP_ONCE_PER_CASE => decides the level of the filter to apply
        (if the filter should be applied on the cases, set it to True; default: True).

    Returns
    ------------------
    filtered_df
        Filtered Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    # read the attribute through the documented Parameters.ATTRIBUTE_KEY (the key used by the
    # event log variant of this filter); the raw constant is kept as fallback so that callers
    # relying on the previous lookup obtain the same result
    legacy_attribute_key = exec_utils.get_param_value(PARAMETER_CONSTANT_ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, legacy_attribute_key)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    keep_once_per_case = exec_utils.get_param_value(Parameters.KEEP_ONCE_PER_CASE, parameters, True)

    # the counting helper works on a copy of the parameters
    # (presumably so it cannot alter the dictionary reused by apply_events below)
    parameters_cp = copy(parameters)

    # NOTE(review): whether the counts are per case or per event is decided inside
    # get_attribute_values from the forwarded parameters - confirm it honours KEEP_ONCE_PER_CASE
    attribute_occurrences = get_attribute_values(df, attribute_key, parameters=parameters_cp)

    # size of the reference population, computed once instead of once per attribute value
    # (nunique() scans the whole case column)
    if keep_once_per_case:
        # filter on cases
        population_size = df[case_id_key].nunique()
    else:
        # filter on events
        population_size = len(df)
    min_occurrences = min_relative_stake * population_size

    filtered_attributes = {value for value, count in attribute_occurrences.items() if count >= min_occurrences}

    return apply_events(df, filtered_attributes, parameters=parameters)
39 changes: 38 additions & 1 deletion pm4py/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,50 @@

from pm4py.meta import VERSION as PM4PY_CURRENT_VERSION
from pm4py.objects.log.obj import EventLog
from pm4py.util import constants
from pm4py.util import constants, xes_constants
from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns
from pm4py.utils import get_properties, general_checks_classical_event_log
from pm4py.objects.ocel.obj import OCEL
import datetime


def filter_log_relative_occurrence_event_attribute(log: Union[EventLog, pd.DataFrame], min_relative_stake: float, attribute_key: str = xes_constants.DEFAULT_NAME_KEY, level="cases") -> Union[EventLog, pd.DataFrame]:
    """
    Keeps only the events whose value for the given attribute is frequent enough in the log.

    A value is retained when it occurs:
    - in at least the min_relative_stake share of the events, when level="events"
    - in at least the min_relative_stake share of the cases, when level="cases"

    Parameters
    -------------------
    log
        Event log / Pandas dataframe
    min_relative_stake
        Minimum share (a number between 0 and 1) of cases/events in which the attribute value should occur.
    attribute_key
        The attribute to filter on
    level
        The level of the filter: "cases" applies it on the cases, any other value
        (e.g. "events") applies it on the events

    Returns
    ------------------
    filtered_log
        Filtered event log / Pandas dataframe (same kind of object as the input)
    """
    general_checks_classical_event_log(log)
    parameters = get_properties(log)
    on_cases = level == "cases"

    if not check_is_pandas_dataframe(log):
        # classical event log objects
        from pm4py.algo.filtering.log.attributes import attributes_filter as log_attributes_filter
        parameters[log_attributes_filter.Parameters.ATTRIBUTE_KEY] = attribute_key
        parameters[log_attributes_filter.Parameters.KEEP_ONCE_PER_CASE] = on_cases
        return log_attributes_filter.filter_log_relative_occurrence_event_attribute(
            log, min_relative_stake, parameters=parameters)

    # Pandas dataframes
    check_pandas_dataframe_columns(log)
    from pm4py.algo.filtering.pandas.attributes import attributes_filter as df_attributes_filter
    parameters[df_attributes_filter.Parameters.ATTRIBUTE_KEY] = attribute_key
    parameters[df_attributes_filter.Parameters.KEEP_ONCE_PER_CASE] = on_cases
    return df_attributes_filter.filter_df_relative_occurrence_event_attribute(
        log, min_relative_stake, parameters=parameters)


def filter_start_activities(log: Union[EventLog, pd.DataFrame], activities: Union[Set[str], List[str]], retain: bool = True) -> \
Union[EventLog, pd.DataFrame]:
"""
Expand Down

0 comments on commit 5f5ff57

Please sign in to comment.