Skip to content

Commit

Permalink
Merge branch 'release' into release-github
Browse files Browse the repository at this point in the history
# Conflicts:
#	pm4py/algo/simulation/playout/dfg/algorithm.py
#	pm4py/algo/simulation/playout/dfg/variants/__init__.py
#	pm4py/meta.py
#	pm4py/streaming/__init__.py
  • Loading branch information
fit-sebastiaan-van-zelst committed Sep 3, 2021
2 parents c1adc23 + 9bbaa1b commit 9c51ae4
Show file tree
Hide file tree
Showing 40 changed files with 1,051 additions and 227 deletions.
31 changes: 31 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,36 @@
# PM4Py Changelog

## PM4Py 2.2.13 (2021.09.03)

### Fixed

### Removed

### Deprecated

### Changed
* 5723df7b
* xes exporter now reports on xes features and xmlns
* 3b632548
* graphviz based visualizations now expose background color as a parameter

### Added
* 0592157b
* new dfg playout including performance specification
* 85739ba0
* allow pandas df to be used as an iterable for streaming simulation
* 2fa9993f
* path filter that keeps the cases of an event log containing at least one occurrence of the provided path
whose duration falls within a given time range.
* a7ee73a8
* added filter based on rework detection
* c03b6188
* add petri net, reset/inhibitor net and data petri net semantics
### Other


---

## PM4Py 2.2.12 (2021.08.19)

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = '2.2'
# The full version, including alpha/beta/rc tags
release = '2.2.12'
release = '2.2.13'

# -- General configuration ---------------------------------------------------

Expand Down
18 changes: 18 additions & 0 deletions examples/pandas_iterable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd
import pm4py
import os
from pm4py.streaming.conversion import from_pandas


def execute_script():
    """
    Reads the receipt CSV file, formats the resulting dataframe through pm4py
    and prints every item yielded by the pandas-based iterable, preceded by
    its position (starting from 0).
    """
    csv_path = os.path.join("..", "tests", "input_data", "receipt.csv")
    dataframe = pm4py.format_dataframe(pd.read_csv(csv_path))
    traces_iterable = from_pandas.apply(dataframe)
    for position, trace in enumerate(traces_iterable):
        print(position, trace)


if __name__ == "__main__":
    execute_script()
22 changes: 22 additions & 0 deletions examples/pandas_iterable_to_trace_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pandas as pd
import pm4py
import os
from pm4py.streaming.conversion import from_pandas
from pm4py.streaming.stream.live_trace_stream import LiveTraceStream
from pm4py.streaming.util import trace_stream_printer


def execute_script():
    """
    Reads the receipt CSV file, formats the resulting dataframe through pm4py
    and sends the content of the pandas-based iterable to a live trace stream
    on which a trace stream printer has been registered.
    """
    csv_path = os.path.join("..", "tests", "input_data", "receipt.csv")
    dataframe = pm4py.format_dataframe(pd.read_csv(csv_path))
    traces_iterable = from_pandas.apply(dataframe)

    stream_printer = trace_stream_printer.TraceStreamPrinter()
    live_stream = LiveTraceStream()
    live_stream.register(stream_printer)

    # the stream is started before the iterable is pushed to it, and stopped right after
    live_stream.start()
    traces_iterable.to_trace_stream(live_stream)
    live_stream.stop()


if __name__ == "__main__":
    execute_script()
17 changes: 17 additions & 0 deletions examples/performance_dfg_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os

import pm4py
from pm4py.algo.simulation.playout.dfg import algorithm as dfg_simulator


def execute_script():
    """
    Discovers the frequency DFG and the performance DFG from the receipt XES log,
    then runs the DFG playout with the PERFORMANCE variant and prints the simulated log.
    """
    xes_path = os.path.join("..", "tests", "input_data", "receipt.xes")
    log = pm4py.read_xes(xes_path)

    # the start/end activities passed to the simulator are the ones returned
    # by the performance DFG discovery (the second of the two calls)
    frequency_dfg, _, _ = pm4py.discover_dfg(log)
    performance_dfg, start_activities, end_activities = pm4py.discover_performance_dfg(log)

    simulation_parameters = {"performance_dfg": performance_dfg}
    simulated_log = dfg_simulator.apply(
        frequency_dfg,
        start_activities,
        end_activities,
        variant=dfg_simulator.Variants.PERFORMANCE,
        parameters=simulation_parameters,
    )
    print(simulated_log)


if __name__ == "__main__":
    execute_script()
53 changes: 52 additions & 1 deletion pm4py/algo/filtering/log/paths/paths_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@
from pm4py.objects.log.obj import EventLog, Trace
from pm4py.util import exec_utils
from pm4py.util import xes_constants as xes
from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY
from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY, PARAMETER_CONSTANT_TIMESTAMP_KEY
import deprecation
import sys

from typing import Optional, Dict, Any, Union, Tuple, List
from pm4py.objects.log.obj import EventLog, EventStream, Trace
Expand All @@ -32,6 +33,9 @@ class Parameters(Enum):
ATTRIBUTE_KEY = PARAMETER_CONSTANT_ATTRIBUTE_KEY
DECREASING_FACTOR = "decreasingFactor"
POSITIVE = "positive"
TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
MIN_PERFORMANCE = "min_performance"
MAX_PERFORMANCE = "max_performance"


def apply(log: EventLog, paths: List[Tuple[str, str]], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
Expand Down Expand Up @@ -72,6 +76,53 @@ def apply(log: EventLog, paths: List[Tuple[str, str]], parameters: Optional[Dict
return filtered_log


def apply_performance(log: EventLog, provided_path: Tuple[str, str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
    """
    Selects the cases of an event log depending on the presence of at least one
    occurrence of the provided path (two directly consecutive events) whose time
    difference, in seconds, lies between the minimal and the maximal performance.

    Parameters
    ----------------
    log
        Event log
    provided_path
        Path between two activities (expressed as tuple)
    parameters
        Parameters of the filter, including:
            Parameters.ATTRIBUTE_KEY -> Attribute identifying the activity in the log
            Parameters.TIMESTAMP_KEY -> Attribute identifying the timestamp in the log
            Parameters.POSITIVE -> If True (default), keeps the cases containing such an occurrence;
                                   if False, keeps the other cases
            Parameters.MIN_PERFORMANCE -> Minimal allowed performance of the provided path (default: 0)
            Parameters.MAX_PERFORMANCE -> Maximal allowed performance of the provided path (default: sys.maxsize)

    Returns
    ----------------
    filtered_log
        Filtered event log
    """
    if parameters is None:
        parameters = {}

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    min_performance = exec_utils.get_param_value(Parameters.MIN_PERFORMANCE, parameters, 0)
    max_performance = exec_utils.get_param_value(Parameters.MAX_PERFORMANCE, parameters, sys.maxsize)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    def _contains_path_in_range(trace) -> bool:
        # scans the couples of directly-following events, stopping at the first suitable one
        for idx in range(len(trace) - 1):
            source_event = trace[idx]
            target_event = trace[idx + 1]
            if (source_event[attribute_key], target_event[attribute_key]) == provided_path:
                elapsed = target_event[timestamp_key].timestamp() - source_event[timestamp_key].timestamp()
                if min_performance <= elapsed <= max_performance:
                    return True
        return False

    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                            omni_present=log.omni_present, properties=log.properties)

    # a case is kept when the outcome of the check agrees with the requested polarity
    keep_matching = bool(positive)
    for trace in log:
        if _contains_path_in_range(trace) == keep_matching:
            filtered_log.append(trace)

    return filtered_log


def get_paths_from_log(log, attribute_key="concept:name"):
"""
Get the paths of the log along with their count
Expand Down
17 changes: 17 additions & 0 deletions pm4py/algo/filtering/log/rework/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
from pm4py.algo.filtering.log.rework import rework_filter
74 changes: 74 additions & 0 deletions pm4py/algo/filtering/log/rework/rework_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
from enum import Enum
from pm4py.util import constants, xes_constants, exec_utils
from pm4py.objects.log.obj import EventLog
from collections import Counter
from typing import Optional, Dict, Any


class Parameters(Enum):
    """
    Parameters accepted by the rework filter.
    """
    # attribute of the events to be used as activity
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    # minimum number of occurrences of the activity inside a case
    MIN_OCCURRENCES = "min_occurrences"
    # selects whether the cases with rework (True) or the remaining ones (False) are kept
    POSITIVE = "positive"


def apply(log: EventLog, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
    """
    Applies the rework filter on the provided event log and activity.

    By default, the cases in which the given activity occurs at least
    Parameters.MIN_OCCURRENCES (default: 2) times are kept.
    Setting Parameters.POSITIVE to False keeps instead the cases in which the
    activity does not occur, or occurs less than Parameters.MIN_OCCURRENCES times.

    Parameters
    -------------------
    log
        Event log
    activity
        Activity of which the rework shall be filtered
    parameters
        Parameters of the filter, including:
        - Parameters.ACTIVITY_KEY => the attribute to use as activity
        - Parameters.MIN_OCCURRENCES => the minimum number of occurrences for the activity
        - Parameters.POSITIVE => if True, keeps the cases of the log having at least MIN_OCCURRENCES occurrences.
            if False, keeps the cases of the log where such behavior does not occur.

    Returns
    -----------------
    filtered_log
        Filtered event log
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    min_occurrences = exec_utils.get_param_value(Parameters.MIN_OCCURRENCES, parameters, 2)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                            omni_present=log.omni_present, properties=log.properties)

    keep_reworked = bool(positive)
    for trace in log:
        occurrences = Counter(event[activity_key] for event in trace)[activity]
        # the activity must be present in the case, regardless of the value of min_occurrences
        is_reworked = occurrences > 0 and occurrences >= min_occurrences
        if is_reworked == keep_reworked:
            filtered_log.append(trace)

    return filtered_log
61 changes: 60 additions & 1 deletion pm4py/algo/filtering/pandas/paths/paths_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import deprecation
from typing import Optional, Dict, Any, Union, Tuple, List
import pandas as pd
import sys


class Parameters(Enum):
Expand All @@ -36,6 +37,8 @@ class Parameters(Enum):
TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
DECREASING_FACTOR = "decreasingFactor"
POSITIVE = "positive"
MIN_PERFORMANCE = "min_performance"
MAX_PERFORMANCE = "max_performance"


def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> pd.DataFrame:
Expand All @@ -61,11 +64,11 @@ def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[D
"""
if parameters is None:
parameters = {}
paths = [path[0] + "," + path[1] for path in paths]
case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
paths = [path[0] + "," + path[1] for path in paths]
df = df.sort_values([case_id_glue, timestamp_key])
filt_df = df[[case_id_glue, attribute_key]]
filt_dif_shifted = filt_df.shift(-1)
Expand All @@ -84,6 +87,62 @@ def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[D
return ret


def apply_performance(df: pd.DataFrame, provided_path: Tuple[str, str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> pd.DataFrame:
    """
    Filters the cases of a dataframe where there is at least one occurrence of the provided path
    occurring in the defined timedelta range.

    An occurrence of the path is a couple of directly consecutive events of the same case
    (after sorting by case identifier and timestamp); its performance is the difference,
    in seconds, between the timestamps of the two events.

    Parameters
    ----------
    df
        Dataframe
    provided_path
        Path between two activities (expressed as tuple)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Case ID column in the dataframe
            Parameters.ATTRIBUTE_KEY -> Attribute we want to filter
            Parameters.TIMESTAMP_KEY -> Attribute identifying the timestamp in the log
            Parameters.POSITIVE -> Specifies if the filter should be applied including traces (positive=True)
            or excluding traces (positive=False)
            Parameters.MIN_PERFORMANCE -> Minimal allowed performance of the provided path (default: 0)
            Parameters.MAX_PERFORMANCE -> Maximal allowed performance of the provided path (default: sys.maxsize)

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
    min_performance = exec_utils.get_param_value(Parameters.MIN_PERFORMANCE, parameters, 0)
    max_performance = exec_utils.get_param_value(Parameters.MAX_PERFORMANCE, parameters, sys.maxsize)
    path_label = provided_path[0] + "," + provided_path[1]

    df = df.sort_values([case_id_glue, timestamp_key])
    filt_df = df[[case_id_glue, attribute_key, timestamp_key]]
    filt_dif_shifted = filt_df.shift(-1)
    filt_dif_shifted.columns = [str(col) + '_2' for col in filt_dif_shifted.columns]
    stacked_df = pd.concat([filt_df, filt_dif_shifted], axis=1)

    # the shift also couples the last event of a case with the first event of the next one:
    # such rows do not describe a path and must not contribute to the selection of a case
    same_case = stacked_df[case_id_glue] == stacked_df[case_id_glue + "_2"]
    on_path = (stacked_df[attribute_key] + "," + stacked_df[attribute_key + "_2"]) == path_label
    stacked_df = stacked_df[same_case & on_path]

    # total_seconds() provides a plain numeric series on every pandas version, so it can be
    # compared with the numeric bounds (astype('timedelta64[s]') keeps a timedelta dtype on pandas >= 2.0)
    elapsed_seconds = (stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]).dt.total_seconds()
    stacked_df = stacked_df[(elapsed_seconds >= min_performance) & (elapsed_seconds <= max_performance)]

    i1 = df.set_index(case_id_glue).index
    i2 = stacked_df.set_index(case_id_glue).index
    if positive:
        ret = df[i1.isin(i2)]
    else:
        ret = df[~i1.isin(i2)]

    ret.attrs = copy(df.attrs) if hasattr(df, 'attrs') else {}
    return ret


@deprecation.deprecated("2.2.11", "3.0.0", details="Removed")
def apply_auto_filter(df, parameters=None):
del df
Expand Down
17 changes: 17 additions & 0 deletions pm4py/algo/filtering/pandas/rework/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
from pm4py.algo.filtering.pandas.rework import rework_filter
Loading

0 comments on commit 9c51ae4

Please sign in to comment.