Merge branch 'release' into release-github

# Conflicts: # pm4py/algo/simulation/playout/dfg/algorithm.py # pm4py/algo/simulation/playout/dfg/variants/__init__.py # pm4py/meta.py # pm4py/streaming/__init__.py
process-intelligence-solutions · Sep 3, 2021 · 9c51ae4 · 9c51ae4
2 parents c1adc23 + 9bbaa1b
commit 9c51ae4
Show file tree

Hide file tree

Showing 40 changed files with 1,051 additions and 227 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,36 @@
 # PM4Py Changelog
 
+## PM4Py 2.2.13 (2021.09.03)
+
+### Fixed
+
+### Removed
+
+### Deprecated
+
+### Changed
+* 5723df7b
+    * xes exporter now reports on xes features and xmlns
+* 3b632548
+    * graphviz-based visualizations now expose the background color as a parameter
+
+### Added
+* 0592157b
+    * new dfg playout including performance specification
+* 85739ba0
+    * allow a pandas DataFrame to be used as an iterable for streaming simulation
+* 2fa9993f
+    * path filter that selects the cases of an event log with at least one occurrence of the provided path
+      within a given time range.
+* a7ee73a8
+    * added filter based on rework detection
+* c03b6188
+    * add Petri net, reset/inhibitor net and data Petri net semantics
+### Other
+
+
+---
+
 ## PM4Py 2.2.12 (2021.08.19)
 
 ### Fixed

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = '2.2'
 # The full version, including alpha/beta/rc tags
-release = '2.2.12'
+release = '2.2.13'
 
 # -- General configuration ---------------------------------------------------
 

diff --git a/examples/pandas_iterable.py b/examples/pandas_iterable.py
@@ -0,0 +1,18 @@
+import pandas as pd
+import pm4py
+import os
+from pm4py.streaming.conversion import from_pandas
+
+
+def execute_script():
+    df = pd.read_csv(os.path.join("..", "tests", "input_data", "receipt.csv"))
+    df = pm4py.format_dataframe(df)
+    it = from_pandas.apply(df)
+    count = 0
+    for trace in it:
+        print(count, trace)
+        count = count + 1
+
+
+if __name__ == "__main__":
+    execute_script()
diff --git a/examples/pandas_iterable_to_trace_stream.py b/examples/pandas_iterable_to_trace_stream.py
@@ -0,0 +1,22 @@
+import pandas as pd
+import pm4py
+import os
+from pm4py.streaming.conversion import from_pandas
+from pm4py.streaming.stream.live_trace_stream import LiveTraceStream
+from pm4py.streaming.util import trace_stream_printer
+
+
+def execute_script():
+    df = pd.read_csv(os.path.join("..", "tests", "input_data", "receipt.csv"))
+    df = pm4py.format_dataframe(df)
+    it = from_pandas.apply(df)
+    printer = trace_stream_printer.TraceStreamPrinter()
+    trace_stream = LiveTraceStream()
+    trace_stream.register(printer)
+    trace_stream.start()
+    it.to_trace_stream(trace_stream)
+    trace_stream.stop()
+
+
+if __name__ == "__main__":
+    execute_script()
diff --git a/examples/performance_dfg_simulation.py b/examples/performance_dfg_simulation.py
@@ -0,0 +1,17 @@
+import os
+
+import pm4py
+from pm4py.algo.simulation.playout.dfg import algorithm as dfg_simulator
+
+
+def execute_script():
+    log = pm4py.read_xes(os.path.join("..", "tests", "input_data", "receipt.xes"))
+    frequency_dfg, sa, ea = pm4py.discover_dfg(log)
+    performance_dfg, sa, ea = pm4py.discover_performance_dfg(log)
+    simulated_log = dfg_simulator.apply(frequency_dfg, sa, ea, variant=dfg_simulator.Variants.PERFORMANCE,
+                                        parameters={"performance_dfg": performance_dfg})
+    print(simulated_log)
+
+
+if __name__ == "__main__":
+    execute_script()
diff --git a/pm4py/algo/filtering/log/paths/paths_filter.py b/pm4py/algo/filtering/log/paths/paths_filter.py
@@ -21,8 +21,9 @@
 from pm4py.objects.log.obj import EventLog, Trace
 from pm4py.util import exec_utils
 from pm4py.util import xes_constants as xes
-from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY
+from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY, PARAMETER_CONSTANT_TIMESTAMP_KEY
 import deprecation
+import sys
 
 from typing import Optional, Dict, Any, Union, Tuple, List
 from pm4py.objects.log.obj import EventLog, EventStream, Trace
@@ -32,6 +33,9 @@ class Parameters(Enum):
     ATTRIBUTE_KEY = PARAMETER_CONSTANT_ATTRIBUTE_KEY
     DECREASING_FACTOR = "decreasingFactor"
     POSITIVE = "positive"
+    TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
+    MIN_PERFORMANCE = "min_performance"
+    MAX_PERFORMANCE = "max_performance"
 
 
 def apply(log: EventLog, paths: List[Tuple[str, str]], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
@@ -72,6 +76,53 @@ def apply(log: EventLog, paths: List[Tuple[str, str]], parameters: Optional[Dict
     return filtered_log
 
 
+def apply_performance(log: EventLog, provided_path: Tuple[str, str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
+    """
+    Filters the cases of an event log where there is at least one occurrence of the provided path
+    occurring in the defined timedelta range.
+
+    Parameters
+    ----------------
+    log
+        Event log
+    provided_path
+        Path between two activities (expressed as tuple)
+    parameters
+        Parameters of the filter, including:
+            Parameters.ATTRIBUTE_KEY -> Attribute identifying the activity in the log
+            Parameters.TIMESTAMP_KEY -> Attribute identifying the timestamp in the log
+            Parameters.POSITIVE -> If True, keep the cases containing a matching path; if False, keep the other cases
+            Parameters.MIN_PERFORMANCE -> Minimal allowed performance (in seconds) of the provided path (default: 0)
+            Parameters.MAX_PERFORMANCE -> Maximal allowed performance (in seconds) of the provided path (default: sys.maxsize)
+
+    Returns
+    ----------------
+    filtered_log
+        Filtered event log
+    """
+    if parameters is None:
+        parameters = {}
+    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, xes.DEFAULT_NAME_KEY)
+    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
+    min_performance = exec_utils.get_param_value(Parameters.MIN_PERFORMANCE, parameters, 0)
+    max_performance = exec_utils.get_param_value(Parameters.MAX_PERFORMANCE, parameters, sys.maxsize)
+    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
+    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
+                            omni_present=log.omni_present, properties=log.properties)
+    for trace in log:
+        found = False
+        for i in range(len(trace) - 1):
+            path = (trace[i][attribute_key], trace[i + 1][attribute_key])
+            if path == provided_path:
+                timediff = trace[i + 1][timestamp_key].timestamp() - trace[i][timestamp_key].timestamp()
+                if min_performance <= timediff <= max_performance:
+                    found = True
+                    break
+        if (found and positive) or (not found and not positive):
+            filtered_log.append(trace)
+    return filtered_log
+
+
 def get_paths_from_log(log, attribute_key="concept:name"):
     """
     Get the paths of the log along with their count

diff --git a/pm4py/algo/filtering/log/rework/__init__.py b/pm4py/algo/filtering/log/rework/__init__.py
@@ -0,0 +1,17 @@
+'''
+    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
+
+    PM4Py is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    PM4Py is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
+'''
+from pm4py.algo.filtering.log.rework import rework_filter
diff --git a/pm4py/algo/filtering/log/rework/rework_filter.py b/pm4py/algo/filtering/log/rework/rework_filter.py
@@ -0,0 +1,74 @@
+'''
+    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
+
+    PM4Py is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    PM4Py is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
+'''
+from enum import Enum
+from pm4py.util import constants, xes_constants, exec_utils
+from pm4py.objects.log.obj import EventLog
+from collections import Counter
+from typing import Optional, Dict, Any
+
+
+class Parameters(Enum):
+    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
+    MIN_OCCURRENCES = "min_occurrences"
+    POSITIVE = "positive"
+
+
+def apply(log: EventLog, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
+    """
+    Applies the rework filter on the provided event log and activity.
+    This filter keeps the cases of the log having at least Parameters.MIN_OCCURRENCES (default: 2) occurrences
+    of the given activity.
+
+    It is also possible (setting Parameters.POSITIVE to False) to retrieve the cases of the log that do not contain
+    the given activity, or in which the activity occurs fewer than Parameters.MIN_OCCURRENCES times.
+
+    Parameters
+    -------------------
+    log
+        Event log
+    activity
+        Activity of which the rework shall be filtered
+    parameters
+        Parameters of the filter, including:
+        - Parameters.ACTIVITY_KEY => the attribute to use as activity
+        - Parameters.MIN_OCCURRENCES => the minimum number of occurrences for the activity
+        - Parameters.POSITIVE => if True, keeps the cases of the log having at least MIN_OCCURRENCES occurrences;
+            if False, keeps the cases of the log where such behavior does not occur.
+
+    Returns
+    -----------------
+    filtered_log
+        Filtered event log
+    """
+    if parameters is None:
+        parameters = {}
+
+    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
+    min_occurrences = exec_utils.get_param_value(Parameters.MIN_OCCURRENCES, parameters, 2)
+    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
+
+    filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
+                            omni_present=log.omni_present, properties=log.properties)
+
+    for trace in log:
+        act_counter = Counter([x[activity_key] for x in trace])
+        if positive and activity in act_counter and act_counter[activity] >= min_occurrences:
+            filtered_log.append(trace)
+        elif not positive and (activity not in act_counter or act_counter[activity] < min_occurrences):
+            filtered_log.append(trace)
+
+    return filtered_log
diff --git a/pm4py/algo/filtering/pandas/paths/paths_filter.py b/pm4py/algo/filtering/pandas/paths/paths_filter.py
@@ -28,6 +28,7 @@
 import deprecation
 from typing import Optional, Dict, Any, Union, Tuple, List
 import pandas as pd
+import sys
 
 
 class Parameters(Enum):
@@ -36,6 +37,8 @@ class Parameters(Enum):
     TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
     DECREASING_FACTOR = "decreasingFactor"
     POSITIVE = "positive"
+    MIN_PERFORMANCE = "min_performance"
+    MAX_PERFORMANCE = "max_performance"
 
 
 def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> pd.DataFrame:
@@ -61,11 +64,11 @@ def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[D
     """
     if parameters is None:
         parameters = {}
-    paths = [path[0] + "," + path[1] for path in paths]
     case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
     attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
     timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
     positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
+    paths = [path[0] + "," + path[1] for path in paths]
     df = df.sort_values([case_id_glue, timestamp_key])
     filt_df = df[[case_id_glue, attribute_key]]
     filt_dif_shifted = filt_df.shift(-1)
@@ -84,6 +87,62 @@ def apply(df: pd.DataFrame, paths: List[Tuple[str, str]], parameters: Optional[D
     return ret
 
 
+def apply_performance(df: pd.DataFrame, provided_path: Tuple[str, str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> pd.DataFrame:
+    """
+    Filters the cases of a dataframe where there is at least one occurrence of the provided path
+    occurring in the defined timedelta range (NOTE: consecutive rows are paired without a same-case check).
+
+    Parameters
+    ----------
+    df
+        Dataframe
+    provided_path
+        Path between two activities (expressed as tuple)
+    parameters
+        Possible parameters of the algorithm, including:
+            Parameters.CASE_ID_KEY -> Case ID column in the dataframe
+            Parameters.ATTRIBUTE_KEY -> Attribute we want to filter
+            Parameters.TIMESTAMP_KEY -> Attribute identifying the timestamp in the log
+            Parameters.POSITIVE -> Specifies if the filter should be applied including traces (positive=True)
+            or excluding traces (positive=False)
+            Parameters.MIN_PERFORMANCE -> Minimal allowed performance (in seconds) of the provided path (default: 0)
+            Parameters.MAX_PERFORMANCE -> Maximal allowed performance (in seconds) of the provided path (default: sys.maxsize)
+
+    Returns
+    ----------
+    df
+        Filtered dataframe
+    """
+    if parameters is None:
+        parameters = {}
+    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
+    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
+    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
+    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
+    provided_path = provided_path[0] + "," + provided_path[1]
+    min_performance = exec_utils.get_param_value(Parameters.MIN_PERFORMANCE, parameters, 0)
+    max_performance = exec_utils.get_param_value(Parameters.MAX_PERFORMANCE, parameters, sys.maxsize)
+    df = df.sort_values([case_id_glue, timestamp_key])
+    filt_df = df[[case_id_glue, attribute_key, timestamp_key]]
+    filt_dif_shifted = filt_df.shift(-1)
+    filt_dif_shifted.columns = [str(col) + '_2' for col in filt_dif_shifted.columns]
+    stacked_df = pd.concat([filt_df, filt_dif_shifted], axis=1)
+    stacked_df["@@path"] = stacked_df[attribute_key] + "," + stacked_df[attribute_key + "_2"]
+    stacked_df = stacked_df[stacked_df["@@path"] == provided_path]
+    stacked_df["@@timedelta"] = (stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]).astype('timedelta64[s]')
+    stacked_df = stacked_df[stacked_df["@@timedelta"] >= min_performance]
+    stacked_df = stacked_df[stacked_df["@@timedelta"] <= max_performance]
+    i1 = df.set_index(case_id_glue).index
+    i2 = stacked_df.set_index(case_id_glue).index
+    if positive:
+        ret = df[i1.isin(i2)]
+    else:
+        ret = df[~i1.isin(i2)]
+
+    ret.attrs = copy(df.attrs) if hasattr(df, 'attrs') else {}
+    return ret
+
+
 @deprecation.deprecated("2.2.11", "3.0.0", details="Removed")
 def apply_auto_filter(df, parameters=None):
     del df

diff --git a/pm4py/algo/filtering/pandas/rework/__init__.py b/pm4py/algo/filtering/pandas/rework/__init__.py
@@ -0,0 +1,17 @@
+'''
+    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
+
+    PM4Py is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    PM4Py is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
+'''
+from pm4py.algo.filtering.pandas.rework import rework_filter