Skip to content

Commit

Permalink
Merge branch 'ft-1382-generic-network-analysis' into 'integration'
Browse files Browse the repository at this point in the history
FT 1382 Generic Network Analysis (In, Out, Node and Edge column)

See merge request pm4py/pm4py-core!528
  • Loading branch information
fit-sebastiaan-van-zelst committed Nov 16, 2021
2 parents d4747f7 + d0556e4 commit 9fbd1c4
Show file tree
Hide file tree
Showing 11 changed files with 484 additions and 0 deletions.
7 changes: 7 additions & 0 deletions examples/execute_everything.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import traceback


def network_analysis():
    """Run the network analysis example script (examples/network_analysis.py)."""
    # Import inside the function so an import failure in one example module
    # does not prevent the other example wrappers in this file from loading.
    from examples import network_analysis
    print("\n\nnetwork_analysis")
    network_analysis.execute_script()


def read_write_ocel():
from examples import read_write_ocel
print("\n\nread_write_ocel")
Expand Down Expand Up @@ -471,6 +477,7 @@ def execute_script(f):
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))))

if __name__ == "__main__":
execute_script(network_analysis)
execute_script(read_write_ocel)
execute_script(ocdfg_discovery)
execute_script(enrich_log_with_align)
Expand Down
46 changes: 46 additions & 0 deletions examples/network_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import pm4py
from pm4py.algo.organizational_mining.network_analysis import algorithm as network_analysis
from pm4py.visualization.network_analysis import visualizer as network_visualizer


def execute_script():
    """Show the frequency and the performance views of the network analysis.

    Both views link every event to the next event of the same case
    (OUT/IN column = case identifier), classify the nodes by the
    organizational group (org:group) and the edges by the activity of the
    source event (concept:name).
    """
    log_path = os.path.join("..", "tests", "input_data", "receipt.xes")
    log = pm4py.read_xes(log_path)

    # parameters shared by the two views
    linkage_parameters = {"in_column": "case:concept:name",
                          "out_column": "case:concept:name",
                          "node_column": "org:group",
                          "edge_column": "concept:name"}

    # frequency view: edge values are numbers of occurrences
    frequency_edges = network_analysis.apply(
        log, parameters={**linkage_parameters, "include_performance": False})
    gviz_frequency = network_visualizer.apply(
        frequency_edges, variant=network_visualizer.Variants.FREQUENCY,
        parameters={"edge_threshold": 10, "format": "svg"})
    network_visualizer.view(gviz_frequency)

    # performance view: edge values are aggregated throughput times
    performance_edges = network_analysis.apply(
        log, parameters={**linkage_parameters, "include_performance": True})
    gviz_performance = network_visualizer.apply(
        performance_edges, variant=network_visualizer.Variants.PERFORMANCE,
        parameters={"edge_threshold": 10, "format": "svg",
                    "aggregation_measure": "median"})
    network_visualizer.view(gviz_performance)


if __name__ == "__main__":
    execute_script()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.organizational_mining.network_analysis import algorithm, variants
30 changes: 30 additions & 0 deletions pm4py/algo/organizational_mining/network_analysis/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe
from enum import Enum
from pm4py.util import exec_utils
from typing import Dict, Optional, Any, Tuple, Union
import pandas as pd
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.objects.conversion.log import converter as log_converter


class Variants(Enum):
    """Available implementations of the network analysis."""
    # pandas-dataframe-based implementation (the only variant provided)
    DATAFRAME = dataframe


def apply(log: Union[pd.DataFrame, EventLog, EventStream], variant=Variants.DATAFRAME, parameters: Optional[Dict[Any, Any]] = None) -> Dict[Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided event log

    Parameters
    ----------------
    log
        Event log
    variant
        Variant of the algorithm (default: Variants.DATAFRAME)
    parameters
        Version-specific parameters

    Returns
    ----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences)
    """
    # every variant works on a dataframe, so convert the log first
    df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME,
                             parameters=parameters)
    variant_module = exec_utils.get_variant(variant)
    return variant_module.apply(df, parameters=parameters)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from enum import Enum
from pm4py.util import exec_utils
from pm4py.util import xes_constants, constants, pandas_utils
import pandas as pd
from typing import Dict, Optional, Any, Tuple
from pm4py.util.business_hours import soj_time_business_hours_diff


class Parameters(Enum):
    """Parameter keys accepted by :func:`apply`."""
    # column used to sort the events before linking them
    SORTING_COLUMN = "sorting_column"
    # name of the positional-index column inserted during the execution
    INDEX_KEY = "index_key"
    # timestamp column of the dataframe
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    # target column of the link (events whose IN value equals a previous
    # event's OUT value are matched)
    IN_COLUMN = "in_column"
    # source column of the link
    OUT_COLUMN = "out_column"
    # attribute used to classify the nodes of the network
    NODE_COLUMN = "node_column"
    # attribute (of the source event) used to classify the edges
    EDGE_COLUMN = "edge_column"
    # if True, edge values are lists of durations instead of counts
    INCLUDE_PERFORMANCE = "include_performance"
    # if True, durations are computed inside the business hours only
    BUSINESS_HOURS = "business_hours"
    # worktiming of the organization (e.g. [7, 17]) when business hours are on
    WORKTIMING = "worktiming"
    # weekend days of the organization (e.g. [6, 7]) when business hours are on
    WEEKENDS = "weekends"
    # name of the temporary column storing the timestamp difference
    TIMESTAMP_DIFF_COLUMN = "timestamp_diff_column"


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Dict[
    Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided dataframe

    Parameters
    -----------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.SORTING_COLUMN => the column that should be used to sort the log
        - Parameters.INDEX_KEY => the name for the index attribute in the log (inserted during the execution)
        - Parameters.TIMESTAMP_KEY => the timestamp column
        - Parameters.IN_COLUMN => the target column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.OUT_COLUMN => the source column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.NODE_COLUMN => the attribute to be used for the node definition (default: the resource of the log, org:resource)
        - Parameters.EDGE_COLUMN => the attribute (of the source event) to be used for the edge definition (default: the activity of the log, concept:name)
        - Parameters.INCLUDE_PERFORMANCE => considers the performance of the edge
        - Parameters.BUSINESS_HOURS => boolean value that enables the business hours
        - Parameters.WORKTIMING => defines the worktiming of the organization (e.g. [7, 17]) if business hours are enabled
        - Parameters.WEEKENDS => defines the weekends of the organization (e.g. [6, 7]) if business hours are enabled
        - Parameters.TIMESTAMP_DIFF_COLUMN => timestamp diff column

    Returns
    -----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences,
        or list of durations in seconds when Parameters.INCLUDE_PERFORMANCE is set)
    """
    if parameters is None:
        parameters = {}

    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY)
    timestamp_column = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                                  xes_constants.DEFAULT_TIMESTAMP_KEY)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    node_column = exec_utils.get_param_value(Parameters.NODE_COLUMN, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    edge_column = exec_utils.get_param_value(Parameters.EDGE_COLUMN, parameters, xes_constants.DEFAULT_NAME_KEY)
    include_performance = exec_utils.get_param_value(Parameters.INCLUDE_PERFORMANCE, parameters, False)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])
    timestamp_diff_column = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_COLUMN, parameters, "@@timestamp_diff")

    # keep only the needed columns, de-duplicated while preserving order
    # (IN and OUT column frequently coincide with the case identifier).
    # Bug fix: indexing a dataframe with a set (df[{...}]) is rejected by
    # modern pandas ("only list-like indexers are allowed"); use an ordered
    # de-duplicated list instead.
    needed_columns = list(dict.fromkeys(
        [timestamp_column, in_column, out_column, node_column, edge_column, sorting_column]))
    dataframe = dataframe[needed_columns]
    dataframe = dataframe.sort_values(sorting_column)
    # positional index so "later event" can be expressed as a larger index
    dataframe = pandas_utils.insert_index(dataframe, index_key)

    edges = {}

    # source side of a potential link: rows with a defined OUT value,
    # node and edge classifier, and timestamp
    df_out = dataframe[[index_key, out_column, node_column, edge_column, timestamp_column]].dropna(
        subset=[out_column, node_column, edge_column, timestamp_column], how="any")
    df_out.columns = [x + "_out" for x in df_out.columns]
    # target side of a potential link
    df_in = dataframe[[index_key, in_column, node_column, timestamp_column]].dropna(
        subset=[in_column, node_column, timestamp_column], how="any")
    df_in.columns = [x + "_in" for x in df_in.columns]

    # match each source event with the candidate target events having the
    # same linking value, then keep only strictly later targets and, of
    # those, the first (earliest) one per source event
    merged_df = df_out.merge(df_in, left_on=out_column + "_out", right_on=in_column + "_in")
    merged_df = merged_df[merged_df[index_key + "_in"] > merged_df[index_key + "_out"]]
    merged_df = merged_df.groupby([index_key + "_out", out_column + "_out", in_column + "_in"]).first().reset_index()

    if business_hours:
        # duration restricted to the configured business hours (row-wise)
        merged_df[timestamp_diff_column] = merged_df.apply(
            lambda x: soj_time_business_hours_diff(x[timestamp_column + "_out"], x[timestamp_column + "_in"],
                                                   worktiming,
                                                   weekends), axis=1)
    else:
        # plain wall-clock difference in seconds.
        # Bug fix: .astype('timedelta64[s]') on a timedelta Series was
        # deprecated and removed in pandas 2.x; .dt.total_seconds() is the
        # supported equivalent.
        merged_df[timestamp_diff_column] = (
            merged_df[timestamp_column + "_in"] - merged_df[timestamp_column + "_out"]).dt.total_seconds()

    # group the links per (source node, target node, edge classifier) and
    # collect the list of durations of each group
    edges0 = merged_df.groupby([node_column + "_out", node_column + "_in", edge_column + "_out"])[
        timestamp_diff_column].apply(list).to_dict()

    for e0 in edges0:
        edge = (e0[0], e0[1])
        edge_value = e0[2]
        if edge not in edges:
            edges[edge] = {}
        if edge_value not in edges[edge]:
            if include_performance:
                # performance view: keep the raw list of durations
                edges[edge][edge_value] = edges0[e0]
            else:
                # frequency view: keep only the number of occurrences
                edges[edge][edge_value] = len(edges0[e0])

    return edges
1 change: 1 addition & 0 deletions pm4py/visualization/network_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.visualization.network_analysis import visualizer, variants
1 change: 1 addition & 0 deletions pm4py/visualization/network_analysis/variants/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.visualization.network_analysis.variants import frequency, performance
94 changes: 94 additions & 0 deletions pm4py/visualization/network_analysis/variants/frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import sys
import uuid
from enum import Enum
from pm4py.util import exec_utils
import tempfile
from graphviz import Digraph
from pm4py.util import vis_utils
from typing import Dict, Optional, Any, Tuple


class Parameters(Enum):
    """Parameter keys accepted by :func:`apply`."""
    # output format of the visualization (e.g. "png", "svg")
    FORMAT = "format"
    # background color of the graph
    BGCOLOR = "bgcolor"
    # minimum degree for a node to be included
    ACTIVITY_THRESHOLD = "activity_threshold"
    # minimum number of occurrences for an edge to be included
    EDGE_THRESHOLD = "edge_threshold"


def apply(network_analysis_edges: Dict[Tuple[str, str], Dict[str, int]], parameters: Optional[Dict[Any, Any]] = None) -> Digraph:
    """
    Creates a visualization of the network analysis

    Parameters
    -----------------
    network_analysis_edges
        Edges of the network analysis
    parameters
        Parameters of the algorithm, including:
        - Parameters.FORMAT => the format of the visualization
        - Parameters.BGCOLOR => the background color
        - Parameters.ACTIVITY_THRESHOLD => the minimum number of occurrences for an activity to be included (default: 1)
        - Parameters.EDGE_THRESHOLD => the minimum number of occurrences for an edge to be included (default: 1)

    Returns
    ------------------
    digraph
        Graphviz graph
    """
    if parameters is None:
        parameters = {}

    image_format = exec_utils.get_param_value(Parameters.FORMAT, parameters, "png")
    bgcolor = exec_utils.get_param_value(Parameters.BGCOLOR, parameters, "transparent")
    activity_threshold = exec_utils.get_param_value(Parameters.ACTIVITY_THRESHOLD, parameters, 1)
    edge_threshold = exec_utils.get_param_value(Parameters.EDGE_THRESHOLD, parameters, 1)

    filename = tempfile.NamedTemporaryFile(suffix='.gv')
    viz = Digraph("pt", filename=filename.name, engine='dot', graph_attr={'bgcolor': bgcolor})
    viz.attr('node', shape='ellipse', fixedsize='false')

    # in/out degree of every node, counting only edges above the threshold
    nodes = set(x[0] for x in network_analysis_edges).union(set(x[1] for x in network_analysis_edges))
    nodes_in_degree = {x: 0 for x in nodes}
    nodes_out_degree = {x: 0 for x in nodes}
    for edge in network_analysis_edges:
        for edge_value in network_analysis_edges[edge]:
            if network_analysis_edges[edge][edge_value] >= edge_threshold:
                nodes_in_degree[edge[1]] += network_analysis_edges[edge][edge_value]
                nodes_out_degree[edge[0]] += network_analysis_edges[edge][edge_value]
    nodes_max_degree = {x: max(nodes_in_degree[x], nodes_out_degree[x]) for x in nodes}

    # First pass: min/max degree among the nodes passing the activity threshold.
    # Bug fix: the bounds were initialized swapped (max to sys.maxsize, min to
    # -sys.maxsize), so they never updated; additionally, the min check was in
    # an elif branch and was skipped whenever the max was updated.
    min_node_value = sys.maxsize
    max_node_value = -sys.maxsize
    for node in nodes_max_degree:
        if nodes_max_degree[node] >= activity_threshold:
            count = nodes_max_degree[node]
            if count > max_node_value:
                max_node_value = count
            if count < min_node_value:
                min_node_value = count

    # Second pass: create the nodes, colored by their degree.
    # Bug fix: the fill color was computed with the maximum passed as both the
    # lower and the upper bound; use the actual (min, max) range instead.
    nodes_dict = {}
    for node in nodes_max_degree:
        if nodes_max_degree[node] >= activity_threshold:
            nodes_dict[node] = str(uuid.uuid4())
            viz.node(nodes_dict[node],
                     node + "\n(in=" + str(nodes_in_degree[node]) + "; out=" + str(nodes_out_degree[node]) + ")",
                     style="filled",
                     fillcolor=vis_utils.get_trans_freq_color(nodes_max_degree[node], min_node_value,
                                                              max_node_value))

    # min/max edge value between the kept nodes, used to scale the pen width
    # (same elif bug fix as above: check min and max independently)
    min_edge_value = sys.maxsize
    max_edge_value = -sys.maxsize
    for edge in network_analysis_edges:
        if edge[0] in nodes_dict and edge[1] in nodes_dict:
            for edge_value in network_analysis_edges[edge]:
                count = network_analysis_edges[edge][edge_value]
                if count > max_edge_value:
                    max_edge_value = count
                if count < min_edge_value:
                    min_edge_value = count

    for edge in network_analysis_edges:
        if edge[0] in nodes_dict and edge[1] in nodes_dict:
            for edge_value in network_analysis_edges[edge]:
                if network_analysis_edges[edge][edge_value] >= edge_threshold:
                    viz.edge(nodes_dict[edge[0]], nodes_dict[edge[1]],
                             label=edge_value + "\n" + str(network_analysis_edges[edge][edge_value]) + "",
                             penwidth=str(vis_utils.get_arc_penwidth(network_analysis_edges[edge][edge_value],
                                                                     min_edge_value, max_edge_value)))

    viz.format = image_format

    return viz
Loading

0 comments on commit 9fbd1c4

Please sign in to comment.