Skip to content

Commit

Permalink
Merge branch 'ft-1382-generic-network-analysis' into 'integration'
Browse files Browse the repository at this point in the history
FT 1382 Generic Network Analysis (In, Out, Node and Edge column)

See merge request pm4py/pm4py-core!528
  • Loading branch information
fit-sebastiaan-van-zelst committed Nov 16, 2021
2 parents d4747f7 + d0556e4 commit 9fbd1c4
Show file tree
Hide file tree
Showing 11 changed files with 484 additions and 0 deletions.
7 changes: 7 additions & 0 deletions examples/execute_everything.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import traceback


def network_analysis():
    """Run the network analysis example script (examples/network_analysis.py)."""
    # Import inside the function so an import failure in one example module
    # does not prevent the other example wrappers in this file from loading.
    from examples import network_analysis
    print("\n\nnetwork_analysis")
    network_analysis.execute_script()


def read_write_ocel():
from examples import read_write_ocel
print("\n\nread_write_ocel")
Expand Down Expand Up @@ -471,6 +477,7 @@ def execute_script(f):
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))))

if __name__ == "__main__":
execute_script(network_analysis)
execute_script(read_write_ocel)
execute_script(ocdfg_discovery)
execute_script(enrich_log_with_align)
Expand Down
46 changes: 46 additions & 0 deletions examples/network_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import pm4py
from pm4py.algo.organizational_mining.network_analysis import algorithm as network_analysis
from pm4py.visualization.network_analysis import visualizer as network_visualizer


def execute_script():
    """Show the frequency and the performance views of the network analysis.

    Both views link every event to the next event of the same case
    (OUT/IN column = case identifier), classify the nodes by the
    organizational group (org:group) and the edges by the activity of the
    source event (concept:name).
    """
    log_path = os.path.join("..", "tests", "input_data", "receipt.xes")
    log = pm4py.read_xes(log_path)

    # parameters shared by the two views
    linkage_parameters = {"in_column": "case:concept:name",
                          "out_column": "case:concept:name",
                          "node_column": "org:group",
                          "edge_column": "concept:name"}

    # frequency view: edge values are numbers of occurrences
    frequency_edges = network_analysis.apply(
        log, parameters={**linkage_parameters, "include_performance": False})
    gviz_frequency = network_visualizer.apply(
        frequency_edges, variant=network_visualizer.Variants.FREQUENCY,
        parameters={"edge_threshold": 10, "format": "svg"})
    network_visualizer.view(gviz_frequency)

    # performance view: edge values are aggregated throughput times
    performance_edges = network_analysis.apply(
        log, parameters={**linkage_parameters, "include_performance": True})
    gviz_performance = network_visualizer.apply(
        performance_edges, variant=network_visualizer.Variants.PERFORMANCE,
        parameters={"edge_threshold": 10, "format": "svg",
                    "aggregation_measure": "median"})
    network_visualizer.view(gviz_performance)


if __name__ == "__main__":
    execute_script()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.organizational_mining.network_analysis import algorithm, variants
30 changes: 30 additions & 0 deletions pm4py/algo/organizational_mining/network_analysis/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe
from enum import Enum
from pm4py.util import exec_utils
from typing import Dict, Optional, Any, Tuple, Union
import pandas as pd
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.objects.conversion.log import converter as log_converter


class Variants(Enum):
    """Available implementations of the network analysis."""
    # pandas-dataframe-based implementation (the only variant provided)
    DATAFRAME = dataframe


def apply(log: Union[pd.DataFrame, EventLog, EventStream], variant=Variants.DATAFRAME, parameters: Optional[Dict[Any, Any]] = None) -> Dict[Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided event log

    Parameters
    ----------------
    log
        Event log
    variant
        Variant of the algorithm (default: Variants.DATAFRAME)
    parameters
        Version-specific parameters

    Returns
    ----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences)
    """
    # every variant works on a dataframe, so convert the log first
    df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME,
                             parameters=parameters)
    variant_module = exec_utils.get_variant(variant)
    return variant_module.apply(df, parameters=parameters)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from enum import Enum
from pm4py.util import exec_utils
from pm4py.util import xes_constants, constants, pandas_utils
import pandas as pd
from typing import Dict, Optional, Any, Tuple
from pm4py.util.business_hours import soj_time_business_hours_diff


class Parameters(Enum):
    """Parameter keys accepted by :func:`apply`."""
    # column used to sort the events before linking them
    SORTING_COLUMN = "sorting_column"
    # name of the positional-index column inserted during the execution
    INDEX_KEY = "index_key"
    # timestamp column of the dataframe
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    # target column of the link (events whose IN value equals a previous
    # event's OUT value are matched)
    IN_COLUMN = "in_column"
    # source column of the link
    OUT_COLUMN = "out_column"
    # attribute used to classify the nodes of the network
    NODE_COLUMN = "node_column"
    # attribute (of the source event) used to classify the edges
    EDGE_COLUMN = "edge_column"
    # if True, edge values are lists of durations instead of counts
    INCLUDE_PERFORMANCE = "include_performance"
    # if True, durations are computed inside the business hours only
    BUSINESS_HOURS = "business_hours"
    # worktiming of the organization (e.g. [7, 17]) when business hours are on
    WORKTIMING = "worktiming"
    # weekend days of the organization (e.g. [6, 7]) when business hours are on
    WEEKENDS = "weekends"
    # name of the temporary column storing the timestamp difference
    TIMESTAMP_DIFF_COLUMN = "timestamp_diff_column"


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Dict[
    Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided dataframe

    Parameters
    -----------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.SORTING_COLUMN => the column that should be used to sort the log
        - Parameters.INDEX_KEY => the name for the index attribute in the log (inserted during the execution)
        - Parameters.TIMESTAMP_KEY => the timestamp column
        - Parameters.IN_COLUMN => the target column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.OUT_COLUMN => the source column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.NODE_COLUMN => the attribute to be used for the node definition (default: the resource of the log, org:resource)
        - Parameters.EDGE_COLUMN => the attribute (of the source event) to be used for the edge definition (default: the activity of the log, concept:name)
        - Parameters.INCLUDE_PERFORMANCE => considers the performance of the edge
        - Parameters.BUSINESS_HOURS => boolean value that enables the business hours
        - Parameters.WORKTIMING => defines the worktiming of the organization (e.g. [7, 17]) if business hours are enabled
        - Parameters.WEEKENDS => defines the weekends of the organization (e.g. [6, 7]) if business hours are enabled
        - Parameters.TIMESTAMP_DIFF_COLUMN => timestamp diff column

    Returns
    -----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences,
        or list of durations in seconds when Parameters.INCLUDE_PERFORMANCE is set)
    """
    if parameters is None:
        parameters = {}

    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY)
    timestamp_column = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                                  xes_constants.DEFAULT_TIMESTAMP_KEY)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    node_column = exec_utils.get_param_value(Parameters.NODE_COLUMN, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    edge_column = exec_utils.get_param_value(Parameters.EDGE_COLUMN, parameters, xes_constants.DEFAULT_NAME_KEY)
    include_performance = exec_utils.get_param_value(Parameters.INCLUDE_PERFORMANCE, parameters, False)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])
    timestamp_diff_column = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_COLUMN, parameters, "@@timestamp_diff")

    # keep only the needed columns, de-duplicated while preserving order
    # (IN and OUT column frequently coincide with the case identifier).
    # Bug fix: indexing a dataframe with a set (df[{...}]) is rejected by
    # modern pandas ("only list-like indexers are allowed"); use an ordered
    # de-duplicated list instead.
    needed_columns = list(dict.fromkeys(
        [timestamp_column, in_column, out_column, node_column, edge_column, sorting_column]))
    dataframe = dataframe[needed_columns]
    dataframe = dataframe.sort_values(sorting_column)
    # positional index so "later event" can be expressed as a larger index
    dataframe = pandas_utils.insert_index(dataframe, index_key)

    edges = {}

    # source side of a potential link: rows with a defined OUT value,
    # node and edge classifier, and timestamp
    df_out = dataframe[[index_key, out_column, node_column, edge_column, timestamp_column]].dropna(
        subset=[out_column, node_column, edge_column, timestamp_column], how="any")
    df_out.columns = [x + "_out" for x in df_out.columns]
    # target side of a potential link
    df_in = dataframe[[index_key, in_column, node_column, timestamp_column]].dropna(
        subset=[in_column, node_column, timestamp_column], how="any")
    df_in.columns = [x + "_in" for x in df_in.columns]

    # match each source event with the candidate target events having the
    # same linking value, then keep only strictly later targets and, of
    # those, the first (earliest) one per source event
    merged_df = df_out.merge(df_in, left_on=out_column + "_out", right_on=in_column + "_in")
    merged_df = merged_df[merged_df[index_key + "_in"] > merged_df[index_key + "_out"]]
    merged_df = merged_df.groupby([index_key + "_out", out_column + "_out", in_column + "_in"]).first().reset_index()

    if business_hours:
        # duration restricted to the configured business hours (row-wise)
        merged_df[timestamp_diff_column] = merged_df.apply(
            lambda x: soj_time_business_hours_diff(x[timestamp_column + "_out"], x[timestamp_column + "_in"],
                                                   worktiming,
                                                   weekends), axis=1)
    else:
        # plain wall-clock difference in seconds.
        # Bug fix: .astype('timedelta64[s]') on a timedelta Series was
        # deprecated and removed in pandas 2.x; .dt.total_seconds() is the
        # supported equivalent.
        merged_df[timestamp_diff_column] = (
            merged_df[timestamp_column + "_in"] - merged_df[timestamp_column + "_out"]).dt.total_seconds()

    # group the links per (source node, target node, edge classifier) and
    # collect the list of durations of each group
    edges0 = merged_df.groupby([node_column + "_out", node_column + "_in", edge_column + "_out"])[
        timestamp_diff_column].apply(list).to_dict()

    for e0 in edges0:
        edge = (e0[0], e0[1])
        edge_value = e0[2]
        if edge not in edges:
            edges[edge] = {}
        if edge_value not in edges[edge]:
            if include_performance:
                # performance view: keep the raw list of durations
                edges[edge][edge_value] = edges0[e0]
            else:
                # frequency view: keep only the number of occurrences
                edges[edge][edge_value] = len(edges0[e0])

    return edges
1 change: 1 addition & 0 deletions pm4py/visualization/network_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.visualization.network_analysis import visualizer, variants
1 change: 1 addition & 0 deletions pm4py/visualization/network_analysis/variants/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.visualization.network_analysis.variants import frequency, performance
94 changes: 94 additions & 0 deletions pm4py/visualization/network_analysis/variants/frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import sys
import uuid
from enum import Enum
from pm4py.util import exec_utils
import tempfile
from graphviz import Digraph
from pm4py.util import vis_utils
from typing import Dict, Optional, Any, Tuple


class Parameters(Enum):
    """Parameter keys accepted by :func:`apply`."""
    # output format of the visualization (e.g. "png", "svg")
    FORMAT = "format"
    # background color of the graph
    BGCOLOR = "bgcolor"
    # minimum degree for a node to be included
    ACTIVITY_THRESHOLD = "activity_threshold"
    # minimum number of occurrences for an edge to be included
    EDGE_THRESHOLD = "edge_threshold"


def apply(network_analysis_edges: Dict[Tuple[str, str], Dict[str, int]], parameters: Optional[Dict[Any, Any]] = None) -> Digraph:
    """
    Creates a visualization of the network analysis

    Parameters
    -----------------
    network_analysis_edges
        Edges of the network analysis
    parameters
        Parameters of the algorithm, including:
        - Parameters.FORMAT => the format of the visualization
        - Parameters.BGCOLOR => the background color
        - Parameters.ACTIVITY_THRESHOLD => the minimum number of occurrences for an activity to be included (default: 1)
        - Parameters.EDGE_THRESHOLD => the minimum number of occurrences for an edge to be included (default: 1)

    Returns
    ------------------
    digraph
        Graphviz graph
    """
    if parameters is None:
        parameters = {}

    image_format = exec_utils.get_param_value(Parameters.FORMAT, parameters, "png")
    bgcolor = exec_utils.get_param_value(Parameters.BGCOLOR, parameters, "transparent")
    activity_threshold = exec_utils.get_param_value(Parameters.ACTIVITY_THRESHOLD, parameters, 1)
    edge_threshold = exec_utils.get_param_value(Parameters.EDGE_THRESHOLD, parameters, 1)

    filename = tempfile.NamedTemporaryFile(suffix='.gv')
    viz = Digraph("pt", filename=filename.name, engine='dot', graph_attr={'bgcolor': bgcolor})
    viz.attr('node', shape='ellipse', fixedsize='false')

    # in/out degree of every node, counting only edges above the threshold
    nodes = set(x[0] for x in network_analysis_edges).union(set(x[1] for x in network_analysis_edges))
    nodes_in_degree = {x: 0 for x in nodes}
    nodes_out_degree = {x: 0 for x in nodes}
    for edge in network_analysis_edges:
        for edge_value in network_analysis_edges[edge]:
            if network_analysis_edges[edge][edge_value] >= edge_threshold:
                nodes_in_degree[edge[1]] += network_analysis_edges[edge][edge_value]
                nodes_out_degree[edge[0]] += network_analysis_edges[edge][edge_value]
    nodes_max_degree = {x: max(nodes_in_degree[x], nodes_out_degree[x]) for x in nodes}

    # First pass: min/max degree among the nodes passing the activity threshold.
    # Bug fix: the bounds were initialized swapped (max to sys.maxsize, min to
    # -sys.maxsize), so they never updated; additionally, the min check was in
    # an elif branch and was skipped whenever the max was updated.
    min_node_value = sys.maxsize
    max_node_value = -sys.maxsize
    for node in nodes_max_degree:
        if nodes_max_degree[node] >= activity_threshold:
            count = nodes_max_degree[node]
            if count > max_node_value:
                max_node_value = count
            if count < min_node_value:
                min_node_value = count

    # Second pass: create the nodes, colored by their degree.
    # Bug fix: the fill color was computed with the maximum passed as both the
    # lower and the upper bound; use the actual (min, max) range instead.
    nodes_dict = {}
    for node in nodes_max_degree:
        if nodes_max_degree[node] >= activity_threshold:
            nodes_dict[node] = str(uuid.uuid4())
            viz.node(nodes_dict[node],
                     node + "\n(in=" + str(nodes_in_degree[node]) + "; out=" + str(nodes_out_degree[node]) + ")",
                     style="filled",
                     fillcolor=vis_utils.get_trans_freq_color(nodes_max_degree[node], min_node_value,
                                                              max_node_value))

    # min/max edge value between the kept nodes, used to scale the pen width
    # (same elif bug fix as above: check min and max independently)
    min_edge_value = sys.maxsize
    max_edge_value = -sys.maxsize
    for edge in network_analysis_edges:
        if edge[0] in nodes_dict and edge[1] in nodes_dict:
            for edge_value in network_analysis_edges[edge]:
                count = network_analysis_edges[edge][edge_value]
                if count > max_edge_value:
                    max_edge_value = count
                if count < min_edge_value:
                    min_edge_value = count

    for edge in network_analysis_edges:
        if edge[0] in nodes_dict and edge[1] in nodes_dict:
            for edge_value in network_analysis_edges[edge]:
                if network_analysis_edges[edge][edge_value] >= edge_threshold:
                    viz.edge(nodes_dict[edge[0]], nodes_dict[edge[1]],
                             label=edge_value + "\n" + str(network_analysis_edges[edge][edge_value]) + "",
                             penwidth=str(vis_utils.get_arc_penwidth(network_analysis_edges[edge][edge_value],
                                                                     min_edge_value, max_edge_value)))

    viz.format = image_format

    return viz
Loading

0 comments on commit 9fbd1c4

Please sign in to comment.