Merge branch 'ft-1382-generic-network-analysis' into 'integration'
FT 1382 Generic Network Analysis (In, Out, Node and Edge column). See merge request pm4py/pm4py-core!528
Showing 11 changed files with 484 additions and 0 deletions.
@@ -0,0 +1,46 @@
import os
import pm4py
from pm4py.algo.organizational_mining.network_analysis import algorithm as network_analysis
from pm4py.visualization.network_analysis import visualizer as network_visualizer


def execute_script():
    log = pm4py.read_xes(os.path.join("..", "tests", "input_data", "receipt.xes"))

    # frequency view of the network analysis

    # OUT column: case identifier
    # IN column: case identifier (the next event having the same case identifier is matched)
    # NODE column: the attribute to use to classify the node. In this case, we use the org:group (organizational group)
    # EDGE column: the attribute (of the source event) to use to classify the edge. In this case, we use the
    # concept:name (activity)

    frequency_edges = network_analysis.apply(log, parameters={"in_column": "case:concept:name",
                                                              "out_column": "case:concept:name",
                                                              "node_column": "org:group",
                                                              "edge_column": "concept:name",
                                                              "include_performance": False})
    gviz_frequency = network_visualizer.apply(frequency_edges, variant=network_visualizer.Variants.FREQUENCY,
                                              parameters={"edge_threshold": 10, "format": "svg"})
    network_visualizer.view(gviz_frequency)

    # performance view of the network analysis

    # OUT column: case identifier
    # IN column: case identifier (the next event having the same case identifier is matched)
    # NODE column: the attribute to use to classify the node. In this case, we use the org:group (organizational group)
    # EDGE column: the attribute (of the source event) to use to classify the edge. In this case, we use the
    # concept:name (activity)

    performance_edges = network_analysis.apply(log, parameters={"in_column": "case:concept:name",
                                                                "out_column": "case:concept:name",
                                                                "node_column": "org:group",
                                                                "edge_column": "concept:name",
                                                                "include_performance": True})
    gviz_performance = network_visualizer.apply(performance_edges, variant=network_visualizer.Variants.PERFORMANCE,
                                                parameters={"edge_threshold": 10, "format": "svg",
                                                            "aggregation_measure": "median"})
    network_visualizer.view(gviz_performance)


if __name__ == "__main__":
    execute_script()
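For reference, the dictionaries returned by network_analysis.apply can also be inspected directly, without going through the visualizer. The following sketch is not part of the commit; it assumes frequency_edges and performance_edges have been computed as in execute_script above and relies only on the return structure documented in algorithm.apply (first key: the (source node, target node) edge; second key: the edge value; value: a count in the frequency case, or a list of durations in seconds in the performance case).

# not part of the commit: direct inspection of the network analysis output
for (source_group, target_group), edge_values in frequency_edges.items():
    for activity, count in edge_values.items():
        print(source_group, "->", target_group, "via", activity, ":", count, "occurrences")

for (source_group, target_group), edge_values in performance_edges.items():
    for activity, durations in edge_values.items():
        # durations is a list of seconds between the linked events
        print(source_group, "->", target_group, "via", activity, ":", sum(durations) / len(durations), "average seconds")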
@@ -0,0 +1 @@
from pm4py.algo.organizational_mining.network_analysis import algorithm, variants
30 changes: 30 additions & 0 deletions
pm4py/algo/organizational_mining/network_analysis/algorithm.py
@@ -0,0 +1,30 @@
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe
from enum import Enum
from pm4py.util import exec_utils
from typing import Dict, Optional, Any, Tuple, Union
import pandas as pd
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.objects.conversion.log import converter as log_converter


class Variants(Enum):
    DATAFRAME = dataframe


def apply(log: Union[pd.DataFrame, EventLog, EventStream], variant=Variants.DATAFRAME, parameters: Optional[Dict[Any, Any]] = None) -> Dict[Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided event log

    Parameters
    ----------------
    log
        Event log
    parameters
        Version-specific parameters

    Returns
    ----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences)
    """
    return exec_utils.get_variant(variant).apply(log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=parameters), parameters=parameters)
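Since apply also accepts a pandas DataFrame (the converter simply yields it as a dataframe), the new entry point can be exercised on a small hand-built table. The sketch below is not part of the commit; the column names follow the defaults used by the dataframe variant (case:concept:name, concept:name, org:resource, time:timestamp), and the expected output follows the documented return structure.

# not part of the commit: a minimal, self-contained call of the new entry point
import pandas as pd
from pm4py.algo.organizational_mining.network_analysis import algorithm as network_analysis

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["register", "approve", "register", "approve"],
    "org:resource": ["Sara", "Mike", "Sara", "Pam"],
    "time:timestamp": pd.to_datetime(["2021-01-01 08:00", "2021-01-01 09:00",
                                      "2021-01-02 08:00", "2021-01-02 10:00"]),
})

# defaults: events of the same case are linked, nodes are resources, edges are activities
edges = network_analysis.apply(df)
print(edges)  # expected: {("Sara", "Mike"): {"register": 1}, ("Sara", "Pam"): {"register": 1}}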
1 change: 1 addition & 0 deletions
pm4py/algo/organizational_mining/network_analysis/variants/__init__.py
@@ -0,0 +1 @@
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe
113 changes: 113 additions & 0 deletions
pm4py/algo/organizational_mining/network_analysis/variants/dataframe.py
@@ -0,0 +1,113 @@
from enum import Enum
from pm4py.util import exec_utils
from pm4py.util import xes_constants, constants, pandas_utils
import pandas as pd
from typing import Dict, Optional, Any, Tuple
from pm4py.util.business_hours import soj_time_business_hours_diff


class Parameters(Enum):
    SORTING_COLUMN = "sorting_column"
    INDEX_KEY = "index_key"
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    IN_COLUMN = "in_column"
    OUT_COLUMN = "out_column"
    NODE_COLUMN = "node_column"
    EDGE_COLUMN = "edge_column"
    INCLUDE_PERFORMANCE = "include_performance"
    BUSINESS_HOURS = "business_hours"
    WORKTIMING = "worktiming"
    WEEKENDS = "weekends"
    TIMESTAMP_DIFF_COLUMN = "timestamp_diff_column"


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Dict[
    Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided dataframe

    Parameters
    -----------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.SORTING_COLUMN => the column that should be used to sort the log
        - Parameters.INDEX_KEY => the name for the index attribute in the log (inserted during the execution)
        - Parameters.TIMESTAMP_KEY => the timestamp column
        - Parameters.IN_COLUMN => the target column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.OUT_COLUMN => the source column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.NODE_COLUMN => the attribute to be used for the node definition (default: the resource of the log, org:resource)
        - Parameters.EDGE_COLUMN => the attribute (of the source event) to be used for the edge definition (default: the activity of the log, concept:name)
        - Parameters.INCLUDE_PERFORMANCE => considers the performance of the edge
        - Parameters.BUSINESS_HOURS => boolean value that enables the business hours
        - Parameters.WORKTIMING => defines the worktiming of the organization (e.g. [7, 17]) if business hours are enabled
        - Parameters.WEEKENDS => defines the weekends of the organization (e.g. [6, 7]) if business hours are enabled
        - Parameters.TIMESTAMP_DIFF_COLUMN => the name of the column in which the timestamp difference is stored

    Returns
    -----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences)
    """
    if parameters is None:
        parameters = {}

    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY)
    timestamp_column = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                                  xes_constants.DEFAULT_TIMESTAMP_KEY)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    node_column = exec_utils.get_param_value(Parameters.NODE_COLUMN, parameters, xes_constants.DEFAULT_RESOURCE_KEY)
    edge_column = exec_utils.get_param_value(Parameters.EDGE_COLUMN, parameters, xes_constants.DEFAULT_NAME_KEY)
    include_performance = exec_utils.get_param_value(Parameters.INCLUDE_PERFORMANCE, parameters, False)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])
    timestamp_diff_column = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_COLUMN, parameters, "@@timestamp_diff")

    dataframe = dataframe[{timestamp_column, in_column, out_column, node_column, edge_column, sorting_column}]
    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_key)

    edges = {}

    df_out = dataframe[[index_key, out_column, node_column, edge_column, timestamp_column]].dropna(
        subset=[out_column, node_column, edge_column, timestamp_column], how="any")
    df_out.columns = [x + "_out" for x in df_out.columns]
    df_in = dataframe[[index_key, in_column, node_column, timestamp_column]].dropna(
        subset=[in_column, node_column, timestamp_column], how="any")
    df_in.columns = [x + "_in" for x in df_in.columns]

    merged_df = df_out.merge(df_in, left_on=out_column + "_out", right_on=in_column + "_in")
    merged_df = merged_df[merged_df[index_key + "_in"] > merged_df[index_key + "_out"]]
    merged_df = merged_df.groupby([index_key + "_out", out_column + "_out", in_column + "_in"]).first().reset_index()

    if business_hours:
        merged_df[timestamp_diff_column] = merged_df.apply(
            lambda x: soj_time_business_hours_diff(x[timestamp_column + "_out"], x[timestamp_column + "_in"],
                                                   worktiming, weekends), axis=1)
    else:
        merged_df[timestamp_diff_column] = (
            merged_df[timestamp_column + "_in"] - merged_df[timestamp_column + "_out"]).astype('timedelta64[s]')

    edges0 = merged_df.groupby([node_column + "_out", node_column + "_in", edge_column + "_out"])[
        timestamp_diff_column].apply(list).to_dict()

    for e0 in edges0:
        edge = (e0[0], e0[1])
        edge_value = e0[2]
        if edge not in edges:
            edges[edge] = {}
        if edge_value not in edges[edge]:
            if include_performance:
                edges[edge][edge_value] = edges0[e0]
            else:
                edges[edge][edge_value] = len(edges0[e0])

    return edges
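What makes the analysis generic is that the IN and OUT columns do not need to be the case identifier: any pair of columns can link a source event to a later target event. The sketch below is not part of the commit and uses invented column names (doc_out, doc_in) purely for illustration: an event exposing the identifier of a document it sends is linked to the next event exposing the same identifier as received.

# not part of the commit: hypothetical linkage through a document identifier
import pandas as pd
from pm4py.algo.organizational_mining.network_analysis.variants import dataframe as df_variant

events = pd.DataFrame({
    "doc_out": ["D1", None, "D2", None],   # identifier of the document sent by the event (invented column)
    "doc_in": [None, "D1", None, "D2"],    # identifier of the document received by the event (invented column)
    "org:group": ["Sales", "Backoffice", "Sales", "Legal"],
    "concept:name": ["send offer", "check offer", "send contract", "check contract"],
    "time:timestamp": pd.to_datetime(["2021-03-01 09:00", "2021-03-01 11:00",
                                      "2021-03-02 09:00", "2021-03-03 15:00"]),
})

edges = df_variant.apply(events, parameters={"out_column": "doc_out", "in_column": "doc_in",
                                             "node_column": "org:group", "edge_column": "concept:name",
                                             "include_performance": True})
# expected: {("Sales", "Backoffice"): {"send offer": [7200.0]},
#            ("Sales", "Legal"): {"send contract": [108000.0]}}
print(edges)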
@@ -0,0 +1 @@
from pm4py.visualization.network_analysis import visualizer, variants
@@ -0,0 +1 @@
from pm4py.visualization.network_analysis.variants import frequency, performance
94 changes: 94 additions & 0 deletions
pm4py/visualization/network_analysis/variants/frequency.py
@@ -0,0 +1,94 @@
import sys
import uuid
from enum import Enum
from pm4py.util import exec_utils
import tempfile
from graphviz import Digraph
from pm4py.util import vis_utils
from typing import Dict, Optional, Any, Tuple


class Parameters(Enum):
    FORMAT = "format"
    BGCOLOR = "bgcolor"
    ACTIVITY_THRESHOLD = "activity_threshold"
    EDGE_THRESHOLD = "edge_threshold"


def apply(network_analysis_edges: Dict[Tuple[str, str], Dict[str, int]], parameters: Optional[Dict[Any, Any]] = None) -> Digraph:
    """
    Creates a visualization of the network analysis

    Parameters
    -----------------
    network_analysis_edges
        Edges of the network analysis
    parameters
        Parameters of the algorithm, including:
        - Parameters.FORMAT => the format of the visualization
        - Parameters.BGCOLOR => the background color
        - Parameters.ACTIVITY_THRESHOLD => the minimum number of occurrences for an activity to be included (default: 1)
        - Parameters.EDGE_THRESHOLD => the minimum number of occurrences for an edge to be included (default: 1)

    Returns
    ------------------
    digraph
        Graphviz graph
    """
    if parameters is None:
        parameters = {}

    image_format = exec_utils.get_param_value(Parameters.FORMAT, parameters, "png")
    bgcolor = exec_utils.get_param_value(Parameters.BGCOLOR, parameters, "transparent")
    activity_threshold = exec_utils.get_param_value(Parameters.ACTIVITY_THRESHOLD, parameters, 1)
    edge_threshold = exec_utils.get_param_value(Parameters.EDGE_THRESHOLD, parameters, 1)

    filename = tempfile.NamedTemporaryFile(suffix='.gv')
    viz = Digraph("pt", filename=filename.name, engine='dot', graph_attr={'bgcolor': bgcolor})
    viz.attr('node', shape='ellipse', fixedsize='false')

    nodes = set(x[0] for x in network_analysis_edges).union(set(x[1] for x in network_analysis_edges))
    nodes_in_degree = {x: 0 for x in nodes}
    nodes_out_degree = {x: 0 for x in nodes}
    for edge in network_analysis_edges:
        for edge_value in network_analysis_edges[edge]:
            if network_analysis_edges[edge][edge_value] >= edge_threshold:
                nodes_in_degree[edge[1]] += network_analysis_edges[edge][edge_value]
                nodes_out_degree[edge[0]] += network_analysis_edges[edge][edge_value]
    nodes_max_degree = {x: max(nodes_in_degree[x], nodes_out_degree[x]) for x in nodes}

    # compute the node frequency extremes first, so that the node coloring below
    # uses the final minimum/maximum values
    max_node_value = -sys.maxsize
    min_node_value = sys.maxsize
    for node in nodes_max_degree:
        if nodes_max_degree[node] >= activity_threshold:
            count = nodes_max_degree[node]
            if count > max_node_value:
                max_node_value = count
            if count < min_node_value:
                min_node_value = count

    nodes_dict = {}
    for node in nodes_max_degree:
        if nodes_max_degree[node] >= activity_threshold:
            nodes_dict[node] = str(uuid.uuid4())
            viz.node(nodes_dict[node], node + "\n(in=" + str(nodes_in_degree[node]) + "; out=" + str(nodes_out_degree[node]) + ")",
                     style="filled",
                     fillcolor=vis_utils.get_trans_freq_color(nodes_max_degree[node], min_node_value, max_node_value))

    min_edge_value = sys.maxsize
    max_edge_value = -sys.maxsize

    for edge in network_analysis_edges:
        if edge[0] in nodes_dict and edge[1] in nodes_dict:
            for edge_value in network_analysis_edges[edge]:
                count = network_analysis_edges[edge][edge_value]
                if count > max_edge_value:
                    max_edge_value = count
                elif count < min_edge_value:
                    min_edge_value = count

    for edge in network_analysis_edges:
        if edge[0] in nodes_dict and edge[1] in nodes_dict:
            for edge_value in network_analysis_edges[edge]:
                if network_analysis_edges[edge][edge_value] >= edge_threshold:
                    viz.edge(nodes_dict[edge[0]], nodes_dict[edge[1]],
                             label=edge_value + "\n" + str(network_analysis_edges[edge][edge_value]) + "",
                             penwidth=str(vis_utils.get_arc_penwidth(network_analysis_edges[edge][edge_value], min_edge_value, max_edge_value)))

    viz.format = image_format

    return viz
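The frequency variant can also be exercised on its own with a hand-built edges dictionary; since it returns a plain graphviz.Digraph, graphviz's own render() can write the result to disk instead of going through the visualizer facade (this requires the Graphviz binaries to be installed). The sketch below is not part of the commit and the node names and edge counts are invented.

# not part of the commit: standalone use of the frequency visualizer variant
from pm4py.visualization.network_analysis.variants import frequency

edges = {
    ("Sales", "Backoffice"): {"send offer": 25, "send reminder": 3},
    ("Backoffice", "Legal"): {"escalate": 4},
}

gviz = frequency.apply(edges, parameters={"edge_threshold": 2, "format": "svg"})
gviz.render("network_analysis_frequency", cleanup=True)  # writes network_analysis_frequency.svg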