Skip to content

Commit

Permalink
Merge branch 'ft-1387-link-analysis' into 'integration'
Browse files Browse the repository at this point in the history
FT 1387 Link Analysis

See merge request process-mining/pm4py/pm4py-core!530
  • Loading branch information
fit-sebastiaan-van-zelst committed Dec 14, 2021
2 parents ead288e + 7ea4e3d commit d4011ff
Show file tree
Hide file tree
Showing 9 changed files with 310 additions and 51 deletions.
25 changes: 25 additions & 0 deletions examples/link_analysis_vbfa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd
from pm4py.algo.discovery.ocel.link_analysis import algorithm as link_analysis
import os


def execute_script():
    """Run a link analysis on the SAP VBFA document-flow table and print two findings."""
    vbfa_path = os.path.join("..", "tests", "input_data", "ocel", "VBFA.zip")
    df = pd.read_csv(vbfa_path, compression="zip", dtype="str")

    # build a proper timestamp from the creation date (ERDAT) and creation time (ERZET)
    df["time:timestamp"] = pd.to_datetime(df["ERDAT"] + " " + df["ERZET"], format="%Y%m%d %H%M%S")
    df["RFWRT"] = df["RFWRT"].astype(float)

    # connect documents on VBELN (outgoing) = VBELV (incoming), propagating along the chain
    link_params = {"out_column": "VBELN", "in_column": "VBELV",
                   "sorting_column": "time:timestamp", "propagate": True}
    df = link_analysis.apply(df, parameters=link_params)

    # finds the connected documents in which the currency in one document is different
    # from the currency in the connected document.
    currency_mismatch = df[
        (df["WAERS_out"] != " ") & (df["WAERS_in"] != " ") & (df["WAERS_out"] != df["WAERS_in"])]
    print(currency_mismatch[["WAERS_out", "WAERS_in"]].value_counts())

    # finds the connected documents in which the amount in one document is lower
    # than the amount in the connected document.
    lower_amount = df[(df["RFWRT_out"] > 0) & (df["RFWRT_out"] < df["RFWRT_in"])]
    print(lower_amount[["RFWRT_out", "RFWRT_in"]])


if __name__ == "__main__":
    execute_script()
7 changes: 5 additions & 2 deletions examples/network_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def execute_script():
# EDGE column: the attribute (of the source event) to use to classify the edge. In this case, we use the
# concept:name (activity)

frequency_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column="org:group", edge_column="concept:name", performance=False)
frequency_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column_source="org:group", node_column_target="org:group", edge_column="concept:name", performance=False)
pm4py.view_network_analysis(frequency_edges, variant="frequency", format="svg", edge_threshold=10)

# performance view of the network analysis
Expand All @@ -26,9 +26,12 @@ def execute_script():
# EDGE column: the attribute (of the source event) to use to classify the edge. In this case, we use the
# concept:name (activity)

performance_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column="org:group", edge_column="concept:name", performance=True)
performance_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column_source="org:group", node_column_target="org:group", edge_column="concept:name", performance=True)
pm4py.view_network_analysis(performance_edges, variant="performance", format="svg", edge_threshold=10)

resource_group_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column_source="org:resource", node_column_target="org:group", edge_column="org:resource", performance=False)
pm4py.view_network_analysis(resource_group_edges, variant="frequency", format="svg", edge_threshold=10)


if __name__ == "__main__":
execute_script()
1 change: 1 addition & 0 deletions pm4py/algo/discovery/ocel/link_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.discovery.ocel.link_analysis import variants
36 changes: 36 additions & 0 deletions pm4py/algo/discovery/ocel/link_analysis/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pm4py.algo.discovery.ocel.link_analysis.variants import classic
from enum import Enum
from pm4py.util import exec_utils
import pandas as pd
from typing import Optional, Dict, Any
from pm4py.objects.log.obj import EventLog, EventStream
from typing import Union
from pm4py.objects.conversion.log import converter


class Variants(Enum):
    """Available variants of the link-analysis algorithm."""
    # the default (and currently only) variant, implemented in variants.classic
    CLASSIC = classic


def apply(log: Union[EventLog, EventStream, pd.DataFrame], variant=Variants.CLASSIC, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Applies a link analysis algorithm on the provided log object.

    Parameters
    -----------------
    log
        Event log
    variant
        Variant of the algorithm to consider
    parameters
        Variant-specific parameters

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe
    """
    if parameters is None:
        parameters = {}

    # the variants operate on dataframes, so convert the log object first
    df = converter.apply(log, variant=converter.Variants.TO_DATA_FRAME, parameters=parameters)
    return exec_utils.get_variant(variant).apply(df, parameters=parameters)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.discovery.ocel.link_analysis.variants import classic
139 changes: 139 additions & 0 deletions pm4py/algo/discovery/ocel/link_analysis/variants/classic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants, pandas_utils
from typing import Optional, Dict, Any, Set
import pandas as pd


class Parameters(Enum):
    """Parameter keys accepted by the classic link-analysis variant (see apply)."""
    # column of the dataframe holding the "outgoing" key of an event
    OUT_COLUMN = "out_column"
    # column of the dataframe holding the "incoming" key of an event
    IN_COLUMN = "in_column"
    # column used to sort the dataframe before the progressive index is assigned
    SORTING_COLUMN = "sorting_column"
    # name of the column in which the progressive event index is stored
    INDEX_COLUMN = "index_column"
    # if True, keep only the relations where the target event has a higher index than the source
    LOOK_FORWARD = "look_forward"
    # if True, keep only the first-occurring target event for every source event
    KEEP_FIRST_OCCURRENCE = "keep_first_occurrence"
    # if True, transitively propagate the associations between events
    PROPAGATE = "propagate"


def propagate_associations(associations: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
    """
    Transitively propagate the associations, such that the eventually-follows
    flow between the events of the event log is considered.

    The input dictionary is updated in place and also returned.

    Parameters
    -------------------
    associations
        Associations between events

    Returns
    ------------------
    propagated_associations
        Propagated associations
    """
    # map every target event onto the set of source events pointing to it
    predecessors = {}
    for source, targets in associations.items():
        for target in targets:
            predecessors.setdefault(target, set()).add(source)

    # fixpoint iteration: whenever the association set of a node grows,
    # each of its predecessors may need to absorb the new elements as well
    pending = set(associations)
    while pending:
        current, pending = pending, set()
        for node in current:
            for pred in predecessors.get(node, ()):
                merged = associations[pred] | associations[node]
                if len(merged) > len(associations[pred]):
                    associations[pred] = merged
                    pending.add(pred)
    return associations


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the current dataframe.
    The link analysis permits advanced filtering based on events connected in an
    output-input relation (e.g., the OUT column of the first is equal to the IN column
    of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the directly-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = True), and to the eventually-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
        - Parameters.IN_COLUMN => the input column of the dataframe
        - Parameters.SORTING_COLUMN => the column on top of which the dataframe is sorted
          before the progressive index is assigned
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
        - Parameters.LOOK_FORWARD => filters the relations in which the second event has an index >= than the index
          of the first event.
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event, only the first-occurring relationship
          with a target event (OUT=IN).
        - Parameters.PROPAGATE => propagate the relationships between events, in such a way that the entire document
          flow chain can be reconstructed.

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe, where every row joins a source event (columns suffixed
        with "_out") with an associated target event (columns suffixed with "_in")
    """
    if parameters is None:
        parameters = {}

    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_column = exec_utils.get_param_value(Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY)
    look_forward = exec_utils.get_param_value(Parameters.LOOK_FORWARD, parameters, True)
    keep_first_occurrence = exec_utils.get_param_value(Parameters.KEEP_FIRST_OCCURRENCE, parameters, False)
    propagate = exec_utils.get_param_value(Parameters.PROPAGATE, parameters, False)

    # assign a progressive index to each event, following the sorting column
    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_column)

    # self-join the events on OUT = IN to discover the output-input relations
    df_red1 = dataframe[[out_column, index_column]]
    df_red2 = dataframe[[in_column, index_column]]
    df_red = df_red1.merge(df_red2, left_on=out_column, right_on=in_column, suffixes=("_out", "_in"))

    if look_forward:
        # keep only relations pointing forward (source index strictly lower than target index)
        df_red = df_red[df_red[index_column + "_out"] < df_red[index_column + "_in"]]

    if keep_first_occurrence:
        # for every source event, retain only the first-occurring target event
        df_red = df_red.groupby(index_column + "_out").first().reset_index()

    # collect, for every source event index, the set of associated target event indexes
    stream_red = df_red.to_dict("records")
    associations = {}
    for el in stream_red:
        if not el[index_column + "_out"] in associations:
            associations[el[index_column + "_out"]] = set()
        associations[el[index_column + "_out"]].add(el[index_column + "_in"])

    if propagate:
        # transitively close the associations, so the whole document flow chain is linked
        associations = propagate_associations(associations)

    # flatten the association sets back into two parallel index columns
    out_clmn = []
    in_clmn = []
    for k in associations:
        for v in associations[k]:
            out_clmn.append(k)
            in_clmn.append(v)

    rel = pd.DataFrame({index_column + "_out": out_clmn, index_column + "_in": in_clmn})

    # attach the full event attributes to both sides of each relation:
    # first the source event ("_out" columns), then the target event ("_in" columns)
    df_link = dataframe.copy()
    df_link.columns = [x + "_out" for x in df_link.columns]
    df_link = df_link.merge(rel, left_on=index_column + "_out", right_on=index_column + "_out")
    dataframe.columns = [x + "_in" for x in dataframe.columns]
    df_link = df_link.merge(dataframe, left_on=index_column + "_in", right_on=index_column + "_in")

    return df_link
Loading

0 comments on commit d4011ff

Please sign in to comment.