Skip to content

Commit

Permalink
Merge branch 'ft-1387-link-analysis' into 'integration'
Browse files Browse the repository at this point in the history
FT 1387 Link Analysis

See merge request process-mining/pm4py/pm4py-core!530
  • Loading branch information
fit-sebastiaan-van-zelst committed Dec 14, 2021
2 parents ead288e + 7ea4e3d commit d4011ff
Show file tree
Hide file tree
Showing 9 changed files with 310 additions and 51 deletions.
25 changes: 25 additions & 0 deletions examples/link_analysis_vbfa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd
from pm4py.algo.discovery.ocel.link_analysis import algorithm as link_analysis
import os


def execute_script():
    """Run a link analysis on the SAP VBFA document-flow table and print two findings."""
    vbfa_path = os.path.join("..", "tests", "input_data", "ocel", "VBFA.zip")
    df = pd.read_csv(vbfa_path, compression="zip", dtype="str")

    # build a proper timestamp from the creation date (ERDAT) and creation time (ERZET)
    df["time:timestamp"] = pd.to_datetime(df["ERDAT"] + " " + df["ERZET"], format="%Y%m%d %H%M%S")
    df["RFWRT"] = df["RFWRT"].astype(float)

    # connect documents on VBELN (outgoing) = VBELV (incoming), propagating along the chain
    link_params = {"out_column": "VBELN", "in_column": "VBELV",
                   "sorting_column": "time:timestamp", "propagate": True}
    df = link_analysis.apply(df, parameters=link_params)

    # finds the connected documents in which the currency in one document is different
    # from the currency in the connected document.
    currency_mismatch = df[
        (df["WAERS_out"] != " ") & (df["WAERS_in"] != " ") & (df["WAERS_out"] != df["WAERS_in"])]
    print(currency_mismatch[["WAERS_out", "WAERS_in"]].value_counts())

    # finds the connected documents in which the amount in one document is lower
    # than the amount in the connected document.
    lower_amount = df[(df["RFWRT_out"] > 0) & (df["RFWRT_out"] < df["RFWRT_in"])]
    print(lower_amount[["RFWRT_out", "RFWRT_in"]])


if __name__ == "__main__":
    execute_script()
7 changes: 5 additions & 2 deletions examples/network_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def execute_script():
# EDGE column: the attribute (of the source event) to use to classify the edge. In this case, we use the
# concept:name (activity)

frequency_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column="org:group", edge_column="concept:name", performance=False)
frequency_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column_source="org:group", node_column_target="org:group", edge_column="concept:name", performance=False)
pm4py.view_network_analysis(frequency_edges, variant="frequency", format="svg", edge_threshold=10)

# performance view of the network analysis
Expand All @@ -26,9 +26,12 @@ def execute_script():
# EDGE column: the attribute (of the source event) to use to classify the edge. In this case, we use the
# concept:name (activity)

performance_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column="org:group", edge_column="concept:name", performance=True)
performance_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column_source="org:group", node_column_target="org:group", edge_column="concept:name", performance=True)
pm4py.view_network_analysis(performance_edges, variant="performance", format="svg", edge_threshold=10)

resource_group_edges = pm4py.discover_network_analysis(log, out_column="case:concept:name", in_column="case:concept:name", node_column_source="org:resource", node_column_target="org:group", edge_column="org:resource", performance=False)
pm4py.view_network_analysis(resource_group_edges, variant="frequency", format="svg", edge_threshold=10)


if __name__ == "__main__":
execute_script()
1 change: 1 addition & 0 deletions pm4py/algo/discovery/ocel/link_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.discovery.ocel.link_analysis import variants
36 changes: 36 additions & 0 deletions pm4py/algo/discovery/ocel/link_analysis/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pm4py.algo.discovery.ocel.link_analysis.variants import classic
from enum import Enum
from pm4py.util import exec_utils
import pandas as pd
from typing import Optional, Dict, Any
from pm4py.objects.log.obj import EventLog, EventStream
from typing import Union
from pm4py.objects.conversion.log import converter


class Variants(Enum):
    """Available variants of the link-analysis algorithm."""
    # the default (and currently only) variant, implemented in variants.classic
    CLASSIC = classic


def apply(log: Union[EventLog, EventStream, pd.DataFrame], variant=Variants.CLASSIC, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Applies a link analysis algorithm on the provided log object.

    Parameters
    -----------------
    log
        Event log
    variant
        Variant of the algorithm to consider
    parameters
        Variant-specific parameters

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe
    """
    if parameters is None:
        parameters = {}

    # the variants operate on dataframes, so convert the log object first
    df = converter.apply(log, variant=converter.Variants.TO_DATA_FRAME, parameters=parameters)
    return exec_utils.get_variant(variant).apply(df, parameters=parameters)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.discovery.ocel.link_analysis.variants import classic
139 changes: 139 additions & 0 deletions pm4py/algo/discovery/ocel/link_analysis/variants/classic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants, pandas_utils
from typing import Optional, Dict, Any, Set
import pandas as pd


class Parameters(Enum):
    """Parameter keys accepted by the classic link-analysis variant (see apply)."""
    # column of the dataframe holding the "outgoing" key of an event
    OUT_COLUMN = "out_column"
    # column of the dataframe holding the "incoming" key of an event
    IN_COLUMN = "in_column"
    # column used to sort the dataframe before the progressive index is assigned
    SORTING_COLUMN = "sorting_column"
    # name of the column in which the progressive event index is stored
    INDEX_COLUMN = "index_column"
    # if True, keep only the relations where the target event has a higher index than the source
    LOOK_FORWARD = "look_forward"
    # if True, keep only the first-occurring target event for every source event
    KEEP_FIRST_OCCURRENCE = "keep_first_occurrence"
    # if True, transitively propagate the associations between events
    PROPAGATE = "propagate"


def propagate_associations(associations: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
    """
    Transitively propagate the associations, such that the eventually-follows
    flow between the events of the event log is considered.

    The input dictionary is updated in place and also returned.

    Parameters
    -------------------
    associations
        Associations between events

    Returns
    ------------------
    propagated_associations
        Propagated associations
    """
    # map every target event onto the set of source events pointing to it
    predecessors = {}
    for source, targets in associations.items():
        for target in targets:
            predecessors.setdefault(target, set()).add(source)

    # fixpoint iteration: whenever the association set of a node grows,
    # each of its predecessors may need to absorb the new elements as well
    pending = set(associations)
    while pending:
        current, pending = pending, set()
        for node in current:
            for pred in predecessors.get(node, ()):
                merged = associations[pred] | associations[node]
                if len(merged) > len(associations[pred]):
                    associations[pred] = merged
                    pending.add(pred)
    return associations


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the current dataframe.
    The link analysis permits advanced filtering based on events connected in an
    output-input relation (e.g., the OUT column of the first is equal to the IN column
    of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the directly-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = True), and to the eventually-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
        - Parameters.IN_COLUMN => the input column of the dataframe
        - Parameters.SORTING_COLUMN => the column on top of which the dataframe is sorted
          before the progressive index is assigned
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
        - Parameters.LOOK_FORWARD => filters the relations in which the second event has an index >= than the index
          of the first event.
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event, only the first-occurring relationship
          with a target event (OUT=IN).
        - Parameters.PROPAGATE => propagate the relationships between events, in such a way that the entire document
          flow chain can be reconstructed.

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe, where every row joins a source event (columns suffixed
        with "_out") with an associated target event (columns suffixed with "_in")
    """
    if parameters is None:
        parameters = {}

    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_column = exec_utils.get_param_value(Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY)
    look_forward = exec_utils.get_param_value(Parameters.LOOK_FORWARD, parameters, True)
    keep_first_occurrence = exec_utils.get_param_value(Parameters.KEEP_FIRST_OCCURRENCE, parameters, False)
    propagate = exec_utils.get_param_value(Parameters.PROPAGATE, parameters, False)

    # assign a progressive index to each event, following the sorting column
    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_column)

    # self-join the events on OUT = IN to discover the output-input relations
    df_red1 = dataframe[[out_column, index_column]]
    df_red2 = dataframe[[in_column, index_column]]
    df_red = df_red1.merge(df_red2, left_on=out_column, right_on=in_column, suffixes=("_out", "_in"))

    if look_forward:
        # keep only relations pointing forward (source index strictly lower than target index)
        df_red = df_red[df_red[index_column + "_out"] < df_red[index_column + "_in"]]

    if keep_first_occurrence:
        # for every source event, retain only the first-occurring target event
        df_red = df_red.groupby(index_column + "_out").first().reset_index()

    # collect, for every source event index, the set of associated target event indexes
    stream_red = df_red.to_dict("records")
    associations = {}
    for el in stream_red:
        if not el[index_column + "_out"] in associations:
            associations[el[index_column + "_out"]] = set()
        associations[el[index_column + "_out"]].add(el[index_column + "_in"])

    if propagate:
        # transitively close the associations, so the whole document flow chain is linked
        associations = propagate_associations(associations)

    # flatten the association sets back into two parallel index columns
    out_clmn = []
    in_clmn = []
    for k in associations:
        for v in associations[k]:
            out_clmn.append(k)
            in_clmn.append(v)

    rel = pd.DataFrame({index_column + "_out": out_clmn, index_column + "_in": in_clmn})

    # attach the full event attributes to both sides of each relation:
    # first the source event ("_out" columns), then the target event ("_in" columns)
    df_link = dataframe.copy()
    df_link.columns = [x + "_out" for x in df_link.columns]
    df_link = df_link.merge(rel, left_on=index_column + "_out", right_on=index_column + "_out")
    dataframe.columns = [x + "_in" for x in dataframe.columns]
    df_link = df_link.merge(dataframe, left_on=index_column + "_in", right_on=index_column + "_in")

    return df_link
Loading

0 comments on commit d4011ff

Please sign in to comment.