Commit
Merge branch 'ft-1387-link-analysis' into 'integration'
FT 1387 Link Analysis

See merge request process-mining/pm4py/pm4py-core!530
Showing 9 changed files with 310 additions and 51 deletions.
@@ -0,0 +1,25 @@
import pandas as pd
from pm4py.algo.discovery.ocel.link_analysis import algorithm as link_analysis
import os


def execute_script():
    dataframe = pd.read_csv(os.path.join("..", "tests", "input_data", "ocel", "VBFA.zip"), compression="zip", dtype="str")
    dataframe["time:timestamp"] = dataframe["ERDAT"] + " " + dataframe["ERZET"]
    dataframe["time:timestamp"] = pd.to_datetime(dataframe["time:timestamp"], format="%Y%m%d %H%M%S")
    dataframe["RFWRT"] = dataframe["RFWRT"].astype(float)
    dataframe = link_analysis.apply(dataframe, parameters={"out_column": "VBELN", "in_column": "VBELV",
                                                           "sorting_column": "time:timestamp", "propagate": True})

    # finds the connected documents in which the currency in one document is different from the currency in the connected document.
    df_currency = dataframe[(dataframe["WAERS_out"] != " ") & (dataframe["WAERS_in"] != " ") & (
        dataframe["WAERS_out"] != dataframe["WAERS_in"])]
    print(df_currency[["WAERS_out", "WAERS_in"]].value_counts())

    # finds the connected documents in which the amount in one document is lower than the amount in the connected document.
    df_amount = dataframe[(dataframe["RFWRT_out"] > 0) & (dataframe["RFWRT_out"] < dataframe["RFWRT_in"])]
    print(df_amount[["RFWRT_out", "RFWRT_in"]])


if __name__ == "__main__":
    execute_script()
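
A minimal, hedged sketch (not part of this commit) of the shape of the link analysis output on a synthetic table; the VBELV/VBELN column names simply mirror the SAP document-flow fields used in the script above:

import pandas as pd
from pm4py.algo.discovery.ocel.link_analysis import algorithm as link_analysis

# two hypothetical document-flow rows: document B is created from A, and C from B
df = pd.DataFrame({
    "VBELV": ["A", "B"],
    "VBELN": ["B", "C"],
    "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02"])})

linked = link_analysis.apply(df, parameters={
    "out_column": "VBELN", "in_column": "VBELV",
    "sorting_column": "time:timestamp", "propagate": True})

# every original column (plus an internally added index column) now appears twice,
# suffixed "_out" for the source row and "_in" for the connected target row
print(list(linked.columns))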
@@ -0,0 +1 @@
from pm4py.algo.discovery.ocel.link_analysis import variants
@@ -0,0 +1,36 @@
from pm4py.algo.discovery.ocel.link_analysis.variants import classic
from enum import Enum
from pm4py.util import exec_utils
import pandas as pd
from typing import Optional, Dict, Any
from pm4py.objects.log.obj import EventLog, EventStream
from typing import Union
from pm4py.objects.conversion.log import converter


class Variants(Enum):
    CLASSIC = classic


def apply(log: Union[EventLog, EventStream, pd.DataFrame], variant=Variants.CLASSIC, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Applies a link analysis algorithm on the provided log object.

    Parameters
    -----------------
    log
        Event log
    variant
        Variant of the algorithm to consider
    parameters
        Variant-specific parameters

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe
    """
    if parameters is None:
        parameters = {}

    return exec_utils.get_variant(variant).apply(converter.apply(log, variant=converter.Variants.TO_DATA_FRAME, parameters=parameters), parameters=parameters)
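
A minimal usage sketch (not part of this commit): since the entry point converts EventLog/EventStream objects to a dataframe before dispatching to the chosen variant, it can be called directly on an imported log; with the default parameters the case identifier column acts as both the OUT and the IN column. The input file name below is hypothetical:

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.ocel.link_analysis import algorithm as link_analysis

log = xes_importer.apply("receipt.xes")  # hypothetical example log
# with the default parameters, events of the same case are linked in an
# eventually-follows fashion (out_column = in_column = case identifier)
df_links = link_analysis.apply(log, variant=link_analysis.Variants.CLASSIC)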
@@ -0,0 +1 @@
from pm4py.algo.discovery.ocel.link_analysis.variants import classic
pm4py/algo/discovery/ocel/link_analysis/variants/classic.py (139 additions & 0 deletions)
@@ -0,0 +1,139 @@
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants, pandas_utils
from typing import Optional, Dict, Any, Set
import pandas as pd


class Parameters(Enum):
    OUT_COLUMN = "out_column"
    IN_COLUMN = "in_column"
    SORTING_COLUMN = "sorting_column"
    INDEX_COLUMN = "index_column"
    LOOK_FORWARD = "look_forward"
    KEEP_FIRST_OCCURRENCE = "keep_first_occurrence"
    PROPAGATE = "propagate"


def propagate_associations(associations: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
    """
    Propagates the associations, such that the eventually-follows
    flow between the events of the event log is considered.

    Parameters
    -------------------
    associations
        Associations between events

    Returns
    ------------------
    propagated_associations
        Propagated associations
    """
    reverse_dict = {}
    for x in associations:
        for k in associations[x]:
            if k not in reverse_dict:
                reverse_dict[k] = set()
            reverse_dict[k].add(x)
    change_dict = {x: True for x in associations}
    to_change = [x for x in change_dict if change_dict[x]]
    while to_change:
        for x in to_change:
            change_dict[x] = False
        for x in to_change:
            if x in reverse_dict:
                rv = reverse_dict[x]
                for k in rv:
                    new_set = associations[k].union(associations[x])
                    if len(new_set) > len(associations[k]):
                        change_dict[k] = True
                        associations[k] = new_set
        to_change = [x for x in change_dict if change_dict[x]]
    return associations
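
# (Illustrative sketch, not part of the original file.) For a chain of direct
# associations 0 -> 1 -> 2, propagation adds the indirect link 0 -> 2:
#   propagate_associations({0: {1}, 1: {2}})  ==  {0: {1, 2}, 1: {2}}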


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the given dataframe.
    The link analysis permits advanced filtering based on events connected in an
    output-input relation (e.g., the OUT column of the first is equal to the IN column
    of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the directly-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = True), and to the eventually-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
        - Parameters.IN_COLUMN => the input column of the dataframe
        - Parameters.SORTING_COLUMN => the column on top of which the dataframe should be sorted
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
        - Parameters.LOOK_FORWARD => keeps only the relations in which the second (IN) event has an index
          greater than the index of the first (OUT) event.
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event, only the first-occurring relationship
          with a target event (OUT=IN).
        - Parameters.PROPAGATE => propagate the relationships between events, in such a way that the entire document
          flow chain can be reconstructed.

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe
    """
    if parameters is None:
        parameters = {}

    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_column = exec_utils.get_param_value(Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY)
    look_forward = exec_utils.get_param_value(Parameters.LOOK_FORWARD, parameters, True)
    keep_first_occurrence = exec_utils.get_param_value(Parameters.KEEP_FIRST_OCCURRENCE, parameters, False)
    propagate = exec_utils.get_param_value(Parameters.PROPAGATE, parameters, False)

    # sort the events and assign a progressive index to each row
    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_column)

    # build the OUT=IN relation between pairs of rows
    df_red1 = dataframe[[out_column, index_column]]
    df_red2 = dataframe[[in_column, index_column]]
    df_red = df_red1.merge(df_red2, left_on=out_column, right_on=in_column, suffixes=("_out", "_in"))

    if look_forward:
        # keep only relations pointing to an event with a greater index
        df_red = df_red[df_red[index_column + "_out"] < df_red[index_column + "_in"]]

    if keep_first_occurrence:
        # keep, for every source event, only the first-occurring target event
        df_red = df_red.groupby(index_column + "_out").first().reset_index()

    # collect, for every source index, the set of associated target indexes
    stream_red = df_red.to_dict("records")
    associations = {}
    for el in stream_red:
        if not el[index_column + "_out"] in associations:
            associations[el[index_column + "_out"]] = set()
        associations[el[index_column + "_out"]].add(el[index_column + "_in"])

    if propagate:
        associations = propagate_associations(associations)

    out_clmn = []
    in_clmn = []
    for k in associations:
        for v in associations[k]:
            out_clmn.append(k)
            in_clmn.append(v)

    rel = pd.DataFrame({index_column + "_out": out_clmn, index_column + "_in": in_clmn})

    # join the original rows (suffixed _out and _in) on the index pairs
    df_link = dataframe.copy()
    df_link.columns = [x + "_out" for x in df_link.columns]
    df_link = df_link.merge(rel, left_on=index_column + "_out", right_on=index_column + "_out")
    dataframe.columns = [x + "_in" for x in dataframe.columns]
    df_link = df_link.merge(dataframe, left_on=index_column + "_in", right_on=index_column + "_in")

    return df_link
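
As a hedged illustration of the directly-follows / eventually-follows equivalence described in the docstring of classic.apply (this snippet is not part of the commit), the classic variant can be called with the case identifier as both the OUT and the IN column; df is assumed to be an event dataframe with the standard case:concept:name and time:timestamp columns:

from pm4py.algo.discovery.ocel.link_analysis.variants import classic

# directly-follows-style pairs: each event is linked only to the first later
# event of the same case
dfg_pairs = classic.apply(df, parameters={
    classic.Parameters.OUT_COLUMN: "case:concept:name",
    classic.Parameters.IN_COLUMN: "case:concept:name",
    classic.Parameters.KEEP_FIRST_OCCURRENCE: True})

# eventually-follows-style pairs: each event is linked to every later event
# of the same case
efg_pairs = classic.apply(df, parameters={
    classic.Parameters.OUT_COLUMN: "case:concept:name",
    classic.Parameters.IN_COLUMN: "case:concept:name",
    classic.Parameters.KEEP_FIRST_OCCURRENCE: False})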