Skip to content

Commit

Permalink
Merge branch '206-priority-2-refactoring-scikit-learn-usage-throughout-the-project-enabling-future-integration-of' into 'gpu_integration'
Browse files Browse the repository at this point in the history

[Priority 2] Refactoring Scikit-Learn usage throughout the project (enabling future integration of NVIDIA CUML when stable)

See merge request process-mining/pm4py/pm4py-core!1204
  • Loading branch information
daniel.schuster committed Jan 29, 2024
2 parents bd1ac4f + 75e97cc commit 1c93795
Show file tree
Hide file tree
Showing 14 changed files with 82 additions and 37 deletions.
7 changes: 3 additions & 4 deletions examples/decisiontree_trivial_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os

from sklearn import tree

from pm4py.util import ml_utils
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.util import get_class_representation
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
Expand All @@ -18,7 +17,7 @@ def execute_script():
# gets classes representation by final concept:name value (end activity)
target, classes = get_class_representation.get_class_representation_by_str_ev_attr_value_value(log, "concept:name")
# mine the decision tree given 'data' and 'target'
clf = tree.DecisionTreeClassifier(max_depth=7)
clf = ml_utils.DecisionTreeClassifier(max_depth=7)
clf.fit(data, target)
# visualize the decision tree
gviz = dt_vis.apply(clf, feature_names, classes, parameters={dt_vis.Variants.CLASSIC.value.Parameters.FORMAT: examples_conf.TARGET_IMG_FORMAT})
Expand All @@ -27,7 +26,7 @@ def execute_script():
# gets classes representation by trace duration (threshold between the two classes = 200D)
target, classes = get_class_representation.get_class_representation_by_trace_duration(log, 2 * 8640000)
# mine the decision tree given 'data' and 'target'
clf = tree.DecisionTreeClassifier(max_depth=7)
clf = ml_utils.DecisionTreeClassifier(max_depth=7)
clf.fit(data, target)
# visualize the decision tree
gviz = dt_vis.apply(clf, feature_names, classes, parameters={dt_vis.Variants.CLASSIC.value.Parameters.FORMAT: examples_conf.TARGET_IMG_FORMAT})
Expand Down
9 changes: 7 additions & 2 deletions examples/kneighb_regression.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pm4py
from pm4py.algo.transformation.log_to_features import algorithm
import random
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
from pm4py.util import ml_utils


def execute_script():
Expand All @@ -13,18 +14,22 @@ def execute_script():
parameters={"str_tr_attr": ["channel", "group", "responsible", "department"],
"str_ev_attr": [], "num_tr_attr": [], "num_ev_attr": [],
"str_evsucc_attr": []})
data = [np.array(x) for x in data]

throughput_time = [y[-1]["time:timestamp"].timestamp() - y[0]["time:timestamp"].timestamp() for y in log]

# split the cases in training and test

available_cases = [i for i in range(len(log))]
training_cases = set(random.sample(available_cases, 500))
data_training = [data[i] for i in range(len(log)) if i in training_cases]
data_training = np.array(data_training)

throughput_time_training = [throughput_time[i] for i in range(len(log)) if i in training_cases]

# train the regressor

regressor = KNeighborsRegressor(n_neighbors=3)
regressor = ml_utils.KNeighborsRegressor(n_neighbors=3)
regressor.fit(data_training, throughput_time_training)

data_validation = [data[i] for i in range(len(log)) if i not in training_cases]
Expand Down
4 changes: 2 additions & 2 deletions examples/trace_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ def execute_script():
dataframe = pm4py.read_xes(os.path.join("..", "tests", "input_data", "receipt.xes"), return_legacy_log_object=True)

# define a K-Means with 3 clusters
from sklearn.cluster import KMeans
clusterer = KMeans(n_clusters=3, random_state=0, n_init="auto")
from pm4py.util import ml_utils
clusterer = ml_utils.KMeans(n_clusters=3, random_state=0, n_init="auto")

for clust_log in pm4py.cluster_log(dataframe, sklearn_clusterer=clusterer):
print(clust_log)
Expand Down
6 changes: 4 additions & 2 deletions pm4py/algo/clustering/profiles/variants/sklearn_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pm4py.util import exec_utils
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
import numpy as np
from typing import Optional, Dict, Any, Generator, Union
from copy import copy

Expand Down Expand Up @@ -37,8 +38,8 @@ def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[
if parameters is None:
parameters = {}

from sklearn.cluster import KMeans
clusterer = exec_utils.get_param_value(Parameters.SKLEARN_CLUSTERER, parameters, KMeans(n_clusters=2, random_state=0, n_init="auto"))
from pm4py.util import ml_utils
clusterer = exec_utils.get_param_value(Parameters.SKLEARN_CLUSTERER, parameters, ml_utils.KMeans(n_clusters=2, random_state=0, n_init="auto"))

if "enable_activity_def_representation" not in parameters:
parameters["enable_activity_def_representation"] = True
Expand All @@ -51,6 +52,7 @@ def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[

log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=conv_parameters)
data, feature_names = features_extractor.apply(log, parameters=parameters)
data = np.array([np.array(x) for x in data])

clusters = clusterer.fit_predict(data)
max_clu = max(clusters)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def diagnose_from_trans_fitness(log, trans_fitness, parameters=None):
- feature names
- classes
"""
from sklearn import tree
from pm4py.util import ml_utils

if parameters is None:
parameters = {}
Expand Down Expand Up @@ -171,8 +171,10 @@ def diagnose_from_trans_fitness(log, trans_fitness, parameters=None):
target.append(1)
classes.append("underfed")

data = np.array([np.array(x) for x in data])

target = np.asarray(target)
clf = tree.DecisionTreeClassifier(max_depth=7)
clf = ml_utils.DecisionTreeClassifier(max_depth=7)
clf.fit(data, target)
diagn_dict = {"clf": clf, "data": data, "feature_names": feature_names, "target": target,
"classes": classes}
Expand Down Expand Up @@ -207,7 +209,7 @@ def diagnose_from_notexisting_activities(log, notexisting_activities_in_model, p
- feature names
- classes
"""
from sklearn import tree
from pm4py.util import ml_utils

if parameters is None:
parameters = {}
Expand Down Expand Up @@ -259,8 +261,10 @@ def diagnose_from_notexisting_activities(log, notexisting_activities_in_model, p
target.append(1)
classes.append("containing")

data = np.array([np.array(x) for x in data])

target = np.asarray(target)
clf = tree.DecisionTreeClassifier(max_depth=7)
clf = ml_utils.DecisionTreeClassifier(max_depth=7)
clf.fit(data, target)
diagn_dict = {"clf": clf, "data": data, "feature_names": feature_names, "target": target,
"classes": classes}
Expand Down
6 changes: 4 additions & 2 deletions pm4py/algo/decision_mining/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from copy import deepcopy, copy
from enum import Enum
from typing import Optional, Dict, Any, Union, Tuple
import numpy as np

from pm4py.algo.conformance.alignments.petri_net import algorithm as ali
from pm4py.algo.conformance.alignments.petri_net.variants import state_equation_a_star as star
Expand Down Expand Up @@ -107,14 +108,15 @@ def get_decision_tree(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial
classes
The classes
"""
from sklearn import tree
from pm4py.util import ml_utils

if parameters is None:
parameters = {}

X, y, targets = apply(log, net, initial_marking, final_marking, decision_point=decision_point,
attributes=attributes, parameters=parameters)
dt = tree.DecisionTreeClassifier()

dt = ml_utils.DecisionTreeClassifier()
dt = dt.fit(X, y)
return dt, list(X.columns.values.tolist()), targets

Expand Down
4 changes: 2 additions & 2 deletions pm4py/algo/organizational_mining/sna/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def cluster_affinity_propagation(sna: SNA, parameters=None) -> Dict[str, List[st
Dictionary that contains, for each cluster that has been identified,
the list of resources of the cluster
"""
from sklearn.cluster import AffinityPropagation
from pm4py.util import ml_utils

if parameters is None:
parameters = {}
Expand All @@ -70,7 +70,7 @@ def cluster_affinity_propagation(sna: SNA, parameters=None) -> Dict[str, List[st
for c, w in sna.connections.items():
matrix[originators.index(c[0]), originators.index(c[1])] = w

affinity_propagation = AffinityPropagation(**parameters)
affinity_propagation = ml_utils.AffinityPropagation(**parameters)
affinity_propagation.fit(matrix)

clusters = affinity_propagation.predict(matrix)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
from typing import Optional, Dict, Any, Tuple, List

import numpy as np
import pandas as pd
from sklearn.manifold import LocallyLinearEmbedding
from pm4py.util import ml_utils

from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.obj import EventLog
Expand Down Expand Up @@ -81,8 +80,9 @@ def apply(log: EventLog, parameters: Optional[Dict[str, Any]] = None) -> Tuple[L

x = [trace[0][timestamp_key] for trace in log]
data, feature_names = log_to_features.apply(log, parameters={"str_ev_attr": [activity_key], "str_evsucc_attr": [activity_key]})
data = np.array([np.array(x) for x in data])

tsne = LocallyLinearEmbedding(n_components=1, eigen_solver='dense')
tsne = ml_utils.LocallyLinearEmbedding(n_components=1, eigen_solver='dense')
data = tsne.fit_transform(data)
data = np.ndarray.flatten(data)

Expand Down
31 changes: 31 additions & 0 deletions pm4py/util/ml_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import importlib.util


def DecisionTreeClassifier(*args, **kwargs):
    """
    Builds a scikit-learn decision tree classifier

    The scikit-learn import happens inside the function body, so the
    dependency is resolved only when this factory is actually called.

    Parameters
    ---------------
    args
        Positional arguments, forwarded unchanged to the constructor
    kwargs
        Keyword arguments, forwarded unchanged to the constructor

    Returns
    ---------------
    clf
        A new sklearn.tree.DecisionTreeClassifier object
    """
    import sklearn.tree as sk_tree

    clf = sk_tree.DecisionTreeClassifier(*args, **kwargs)
    return clf


def AffinityPropagation(*args, **kwargs):
    """
    Builds a scikit-learn affinity propagation clusterer

    The scikit-learn import happens inside the function body, so the
    dependency is resolved only when this factory is actually called.

    Parameters
    ---------------
    args
        Positional arguments, forwarded unchanged to the constructor
    kwargs
        Keyword arguments, forwarded unchanged to the constructor

    Returns
    ---------------
    clusterer
        A new sklearn.cluster.AffinityPropagation object
    """
    import sklearn.cluster as sk_cluster

    clusterer = sk_cluster.AffinityPropagation(*args, **kwargs)
    return clusterer


def KMeans(*args, **kwargs):
    """
    Builds a scikit-learn K-Means clusterer

    The scikit-learn import happens inside the function body, so the
    dependency is resolved only when this factory is actually called.

    Parameters
    ---------------
    args
        Positional arguments, forwarded unchanged to the constructor
    kwargs
        Keyword arguments, forwarded unchanged to the constructor

    Returns
    ---------------
    clusterer
        A new sklearn.cluster.KMeans object
    """
    import sklearn.cluster as sk_cluster

    clusterer = sk_cluster.KMeans(*args, **kwargs)
    return clusterer


def KNeighborsRegressor(*args, **kwargs):
    """
    Builds a scikit-learn k-nearest-neighbors regressor

    The scikit-learn import happens inside the function body, so the
    dependency is resolved only when this factory is actually called.

    Parameters
    ---------------
    args
        Positional arguments, forwarded unchanged to the constructor
    kwargs
        Keyword arguments, forwarded unchanged to the constructor

    Returns
    ---------------
    regressor
        A new sklearn.neighbors.KNeighborsRegressor object
    """
    import sklearn.neighbors as sk_neighbors

    regressor = sk_neighbors.KNeighborsRegressor(*args, **kwargs)
    return regressor


def LocallyLinearEmbedding(*args, **kwargs):
    """
    Builds a scikit-learn locally linear embedding transformer

    The scikit-learn import happens inside the function body, so the
    dependency is resolved only when this factory is actually called.

    Parameters
    ---------------
    args
        Positional arguments, forwarded unchanged to the constructor
    kwargs
        Keyword arguments, forwarded unchanged to the constructor

    Returns
    ---------------
    embedding
        A new sklearn.manifold.LocallyLinearEmbedding object
    """
    import sklearn.manifold as sk_manifold

    embedding = sk_manifold.LocallyLinearEmbedding(*args, **kwargs)
    return embedding
7 changes: 4 additions & 3 deletions pm4py/visualization/decisiontree/util/dt_to_string.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from typing import Dict, Tuple, Set, List


def apply(clf: tree.DecisionTreeClassifier, columns: List[str]) -> Tuple[Dict[str, str], Dict[str, Set[str]]]:
def apply(clf: DecisionTreeClassifier, columns: List[str]) -> Tuple[Dict[str, str], Dict[str, Set[str]]]:
"""
Translates a decision tree object into a dictionary
associating a set of conditions for each target class
Expand All @@ -19,7 +20,7 @@ def apply(clf: tree.DecisionTreeClassifier, columns: List[str]) -> Tuple[Dict[st
dict_classes
Dictionary associating a set of conditions for each target class
"""
tree_string = tree.export_text(clf).split("\n")
tree_string = export_text(clf).split("\n")
levels = {}
target_classes = {}
variables = {}
Expand Down
7 changes: 4 additions & 3 deletions pm4py/visualization/decisiontree/variants/classic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import tempfile
from pm4py.util import exec_utils
from enum import Enum
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from typing import Optional, Dict, Any, Union, List
import graphviz

Expand All @@ -10,7 +11,7 @@ class Parameters(Enum):
FORMAT = "format"


def apply(clf: tree.DecisionTreeClassifier, feature_names: List[str], classes: List[str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> graphviz.Source:
def apply(clf: DecisionTreeClassifier, feature_names: List[str], classes: List[str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> graphviz.Source:
"""
Apply the visualization of the decision tree
Expand Down Expand Up @@ -38,7 +39,7 @@ def apply(clf: tree.DecisionTreeClassifier, feature_names: List[str], classes: L
filename = tempfile.NamedTemporaryFile(suffix='.gv')
filename.close()

dot_data = tree.export_graphviz(clf, out_file=None,
dot_data = export_graphviz(clf, out_file=None,
feature_names=feature_names,
class_names=classes,
filled=True, rounded=True,
Expand Down
4 changes: 2 additions & 2 deletions pm4py/visualization/decisiontree/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from enum import Enum
from pm4py.util import exec_utils
from pm4py.visualization.common.gview import serialize, serialize_dot
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from typing import Optional, Dict, Any, List
import graphviz

Expand All @@ -16,7 +16,7 @@ class Variants(Enum):
DEFAULT_VARIANT = Variants.CLASSIC


def apply(clf: tree.DecisionTreeClassifier, feature_names: List[str], classes: List[str], parameters: Optional[Dict[Any, Any]] = None, variant=DEFAULT_VARIANT) -> graphviz.Source:
def apply(clf: DecisionTreeClassifier, feature_names: List[str], classes: List[str], parameters: Optional[Dict[Any, Any]] = None, variant=DEFAULT_VARIANT) -> graphviz.Source:
"""
Method to apply the visualization of the decision tree
Expand Down
8 changes: 4 additions & 4 deletions tests/dec_tree_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class DecisionTreeTest(unittest.TestCase):
def test_decisiontree_evattrvalue(self):
from sklearn import tree
from pm4py.util import ml_utils
from pm4py.visualization.decisiontree import visualizer as dt_vis

# to avoid static method warnings in tests,
Expand All @@ -21,14 +21,14 @@ def test_decisiontree_evattrvalue(self):
"num_tr_attr": [], "num_ev_attr": ["amount"]})
target, classes = get_class_representation.get_class_representation_by_str_ev_attr_value_value(log,
"concept:name")
clf = tree.DecisionTreeClassifier(max_depth=7)
clf = ml_utils.DecisionTreeClassifier(max_depth=7)
clf.fit(data, target)
gviz = dt_vis.apply(clf, feature_names, classes,
parameters={dt_vis.Variants.CLASSIC.value.Parameters.FORMAT: "svg"})
del gviz

def test_decisiontree_traceduration(self):
from sklearn import tree
from pm4py.util import ml_utils
from pm4py.visualization.decisiontree import visualizer as dt_vis

# to avoid static method warnings in tests,
Expand All @@ -40,7 +40,7 @@ def test_decisiontree_traceduration(self):
parameters={"str_tr_attr": [], "str_ev_attr": ["concept:name"],
"num_tr_attr": [], "num_ev_attr": ["amount"]})
target, classes = get_class_representation.get_class_representation_by_trace_duration(log, 2 * 8640000)
clf = tree.DecisionTreeClassifier(max_depth=7)
clf = ml_utils.DecisionTreeClassifier(max_depth=7)
clf.fit(data, target)
gviz = dt_vis.apply(clf, feature_names, classes,
parameters={dt_vis.Variants.CLASSIC.value.Parameters.FORMAT: "svg"})
Expand Down
8 changes: 4 additions & 4 deletions tests/doc_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,8 +617,8 @@ def test_60(self):
target, classes = get_class_representation.get_class_representation_by_str_ev_attr_value_value(log,
"concept:name")

from sklearn import tree
clf = tree.DecisionTreeClassifier()
from pm4py.util import ml_utils
clf = ml_utils.DecisionTreeClassifier()
clf.fit(data, target)

from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
Expand All @@ -640,8 +640,8 @@ def test_61(self):
from pm4py.objects.log.util import get_class_representation
target, classes = get_class_representation.get_class_representation_by_trace_duration(log, 2 * 8640000)

from sklearn import tree
clf = tree.DecisionTreeClassifier()
from pm4py.util import ml_utils
clf = ml_utils.DecisionTreeClassifier()
clf.fit(data, target)

from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
Expand Down

0 comments on commit 1c93795

Please sign in to comment.