Skip to content

Commit

Permalink
Merge branch 'PMPY-2016' into 'integration'
Browse files Browse the repository at this point in the history
PMPY-2016 Log to target vectors (for ML purposes)

Closes PMPY-2016

See merge request process-mining/pm4py/pm4py-core!913
  • Loading branch information
fit-alessandro-berti committed Mar 11, 2023
2 parents 5e851ef + b4c82e7 commit 6907160
Show file tree
Hide file tree
Showing 10 changed files with 273 additions and 2 deletions.
17 changes: 17 additions & 0 deletions examples/ml_log_to_target_vectory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pm4py
from pm4py.algo.transformation.log_to_target import algorithm as log_to_target


def execute_script():
    """
    Reads the running-example XES log and prints the target vectors
    computed by the three log_to_target variants (remaining time,
    next time, next activity), followed by the list of activities
    returned by the next-activity variant.
    """
    log = pm4py.read_xes("../tests/input_data/running-example.xes")
    variants = log_to_target.Variants

    # for the time-based targets only the vector is printed;
    # the second returned value is ignored
    for time_variant in (variants.REMAINING_TIME, variants.NEXT_TIME):
        time_target, _ = log_to_target.apply(log, variant=time_variant)
        print(time_target)

    # for the next-activity target, the returned activities are printed as well
    next_activity_target, next_activities = log_to_target.apply(log, variant=variants.NEXT_ACTIVITY)
    print(next_activity_target)
    print(next_activities)


# run the example only when this file is executed as a script
if __name__ == "__main__":
    execute_script()
2 changes: 1 addition & 1 deletion pm4py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
get_case_arrival_average, get_rework_cases_per_activity, get_case_overlap, get_cycle_time, \
get_all_case_durations, get_case_duration, get_activity_position_summary, get_stochastic_language
from pm4py.sim import play_out, generate_process_tree
from pm4py.ml import split_train_test, get_prefixes_from_log, extract_features_dataframe, extract_temporal_features_dataframe, extract_outcome_enriched_dataframe
from pm4py.ml import split_train_test, get_prefixes_from_log, extract_features_dataframe, extract_temporal_features_dataframe, extract_outcome_enriched_dataframe, extract_target_vector
from pm4py.org import discover_handover_of_work_network, discover_activity_based_resource_similarity, discover_subcontracting_network, discover_working_together_network, discover_organizational_roles, discover_network_analysis
from pm4py.hof import filter_log, filter_trace, sort_trace, sort_log
from pm4py.meta import __name__, __version__, __doc__, __author__, __author_email__, \
Expand Down
1 change: 1 addition & 0 deletions pm4py/algo/transformation/log_to_target/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.transformation.log_to_target import algorithm, variants
39 changes: 39 additions & 0 deletions pm4py/algo/transformation/log_to_target/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from enum import Enum
from pm4py.algo.transformation.log_to_target.variants import next_activity, next_time, remaining_time
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from typing import Union, Dict, Optional, Any, Tuple, List
from pm4py.util import exec_utils


class Variants(Enum):
    """
    Available encodings of the target vector.
    The value of each member is the corresponding variant module
    imported from pm4py.algo.transformation.log_to_target.variants.
    """
    NEXT_ACTIVITY = next_activity
    NEXT_TIME = next_time
    REMAINING_TIME = remaining_time


def apply(log: Union[EventLog, EventStream, pd.DataFrame], variant=None, parameters: Optional[Dict[Any, Any]] = None) -> Tuple[Any, List[str]]:
    """
    Computes the target vector of an event log for a specific ML use case,
    delegating the computation to the selected variant

    Parameters
    ---------------
    log
        Event log / Event stream / Pandas dataframe
    variant
        Specification of the target vector (must be provided,
        even though the argument defaults to None):
        - Variants.NEXT_ACTIVITY => encodes the next activity
        - Variants.NEXT_TIME => encodes the next timestamp
        - Variants.REMAINING_TIME => encodes the remaining time
    parameters
        Parameters of the variant, forwarded unchanged to its apply method

    Returns
    --------------
    vector
        Target vector for the specified ML use case
    classes
        Classes (for every column of the target vector)

    Raises
    --------------
    Exception
        If no variant is provided
    """
    if variant is not None:
        # look up the implementation of the requested variant and delegate to it
        implementation = exec_utils.get_variant(variant)
        return implementation.apply(log, parameters=parameters)

    raise Exception("please provide the variant between: Variants.NEXT_ACTIVITY, Variants.NEXT_TIME, Variants.REMAINING_TIME")
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pm4py.algo.transformation.log_to_target.variants import next_activity, next_time, remaining_time
57 changes: 57 additions & 0 deletions pm4py/algo/transformation/log_to_target/variants/next_activity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from typing import Union, Dict, Optional, Any, Tuple, List
from pm4py.objects.conversion.log import converter as log_converter


class Parameters(Enum):
    """
    Keys of the parameters dictionary read by the apply method of this variant.
    """
    # list of activities to consider (one column of the target per activity)
    ACTIVITIES = "activities"
    # attribute of the events that should be used as activity
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY


def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> Tuple[List[List[List[int]]], List[str]]:
    """
    Returns a list of matrixes (one for every case) that one-hot encode
    the activity following each event.

    Every matrix contains as many rows as many events are contained in the case,
    and as many columns as many activities are considered.
    In the row of a given event, the column of the activity of the NEXT event
    of the case is assigned to the value 1; the remaining columns are assigned
    to the value 0. The row contains only zeros for the last event of the case,
    and whenever the next activity is not among the considered activities.

    Parameters
    --------------
    log
        Event log / Event stream / Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITIES => list of activities to consider
          (default: the sorted distinct activities of the log)
        - Parameters.ACTIVITY_KEY => attribute that should be used as activity

    Returns
    -------------
    target
        The aforementioned list of matrixes.
    activities
        The considered list of activities (column order of the matrixes)
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    activities = exec_utils.get_param_value(Parameters.ACTIVITIES, parameters, None)
    if activities is None:
        # scan the whole log only when the caller did not provide the activities
        activities = sorted({event[activity_key] for trace in log for event in trace})

    # column of every activity, computed once instead of a linear search per event;
    # setdefault keeps the first position of a duplicated activity, as list.index does
    column_of = {}
    for position, activity in enumerate(activities):
        column_of.setdefault(activity, position)
    width = len(activities)

    target = []
    for trace in log:
        n_events = len(trace)
        rows = [[0] * width for _ in range(n_events)]
        # the last event has no successor, so its row is left at zero
        for i in range(n_events - 1):
            column = column_of.get(trace[i + 1][activity_key])
            if column is not None:
                rows[i][column] = 1
        target.append(rows)

    return target, activities
50 changes: 50 additions & 0 deletions pm4py/algo/transformation/log_to_target/variants/next_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from typing import Union, Dict, Optional, Any, Tuple, List
from pm4py.objects.conversion.log import converter as log_converter


class Parameters(Enum):
    """
    Keys of the parameters dictionary read by the apply method of this variant.
    """
    # attribute of the events that should be used as timestamp
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY


def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> Tuple[List[List[float]], List[str]]:
    """
    Returns a list of lists (one for every case of the log) containing the difference between the timestamp
    of the next event of the case and the timestamp of the current event.
    The difference is computed between the values returned by the .timestamp() method of the timestamps.
    For the last event of the case, the difference is defaulted to 0.

    Parameters
    ---------------
    log
        Event log / Event stream / Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.TIMESTAMP_KEY => the attribute of the log to be used as timestamp

    Returns
    ---------------
    target
        The aforementioned list
    classes
        Dummy list (of classes), containing only "@@next_time"
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)

    target = []
    for trace in log:
        # convert every timestamp exactly once
        stamps = [trace[i][timestamp_key].timestamp() for i in range(len(trace))]
        # the last event is paired with itself, which yields the default difference of 0
        following = stamps[1:] + stamps[-1:]
        target.append([next_time - curr_time for curr_time, next_time in zip(stamps, following)])

    return target, ["@@next_time"]
49 changes: 49 additions & 0 deletions pm4py/algo/transformation/log_to_target/variants/remaining_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from typing import Union, Dict, Optional, Any, Tuple, List
from pm4py.objects.conversion.log import converter as log_converter


class Parameters(Enum):
    """
    Keys of the parameters dictionary read by the apply method of this variant.
    """
    # attribute of the events that should be used as timestamp
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY


def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> Tuple[List[List[float]], List[str]]:
    """
    Returns a list of lists (one for every case of the log) containing the remaining time
    from an event to the end of the case, i.e. the difference between the timestamp of the
    last event of the case and the timestamp of the event itself.
    The difference is computed between the values returned by the .timestamp() method of the timestamps.

    Parameters
    ---------------
    log
        Event log / Event stream / Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.TIMESTAMP_KEY => the attribute of the log to be used as timestamp

    Returns
    ---------------
    target
        The aforementioned list
    classes
        Dummy list (of classes), containing only "@@remaining_time"
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)

    target = []
    for trace in log:
        # convert every timestamp exactly once; the end of the case is then
        # a plain list lookup instead of a conversion repeated for every event
        stamps = [trace[i][timestamp_key].timestamp() for i in range(len(trace))]
        target.append([stamps[-1] - curr_time for curr_time in stamps])

    return target, ["@@remaining_time"]
41 changes: 40 additions & 1 deletion pm4py/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
The ``pm4py.ml`` module contains the machine learning features offered in ``pm4py``
"""

from typing import Union, Tuple
from typing import Union, Tuple, Any, List
import pandas as pd
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.util import constants
Expand Down Expand Up @@ -206,3 +206,42 @@ def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grou
parameters[temporal.Parameters.RESOURCE_COLUMN] = resource_key

return temporal.apply(log, parameters=parameters)


def extract_target_vector(log: Union[EventLog, pd.DataFrame], variant: str, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name") -> Tuple[Any, List[str]]:
    """
    Computes the target vector of a log object for a specific ML use case
    (next activity, next time, remaining time).

    The function returns the target vector together with the list of its classes.
    An exception is raised if the log object has an unsupported type,
    or if the variant is not one of the three supported names.

    :param log: log object (event log / Pandas dataframe)
    :param variant: name of the target to extract: next_activity, next_time, remaining_time
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: the attribute to be used as case identifier
    :rtype: ``Tuple[Any, List[str]]``

    .. code-block:: python3

        import pm4py

        vector_next_act, class_next_act = pm4py.extract_target_vector(log, 'next_activity', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_next_time, class_next_time = pm4py.extract_target_vector(log, 'next_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_rem_time, class_rem_time = pm4py.extract_target_vector(log, 'remaining_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
    """
    accepted_types = (pd.DataFrame, EventLog, EventStream)
    if type(log) not in accepted_types:
        raise Exception("the method can be applied only to a traditional event log!")
    __event_log_deprecation_warning(log)

    # the attribute keys are handed to the algorithm through the parameters dictionary
    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)

    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target

    # public variant names => variants of the underlying algorithm
    supported = {
        "next_activity": log_to_target.Variants.NEXT_ACTIVITY,
        "next_time": log_to_target.Variants.NEXT_TIME,
        "remaining_time": log_to_target.Variants.REMAINING_TIME,
    }

    selected = supported.get(variant)
    if selected is None:
        raise Exception("please provide the variant between: next_activity, next_time, remaining_time")

    vector, labels = log_to_target.apply(log, variant=selected, parameters=parameters)
    return vector, labels
18 changes: 18 additions & 0 deletions tests/other_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,24 @@ def test_compression_multivariate_df(self):
self.assertTrue(compression_util.discover_dfg(cl))
self.assertTrue(compression_util.get_variants(cl))

def test_log_to_target_rem_time(self):
    """Checks the remaining-time target vector extracted from the running example."""
    import pm4py
    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target
    log = pm4py.read_xes("input_data/running-example.xes")
    rem_time_target, classes = log_to_target.apply(log, variant=log_to_target.Variants.REMAINING_TIME)
    # the variant labels its single target column
    self.assertEqual(classes, ["@@remaining_time"])
    self.assertTrue(rem_time_target)
    for case_target in rem_time_target:
        if case_target:
            # the last event of a case is the end of the case itself
            self.assertEqual(case_target[-1], 0)

def test_log_to_target_next_time(self):
    """Checks the next-time target vector extracted from the running example."""
    import pm4py
    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target
    log = pm4py.read_xes("input_data/running-example.xes")
    next_time_target, classes = log_to_target.apply(log, variant=log_to_target.Variants.NEXT_TIME)
    # the variant labels its single target column
    self.assertEqual(classes, ["@@next_time"])
    self.assertTrue(next_time_target)
    for case_target in next_time_target:
        if case_target:
            # the last event of a case has no successor: the difference defaults to 0
            self.assertEqual(case_target[-1], 0)

def test_log_to_target_next_activity(self):
    """Checks the next-activity target vector extracted from the running example."""
    import pm4py
    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target
    log = pm4py.read_xes("input_data/running-example.xes")
    next_activity_target, next_activities = log_to_target.apply(log, variant=log_to_target.Variants.NEXT_ACTIVITY)
    self.assertTrue(next_activity_target)
    for case_target in next_activity_target:
        for row in case_target:
            # one column per considered activity, at most one of them set to 1
            self.assertEqual(len(row), len(next_activities))
            self.assertLessEqual(sum(row), 1)
        if case_target:
            # the last event of a case has no next activity
            self.assertEqual(sum(case_target[-1]), 0)


# run the test suite only when this file is executed as a script
if __name__ == "__main__":
    unittest.main()

0 comments on commit 6907160

Please sign in to comment.