feat(pm4py): preparing for 2.2.25
fit-alessandro-berti committed Jul 29, 2022
2 parents 8a4a835 + 51e84a9 commit 2d45d5b
Showing 11 changed files with 130 additions and 70 deletions.
31 changes: 31 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,36 @@
# Changelog of pm4py

## pm4py 2.2.25 (2022.07.29)

### Added

### Changed
* ce94110076e3269c96a6eee61d7618f08f44472a
    * optimization in the calculation of the eventually-follows graph on Pandas dataframes.
* 3cca8f97bbd09f4ae5644dcc156489d4b2037028
    * optimization in the calculation of the performance directly-follows graph on Pandas dataframes.
* 4d8721787a50da397b265678be614c94894ea851
    * column reduction in the DFG calculation on top of Pandas dataframes.

### Deprecated

### Fixed
* d754ccdac680f610b2b628dc9830d92da6954dc1
  cb76238c29b986026f07261c11a1c09a667c9ab9
  54970a58927ad0e17b173bff17705a10f5344d92
  ef575a8bf0519655bcf8a57b981c7fa3c018db7a
    * small fixes in the OCEL utilities.
* d0094fa4ccc815b57ccc519d15ccbda6399c2ef7
    * bug fix in the eventually_follows filter of the LTL checker when timestamp_diff_boundaries is provided.
* eb8617de0cfcfebf7374b4545660158e4b4291b6
    * bug fix in the eventually_follows filter of the LTL checker on EventLog objects.

### Removed

### Other

-----

## pm4py 2.2.24 (2022.07.12)

### Added
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -26,7 +26,7 @@
# The short X.Y version
version = '2.2'
# The full version, including alpha/beta/rc tags
release = '2.2.24'
release = '2.2.25'

# -- General configuration ---------------------------------------------------

65 changes: 41 additions & 24 deletions pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py
@@ -16,13 +16,15 @@
'''
from pm4py.util import xes_constants, pandas_utils, constants
from pm4py.util.business_hours import soj_time_business_hours_diff
import numpy as np


def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None):
business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None,
reduce_columns=True):
"""
Get DFG graph from Pandas dataframe
@@ -65,9 +67,19 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_

# if not specified otherwise, set the start timestamp key to the timestamp key
# to avoid backward-compatibility problems
st_eq_ct = start_timestamp_key == timestamp_key
if start_timestamp_key is None:
start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
df[start_timestamp_key] = df[timestamp_key]
st_eq_ct = True

# to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly complete timestamp)
# columns
if reduce_columns:
if measure == "frequency" and not sort_timestamp_along_case_id:
df = df[list({case_id_glue, activity_key, target_activity_key})]
else:
df = df[list({case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key})]

# to get rows belonging to the same case ID together, we need to sort on the case ID
if sort_caseid_required:
@@ -76,18 +88,12 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
else:
df = df.sort_values(case_id_glue)

# to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly complete timestamp)
# columns
if measure == "frequency":
df_reduced = df[{case_id_glue, activity_key, target_activity_key}]
else:
df_reduced = df[{case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key}]
# shift the dataframe by the window size, in order to couple successive rows
df_reduced_shifted = df_reduced.shift(-window)
df_shifted = df.shift(-window)
# change column names to shifted dataframe
df_reduced_shifted.columns = [str(col) + '_2' for col in df_reduced_shifted.columns]
df_shifted.columns = [str(col) + '_2' for col in df_shifted.columns]
# concatenate the two dataframes to get a single dataframe
df_successive_rows = pd.concat([df_reduced, df_reduced_shifted], axis=1)
df_successive_rows = pd.concat([df, df_shifted], axis=1)
# as successive rows in the sorted dataframe may belong to different case IDs, we have to restrict ourselves to
# successive rows belonging to the same case ID
df_successive_rows = df_successive_rows[df_successive_rows[case_id_glue] == df_successive_rows[case_id_glue + '_2']]
@@ -99,8 +105,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
all_columns = list(all_columns - set([activity_key, target_activity_key + '_2']))

if measure == "performance" or measure == "both":
# in the arc performance calculation, make sure to consider only non-negative (positive or zero) values
df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
if not st_eq_ct:
# in the arc performance calculation, make sure to consider only non-negative (positive or zero) values
df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)

# calculate the difference between the timestamps of two successive events
if business_hours:
if worktiming is None:
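A minimal sketch of the shift-and-pair technique applied above, assuming a toy dataframe with standard XES column names (an illustration of the idea, not the pm4py API itself):

import pandas as pd

# toy event log: two cases, events already grouped by case
df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c1", "c2", "c2"],
    "concept:name": ["A", "B", "C", "A", "C"],
})
df = df.sort_values("case:concept:name")
# shift by one row so that each event is coupled with its successor
df_shifted = df.shift(-1)
df_shifted.columns = [str(col) + "_2" for col in df_shifted.columns]
pairs = pd.concat([df, df_shifted], axis=1)
# successive rows may cross case boundaries: keep same-case pairs only
pairs = pairs[pairs["case:concept:name"] == pairs["case:concept:name_2"]]
dfg = pairs.groupby(["concept:name", "concept:name_2"]).size().to_dict()
print(dfg)  # {('A', 'B'): 1, ('A', 'C'): 1, ('B', 'C'): 1}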
@@ -158,7 +166,8 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
case_id_glue="case:concept:name", activity_key="concept:name",
sort_caseid_required=True,
sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True,
business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR):
business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR,
event_index=constants.DEFAULT_INDEX_KEY):
"""
Gets the partial order between events (of the same case) in a Pandas dataframe
@@ -191,29 +200,37 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
# to avoid backward-compatibility problems
if start_timestamp_key is None:
start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY

if start_timestamp_key not in df:
df[start_timestamp_key] = df[timestamp_key]

# to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly complete timestamp)
# columns
if reduce_dataframe:
needed_columns = {case_id_glue, activity_key, start_timestamp_key, timestamp_key}
if event_index in df.columns:
needed_columns.add(event_index)
needed_columns = list(needed_columns)
df = df[needed_columns]

# to get rows belonging to the same case ID together, we need to sort on the case ID
if sort_caseid_required:
if sort_timestamp_along_case_id:
df = df.sort_values([case_id_glue, start_timestamp_key, timestamp_key])
else:
df = df.sort_values(case_id_glue)
df.reset_index(drop=True, inplace=True)

# to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly complete timestamp)
# columns
if reduce_dataframe:
df = df[[case_id_glue, activity_key, start_timestamp_key, timestamp_key]]
if event_index not in df.columns:
df[event_index] = df.index

df = pandas_utils.insert_index(df)
df = df.set_index(case_id_glue)
df_copy = df.copy()
df.set_index(case_id_glue, inplace=True)

df = df.join(df_copy, rsuffix="_2").dropna()
df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]]
df[start_timestamp_key + '_2'] = df[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
df = df.join(df, rsuffix="_2")
df = df[df[event_index] < df[event_index + "_2"]]
df = df[df[timestamp_key] <= df[start_timestamp_key + '_2']]

df = df.reset_index()
df.reset_index(inplace=True)

if business_hours:
if worktiming is None:
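The index-based self-join above can be sketched on toy data as follows (hypothetical column names; a simplified illustration assuming one atomic timestamp per event):

import pandas as pd

df = pd.DataFrame({
    "case": ["c1", "c1", "c1"],
    "activity": ["A", "B", "C"],
    "time:timestamp": pd.to_datetime(["2022-07-01", "2022-07-02", "2022-07-03"]),
})
df["@@index"] = df.index  # progressive event index
df.set_index("case", inplace=True)
# joining the dataframe with itself on the case index pairs every event
# of a case with every other event of the same case
pairs = df.join(df, rsuffix="_2")
# keep forward-pointing pairs that respect the temporal order
pairs = pairs[pairs["@@index"] < pairs["@@index_2"]]
pairs = pairs[pairs["time:timestamp"] <= pairs["time:timestamp_2"]]
print(pairs[["activity", "activity_2"]].reset_index())  # A->B, A->C, B->C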
60 changes: 21 additions & 39 deletions pm4py/algo/filtering/log/ltl/ltl_checker.py
@@ -26,6 +26,7 @@

from typing import Optional, Dict, Any, Union, Tuple, List
from pm4py.objects.log.obj import EventLog, EventStream, Trace
import itertools


class Parameters(Enum):
@@ -317,48 +318,29 @@ def eventually_follows(log: EventLog, attribute_values: List[str], parameters: O
omni_present=log.omni_present, properties=log.properties)

for trace in log:
if enable_timestamp:
occurrences = [[trace[i][timestamp_key].timestamp() for i in range(len(trace))
if attribute_key in trace[i] and trace[i][attribute_key] == attribute_value] for attribute_value in attribute_values]
else:
occurrences = [[i for i in range(len(trace))
if attribute_key in trace[i] and trace[i][attribute_key] == attribute_value] for attribute_value in attribute_values]
occurrences = [[i for i in range(len(trace))
if attribute_key in trace[i] and trace[i][attribute_key] == attribute_value] for attribute_value in attribute_values]

is_good = False

is_good = True
if enable_timestamp and timestamp_diff_boundaries:
prev_min = min(occurrences[0], default=-1)
for i in range(1, len(attribute_values)):
if prev_min == -1 or len(occurrences[i]) == 0:
is_good = False
for c in itertools.product(*occurrences):
ok = True
for i in range(len(c)-1):
if c[i] > c[i+1]:
ok = False
break

if timestamp_diff_boundaries:
min_diff = timestamp_diff_boundaries[i - 1][0]
max_diff = timestamp_diff_boundaries[i - 1][1]
min_timestamp = min([o for o in occurrences[i] if (o - prev_min) >= min_diff and (o - prev_min) <= max_diff], default=-1)
else:
min_timestamp = min([o for o in occurrences[i] if o >= prev_min], default = -1)

prev_min = min_timestamp

if prev_min == -1:
is_good = False
break

else:
prev_min = min(occurrences[0], default=-1)
for i in range(1, len(attribute_values)):
if prev_min == -1:
is_good = False
break

if len(occurrences[i]) == 0:
is_good = False
break

min_index = min([o for o in occurrences[i] if o >= prev_min], default = -1)
prev_min = min_index
if ok:
if enable_timestamp and timestamp_diff_boundaries:
for i in range(len(c)-1):
timest_i = trace[c[i]][timestamp_key].timestamp()
timest_j = trace[c[i+1]][timestamp_key].timestamp()
if timest_j - timest_i < timestamp_diff_boundaries[i][0] or timest_j - timest_i > timestamp_diff_boundaries[i][1]:
ok = False
break

if ok:
is_good = True
break

if is_good:
if positive:
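The rewritten eventually_follows check enumerates, via itertools.product, all combinations of the positions at which the pattern activities occur, and accepts the trace if some combination is ordered. A simplified standalone sketch of that idea (hypothetical helper, not the pm4py function):

import itertools

def eventually_follows_holds(activities, pattern):
    # positions at which each pattern activity occurs in the trace
    occurrences = [[i for i, a in enumerate(activities) if a == v]
                   for v in pattern]
    # accept if some combination of positions respects the pattern order
    for c in itertools.product(*occurrences):
        if all(c[i] <= c[i + 1] for i in range(len(c) - 1)):
            return True
    return False

print(eventually_follows_holds(["A", "B", "A", "C"], ["A", "C"]))  # True
print(eventually_follows_holds(["C", "B", "A"], ["A", "C"]))       # False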
4 changes: 2 additions & 2 deletions pm4py/algo/filtering/pandas/ltl/ltl_checker.py
@@ -316,8 +316,8 @@ def eventually_follows(df0: pd.DataFrame, attribute_values: List[str], parameter
attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False)
timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, [])
enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, len(timestamp_diff_boundaries) > 0)

colset = [case_id_glue, attribute_key]
if enable_timestamp:
@@ -341,7 +341,7 @@ def eventually_follows(df0: pd.DataFrame, attribute_values: List[str], parameter
df_join = df_join[df_join["@@diffindex%d" % (i - 1)] > 0]

if enable_timestamp:
for i in range(2, len(df_a)):
for i in range(1, len(df_a)):
df_join["@@difftimestamp%d" % (i - 1)] = (
df_join[timestamp_key + "_%d" % i] - df_join[timestamp_key + '_%d' % (i-1)]).astype(
'timedelta64[s]')
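The range change above is an off-by-one correction: a pattern of n activities has n - 1 successive gaps, so the timestamp-difference columns must be built for i = 1 .. n - 1 rather than starting at 2. A tiny illustration of the ranges involved:

n = 3                # e.g. pattern A -> B -> C, so len(df_a) == 3
list(range(2, n))    # [2]: before the fix, only the B -> C gap was checked
list(range(1, n))    # [1, 2]: after the fix, both A -> B and B -> C are checked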
@@ -102,8 +102,8 @@ def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None):
enable_object_str_attributes = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_STR_ATTRIBUTES, parameters, enable_all)
enable_object_num_attributes = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_NUM_ATTRIBUTES, parameters, enable_all)
enable_object_interaction_graph_ot = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_INTERACTION_GRAPH_OT, parameters, enable_all)
enable_work_in_progress = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_WORK_IN_PROGRESS, parameters, enable_all)
enable_object_lifecycle_unq_act = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_LIFECYCLE_UNQ_ACT, parameters, enable_all)
enable_work_in_progress = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_WORK_IN_PROGRESS, parameters, False)
enable_related_events_features = exec_utils.get_param_value(Parameters.ENABLE_RELATED_EVENTS_FEATURES, parameters, False)
enable_related_activities_features = exec_utils.get_param_value(Parameters.ENABLE_RELATED_ACTIVITIES_FEATURES, parameters, False)
enable_obj_con_in_graph_features = exec_utils.get_param_value(Parameters.ENABLE_OBJ_CON_IN_GRAPH_FEATURES, parameters, False)
@@ -54,6 +54,9 @@ def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None):
feature_names = ["@@object_degree_centrality"]

for obj in ordered_objects:
data.append([centrality[obj]])
if obj in centrality:
data.append([centrality[obj]])
else:
data.append([0])

return data, feature_names
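The guard added above avoids a KeyError for objects without a centrality value; an equivalent, slightly more idiomatic sketch using dict.get (assumption: centrality is a plain dict):

data.append([centrality.get(obj, 0)])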
@@ -74,4 +74,10 @@ def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None) -> Set[Tuple[
graph.add((o1, o2))
set_objects.add(o2)

graph_it = list(graph)
for el in graph_it:
if (el[1], el[0]) in graph:
graph.remove((el[0], el[1]))
graph.remove((el[1], el[0]))

return graph
3 changes: 2 additions & 1 deletion pm4py/filtering.py
@@ -1033,7 +1033,8 @@ def filter_ocel_objects(ocel: OCEL, object_identifiers: Collection[str], positiv
while level > 1:
curr = list(object_identifiers)
for el in curr:
object_identifiers = object_identifiers.union(graph[el])
for el2 in graph[el]:
object_identifiers.add(el2)
level = level - 1
from copy import copy
from pm4py.objects.ocel.util import filtering_utils
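The rewritten loop above expands the set of object identifiers level by level through the object graph; a small standalone sketch with a toy adjacency dictionary in place of the real object graph:

graph = {"o1": {"o2"}, "o2": {"o1", "o3"}, "o3": {"o2"}}
object_identifiers = {"o1"}
level = 2
while level > 1:
    curr = list(object_identifiers)
    for el in curr:
        for el2 in graph[el]:
            object_identifiers.add(el2)
    level = level - 1
print(object_identifiers)  # {'o1', 'o2'}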
2 changes: 1 addition & 1 deletion pm4py/meta.py
@@ -15,7 +15,7 @@
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
__name__ = 'pm4py'
VERSION = '2.2.24'
VERSION = '2.2.25'
__version__ = VERSION
__doc__ = 'Process Mining for Python (PM4Py)'
__author__ = 'Fraunhofer Institute for Applied Information Technology'
20 changes: 20 additions & 0 deletions safety_checks/20220729
@@ -0,0 +1,20 @@
+==============================================================================+
| |
| /$$$$$$ /$$ |
| /$$__ $$ | $$ |
| /$$$$$$$ /$$$$$$ | $$ \__//$$$$$$ /$$$$$$ /$$ /$$ |
| /$$_____/ |____ $$| $$$$ /$$__ $$|_ $$_/ | $$ | $$ |
| | $$$$$$ /$$$$$$$| $$_/ | $$$$$$$$ | $$ | $$ | $$ |
| \____ $$ /$$__ $$| $$ | $$_____/ | $$ /$$| $$ | $$ |
| /$$$$$$$/| $$$$$$$| $$ | $$$$$$$ | $$$$/| $$$$$$$ |
| |_______/ \_______/|__/ \_______/ \___/ \____ $$ |
| /$$ | $$ |
| | $$$$$$/ |
| by pyup.io \______/ |
| |
+==============================================================================+
| REPORT |
| checked 45 packages, using free DB (updated once a month) |
+==============================================================================+
| No known security vulnerabilities found. |
+==============================================================================+
