From 1bcadff3acacfda2463cf9325f873004e15ed915 Mon Sep 17 00:00:00 2001 From: "alessandro.berti" Date: Sat, 5 Mar 2022 16:13:10 +0000 Subject: [PATCH] PMPY-1625 Bug fix / Efficiency change Format Dataframe --- pm4py/util/constants.py | 1 + pm4py/util/pandas_utils.py | 33 +++++++++++++++++++++++++++++++-- pm4py/utils.py | 5 +++-- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/pm4py/util/constants.py b/pm4py/util/constants.py index 8c26e4996..c7e10d73c 100644 --- a/pm4py/util/constants.py +++ b/pm4py/util/constants.py @@ -27,6 +27,7 @@ DEFAULT_VARIANT_SEP = "," DEFAULT_INDEX_KEY = "@@index" +DEFAULT_CASE_INDEX_KEY = "@@case_index" DEFAULT_INDEX_IN_TRACE_KEY = "@@index_in_trace" DEFAULT_EVENT_INDEX_KEY = "@@event_index" DEFAULT_FLOW_TIME = "@@flow_time" diff --git a/pm4py/util/pandas_utils.py b/pm4py/util/pandas_utils.py index b31e84536..bb5e313cc 100644 --- a/pm4py/util/pandas_utils.py +++ b/pm4py/util/pandas_utils.py @@ -41,7 +41,7 @@ def to_dict_index(df): return df.to_dict('index') -def insert_index(df, column_name=constants.DEFAULT_INDEX_KEY): +def insert_index(df, column_name=constants.DEFAULT_INDEX_KEY, copy_dataframe=True): """ Inserts the dataframe index in the specified column @@ -51,17 +51,46 @@ def insert_index(df, column_name=constants.DEFAULT_INDEX_KEY): Dataframe column_name Name of the column that should host the index + copy_dataframe + Establishes if the original dataframe should be copied before inserting the column Returns -------------- df Dataframe with index """ - df = df.copy() + if copy_dataframe: + df = df.copy() df[column_name] = df.index return df +def insert_case_index(df, column_name=constants.DEFAULT_CASE_INDEX_KEY, case_id=constants.CASE_CONCEPT_NAME, copy_dataframe=True): + """ + Inserts the case number in the dataframe + + Parameters + --------------- + df + Dataframe + column_name + Name of the column that should host the case index + case_id + Case identifier + copy_dataframe + Establishes if the original dataframe should be copied before inserting the column + + Returns + --------------- + df + Dataframe with case index + """ + if copy_dataframe: + df = df.copy() + df[column_name] = df.groupby(case_id).ngroup() + return df + + def insert_ev_in_tr_index(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAME, column_name: str = constants.DEFAULT_INDEX_IN_TRACE_KEY) -> pd.DataFrame: """ diff --git a/pm4py/utils.py b/pm4py/utils.py index 71e695ac4..b467dc25f 100644 --- a/pm4py/utils.py +++ b/pm4py/utils.py @@ -8,6 +8,7 @@ from pm4py.objects.ocel.obj import OCEL from pm4py.util import constants, xes_constants, pandas_utils + INDEX_COLUMN = "@@index" @@ -71,11 +72,11 @@ def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAM # make sure the activity column is of string type df[xes_constants.DEFAULT_NAME_KEY] = df[xes_constants.DEFAULT_NAME_KEY].astype("string") # set an index column - df = pandas_utils.insert_index(df, INDEX_COLUMN) + df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False) # sorts the dataframe df = df.sort_values([constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_TIMESTAMP_KEY, INDEX_COLUMN]) # re-set the index column - df = pandas_utils.insert_index(df, INDEX_COLUMN) + df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False) # sets the properties if not hasattr(df, 'attrs'): # legacy (Python 3.6) support