diff --git a/README.md b/README.md index 27b7d6c..181b660 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,9 @@ anomaly_watchdog = AnomalyWatchdog( column_target: str, granularity: str, columns_dimension: list[str] = None, + start_date: Union[str, None] = None, + end_date: Union[str, None] = None, models_to_use: List[str] = ['auto_arima', 'Prophet'], - check_history: bool = False ) ``` @@ -43,8 +44,9 @@ AnomalyWatchdog has the following inputs: - column_target: String containing the column name of the time series values. Values should be float or int. - granularity: String containing the granularity of the time series data. Values available are "D" for daily, "M" for monthly and "W" for weekly data. - columns_dimension: List of strings containing the column dimension names representing the disaggregation of the data if any. +- start_date: String containing the start date to return anomalies. Values should be str in format YYYY-MM-DD (e.g. 2020-01-30). If None, it returns all the history. +- end_date: String containing the end date to return anomalies. Values should be str in format YYYY-MM-DD (e.g. 2020-01-30). If None, it returns all the history. - models_to_use: List of strings containing the models available. Models available are "autoencoder_basic", "autoencoder_lstm", "prophet" and "auto_arima". If no value is provided, AnomalyWatchdog performs with only "prophet" and "auto_arima". -- check_history: Boolean that checks outliers in the complete history of the time series if True, and only in the last day if false (default). 
### Outputs AnomalyWatchdog has two outputs, one of which is only delivered if diff --git a/anomalywatchdog/anomaly_features/CalendarFeatureCreator.py b/anomalywatchdog/anomaly_features/CalendarFeatureCreator.py index 0dfe85f..d54cf17 100644 --- a/anomalywatchdog/anomaly_features/CalendarFeatureCreator.py +++ b/anomalywatchdog/anomaly_features/CalendarFeatureCreator.py @@ -70,8 +70,7 @@ def add_holidays(self, granularity): df_ts.groupby('date')['holiday'].sum().reset_index() ) df_holiday_granular['date'] = pd.to_datetime( - df_holiday_granular['date'], - utc=True + df_holiday_granular['date'] ) self.df = self.df.merge( df_holiday_granular, diff --git a/anomalywatchdog/core.py b/anomalywatchdog/core.py index 8d3bd37..7469367 100644 --- a/anomalywatchdog/core.py +++ b/anomalywatchdog/core.py @@ -26,9 +26,10 @@ def __init__( column_date: str, column_target: str, granularity: str, - columns_dimension: list[str] = None, - models_to_use: List[str] = ['auto_arima', 'Prophet'], - check_history: bool = False + columns_dimension: list[str] = [], + start_date: Union[str, None] = None, + end_date: Union[str, None] = None, + models_to_use: List[str] = ['auto_arima', 'Prophet'] ): # -- Initialize logs self.log = Logger(name="anomaly_detector") @@ -44,7 +45,8 @@ def __init__( columns_dimension=columns_dimension, granularity=granularity, models_to_use=models_to_use, - check_history=check_history, + start_date=start_date, + end_date=end_date, config=self.config ) # -- Update inputs @@ -53,6 +55,8 @@ def __init__( self.columns_dimension = input_checker.columns_dimension self.config = input_checker.config self.granularity = granularity + self.start_date = input_checker.start_date + self.end_date = input_checker.end_date self.df_input.rename( columns={column_date: "date", column_target: "value"}, inplace=True @@ -86,9 +90,20 @@ def __init__( self.df_anomaly_dimension = self.__detect_granular_anomalies( df_predictions=self.df_anomaly.copy(), columns_dimension=self.columns_dimension, - 
granularity=self.granularity, - check_history=check_history + granularity=self.granularity ) + self.log.info(">> 3. Filter selected dates") + if self.start_date: + self.df_anomaly = self.df_anomaly.loc[ + (self.df_anomaly['date'] >= self.start_date) & + (self.df_anomaly['date'] <= self.end_date) + ].copy() + if len(self.df_anomaly_dimension) > 0: + self.df_anomaly_dimension = self.df_anomaly_dimension.loc[ + (self.df_anomaly_dimension['date'] >= self.start_date) & + (self.df_anomaly_dimension['date'] <= self.end_date) + ].copy() print(self.df_anomaly) print(self.df_anomaly_dimension) @@ -114,26 +129,24 @@ def __detect_granular_anomalies( self, df_predictions: pd.DataFrame, columns_dimension: list, - granularity: str, - check_history: bool + granularity: str ) -> pd.DataFrame(): df_dimension = pd.DataFrame() - if columns_dimension is not None: + if len(columns_dimension) > 0: list_df_dimension = [] - for model in df_predictions["model"].unique(): - is_anomaly_max_date = ( - df_predictions - .loc[(df_predictions['model'] == model) & - (df_predictions['date'] == self.max_date), - 'anomaly'].sum() - ) > 0 - is_anomaly_history = ( - df_predictions - .loc[(df_predictions['model'] == model), 'anomaly'].sum() - ) > 0 - condition1 = check_history and is_anomaly_history - condition2 = not check_history and is_anomaly_max_date - if condition1 or condition2: + filtered_df = df_predictions.copy() + if self.start_date: + filtered_df = filtered_df.loc[ + (filtered_df['date'] <= self.end_date) & + (filtered_df['date'] >= self.start_date) + ] + for model in filtered_df["model"].unique(): + is_anomaly_in_interval = ( + filtered_df.loc[filtered_df['model'] == model, + 'anomaly'].sum() + > 0 + ) + if is_anomaly_in_interval: for column_dimension in columns_dimension: list_dimension_value = [ dimension for dimension diff --git a/anomalywatchdog/data_treatment/data_handler.py b/anomalywatchdog/data_treatment/data_handler.py index a28f390..1a5e59d 100644 --- 
a/anomalywatchdog/data_treatment/data_handler.py +++ b/anomalywatchdog/data_treatment/data_handler.py @@ -49,7 +49,7 @@ def __expand_dates(self): how='left' ) # -- set correct date format - self.df["date"] = pd.to_datetime(self.df["date"], utc=True) + self.df["date"] = pd.to_datetime(self.df["date"]) def __get_ordered_dataframe(self): self.df.sort_values(self.group_columns, inplace=True) diff --git a/anomalywatchdog/data_treatment/input_checker.py b/anomalywatchdog/data_treatment/input_checker.py index ab8ddd4..c75d159 100644 --- a/anomalywatchdog/data_treatment/input_checker.py +++ b/anomalywatchdog/data_treatment/input_checker.py @@ -13,8 +13,9 @@ def __init__( granularity: str, columns_dimension: List[str], models_to_use: List[str], - check_history: bool, - config: dict + config: dict, + start_date: Union[str, None] = None, + end_date: Union[str, None] = None ): # -- Main inputs self.df = df @@ -23,19 +24,22 @@ def __init__( self.granularity = granularity self.columns_dimension = columns_dimension self.models_to_use = models_to_use - self.check_history = check_history + self.start_date = start_date + self.end_date = end_date self.config = config # -- Preliminary Checks self.__check_df_instance() self.__check_columns_in_dataframe() - if self.columns_dimension is not None: + if len(self.columns_dimension) > 0: self.__check_column_list_types(self.columns_dimension) self.__check_column_str_type(self.column_target) self.__check_column_str_type(self.column_date) self.__enforce_lowercase() self.__check_granularity() - if self.check_history: - self.__check_check_history() + self.start_date = self.__check_date(self.start_date) + self.end_date = self.__check_date(self.end_date) + self.__check_dates_consistency() + if self.models_to_use: self.__check_column_list_types(self.models_to_use) self.__check_models_to_use() @@ -125,15 +129,6 @@ def __check_granularity(self): ) raise ValueError(error_string) - def __check_check_history(self): - if not isinstance(self.check_history, 
bool): - error_string = ( - "Input parameter check_history is " + - f"{type(self.check_history)}. " + - "Expected input type is bool." - ) - raise TypeError(error_string) - def __check_models_to_use(self): models_to_use_list = [ "autoencoder_basic", @@ -153,3 +148,35 @@ def __update_config(self): if self.models_to_use: self.config['models_to_use'] = self.models_to_use + + @staticmethod + def __check_date(date_string: Union[str, None]): + if date_string: + formatting_error = ( + f"Format for {date_string} not understood. " + f"Accepted format is 'YYYY-MM-DD' " + f"(e.g. 2021-03-28)." + ) + try: + return pd.to_datetime( + date_string, + format="%Y-%m-%d" + ) + except (ValueError, TypeError): + raise ValueError(formatting_error) + + def __check_dates_consistency(self): + if not self.start_date and self.end_date: + self.start_date = self.end_date + if not self.end_date and self.start_date: + self.end_date = self.start_date + + if self.start_date: + if (pd.to_datetime(self.end_date, format="%Y-%m-%d") + < pd.to_datetime(self.start_date, format="%Y-%m-%d")): + formatting_error = ( + f"Value for end_date: {self.end_date} must be greater or " + f"equal than start_date: {self.start_date}. 
+ ) + raise ValueError(formatting_error) + diff --git a/tests/data_treatment/test_data_handler.py b/tests/data_treatment/test_data_handler.py index 41c2ba8..9d13b64 100644 --- a/tests/data_treatment/test_data_handler.py +++ b/tests/data_treatment/test_data_handler.py @@ -71,7 +71,7 @@ def expected_df_monthly(): 3400257.853 ] return pd.DataFrame( - {'date': pd.to_datetime(date_list, utc=True), + {'date': pd.to_datetime(date_list), 'value': value_list} ) @@ -113,7 +113,7 @@ def expected_df_weekly(): 3400257.853 ] return pd.DataFrame( - {'date': pd.to_datetime(date_list, utc=True), + {'date': pd.to_datetime(date_list), 'value': value_list} ) @@ -155,6 +155,6 @@ def expected_df_daily(): 3400257.853 ] return pd.DataFrame( - {'date': pd.to_datetime(date_list, utc=True), + {'date': pd.to_datetime(date_list), 'value': value_list} ) \ No newline at end of file diff --git a/tests/data_treatment/test_input_checker.py b/tests/data_treatment/test_input_checker.py index dc96b39..8856ca0 100644 --- a/tests/data_treatment/test_input_checker.py +++ b/tests/data_treatment/test_input_checker.py @@ -7,7 +7,6 @@ EXPECTED_CONFIG = {"models_to_use": ["autoencoder_basic", "prophet"]} - def test_input_checker(): # -- Initialize input checker input_checker = InputChecker( @@ -17,7 +16,6 @@ def test_input_checker(): granularity='M', columns_dimension=['Col_dim'], models_to_use=INPUT_CONFIG["models_to_use"], - check_history=True, config=INPUT_CONFIG ) # -- Check df and columns