Merge pull request #5 from adevinta/feature/data_range
Add Date Range
alexvazquez1988 authored Jul 11, 2024
2 parents a4e12e4 + ff9b29c commit 3351aef
Showing 7 changed files with 87 additions and 48 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -31,8 +31,9 @@ anomaly_watchdog = AnomalyWatchdog(
column_target: str,
granularity: str,
columns_dimension: list[str] = None,
start_date: Union[str, None] = None,
end_date: Union[str, None] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
)
```

@@ -43,8 +44,9 @@ AnomalyWatchdog has the following inputs:
- column_target: String containing the column name of the time series values. Values should be float or int.
- granularity: String containing the granularity of the time series data. Values available are "D" for daily, "M" for monthly and "W" for weekly data.
- columns_dimension: List of strings containing the column dimension names representing the disaggregation of the data if any.
- start_date: String containing the start date from which anomalies are returned. Values should be strings in the format YYYY-MM-DD (e.g. 2020-01-30). If None, anomalies are returned for the whole history.
- end_date: String containing the end date up to which anomalies are returned. Values should be strings in the format YYYY-MM-DD (e.g. 2020-01-30). If None, anomalies are returned for the whole history.
- models_to_use: List of strings containing the models to use. Available models are "autoencoder_basic", "autoencoder_lstm", "prophet" and "auto_arima". If no value is provided, AnomalyWatchdog runs only "prophet" and "auto_arima".
- check_history: Boolean that checks outliers in the complete history of the time series if True, and only in the last day if false (default).
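
A minimal usage sketch of the new date-range parameters, assuming the import path from the repository layout (`anomalywatchdog/core.py`); the DataFrame, column names and dates below are illustrative, not part of the package:

```python
import pandas as pd

# Import path assumed from the repository layout (anomalywatchdog/core.py).
from anomalywatchdog.core import AnomalyWatchdog

# Illustrative daily series; column names and values are placeholders.
df = pd.DataFrame({
    "ds": pd.date_range("2023-01-01", periods=120, freq="D").strftime("%Y-%m-%d"),
    "sales": [100.0 + (i % 7) * 5.0 for i in range(120)],
})

detector = AnomalyWatchdog(
    df=df,
    column_date="ds",
    column_target="sales",
    granularity="D",
    start_date="2023-03-01",  # only anomalies inside this window are returned
    end_date="2023-04-30",
)

# Anomaly flags per date and model (attribute name as used in core.py below).
print(detector.df_anomaly)
```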

### Outputs
AnomalyWatchdog has two outputs, one of which is only delivered if
3 changes: 1 addition & 2 deletions anomalywatchdog/anomaly_features/CalendarFeatureCreator.py
@@ -70,8 +70,7 @@ def add_holidays(self, granularity):
df_ts.groupby('date')['holiday'].sum().reset_index()
)
df_holiday_granular['date'] = pd.to_datetime(
df_holiday_granular['date'],
utc=True
df_holiday_granular['date']
)
self.df = self.df.merge(
df_holiday_granular,
59 changes: 36 additions & 23 deletions anomalywatchdog/core.py
@@ -26,9 +26,10 @@ def __init__(
column_date: str,
column_target: str,
granularity: str,
columns_dimension: list[str] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
columns_dimension: list[str] = [],
start_date: Union[str, None] = None,
end_date: Union[str, None] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet']
):
# -- Initialize logs
self.log = Logger(name="anomaly_detector")
@@ -44,7 +45,8 @@ def __init__(
columns_dimension=columns_dimension,
granularity=granularity,
models_to_use=models_to_use,
check_history=check_history,
start_date=start_date,
end_date=end_date,
config=self.config
)
# -- Update inputs
@@ -53,6 +55,8 @@ def __init__(
self.columns_dimension = input_checker.columns_dimension
self.config = input_checker.config
self.granularity = granularity
self.start_date = input_checker.start_date
self.end_date = input_checker.end_date
self.df_input.rename(
columns={column_date: "date", column_target: "value"},
inplace=True
@@ -86,9 +90,20 @@ def __init__(
self.df_anomaly_dimension = self.__detect_granular_anomalies(
df_predictions=self.df_anomaly.copy(),
columns_dimension=self.columns_dimension,
granularity=self.granularity,
check_history=check_history
granularity=self.granularity
)
self.log.info(">> 3. Filter selected dates")
print('print2')
if self.start_date:
self.df_anomaly = self.df_anomaly.loc[
(self.df_anomaly['date'] >= self.start_date) &
(self.df_anomaly['date'] <= self.end_date)
].copy()
if len(self.df_anomaly_dimension) > 0:
self.df_anomaly_dimension = self.df_anomaly_dimension.loc[
(self.df_anomaly_dimension['date'] >= self.start_date) &
(self.df_anomaly_dimension['date'] <= self.end_date)
].copy()
print(self.df_anomaly)
print(self.df_anomaly_dimension)

@@ -114,26 +129,24 @@ def __detect_granular_anomalies(
self,
df_predictions: pd.DataFrame,
columns_dimension: list,
granularity: str,
check_history: bool
granularity: str
) -> pd.DataFrame():
df_dimension = pd.DataFrame()
if columns_dimension is not None:
if len(columns_dimension) > 0:
list_df_dimension = []
for model in df_predictions["model"].unique():
is_anomaly_max_date = (
df_predictions
.loc[(df_predictions['model'] == model) &
(df_predictions['date'] == self.max_date),
'anomaly'].sum()
) > 0
is_anomaly_history = (
df_predictions
.loc[(df_predictions['model'] == model), 'anomaly'].sum()
) > 0
condition1 = check_history and is_anomaly_history
condition2 = not check_history and is_anomaly_max_date
if condition1 or condition2:
filtered_df = df_predictions.copy()
if self.start_date:
filtered_df = filtered_df.loc[
(filtered_df['date'] <= self.end_date) &
(filtered_df['date'] >= self.start_date)
]
for model in filtered_df["model"].unique():
is_anomaly_in_interval = (
filtered_df.loc[filtered_df['model'] == model,
['anomaly']].sum()
> 0
)
if is_anomaly_in_interval:
for column_dimension in columns_dimension:
list_dimension_value = [
dimension for dimension
2 changes: 1 addition & 1 deletion anomalywatchdog/data_treatment/data_handler.py
@@ -49,7 +49,7 @@ def __expand_dates(self):
how='left'
)
# -- set correct date format
self.df["date"] = pd.to_datetime(self.df["date"], utc=True)
self.df["date"] = pd.to_datetime(self.df["date"])

def __get_ordered_dataframe(self):
self.df.sort_values(self.group_columns, inplace=True)
57 changes: 42 additions & 15 deletions anomalywatchdog/data_treatment/input_checker.py
@@ -13,8 +13,9 @@ def __init__(
granularity: str,
columns_dimension: List[str],
models_to_use: List[str],
check_history: bool,
config: dict
config: dict,
start_date: Union[str, None] = None,
end_date: Union[str, None] = None
):
# -- Main inputs
self.df = df
@@ -23,19 +24,22 @@
self.granularity = granularity
self.columns_dimension = columns_dimension
self.models_to_use = models_to_use
self.check_history = check_history
self.start_date = start_date
self.end_date = end_date
self.config = config
# -- Preliminary Checks
self.__check_df_instance()
self.__check_columns_in_dataframe()
if self.columns_dimension is not None:
if len(self.columns_dimension) > 0:
self.__check_column_list_types(self.columns_dimension)
self.__check_column_str_type(self.column_target)
self.__check_column_str_type(self.column_date)
self.__enforce_lowercase()
self.__check_granularity()
if self.check_history:
self.__check_check_history()
self.start_date = self.__check_date(self.start_date)
self.end_date = self.__check_date(self.end_date)
self.__check_dates_consistency()

if self.models_to_use:
self.__check_column_list_types(self.models_to_use)
self.__check_models_to_use()
@@ -125,15 +129,6 @@ def __check_granularity(self):
)
raise ValueError(error_string)

def __check_check_history(self):
if not isinstance(self.check_history, bool):
error_string = (
"Input parameter check_history is " +
f"{type(self.check_history)}. " +
"Expected input type is bool."
)
raise TypeError(error_string)

def __check_models_to_use(self):
models_to_use_list = [
"autoencoder_basic",
@@ -153,3 +148,35 @@ def __check_models_to_use(self):
def __update_config(self):
if self.models_to_use:
self.config['models_to_use'] = self.models_to_use

@staticmethod
def __check_date(date_string: Union[str, None]):
if date_string:
formatting_error = (
f"Format for {date_string} not understood. "
f"Accepted format is 'YYYY-MM-DD'"
f"(e.g. 2021-03-28)."
)
try:
return pd.to_datetime(
date_string,
format="%Y-%m-%d"
)
except:
raise ValueError(formatting_error)

def __check_dates_consistency(self):
if not self.start_date and self.end_date:
self.start_date = self.end_date
if not self.end_date and self.start_date:
self.end_date = self.start_date

if self.start_date:
if (pd.to_datetime(self.end_date, format="%Y-%m-%d")
< pd.to_datetime(self.start_date, format="%Y-%m-%d")):
formatting_error = (
f"Value for end_date: {self.end_date} must be greater or "
f"equal than start_date: {self.start_date}."
)
raise ValueError(formatting_error)
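
For reference, a standalone sketch of the behaviour the two date checks above implement; `normalize_date_range` is a hypothetical helper for illustration only, not part of the package:

```python
import pandas as pd

def normalize_date_range(start_date=None, end_date=None):
    """Sketch of the checks above: parse 'YYYY-MM-DD' strings,
    mirror a missing endpoint, and reject inverted ranges."""
    def parse(date_string):
        if not date_string:
            return None
        try:
            return pd.to_datetime(date_string, format="%Y-%m-%d")
        except (ValueError, TypeError):
            raise ValueError(
                f"Format for {date_string} not understood. "
                f"Accepted format is 'YYYY-MM-DD' (e.g. 2021-03-28)."
            )

    start, end = parse(start_date), parse(end_date)
    if start is None and end is not None:
        start = end   # only end_date given: single-day range
    if end is None and start is not None:
        end = start   # only start_date given: single-day range
    if start is not None and end < start:
        raise ValueError(
            f"Value for end_date: {end} must be greater than or "
            f"equal to start_date: {start}."
        )
    return start, end

print(normalize_date_range("2024-01-01"))           # (2024-01-01, 2024-01-01)
print(normalize_date_range(end_date="2024-02-29"))  # start mirrors end_date
# normalize_date_range("2024-03-10", "2024-03-01")  # raises ValueError
```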

6 changes: 3 additions & 3 deletions tests/data_treatment/test_data_handler.py
@@ -71,7 +71,7 @@ def expected_df_monthly():
3400257.853
]
return pd.DataFrame(
{'date': pd.to_datetime(date_list, utc=True),
{'date': pd.to_datetime(date_list),
'value': value_list}
)

@@ -113,7 +113,7 @@ def expected_df_weekly():
3400257.853
]
return pd.DataFrame(
{'date': pd.to_datetime(date_list, utc=True),
{'date': pd.to_datetime(date_list),
'value': value_list}
)

@@ -155,6 +155,6 @@ def expected_df_daily():
3400257.853
]
return pd.DataFrame(
{'date': pd.to_datetime(date_list, utc=True),
{'date': pd.to_datetime(date_list),
'value': value_list}
)
2 changes: 0 additions & 2 deletions tests/data_treatment/test_input_checker.py
@@ -7,7 +7,6 @@
EXPECTED_CONFIG = {"models_to_use": ["autoencoder_basic", "prophet"]}



def test_input_checker():
# -- Initialize input checker
input_checker = InputChecker(
@@ -17,7 +16,6 @@ def test_input_checker():
granularity='M',
columns_dimension=['Col_dim'],
models_to_use=INPUT_CONFIG["models_to_use"],
check_history=True,
config=INPUT_CONFIG
)
# -- Check df and columns
