Merge pull request #2 from adevinta/feature/second_part_development
fix: Last part of initial development
epampols authored Jul 1, 2024
2 parents d034ced + 912cdc0 commit 4bdce08
Showing 29 changed files with 3,607 additions and 31 deletions.
15 changes: 15 additions & 0 deletions anomalywatchdog/anomaly_features/create_features.py
@@ -0,0 +1,15 @@
import pandas as pd
from anomalywatchdog.anomaly_features\
.CalendarFeatureCreator import CalendarFeatureCreator


def create_features(
df: pd.DataFrame,
granularity: str
) -> pd.DataFrame:
# -- Create calendar features
calendar_feature_creator = CalendarFeatureCreator(df=df, country="Spain")
calendar_feature_creator.add_holidays(granularity=granularity)
df_feature = calendar_feature_creator.df
# -- Return output
return df_feature
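
For orientation, a minimal usage sketch of create_features (not part of the diff). The sample frame and the "D" granularity code are illustrative assumptions based on how AnomalyWatchdog calls this function below; note the country is currently hardcoded to "Spain" inside the function.

# Hypothetical usage (assumed schema: a "date" and a "value" column,
# which is what AnomalyWatchdog passes in after renaming its inputs).
import pandas as pd
from anomalywatchdog.anomaly_features.create_features import create_features

df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=90, freq="D"),
    "value": [float(i % 7) for i in range(90)],
})
df_features = create_features(df=df, granularity="D")
print(df_features.head())
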
179 changes: 179 additions & 0 deletions anomalywatchdog/anomaly_watchdog.py
@@ -0,0 +1,179 @@
from anomalywatchdog.data_treatment.data_handler\
import DataADHandler
from anomalywatchdog.data_treatment.input_checker\
import InputChecker
from anomalywatchdog.data_treatment.model_updater\
import ModelUpdater
from anomalywatchdog.engine.model_trainer\
import ModelTrainer
from anomalywatchdog.engine.model_predictor\
import ModelPredictor
from anomalywatchdog.anomaly_features.create_features\
import create_features
import os.path as op
import yaml
from logging import Logger
import pandas as pd
from typing import Union, List
from pyspark.sql import DataFrame


class AnomalyWatchdog:

def __init__(
self,
df: Union[pd.DataFrame, DataFrame],
column_date: str,
column_target: str,
granularity: str,
            columns_dimension: Union[List[str], None] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
):
# -- Initialize logs
self.log = Logger(name="anomaly_detector")
# -- Read Config
current_path = op.dirname(__file__)
with open(op.join(current_path, "config_ad.yaml"), 'rb') as fp:
self.config = yaml.safe_load(fp)
# -- Check inputs
input_checker = InputChecker(
df=df.copy(),
column_date=column_date,
column_target=column_target,
columns_dimension=columns_dimension,
granularity=granularity,
models_to_use=models_to_use,
check_history=check_history,
config=self.config
)
# -- Update inputs
self.df_input = input_checker.df.copy()
self.column_target = input_checker.column_target
self.columns_dimension = input_checker.columns_dimension
self.config = input_checker.config
self.granularity = granularity
self.df_input.rename(
columns={column_date: "date", column_target: "value"},
inplace=True
)
# -- Initialize inputs
self.max_date = self.df_input["date"].max()
# -- Initialize output
self.df_anomaly = pd.DataFrame()
self.df_anomaly_dimension = pd.DataFrame()
self.log.info(">> 1. Data Treatment")
data_ad_handler = DataADHandler(
df=self.df_input,
granularity=self.granularity
)
df = data_ad_handler.df_grouped.copy()
df = df[["date", "value"]].copy()
self.log.info(">> 1.1 Get TS properties")
ModelUpdater(
df=df.copy(),
granularity=self.granularity,
config=self.config
)
self.log.info(">> 1.2 Get features")
df = create_features(df=df.copy(), granularity=self.granularity)
self.log.info(">> 2. Get global anomalies")
self.df_anomaly = self.__detect_anomalies(
df_handled=df.copy(),
list_models=self.config['models_to_use']
)
self.log.info(">> 2. Get drilled anomalies")
self.df_anomaly_dimension = self.__detect_granular_anomalies(
df_predictions=self.df_anomaly.copy(),
columns_dimension=self.columns_dimension,
granularity=self.granularity,
check_history=check_history
)
print(self.df_anomaly)
print(self.df_anomaly_dimension)

    def __detect_anomalies(
            self,
            df_handled: pd.DataFrame,
            list_models: List[str]
    ) -> pd.DataFrame:
self.log.info(">> 2.1. Train Models")
model_trainer = ModelTrainer(
            model_names=list_models,
df_train=df_handled.copy(),
config=self.config
)
df_models_trained = model_trainer.train()
self.log.info(">> 2.2. Predict Models")
model_predictor = ModelPredictor(
df_models_trained=df_models_trained)
df_predictions = model_predictor.predict()
return df_predictions

    def __detect_granular_anomalies(
            self,
            df_predictions: pd.DataFrame,
            columns_dimension: List[str],
            granularity: str,
            check_history: bool
    ) -> pd.DataFrame:
df_dimension = pd.DataFrame()
if columns_dimension is not None:
list_df_dimension = []
            for model in df_predictions["model"].unique():
                is_anomaly_max_date = (
                    df_predictions
                    .loc[(df_predictions['model'] == model) &
                         (df_predictions['date'] == self.max_date),
                         'anomaly'
                    ].sum()
                ) > 0
                is_anomaly_history = (
                    df_predictions
                    .loc[df_predictions['model'] == model,
                         'anomaly'
                    ].sum()
                ) > 0
                # -- Drill down only if anomalies were flagged in the
                #    relevant window (full history vs. latest date)
                condition1 = check_history and is_anomaly_history
                condition2 = not check_history and is_anomaly_max_date
                if condition1 or condition2:
for column_dimension in columns_dimension:
list_dimension_value = [
dimension for dimension
in self.df_input[column_dimension].unique()
if dimension is not None
]
for dimension_value in list_dimension_value:
                            df_dimension = (
                                self.df_input
                                .loc[self.df_input[column_dimension]
                                     == dimension_value,
                                     ["date", "value"]]
                                .reset_index(drop=True)
                                .copy()
                            )
data_ad_handler = DataADHandler(
df=df_dimension,
granularity=granularity
)
df = data_ad_handler.df_grouped.copy()
df = create_features(df=df.copy(),
granularity=self.granularity)
df_predictions_element = self.__detect_anomalies(
df_handled=df.copy(),
list_models=[model]
)
print(df_predictions_element)
df_predictions_element['dimension'] = (
column_dimension
)
df_predictions_element['dimension_value'] = (
dimension_value
)
list_df_dimension.append(df_predictions_element)
            # -- Aggregate per-dimension results across all models
            # (returning inside the model loop would skip the remaining models)
            if list_df_dimension:
                df_dimension = pd.concat(list_df_dimension)
        return df_dimension
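
A hedged end-to-end sketch of how AnomalyWatchdog might be driven. The sample data, column names, and dimension values are invented for illustration; the attribute names come from the constructor above, which runs the whole pipeline at instantiation time.

# Illustrative only. Input schema and values are assumptions; the
# models_to_use default and granularity code mirror what __init__ declares.
import pandas as pd
from anomalywatchdog.anomaly_watchdog import AnomalyWatchdog

df = pd.DataFrame({
    "day": pd.date_range("2023-01-01", periods=365, freq="D"),
    "sales": [100.0 + (i % 30) for i in range(365)],
    "shop": ["madrid"] * 200 + ["barcelona"] * 165,
})
watchdog = AnomalyWatchdog(
    df=df,
    column_date="day",
    column_target="sales",
    granularity="D",
    columns_dimension=["shop"],
    models_to_use=["auto_arima", "Prophet"],
)
df_global = watchdog.df_anomaly              # anomalies on the aggregated series
df_drilled = watchdog.df_anomaly_dimension   # anomalies split by dimension value

Because all work happens in __init__, there is no separate fit/detect call; results are read from the two output attributes.
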
64 changes: 64 additions & 0 deletions anomalywatchdog/config_ad.yaml
@@ -0,0 +1,64 @@
models_to_use: [
"autoencoder_lstm",
"autoencoder_conv",
"prophet",
"auto_arima"
]
ts_seasonality:
weekly_seasonality: False
monthly_seasonality: False
yearly_seasonality: False
autoencoder_conv:
granularity: 'D'
timesteps:
    D: 120  # look-back window: number of past time steps per sample
W: 24
M: 8
activation: 'relu'
loss: 'mse'
epochs: 100
batch_size:
D: 128
W: 32
M: 12
validation_split: 0.1
learning_rate: 0.001
quantile: 0.75
quantile_multiplier: 1.5
features:
holidays: True
autoencoder_lstm:
granularity: 'D'
timesteps:
    D: 30  # look-back window: number of past time steps per sample
W: 12
M: 8
activation: 'relu'
loss: 'mse'
epochs: 50
batch_size:
D: 32
W: 16
M: 12
validation_split: 0.2
learning_rate: 0.001
quantile: 0.75
quantile_multiplier: 1.5
features:
holidays: True
prophet:
weekly_seasonality: False
monthly_seasonality: False
yearly_seasonality: False
seasonality_mode: "additive"
interval_width: 0.95
changepoint_range: 0.8
features:
holidays: True
auto_arima:
weekly_seasonality: False
monthly_seasonality: False
yearly_seasonality: False
features:
holidays: True
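
The AnomalyWatchdog constructor locates this file next to the package modules and parses it with yaml.safe_load; a standalone sketch of the same lookup (assuming the package is importable):

# Reproduces the config load performed in AnomalyWatchdog.__init__.
import os.path as op
import yaml

import anomalywatchdog

current_path = op.dirname(anomalywatchdog.__file__)
with open(op.join(current_path, "config_ad.yaml"), "rb") as fp:
    config = yaml.safe_load(fp)

print(config["models_to_use"])                       # four model names, per this file
print(config["autoencoder_lstm"]["timesteps"]["D"])  # 30
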
