Merge pull request #1 from adevinta/feature/first_part_development

fix: First part of development
adevinta · Jul 1, 2024 · d034ced · d034ced
2 parents 717a47c + 079d311
commit d034ced
Show file tree

Hide file tree

Showing 18 changed files with 1,413 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,59 @@
+# AnomalyWatchdog
+
+AnomalyWatchdog detects outliers in time series using both statistical and
+machine learning approaches and showcases them. It works with daily, weekly
+and monthly data.
+
+If a time series split in different dimensions is provided, AnomalyWatchdog
+first groups the data by the id provided and analyzes outliers at its highest
+level. If an outlier is detected, it will analyze outliers at the different 
+dimensions to detect the origin of the anomaly.
+
+## Installation
+
+```bash
+
+pip install AnomalyWatchdog
+
+```
+
+## Quickstart
+
+To detect anomalies in your data, you need to insert the following parameters
+in the AnomalyWatchdog class as you can see below.
+
+```{python, error=TRUE, include=TRUE}
+from anomalywatchdog import AnomalyWatchdog
+
+anomaly_watchdog = AnomalyWatchdog(
+            df: Union[pd.DataFrame, DataFrame],
+            column_date: str,
+            column_target: str,
+            granularity: str,
+            columns_dimension: list[str] = None,
+            models_to_use: List[str] = ['auto_arima', 'Prophet'],
+            check_history: bool = False
+        )
+```
+
+### Inputs
+AnomalyWatchdog has the following inputs:
+- df: pandas DataFrame or spark DataFrame that contains the required column_id, column_date, column_target and columns_dimension.
+- column_date: String containing the column name of the time series dates. Values should be str in format YYYY-MM-DD (e.g. 2020-01-30).
+- column_target: String containing the column name of the time series values. Values should be float or int.
+- granularity: String containing the granularity of the time series data. Values available are "D" for daily, "M" for monthly and "W" for weekly data.
+- columns_dimension: List of strings containing the column dimension names representing the disaggregation of the data if any.
+- models_to_use: List of strings containing the models to use. Models available are "autoencoder_basic", "autoencoder_lstm", "prophet" and "auto_arima". If no value is provided, AnomalyWatchdog uses only "prophet" and "auto_arima".
+- check_history: Boolean that checks outliers in the complete history of the time series if True, and only in the last day if False (default).
+
+### Outputs
+AnomalyWatchdog has two outputs, one of which is only delivered if 
+columns_dimension parameter is specified.
+
+```{python, error=TRUE, include=TRUE}
+# -- AnomalyWatchdog output for main time series
+anomaly_watchdog.df_anomaly
+# -- AnomalyWatchdog output for each of the dimensions (only if columns_dimension is specified)
+anomaly_watchdog.df_anomaly_dimension
+```
+
diff --git a/anomalywatchdog/anomaly_features/CalendarFeatureCreator.py b/anomalywatchdog/anomaly_features/CalendarFeatureCreator.py
@@ -0,0 +1,80 @@
+import pandas as pd
+import holidays
+from datetime import date
+from datetime import timedelta
+
+
+class CalendarFeatureCreator:
+    """
+    Adds a 'holiday' column to a time series with the number of public
+    holidays that fall in each period (day, week or month).
+
+    The input DataFrame must contain a 'date' and a 'value' column. The
+    holiday calendar is the attribute of the `holidays` package named by
+    `country`, called without arguments.
+    """
+
+    def __init__(self, df: pd.DataFrame, country: str):
+        # -- Main data
+        self.df = df
+        self.country = country
+        # -- Initialize holidays
+        self.df_holidays = self.__get_holidays()
+
+    def __get_holidays(self):
+        """
+        Builds a DataFrame with one row per holiday ('date', 'holiday'=1)
+        for every year present in self.df['date'].
+        """
+        # -- Resolve the calendar by attribute lookup rather than eval(),
+        # -- so the country string is never executed as code
+        country_holidays = getattr(holidays, self.country)()
+        # -- Years covered by the series (first 4 chars of the date string)
+        min_year = int(self.df['date'].astype(str).str[0:4].min())
+        max_year = int(self.df['date'].astype(str).str[0:4].max()) + 1
+        # -- Workaround kept from the original author: a membership test
+        # -- per year is used so that the calendar holds that year's dates
+        for year in range(min_year, max_year):
+            _ = date(year, 1, 1) in country_holidays
+        df_holidays = pd.DataFrame.from_dict(
+            country_holidays,
+            orient='index'
+        ).reset_index()
+        df_holidays["date"] = pd.to_datetime(df_holidays['index'])
+        df_holidays['holiday'] = 1
+        df_holidays = df_holidays[["date", "holiday"]].copy()
+        return df_holidays
+
+    def add_holidays(self, granularity):
+        """
+        Replaces self.df with its ['date', 'value', 'holiday'] columns,
+        where 'holiday' is the holiday count of the period starting at
+        'date' (0 when there is none).
+
+        granularity: "M" aggregates holidays by calendar month, "W" by the
+        7-day window starting at each 'date'; any other value matches
+        holidays to 'date' one to one.
+        """
+        df_holiday_granular = self.df_holidays.copy()
+        df_ts = self.df.copy()
+        if granularity == "M":
+            # -- Compute holidays by month
+            df_holiday_granular['date'] = (
+                df_holiday_granular["date"].astype(str).str[0:7]
+            )
+            df_holiday_granular = (
+                df_holiday_granular
+                .groupby('date')["holiday"].sum().reset_index()
+            )
+            df_holiday_granular['date'] = (
+                pd.to_datetime(df_holiday_granular['date'] + '-01')
+            )
+        elif granularity == "W":
+            # -- Compute holidays by week
+            # ----- Create dummy key
+            df_holiday_granular['key'] = 0
+            df_ts['key'] = 0
+            # ----- Put both date columns in UTC: the final merge below is
+            # ----- done on UTC dates, and tz-naive holiday dates cannot be
+            # ----- compared with tz-aware series dates
+            df_ts['date'] = pd.to_datetime(df_ts['date'], utc=True)
+            df_holiday_granular['date'] = pd.to_datetime(
+                df_holiday_granular['date'], utc=True
+            )
+            # ----- Create end date of week
+            df_ts['date_max_week'] = df_ts['date'] + timedelta(days=6)
+            # ----- Cartesian
+            df_holiday_granular.rename(
+                columns={'date': 'date_holidays'}, inplace=True
+            )
+            df_ts = df_ts.merge(
+                df_holiday_granular,
+                on='key',
+                how='outer'
+            )
+            # ----- Filter holidays within each period of the week
+            df_ts = df_ts.loc[
+                (df_ts['date_holidays'] >= df_ts['date'])
+                & (df_ts['date_holidays'] <= df_ts['date_max_week'])].copy()
+            # ----- Aggregate holidays by week
+            df_holiday_granular = (
+                df_ts.groupby('date')['holiday'].sum().reset_index()
+            )
+        df_holiday_granular['date'] = pd.to_datetime(
+            df_holiday_granular['date'],
+            utc=True
+        )
+        # -- Periods without holidays get NaN from the left merge -> 0
+        self.df = self.df.merge(
+            df_holiday_granular,
+            on='date',
+            how='left'
+        )[["date", "value", "holiday"]].fillna(0)
diff --git a/anomalywatchdog/anomaly_features/__init__.py b/anomalywatchdog/anomaly_features/__init__.py
diff --git a/anomalywatchdog/data_treatment/__init__.py b/anomalywatchdog/data_treatment/__init__.py
diff --git a/anomalywatchdog/data_treatment/data_handler.py b/anomalywatchdog/data_treatment/data_handler.py
@@ -0,0 +1,66 @@
+# -- Packages
+import pandas as pd
+
+class DataADHandler:
+    """
+    Prepares a time series for Anomaly Detection: completes the calendar
+    of dates at the requested granularity, aggregates 'value' per date and
+    leaves the detailed data sorted by date.
+    """
+    def __init__(self, df: pd.DataFrame, granularity: str = 'D'):
+        # -- Parameters
+        self.granularity = granularity
+        self.group_columns = ["date"]
+        # -- Data
+        self.df = df
+        self.df_grouped = pd.DataFrame()
+        # -- Pipeline: complete dates -> aggregate -> sort
+        self.__expand_dates()
+        self.__group_by()
+        self.__get_ordered_dataframe()
+
+    def __expand_dates(self):
+        """
+        Left-joins self.df onto the full list of dates between its first
+        and last date, then converts 'date' to UTC datetimes.
+        """
+        first_date = self.df["date"].min()
+        last_date = self.df["date"].max()
+        # -- Arguments of the date range for each granularity
+        if self.granularity == 'W':
+            elapsed_days = (last_date - first_date).days
+            range_kwargs = {
+                "periods": int(elapsed_days / 7) + 1,
+                "freq": "7D",
+            }
+        elif self.granularity == 'M':
+            range_kwargs = {"end": last_date, "freq": "MS"}
+        else:
+            range_kwargs = {"end": last_date, "freq": "D"}
+        calendar = pd.date_range(start=first_date, **range_kwargs).tolist()
+        # -- One row per expected date
+        df_dates = pd.DataFrame({"date": calendar})
+        # -- Attach the observed values (dates without data stay empty)
+        self.df = df_dates.merge(self.df, on=self.group_columns, how='left')
+        # -- set correct date format
+        self.df["date"] = pd.to_datetime(self.df["date"], utc=True)
+
+    def __get_ordered_dataframe(self):
+        """Sorts self.df in place by the grouping columns."""
+        self.df.sort_values(self.group_columns, inplace=True)
+
+    def __group_by(self):
+        """Stores in self.df_grouped the sum of 'value' per date."""
+        totals = self.df.groupby(self.group_columns)["value"].sum()
+        self.df_grouped = totals.reset_index()
+
+
+
+
diff --git a/anomalywatchdog/engine/__init__.py b/anomalywatchdog/engine/__init__.py
diff --git a/anomalywatchdog/engine/model_trainer.py b/anomalywatchdog/engine/model_trainer.py
@@ -0,0 +1,31 @@
+import pandas as pd
+from loaders.package.c_engine.model_factory \
+    import ModelFactory
+
+
+class ModelTrainer:
+    """
+    Builds one model per requested model name through ModelFactory, each
+    with its own copy of the training data and its own config section.
+    """
+    def __init__(self, model_names, df_train, config):
+        self.config = config
+        self.df_train = df_train
+        self.model_names = model_names
+
+    @staticmethod
+    def filter_config_by(config, model) -> dict:
+        """Returns the entry of `config` stored under the key `model`."""
+        model_config = config[model]
+        return model_config
+
+    def train(self) -> pd.DataFrame:
+        """
+        Returns a DataFrame with a single 'model' column holding the object
+        returned by ModelFactory.get_model for each name in model_names.
+        """
+        print(">> Training:")
+        trained_models = [
+            {
+                'model': ModelFactory.get_model(
+                    model=name,
+                    df_train=self.df_train.copy(),
+                    dict_config=ModelTrainer.filter_config_by(
+                        self.config, name
+                    ),
+                )
+            }
+            for name in self.model_names
+        ]
+        return pd.DataFrame(trained_models)
diff --git a/anomalywatchdog/modelling/__init__.py b/anomalywatchdog/modelling/__init__.py
diff --git a/anomalywatchdog/modelling/abstract_model.py b/anomalywatchdog/modelling/abstract_model.py
@@ -0,0 +1,21 @@
+# -- ML main packages
+from abc import ABC, abstractmethod
+import pandas as pd
+
+
+class AnomalyDetectionModel(ABC):
+    """
+    Common interface of the anomaly detection models: subclasses must
+    implement fit_model and get_anomalies; plot is an optional hook.
+    """
+
+    def __init__(self, df: pd.DataFrame, dict_params: dict):
+        # -- Model parameters and data kept for the subclasses
+        self.dict_params = dict_params
+        self.df = df
+
+    @abstractmethod
+    def fit_model(self, df_train: pd.DataFrame, dict_params: dict):
+        """Fit the model on `df_train` using `dict_params`."""
+
+    @abstractmethod
+    def get_anomalies(self):
+        """Compute the anomalies found by the model."""
+
+    def plot(self):
+        """Optional hook; does nothing unless a subclass overrides it."""
+        return None
diff --git a/anomalywatchdog/modelling/auto_arima_model.py b/anomalywatchdog/modelling/auto_arima_model.py
@@ -0,0 +1,79 @@
+from loaders.package.b_modelling.abstract_model \
+    import AnomalyDetectionModel
+from loaders.package.z_utils.create_fourier_terms \
+    import create_fourier_terms
+import pandas as pd
+from pmdarima.arima import auto_arima
+import numpy as np
+import tensorflow as tf
+import random
+
+def set_seed(seed=42):
+    """Seed the `random`, NumPy and TensorFlow random number generators."""
+    random.seed(seed)
+    np.random.seed(seed)
+    tf.random.set_seed(seed)
+
+# -- Seed once at import time
+set_seed(42)
+
+class AutoArimaModel(AnomalyDetectionModel):
+    """
+    Anomaly detection with a non-seasonal auto_arima model, optionally fed
+    with Fourier terms (and a holiday column) as exogenous regressors.
+    The model is fitted on construction.
+    """
+    def __init__(self,
+                 *args,
+                 **kwargs
+                 ):
+        super(AutoArimaModel, self).__init__(*args, **kwargs)
+        # -- fit model
+        self.model_fitted = self.fit_model(
+            df_train=self.df,
+            dict_params=self.dict_params)
+
+    def fit_model(self, df_train: pd.DataFrame, dict_params: dict):
+        """
+        Fits auto_arima on df_train['value'] and returns the fitted model.
+
+        dict_params must hold the boolean-like keys 'weekly_seasonality',
+        'monthly_seasonality' and 'yearly_seasonality'; each enabled one
+        adds Fourier terms (K=3) of period 7, 30 and 365 respectively.
+        dict_params['features']['holidays'] is read only when at least one
+        seasonality is enabled and then adds df_train['holiday'].
+        """
+        # -- Fourier terms are built from the data and parameters passed
+        # -- in, not from self.df / self.dict_params
+        seasonal_periods = (
+            ('weekly_seasonality', 7),
+            ('monthly_seasonality', 30),
+            ('yearly_seasonality', 365),
+        )
+        list_df_fourier = [
+            create_fourier_terms(df_train, freq=period, K=3)
+            for flag, period in seasonal_periods
+            if dict_params[flag]
+        ]
+        if not list_df_fourier:
+            # NOTE(review): the holiday feature is not used on this path,
+            # even if enabled in dict_params - confirm this is intended
+            return auto_arima(
+                df_train['value'], seasonal=False, suppress_warnings=True
+            )
+        df_exogenous = pd.concat(list_df_fourier, axis=1)
+        if dict_params['features']['holidays']:
+            df_exogenous = pd.concat(
+                [df_exogenous, df_train['holiday']], axis=1
+            )
+        # NOTE(review): the keyword for exogenous regressors depends on
+        # the installed pmdarima version - confirm 'exogenous' is accepted
+        return auto_arima(
+            df_train['value'], seasonal=False, suppress_warnings=True,
+            exogenous=df_exogenous
+        )
+
+    def get_anomalies(self):
+        """
+        Flags as anomalies the values outside the in-sample confidence
+        interval and stores the result in self.df with columns
+        ['date', 'value', 'anomaly', 'model']. Returns nothing.
+        """
+        # -- Predict over train
+        # NOTE(review): no regressors are passed here although the model
+        # may have been fitted with them - confirm this works as expected
+        df_fit, df_confidence_intervals = (
+            self.model_fitted.predict_in_sample(return_conf_int=True)
+        )
+        df_anomaly = pd.concat([
+            self.df[["date", "value"]].copy(),
+            df_fit,
+            pd.DataFrame(df_confidence_intervals)
+            ], axis=1)
+        df_anomaly.columns = [
+            'date', 'value', 'yhat', 'yhat_lower', 'yhat_upper'
+        ]
+        # -- Anomaly: value above the upper or below the lower bound
+        df_anomaly['anomaly'] = False
+        df_anomaly.loc[
+            (df_anomaly['value'] > df_anomaly['yhat_upper'])
+            | (df_anomaly['value'] < df_anomaly['yhat_lower']),
+            'anomaly'
+        ] = True
+        df_anomaly = df_anomaly[["date", "value", "anomaly"]].copy()
+        df_anomaly['model'] = 'auto_arima'
+        df_anomaly['date'] = pd.to_datetime(df_anomaly['date'])
+        self.df = df_anomaly.copy()