Skip to content

Commit

Permalink
Merge pull request #1 from adevinta/feature/first_part_development
Browse files Browse the repository at this point in the history
fix: First part of development
  • Loading branch information
alexvazquez1988 authored Jul 1, 2024
2 parents 717a47c + 079d311 commit d034ced
Show file tree
Hide file tree
Showing 18 changed files with 1,413 additions and 0 deletions.
59 changes: 59 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# AnomalyWatchdog

AnomalyWatchdog detects outliers in time series using both statistical and
machine learning approaches and showcases them. It works with daily, weekly
and monthly data.

If a time series split in different dimensions is provided, AnomalyWatchdog
first groups the data by the id provided and analyzes outliers at its highest
level. If an outlier is detected, it will analyze outliers at the different
dimensions to detect the origin of the anomaly.

## Installation

```bash

pip install AnomalyWatchdog

```

## Quickstart

To detect anomalies in your data, pass the following parameters to the
AnomalyWatchdog class, as shown below.

```{python, error=TRUE, include=TRUE}
from anomalywatchdog import AnomalyWatchdog
anomaly_watchdog = AnomalyWatchdog(
df: Union[pd.DataFrame, DataFrame],
column_date: str,
column_target: str,
granularity: str,
columns_dimension: list[str] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
)
```

### Inputs
AnomalyWatchdog has the following inputs:
- df: pandas DataFrame or spark DataFrame that contains the required column_id, column_date, column_target and columns_dimension.
- column_date: String containing the column name of the time series dates. Values should be str in format YYYY-MM-DD (i.e. 2020-01-30).
- column_target: String containing the column name of the time series values. Values should be float or int.
- granularity: String containing the granularity of the time series data. Values available are "D" for daily, "M" for monthly and "W" for weekly data.
- columns_dimension: List of strings containing the column dimension names representing the disaggregation of the data if any.
- models_to_use: List of strings containing the models to use. Models available are "autoencoder_basic", "autoencoder_lstm", "prophet" and "auto_arima". If no value is provided, AnomalyWatchdog runs only "prophet" and "auto_arima".
- check_history: Boolean. If True, outliers are checked in the complete history of the time series; if False (default), only in the last day.

### Outputs
AnomalyWatchdog has two outputs, one of which is only delivered if
columns_dimension parameter is specified.

```{python, error=TRUE, include=TRUE}
# -- AnomalyWatchdog output for main time series
anomaly_watchdog.df_anomaly
# -- AnomalyWatchdog output for each of the dimensions (only if columns_dimension is specified)
anomaly_watchdog.df_anomaly_dimension
```

80 changes: 80 additions & 0 deletions anomalywatchdog/anomaly_features/CalendarFeatureCreator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pandas as pd
import holidays
from datetime import date
from datetime import timedelta


class CalendarFeatureCreator:
    """Attach public-holiday counts to a time series.

    The holiday calendar is built once, at construction time, for the years
    covered by ``df``; :meth:`add_holidays` then aggregates it to the
    requested granularity and merges it into ``self.df``.
    """

    def __init__(self, df: pd.DataFrame, country: str):
        """
        :param df: frame with a ``date`` column (``add_holidays`` also
            selects a ``value`` column from it).
        :param country: name of an attribute of the ``holidays`` module; the
            attribute is called with no arguments to build the calendar.
        """
        # -- Main data
        self.df = df
        self.country = country
        # -- Initialize holidays
        self.df_holidays = self.__get_holidays()

    def __get_holidays(self):
        """Return a frame with one row per holiday: ``date`` and ``holiday``
        (always 1)."""
        # Plain attribute lookup instead of eval(): an unknown country still
        # raises AttributeError, but arbitrary text is never executed.
        calendar = getattr(holidays, self.country)()
        # Original author's workaround for a bug in the holidays package:
        # probing one date per year makes the calendar hold every year in
        # the range covered by the data.
        years_as_text = self.df['date'].astype(str).str[0:4]
        min_year = int(years_as_text.min())
        max_year = int(years_as_text.max())
        for year in range(min_year, max_year + 1):
            _ = date(year, 1, 1) in calendar
        df_holidays = pd.DataFrame.from_dict(
            calendar,
            orient='index'
        ).reset_index()
        df_holidays["date"] = pd.to_datetime(df_holidays['index'])
        df_holidays['holiday'] = 1
        return df_holidays[["date", "holiday"]].copy()

    def add_holidays(self, granularity):
        """Add a ``holiday`` count column to ``self.df``.

        :param granularity: ``"M"`` counts holidays per calendar month,
            ``"W"`` counts holidays in the 7 days starting at each row's
            date; any other value matches holidays on the exact date.

        ``self.df`` is replaced by a frame with the columns ``date``,
        ``value`` and ``holiday``; its ``date`` column is timezone-aware
        (UTC) and missing values are filled with 0. Nothing is returned.
        """
        # Both sides are normalised to UTC up front so that the date
        # comparisons and the final merge always operate on like-typed keys.
        df_base = self.df.copy()
        df_base['date'] = pd.to_datetime(df_base['date'], utc=True)
        df_holiday_granular = self.df_holidays.copy()
        df_holiday_granular['date'] = pd.to_datetime(
            df_holiday_granular['date'], utc=True
        )
        if granularity == "M":
            # -- Compute holidays by month (keyed by first day of month)
            df_holiday_granular['date'] = (
                df_holiday_granular['date'].dt.strftime('%Y-%m')
            )
            df_holiday_granular = (
                df_holiday_granular
                .groupby('date')["holiday"].sum().reset_index()
            )
            df_holiday_granular['date'] = pd.to_datetime(
                df_holiday_granular['date'] + '-01', utc=True
            )
        elif granularity == "W":
            # -- Compute holidays by week
            df_ts = df_base.copy()
            # ----- Dummy key to build the cartesian product
            df_holiday_granular['key'] = 0
            df_ts['key'] = 0
            # ----- Last day of the 7-day window starting at each date
            df_ts['date_max_week'] = df_ts['date'] + timedelta(days=6)
            df_holiday_granular = df_holiday_granular.rename(
                columns={'date': 'date_holidays'}
            )
            df_ts = df_ts.merge(
                df_holiday_granular,
                on='key',
                how='outer'
            )
            # ----- Keep holidays that fall inside each window
            in_window = (
                (df_ts['date_holidays'] >= df_ts['date'])
                & (df_ts['date_holidays'] <= df_ts['date_max_week'])
            )
            # ----- Aggregate holidays by week
            df_holiday_granular = (
                df_ts.loc[in_window]
                .groupby('date')['holiday'].sum().reset_index()
            )
        # fillna(0) is applied to the whole selection, so missing entries in
        # ``value`` are zero-filled as well as dates without holidays.
        self.df = df_base.merge(
            df_holiday_granular,
            on='date',
            how='left'
        )[["date", "value", "holiday"]].fillna(0)
Empty file.
Empty file.
66 changes: 66 additions & 0 deletions anomalywatchdog/data_treatment/data_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# -- Packages
import pandas as pd

class DataADHandler:
    """
    Prepare a time series for anomaly detection.

    On construction the series is re-indexed onto a gap-free calendar at the
    requested granularity (``self.df``) and summed per date
    (``self.df_grouped``).
    """
    def __init__(
            self,
            df: pd.DataFrame,
            granularity: str = 'D'
    ):
        """
        :param df: frame with a ``date`` column (datetimes or date strings
            that ``pd.to_datetime`` can parse) and a ``value`` column.
        :param granularity: ``'W'`` for 7-day steps, ``'M'`` for month
            starts; any other value is treated as daily.
        """
        # -- Main data and parameters
        self.df = df
        self.granularity = granularity
        self.group_columns = ["date"]
        self.df_grouped = pd.DataFrame()
        # -- Methods
        self.__expand_dates()
        self.__group_by()
        self.__get_ordered_dataframe()

    def __expand_dates(self):
        """Left-join the data onto the full date range so that missing
        periods appear as rows without values, then make dates UTC-aware."""
        # Work on a copy with real datetimes: the range arithmetic below
        # needs Timestamps, and the caller's frame must stay untouched.
        df_values = self.df.copy()
        df_values["date"] = pd.to_datetime(df_values["date"])
        min_date = df_values["date"].min()
        max_date = df_values["date"].max()
        if self.granularity == 'W':
            # One entry every 7 days, starting at the first observed date.
            num_weeks = (max_date - min_date).days // 7 + 1
            dates = pd.date_range(start=min_date,
                                  periods=num_weeks, freq="7D")
        elif self.granularity == 'M':
            # NOTE(review): "MS" yields month starts on or after min_date, so
            # a first observation that is not on day 1 has no matching row -
            # confirm monthly input is always keyed by first day of month.
            dates = pd.date_range(start=min_date,
                                  end=max_date, freq="MS")
        else:
            dates = pd.date_range(start=min_date,
                                  end=max_date, freq="D")
        # -- dates df
        df_dates = pd.DataFrame({"date": dates})
        # -- Add values
        self.df = df_dates.merge(
            df_values,
            on=self.group_columns,
            how='left'
        )
        # -- set correct date format
        self.df["date"] = pd.to_datetime(self.df["date"], utc=True)

    def __get_ordered_dataframe(self):
        """Sort ``self.df`` by date, in place."""
        self.df.sort_values(self.group_columns, inplace=True)

    def __group_by(self):
        """Store the per-date sum of ``value`` in ``self.df_grouped``."""
        # sum() skips missing values, so dates introduced by the expansion
        # get a total of 0 here.
        self.df_grouped = (
            self.df
            .groupby(self.group_columns)["value"]
            .sum()
            .reset_index()
        )




Empty file.
31 changes: 31 additions & 0 deletions anomalywatchdog/engine/model_trainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
from loaders.package.c_engine.model_factory \
import ModelFactory


class ModelTrainer:
    """Build one model per requested name through ``ModelFactory``."""

    def __init__(self, model_names, df_train, config):
        """
        :param model_names: iterable of model names to build.
        :param df_train: training frame; each model receives its own copy.
        :param config: mapping indexed by model name.
        """
        self.model_names = model_names
        self.df_train = df_train
        self.config = config

    @staticmethod
    def filter_config_by(config, model) -> dict:
        """Return the entry of ``config`` stored under ``model``."""
        return config[model]

    def train(self) -> pd.DataFrame:
        """Return a frame with a single ``model`` column holding the object
        returned by ``ModelFactory.get_model`` for each requested name."""
        print(">> Training:")
        rows = [
            {
                'model': ModelFactory.get_model(
                    model=name,
                    df_train=self.df_train.copy(),
                    dict_config=ModelTrainer.filter_config_by(
                        self.config,
                        name
                    )
                )
            }
            for name in self.model_names
        ]
        return pd.DataFrame(rows)
Empty file.
21 changes: 21 additions & 0 deletions anomalywatchdog/modelling/abstract_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -- ML main packages
from abc import ABC, abstractmethod
import pandas as pd


class AnomalyDetectionModel(ABC):
    """Common interface for the anomaly detection models.

    Concrete models must implement :meth:`fit_model` and
    :meth:`get_anomalies`; :meth:`plot` is an optional hook.
    """

    def __init__(self, df: pd.DataFrame, dict_params: dict):
        """Keep the data and the parameter dictionary on the instance."""
        self.df = df
        self.dict_params = dict_params

    @abstractmethod
    def fit_model(self, df_train: pd.DataFrame, dict_params: dict):
        """Fit the model on ``df_train`` using ``dict_params``."""

    @abstractmethod
    def get_anomalies(self):
        """Run the anomaly detection of the concrete model."""

    def plot(self):
        """Do nothing by default; subclasses may override."""
        return None
79 changes: 79 additions & 0 deletions anomalywatchdog/modelling/auto_arima_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from loaders.package.b_modelling.abstract_model \
import AnomalyDetectionModel
from loaders.package.z_utils.create_fourier_terms \
import create_fourier_terms
import pandas as pd
from pmdarima.arima import auto_arima
import numpy as np
import tensorflow as tf
import random

def set_seed(seed=42):
    """Seed the ``random``, NumPy and TensorFlow generators with ``seed``."""
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Seeding happens once, as a side effect of importing this module.
set_seed(42)

class AutoArimaModel(AnomalyDetectionModel):
    """Anomaly detector based on ``pmdarima.auto_arima``.

    The model is fitted at construction time; :meth:`get_anomalies` flags
    the observations that fall outside the in-sample confidence interval.
    """

    def __init__(self,
                 *args,
                 **kwargs
                 ):
        """Forward all arguments to ``AnomalyDetectionModel`` and fit."""
        super().__init__(*args, **kwargs)
        # -- fit model
        self.model_fitted = self.fit_model(
            df_train=self.df,
            dict_params=self.dict_params)

    def fit_model(self, df_train: pd.DataFrame, dict_params: dict):
        """Fit a non-seasonal auto-ARIMA on ``df_train['value']``.

        :param df_train: training frame; ``value`` is the target and
            ``holiday`` is read when the holiday feature is enabled.
        :param dict_params: must contain the boolean keys
            ``weekly_seasonality``, ``monthly_seasonality`` and
            ``yearly_seasonality``; ``['features']['holidays']`` is read
            when at least one of them is truthy.
        :return: the object returned by ``auto_arima``.
        """
        # Seasonality is handled with Fourier terms passed as exogenous
        # regressors; the terms are built from the frame being fitted.
        seasonal_periods = (
            ('weekly_seasonality', 7),
            ('monthly_seasonality', 30),
            ('yearly_seasonality', 365),
        )
        list_df_fourier = [
            create_fourier_terms(df_train, freq=period, K=3)
            for flag, period in seasonal_periods
            if dict_params[flag]
        ]
        if not list_df_fourier:
            # NOTE(review): the holiday regressor is not used on this path
            # even if dict_params['features']['holidays'] is set - confirm
            # that this is intended.
            return auto_arima(
                df_train['value'], seasonal=False, suppress_warnings=True
            )
        df_exogenous = pd.concat(list_df_fourier, axis=1)
        if dict_params['features']['holidays']:
            df_exogenous = pd.concat(
                [df_exogenous, df_train['holiday']], axis=1
            )
        # NOTE(review): newer pmdarima releases appear to name this argument
        # ``X`` - confirm the keyword against the pinned pmdarima version.
        return auto_arima(
            df_train['value'], seasonal=False, suppress_warnings=True,
            exogenous=df_exogenous
        )

    def get_anomalies(self):
        """Flag values outside the in-sample confidence interval.

        Replaces ``self.df`` with a frame holding ``date``, ``value``,
        ``anomaly`` (bool) and ``model`` (always ``'auto_arima'``).
        Nothing is returned.
        """
        # -- Predict over train
        _, confidence_intervals = (
            self.model_fitted.predict_in_sample(return_conf_int=True)
        )
        # The bounds are combined with the data by position, so the result
        # does not depend on the index labels carried by self.df.
        bounds = np.asarray(confidence_intervals)
        lower, upper = bounds[:, 0], bounds[:, 1]
        df_anomaly = self.df[["date", "value"]].reset_index(drop=True)
        values = df_anomaly['value'].to_numpy()
        df_anomaly['anomaly'] = (values > upper) | (values < lower)
        df_anomaly['model'] = 'auto_arima'
        df_anomaly['date'] = pd.to_datetime(df_anomaly['date'])
        self.df = df_anomaly
Loading

0 comments on commit d034ced

Please sign in to comment.