-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from adevinta/feature/first_part_development
fix: First part of development
- Loading branch information
Showing
18 changed files
with
1,413 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# AnomalyWatchdog | ||
|
||
AnomalyWatchdog detects outliers for time series using both statistical and | ||
machine learning approaches and showcases them. It works for daily, weekly | ||
and monthly data. | ||
|
||
If a time series split in different dimensions is provided, AnomalyWatchdog | ||
first groups the data by the id provided and analyzes outliers at its highest | ||
level. If an outlier is detected, it will analyze outliers at the different | ||
dimensions to detect the origin of the anomaly. | ||
|
||
## Installation | ||
|
||
```bash | ||
|
||
pip install AnomalyWatchdog | ||
|
||
``` | ||
|
||
## Quickstart | ||
|
||
To detect anomalies in your data, pass the following parameters | ||
to the AnomalyWatchdog class, as shown below. | ||
|
||
```{python, error=TRUE, include=TRUE} | ||
from anomalywatchdog import AnomalyWatchdog | ||
anomaly_watchdog = AnomalyWatchdog( | ||
df: Union[pd.DataFrame, DataFrame], | ||
column_date: str, | ||
column_target: str, | ||
granularity: str, | ||
columns_dimension: list[str] = None, | ||
models_to_use: List[str] = ['auto_arima', 'Prophet'], | ||
check_history: bool = False | ||
) | ||
``` | ||
|
||
### Inputs | ||
AnomalyWatchdog has the following inputs: | ||
- df: pandas DataFrame or spark DataFrame that contains the required column_id, column_date, column_target and columns_dimension. | ||
- column_date: String containing the column name of the time series dates. Values should be str in format YYYY-MM-DD (e.g. 2020-01-30). | ||
- column_target: String containing the column name of the time series values. Values should be float or int. | ||
- granularity: String containing the granularity of the time series data. Values available are "D" for daily, "M" for monthly and "W" for weekly data. | ||
- columns_dimension: List of strings containing the column dimension names representing the disaggregation of the data if any. | ||
- models_to_use: List of strings containing the models available. Models available are "autoencoder_basic", "autoencoder_lstm", "prophet" and "auto_arima". If no value is provided, AnomalyWatchdog uses only "prophet" and "auto_arima". | ||
- check_history: Boolean that checks outliers in the complete history of the time series if True, and only in the last day if false (default). | ||
|
||
### Outputs | ||
AnomalyWatchdog has two outputs, one of which is only delivered if | ||
columns_dimension parameter is specified. | ||
|
||
```{python, error=TRUE, include=TRUE} | ||
# -- AnomalyWatchdog output for main time series | ||
anomaly_watchdog.df_anomaly | ||
# -- AnomalyWatchdog output for each of the dimensions (only if columns_dimension is specified) | ||
anomaly_watchdog.df_anomaly_dimension | ||
``` | ||
|
80 changes: 80 additions & 0 deletions
80
anomalywatchdog/anomaly_features/CalendarFeatureCreator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import pandas as pd | ||
import holidays | ||
from datetime import date | ||
from datetime import timedelta | ||
|
||
|
||
class CalendarFeatureCreator:
    """Attach a per-period public-holiday count to a time series.

    The frame handed to the constructor must expose a ``date`` column (read
    when the holiday calendar is built and used as the merge key) and a
    ``value`` column (kept in the output of :meth:`add_holidays`).
    """

    def __init__(self, df: pd.DataFrame, country: str):
        """Store the series and build the holiday table for ``country``.

        Args:
            df: Time series with ``date`` and ``value`` columns.
            country: Name of an attribute of the ``holidays`` package that,
                when called without arguments, yields a holiday calendar.
        """
        # -- Main data
        self.df = df
        self.country = country
        # -- Initialize holidays
        self.df_holidays = self.__get_holidays()

    def __get_holidays(self) -> pd.DataFrame:
        """Return one row per holiday with columns ``date`` and ``holiday``.

        Raises:
            AttributeError: if ``self.country`` is not an attribute of the
                ``holidays`` package.
        """
        # Resolve the calendar by attribute lookup instead of ``eval`` so a
        # country string can never be executed as arbitrary code.
        calendar = getattr(holidays, self.country)()
        # First four characters of the stringified date are taken as the year.
        years = self.df['date'].astype(str).str[0:4]
        min_year = int(years.min())
        max_year = int(years.max()) + 1
        # The original code notes that the calendar must be probed once per
        # year before it is converted (described there as a workaround for a
        # bug in the package); the membership test is kept for that effect.
        for year in range(min_year, max_year):
            _ = date(year, 1, 1) in calendar
        df_holidays = pd.DataFrame.from_dict(
            calendar,
            orient='index'
        ).reset_index()
        df_holidays["date"] = pd.to_datetime(df_holidays['index'])
        df_holidays['holiday'] = 1
        return df_holidays[["date", "holiday"]].copy()

    def add_holidays(self, granularity):
        """Replace ``self.df`` with ``date``, ``value`` and a ``holiday`` count.

        Args:
            granularity: ``"M"`` counts holidays per calendar month, ``"W"``
                counts holidays inside the 7-day window starting at each
                ``date`` of the series; any other value merges the holiday
                table on the exact date.

        Periods without holidays get ``0`` (missing values are filled).
        """
        df_holiday_granular = self.df_holidays.copy()
        if granularity == "M":
            # -- Compute holidays by month (key = "YYYY-MM")
            df_holiday_granular['date'] = (
                df_holiday_granular["date"].astype(str).str[0:7]
            )
            df_holiday_granular = (
                df_holiday_granular
                .groupby('date')["holiday"].sum().reset_index()
            )
            df_holiday_granular['date'] = (
                pd.to_datetime(df_holiday_granular['date'] + '-01')
            )
        elif granularity == "W":
            # -- Compute holidays by week
            df_ts = self.df.copy()
            # ----- Dummy key to build the cartesian product
            df_holiday_granular['key'] = 0
            df_ts['key'] = 0
            # ----- Last day of the week that starts at each series date
            df_ts['date_max_week'] = df_ts['date'] + timedelta(days=6)
            df_holiday_granular = df_holiday_granular.rename(
                columns={'date': 'date_holidays'}
            )
            df_ts = df_ts.merge(
                df_holiday_granular,
                on='key',
                how='outer'
            )
            # ----- Bring every side to UTC before comparing: the holiday
            # dates are tz-naive while the series dates may be tz-aware, and
            # pandas refuses to compare naive with aware timestamps.
            holiday_day = pd.to_datetime(df_ts['date_holidays'], utc=True)
            week_start = pd.to_datetime(df_ts['date'], utc=True)
            week_end = pd.to_datetime(df_ts['date_max_week'], utc=True)
            in_week = (holiday_day >= week_start) & (holiday_day <= week_end)
            # ----- Aggregate holidays by week
            df_holiday_granular = (
                df_ts.loc[in_week]
                .groupby('date')['holiday'].sum().reset_index()
            )
        # -- The merge key is converted to UTC on the holiday side.
        df_holiday_granular['date'] = pd.to_datetime(
            df_holiday_granular['date'],
            utc=True
        )
        self.df = self.df.merge(
            df_holiday_granular,
            on='date',
            how='left'
        )[["date", "value", "holiday"]].fillna(0)
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# -- Packages | ||
import pandas as pd | ||
|
||
class DataADHandler:
    """
    Prepare a time series for anomaly detection.

    On construction the frame is re-indexed onto a regular calendar of the
    requested granularity, its ``date`` column is converted to UTC, the
    ``value`` column is summed per date into ``df_grouped`` and ``df`` is
    sorted by date.
    """
    def __init__(
            self,
            df:pd.DataFrame,
            granularity:str = 'D'
    ):
        # -- Inputs and state
        self.df = df
        self.granularity = granularity
        self.group_columns = ["date"]
        self.df_grouped = pd.DataFrame()
        # -- Preparation pipeline (order matters: grouping uses the
        # expanded frame)
        self.__expand_dates()
        self.__group_by()
        self.__get_ordered_dataframe()

    def __expand_dates(self):
        """Left-join the data onto a regular date calendar.

        ``'W'`` steps 7 days from the earliest date, ``'M'`` uses month
        starts, anything else uses daily steps.
        """
        first_date = self.df["date"].min()
        last_date = self.df["date"].max()
        if self.granularity == 'W':
            elapsed_days = (last_date - first_date).days
            calendar = pd.date_range(
                start=first_date,
                periods=int(elapsed_days / 7) + 1,
                freq="7D",
            )
        else:
            step = "MS" if self.granularity == 'M' else "D"
            calendar = pd.date_range(
                start=first_date, end=last_date, freq=step
            )
        calendar = calendar.tolist()
        # -- One-column frame holding the calendar dates (built through a
        # throw-away counter column that is dropped right after)
        df_calendar = pd.DataFrame(
            range(len(calendar)),
            calendar
        ).reset_index()
        df_calendar.columns = ["date", 'index']
        df_calendar.drop(['index'], axis=1, inplace=True)
        # -- Attach the observed rows; dates without data keep missing values
        expanded = df_calendar.merge(
            self.df,
            on=self.group_columns,
            how='left'
        )
        # -- Dates are stored as UTC timestamps
        expanded["date"] = pd.to_datetime(expanded["date"], utc=True)
        self.df = expanded

    def __get_ordered_dataframe(self):
        """Sort ``self.df`` in place by the grouping columns."""
        self.df.sort_values(by=self.group_columns, inplace=True)

    def __group_by(self):
        """Store the per-date sum of ``value`` in ``self.df_grouped``."""
        per_date = self.df.groupby(self.group_columns)["value"]
        self.df_grouped = per_date.sum().reset_index()
|
||
|
||
|
||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import pandas as pd | ||
from loaders.package.c_engine.model_factory \ | ||
import ModelFactory | ||
|
||
|
||
class ModelTrainer:
    """Build one model per requested name through ``ModelFactory``."""

    def __init__(self, model_names, df_train, config):
        """Keep the model names, the training frame and the configuration.

        ``config`` is indexed by model name when training.
        """
        self.model_names = model_names
        self.df_train = df_train
        self.config = config

    @staticmethod
    def filter_config_by(config, model) -> dict:
        """Return the entry of ``config`` stored under ``model``."""
        model_config = config[model]
        return model_config

    def train(self) -> pd.DataFrame:
        """Create every requested model and return them in a DataFrame.

        The result has one row per model name and a single ``model`` column
        holding whatever ``ModelFactory.get_model`` returned. Each model
        receives its own copy of the training frame.
        """
        print(">> Training:")
        trained = [
            {
                'model': ModelFactory.get_model(
                    model=name,
                    df_train=self.df_train.copy(),
                    dict_config=ModelTrainer.filter_config_by(
                        self.config,
                        name
                    )
                )
            }
            for name in self.model_names
        ]
        return pd.DataFrame(trained)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# -- ML main packages | ||
from abc import ABC, abstractmethod | ||
import pandas as pd | ||
|
||
|
||
class AnomalyDetectionModel(ABC):
    """Abstract base for anomaly detection models.

    Concrete subclasses must implement :meth:`fit_model` and
    :meth:`get_anomalies`; :meth:`plot` is an optional hook.
    """

    def __init__(self, df:pd.DataFrame, dict_params:dict):
        """Store the data frame and the parameter dictionary."""
        self.df, self.dict_params = df, dict_params

    @abstractmethod
    def fit_model(self, df_train:pd.DataFrame, dict_params:dict):
        """Fit the model on ``df_train`` using ``dict_params``."""

    @abstractmethod
    def get_anomalies(self):
        """Detect anomalies with the fitted model."""

    def plot(self):
        """Optional plotting hook; does nothing by default."""
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
from loaders.package.b_modelling.abstract_model \ | ||
import AnomalyDetectionModel | ||
from loaders.package.z_utils.create_fourier_terms \ | ||
import create_fourier_terms | ||
import pandas as pd | ||
from pmdarima.arima import auto_arima | ||
import numpy as np | ||
import tensorflow as tf | ||
import random | ||
|
||
def set_seed(seed=42):
    """Seed the ``random``, NumPy and TensorFlow random generators."""
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Seed every generator as soon as this module is imported.
set_seed(42)
|
||
class AutoArimaModel(AnomalyDetectionModel):
    """Anomaly detector backed by ``pmdarima``'s ``auto_arima``.

    The model is fitted on construction. :meth:`get_anomalies` then flags
    every observation that falls outside the in-sample confidence interval
    and stores the result in ``self.df``.
    """

    # (parameter key, period passed to create_fourier_terms)
    _SEASONAL_PERIODS = (
        ('weekly_seasonality', 7),
        ('monthly_seasonality', 30),
        ('yearly_seasonality', 365),
    )

    def __init__(self,
                 *args,
                 **kwargs
                 ):
        """Forward all arguments to the base class, then fit the model."""
        super().__init__(*args, **kwargs)
        # -- fit model
        self.model_fitted = self.fit_model(
            df_train=self.df,
            dict_params=self.dict_params)

    def fit_model(self, df_train:pd.DataFrame, dict_params:dict):
        """Fit ``auto_arima`` on ``df_train['value']`` and return the result.

        Args:
            df_train: Frame with a ``value`` column, plus a ``holiday``
                column when the holiday feature is enabled.
            dict_params: Must contain the boolean keys
                ``weekly_seasonality``, ``monthly_seasonality`` and
                ``yearly_seasonality``; ``['features']['holidays']`` is read
                when at least one seasonality is enabled.

        Fourier terms and the holiday flag are built from the arguments
        (not from ``self``) so the regressors always line up with the series
        that is actually being fitted.
        """
        list_df_fourier = [
            create_fourier_terms(df_train, freq=period, K=3)
            for flag, period in self._SEASONAL_PERIODS
            if dict_params[flag]
        ]
        if not list_df_fourier:
            # NOTE(review): the holiday feature is only used together with
            # at least one seasonality; without Fourier terms it is ignored.
            # Confirm this is intended.
            return auto_arima(
                df_train['value'], seasonal=False, suppress_warnings=True
            )
        df_exogenous = pd.concat(list_df_fourier, axis=1)
        if dict_params['features']['holidays']:
            df_exogenous = pd.concat(
                [df_exogenous, df_train['holiday']], axis=1
            )
        # NOTE(review): recent pmdarima releases name this argument ``X``;
        # verify ``exogenous`` against the pinned pmdarima version.
        return auto_arima(
            df_train['value'], seasonal=False, suppress_warnings=True,
            exogenous=df_exogenous
        )

    def get_anomalies(self):
        """Flag observations outside the in-sample confidence interval.

        Replaces ``self.df`` with a frame holding ``date``, ``value``,
        ``anomaly`` (bool) and ``model`` (always ``'auto_arima'``). Nothing
        is returned.
        """
        # -- Predict over train
        df_fit, df_confidence_intervals = (
            self.model_fitted.predict_in_sample(return_conf_int=True)
        )
        # NOTE(review): the column-wise concat aligns on the index; this
        # presumably relies on self.df having a default 0..n-1 index.
        df_anomaly = pd.concat([
            self.df[["date", "value"]].copy(),
            df_fit,
            pd.DataFrame(df_confidence_intervals)
        ], axis=1)
        df_anomaly.columns = [
            'date', 'value', 'yhat', 'yhat_lower', 'yhat_upper'
        ]
        # -- An observation is anomalous when it leaves the interval
        df_anomaly['anomaly'] = (
            (df_anomaly['value'] > df_anomaly['yhat_upper'])
            | (df_anomaly['value'] < df_anomaly['yhat_lower'])
        )
        df_anomaly = df_anomaly[["date", "value", "anomaly"]].copy()
        df_anomaly['model'] = 'auto_arima'
        df_anomaly['date'] = pd.to_datetime(df_anomaly['date'])
        self.df = df_anomaly.copy()
Oops, something went wrong.