-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from adevinta/feature/second_part_development
fix: Last part of initial development
- Loading branch information
Showing
29 changed files
with
3,607 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import pandas as pd | ||
from anomalywatchdog.anomaly_features\ | ||
.CalendarFeatureCreator import CalendarFeatureCreator | ||
|
||
|
||
def create_features(
        df: pd.DataFrame,
        granularity: str,
        country: str = "Spain"
) -> pd.DataFrame:
    """Enrich a time-series frame with calendar (holiday) features.

    Parameters
    ----------
    df : pd.DataFrame
        Input time series; must have the columns that
        ``CalendarFeatureCreator`` expects (presumably a date column —
        confirm against its implementation).
    granularity : str
        Time-series granularity flag forwarded to ``add_holidays``.
    country : str, default "Spain"
        Country whose holiday calendar is applied. Previously hard-coded
        to "Spain"; exposed as a keyword parameter with the same default
        so existing callers are unaffected.

    Returns
    -------
    pd.DataFrame
        The input frame with holiday feature columns added.
    """
    # -- Create calendar features
    calendar_feature_creator = CalendarFeatureCreator(df=df, country=country)
    calendar_feature_creator.add_holidays(granularity=granularity)
    # -- Return output
    return calendar_feature_creator.df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
from anomalywatchdog.data_treatment.data_handler\ | ||
import DataADHandler | ||
from anomalywatchdog.data_treatment.input_checker\ | ||
import InputChecker | ||
from anomalywatchdog.data_treatment.model_updater\ | ||
import ModelUpdater | ||
from anomalywatchdog.engine.model_trainer\ | ||
import ModelTrainer | ||
from anomalywatchdog.engine.model_predictor\ | ||
import ModelPredictor | ||
from anomalywatchdog.anomaly_features.create_features\ | ||
import create_features | ||
import os.path as op | ||
import yaml | ||
from logging import Logger | ||
import pandas as pd | ||
from typing import Union, List | ||
from pyspark.sql import DataFrame | ||
|
||
|
||
class AnomalyWatchdog:
    """End-to-end anomaly detection over a (pandas or Spark) time series.

    The whole pipeline runs inside ``__init__``:

    1. validate/normalize inputs (``InputChecker``),
    2. aggregate the series (``DataADHandler``),
    3. add calendar features (``create_features``),
    4. train models and predict anomalies (``ModelTrainer`` /
       ``ModelPredictor``),
    5. optionally drill anomalies down per dimension column.

    Results are exposed as ``self.df_anomaly`` (global anomalies) and
    ``self.df_anomaly_dimension`` (per dimension value).
    """

    # Models used when the caller does not pass ``models_to_use``.
    # Stored as a tuple so the class-level default is immutable.
    DEFAULT_MODELS = ('auto_arima', 'Prophet')

    def __init__(
            self,
            df: Union[pd.DataFrame, DataFrame],
            column_date: str,
            column_target: str,
            granularity: str,
            columns_dimension: List[str] = None,
            models_to_use: List[str] = None,
            check_history: bool = False
    ):
        """Validate inputs and run the full detection pipeline.

        :param df: input data (pandas or pyspark DataFrame).
        :param column_date: name of the date column in ``df``.
        :param column_target: name of the metric column in ``df``.
        :param granularity: time-series granularity (validated by
            ``InputChecker``; presumably 'D'/'W'/'M' per config_ad.yaml).
        :param columns_dimension: optional dimension columns for the
            drill-down; ``None`` disables the drill-down.
        :param models_to_use: model names to train; defaults to
            ``DEFAULT_MODELS``. (The original used a mutable list
            default, a Python anti-pattern — replaced with a ``None``
            sentinel; behavior for callers is unchanged.)
        :param check_history: if True, drill down when any historical
            anomaly exists; if False, only when the latest date is
            anomalous.
        """
        # -- Resolve the default model list (avoids shared mutable default).
        if models_to_use is None:
            models_to_use = list(self.DEFAULT_MODELS)
        # -- Initialize logs
        self.log = Logger(name="anomaly_detector")
        # -- Read package-local configuration.
        current_path = op.dirname(__file__)
        with open(op.join(current_path, "config_ad.yaml"), 'rb') as fp:
            self.config = yaml.safe_load(fp)
        # -- Check inputs (InputChecker may normalize them and the config).
        input_checker = InputChecker(
            df=df.copy(),
            column_date=column_date,
            column_target=column_target,
            columns_dimension=columns_dimension,
            granularity=granularity,
            models_to_use=models_to_use,
            check_history=check_history,
            config=self.config
        )
        # -- Adopt the validated/normalized inputs.
        self.df_input = input_checker.df.copy()
        self.column_target = input_checker.column_target
        self.columns_dimension = input_checker.columns_dimension
        self.config = input_checker.config
        self.granularity = granularity
        # -- Canonical internal column names used by the rest of the class.
        self.df_input.rename(
            columns={column_date: "date", column_target: "value"},
            inplace=True
        )
        # -- Latest observed date; drives the drill-down decision.
        self.max_date = self.df_input["date"].max()
        # -- Initialize outputs.
        self.df_anomaly = pd.DataFrame()
        self.df_anomaly_dimension = pd.DataFrame()
        self.log.info(">> 1. Data Treatment")
        data_ad_handler = DataADHandler(
            df=self.df_input,
            granularity=self.granularity
        )
        df = data_ad_handler.df_grouped.copy()
        df = df[["date", "value"]].copy()
        self.log.info(">> 1.1 Get TS properties")
        # NOTE(review): constructed for side effects only — presumably
        # mutates self.config in place; confirm against ModelUpdater.
        ModelUpdater(
            df=df.copy(),
            granularity=self.granularity,
            config=self.config
        )
        self.log.info(">> 1.2 Get features")
        df = create_features(df=df.copy(), granularity=self.granularity)
        self.log.info(">> 2. Get global anomalies")
        self.df_anomaly = self.__detect_anomalies(
            df_handled=df.copy(),
            list_models=self.config['models_to_use']
        )
        # Fixed duplicated step number in the log message (was ">> 2.").
        self.log.info(">> 3. Get drilled anomalies")
        self.df_anomaly_dimension = self.__detect_granular_anomalies(
            df_predictions=self.df_anomaly.copy(),
            columns_dimension=self.columns_dimension,
            granularity=self.granularity,
            check_history=check_history
        )
        # Debug prints replaced with logging so library users keep a
        # clean stdout.
        self.log.debug("df_anomaly:\n%s", self.df_anomaly)
        self.log.debug("df_anomaly_dimension:\n%s", self.df_anomaly_dimension)

    def __detect_anomalies(
            self,
            df_handled: pd.DataFrame,
            list_models: List[str]
    ) -> pd.DataFrame:
        """Train the given models on ``df_handled`` and return predictions.

        :param df_handled: prepared time series with feature columns.
        :param list_models: model names to train.
        :return: predictions frame with (at least) 'model', 'date' and
            'anomaly' columns, as consumed by the drill-down.
        """
        # Note: annotation fixed from ``-> pd.DataFrame()`` (which built
        # an empty frame at definition time) to the class itself.
        self.log.info(">> 2.1. Train Models")
        model_trainer = ModelTrainer(
            model_names=list_models,
            df_train=df_handled.copy(),
            config=self.config
        )
        df_models_trained = model_trainer.train()
        self.log.info(">> 2.2. Predict Models")
        model_predictor = ModelPredictor(
            df_models_trained=df_models_trained)
        return model_predictor.predict()

    def __detect_granular_anomalies(
            self,
            df_predictions: pd.DataFrame,
            columns_dimension: list,
            granularity: str,
            check_history: bool
    ) -> pd.DataFrame:
        """Re-run detection per dimension value for anomalous models.

        For every model whose global predictions contain an anomaly
        (anywhere in history when ``check_history`` is True, otherwise
        only on the latest date), re-run the single-model pipeline on
        each distinct value of each dimension column.

        :return: concatenation of per-dimension predictions, each with
            extra 'dimension' and 'dimension_value' columns; an empty
            DataFrame when there is nothing to drill into.
        """
        if columns_dimension is None:
            return pd.DataFrame()
        list_df_dimension = []
        for model in df_predictions["model"].unique():
            mask_model = df_predictions['model'] == model
            is_anomaly_max_date = (
                df_predictions.loc[
                    mask_model & (df_predictions['date'] == self.max_date),
                    'anomaly'
                ].sum() > 0
            )
            is_anomaly_history = (
                df_predictions.loc[mask_model, 'anomaly'].sum() > 0
            )
            should_drill = (
                is_anomaly_history if check_history else is_anomaly_max_date
            )
            # BUG FIX: the original returned inside this loop, so only the
            # first model was ever drilled down, and a non-anomalous first
            # model aborted the entire drill-down for all other models.
            if not should_drill:
                continue
            for column_dimension in columns_dimension:
                dimension_values = [
                    dimension for dimension
                    in self.df_input[column_dimension].unique()
                    if dimension is not None
                ]
                for dimension_value in dimension_values:
                    df_slice = (
                        self.df_input
                        .loc[self.df_input[column_dimension]
                             == dimension_value,
                             ["date", "value"]]
                        .reset_index(drop=True)
                        .copy()
                    )
                    data_ad_handler = DataADHandler(
                        df=df_slice,
                        granularity=granularity
                    )
                    df = data_ad_handler.df_grouped.copy()
                    df = create_features(df=df.copy(),
                                         granularity=self.granularity)
                    df_predictions_element = self.__detect_anomalies(
                        df_handled=df.copy(),
                        list_models=[model]
                    )
                    df_predictions_element['dimension'] = column_dimension
                    df_predictions_element['dimension_value'] = (
                        dimension_value
                    )
                    list_df_dimension.append(df_predictions_element)
        # pd.concat([]) raises — preserve the original's empty-frame result.
        if not list_df_dimension:
            return pd.DataFrame()
        return pd.concat(list_df_dimension)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Anomaly-detection configuration (config_ad.yaml).
# NOTE(review): this list uses lowercase 'prophet' while the Python
# default argument uses 'Prophet' — confirm which spelling the model
# trainer actually matches on.
models_to_use: [
  "autoencoder_lstm",
  "autoencoder_conv",
  "prophet",
  "auto_arima"
]
# Global seasonality flags for the input time series.
ts_seasonality:
  weekly_seasonality: False
  monthly_seasonality: False
  yearly_seasonality: False
# Convolutional-autoencoder hyperparameters, keyed per granularity
# (D = daily, W = weekly, M = monthly — presumably; confirm in trainer).
autoencoder_conv:
  granularity: 'D'
  timesteps:
    D: 120 #steps back in the TS in each step
    W: 24
    M: 8
  activation: 'relu'
  loss: 'mse'
  epochs: 100
  batch_size:
    D: 128
    W: 32
    M: 12
  validation_split: 0.1
  learning_rate: 0.001
  # Reconstruction-error threshold: quantile plus multiplier
  # (presumably an IQR-style cutoff — confirm in the predictor).
  quantile: 0.75
  quantile_multiplier: 1.5
  features:
    holidays: True
# LSTM-autoencoder hyperparameters (same schema as autoencoder_conv).
autoencoder_lstm:
  granularity: 'D'
  timesteps:
    D: 30 #steps back in the TS in each step
    W: 12
    M: 8
  activation: 'relu'
  loss: 'mse'
  epochs: 50
  batch_size:
    D: 32
    W: 16
    M: 12
  validation_split: 0.2
  learning_rate: 0.001
  quantile: 0.75
  quantile_multiplier: 1.5
  features:
    holidays: True
# Prophet model settings.
prophet:
  weekly_seasonality: False
  monthly_seasonality: False
  yearly_seasonality: False
  seasonality_mode: "additive"
  interval_width: 0.95
  changepoint_range: 0.8
  features:
    holidays: True
# auto_arima model settings.
auto_arima:
  weekly_seasonality: False
  monthly_seasonality: False
  yearly_seasonality: False
  features:
    holidays: True
|
Oops, something went wrong.