Merge pull request #2 from adevinta/feature/second_part_development
fix: Last part of initial development
epampols authored Jul 1, 2024
2 parents d034ced + 912cdc0 commit 4bdce08
Showing 29 changed files with 3,607 additions and 31 deletions.
15 changes: 15 additions & 0 deletions anomalywatchdog/anomaly_features/create_features.py
@@ -0,0 +1,15 @@
import pandas as pd
from anomalywatchdog.anomaly_features\
.CalendarFeatureCreator import CalendarFeatureCreator


def create_features(
df: pd.DataFrame,
granularity: str
) -> pd.DataFrame:
# -- Create calendar features
calendar_feature_creator = CalendarFeatureCreator(df=df, country="Spain")
calendar_feature_creator.add_holidays(granularity=granularity)
df_feature = calendar_feature_creator.df
# -- Return output
return df_feature
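
For orientation, a minimal usage sketch of create_features (not part of the diff). The sample frame and the "D" granularity code are illustrative assumptions based on how AnomalyWatchdog calls this function below; note the country is currently hardcoded to "Spain" inside the function.

# Hypothetical usage (assumed schema: a "date" and a "value" column,
# which is what AnomalyWatchdog passes in after renaming its inputs).
import pandas as pd
from anomalywatchdog.anomaly_features.create_features import create_features

df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=90, freq="D"),
    "value": [float(i % 7) for i in range(90)],
})
df_features = create_features(df=df, granularity="D")
print(df_features.head())
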
179 changes: 179 additions & 0 deletions anomalywatchdog/anomaly_watchdog.py
@@ -0,0 +1,179 @@
from anomalywatchdog.data_treatment.data_handler\
import DataADHandler
from anomalywatchdog.data_treatment.input_checker\
import InputChecker
from anomalywatchdog.data_treatment.model_updater\
import ModelUpdater
from anomalywatchdog.engine.model_trainer\
import ModelTrainer
from anomalywatchdog.engine.model_predictor\
import ModelPredictor
from anomalywatchdog.anomaly_features.create_features\
import create_features
import os.path as op
import yaml
from logging import Logger
import pandas as pd
from typing import Union, List
from pyspark.sql import DataFrame


class AnomalyWatchdog:

def __init__(
self,
df: Union[pd.DataFrame, DataFrame],
column_date: str,
column_target: str,
granularity: str,
            columns_dimension: Union[List[str], None] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
):
# -- Initialize logs
self.log = Logger(name="anomaly_detector")
# -- Read Config
current_path = op.dirname(__file__)
with open(op.join(current_path, "config_ad.yaml"), 'rb') as fp:
self.config = yaml.safe_load(fp)
# -- Check inputs
input_checker = InputChecker(
df=df.copy(),
column_date=column_date,
column_target=column_target,
columns_dimension=columns_dimension,
granularity=granularity,
models_to_use=models_to_use,
check_history=check_history,
config=self.config
)
# -- Update inputs
self.df_input = input_checker.df.copy()
self.column_target = input_checker.column_target
self.columns_dimension = input_checker.columns_dimension
self.config = input_checker.config
self.granularity = granularity
self.df_input.rename(
columns={column_date: "date", column_target: "value"},
inplace=True
)
# -- Initialize inputs
self.max_date = self.df_input["date"].max()
# -- Initialize output
self.df_anomaly = pd.DataFrame()
self.df_anomaly_dimension = pd.DataFrame()
self.log.info(">> 1. Data Treatment")
data_ad_handler = DataADHandler(
df=self.df_input,
granularity=self.granularity
)
df = data_ad_handler.df_grouped.copy()
df = df[["date", "value"]].copy()
self.log.info(">> 1.1 Get TS properties")
ModelUpdater(
df=df.copy(),
granularity=self.granularity,
config=self.config
)
self.log.info(">> 1.2 Get features")
df = create_features(df=df.copy(), granularity=self.granularity)
self.log.info(">> 2. Get global anomalies")
self.df_anomaly = self.__detect_anomalies(
df_handled=df.copy(),
list_models=self.config['models_to_use']
)
self.log.info(">> 2. Get drilled anomalies")
self.df_anomaly_dimension = self.__detect_granular_anomalies(
df_predictions=self.df_anomaly.copy(),
columns_dimension=self.columns_dimension,
granularity=self.granularity,
check_history=check_history
)
print(self.df_anomaly)
print(self.df_anomaly_dimension)

    def __detect_anomalies(
            self,
            df_handled: pd.DataFrame,
            list_models: List[str]
    ) -> pd.DataFrame:
self.log.info(">> 2.1. Train Models")
model_trainer = ModelTrainer(
            model_names=list_models,
df_train=df_handled.copy(),
config=self.config
)
df_models_trained = model_trainer.train()
self.log.info(">> 2.2. Predict Models")
model_predictor = ModelPredictor(
df_models_trained=df_models_trained)
df_predictions = model_predictor.predict()
return df_predictions

    def __detect_granular_anomalies(
            self,
            df_predictions: pd.DataFrame,
            columns_dimension: List[str],
            granularity: str,
            check_history: bool
    ) -> pd.DataFrame:
df_dimension = pd.DataFrame()
if columns_dimension is not None:
list_df_dimension = []
            for model in df_predictions["model"].unique():
                is_anomaly_max_date = (
                    df_predictions
                    .loc[(df_predictions['model'] == model) &
                         (df_predictions['date'] == self.max_date),
                         'anomaly'
                    ].sum()
                ) > 0
                is_anomaly_history = (
                    df_predictions
                    .loc[df_predictions['model'] == model,
                         'anomaly'
                    ].sum()
                ) > 0
                # -- Drill down only if anomalies were flagged in the
                #    relevant window (full history vs. latest date)
                condition1 = check_history and is_anomaly_history
                condition2 = not check_history and is_anomaly_max_date
                if condition1 or condition2:
for column_dimension in columns_dimension:
list_dimension_value = [
dimension for dimension
in self.df_input[column_dimension].unique()
if dimension is not None
]
for dimension_value in list_dimension_value:
                            df_dimension = (
                                self.df_input
                                .loc[self.df_input[column_dimension]
                                     == dimension_value,
                                     ["date", "value"]]
                                .reset_index(drop=True)
                                .copy()
                            )
data_ad_handler = DataADHandler(
df=df_dimension,
granularity=granularity
)
df = data_ad_handler.df_grouped.copy()
df = create_features(df=df.copy(),
granularity=self.granularity)
df_predictions_element = self.__detect_anomalies(
df_handled=df.copy(),
list_models=[model]
)
print(df_predictions_element)
df_predictions_element['dimension'] = (
column_dimension
)
df_predictions_element['dimension_value'] = (
dimension_value
)
list_df_dimension.append(df_predictions_element)
            # -- Aggregate per-dimension results across all models
            # (returning inside the model loop would skip the remaining models)
            if list_df_dimension:
                df_dimension = pd.concat(list_df_dimension)
        return df_dimension
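
A hedged end-to-end sketch of how AnomalyWatchdog might be driven. The sample data, column names, and dimension values are invented for illustration; the attribute names come from the constructor above, which runs the whole pipeline at instantiation time.

# Illustrative only. Input schema and values are assumptions; the
# models_to_use default and granularity code mirror what __init__ declares.
import pandas as pd
from anomalywatchdog.anomaly_watchdog import AnomalyWatchdog

df = pd.DataFrame({
    "day": pd.date_range("2023-01-01", periods=365, freq="D"),
    "sales": [100.0 + (i % 30) for i in range(365)],
    "shop": ["madrid"] * 200 + ["barcelona"] * 165,
})
watchdog = AnomalyWatchdog(
    df=df,
    column_date="day",
    column_target="sales",
    granularity="D",
    columns_dimension=["shop"],
    models_to_use=["auto_arima", "Prophet"],
)
df_global = watchdog.df_anomaly              # anomalies on the aggregated series
df_drilled = watchdog.df_anomaly_dimension   # anomalies split by dimension value

Because all work happens in __init__, there is no separate fit/detect call; results are read from the two output attributes.
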
64 changes: 64 additions & 0 deletions anomalywatchdog/config_ad.yaml
@@ -0,0 +1,64 @@
models_to_use: [
"autoencoder_lstm",
"autoencoder_conv",
"prophet",
"auto_arima"
]
ts_seasonality:
weekly_seasonality: False
monthly_seasonality: False
yearly_seasonality: False
autoencoder_conv:
granularity: 'D'
timesteps:
    D: 120  # look-back window: number of past time steps per sample
W: 24
M: 8
activation: 'relu'
loss: 'mse'
epochs: 100
batch_size:
D: 128
W: 32
M: 12
validation_split: 0.1
learning_rate: 0.001
quantile: 0.75
quantile_multiplier: 1.5
features:
holidays: True
autoencoder_lstm:
granularity: 'D'
timesteps:
    D: 30  # look-back window: number of past time steps per sample
W: 12
M: 8
activation: 'relu'
loss: 'mse'
epochs: 50
batch_size:
D: 32
W: 16
M: 12
validation_split: 0.2
learning_rate: 0.001
quantile: 0.75
quantile_multiplier: 1.5
features:
holidays: True
prophet:
weekly_seasonality: False
monthly_seasonality: False
yearly_seasonality: False
seasonality_mode: "additive"
interval_width: 0.95
changepoint_range: 0.8
features:
holidays: True
auto_arima:
weekly_seasonality: False
monthly_seasonality: False
yearly_seasonality: False
features:
holidays: True
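
The AnomalyWatchdog constructor locates this file next to the package modules and parses it with yaml.safe_load; a standalone sketch of the same lookup (assuming the package is importable):

# Reproduces the config load performed in AnomalyWatchdog.__init__.
import os.path as op
import yaml

import anomalywatchdog

current_path = op.dirname(anomalywatchdog.__file__)
with open(op.join(current_path, "config_ad.yaml"), "rb") as fp:
    config = yaml.safe_load(fp)

print(config["models_to_use"])                       # four model names, per this file
print(config["autoencoder_lstm"]["timesteps"]["D"])  # 30
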
