Merge pull request #5 from adevinta/feature/data_range
Add Date Range
alexvazquez1988 authored Jul 11, 2024
2 parents a4e12e4 + ff9b29c commit 3351aef
Showing 7 changed files with 87 additions and 48 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -31,8 +31,9 @@ anomaly_watchdog = AnomalyWatchdog(
column_target: str,
granularity: str,
columns_dimension: list[str] = None,
start_date: Union[str, None] = None,
end_date: Union[str, None] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
)
```

@@ -43,8 +44,9 @@ AnomalyWatchdog has the following inputs:
- column_target: String containing the column name of the time series values. Values should be float or int.
- granularity: String containing the granularity of the time series data. Values available are "D" for daily, "M" for monthly and "W" for weekly data.
- columns_dimension: List of strings containing the column dimension names representing the disaggregation of the data if any.
- start_date: String containing the start date from which anomalies are returned. Values should be strings in the format YYYY-MM-DD (e.g. 2020-01-30). If None, anomalies are returned for the whole history.
- end_date: String containing the end date up to which anomalies are returned. Values should be strings in the format YYYY-MM-DD (e.g. 2020-01-30). If None, anomalies are returned for the whole history.
- models_to_use: List of strings containing the models to use. Available models are "autoencoder_basic", "autoencoder_lstm", "prophet" and "auto_arima". If no value is provided, AnomalyWatchdog runs only "prophet" and "auto_arima".
- check_history: Boolean that checks outliers in the complete history of the time series if True, and only in the last day if false (default).
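
A minimal usage sketch of the new date-range parameters, assuming the import path from the repository layout (`anomalywatchdog/core.py`); the DataFrame, column names and dates below are illustrative, not part of the package:

```python
import pandas as pd

# Import path assumed from the repository layout (anomalywatchdog/core.py).
from anomalywatchdog.core import AnomalyWatchdog

# Illustrative daily series; column names and values are placeholders.
df = pd.DataFrame({
    "ds": pd.date_range("2023-01-01", periods=120, freq="D").strftime("%Y-%m-%d"),
    "sales": [100.0 + (i % 7) * 5.0 for i in range(120)],
})

detector = AnomalyWatchdog(
    df=df,
    column_date="ds",
    column_target="sales",
    granularity="D",
    start_date="2023-03-01",  # only anomalies inside this window are returned
    end_date="2023-04-30",
)

# Anomaly flags per date and model (attribute name as used in core.py below).
print(detector.df_anomaly)
```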

### Outputs
AnomalyWatchdog has two outputs, one of which is only delivered if
3 changes: 1 addition & 2 deletions anomalywatchdog/anomaly_features/CalendarFeatureCreator.py
@@ -70,8 +70,7 @@ def add_holidays(self, granularity):
df_ts.groupby('date')['holiday'].sum().reset_index()
)
df_holiday_granular['date'] = pd.to_datetime(
df_holiday_granular['date'],
utc=True
df_holiday_granular['date']
)
self.df = self.df.merge(
df_holiday_granular,
59 changes: 36 additions & 23 deletions anomalywatchdog/core.py
@@ -26,9 +26,10 @@ def __init__(
column_date: str,
column_target: str,
granularity: str,
columns_dimension: list[str] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet'],
check_history: bool = False
columns_dimension: list[str] = [],
start_date: Union[str, None] = None,
end_date: Union[str, None] = None,
models_to_use: List[str] = ['auto_arima', 'Prophet']
):
# -- Initialize logs
self.log = Logger(name="anomaly_detector")
@@ -44,7 +45,8 @@ def __init__(
columns_dimension=columns_dimension,
granularity=granularity,
models_to_use=models_to_use,
check_history=check_history,
start_date=start_date,
end_date=end_date,
config=self.config
)
# -- Update inputs
@@ -53,6 +55,8 @@ def __init__(
self.columns_dimension = input_checker.columns_dimension
self.config = input_checker.config
self.granularity = granularity
self.start_date = input_checker.start_date
self.end_date = input_checker.end_date
self.df_input.rename(
columns={column_date: "date", column_target: "value"},
inplace=True
@@ -86,9 +90,20 @@ def __init__(
self.df_anomaly_dimension = self.__detect_granular_anomalies(
df_predictions=self.df_anomaly.copy(),
columns_dimension=self.columns_dimension,
granularity=self.granularity,
check_history=check_history
granularity=self.granularity
)
self.log.info(">> 3. Filter selected dates")
print('print2')
if self.start_date:
self.df_anomaly = self.df_anomaly.loc[
(self.df_anomaly['date'] >= self.start_date) &
(self.df_anomaly['date'] <= self.end_date)
].copy()
if len(self.df_anomaly_dimension) > 0:
self.df_anomaly_dimension = self.df_anomaly_dimension.loc[
(self.df_anomaly_dimension['date'] >= self.start_date) &
(self.df_anomaly_dimension['date'] <= self.end_date)
].copy()
print(self.df_anomaly)
print(self.df_anomaly_dimension)

@@ -114,26 +129,24 @@ def __detect_granular_anomalies(
self,
df_predictions: pd.DataFrame,
columns_dimension: list,
granularity: str,
check_history: bool
granularity: str
) -> pd.DataFrame():
df_dimension = pd.DataFrame()
if columns_dimension is not None:
if len(columns_dimension) > 0:
list_df_dimension = []
for model in df_predictions["model"].unique():
is_anomaly_max_date = (
df_predictions
.loc[(df_predictions['model'] == model) &
(df_predictions['date'] == self.max_date),
'anomaly'].sum()
) > 0
is_anomaly_history = (
df_predictions
.loc[(df_predictions['model'] == model), 'anomaly'].sum()
) > 0
condition1 = check_history and is_anomaly_history
condition2 = not check_history and is_anomaly_max_date
if condition1 or condition2:
filtered_df = df_predictions.copy()
if self.start_date:
filtered_df = filtered_df.loc[
(filtered_df['date'] <= self.end_date) &
(filtered_df['date'] >= self.start_date)
]
for model in filtered_df["model"].unique():
is_anomaly_in_interval = (
filtered_df.loc[filtered_df['model'] == model,
['anomaly']].sum()
> 0
)
if is_anomaly_in_interval:
for column_dimension in columns_dimension:
list_dimension_value = [
dimension for dimension
2 changes: 1 addition & 1 deletion anomalywatchdog/data_treatment/data_handler.py
@@ -49,7 +49,7 @@ def __expand_dates(self):
how='left'
)
# -- set correct date format
self.df["date"] = pd.to_datetime(self.df["date"], utc=True)
self.df["date"] = pd.to_datetime(self.df["date"])

def __get_ordered_dataframe(self):
self.df.sort_values(self.group_columns, inplace=True)
57 changes: 42 additions & 15 deletions anomalywatchdog/data_treatment/input_checker.py
@@ -13,8 +13,9 @@ def __init__(
granularity: str,
columns_dimension: List[str],
models_to_use: List[str],
check_history: bool,
config: dict
config: dict,
start_date: Union[str, None] = None,
end_date: Union[str, None] = None
):
# -- Main inputs
self.df = df
@@ -23,19 +24,22 @@
self.granularity = granularity
self.columns_dimension = columns_dimension
self.models_to_use = models_to_use
self.check_history = check_history
self.start_date = start_date
self.end_date = end_date
self.config = config
# -- Preliminary Checks
self.__check_df_instance()
self.__check_columns_in_dataframe()
if self.columns_dimension is not None:
if len(self.columns_dimension) > 0:
self.__check_column_list_types(self.columns_dimension)
self.__check_column_str_type(self.column_target)
self.__check_column_str_type(self.column_date)
self.__enforce_lowercase()
self.__check_granularity()
if self.check_history:
self.__check_check_history()
self.start_date = self.__check_date(self.start_date)
self.end_date = self.__check_date(self.end_date)
self.__check_dates_consistency()

if self.models_to_use:
self.__check_column_list_types(self.models_to_use)
self.__check_models_to_use()
@@ -125,15 +129,6 @@ def __check_granularity(self):
)
raise ValueError(error_string)

def __check_check_history(self):
if not isinstance(self.check_history, bool):
error_string = (
"Input parameter check_history is " +
f"{type(self.check_history)}. " +
"Expected input type is bool."
)
raise TypeError(error_string)

def __check_models_to_use(self):
models_to_use_list = [
"autoencoder_basic",
@@ -153,3 +148,35 @@ def __check_models_to_use(self):
def __update_config(self):
if self.models_to_use:
self.config['models_to_use'] = self.models_to_use

@staticmethod
def __check_date(date_string: Union[str, None]):
if date_string:
formatting_error = (
f"Format for {date_string} not understood. "
f"Accepted format is 'YYYY-MM-DD'"
f"(e.g. 2021-03-28)."
)
try:
return pd.to_datetime(
date_string,
format="%Y-%m-%d"
)
except:
raise ValueError(formatting_error)

def __check_dates_consistency(self):
if not self.start_date and self.end_date:
self.start_date = self.end_date
if not self.end_date and self.start_date:
self.end_date = self.start_date

if self.start_date:
if (pd.to_datetime(self.end_date, format="%Y-%m-%d")
< pd.to_datetime(self.start_date, format="%Y-%m-%d")):
formatting_error = (
f"Value for end_date: {self.end_date} must be greater or "
f"equal than start_date: {self.start_date}."
)
raise ValueError(formatting_error)
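
For reference, a standalone sketch of the behaviour the two date checks above implement; `normalize_date_range` is a hypothetical helper for illustration only, not part of the package:

```python
import pandas as pd

def normalize_date_range(start_date=None, end_date=None):
    """Sketch of the checks above: parse 'YYYY-MM-DD' strings,
    mirror a missing endpoint, and reject inverted ranges."""
    def parse(date_string):
        if not date_string:
            return None
        try:
            return pd.to_datetime(date_string, format="%Y-%m-%d")
        except (ValueError, TypeError):
            raise ValueError(
                f"Format for {date_string} not understood. "
                f"Accepted format is 'YYYY-MM-DD' (e.g. 2021-03-28)."
            )

    start, end = parse(start_date), parse(end_date)
    if start is None and end is not None:
        start = end   # only end_date given: single-day range
    if end is None and start is not None:
        end = start   # only start_date given: single-day range
    if start is not None and end < start:
        raise ValueError(
            f"Value for end_date: {end} must be greater than or "
            f"equal to start_date: {start}."
        )
    return start, end

print(normalize_date_range("2024-01-01"))           # (2024-01-01, 2024-01-01)
print(normalize_date_range(end_date="2024-02-29"))  # start mirrors end_date
# normalize_date_range("2024-03-10", "2024-03-01")  # raises ValueError
```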

6 changes: 3 additions & 3 deletions tests/data_treatment/test_data_handler.py
@@ -71,7 +71,7 @@ def expected_df_monthly():
3400257.853
]
return pd.DataFrame(
{'date': pd.to_datetime(date_list, utc=True),
{'date': pd.to_datetime(date_list),
'value': value_list}
)

@@ -113,7 +113,7 @@ def expected_df_weekly():
3400257.853
]
return pd.DataFrame(
{'date': pd.to_datetime(date_list, utc=True),
{'date': pd.to_datetime(date_list),
'value': value_list}
)

@@ -155,6 +155,6 @@ def expected_df_daily():
3400257.853
]
return pd.DataFrame(
{'date': pd.to_datetime(date_list, utc=True),
{'date': pd.to_datetime(date_list),
'value': value_list}
)
2 changes: 0 additions & 2 deletions tests/data_treatment/test_input_checker.py
@@ -7,7 +7,6 @@
EXPECTED_CONFIG = {"models_to_use": ["autoencoder_basic", "prophet"]}



def test_input_checker():
# -- Initialize input checker
input_checker = InputChecker(
@@ -17,7 +16,6 @@ def test_input_checker():
granularity='M',
columns_dimension=['Col_dim'],
models_to_use=INPUT_CONFIG["models_to_use"],
check_history=True,
config=INPUT_CONFIG
)
# -- Check df and columns
