Boostings implementations eval_set fix #1358

Merged
merged 12 commits on Jan 23, 2025
3 changes: 3 additions & 0 deletions fedot/core/operations/evaluation/boostings.py
@@ -34,6 +34,9 @@ def _convert_to_operation(self, operation_type: str):
raise ValueError(f'Impossible to obtain Boosting Strategy for {operation_type}')

def fit(self, train_data: InputData):
if train_data.task.task_type == TaskTypesEnum.ts_forecasting:
raise ValueError('Time series forecasting not supported for boosting models')

if is_multi_output_task(train_data):
if self.operation_type == 'catboost':
self.params_for_fit.update(loss_function='MultiLogloss')
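The new guard above fails fast for forecasting tasks before any model setup happens. A minimal, self-contained sketch of the same pattern — TaskType and TrainData below are illustrative stand-ins, not FEDOT's real TaskTypesEnum and InputData:

# Minimal sketch of the guard pattern above. TaskType and TrainData are
# illustrative stand-ins for FEDOT's TaskTypesEnum and InputData.
from dataclasses import dataclass
from enum import Enum, auto


class TaskType(Enum):
    classification = auto()
    regression = auto()
    ts_forecasting = auto()


@dataclass
class TrainData:
    task_type: TaskType


def fit(train_data: TrainData):
    # Fail fast: the boosting strategies here do not support forecasting
    if train_data.task_type is TaskType.ts_forecasting:
        raise ValueError('Time series forecasting not supported for boosting models')


fit(TrainData(TaskType.classification))      # passes the guard
# fit(TrainData(TaskType.ts_forecasting))    # would raise ValueError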
@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
from lightgbm import early_stopping as lgbm_early_stopping
from matplotlib import pyplot as plt
@@ -32,14 +31,15 @@ def __init__(self, params: Optional[OperationParameters] = None):
self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
self.model = None
self.features_names = None
self.classes_ = None

def fit(self, input_data: InputData):
self.features_names = input_data.features_names

if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

if self.params.get('use_eval_set'):
if check_eval_set_condition(input_data, self.params):
train_input, eval_input = train_test_data_setup(input_data)

X_train, y_train = convert_to_dataframe(
@@ -52,14 +52,25 @@

self.model.eval_metric = self.set_eval_metric(self.classes_)

self.model.fit(X=X_train, y=y_train, eval_set=[(X_eval, y_eval)], verbose=self.model_params['verbosity'])
self.model.fit(
X=X_train, y=y_train,
eval_set=[(X_eval, y_eval)],
verbose=self.model_params['verbosity']
)
else:
# Disable parameter used for eval_set
if bool(self.params.get('early_stopping_rounds')):
self.model.early_stopping_rounds = None
self.params.update(early_stopping_rounds=None)

# Train the model without splitting into train and eval sets
X_train, y_train = convert_to_dataframe(
input_data, identify_cats=self.params.get('enable_categorical')
)
self.features_names = input_data.features_names

self.model.fit(X=X_train, y=y_train, verbose=self.model_params['verbosity'])
self.model.fit(
X=X_train, y=y_train,
verbose=self.model_params['verbosity']
)

return self.model
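
For context, the eval_set branch above follows xgboost's standard sklearn API. A self-contained sketch on synthetic data — the 80/20 split stands in for FEDOT's train_test_data_setup, all parameter values are illustrative, and it assumes xgboost 1.6+, where early_stopping_rounds is a constructor argument:

# Sketch of the eval_set training pattern used above, on synthetic data.
import numpy as np
from xgboost import XGBClassifier

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + rng.normal(scale=0.1, size=200) > 0).astype(int)

# Simple 80/20 split standing in for FEDOT's train_test_data_setup
split = int(len(X) * 0.8)
X_train, X_eval = X[:split], X[split:]
y_train, y_eval = y[:split], y[split:]

model = XGBClassifier(n_estimators=100, early_stopping_rounds=10, eval_metric='logloss')
model.fit(X=X_train, y=y_train, eval_set=[(X_eval, y_eval)], verbose=False)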

@@ -111,7 +122,6 @@ def set_eval_metric(n_classes):
class FedotXGBoostClassificationImplementation(FedotXGBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = XGBClassifier(**self.model_params)

def fit(self, input_data: InputData):
@@ -130,7 +140,6 @@ def predict_proba(self, input_data: InputData):
class FedotXGBoostRegressionImplementation(FedotXGBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = XGBRegressor(**self.model_params)


@@ -145,48 +154,47 @@ def __init__(self, params: Optional[OperationParameters] = None):
self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
self.model = None
self.features_names = None
self.classes_ = None

def fit(self, input_data: InputData):
self.features_names = input_data.features_names

if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

if self.params.get('use_eval_set'):
if check_eval_set_condition(input_data, self.params):
train_input, eval_input = train_test_data_setup(input_data)

X_train, y_train = convert_to_dataframe(
train_input, identify_cats=self.params.get('enable_categorical')
)

X_eval, y_eval = convert_to_dataframe(
eval_input, identify_cats=self.params.get('enable_categorical')
)

eval_metric = self.set_eval_metric(self.classes_)
callbacks = self.update_callbacks()

self.model.fit(
X=X_train, y=y_train,
eval_set=[(X_eval, y_eval)], eval_metric=eval_metric,
callbacks=callbacks
)
else:
# Disable parameter used for eval_set
if bool(self.params.get('early_stopping_rounds')):
self.model._other_params.update(early_stopping_rounds=None)
self.params.update(early_stopping_rounds=None)

if is_multi_output_task(input_data):
X_train, y_train = convert_to_dataframe(
input_data, identify_cats=self.params.get('enable_categorical')
)
self._convert_to_multi_output_model(input_data)
self.model.fit(X=X_train, y=y_train)
else:
train_input, eval_input = train_test_data_setup(input_data)

X_train, y_train = convert_to_dataframe(
train_input, identify_cats=self.params.get('enable_categorical')
)

X_eval, y_eval = convert_to_dataframe(
eval_input, identify_cats=self.params.get('enable_categorical')
)

eval_metric = self.set_eval_metric(self.classes_)
callbacks = self.update_callbacks()

self.model.fit(
X=X_train, y=y_train,
eval_set=[(X_eval, y_eval)], eval_metric=eval_metric,
callbacks=callbacks
)
else:

# Train the model without splitting into train and eval sets
X_train, y_train = convert_to_dataframe(
input_data, identify_cats=self.params.get('enable_categorical')
)
if is_multi_output_task(input_data):
self._convert_to_multi_output_model(input_data)
self.model.fit(X=X_train, y=y_train)
else:
self.model.fit(X=X_train, y=y_train)
self.model.fit(X=X_train, y=y_train)

return self.model

Expand All @@ -204,7 +212,7 @@ def check_and_update_params(self):
use_eval_set = self.params.get('use_eval_set')

if isinstance(early_stopping_rounds, int) and not use_eval_set:
self.params.update(early_stopping_rounds=False)
self.params.update(early_stopping_rounds=None)

def update_callbacks(self) -> list:
callback = []
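
update_callbacks (truncated by the diff view) assembles the callback list passed to fit; the lgbm_early_stopping imported above is LightGBM's stock early-stopping callback. A hedged sketch of that usage on synthetic data, with illustrative parameter values:

# Sketch: LightGBM early stopping is supplied as a callback at fit time.
import numpy as np
from lightgbm import LGBMClassifier, early_stopping

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] > 0).astype(int)
split = int(len(X) * 0.8)

model = LGBMClassifier(n_estimators=200, verbose=-1)
model.fit(
    X[:split], y[:split],
    eval_set=[(X[split:], y[split:])],
    # stopping_rounds=10 is illustrative; FEDOT reads it from params
    callbacks=[early_stopping(stopping_rounds=10)],
)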
@@ -234,22 +242,19 @@ def plot_feature_importance(self):
def _convert_to_multi_output_model(self, input_data: InputData):
if input_data.task.task_type == TaskTypesEnum.classification:
multiout_func = MultiOutputClassifier
lgb_model = lgb.LGBMClassifier()
elif input_data.task.task_type in [TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting]:
multiout_func = MultiOutputRegressor
lgb_model = lgb.LGBMRegressor()
else:
raise ValueError(f"For task type '{input_data.task.task_type}' MultiOutput wrapper is not supported")

self.model = multiout_func(lgb_model)
self.model = multiout_func(self.model)

return self.model
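
Wrapping the already-configured self.model, rather than a fresh default-parameter estimator as before, means tuned hyperparameters survive the multi-output conversion. A small sketch of the difference (parameter values are illustrative):

# Sketch: wrapping the configured estimator keeps its hyperparameters;
# wrapping a fresh LGBMClassifier() (the old behaviour) silently drops them.
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

configured = LGBMClassifier(n_estimators=50, learning_rate=0.05)

old_style = MultiOutputClassifier(LGBMClassifier())   # params lost
new_style = MultiOutputClassifier(configured)         # params kept

print(old_style.estimator.n_estimators)   # 100 (library default)
print(new_style.estimator.n_estimators)   # 50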


class FedotLightGBMClassificationImplementation(FedotLightGBMImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = LGBMClassifier(**self.model_params)

def fit(self, input_data: InputData):
@@ -268,7 +273,6 @@ def predict_proba(self, input_data: InputData):
class FedotLightGBMRegressionImplementation(FedotLightGBMImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = LGBMRegressor(**self.model_params)


@@ -290,19 +294,25 @@ def fit(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

if self.params.get('use_eval_set'):
if check_eval_set_condition(input_data, self.params):
# TODO: use this method for tuning
train_input, eval_input = train_test_data_setup(input_data)

train_input = self.convert_to_pool(train_input, identify_cats=self.params.get('enable_categorical'))
eval_input = self.convert_to_pool(eval_input, identify_cats=self.params.get('enable_categorical'))

self.model.fit(X=train_input, eval_set=eval_input)

else:
train_input = self.convert_to_pool(input_data, identify_cats=self.params.get('enable_categorical'))
# Disable parameter used for eval_set
if bool(self.params.get('use_best_model')):
self.model._init_params.update(use_best_model=False)
self.params.update(use_best_model=False)

self.model.fit(train_input)
# Train the model without splitting into train and eval sets
train_input = self.convert_to_pool(
input_data, identify_cats=self.params.get('enable_categorical')
)
self.model.fit(X=train_input)

return self.model
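
The CatBoost branch applies the same gating: use_best_model is only meaningful when an eval set exists, so it is disabled otherwise. A self-contained sketch of the eval_set path with catboost's Pool, on synthetic data with illustrative parameters:

# Sketch of the CatBoost eval_set path: Pool objects plus use_best_model.
import numpy as np
from catboost import CatBoostClassifier, Pool

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 5))
y = (X[:, 0] > 0).astype(int)
split = int(len(X) * 0.8)

train_pool = Pool(X[:split], label=y[:split])
eval_pool = Pool(X[split:], label=y[split:])

model = CatBoostClassifier(iterations=100, use_best_model=True, verbose=False)
model.fit(train_pool, eval_set=eval_pool)  # use_best_model requires an eval_set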

@@ -417,3 +427,25 @@ def convert_to_dataframe(data: Optional[InputData], identify_cats: bool):
return X_without_target, y_target

return dataframe.drop(columns=['target']), dataframe['target']


def check_eval_set_condition(input_data: InputData, params: OperationParameters) -> bool:
is_using_eval_set = bool(params.get('use_eval_set'))
if not is_using_eval_set or is_multi_output_task(input_data):
return False

# No special conditions for regression task
if input_data.task.task_type == TaskTypesEnum.regression:
return True

# For the classification task, check that all classes present
# in the train set are also present in the eval set
if input_data.task.task_type == TaskTypesEnum.classification:
train_input, eval_input = train_test_data_setup(input_data)
train_classes = np.unique(train_input.target)
eval_classes = np.unique(eval_input.target)
all_classes_present_in_eval = np.all(np.isin(train_classes, eval_classes))
if all_classes_present_in_eval:
return True

return False
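
The class-presence check protects against an eval split that lacks some training classes, which would break per-class evaluation metrics. A standalone illustration of the numpy logic:

# Standalone illustration of the class-presence check above.
import numpy as np

train_classes = np.unique([0, 1, 2, 0, 1, 2])
eval_classes = np.unique([0, 1, 1, 0])  # class 2 never reached the eval split

# check_eval_set_condition would return False here and fall back to
# training on the full data without an eval_set
print(np.all(np.isin(train_classes, eval_classes)))  # False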