diff --git a/fedot/core/operations/evaluation/boostings.py b/fedot/core/operations/evaluation/boostings.py
index ca4ad9e416..42b7356857 100644
--- a/fedot/core/operations/evaluation/boostings.py
+++ b/fedot/core/operations/evaluation/boostings.py
@@ -34,6 +34,9 @@ def _convert_to_operation(self, operation_type: str):
         raise ValueError(f'Impossible to obtain Boosting Strategy for {operation_type}')
 
     def fit(self, train_data: InputData):
+        if train_data.task.task_type == TaskTypesEnum.ts_forecasting:
+            raise ValueError('Time series forecasting not supported for boosting models')
+
         if is_multi_output_task(train_data):
             if self.operation_type == 'catboost':
                 self.params_for_fit.update(loss_function='MultiLogloss')
diff --git a/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py b/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
index f3b8f151b6..1ea4a7d3e3 100644
--- a/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
@@ -5,7 +5,6 @@
 import numpy as np
 import pandas as pd
 from catboost import CatBoostClassifier, CatBoostRegressor, Pool
-import lightgbm as lgb
 from lightgbm import LGBMClassifier, LGBMRegressor
 from lightgbm import early_stopping as lgbm_early_stopping
 from matplotlib import pyplot as plt
@@ -32,6 +31,7 @@ def __init__(self, params: Optional[OperationParameters] = None):
         self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
         self.model = None
         self.features_names = None
+        self.classes_ = None
 
     def fit(self, input_data: InputData):
         self.features_names = input_data.features_names
@@ -39,7 +39,7 @@ def fit(self, input_data: InputData):
         if self.params.get('enable_categorical'):
             input_data = input_data.get_not_encoded_data()
 
-        if self.params.get('use_eval_set'):
+        if check_eval_set_condition(input_data, self.params):
             train_input, eval_input = train_test_data_setup(input_data)
 
             X_train, y_train = convert_to_dataframe(
@@ -52,14 +52,25 @@
 
             self.model.eval_metric = self.set_eval_metric(self.classes_)
 
-            self.model.fit(X=X_train, y=y_train, eval_set=[(X_eval, y_eval)], verbose=self.model_params['verbosity'])
+            self.model.fit(
+                X=X_train, y=y_train,
+                eval_set=[(X_eval, y_eval)],
+                verbose=self.model_params['verbosity']
+            )
         else:
+            # Disable parameter used for eval_set
+            if bool(self.params.get('early_stopping_rounds')):
+                self.model.early_stopping_rounds = None
+                self.params.update(early_stopping_rounds=None)
+
+            # Training model without splitting on train and eval
             X_train, y_train = convert_to_dataframe(
                 input_data, identify_cats=self.params.get('enable_categorical')
             )
-            self.features_names = input_data.features_names
-
-            self.model.fit(X=X_train, y=y_train, verbose=self.model_params['verbosity'])
+            self.model.fit(
+                X=X_train, y=y_train,
+                verbose=self.model_params['verbosity']
+            )
 
         return self.model
 
@@ -111,7 +122,6 @@ def set_eval_metric(n_classes):
 class FedotXGBoostClassificationImplementation(FedotXGBoostImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
-        self.classes_ = None
         self.model = XGBClassifier(**self.model_params)
 
     def fit(self, input_data: InputData):
@@ -130,7 +140,6 @@ def predict_proba(self, input_data: InputData):
 
 
 class FedotXGBoostRegressionImplementation(FedotXGBoostImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
-        self.classes_ = None
         self.model = XGBRegressor(**self.model_params)
@@ -145,6 +154,7 @@ def __init__(self, params: Optional[OperationParameters] = None):
         self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
         self.model = None
         self.features_names = None
+        self.classes_ = None
 
     def fit(self, input_data: InputData):
         self.features_names = input_data.features_names
@@ -152,41 +162,39 @@ def fit(self, input_data: InputData):
 
         if self.params.get('enable_categorical'):
             input_data = input_data.get_not_encoded_data()
 
-        if self.params.get('use_eval_set'):
+        if check_eval_set_condition(input_data, self.params):
+            train_input, eval_input = train_test_data_setup(input_data)
+
+            X_train, y_train = convert_to_dataframe(
+                train_input, identify_cats=self.params.get('enable_categorical')
+            )
+
+            X_eval, y_eval = convert_to_dataframe(
+                eval_input, identify_cats=self.params.get('enable_categorical')
+            )
+
+            eval_metric = self.set_eval_metric(self.classes_)
+            callbacks = self.update_callbacks()
+
+            self.model.fit(
+                X=X_train, y=y_train,
+                eval_set=[(X_eval, y_eval)], eval_metric=eval_metric,
+                callbacks=callbacks
+            )
+        else:
+            # Disable parameter used for eval_set
+            if bool(self.params.get('early_stopping_rounds')):
+                self.model._other_params.update(early_stopping_rounds=None)
+                self.params.update(early_stopping_rounds=None)
+
             if is_multi_output_task(input_data):
-                X_train, y_train = convert_to_dataframe(
-                    input_data, identify_cats=self.params.get('enable_categorical')
-                )
                 self._convert_to_multi_output_model(input_data)
-                self.model.fit(X=X_train, y=y_train)
-            else:
-                train_input, eval_input = train_test_data_setup(input_data)
-
-                X_train, y_train = convert_to_dataframe(
-                    train_input, identify_cats=self.params.get('enable_categorical')
-                )
-
-                X_eval, y_eval = convert_to_dataframe(
-                    eval_input, identify_cats=self.params.get('enable_categorical')
-                )
-
-                eval_metric = self.set_eval_metric(self.classes_)
-                callbacks = self.update_callbacks()
-
-                self.model.fit(
-                    X=X_train, y=y_train,
-                    eval_set=[(X_eval, y_eval)], eval_metric=eval_metric,
-                    callbacks=callbacks
-                )
-        else:
+
+            # Training model without splitting on train and eval
             X_train, y_train = convert_to_dataframe(
                 input_data, identify_cats=self.params.get('enable_categorical')
             )
-            if is_multi_output_task(input_data):
-                self._convert_to_multi_output_model(input_data)
-                self.model.fit(X=X_train, y=y_train)
-            else:
-                self.model.fit(X=X_train, y=y_train)
+            self.model.fit(X=X_train, y=y_train)
 
         return self.model
 
@@ -204,7 +212,7 @@ def check_and_update_params(self):
         use_eval_set = self.params.get('use_eval_set')
 
         if isinstance(early_stopping_rounds, int) and not use_eval_set:
-            self.params.update(early_stopping_rounds=False)
+            self.params.update(early_stopping_rounds=None)
 
     def update_callbacks(self) -> list:
         callback = []
@@ -234,14 +242,12 @@ def plot_feature_importance(self):
 
     def _convert_to_multi_output_model(self, input_data: InputData):
         if input_data.task.task_type == TaskTypesEnum.classification:
             multiout_func = MultiOutputClassifier
-            lgb_model = lgb.LGBMClassifier()
         elif input_data.task.task_type in [TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting]:
             multiout_func = MultiOutputRegressor
-            lgb_model = lgb.LGBMRegressor()
         else:
             raise ValueError(f"For task type '{input_data.task.task_type}' MultiOutput wrapper is not supported")
 
-        self.model = multiout_func(lgb_model)
+        self.model = multiout_func(self.model)
 
         return self.model
@@ -249,7 +255,6 @@ def _convert_to_multi_output_model(self, input_data: InputData):
 class FedotLightGBMClassificationImplementation(FedotLightGBMImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
-        self.classes_ = None
         self.model = LGBMClassifier(**self.model_params)
 
     def fit(self, input_data: InputData):
@@ -268,7 +273,6 @@ def predict_proba(self, input_data: InputData):
 
 
 class FedotLightGBMRegressionImplementation(FedotLightGBMImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
-        self.classes_ = None
         self.model = LGBMRegressor(**self.model_params)
@@ -290,7 +294,7 @@ def fit(self, input_data: InputData):
         if self.params.get('enable_categorical'):
             input_data = input_data.get_not_encoded_data()
 
-        if self.params.get('use_eval_set'):
+        if check_eval_set_condition(input_data, self.params):
             # TODO: Using this method for tuning
             train_input, eval_input = train_test_data_setup(input_data)
 
@@ -298,11 +302,17 @@
             eval_input = self.convert_to_pool(eval_input, identify_cats=self.params.get('enable_categorical'))
 
             self.model.fit(X=train_input, eval_set=eval_input)
-
         else:
-            train_input = self.convert_to_pool(input_data, identify_cats=self.params.get('enable_categorical'))
+            # Disable parameter used for eval_set
+            if bool(self.params.get('use_best_model')):
+                self.model._init_params.update(use_best_model=False)
+                self.params.update(use_best_model=False)
 
-            self.model.fit(train_input)
+            # Training model without splitting on train and eval
+            train_input = self.convert_to_pool(
+                input_data, identify_cats=self.params.get('enable_categorical')
+            )
+            self.model.fit(X=train_input)
 
         return self.model
 
@@ -417,3 +427,25 @@ def convert_to_dataframe(data: Optional[InputData], identify_cats: bool):
         return X_without_target, y_target
 
     return dataframe.drop(columns=['target']), dataframe['target']
+
+
+def check_eval_set_condition(input_data: InputData, params: OperationParameters) -> bool:
+    is_using_eval_set = bool(params.get('use_eval_set'))
+    if not is_using_eval_set or is_multi_output_task(input_data):
+        return False
+
+    # No special conditions for regression task
+    if input_data.task.task_type == TaskTypesEnum.regression:
+        return True
+
+    # For classification task check
+    # if all classes presented in train_set are also presented in eval_set
+    if input_data.task.task_type == TaskTypesEnum.classification:
+        train_input, eval_input = train_test_data_setup(input_data)
+        train_classes = np.unique(train_input.target)
+        eval_classes = np.unique(eval_input.target)
+        all_classes_present_in_eval = np.all(np.isin(train_classes, eval_classes))
+        if all_classes_present_in_eval:
+            return True
+
+    return False
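A note on the new `check_eval_set_condition` helper: for classification it gates the eval-set path on every class from the train split also appearing in the holdout split, which avoids metric and fitting errors in the underlying libraries when a rare class is missing from the validation data. A minimal sketch of that check in isolation (the label arrays are synthetic, purely illustrative):

```python
import numpy as np

# Synthetic labels: the rare class 2 ends up only in the train part
train_classes = np.unique([0, 1, 2, 1, 0])
eval_classes = np.unique([0, 1, 0, 1])

# Mirrors the helper's condition: every train class must appear in eval
all_classes_present_in_eval = np.all(np.isin(train_classes, eval_classes))
print(all_classes_present_in_eval)  # False -> fall back to fitting on full data
```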
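The `early_stopping_rounds` reset in the non-eval branches is load-bearing: recent XGBoost releases refuse to fit with early stopping configured but no validation set supplied. A hedged sketch of the failure mode the patch avoids, assuming a recent xgboost version and synthetic data:

```python
import numpy as np
from xgboost import XGBClassifier

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

model = XGBClassifier(n_estimators=50, early_stopping_rounds=10)
# Without this reset, model.fit(X, y) raises because no eval_set is given
model.early_stopping_rounds = None
model.fit(X, y)  # trains on the full data, no early stopping
```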
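The CatBoost branch performs the analogous reset for `use_best_model`, which is only valid together with an eval set; the patch clears it through the wrapper's `_init_params` dict before fitting on all of the data. A small sketch of that fallback (synthetic data, illustrative only):

```python
import numpy as np
from catboost import CatBoostClassifier

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

model = CatBoostClassifier(iterations=50, use_best_model=True, verbose=False)
# Same reset the patch performs when check_eval_set_condition() is False;
# otherwise fitting without an eval_set would raise a CatBoostError
model._init_params.update(use_best_model=False)
model.fit(X, y)
```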
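Finally, `_convert_to_multi_output_model` now wraps the already-configured estimator instead of instantiating a bare `lgb.LGBMClassifier()` / `lgb.LGBMRegressor()`, so tuned hyperparameters survive the multi-output conversion. A sketch of the difference (parameter values made up for illustration):

```python
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

base = LGBMClassifier(n_estimators=200, learning_rate=0.05)

# Old behaviour: MultiOutputClassifier(LGBMClassifier()) silently used defaults
# New behaviour: the configured model is wrapped and keeps its settings
wrapped = MultiOutputClassifier(base)
print(wrapped.estimator.get_params()['n_estimators'])  # 200
```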