Skip to content

Commit

Permalink
fix pos_label (#1131)
Browse files Browse the repository at this point in the history
Fixed an error with pos_label that occurred when a binary target with string labels was passed
Fixed an error with resample (an extra verification rule was added)
Fitness calculation errors now raise exceptions when running under tests
Fixed unexpected behaviour caused by in-place operations on data in node.fit and node.predict
  • Loading branch information
valer1435 authored Aug 14, 2023
1 parent 89ff552 commit 04bceac
Show file tree
Hide file tree
Showing 13 changed files with 125 additions and 19 deletions.
5 changes: 4 additions & 1 deletion fedot/api/api_utils/api_params_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum
from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum

from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation, \
add_resample_mutation
from fedot.core.constants import AUTO_PRESET_NAME
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.core.utils import default_fedot_data_dir
Expand Down Expand Up @@ -131,5 +132,7 @@ def _get_default_mutations(task_type: TaskTypesEnum, params) -> Sequence[Mutatio
# TODO remove workaround after boosting mutation fix
if task_type == TaskTypesEnum.ts_forecasting:
mutations.append(partial(boosting_mutation, params=params))
else:
mutations.append(add_resample_mutation)

return mutations
18 changes: 18 additions & 0 deletions fedot/core/composer/gp_composer/specific_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,24 @@ def boosting_mutation(pipeline: Pipeline, requirements, graph_gen_params, **kwar
return pipeline


def add_resample_mutation(pipeline: Pipeline, **kwargs):
    """
    Add a 'resample' operation before all primary operations in pipeline
    :param pipeline: pipeline to insert resample
    :param kwargs: extra keyword arguments; accepted but not used by this mutation
    :return: the same pipeline object after the resample node has been added and connected
    """
    # The primary nodes are collected before the new node is added,
    # so the resample node itself can never end up in this list.
    former_primary_nodes = list(pipeline.primary_nodes)

    resample = PipelineNode('resample')
    pipeline.add_node(resample)

    for primary in former_primary_nodes:
        pipeline.connect_nodes(resample, primary)
    return pipeline


def choose_new_model(boosting_model_candidates: List[str]) -> str:
""" Since 'linear' and 'dtreg' operations are suitable for solving the problem
and they are simpler than others, they are preferred """
Expand Down
18 changes: 15 additions & 3 deletions fedot/core/composer/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def get_value(cls, pipeline: 'Pipeline', reference_data: InputData,
save_path=Path(save_path, 'forecast.png'))

except Exception as ex:
pipeline.log.info(f'Metric can not be evaluated because of: {ex}')
pipeline.log.info(f'Metric can not be evaluated because of: {ex}', raise_if_test=True)

return metric

Expand Down Expand Up @@ -216,7 +216,10 @@ def metric(reference: InputData, predicted: OutputData) -> float:
if n_classes > 2:
additional_params = {'average': F1.multiclass_averaging_mode}
else:
additional_params = {'average': F1.binary_averaging_mode}
u, count = np.unique(np.ravel(reference.target), return_counts=True)
count_sort_ind = np.argsort(count)
pos_label = u[count_sort_ind[0]].item()
additional_params = {'average': F1.binary_averaging_mode, 'pos_label': pos_label}
return f1_score(y_true=reference.target, y_pred=predicted.predict,
**additional_params)

Expand Down Expand Up @@ -271,7 +274,16 @@ class Precision(QualityMetric):
@staticmethod
@from_maximised_metric
def metric(reference: InputData, predicted: OutputData) -> float:
    """Compute precision of the predicted labels against the reference target.

    :param reference: data whose ``target`` holds the true labels and whose
        ``num_classes`` selects the binary or the multiclass branch
    :param predicted: data whose ``predict`` holds the predicted labels
    :return: value returned by ``precision_score``
    """
    if reference.num_classes > 2:
        # precision_score defaults to average='binary', which sklearn rejects for
        # multiclass targets, so an explicit averaging mode is required here.
        # The mode is shared with F1 to keep both metrics consistent.
        # NOTE(review): confirm F1.multiclass_averaging_mode is the desired mode for precision.
        additional_params = {'average': F1.multiclass_averaging_mode}
    else:
        # The default pos_label=1 does not exist for string or otherwise non-{0, 1}
        # labels, so the least frequent label is used as the positive class.
        u, count = np.unique(np.ravel(reference.target), return_counts=True)
        count_sort_ind = np.argsort(count)
        pos_label = u[count_sort_ind[0]].item()
        additional_params = {'pos_label': pos_label}
    return precision_score(y_true=reference.target, y_pred=predicted.predict,
                           **additional_params)


class Logloss(QualityMetric):
Expand Down
3 changes: 2 additions & 1 deletion fedot/core/operations/operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def predict_for_fit(self, fitted_operation, data: InputData, params: Optional[Op
def _predict(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None,
output_mode: str = 'default', is_fit_stage: bool = False):


is_main_target = data.supplementary_data.is_main_target
data_flow_length = data.supplementary_data.data_flow_length
self._init(data.task, output_mode=output_mode, params=params, n_samples_data=data.features.shape[0])
Expand All @@ -133,11 +134,11 @@ def _predict(self, fitted_operation, data: InputData, params: Optional[Operation
predict_data=data)
prediction = self.assign_tabular_column_types(prediction, output_mode)

# any inplace operations here are dangerous!
if is_main_target is False:
prediction.supplementary_data.is_main_target = is_main_target

prediction.supplementary_data.data_flow_length = data_flow_length
prediction.supplementary_data.obligatorily_preprocessed = True
return prediction

@staticmethod
Expand Down
17 changes: 15 additions & 2 deletions fedot/core/pipelines/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from copy import deepcopy
from datetime import timedelta
from os import PathLike
from typing import Optional, Tuple, Union, Sequence, Dict
from typing import Optional, Tuple, Union, Sequence, List, Dict

import func_timeout
from golem.core.dag.graph import Graph
Expand Down Expand Up @@ -307,7 +307,7 @@ def load(self, source: Union[str, dict], dict_fitted_operations: Optional[dict]
dict_fitted_operations: dictionary of the fitted operations
"""

self.nodes = []
self.nodes: Optional[List[PipelineNode]] = []
template = PipelineTemplate(self)
template.import_pipeline(source, dict_fitted_operations)
return self
Expand All @@ -327,6 +327,19 @@ def root_node(self) -> Optional[PipelineNode]:
raise ValueError(f'{ERROR_PREFIX} More than 1 root_nodes in pipeline')
return root[0]

@property
def primary_nodes(self) -> List[PipelineNode]:
    """Collects the pipeline's primary nodes

    Returns:
        list of the nodes whose ``is_primary`` is truthy; an empty list when the pipeline has no nodes
    """
    # `or ()` covers both an empty node list and ``None``
    return [candidate for candidate in self.nodes or () if candidate.is_primary]

def pipeline_for_side_task(self, task_type: TaskTypesEnum) -> 'Pipeline':
"""Returns pipeline formed from the last node solving the given problem and all its parents
Expand Down
9 changes: 6 additions & 3 deletions fedot/core/pipelines/random_pipeline_factory.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import random
from copy import deepcopy
from random import randint
from typing import Optional
Expand All @@ -14,6 +15,7 @@

class RandomPipelineFactory(RandomGraphFactory):
""" Default realisation of random graph factory. Generates DAG graph using random growth. """
PROBABILITY_OF_GROWTH = 0.3

def __init__(self,
verifier: GraphVerifier,
Expand Down Expand Up @@ -76,13 +78,14 @@ def graph_growth(graph: OptGraph,
for offspring_node in range(offspring_size):
height = distance_to_root_level(graph, node_parent)
is_max_depth_exceeded = height >= max_depth - 2
is_primary_node_selected = height < max_depth - 1 and randint(0, 1)
if is_max_depth_exceeded or is_primary_node_selected:
is_primary_node_selected = height < max_depth - 1
is_growth_should_stopped = random.random() > RandomPipelineFactory.PROBABILITY_OF_GROWTH
if is_max_depth_exceeded or is_primary_node_selected or is_growth_should_stopped:
primary_node = node_factory.get_node(is_primary=True)
node_parent.nodes_from.append(primary_node)
graph.add_node(primary_node)
else:
secondary_node = node_factory.get_node(is_primary=False)
graph.add_node(secondary_node)
node_parent.nodes_from.append(secondary_node)
graph_growth(graph, secondary_node, node_factory, requirements, max_depth)
graph_growth(graph, secondary_node, node_factory, requirements, max_depth)
5 changes: 3 additions & 2 deletions fedot/core/pipelines/verification.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
has_no_data_flow_conflicts_in_ts_pipeline,
has_primary_nodes,
is_pipeline_contains_ts_operations,
only_non_lagged_operations_are_primary
only_non_lagged_operations_are_primary, has_correct_location_of_resample
)
from fedot.core.repository.tasks import TaskTypesEnum

Expand All @@ -40,7 +40,8 @@
has_no_conflicts_with_data_flow,
has_no_conflicts_in_decompose,
has_correct_data_connections,
has_correct_data_sources]
has_correct_data_sources,
has_correct_location_of_resample]

ts_rules = [is_pipeline_contains_ts_operations,
only_non_lagged_operations_are_primary,
Expand Down
26 changes: 25 additions & 1 deletion fedot/core/pipelines/verification_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


def has_correct_operations_for_task(pipeline: Pipeline, task_type: Optional[TaskTypesEnum] = None):
    """
    Check that the operation in the root node of the pipeline accepts the given task type
    :param pipeline: pipeline for checking
    :param task_type: task type to look up in the root operation's acceptable task types;
        the check is skipped when it is not given
    :return: True if the check is skipped or passed
    :raises ValueError: if task_type is not among the acceptable task types of the root operation
    """
    if task_type and task_type not in pipeline.root_node.operation.acceptable_task_types:
        raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect operations positions')
    return True

Expand Down Expand Up @@ -152,6 +152,30 @@ def has_no_data_flow_conflicts_in_ts_pipeline(pipeline: Pipeline):
return True


def has_correct_location_of_resample(pipeline: Pipeline):
    """
    Check that 'resample' is used only in primary nodes and is not mixed with other primary operations
    :param pipeline: pipeline for checking
    :return: True when no violation is found
    :raises ValueError: if a non-primary node is named 'resample', or if the primary nodes
        contain both 'resample' and some other operation
    """
    error_message = f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline'

    # Records, for every primary node, whether it is a resample node:
    # {True, False} means resample is mixed with other primary operations.
    primary_is_resample = set()
    for node in pipeline.nodes:
        if node.is_primary:
            primary_is_resample.add(node.name == 'resample')
        elif node.name == 'resample':
            raise ValueError(error_message)

    if primary_is_resample == {True, False}:
        raise ValueError(error_message)
    return True


def get_wrong_links(ts_to_table_operations: list, ts_data_operations: list, non_ts_data_operations: list,
ts_models: list, non_ts_models: list) -> dict:
"""
Expand Down
6 changes: 5 additions & 1 deletion fedot/core/repository/data/model_repository.json
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,10 @@
"meta": "sklearn_class",
"presets": ["fast_train"],
"tags": [
"bayesian", "non_multi", "linear"
"simple",
"bayesian",
"non_multi",
"linear"
]
},
"catboost": {
Expand All @@ -186,6 +189,7 @@
"meta": "sklearn_class",
"presets": ["fast_train", "*tree"],
"tags": [
"simple",
"tree",
"interpretable",
"non_linear"
Expand Down
14 changes: 14 additions & 0 deletions test/integration/models/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@ def classification_dataset():
threshold = 0.5
classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
classes = np.expand_dims(classes, axis=1)
data = InputData(features=MinMaxScaler().fit_transform(x), target=classes, idx=np.arange(0, len(x)),
task=Task(TaskTypesEnum.classification),
data_type=DataTypesEnum.table)
return data


def classification_dataset_with_str_labels():
samples = 1000
x = 10.0 * np.random.rand(samples, ) - 5.0
x = np.expand_dims(x, axis=1)
y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
threshold = 0.5
classes = np.array(['a' if val <= threshold else 'b' for val in y])
classes = np.expand_dims(classes, axis=1)
data = InputData(features=MinMaxScaler().fit_transform(x), target=classes, idx=np.arange(0, len(x)),
task=Task(TaskTypesEnum.classification),
data_type=DataTypesEnum.table)
Expand Down
2 changes: 0 additions & 2 deletions test/unit/composer/test_quality_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
@pytest.fixture()
def data_setup():
predictors, response = load_breast_cancer(return_X_y=True)
np.random.shuffle(predictors)
np.random.shuffle(response)
response = response[:100]
predictors = predictors[:100]

Expand Down
2 changes: 0 additions & 2 deletions test/unit/data/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
@pytest.fixture()
def data_setup() -> InputData:
predictors, response = load_iris(return_X_y=True)
np.random.shuffle(predictors)
np.random.shuffle(response)
predictors = predictors[:100]
response = response[:100]
data = InputData(features=predictors, target=response, idx=np.arange(0, 100),
Expand Down
19 changes: 18 additions & 1 deletion test/unit/optimizer/test_pipeline_objective_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
RegressionMetricsEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum
from fedot.core.validation.split import tabular_cv_generator, OneFoldInputDataSplit
from test.integration.models.test_model import classification_dataset
from test.integration.models.test_model import classification_dataset, classification_dataset_with_str_labels
from test.unit.tasks.test_forecasting import get_simple_ts_pipeline
from test.unit.validation.test_table_cv import sample_pipeline
from test.unit.validation.test_time_series_cv import configure_experiment
Expand Down Expand Up @@ -86,6 +86,23 @@ def test_pipeline_objective_evaluate_with_different_metrics(classification_datas
assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name


@pytest.mark.parametrize(
    'pipeline',
    [pipeline_first_test(), pipeline_second_test(), pipeline_third_test()]
)
def test_pipeline_objective_evaluate_with_different_metrics_with_str_labes(pipeline):
    """Every classification metric must be evaluated on a target with string labels
    and must match the fitness computed by ``actual_fitness`` on a copy of the pipeline."""
    for metric in ClassificationMetricsEnum:
        splitter = OneFoldInputDataSplit()
        data_split = partial(splitter.input_split, input_data=classification_dataset_with_str_labels())
        # the copy is taken before evaluation so the reference run starts from the same pipeline state
        reference_pipeline = deepcopy(pipeline)
        evaluator = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split)
        evaluated = evaluator(pipeline)
        expected = actual_fitness(data_split, reference_pipeline, metric)
        assert evaluated.valid
        assert evaluated.value is not None
        assert np.isclose(evaluated.value, expected.value, atol=1e-8), metric.name


def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset):
pipeline = empty_pipeline()

Expand Down

0 comments on commit 04bceac

Please sign in to comment.