From cbe3f9fea2bf66098d012908da45ad7c15fff48a Mon Sep 17 00:00:00 2001 From: maiia Date: Sun, 15 Jan 2023 04:38:45 +0300 Subject: [PATCH 1/3] add stratification for holdout --- fedot/core/data/data_split.py | 9 ++++++++- fedot/core/optimisers/objective/data_source_splitter.py | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index e8f16fa6d9..f086cd9519 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -6,6 +6,7 @@ from fedot.core.data.data import InputData from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum def _split_time_series(data: InputData, task, *args, **kwargs): @@ -110,13 +111,19 @@ def _split_any(data: InputData, task, data_type, split_ratio, with_shuffle=False input_features = data.features input_target = data.target idx = data.idx + if task.task_type == TaskTypesEnum.classification and with_shuffle: + stratify = input_target + else: + stratify = None + idx_train, idx_test, x_train, x_test, y_train, y_test = \ train_test_split(idx, input_features, input_target, test_size=1. 
- split_ratio, shuffle=with_shuffle, - random_state=random_state) + random_state=random_state, + stratify=stratify) # Prepare data to train the operation train_data = InputData(idx=idx_train, features=x_train, target=y_train, diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 61d3fc629d..17607a1ff9 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -90,3 +90,5 @@ def _build_kfolds_producer(self, data: InputData) -> DataSource: self.cv_folds, self.advisor.propose_kfold(data)) return cv_generator + + From f10ffdc36a07078c081209e57906bb1db1efd138 Mon Sep 17 00:00:00 2001 From: maiia Date: Mon, 16 Jan 2023 03:01:13 +0300 Subject: [PATCH 2/3] remove excessive spaces --- fedot/core/optimisers/objective/data_source_splitter.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 17607a1ff9..61d3fc629d 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -90,5 +90,3 @@ def _build_kfolds_producer(self, data: InputData) -> DataSource: self.cv_folds, self.advisor.propose_kfold(data)) return cv_generator - - From 2376f2ada6fb698bf0a65bdeb6c1339c1fc377ae Mon Sep 17 00:00:00 2001 From: maiia Date: Mon, 16 Jan 2023 03:33:55 +0300 Subject: [PATCH 3/3] add tests --- test/unit/data/test_data_split.py | 59 ++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 375826631c..17f0beb90d 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -53,6 +53,50 @@ def get_image_classification_data(): return input_data +def get_imbalanced_data_to_test_mismatch(): + task = 
Task(TaskTypesEnum.classification) + x = np.array([[0, 0, 15], + [0, 1, 2], + [8, 12, 0], + [0, 1, 0], + [1, 1, 0], + [0, 11, 9], + [5, 1, 10], + [8, 16, 4], + [3, 1, 5], + [0, 1, 6], + [2, 7, 9], + [0, 1, 2], + [14, 1, 0], + [0, 4, 10]]) + y = np.array([0, 0, 0, 0, 2, 0, 0, 1, 2, 1, 0, 0, 3, 3]) + input_data = InputData(idx=np.arange(0, len(x)), features=x, + target=y, task=task, data_type=DataTypesEnum.table) + return input_data + + +def get_balanced_data_to_test_mismatch(): + task = Task(TaskTypesEnum.classification) + x = np.array([[0, 0, 15], + [0, 1, 2], + [8, 12, 0], + [0, 1, 0], + [1, 1, 0], + [0, 11, 9], + [5, 1, 10], + [8, 16, 4], + [3, 1, 5], + [0, 1, 6], + [2, 7, 9], + [0, 1, 2], + [14, 1, 0], + [0, 4, 10]]) + y = np.array([0, 1, 2, 3, 2, 1, 0, 1, 2, 1, 0, 0, 3, 3]) + input_data = InputData(idx=np.arange(0, len(x)), features=x, + target=y, task=task, data_type=DataTypesEnum.table) + return input_data + + def test_split_data(): dataframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], @@ -92,6 +136,20 @@ def test_advanced_time_series_splitting(): assert np.allclose(test_data.target, np.array([16, 17, 18, 19])) +@pytest.mark.parametrize('data_splitter, data', + # test StratifiedKFold + [(DataSourceSplitter(cv_folds=3, shuffle=True), get_imbalanced_data_to_test_mismatch()), + # test KFold + (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()), + # test hold-out + (DataSourceSplitter(shuffle=True), get_imbalanced_data_to_test_mismatch())]) +def test_data_splitting_without_shape_mismatch(data_splitter: DataSourceSplitter, data: InputData): + """ Checks that data is split correctly into train and test subsets: the test subset contains no classes absent from the train subset """ + data_source = data_splitter.build(data=data) + for fold_id, (train_data, test_data) in enumerate(data_source()): + assert set(train_data.target) >= set(test_data.target) + + def test_data_splitting_perform_correctly_after_build(): """ Check if data splitting perform correctly through
Objective Builder - Objective Evaluate @@ -105,7 +163,6 @@ def test_data_splitting_perform_correctly_after_build(): # Imitate evaluation process for fold_id, (train_data, test_data) in enumerate(data_source()): - expected_output = output_by_fold[fold_id] assert train_data.features.shape == expected_output['train_features_size'] assert test_data.features.shape == expected_output['test_features_size']