diff --git a/fedot/api/api_utils/data_definition.py b/fedot/api/api_utils/data_definition.py
index 6af04cfa28..e2f9e694c5 100644
--- a/fedot/api/api_utils/data_definition.py
+++ b/fedot/api/api_utils/data_definition.py
@@ -81,7 +81,7 @@ def define_data(self,
                     features: pd.DataFrame,
 
         if isinstance(target, str) and target in features.columns:
             target_array = features[target]
-            features = features.drop(columns=[target])
+            features = features.drop(columns=target)
         else:
             target_array = target
@@ -121,7 +121,6 @@ def define_data(self, features: Union[str, PathLike],
                     is_predict: bool = False) -> InputData:
 
         # CSV files as input data, by default - table data
-        data_type = DataTypesEnum.table
         if task.task_type == TaskTypesEnum.ts_forecasting:
             # For time series forecasting format - time series
             data = InputData.from_csv_time_series(task=task,
@@ -133,7 +132,7 @@ def define_data(self, features: Union[str, PathLike],
             # CSV files as input data
             data = InputData.from_csv(features,
                                       task=task,
                                       target_columns=target,
-                                      data_type=data_type)
+                                      data_type=DataTypesEnum.table)
         return data
 
@@ -155,9 +154,9 @@ def define_data(self, features: dict,
                     idx=None) -> MultiModalData:
 
         # change data type to InputData
-        for source in features:
-            if not isinstance(features[source], InputData):
-                features[source] = array_to_input_data(features_array=features[source], target_array=target,
+        for source, inner_data in features.items():
+            if not isinstance(inner_data, InputData):
+                features[source] = array_to_input_data(features_array=inner_data, target_array=target,
                                                        task=task, idx=idx)
         # create labels for data sources
         sources = dict((f'{self.source_name_by_type.get(features[data_part_key].data_type.name)}/{data_part_key}',
diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index a26e94cd1e..9d983ddc8c 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -8,17 +8,9 @@
 import numpy as np
 import pandas as pd
-
 from golem.core.log import default_log
 from golem.utilities.requirements_notificator import warn_requirement
 
-#: The list of keyword for auto-detecting csv *tabular* data index. Used in :py:meth:`Data.from_csv`
-#: and :py:meth:`MultiModalData.from_csv`.
-POSSIBLE_TABULAR_IDX_KEYWORDS = ['idx', 'index', 'id', 'unnamed: 0']
-#: The list of keyword for auto-detecting csv *time-series* data index. Used in :py:meth:`Data.from_csv_time_series`,
-#: :py:meth:`Data.from_csv_multi_time_series` and :py:meth:`MultiModalData.from_csv_time_series`.
-POSSIBLE_TS_IDX_KEYWORDS = ['datetime', 'date', 'time', 'unnamed: 0']
-
 try:
     import cv2
 except ModuleNotFoundError:
@@ -31,6 +23,13 @@
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 
+#: The list of keywords for auto-detecting the csv *tabular* data index. Used in :py:meth:`Data.from_csv`
+#: and :py:meth:`MultiModalData.from_csv`.
+POSSIBLE_TABULAR_IDX_KEYWORDS = ['idx', 'index', 'id', 'unnamed: 0']
+#: The list of keywords for auto-detecting the csv *time-series* data index. Used in :py:meth:`Data.from_csv_time_series`,
+#: :py:meth:`Data.from_csv_multi_time_series` and :py:meth:`MultiModalData.from_csv_time_series`.
+POSSIBLE_TS_IDX_KEYWORDS = ['datetime', 'date', 'time', 'unnamed: 0']
+
 PathType = Union[os.PathLike, str]
@@ -40,11 +39,11 @@ class Data:
     """
     Base Data type class
     """
-    idx: np.array
+    idx: np.ndarray
     task: Task
     data_type: DataTypesEnum
-    features: np.array
-    target: Optional[np.array] = None
+    features: np.ndarray
+    target: Optional[np.ndarray] = None
 
     # Object with supplementary info
     supplementary_data: SupplementaryData = field(default_factory=SupplementaryData)
@@ -333,19 +332,19 @@ def from_json_files(files_path: str,
 
         if len(fields_to_use) > 1:
             fields_to_combine = []
-            for field in fields_to_use:
-                fields_to_combine.append(np.array(df_data[field]))
+            for field_to_use in fields_to_use:
+                fields_to_combine.append(np.array(df_data[field_to_use]))
                 # Unite if the element of text data is divided into strings
-                if isinstance(df_data[field][0], list):
-                    df_data[field] = [' '.join(piece) for piece in df_data[field]]
+                if isinstance(df_data[field_to_use][0], list):
+                    df_data[field_to_use] = [' '.join(piece) for piece in df_data[field_to_use]]
 
             features = np.column_stack(tuple(fields_to_combine))
         else:
-            field = df_data[fields_to_use[0]]
-            # process field with nested list
-            if isinstance(field[0], list):
-                field = [' '.join(piece) for piece in field]
-            features = np.array(field)
+            field_to_use = df_data[fields_to_use[0]]
+            # process field_to_use with nested list
+            if isinstance(field_to_use[0], list):
+                field_to_use = [' '.join(piece) for piece in field_to_use]
+            features = np.array(field_to_use)
 
         if is_multilabel:
             target = df_data[label]
@@ -583,6 +582,25 @@ def get_indices_from_file(data_frame, file_path, idx_column='datetime') -> Itera
     return np.arange(0, len(data_frame))
 
 
+def np_datetime_to_numeric(data: np.ndarray) -> np.ndarray:
+    """
+    Converts the data's datetime values to integer milliseconds since the Unix epoch.
+
+    Args:
+        data: table data to be converted.
+
+    Returns:
+        The same table data with datetime values (if any) converted to integers
+    """
+    orig_shape = data.shape
+    out_dtype = np.int64 if 'datetime' in str((dt := data.dtype)) else dt
+    features_df = pd.DataFrame(data, copy=False).infer_objects()
+    date_cols = features_df.select_dtypes('datetime')
+    converted_cols = date_cols.to_numpy(np.int64) // 10 ** 6  # from 'ns' to 'ms' unit; integer division keeps values exact
+    features_df[date_cols.columns] = converted_cols
+    return features_df.to_numpy(out_dtype).reshape(orig_shape)
+
+
 def array_to_input_data(features_array: np.array,
                         target_array: np.array,
                         idx: Optional[np.array] = None,
@@ -606,7 +624,6 @@ def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Uni
                     possible_idx_keywords: Optional[List[str]] = None, *,
                     columns_to_drop: Optional[List[Union[str, int]]] = None,
                     columns_to_use: Optional[List[Union[str, int]]] = None):
-
     def define_index_column(candidate_columns: List[str]) -> Optional[str]:
         for column_name in candidate_columns:
             if is_column_name_suitable_for_index(column_name):
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index fa81ed6b90..1f31490b14 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -7,7 +7,8 @@
 from golem.core.paths import copy_doc
 from sklearn.preprocessing import LabelEncoder
 
-from fedot.core.data.data import InputData, OutputData, data_type_is_table, data_type_is_ts, data_type_is_text
+from fedot.core.data.data import InputData, np_datetime_to_numeric
+from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_ts, data_type_is_text
 from fedot.core.data.data_preprocessing import (
     data_has_categorical_features,
     data_has_missing_values,
@@ -87,7 +88,7 @@ def _init_supplementary_preprocessors(self, data: Union[InputData, MultiModalDat
     def _init_main_target_source_name(self, multi_data: MultiModalData):
         """
         Defines main_target_source_name for MultiModal data branches with main target and the side ones
-        
+
         Args:
             multi_data: `MultiModalData`
         """
@@ -118,8 +119,8 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
         return data
 
     @copy_doc(BasePreprocessor.obligatory_prepare_for_predict)
-    def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[
-        InputData, MultiModalData]:
+    def obligatory_prepare_for_predict(self,
+                                       data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
         if isinstance(data, InputData):
             data = self._prepare_obligatory_unimodal_for_predict(data, source_name=DEFAULT_SOURCE_NAME)
 
@@ -132,8 +133,8 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData])
         return data
 
     @copy_doc(BasePreprocessor.optional_prepare_for_fit)
-    def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[
-        InputData, MultiModalData]:
+    def optional_prepare_for_fit(self, pipeline,
+                                 data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
         self._init_supplementary_preprocessors(data)
 
         if isinstance(data, InputData):
@@ -148,8 +149,8 @@ def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalDa
         return data
 
     @copy_doc(BasePreprocessor.optional_prepare_for_predict)
-    def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[
-        InputData, MultiModalData]:
+    def optional_prepare_for_predict(self, pipeline,
+                                     data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
         if isinstance(data, InputData):
             self._prepare_optional(pipeline, data, DEFAULT_SOURCE_NAME)
         else:
@@ -163,7 +164,7 @@ def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiMod
     def _take_only_correct_features(self, data: InputData, source_name: str):
         """
         Takes only correct features from the table
-        
+
         Args:
             data: to take correct features from
             source_name: name of the data source node
@@ -178,11 +179,11 @@ def _take_only_correct_features(self, data: InputData, source_name: str):
     def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str) -> InputData:
         """
         Processes InputData for pipeline fit method
-        
+
         Args:
             data: to be preprocessed
             source_name: name of the data source node
-        
+
         Returns:
             obligatory-prepared ``data``
         """
@@ -190,8 +191,13 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
            # Preprocessing was already done - return data
            return data
 
-        # Wrap indices in numpy array
-        data.idx = np.array(data.idx)
+        # Convert datetime data to numerical
+        data.features = np_datetime_to_numeric(data.features)
+        if data.target is not None:
+            data.target = np_datetime_to_numeric(data.target)
+
+        # Wrap indices in numpy array if needed
+        data.idx = np.asarray(data.idx)
 
         # Fix tables / time series sizes
         data = self._correct_shapes(data)
@@ -229,11 +235,11 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
     def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name: str) -> InputData:
         """
         Processes InputData for pipeline predict method
-        
+
         Args:
             data: to be preprocessed
             source_name: name of the data source node
-        
+
         Returns:
             obligatory-prepared data
         """
@@ -241,6 +247,11 @@ def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name:
            # Preprocessing was already done - return data
            return data
 
+        # Convert datetime data to numerical
+        data.features = np_datetime_to_numeric(data.features)
+        if data.target is not None:
+            data.target = np_datetime_to_numeric(data.target)
+
         # Wrap indices in numpy array
         data.idx = np.array(data.idx)
 
@@ -263,7 +274,7 @@ def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name:
     def _prepare_optional(self, pipeline, data: InputData, source_name: str):
         """
         Performs optional fitting/preprocessing for unimodal data
-        
+
         Args:
             pipeline: determines if optional preprocessing is needed
             data: to be preprocessed
@@ -309,10 +320,10 @@ def _find_features_full_of_nans(self, data: InputData, source_name: str):
     def _drop_rows_with_nan_in_target(data: InputData) -> InputData:
         """
         Drops rows with nans in target column
-        
+
         Args:
             data: to be modified
-        
+
         Returns:
             modified ``data``
         """
@@ -339,7 +350,7 @@ def _clean_extra_spaces(data: InputData) -> InputData:
         """
         Removes extra spaces from data.
             Transforms cells in columns from ' x ' to 'x'
-        
+
         Args:
             data: to be stripped
 
@@ -381,7 +392,7 @@ def _apply_imputation_unidata(self, data: InputData, source_name: str) -> InputD
 
         Args:
             data: data for fill in the gaps
-        
+
         Returns:
             imputed ``data``
         """
@@ -403,7 +414,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
         Args:
             data: data to be transformed
             source_name: name of the data source node
-        
+
         Returns:
             encoded ``data``
         """
@@ -421,7 +432,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
     def _train_target_encoder(self, data: InputData, source_name: str):
         """
         Trains `LabelEncoder` if the ``data``'s target consists of strings
-        
+
         Args:
             data: data to be encoded
             source_name: name of the data source node
@@ -444,7 +455,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra
         Args:
             data: data to be encoded
             source_name: name of the data source node
-        
+
         Returns:
             encoded ``data``'s target
         """
@@ -483,7 +494,7 @@ def _determine_target_converter(self):
         Determines which encoder target to use. Applicable for inverse target transformation
         (if there are several targets in single MultiModal pipeline).
-        
+
         Returns:
             selected data source name
         """
@@ -497,11 +508,11 @@ def _determine_target_converter(self):
     def _correct_shapes(data: InputData) -> InputData:
         """
         Corrects shapes of tabular data or time series.
-        
+
         Args:
             data: time series or tabular. In the first case must be 1d-array,
                 in the second case must be two-dimensional arrays or array of (n, 1) for texts.
-        
+
         Returns:
             corrected tabular data
         """
diff --git a/test/unit/data_operations/test_data_definition.py b/test/unit/data_operations/test_data_definition.py
new file mode 100644
index 0000000000..3c839b6b5d
--- /dev/null
+++ b/test/unit/data_operations/test_data_definition.py
@@ -0,0 +1,39 @@
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from fedot.core.data.data import np_datetime_to_numeric
+
+_DATE = '2000-01-01T10:00:00.100'
+_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
+
+
+@pytest.mark.parametrize('features', [
+    np.array([
+        [_DATE, datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54, 54.]
+    ]),
+    np.array([
+        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 42]
+    ], dtype=object),
+    np.array([
+        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54., 54]
+    ], dtype=object),
+    np.array([
+        [*pd.date_range(_DATE, periods=3, freq='D').to_numpy(), 54, 54.]
+    ], dtype=object),
+    np.array([
+        [*pd.date_range(_DATE, periods=3, freq='D')]
+    ], dtype=np.datetime64),
+    pd.date_range(_DATE, periods=3, freq='D').to_numpy(),
+    np.array([
+        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE)]
+    ]),
+    np.array([
+        ['without_datetime', 54, 54.]
+    ], dtype=object)
+])
+def test_datetime_erasure(features: np.ndarray):
+    result = np_datetime_to_numeric(features)
+    assert 'datetime' not in str(pd.DataFrame(result).infer_objects().dtypes)
diff --git a/test/unit/data_operations/test_text_preprocessing.py b/test/unit/data_operations/test_text_preprocessing.py
index f85b0b1836..0e9cba37ff 100644
--- a/test/unit/data_operations/test_text_preprocessing.py
+++ b/test/unit/data_operations/test_text_preprocessing.py
@@ -16,7 +16,7 @@ def test_clean_text_preprocessing():
     ]
 
     input_data = InputData(features=np.array(test_text),
-                           target=[0, 1, 1, 0],
+                           target=np.array([0, 1, 1, 0]),
                            idx=np.arange(0, len(test_text)),
                            task=Task(TaskTypesEnum.classification),
                            data_type=DataTypesEnum.text)
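
A minimal usage sketch of the np_datetime_to_numeric helper added in this diff. The helper and its import path come from the diff itself; the sample values below are illustrative and not taken from the FEDOT test suite:

import numpy as np
import pandas as pd

from fedot.core.data.data import np_datetime_to_numeric

# A 2D object array that mixes a datetime column with numeric columns,
# similar to the cases exercised by test_datetime_erasure
features = np.array([
    [pd.Timestamp('2000-01-01T10:00:00.100'), 54, 54.],
    [pd.Timestamp('2000-01-02T10:00:00.100'), 55, 55.],
], dtype=object)

converted = np_datetime_to_numeric(features)
# Shape and numeric columns are preserved; the datetime column now holds
# integer milliseconds since the Unix epoch
assert converted.shape == (2, 3)
assert converted[0][0] == 946720800100  # 2000-01-01T10:00:00.100 in 'ms'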