1059 timestamp bug #1065

Merged 27 commits on Apr 24, 2023.

Commits (27):
8dacb6f
offer fixes
IIaKyJIuH Mar 10, 2023
53103af
add remarkable solution
IIaKyJIuH Mar 10, 2023
d6e646c
come to the most appropriate solution
IIaKyJIuH Mar 13, 2023
132f3f6
format variants
IIaKyJIuH Mar 13, 2023
4c92a10
+date conversion tests
IIaKyJIuH Mar 13, 2023
f43beef
monkeypatched untestable function of strategies
IIaKyJIuH Mar 14, 2023
4f9d060
+conversions to int ms
IIaKyJIuH Mar 22, 2023
938bfe2
exclude copying idx array if it is already numpy
IIaKyJIuH Mar 22, 2023
65b5771
lint fixes (1)
IIaKyJIuH Mar 22, 2023
2aeb2da
+numpy strategy dates handler
IIaKyJIuH Mar 22, 2023
8fa77e7
extend strategies test
IIaKyJIuH Mar 22, 2023
27149de
upd numpy's strategy
IIaKyJIuH Apr 5, 2023
98aba06
upd strategies test
IIaKyJIuH Apr 5, 2023
5ce0438
made separate union testable function
IIaKyJIuH Apr 12, 2023
579ea3d
use separate function&extend strategies
IIaKyJIuH Apr 12, 2023
302f876
simplified datetime conversion
IIaKyJIuH Apr 12, 2023
22b94b5
preserve original shape of data
IIaKyJIuH Apr 12, 2023
87d50ac
fix 'field' variable shadowing
IIaKyJIuH Apr 12, 2023
15a39b8
place module level imports at the top
IIaKyJIuH Apr 12, 2023
4632edd
rid of datetime conversion in definer
IIaKyJIuH Apr 20, 2023
6a304ef
place datetime conversion in preprocessor
IIaKyJIuH Apr 21, 2023
9ec154d
preserve original dtype in datetime converter
IIaKyJIuH Apr 21, 2023
2aa0282
update tests
IIaKyJIuH Apr 21, 2023
533e585
reorganize imports
IIaKyJIuH Apr 21, 2023
9d6a8bf
clarify output dtype
IIaKyJIuH Apr 21, 2023
378b2a0
simplified datetime division
IIaKyJIuH Apr 21, 2023
a5cb31c
remove TODO
IIaKyJIuH Apr 24, 2023
11 changes: 5 additions & 6 deletions fedot/api/api_utils/data_definition.py
@@ -81,7 +81,7 @@ def define_data(self, features: pd.DataFrame,

if isinstance(target, str) and target in features.columns:
target_array = features[target]
-             features = features.drop(columns=[target])
+             features = features.drop(columns=target)
else:
target_array = target

@@ -121,7 +121,6 @@ def define_data(self, features: Union[str, PathLike],
is_predict: bool = False) -> InputData:
# CSV files as input data, by default - table data

-         data_type = DataTypesEnum.table
if task.task_type == TaskTypesEnum.ts_forecasting:
# For time series forecasting format - time series
data = InputData.from_csv_time_series(task=task,
@@ -133,7 +132,7 @@ def define_data(self, features: Union[str, PathLike],
# CSV files as input data
data = InputData.from_csv(features, task=task,
target_columns=target,
-                                       data_type=data_type)
+                                       data_type=DataTypesEnum.table)
return data


@@ -155,9 +154,9 @@ def define_data(self, features: dict,
idx=None) -> MultiModalData:

# change data type to InputData
-         for source in features:
-             if not isinstance(features[source], InputData):
-                 features[source] = array_to_input_data(features_array=features[source], target_array=target,
+         for source, inner_data in features.items():
+             if not isinstance(inner_data, InputData):
+                 features[source] = array_to_input_data(features_array=inner_data, target_array=target,
task=task, idx=idx)
# create labels for data sources
sources = dict((f'{self.source_name_by_type.get(features[data_part_key].data_type.name)}/{data_part_key}',
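Two of the changes above are behavior-preserving cleanups; a short sketch of my own (not part of the PR) shows why. pandas' drop() accepts a scalar column label as well as a list, and iterating a dict with .items() yields the same pairs as repeated key lookups while indexing the dict only once:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'target': [3, 4]})
# a scalar label and a single-item list drop the same column
assert df.drop(columns='target').equals(df.drop(columns=['target']))

features = {'src_a': [1, 2], 'src_b': [3, 4]}
for source, inner_data in features.items():  # one lookup per entry instead of features[source]
    assert inner_data == features[source]
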
59 changes: 38 additions & 21 deletions fedot/core/data/data.py
@@ -8,17 +8,9 @@

import numpy as np
import pandas as pd

from golem.core.log import default_log
from golem.utilities.requirements_notificator import warn_requirement

- #: The list of keyword for auto-detecting csv *tabular* data index. Used in :py:meth:`Data.from_csv`
- #: and :py:meth:`MultiModalData.from_csv`.
- POSSIBLE_TABULAR_IDX_KEYWORDS = ['idx', 'index', 'id', 'unnamed: 0']
- #: The list of keyword for auto-detecting csv *time-series* data index. Used in :py:meth:`Data.from_csv_time_series`,
- #: :py:meth:`Data.from_csv_multi_time_series` and :py:meth:`MultiModalData.from_csv_time_series`.
- POSSIBLE_TS_IDX_KEYWORDS = ['datetime', 'date', 'time', 'unnamed: 0']

try:
import cv2
except ModuleNotFoundError:
@@ -31,6 +23,13 @@
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

+ #: The list of keyword for auto-detecting csv *tabular* data index. Used in :py:meth:`Data.from_csv`
+ #: and :py:meth:`MultiModalData.from_csv`.
+ POSSIBLE_TABULAR_IDX_KEYWORDS = ['idx', 'index', 'id', 'unnamed: 0']
+ #: The list of keyword for auto-detecting csv *time-series* data index. Used in :py:meth:`Data.from_csv_time_series`,
+ #: :py:meth:`Data.from_csv_multi_time_series` and :py:meth:`MultiModalData.from_csv_time_series`.
+ POSSIBLE_TS_IDX_KEYWORDS = ['datetime', 'date', 'time', 'unnamed: 0']

PathType = Union[os.PathLike, str]


@@ -40,11 +39,11 @@ class Data:
Base Data type class
"""

-     idx: np.array
+     idx: np.ndarray
task: Task
data_type: DataTypesEnum
-     features: np.array
-     target: Optional[np.array] = None
+     features: np.ndarray
+     target: Optional[np.ndarray] = None

# Object with supplementary info
supplementary_data: SupplementaryData = field(default_factory=SupplementaryData)
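An aside on the annotation swap just above (a general NumPy fact, not something specific to this PR): np.array is a factory function rather than a type, so it was never a valid annotation; np.ndarray is the actual runtime class of arrays:

import numpy as np

assert isinstance(np.zeros(3), np.ndarray)  # ndarray is the real array type
assert callable(np.array)  # np.array is just a constructor function, not a class
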
@@ -333,19 +332,19 @@ def from_json_files(files_path: str,

if len(fields_to_use) > 1:
fields_to_combine = []
-             for field in fields_to_use:
-                 fields_to_combine.append(np.array(df_data[field]))
+             for field_to_use in fields_to_use:
+                 fields_to_combine.append(np.array(df_data[field_to_use]))
# Unite if the element of text data is divided into strings
-                 if isinstance(df_data[field][0], list):
-                     df_data[field] = [' '.join(piece) for piece in df_data[field]]
+                 if isinstance(df_data[field_to_use][0], list):
+                     df_data[field_to_use] = [' '.join(piece) for piece in df_data[field_to_use]]

features = np.column_stack(tuple(fields_to_combine))
else:
-             field = df_data[fields_to_use[0]]
-             # process field with nested list
-             if isinstance(field[0], list):
-                 field = [' '.join(piece) for piece in field]
-             features = np.array(field)
+             field_to_use = df_data[fields_to_use[0]]
+             # process field_to_use with nested list
+             if isinstance(field_to_use[0], list):
+                 field_to_use = [' '.join(piece) for piece in field_to_use]
+             features = np.array(field_to_use)

if is_multilabel:
target = df_data[label]
@@ -583,6 +582,25 @@ def get_indices_from_file(data_frame, file_path, idx_column='datetime') -> Itera
return np.arange(0, len(data_frame))


+ def np_datetime_to_numeric(data: np.ndarray) -> np.ndarray:
+     """
+     Change data's datetime type to integer with milliseconds unit.
+
+     Args:
+         data: table data for converting.
+
+     Returns:
+         The same table data with datetimes (if existed) converted to integer
+     """
+     orig_shape = data.shape
+     out_dtype = np.int64 if 'datetime' in str((dt := data.dtype)) else dt
+     features_df = pd.DataFrame(data, copy=False).infer_objects()
+     date_cols = features_df.select_dtypes('datetime')
+     converted_cols = date_cols.to_numpy(np.int64) // 1e6  # to 'ms' unit from 'ns'
+     features_df[date_cols.columns] = converted_cols
+     return features_df.to_numpy(out_dtype).reshape(orig_shape)


def array_to_input_data(features_array: np.array,
target_array: np.array,
idx: Optional[np.array] = None,
@@ -606,7 +624,6 @@ def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Uni
possible_idx_keywords: Optional[List[str]] = None, *,
columns_to_drop: Optional[List[Union[str, int]]] = None,
columns_to_use: Optional[List[Union[str, int]]] = None):

def define_index_column(candidate_columns: List[str]) -> Optional[str]:
for column_name in candidate_columns:
if is_column_name_suitable_for_index(column_name):
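For readers who want to see the new converter in action, here is a minimal usage sketch (the import path matches the test file below; the printed values are what the epoch arithmetic should give, so treat them as illustrative):

import numpy as np
import pandas as pd
from fedot.core.data.data import np_datetime_to_numeric

dates = pd.date_range('2000-01-01T10:00:00.100', periods=2, freq='D').to_numpy()
out = np_datetime_to_numeric(dates)
print(out.dtype)  # int64: a pure datetime array comes back as integers
print(out)  # e.g. [946720800100 946807200100], milliseconds since the Unix epoch
# Mixed object arrays keep their original dtype; only the datetime columns are converted.
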
63 changes: 37 additions & 26 deletions fedot/preprocessing/preprocessing.py
@@ -7,7 +7,8 @@
from golem.core.paths import copy_doc
from sklearn.preprocessing import LabelEncoder

- from fedot.core.data.data import InputData, OutputData, data_type_is_table, data_type_is_ts, data_type_is_text
+ from fedot.core.data.data import InputData, np_datetime_to_numeric
+ from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_ts, data_type_is_text
from fedot.core.data.data_preprocessing import (
data_has_categorical_features,
data_has_missing_values,
@@ -87,7 +88,7 @@ def _init_supplementary_preprocessors(self, data: Union[InputData, MultiModalDat
def _init_main_target_source_name(self, multi_data: MultiModalData):
"""
Defines main_target_source_name for MultiModal data branches with main target and the side ones

Args:
multi_data: `MultiModalData`
"""
@@ -118,8 +119,8 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
return data

@copy_doc(BasePreprocessor.obligatory_prepare_for_predict)
-     def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[
-             InputData, MultiModalData]:
+     def obligatory_prepare_for_predict(self,
+                                        data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
if isinstance(data, InputData):
data = self._prepare_obligatory_unimodal_for_predict(data, source_name=DEFAULT_SOURCE_NAME)

@@ -132,8 +133,8 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData])
return data

@copy_doc(BasePreprocessor.optional_prepare_for_fit)
-     def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[
-             InputData, MultiModalData]:
+     def optional_prepare_for_fit(self, pipeline,
+                                  data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
self._init_supplementary_preprocessors(data)

if isinstance(data, InputData):
@@ -148,8 +149,8 @@ def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalDa
return data

@copy_doc(BasePreprocessor.optional_prepare_for_predict)
-     def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[
-             InputData, MultiModalData]:
+     def optional_prepare_for_predict(self, pipeline,
+                                      data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
if isinstance(data, InputData):
self._prepare_optional(pipeline, data, DEFAULT_SOURCE_NAME)
else:
Expand All @@ -163,7 +164,7 @@ def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiMod
def _take_only_correct_features(self, data: InputData, source_name: str):
"""
Takes only correct features from the table

Args:
data: to take correct features from
source_name: name of the data source node
@@ -178,20 +179,25 @@ def _take_only_correct_features(self, data: InputData, source_name: str):
def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str) -> InputData:
"""
Processes InputData for pipeline fit method

Args:
data: to be preprocessed
source_name: name of the data source node

Returns:
obligatory-prepared ``data``
"""
if data.supplementary_data.obligatorily_preprocessed:
# Preprocessing was already done - return data
return data

-         # Wrap indices in numpy array
-         data.idx = np.array(data.idx)
+         # Convert datetime data to numerical
+         data.features = np_datetime_to_numeric(data.features)
+         if data.target is not None:
+             data.target = np_datetime_to_numeric(data.target)
+
+         # Wrap indices in numpy array if needed
+         data.idx = np.asarray(data.idx)

# Fix tables / time series sizes
data = self._correct_shapes(data)
@@ -229,18 +235,23 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name: str) -> InputData:
"""
Processes InputData for pipeline predict method

Args:
data: to be preprocessed
source_name: name of the data source node

Returns:
obligatory-prepared data
"""
if data.supplementary_data.obligatorily_preprocessed:
# Preprocessing was already done - return data
return data

+         # Convert datetime data to numerical
+         data.features = np_datetime_to_numeric(data.features)
+         if data.target is not None:
+             data.target = np_datetime_to_numeric(data.target)
+
# Wrap indices in numpy array
data.idx = np.array(data.idx)

@@ -263,7 +274,7 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str):
def _prepare_optional(self, pipeline, data: InputData, source_name: str):
"""
Performs optional fitting/preprocessing for unimodal data

Args:
pipeline: determines if optional preprocessing is needed
data: to be preprocessed
@@ -309,10 +320,10 @@ def _find_features_full_of_nans(self, data: InputData, source_name: str):
def _drop_rows_with_nan_in_target(data: InputData) -> InputData:
"""
Drops rows with nans in target column

Args:
data: to be modified

Returns:
modified ``data``
"""
@@ -339,7 +350,7 @@ def _clean_extra_spaces(data: InputData) -> InputData:
"""
Removes extra spaces from data.
Transforms cells in columns from ' x ' to 'x'

Args:
data: to be stripped

@@ -381,7 +392,7 @@ def _apply_imputation_unidata(self, data: InputData, source_name: str) -> InputD

Args:
data: data for fill in the gaps

Returns:
imputed ``data``
"""
@@ -403,7 +414,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
Args:
data: data to be transformed
source_name: name of the data source node

Returns:
encoded ``data``
"""
@@ -421,7 +432,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
def _train_target_encoder(self, data: InputData, source_name: str):
"""
Trains `LabelEncoder` if the ``data``'s target consists of strings

Args:
data: data to be encoded
source_name: name of the data source node
@@ -444,7 +455,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra
Args:
data: data to be encoded
source_name: name of the data source node

Returns:
encoded ``data``'s target
"""
@@ -483,7 +494,7 @@ def _determine_target_converter(self):
Determines which encoder target to use.
Applicable for inverse target transformation (if there are several targets in
single MultiModal pipeline).

Returns:
selected data source name
"""
Expand All @@ -497,11 +508,11 @@ def _determine_target_converter(self):
def _correct_shapes(data: InputData) -> InputData:
"""
Corrects shapes of tabular data or time series.

Args:
data: time series or tabular. In the first case must be 1d-array, in the second case must be
two-dimensional arrays or array of (n, 1) for texts.

Returns:
corrected tabular data
"""
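One small change in this file is easy to miss: the fit path now wraps indices with np.asarray instead of np.array, which skips the copy when the input is already an ndarray. A sketch of the standard NumPy semantics involved (illustrative, not PR code):

import numpy as np

idx = np.arange(5)
assert np.asarray(idx) is idx  # no copy: the same object comes back
assert np.array(idx) is not idx  # np.array copies by default
print(np.asarray([0, 1, 2]))  # plain sequences are still wrapped into an ndarray

Note also that the datetime conversion is inserted at the very top of both obligatory preprocessing paths, so every later step (shape correction, NaN handling, encoding) already sees numeric features.
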
39 changes: 39 additions & 0 deletions test/unit/data_operations/test_data_definition.py
@@ -0,0 +1,39 @@
from datetime import datetime

import numpy as np
import pandas as pd
import pytest

from fedot.core.data.data import np_datetime_to_numeric

_DATE = '2000-01-01T10:00:00.100'
_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'


@pytest.mark.parametrize('features', [
    np.array([
        [_DATE, datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54, 54.]
    ]),
    np.array([
        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 42]
    ], dtype=object),
    np.array([
        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54., 54]
    ], dtype=object),
    np.array([
        [*pd.date_range(_DATE, periods=3, freq='D').to_numpy(), 54, 54.]
    ], dtype=object),
    np.array([
        [*pd.date_range(_DATE, periods=3, freq='D')]
    ], dtype=np.datetime64),
    pd.date_range(_DATE, periods=3, freq='D').to_numpy(),
    np.array([
        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE)]
    ]),
    np.array([
        ['without_datetime', 54, 54.]
    ], dtype=object)
])
def test_datetime_erasure(features: np.ndarray):
    result = np_datetime_to_numeric(features)
    assert 'datetime' not in str(pd.DataFrame(result).infer_objects().dtypes)
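As a standalone sanity check of what the assertion verifies, this snippet of mine mirrors the test's "no datetime survives" condition on a mixed object row (assuming only NumPy and pandas are installed alongside FEDOT):

import numpy as np
import pandas as pd
from fedot.core.data.data import np_datetime_to_numeric

features = np.array([[pd.Timestamp('2000-01-01T10:00:00.100'), 'text', 54.]], dtype=object)
result = np_datetime_to_numeric(features)
assert 'datetime' not in str(pd.DataFrame(result).infer_objects().dtypes)  # same check as the test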