1059 timestamp bug #1065

Merged 27 commits on Apr 24, 2023.

Commits (27):
8dacb6f
offer fixes
IIaKyJIuH Mar 10, 2023
53103af
add remarkable solution
IIaKyJIuH Mar 10, 2023
d6e646c
come to the most appropriate solution
IIaKyJIuH Mar 13, 2023
132f3f6
format variants
IIaKyJIuH Mar 13, 2023
4c92a10
+date conversion tests
IIaKyJIuH Mar 13, 2023
f43beef
monkeypatched untestable function of strategies
IIaKyJIuH Mar 14, 2023
4f9d060
+conversions to int ms
IIaKyJIuH Mar 22, 2023
938bfe2
exclude copying idx array if it is already numpy
IIaKyJIuH Mar 22, 2023
65b5771
lint fixes (1)
IIaKyJIuH Mar 22, 2023
2aeb2da
+numpy strategy dates handler
IIaKyJIuH Mar 22, 2023
8fa77e7
extend strategies test
IIaKyJIuH Mar 22, 2023
27149de
upd numpy's strategy
IIaKyJIuH Apr 5, 2023
98aba06
upd strategies test
IIaKyJIuH Apr 5, 2023
5ce0438
made separate union testable function
IIaKyJIuH Apr 12, 2023
579ea3d
use separate function&extend strategies
IIaKyJIuH Apr 12, 2023
302f876
simplified datetime conversion
IIaKyJIuH Apr 12, 2023
22b94b5
preserve original shape of data
IIaKyJIuH Apr 12, 2023
87d50ac
fix 'field' variable shadowing
IIaKyJIuH Apr 12, 2023
15a39b8
place module level imports at the top
IIaKyJIuH Apr 12, 2023
4632edd
rid of datetime conversion in definer
IIaKyJIuH Apr 20, 2023
6a304ef
place datetime conversion in preprocessor
IIaKyJIuH Apr 21, 2023
9ec154d
preserve original dtype in datetime converter
IIaKyJIuH Apr 21, 2023
2aa0282
update tests
IIaKyJIuH Apr 21, 2023
533e585
reorganize imports
IIaKyJIuH Apr 21, 2023
9d6a8bf
clarify output dtype
IIaKyJIuH Apr 21, 2023
378b2a0
simplified datetime division
IIaKyJIuH Apr 21, 2023
a5cb31c
remove TODO
IIaKyJIuH Apr 24, 2023
11 changes: 5 additions & 6 deletions fedot/api/api_utils/data_definition.py
@@ -81,7 +81,7 @@ def define_data(self, features: pd.DataFrame,

if isinstance(target, str) and target in features.columns:
target_array = features[target]
-             features = features.drop(columns=[target])
+             features = features.drop(columns=target)
else:
target_array = target

@@ -121,7 +121,6 @@ def define_data(self, features: Union[str, PathLike],
is_predict: bool = False) -> InputData:
# CSV files as input data, by default - table data

-         data_type = DataTypesEnum.table
if task.task_type == TaskTypesEnum.ts_forecasting:
# For time series forecasting format - time series
data = InputData.from_csv_time_series(task=task,
@@ -133,7 +132,7 @@ def define_data(self, features: Union[str, PathLike],
# CSV files as input data
data = InputData.from_csv(features, task=task,
target_columns=target,
-                                       data_type=data_type)
+                                       data_type=DataTypesEnum.table)
return data


@@ -155,9 +154,9 @@ def define_data(self, features: dict,
idx=None) -> MultiModalData:

# change data type to InputData
-         for source in features:
-             if not isinstance(features[source], InputData):
-                 features[source] = array_to_input_data(features_array=features[source], target_array=target,
+         for source, inner_data in features.items():
+             if not isinstance(inner_data, InputData):
+                 features[source] = array_to_input_data(features_array=inner_data, target_array=target,
task=task, idx=idx)
# create labels for data sources
sources = dict((f'{self.source_name_by_type.get(features[data_part_key].data_type.name)}/{data_part_key}',
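Two of the changes above are behavior-preserving cleanups; a short sketch of my own (not part of the PR) shows why. pandas' drop() accepts a scalar column label as well as a list, and iterating a dict with .items() yields the same pairs as repeated key lookups while indexing the dict only once:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'target': [3, 4]})
# a scalar label and a single-item list drop the same column
assert df.drop(columns='target').equals(df.drop(columns=['target']))

features = {'src_a': [1, 2], 'src_b': [3, 4]}
for source, inner_data in features.items():  # one lookup per entry instead of features[source]
    assert inner_data == features[source]
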
59 changes: 38 additions & 21 deletions fedot/core/data/data.py
@@ -8,17 +8,9 @@

import numpy as np
import pandas as pd

from golem.core.log import default_log
from golem.utilities.requirements_notificator import warn_requirement

- #: The list of keyword for auto-detecting csv *tabular* data index. Used in :py:meth:`Data.from_csv`
- #: and :py:meth:`MultiModalData.from_csv`.
- POSSIBLE_TABULAR_IDX_KEYWORDS = ['idx', 'index', 'id', 'unnamed: 0']
- #: The list of keyword for auto-detecting csv *time-series* data index. Used in :py:meth:`Data.from_csv_time_series`,
- #: :py:meth:`Data.from_csv_multi_time_series` and :py:meth:`MultiModalData.from_csv_time_series`.
- POSSIBLE_TS_IDX_KEYWORDS = ['datetime', 'date', 'time', 'unnamed: 0']

try:
import cv2
except ModuleNotFoundError:
@@ -31,6 +23,13 @@
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

+ #: The list of keyword for auto-detecting csv *tabular* data index. Used in :py:meth:`Data.from_csv`
+ #: and :py:meth:`MultiModalData.from_csv`.
+ POSSIBLE_TABULAR_IDX_KEYWORDS = ['idx', 'index', 'id', 'unnamed: 0']
+ #: The list of keyword for auto-detecting csv *time-series* data index. Used in :py:meth:`Data.from_csv_time_series`,
+ #: :py:meth:`Data.from_csv_multi_time_series` and :py:meth:`MultiModalData.from_csv_time_series`.
+ POSSIBLE_TS_IDX_KEYWORDS = ['datetime', 'date', 'time', 'unnamed: 0']

PathType = Union[os.PathLike, str]


@@ -40,11 +39,11 @@ class Data:
Base Data type class
"""

-     idx: np.array
+     idx: np.ndarray
task: Task
data_type: DataTypesEnum
-     features: np.array
-     target: Optional[np.array] = None
+     features: np.ndarray
+     target: Optional[np.ndarray] = None

# Object with supplementary info
supplementary_data: SupplementaryData = field(default_factory=SupplementaryData)
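An aside on the annotation swap just above (a general NumPy fact, not something specific to this PR): np.array is a factory function rather than a type, so it was never a valid annotation; np.ndarray is the actual runtime class of arrays:

import numpy as np

assert isinstance(np.zeros(3), np.ndarray)  # ndarray is the real array type
assert callable(np.array)  # np.array is just a constructor function, not a class
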
@@ -333,19 +332,19 @@ def from_json_files(files_path: str,

if len(fields_to_use) > 1:
fields_to_combine = []
-             for field in fields_to_use:
-                 fields_to_combine.append(np.array(df_data[field]))
+             for field_to_use in fields_to_use:
+                 fields_to_combine.append(np.array(df_data[field_to_use]))
# Unite if the element of text data is divided into strings
-                 if isinstance(df_data[field][0], list):
-                     df_data[field] = [' '.join(piece) for piece in df_data[field]]
+                 if isinstance(df_data[field_to_use][0], list):
+                     df_data[field_to_use] = [' '.join(piece) for piece in df_data[field_to_use]]

features = np.column_stack(tuple(fields_to_combine))
else:
-             field = df_data[fields_to_use[0]]
-             # process field with nested list
-             if isinstance(field[0], list):
-                 field = [' '.join(piece) for piece in field]
-             features = np.array(field)
+             field_to_use = df_data[fields_to_use[0]]
+             # process field_to_use with nested list
+             if isinstance(field_to_use[0], list):
+                 field_to_use = [' '.join(piece) for piece in field_to_use]
+             features = np.array(field_to_use)

if is_multilabel:
target = df_data[label]
@@ -583,6 +582,25 @@ def get_indices_from_file(data_frame, file_path, idx_column='datetime') -> Itera
return np.arange(0, len(data_frame))


+ def np_datetime_to_numeric(data: np.ndarray) -> np.ndarray:
+     """
+     Change data's datetime type to integer with milliseconds unit.
+
+     Args:
+         data: table data for converting.
+
+     Returns:
+         The same table data with datetimes (if existed) converted to integer
+     """
+     orig_shape = data.shape
+     out_dtype = np.int64 if 'datetime' in str((dt := data.dtype)) else dt
+     features_df = pd.DataFrame(data, copy=False).infer_objects()
+     date_cols = features_df.select_dtypes('datetime')
+     converted_cols = date_cols.to_numpy(np.int64) // 1e6  # to 'ms' unit from 'ns'
+     features_df[date_cols.columns] = converted_cols
+     return features_df.to_numpy(out_dtype).reshape(orig_shape)


def array_to_input_data(features_array: np.array,
target_array: np.array,
idx: Optional[np.array] = None,
@@ -606,7 +624,6 @@ def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Uni
possible_idx_keywords: Optional[List[str]] = None, *,
columns_to_drop: Optional[List[Union[str, int]]] = None,
columns_to_use: Optional[List[Union[str, int]]] = None):

def define_index_column(candidate_columns: List[str]) -> Optional[str]:
for column_name in candidate_columns:
if is_column_name_suitable_for_index(column_name):
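For readers who want to see the new converter in action, here is a minimal usage sketch (the import path matches the test file below; the printed values are what the epoch arithmetic should give, so treat them as illustrative):

import numpy as np
import pandas as pd
from fedot.core.data.data import np_datetime_to_numeric

dates = pd.date_range('2000-01-01T10:00:00.100', periods=2, freq='D').to_numpy()
out = np_datetime_to_numeric(dates)
print(out.dtype)  # int64: a pure datetime array comes back as integers
print(out)  # e.g. [946720800100 946807200100], milliseconds since the Unix epoch
# Mixed object arrays keep their original dtype; only the datetime columns are converted.
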
63 changes: 37 additions & 26 deletions fedot/preprocessing/preprocessing.py
@@ -7,7 +7,8 @@
from golem.core.paths import copy_doc
from sklearn.preprocessing import LabelEncoder

- from fedot.core.data.data import InputData, OutputData, data_type_is_table, data_type_is_ts, data_type_is_text
+ from fedot.core.data.data import InputData, np_datetime_to_numeric
+ from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_ts, data_type_is_text
from fedot.core.data.data_preprocessing import (
data_has_categorical_features,
data_has_missing_values,
@@ -87,7 +88,7 @@ def _init_supplementary_preprocessors(self, data: Union[InputData, MultiModalDat
def _init_main_target_source_name(self, multi_data: MultiModalData):
"""
Defines main_target_source_name for MultiModal data branches with main target and the side ones

Args:
multi_data: `MultiModalData`
"""
@@ -118,8 +119,8 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
return data

@copy_doc(BasePreprocessor.obligatory_prepare_for_predict)
-     def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[
-             InputData, MultiModalData]:
+     def obligatory_prepare_for_predict(self,
+                                        data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
if isinstance(data, InputData):
data = self._prepare_obligatory_unimodal_for_predict(data, source_name=DEFAULT_SOURCE_NAME)

@@ -132,8 +133,8 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData])
return data

@copy_doc(BasePreprocessor.optional_prepare_for_fit)
-     def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[
-             InputData, MultiModalData]:
+     def optional_prepare_for_fit(self, pipeline,
+                                  data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
self._init_supplementary_preprocessors(data)

if isinstance(data, InputData):
@@ -148,8 +149,8 @@ def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalDa
return data

@copy_doc(BasePreprocessor.optional_prepare_for_predict)
-     def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[
-             InputData, MultiModalData]:
+     def optional_prepare_for_predict(self, pipeline,
+                                      data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]:
if isinstance(data, InputData):
self._prepare_optional(pipeline, data, DEFAULT_SOURCE_NAME)
else:
Expand All @@ -163,7 +164,7 @@ def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiMod
def _take_only_correct_features(self, data: InputData, source_name: str):
"""
Takes only correct features from the table

Args:
data: to take correct features from
source_name: name of the data source node
@@ -178,20 +179,25 @@ def _take_only_correct_features(self, data: InputData, source_name: str):
def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str) -> InputData:
"""
Processes InputData for pipeline fit method

Args:
data: to be preprocessed
source_name: name of the data source node

Returns:
obligatory-prepared ``data``
"""
if data.supplementary_data.obligatorily_preprocessed:
# Preprocessing was already done - return data
return data

-         # Wrap indices in numpy array
-         data.idx = np.array(data.idx)
+         # Convert datetime data to numerical
+         data.features = np_datetime_to_numeric(data.features)
+         if data.target is not None:
+             data.target = np_datetime_to_numeric(data.target)
+
+         # Wrap indices in numpy array if needed
+         data.idx = np.asarray(data.idx)

# Fix tables / time series sizes
data = self._correct_shapes(data)
@@ -229,18 +235,23 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name: str) -> InputData:
"""
Processes InputData for pipeline predict method

Args:
data: to be preprocessed
source_name: name of the data source node

Returns:
obligatory-prepared data
"""
if data.supplementary_data.obligatorily_preprocessed:
# Preprocessing was already done - return data
return data

+         # Convert datetime data to numerical
+         data.features = np_datetime_to_numeric(data.features)
+         if data.target is not None:
+             data.target = np_datetime_to_numeric(data.target)
+
# Wrap indices in numpy array
data.idx = np.array(data.idx)

@@ -263,7 +274,7 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str):
def _prepare_optional(self, pipeline, data: InputData, source_name: str):
"""
Performs optional fitting/preprocessing for unimodal data

Args:
pipeline: determines if optional preprocessing is needed
data: to be preprocessed
@@ -309,10 +320,10 @@ def _find_features_full_of_nans(self, data: InputData, source_name: str):
def _drop_rows_with_nan_in_target(data: InputData) -> InputData:
"""
Drops rows with nans in target column

Args:
data: to be modified

Returns:
modified ``data``
"""
@@ -339,7 +350,7 @@ def _clean_extra_spaces(data: InputData) -> InputData:
"""
Removes extra spaces from data.
Transforms cells in columns from ' x ' to 'x'

Args:
data: to be stripped

@@ -381,7 +392,7 @@ def _apply_imputation_unidata(self, data: InputData, source_name: str) -> InputD

Args:
data: data for fill in the gaps

Returns:
imputed ``data``
"""
@@ -403,7 +414,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
Args:
data: data to be transformed
source_name: name of the data source node

Returns:
encoded ``data``
"""
@@ -421,7 +432,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
def _train_target_encoder(self, data: InputData, source_name: str):
"""
Trains `LabelEncoder` if the ``data``'s target consists of strings

Args:
data: data to be encoded
source_name: name of the data source node
@@ -444,7 +455,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra
Args:
data: data to be encoded
source_name: name of the data source node

Returns:
encoded ``data``'s target
"""
@@ -483,7 +494,7 @@ def _determine_target_converter(self):
Determines which encoder target to use.
Applicable for inverse target transformation (if there are several targets in
single MultiModal pipeline).

Returns:
selected data source name
"""
Expand All @@ -497,11 +508,11 @@ def _determine_target_converter(self):
def _correct_shapes(data: InputData) -> InputData:
"""
Corrects shapes of tabular data or time series.

Args:
data: time series or tabular. In the first case must be 1d-array, in the second case must be
two-dimensional arrays or array of (n, 1) for texts.

Returns:
corrected tabular data
"""
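One small change in this file is easy to miss: the fit path now wraps indices with np.asarray instead of np.array, which skips the copy when the input is already an ndarray. A sketch of the standard NumPy semantics involved (illustrative, not PR code):

import numpy as np

idx = np.arange(5)
assert np.asarray(idx) is idx  # no copy: the same object comes back
assert np.array(idx) is not idx  # np.array copies by default
print(np.asarray([0, 1, 2]))  # plain sequences are still wrapped into an ndarray

Note also that the datetime conversion is inserted at the very top of both obligatory preprocessing paths, so every later step (shape correction, NaN handling, encoding) already sees numeric features.
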
39 changes: 39 additions & 0 deletions test/unit/data_operations/test_data_definition.py
@@ -0,0 +1,39 @@
from datetime import datetime

import numpy as np
import pandas as pd
import pytest

from fedot.core.data.data import np_datetime_to_numeric

_DATE = '2000-01-01T10:00:00.100'
_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'


@pytest.mark.parametrize('features', [
    np.array([
        [_DATE, datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54, 54.]
    ]),
    np.array([
        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 42]
    ], dtype=object),
    np.array([
        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54., 54]
    ], dtype=object),
    np.array([
        [*pd.date_range(_DATE, periods=3, freq='D').to_numpy(), 54, 54.]
    ], dtype=object),
    np.array([
        [*pd.date_range(_DATE, periods=3, freq='D')]
    ], dtype=np.datetime64),
    pd.date_range(_DATE, periods=3, freq='D').to_numpy(),
    np.array([
        [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE)]
    ]),
    np.array([
        ['without_datetime', 54, 54.]
    ], dtype=object)
])
def test_datetime_erasure(features: np.ndarray):
    result = np_datetime_to_numeric(features)
    assert 'datetime' not in str(pd.DataFrame(result).infer_objects().dtypes)
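As a standalone sanity check of what the assertion verifies, this snippet of mine mirrors the test's "no datetime survives" condition on a mixed object row (assuming only NumPy and pandas are installed alongside FEDOT):

import numpy as np
import pandas as pd
from fedot.core.data.data import np_datetime_to_numeric

features = np.array([[pd.Timestamp('2000-01-01T10:00:00.100'), 'text', 54.]], dtype=object)
result = np_datetime_to_numeric(features)
assert 'datetime' not in str(pd.DataFrame(result).infer_objects().dtypes)  # same check as the test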