From d4504d2f022e0f2b4c170527dac872c41d75db72 Mon Sep 17 00:00:00 2001 From: nankolena <145366880+nankolena@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:06:07 -0500 Subject: [PATCH] feat: add special datatype for timestamp (#730) * fix import path * add tests * include new annotation type * stash * move timestamp to experimental special class * convert to utc first * add docstring, set GMT as default tz * clean up + add unit tests * import typing for backwards compatibility * add logic to convert to utc within timestamp --- docs/reference/experimental/index.md | 3 + .../multiclass/upload_results.py | 2 +- kolena/_experimental/special_data_type.py | 85 +++++++++++++++++++ kolena/_utils/datatypes.py | 3 + kolena/annotation.py | 1 + tests/integration/dataset/test_dataset.py | 11 ++- .../_experimental/test_special_data_type.py | 83 ++++++++++++++++++ 7 files changed, 184 insertions(+), 4 deletions(-) create mode 100644 kolena/_experimental/special_data_type.py create mode 100644 tests/unit/_experimental/test_special_data_type.py diff --git a/docs/reference/experimental/index.md b/docs/reference/experimental/index.md index cd9ea994c..9a5c16933 100644 --- a/docs/reference/experimental/index.md +++ b/docs/reference/experimental/index.md @@ -24,3 +24,6 @@ options: members: ["download_results_by_tag"] show_root_heading: true +::: kolena._experimental.special_data_type + options: + show_root_heading: true diff --git a/examples/dataset/classification/classification/multiclass/upload_results.py b/examples/dataset/classification/classification/multiclass/upload_results.py index 459ccc61b..f7f08ff26 100644 --- a/examples/dataset/classification/classification/multiclass/upload_results.py +++ b/examples/dataset/classification/classification/multiclass/upload_results.py @@ -22,9 +22,9 @@ from classification.multiclass.constants import DATASET from classification.multiclass.constants import ID_FIELDS +from kolena.annotation import ScoredClassificationLabel from kolena.dataset 
import download_dataset from kolena.dataset import upload_results -from kolena.workflow.annotation import ScoredClassificationLabel MODELS = ["resnet50v2", "inceptionv3"] diff --git a/kolena/_experimental/special_data_type.py b/kolena/_experimental/special_data_type.py new file mode 100644 index 000000000..e36896c8e --- /dev/null +++ b/kolena/_experimental/special_data_type.py @@ -0,0 +1,85 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Special data types supported on the Kolena platform. + +""" # noqa: E501 +from abc import ABCMeta +from datetime import datetime +from typing import Optional + +from kolena._utils.datatypes import DataCategory +from kolena._utils.datatypes import DataType +from kolena._utils.datatypes import TypedDataObject +from kolena._utils.pydantic_v1.dataclasses import dataclass +from kolena._utils.validators import ValidatorConfig + + +class _SpecialDataType(DataType): + TIMESTAMP = "TIMESTAMP" + + @staticmethod + def _data_category() -> DataCategory: + return DataCategory.SPECIAL + + +@dataclass(frozen=True, config=ValidatorConfig) +class SpecialDataType(TypedDataObject[_SpecialDataType], metaclass=ABCMeta): + """The base class for all special data types.""" + + +@dataclass(frozen=True, config=ValidatorConfig) +class Timestamp(SpecialDataType): + """ + !!! note "Experimental" + This class is considered **experimental** + + Timestamp data type. 
+    """
+
+    epoch_time: Optional[float] = None
+    """The epoch time of the timestamp. If `value` and `format` are specified, the `epoch_time` will be calculated."""
+
+    value: Optional[str] = None
+    """
+    The timestamp in a string representation. If present, the corresponding `format` must be specified too.
+    Note that GMT timezone is assumed unless the offset is specified in the string.
+    """
+
+    format: Optional[str] = None
+    """
+    The format of the `value` string following the
+    [python format codes](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).
+    """
+
+    @staticmethod
+    def _data_type() -> _SpecialDataType:
+        return _SpecialDataType.TIMESTAMP
+
+    def __post_init__(self) -> None:
+        if self.value:
+            if not self.format:
+                raise ValueError("format needs to be specified for string timestamp")
+            if "%z" in self.format:
+                time_value = self.value
+                time_format = self.format
+            else:
+                time_value = self.value + " +0000"
+                time_format = self.format + " %z"
+
+            object.__setattr__(
+                self,
+                "epoch_time",
+                datetime.strptime(time_value, time_format).timestamp(),
+            )
diff --git a/kolena/_utils/datatypes.py b/kolena/_utils/datatypes.py
index 1eec79b06..2850d9041 100644
--- a/kolena/_utils/datatypes.py
+++ b/kolena/_utils/datatypes.py
@@ -85,6 +85,7 @@ class DataCategory(str, Enum):
     METRICS = "METRICS"
     ASSET = "ASSET"
     ANNOTATION = "ANNOTATION"
+    SPECIAL = "SPECIAL"
 
     def data_category_to_module_name(self) -> str:
         if self == DataCategory.TEST_SAMPLE:
@@ -97,6 +98,8 @@ def data_category_to_module_name(self) -> str:
             return "kolena.asset"
         if self == DataCategory.ANNOTATION:
             return "kolena.annotation"
+        if self == DataCategory.SPECIAL:
+            return "kolena._experimental.special_data_type"
         raise ValueError(f"Must specify module name for data category: {self}")
 
 
diff --git a/kolena/annotation.py b/kolena/annotation.py
index 02f8fceed..424d3fdf0 100644
--- a/kolena/annotation.py
+++ b/kolena/annotation.py
@@ -62,6 +62,7 @@ class _AnnotationType(DataType):
TIME_SEGMENT = "TIME_SEGMENT" TEXT_SEGMENT = "TEXT_SEGMENT" CUSTOM = "CUSTOM" + TIMESTAMP = "TIMESTAMP" @staticmethod def _data_category() -> DataCategory: diff --git a/tests/integration/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py index d053b1dc2..dfc0b2a12 100644 --- a/tests/integration/dataset/test_dataset.py +++ b/tests/integration/dataset/test_dataset.py @@ -21,6 +21,9 @@ import pytest from kolena._api.v2.dataset import CommitData +from kolena._experimental.special_data_type import Timestamp +from kolena.annotation import BoundingBox +from kolena.annotation import LabeledBoundingBox from kolena.dataset import download_dataset from kolena.dataset import list_datasets from kolena.dataset import upload_dataset @@ -28,8 +31,6 @@ from kolena.dataset.dataset import _load_dataset_metadata from kolena.errors import InputValidationError from kolena.errors import NotFoundError -from kolena.workflow.annotation import BoundingBox -from kolena.workflow.annotation import LabeledBoundingBox from tests.integration.helper import assert_frame_equal from tests.integration.helper import fake_locator from tests.integration.helper import upload_extracted_properties @@ -83,6 +84,8 @@ def test__upload_dataset() -> None: LabeledBoundingBox(label="cat", top_left=[i, i], bottom_right=[i + 10, i + 10]), LabeledBoundingBox(label="dog", top_left=[i + 5, i + 5], bottom_right=[i + 20, i + 20]), ], + time_str=Timestamp(value=f"12/31/2024, 00:00:{'{:02d}'.format(i)}", format="%m/%d/%Y, %H:%M:%S"), + time_num=Timestamp(epoch_time=1735689600 + i), ) for i in range(20) ] @@ -96,10 +99,12 @@ def test__upload_dataset() -> None: BoundingBox(label=bbox.label, top_left=bbox.top_left, bottom_right=bbox.bottom_right) for bbox in dp["bboxes"] ], + time_str=dp["time_str"], + time_num=dp["time_num"], ) for dp in datapoints ] - columns = ["locator", "width", "height", "city", "bboxes"] + columns = ["locator", "width", "height", "city", "bboxes", "time_str", "time_num"] 
upload_dataset(name, pd.DataFrame(datapoints[:10], columns=columns), id_fields=["locator"]) diff --git a/tests/unit/_experimental/test_special_data_type.py b/tests/unit/_experimental/test_special_data_type.py new file mode 100644 index 000000000..8ba781c21 --- /dev/null +++ b/tests/unit/_experimental/test_special_data_type.py @@ -0,0 +1,83 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any +from typing import Dict +from typing import Optional + +import pytest + +from kolena._experimental.special_data_type import _SpecialDataType +from kolena._experimental.special_data_type import Timestamp +from kolena._utils.datatypes import DATA_TYPE_FIELD + + +@pytest.mark.parametrize( + "object, json_data", + [ + ( + Timestamp(epoch_time=1700000000), + { + "epoch_time": 1700000000, + "value": None, + "format": None, + }, + ), + ( + Timestamp(value="12/31/2024, 00:00:00", format="%m/%d/%Y, %H:%M:%S"), + { + "epoch_time": 1735603200, + "value": "12/31/2024, 00:00:00", + "format": "%m/%d/%Y, %H:%M:%S", + }, + ), + ], +) +def test__serde__timestamp(object: Timestamp, json_data: Dict[str, Any]) -> None: + object_dict = object._to_dict() + assert object_dict == { + **json_data, + DATA_TYPE_FIELD: f"{_SpecialDataType._data_category().value}/{_SpecialDataType.TIMESTAMP.value}", + } + assert Timestamp._from_dict(object_dict) == object + + +@pytest.mark.parametrize( + "value, format, epoch_time", + [ + ("12/31/2024, 00:00:00", 
"%m/%d/%Y, %H:%M:%S", 1735603200), + ("25/05/99 02:35:5.523", "%d/%m/%y %H:%M:%S.%f", 927599705.523), + ("2021/05/25", "%Y/%m/%d", 1621900800), + ("2021-05-25 02:35:15", "%Y-%m-%d %H:%M:%S", 1621910115), + ("Tuesday, December 31, 2024 5:00:00 AM", "%A, %B %d, %Y %H:%M:%S %p", 1735621200), + ("Tuesday, December 31, 2024 00:00:00 AM GMT-05:00", "%A, %B %d, %Y %H:%M:%S %p %Z%z", 1735621200), + ("Tuesday, December 31, 2024 00:00:00 AM UTC-05:00", "%A, %B %d, %Y %H:%M:%S %p %Z%z", 1735621200), + ], +) +def test__timestamp_epoch_conversion(value: str, format: str, epoch_time: float) -> None: + timestamp_object = Timestamp(value=value, format=format) + assert epoch_time == timestamp_object.epoch_time + + +@pytest.mark.parametrize( + "value, format", + [ + # value without format + ("12/31/2024, 00:00:00", None), + # format inconsistent with value + ("12/31/2024, 00:00:00", "%m/%d/%Y, %s"), + ], +) +def test__timestamp_validation(value: str, format: Optional[str]) -> None: + with pytest.raises(ValueError): + Timestamp(value=value, format=format)