Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add task type information when importing #1422

Merged
merged 26 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## May 2024 Release 1.7.0
### New features
- Add task type information to datasets, exposed through the `task_type()` method
(<https://github.com/openvinotoolkit/datumaro/pull/1422>)

### Enhancements

Expand Down
3 changes: 2 additions & 1 deletion src/datumaro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
RleMask,
)
from .components.cli_plugin import CliPlugin
from .components.dataset import Dataset, DatasetPatch, DatasetSubset, eager_mode
from .components.dataset import Dataset, DatasetPatch, DatasetSubset, StreamDataset, eager_mode
from .components.dataset_base import (
DEFAULT_SUBSET_NAME,
CategoriesInfo,
Expand All @@ -64,6 +64,7 @@
TQDMProgressReporter,
)
from .components.registry import PluginRegistry
from .components.task import TaskType
from .components.transformer import ItemTransform, ModelTransform, Transform
from .components.validator import Validator
from .components.visualizer import Visualizer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@


def select_uninferenced_dataset(dataset):
uninferenced_dataset = Dataset(media_type=MediaElement)
uninferenced_dataset = Dataset(media_type=MediaElement, task_type=dataset.task_type())
for item in dataset:
if not any(isinstance(annotation, HashKey) for annotation in item.annotations):
uninferenced_dataset.put(item)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,9 @@ def get_pruned(self, ratio: float = 0.5) -> Dataset:
source=self._dataset,
)

result_dataset = Dataset(media_type=self._dataset.media_type())
result_dataset = Dataset(
media_type=self._dataset.media_type(), task_type=self._dataset.task_type()
)
result_dataset._source_path = self._dataset._source_path
result_dataset.define_categories(self._dataset.categories())
for item in selected_items:
Expand Down
25 changes: 22 additions & 3 deletions src/datumaro/components/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from datumaro.components.media import Image, MediaElement
from datumaro.components.merge import DEFAULT_MERGE_POLICY
from datumaro.components.progress_reporting import NullProgressReporter, ProgressReporter
from datumaro.components.task import TaskType
from datumaro.components.transformer import ItemTransform, ModelTransform, Transform
from datumaro.util.log_utils import logging_disabled
from datumaro.util.meta_file_util import load_hash_key
Expand Down Expand Up @@ -111,6 +112,9 @@ def categories(self):
def media_type(self):
"""Return the media type reported by ``self.parent``."""
return self.parent.media_type()

def task_type(self):
"""Return the task type reported by ``self.parent``."""
return self.parent.task_type()

def get_annotated_items(self):
"""Count the items of this subset whose ``annotations`` is truthy (non-empty)."""
# Items are read from the parent's storage, restricted to the subset named ``self.name``.
return sum(bool(s.annotations) for s in self.parent._data.get_subset(self.name))

Expand Down Expand Up @@ -160,6 +164,7 @@ def from_iterable(
*,
env: Optional[Environment] = None,
media_type: Type[MediaElement] = Image,
task_type: Optional[TaskType] = TaskType.unlabeled,
) -> Dataset:
"""
Creates a new dataset from an iterable object producing dataset items -
Expand Down Expand Up @@ -199,6 +204,7 @@ def __init__(self):
super().__init__(
length=len(iterable) if hasattr(iterable, "__len__") else None,
media_type=media_type,
task_type=task_type,
)

def __iter__(self):
Expand Down Expand Up @@ -254,6 +260,7 @@ def __init__(
infos: Optional[DatasetInfo] = None,
categories: Optional[CategoriesInfo] = None,
media_type: Optional[Type[MediaElement]] = None,
task_type: Optional[TaskType] = None,
env: Optional[Environment] = None,
) -> None:
super().__init__()
Expand All @@ -263,7 +270,11 @@ def __init__(

self.eager = None
self._data = DatasetStorage(
source, infos=infos, categories=categories, media_type=media_type
source=source,
infos=infos,
categories=categories,
media_type=media_type,
task_type=task_type,
)
if self.is_eager:
self.init_cache()
Expand All @@ -279,6 +290,7 @@ def __repr__(self) -> str:
f"\tsize={len(self._data)}\n"
f"\tsource_path={self._source_path}\n"
f"\tmedia_type={self.media_type()}\n"
f"\ttask_type={self.task_type()}\n"
f"\tannotated_items_count={self.get_annotated_items()}\n"
f"\tannotations_count={self.get_annotations()}\n"
f"subsets\n"
Expand Down Expand Up @@ -319,6 +331,9 @@ def categories(self) -> CategoriesInfo:
def media_type(self) -> Type[MediaElement]:
"""Return the media type reported by the underlying storage (``self._data``)."""
return self._data.media_type()

def task_type(self) -> TaskType:
"""Return the task type reported by the underlying storage (``self._data``)."""
return self._data.task_type()

def get(self, id: str, subset: Optional[str] = None) -> Optional[DatasetItem]:
"""Look up an item by ``id`` and optional ``subset`` in the underlying storage.

The result of ``self._data.get(id, subset)`` is returned unchanged.
"""
return self._data.get(id, subset)

Expand Down Expand Up @@ -869,7 +884,6 @@ def import_from(
extractors.append(
env.make_extractor(src_conf.format, src_conf.url, **extractor_kwargs)
)

dataset = (
cls(source=extractor_merger(extractors), env=env)
if extractor_merger is not None
Expand Down Expand Up @@ -947,13 +961,18 @@ def __init__(
infos: Optional[DatasetInfo] = None,
categories: Optional[CategoriesInfo] = None,
media_type: Optional[Type[MediaElement]] = None,
task_type: Optional[TaskType] = None,
env: Optional[Environment] = None,
) -> None:
assert env is None or isinstance(env, Environment), env
self._env = env

self._data = StreamDatasetStorage(
source, infos=infos, categories=categories, media_type=media_type
source,
infos=infos,
categories=categories,
media_type=media_type,
task_type=task_type,
)

self._format = DEFAULT_FORMAT
Expand Down
25 changes: 24 additions & 1 deletion src/datumaro/components/dataset_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.contexts.importer import ImportContext, NullImportContext
from datumaro.components.media import Image, MediaElement
from datumaro.components.task import TaskType
from datumaro.util.attrs_util import default_if_none, not_empty
from datumaro.util.definitions import DEFAULT_SUBSET_NAME

Expand Down Expand Up @@ -108,6 +109,12 @@ def media_type(self) -> Type[MediaElement]:
"""
raise NotImplementedError()

def task_type(self) -> TaskType:
"""
Returns the task type of the dataset, as determined from the types of
its annotations.

This base implementation always raises NotImplementedError.
"""
raise NotImplementedError()

@property
def is_stream(self) -> bool:
"""Boolean indicating whether the dataset is a stream
Expand All @@ -121,6 +128,7 @@ class _DatasetBase(IDataset):
def __init__(self, *, length: Optional[int] = None, subsets: Optional[Sequence[str]] = None):
"""Store the optional, keyword-only ``length`` and ``subsets`` values.

Both default to ``None`` and are kept as given.
"""
self._length = length
self._subsets = subsets
# Starts empty. NOTE(review): presumably collects the annotation types
# seen in the dataset; it is not filled in within this view — confirm.
self._ann_types = set()

def _init_cache(self):
subsets = set()
Expand Down Expand Up @@ -176,6 +184,9 @@ def categories(_):
def media_type(_):
# ``_`` is the (unused) receiver; ``self`` is not a parameter here, so it
# is taken from the enclosing scope and the call is forwarded to it.
return self.media_type()

def task_type(_):
# ``_`` is the (unused) receiver; ``self`` is not a parameter here, so it
# is taken from the enclosing scope and the call is forwarded to it.
return self.task_type()

return _DatasetFilter()

def infos(self) -> DatasetInfo:
Expand Down Expand Up @@ -205,16 +216,21 @@ def __init__(
length: Optional[int] = None,
subsets: Optional[Sequence[str]] = None,
media_type: Type[MediaElement] = Image,
task_type: Optional[TaskType] = None,
ctx: Optional[ImportContext] = None,
):
super().__init__(length=length, subsets=subsets)

self._ctx: ImportContext = ctx or NullImportContext()
self._media_type = media_type
self._task_type = task_type if task_type else TaskType.unlabeled

def media_type(self):
"""Return the media type stored in ``self._media_type``."""
return self._media_type

def task_type(self):
"""Return the task type stored in ``self._task_type``."""
return self._task_type


class SubsetBase(DatasetBase):
"""
Expand All @@ -228,10 +244,17 @@ def __init__(
length: Optional[int] = None,
subset: Optional[str] = None,
media_type: Type[MediaElement] = Image,
task_type: TaskType = None,
ctx: Optional[ImportContext] = None,
):
self._subset = subset or DEFAULT_SUBSET_NAME
super().__init__(length=length, subsets=[self._subset], media_type=media_type, ctx=ctx)
super().__init__(
length=length,
subsets=[self._subset],
media_type=media_type,
task_type=task_type,
ctx=ctx,
)

self._infos = {}
self._categories = {}
Expand Down
9 changes: 9 additions & 0 deletions src/datumaro/components/dataset_item_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from datumaro.components.annotation import AnnotationType
from datumaro.components.dataset_base import CategoriesInfo, DatasetInfo, DatasetItem, IDataset
from datumaro.components.media import MediaElement
from datumaro.components.task import TaskType
from datumaro.util.definitions import DEFAULT_SUBSET_NAME

__all__ = ["ItemStatus", "DatasetItemStorage", "DatasetItemStorageDatasetView"]
Expand Down Expand Up @@ -169,17 +170,22 @@
def media_type(self):
"""Return the media type reported by ``self.parent``."""
return self.parent.media_type()

def task_type(self):
"""Return the task type reported by ``self.parent``."""
return self.parent.task_type()

Check warning on line 174 in src/datumaro/components/dataset_item_storage.py

View check run for this annotation

Codecov / codecov/patch

src/datumaro/components/dataset_item_storage.py#L174

Added line #L174 was not covered by tests

def __init__(
self,
parent: DatasetItemStorage,
infos: DatasetInfo,
categories: CategoriesInfo,
media_type: Optional[Type[MediaElement]],
task_type: Optional[TaskType],
):
"""Store the parent item storage and the metadata passed alongside it.

Args:
    parent: Item storage kept as ``self._parent``.
    infos: Dataset infos kept as ``self._infos``.
    categories: Categories kept as ``self._categories``.
    media_type: Media type kept as ``self._media_type``; may be ``None``.
    task_type: Task type kept as ``self._task_type``; may be ``None``.
"""
self._parent = parent
self._infos = infos
self._categories = categories
self._media_type = media_type
self._task_type = task_type

def __iter__(self):
"""Yield every element produced by iterating ``self._parent``."""
yield from self._parent
Expand Down Expand Up @@ -207,3 +213,6 @@

def media_type(self):
"""Return the media type stored in ``self._media_type``."""
return self._media_type

def task_type(self):
"""Return the task type stored in ``self._task_type``."""
return self._task_type
Loading
Loading