diff --git a/datumaro/datumaro/cli/project/__init__.py b/datumaro/datumaro/cli/project/__init__.py
index bd43a72db51c..667957a92656 100644
--- a/datumaro/datumaro/cli/project/__init__.py
+++ b/datumaro/datumaro/cli/project/__init__.py
@@ -7,6 +7,7 @@
 import logging as log
 import os
 import os.path as osp
+import shutil
 
 from datumaro.components.project import Project
 from datumaro.components.comparator import Comparator
@@ -27,10 +28,13 @@ def create_command(args):
     project_dir = osp.abspath(args.dst_dir)
     project_path = make_project_path(project_dir)
 
-    if not args.overwrite and osp.isdir(project_dir) and os.listdir(project_dir):
-        log.error("Directory '%s' already exists "
-            "(pass --overwrite to force creation)" % project_dir)
-        return 1
+    if osp.isdir(project_dir) and os.listdir(project_dir):
+        if not args.overwrite:
+            log.error("Directory '%s' already exists "
+                "(pass --overwrite to force creation)" % project_dir)
+            return 1
+        else:
+            shutil.rmtree(project_dir)
     os.makedirs(project_dir, exist_ok=args.overwrite)
 
     if not args.overwrite and osp.isfile(project_path):
@@ -76,10 +80,13 @@ def import_command(args):
     project_dir = osp.abspath(args.dst_dir)
     project_path = make_project_path(project_dir)
 
-    if not args.overwrite and osp.isdir(project_dir) and os.listdir(project_dir):
-        log.error("Directory '%s' already exists "
-            "(pass --overwrite to force creation)" % project_dir)
-        return 1
+    if osp.isdir(project_dir) and os.listdir(project_dir):
+        if not args.overwrite:
+            log.error("Directory '%s' already exists "
+                "(pass --overwrite to force creation)" % project_dir)
+            return 1
+        else:
+            shutil.rmtree(project_dir)
     os.makedirs(project_dir, exist_ok=args.overwrite)
 
     if not args.overwrite and osp.isfile(project_path):
@@ -137,7 +144,11 @@ def export_command(args):
     dst_dir = osp.abspath(args.dst_dir)
     os.makedirs(dst_dir, exist_ok=False)
 
-    project.make_dataset().export(
+    log.info("Loading the project...")
+    dataset = project.make_dataset()
+
+    log.info("Exporting the project...")
+    dataset.export(
         save_dir=dst_dir,
         output_format=args.output_format,
         filter_expr=args.filter,
diff --git a/datumaro/datumaro/cli/source/__init__.py b/datumaro/datumaro/cli/source/__init__.py
index 605d222b2131..a0f7c688b054 100644
--- a/datumaro/datumaro/cli/source/__init__.py
+++ b/datumaro/datumaro/cli/source/__init__.py
@@ -194,8 +194,12 @@ def export_command(args):
     dst_dir = osp.abspath(args.dst_dir)
     os.makedirs(dst_dir, exist_ok=False)
 
+    log.info("Loading the project...")
     source_project = project.make_source_project(args.name)
-    source_project.make_dataset().export(
+    dataset = source_project.make_dataset()
+
+    log.info("Exporting the project...")
+    dataset.export(
         save_dir=dst_dir,
         output_format=args.output_format,
         filter_expr=args.filter,
diff --git a/datumaro/datumaro/components/converters/__init__.py b/datumaro/datumaro/components/converters/__init__.py
index 26c3710966b6..a78ba4a7e202 100644
--- a/datumaro/datumaro/components/converters/__init__.py
+++ b/datumaro/datumaro/components/converters/__init__.py
@@ -24,10 +24,7 @@
 )
 
 from datumaro.components.converters.yolo import YoloConverter
-
-from datumaro.components.converters.tfrecord import (
-    DetectionApiConverter,
-)
+from datumaro.components.converters.tfrecord import DetectionApiConverter
 
 
 items = [
diff --git a/datumaro/datumaro/components/converters/datumaro.py b/datumaro/datumaro/components/converters/datumaro.py
index 4e80b5855aa4..246d1911529a 100644
--- a/datumaro/datumaro/components/converters/datumaro.py
+++ b/datumaro/datumaro/components/converters/datumaro.py
@@ -11,8 +11,7 @@
 
 from datumaro.components.converter import Converter
 from datumaro.components.extractor import (
-    DEFAULT_SUBSET_NAME,
-    AnnotationType, Annotation,
+    DEFAULT_SUBSET_NAME, Annotation,
     LabelObject, MaskObject, PointsObject, PolygonObject,
     PolyLineObject, BboxObject, CaptionObject,
     LabelCategories, MaskCategories, PointsCategories
@@ -52,11 +51,13 @@ def items(self):
 
     def write_item(self, item):
         annotations = []
-        self.items.append({
+        item_desc = {
             'id': item.id,
-            'path': item.path,
             'annotations': annotations,
-        })
+        }
+        if item.path:
+            item_desc['path'] = item.path
+        self.items.append(item_desc)
 
         for ann in item.annotations:
             if isinstance(ann, LabelObject):
diff --git a/datumaro/datumaro/components/extractor.py b/datumaro/datumaro/components/extractor.py
index bf73a9f05af9..733e84bb36b0 100644
--- a/datumaro/datumaro/components/extractor.py
+++ b/datumaro/datumaro/components/extractor.py
@@ -53,6 +53,11 @@ class Categories:
     def __init__(self, attributes=None):
         if attributes is None:
             attributes = set()
+        else:
+            if not isinstance(attributes, set):
+                attributes = set(attributes)
+            for attr in attributes:
+                assert isinstance(attr, str)
         self.attributes = attributes
 
     def __eq__(self, other):
@@ -62,7 +67,7 @@ def __eq__(self, other):
             (self.attributes == other.attributes)
 
 class LabelCategories(Categories):
-    Category = namedtuple('Category', ['name', 'parent'])
+    Category = namedtuple('Category', ['name', 'parent', 'attributes'])
 
     def __init__(self, items=None, attributes=None):
         super().__init__(attributes=attributes)
@@ -81,11 +86,18 @@ def _reindex(self):
             indices[item.name] = index
         self._indices = indices
 
-    def add(self, name, parent=None):
+    def add(self, name, parent=None, attributes=None):
         assert name not in self._indices
+        if attributes is None:
+            attributes = set()
+        else:
+            if not isinstance(attributes, set):
+                attributes = set(attributes)
+            for attr in attributes:
+                assert isinstance(attr, str)
         index = len(self.items)
-        self.items.append(self.Category(name, parent))
+        self.items.append(self.Category(name, parent, attributes))
         self._indices[name] = index
 
     def find(self, name):
@@ -462,6 +474,7 @@ def __eq__(self, other):
             (self.id == other.id) and \
             (self.subset == other.subset) and \
             (self.annotations == other.annotations) and \
+            (self.path == other.path) and \
             (self.has_image == other.has_image) and \
             (self.has_image and np.all(self.image == other.image) or \
                 not self.has_image)
diff --git a/datumaro/datumaro/components/extractors/__init__.py b/datumaro/datumaro/components/extractors/__init__.py
index 9820a27df4d2..6e5f323b9b5c 100644
--- a/datumaro/datumaro/components/extractors/__init__.py
+++ b/datumaro/datumaro/components/extractors/__init__.py
@@ -26,14 +26,9 @@
     VocComp_9_10_Extractor,
 )
 
-from datumaro.components.extractors.yolo import (
-    YoloExtractor,
-)
-
-from datumaro.components.extractors.tfrecord import (
-    DetectionApiExtractor,
-)
-
+from datumaro.components.extractors.yolo import YoloExtractor
+from datumaro.components.extractors.tfrecord import DetectionApiExtractor
+from datumaro.components.extractors.cvat import CvatExtractor
 
 items = [
     ('datumaro', DatumaroExtractor),
@@ -59,4 +54,6 @@
     ('yolo', YoloExtractor),
 
     ('tf_detection_api', DetectionApiExtractor),
+
+    ('cvat', CvatExtractor),
 ]
\ No newline at end of file
diff --git a/datumaro/datumaro/components/extractors/cvat.py b/datumaro/datumaro/components/extractors/cvat.py
new file mode 100644
index 000000000000..200fe88e9e79
--- /dev/null
+++ b/datumaro/datumaro/components/extractors/cvat.py
@@ -0,0 +1,286 @@
+
+# Copyright (C) 2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from collections import OrderedDict
+import os.path as osp
+import xml.etree as ET
+
+from datumaro.components.extractor import (Extractor, DatasetItem,
+    DEFAULT_SUBSET_NAME, AnnotationType,
+    PointsObject, PolygonObject, PolyLineObject, BboxObject,
+    LabelCategories
+)
+from datumaro.components.formats.cvat import CvatPath
+from datumaro.util.image import lazy_image
+
+
+class CvatExtractor(Extractor):
+    _SUPPORTED_SHAPES = ('box', 'polygon', 'polyline', 'points')
+
+    def __init__(self, path):
+        super().__init__()
+
+        assert osp.isfile(path)
+        rootpath = path.rsplit(CvatPath.ANNOTATIONS_DIR, maxsplit=1)[0]
+        self._path = rootpath
+
+        subset = osp.splitext(osp.basename(path))[0]
+        if subset == DEFAULT_SUBSET_NAME:
+            subset = None
+        self._subset = subset
+
+        items, categories = self._parse(path)
+        self._items = self._load_items(items)
+        self._categories = categories
+
+    def categories(self):
+        return self._categories
+
+    def __iter__(self):
+        for item in self._items.values():
+            yield item
+
+    def __len__(self):
+        return len(self._items)
+
+    def subsets(self):
+        if self._subset:
+            return [self._subset]
+        return None
+
+    def get_subset(self, name):
+        if name != self._subset:
+            return None
+        return self
+
+    @classmethod
+    def _parse(cls, path):
+        context = ET.ElementTree.iterparse(path, events=("start", "end"))
+        context = iter(context)
+
+        categories = cls._parse_meta(context)
+
+        items = OrderedDict()
+
+        track = None
+        shape = None
+        image = None
+        for ev, el in context:
+            if ev == 'start':
+                if el.tag == 'track':
+                    track = {
+                        'id': el.attrib.get('id'),
+                        'label': el.attrib.get('label'),
+                        'group': int(el.attrib.get('group_id', 0)),
+                    }
+                elif el.tag == 'image':
+                    image = {
+                        'name': el.attrib.get('name'),
+                        'frame': el.attrib['id'],
+                    }
+                elif el.tag in cls._SUPPORTED_SHAPES and (track or image):
+                    shape = {
+                        'type': None,
+                        'attributes': {},
+                    }
+                    if track:
+                        shape.update(track)
+                    if image:
+                        shape.update(image)
+            elif ev == 'end':
+                if el.tag == 'attribute' and shape is not None:
+                    shape['attributes'][el.attrib['name']] = el.text
+                elif el.tag in cls._SUPPORTED_SHAPES:
+                    if track is not None:
+                        shape['frame'] = el.attrib['frame']
+                        shape['outside'] = (el.attrib.get('outside') == '1')
+                        shape['keyframe'] = (el.attrib.get('keyframe') == '1')
+                    if image is not None:
+                        shape['label'] = el.attrib.get('label')
+                        shape['group'] = int(el.attrib.get('group_id', 0))
+
+                    shape['type'] = el.tag
+                    shape['occluded'] = (el.attrib.get('occluded') == '1')
+                    shape['z_order'] = int(el.attrib.get('z_order', 0))
+
+                    if el.tag == 'box':
+                        shape['points'] = list(map(float, [
+                            el.attrib['xtl'], el.attrib['ytl'],
+                            el.attrib['xbr'], el.attrib['ybr'],
+                        ]))
+                    else:
+                        shape['points'] = []
+                        for pair in el.attrib['points'].split(';'):
+                            shape['points'].extend(map(float, pair.split(',')))
+
+                    frame_desc = items.get(shape['frame'], {
+                        'name': shape.get('name'),
+                        'annotations': [],
+                    })
+                    frame_desc['annotations'].append(
+                        cls._parse_ann(shape, categories))
+                    items[shape['frame']] = frame_desc
+                    shape = None
+
+                elif el.tag == 'track':
+                    track = None
+                elif el.tag == 'image':
+                    image = None
+                el.clear()
+
+        return items, categories
+
+    @staticmethod
+    def _parse_meta(context):
+        ev, el = next(context)
+        if not (ev == 'start' and el.tag == 'annotations'):
+            raise Exception("Unexpected token ")
+
+        categories = {}
+
+        has_z_order = False
+        mode = 'annotation'
+        labels = OrderedDict()
+        label = None
+
+        # Recursive descent parser
+        el = None
+        states = ['annotations']
+        def accepted(expected_state, tag, next_state=None):
+            state = states[-1]
+            if state == expected_state and el is not None and el.tag == tag:
+                if not next_state:
+                    next_state = tag
+                states.append(next_state)
+                return True
+            return False
+        def consumed(expected_state, tag):
+            state = states[-1]
+            if state == expected_state and el is not None and el.tag == tag:
+                states.pop()
+                return True
+            return False
+
+        for ev, el in context:
+            if ev == 'start':
+                if accepted('annotations', 'meta'): pass
+                elif accepted('meta', 'task'): pass
+                elif accepted('task', 'z_order'): pass
+                elif accepted('task', 'labels'): pass
+                elif accepted('labels', 'label'):
+                    label = { 'name': None, 'attributes': set() }
+                elif accepted('label', 'name', next_state='label_name'): pass
+                elif accepted('label', 'attributes'): pass
+                elif accepted('attributes', 'attribute'): pass
+                elif accepted('attribute', 'name', next_state='attr_name'): pass
+                elif accepted('annotations', 'image') or \
+                        accepted('annotations', 'track') or \
+                        accepted('annotations', 'tag'):
+                    break
+                else:
+                    pass
+            elif ev == 'end':
+                if consumed('meta', 'meta'):
+                    break
+                elif consumed('task', 'task'): pass
+                elif consumed('z_order', 'z_order'):
+                    has_z_order = (el.text == 'True')
+                elif consumed('label_name', 'name'):
+                    label['name'] = el.text
+                elif consumed('attr_name', 'name'):
+                    label['attributes'].add(el.text)
+                elif consumed('attribute', 'attribute'): pass
+                elif consumed('attributes', 'attributes'): pass
+                elif consumed('label', 'label'):
+                    labels[label['name']] = label['attributes']
+                    label = None
+                elif consumed('labels', 'labels'): pass
+                else:
+                    pass
+
+        assert len(states) == 1 and states[0] == 'annotations', \
+            "Expected 'meta' section in the annotation file, path: %s" % states
+
+        common_attrs = ['occluded']
+        if has_z_order:
+            common_attrs.append('z_order')
+        if mode == 'interpolation':
+            common_attrs.append('keyframe')
+            common_attrs.append('outside')
+
+        label_cat = LabelCategories(attributes=common_attrs)
+        for label, attrs in labels.items():
+            label_cat.add(label, attributes=attrs)
+
+        categories[AnnotationType.label] = label_cat
+
+        return categories
+
+    @classmethod
+    def _parse_ann(cls, ann, categories):
+        ann_id = ann.get('id')
+        ann_type = ann['type']
+
+        attributes = ann.get('attributes', {})
+        if 'occluded' in categories[AnnotationType.label].attributes:
+            attributes['occluded'] = ann.get('occluded', False)
+        if 'z_order' in categories[AnnotationType.label].attributes:
+            attributes['z_order'] = ann.get('z_order', 0)
+        if 'outside' in categories[AnnotationType.label].attributes:
+            attributes['outside'] = ann.get('outside', False)
+        if 'keyframe' in categories[AnnotationType.label].attributes:
+            attributes['keyframe'] = ann.get('keyframe', False)
+
+        group = ann.get('group')
+        if group == 0:
+            group = None
+
+        label = ann.get('label')
+        label_id = categories[AnnotationType.label].find(label)[0]
+
+        points = ann.get('points', [])
+
+        if ann_type == 'polyline':
+            return PolyLineObject(points, label=label_id,
+                id=ann_id, attributes=attributes, group=group)
+
+        elif ann_type == 'polygon':
+            return PolygonObject(points, label=label_id,
+                id=ann_id, attributes=attributes, group=group)
+
+        elif ann_type == 'points':
+            return PointsObject(points, label=label_id,
+                id=ann_id, attributes=attributes, group=group)
+
+        elif ann_type == 'box':
+            x, y = points[0], points[1]
+            w, h = points[2] - x, points[3] - y
+            return BboxObject(x, y, w, h, label=label_id,
+                id=ann_id, attributes=attributes, group=group)
+
+        else:
+            raise NotImplementedError("Unknown annotation type '%s'" % ann_type)
+
+    def _load_items(self, parsed):
+        for item_id, item_desc in parsed.items():
+            file_name = item_desc.get('name')
+            if not file_name:
+                file_name = item_id
+            file_name += CvatPath.IMAGE_EXT
+            image = self._find_image(file_name)
+
+            parsed[item_id] = DatasetItem(id=item_id, subset=self._subset,
+                image=image, annotations=item_desc.get('annotations', None))
+        return parsed
+
+    def _find_image(self, file_name):
+        images_dir = osp.join(self._path, CvatPath.IMAGES_DIR)
+        search_paths = [
+            osp.join(images_dir, file_name),
+            osp.join(images_dir, self._subset or DEFAULT_SUBSET_NAME, file_name),
+        ]
+        for image_path in search_paths:
+            if osp.exists(image_path):
+                return lazy_image(image_path)
\ No newline at end of file
diff --git a/datumaro/datumaro/components/extractors/datumaro.py b/datumaro/datumaro/components/extractors/datumaro.py
index 6bb336533114..8917b8b99e4f 100644
--- a/datumaro/datumaro/components/extractors/datumaro.py
+++ b/datumaro/datumaro/components/extractors/datumaro.py
@@ -3,70 +3,58 @@
 #
 # SPDX-License-Identifier: MIT
 
-from collections import defaultdict
 import json
 import logging as log
 import os.path as osp
 
 from datumaro.components.extractor import (Extractor, DatasetItem,
-    DEFAULT_SUBSET_NAME,
-    AnnotationType,
+    DEFAULT_SUBSET_NAME, AnnotationType,
     LabelObject, MaskObject, PointsObject, PolygonObject,
     PolyLineObject, BboxObject, CaptionObject,
    LabelCategories, MaskCategories, PointsCategories
 )
 from datumaro.components.formats.datumaro import DatumaroPath
-from datumaro.util import dir_items
 from datumaro.util.image import lazy_image
 from datumaro.util.mask_tools import lazy_mask
 
 
 class DatumaroExtractor(Extractor):
-    class Subset(Extractor):
-        def __init__(self, name, parent):
-            super().__init__()
-            self._parent = parent
-            self._name = name
-            self.items = []
-
-        def __iter__(self):
-            for item in self.items:
-                yield self._parent._get(item, self._name)
+    def __init__(self, path):
+        super().__init__()
 
-        def __len__(self):
-            return len(self.items)
+        assert osp.isfile(path)
+        rootpath = path.rsplit(DatumaroPath.ANNOTATIONS_DIR, maxsplit=1)[0]
+        self._path = rootpath
 
-        def categories(self):
-            return self._parent.categories()
+        subset_name = osp.splitext(osp.basename(path))[0]
+        if subset_name == DEFAULT_SUBSET_NAME:
+            subset_name = None
+        self._subset_name = subset_name
 
-    def __init__(self, path):
-        super().__init__()
+        with open(path, 'r') as f:
+            parsed_anns = json.load(f)
+        self._categories = self._load_categories(parsed_anns)
+        self._items = self._load_items(parsed_anns)
 
-        assert osp.isdir(path)
-        self._path = path
+    def categories(self):
+        return self._categories
 
-        annotations = defaultdict(list)
-        found_subsets = self._find_subsets(path)
-        parsed_anns = None
-        subsets = {}
-        for subset_name, subset_path in found_subsets.items():
-            if subset_name == DEFAULT_SUBSET_NAME:
-                subset_name = None
-            subset = self.Subset(subset_name, self)
-            with open(subset_path, 'r') as f:
-                parsed_anns = json.load(f)
+    def __iter__(self):
+        for item in self._items:
+            yield item
 
-            for index, _ in enumerate(parsed_anns['items']):
-                subset.items.append(index)
+    def __len__(self):
+        return len(self._items)
 
-            annotations[subset_name] = parsed_anns
-            subsets[subset_name] = subset
-        self._annotations = dict(annotations)
-        self._subsets = subsets
+    def subsets(self):
+        if self._subset_name:
+            return [self._subset_name]
+        return None
 
-        self._categories = {}
-        if parsed_anns is not None:
-            self._categories = self._load_categories(parsed_anns)
+    def get_subset(self, name):
+        if name != self._subset_name:
+            return None
+        return self
 
     @staticmethod
     def _load_categories(parsed):
@@ -101,21 +89,24 @@ def _load_categories(parsed):
 
         return categories
 
-    def _get(self, index, subset_name):
-        item = self._annotations[subset_name]['items'][index]
+    def _load_items(self, parsed):
+        items = []
+        for item_desc in parsed['items']:
+            item_id = item_desc['id']
+            image = None
+            image_path = osp.join(self._path, DatumaroPath.IMAGES_DIR,
+                item_id + DatumaroPath.IMAGE_EXT)
+            if osp.exists(image_path):
+                image = lazy_image(image_path)
 
-        item_id = item.get('id')
+            annotations = self._load_annotations(item_desc)
 
-        image_path = osp.join(self._path, DatumaroPath.IMAGES_DIR,
-            item_id + DatumaroPath.IMAGE_EXT)
-        image = None
-        if osp.isfile(image_path):
-            image = lazy_image(image_path)
+            item = DatasetItem(id=item_id, subset=self._subset_name,
+                annotations=annotations, image=image)
 
-        annotations = self._load_annotations(item)
+            items.append(item)
 
-        return DatasetItem(id=item_id, subset=subset_name,
-            annotations=annotations, image=image)
+        return items
 
     def _load_annotations(self, item):
         parsed = item['annotations']
@@ -182,33 +173,3 @@ def _load_annotations(self, item):
                 raise NotImplementedError()
 
         return loaded
-
-    def categories(self):
-        return self._categories
-
-    def __iter__(self):
-        for subset_name, subset in self._subsets.items():
-            for index in subset.items:
-                yield self._get(index, subset_name)
-
-    def __len__(self):
-        length = 0
-        for subset in self._subsets.values():
-            length += len(subset)
-        return length
-
-    def subsets(self):
-        return list(self._subsets)
-
-    def get_subset(self, name):
-        return self._subsets[name]
-
-    @staticmethod
-    def _find_subsets(path):
-        anno_dir = osp.join(path, DatumaroPath.ANNOTATIONS_DIR)
-        if not osp.isdir(anno_dir):
-            raise Exception('Datumaro dataset not found at "%s"' % path)
-
-        return { name: osp.join(anno_dir, name + '.json')
-            for name in dir_items(anno_dir, '.json', truncate_ext=True)
-        }
\ No newline at end of file
diff --git a/datumaro/datumaro/components/extractors/ms_coco.py b/datumaro/datumaro/components/extractors/ms_coco.py
index 537a297b2d1a..38cb1a38a07c 100644
--- a/datumaro/datumaro/components/extractors/ms_coco.py
+++ b/datumaro/datumaro/components/extractors/ms_coco.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: MIT
 
 from collections import OrderedDict
+from itertools import chain
 
 import numpy as np
 import os.path as osp
@@ -11,7 +12,7 @@
 import pycocotools.mask as mask_utils
 
 from datumaro.components.extractor import (Extractor, DatasetItem,
-    AnnotationType,
+    DEFAULT_SUBSET_NAME, AnnotationType,
     LabelObject, MaskObject, PointsObject, PolygonObject,
     BboxObject, CaptionObject,
     LabelCategories, PointsCategories
@@ -42,46 +43,46 @@ def __eq__(self, other):
             return super().__eq__(other)
         return self._rle == other._rle
 
-
 class CocoExtractor(Extractor):
-    class Subset(Extractor):
-        def __init__(self, name, parent):
-            super().__init__()
-            self._name = name
-            self._parent = parent
-            self.loaders = {}
-            self.items = OrderedDict()
-
-        def __iter__(self):
-            for img_id in self.items:
-                yield self._parent._get(img_id, self._name)
-
-        def __len__(self):
-            return len(self.items)
-
-        def categories(self):
-            return self._parent.categories()
-
     def __init__(self, path, task, merge_instance_polygons=False):
         super().__init__()
 
+        assert osp.isfile(path)
         rootpath = path.rsplit(CocoPath.ANNOTATIONS_DIR, maxsplit=1)[0]
         self._path = rootpath
         self._task = task
-        self._subsets = {}
 
-        subset_name = osp.splitext(osp.basename(path))[0] \
+        subset = osp.splitext(osp.basename(path))[0] \
             .rsplit('_', maxsplit=1)[1]
-        subset = CocoExtractor.Subset(subset_name, self)
+        if subset == DEFAULT_SUBSET_NAME:
+            subset = None
+        self._subset = subset
+
+        self._merge_instance_polygons = merge_instance_polygons
+
         loader = self._make_subset_loader(path)
-        subset.loaders[task] = loader
-        for img_id in loader.getImgIds():
-            subset.items[img_id] = None
-        self._subsets[subset_name] = subset
+        self._load_categories(loader)
+        self._items = self._load_items(loader)
 
-        self._load_categories()
+    def categories(self):
+        return self._categories
 
-        self._merge_instance_polygons = merge_instance_polygons
+    def __iter__(self):
+        for item in self._items.values():
+            yield item
+
+    def __len__(self):
+        return len(self._items)
+
+    def subsets(self):
+        if self._subset:
+            return [self._subset]
+        return None
+
+    def get_subset(self, name):
+        if name != self._subset:
+            return None
+        return self
 
     @staticmethod
     def _make_subset_loader(path):
@@ -95,31 +96,17 @@ def _make_subset_loader(path):
         coco_api.createIndex()
         return coco_api
 
-    def _load_categories(self):
-        loaders = {}
-
-        for subset in self._subsets.values():
-            loaders.update(subset.loaders)
-
+    def _load_categories(self, loader):
         self._categories = {}
 
-        label_loader = loaders.get(CocoTask.labels)
-        instances_loader = loaders.get(CocoTask.instances)
-        person_kp_loader = loaders.get(CocoTask.person_keypoints)
-
-        if label_loader is None and instances_loader is not None:
-            label_loader = instances_loader
-        if label_loader is None and person_kp_loader is not None:
-            label_loader = person_kp_loader
-        if label_loader is not None:
-            label_categories, label_map = \
-                self._load_label_categories(label_loader)
+        if self._task in [CocoTask.instances, CocoTask.labels,
+                CocoTask.person_keypoints, CocoTask.stuff, CocoTask.panoptic]:
+            label_categories, label_map = self._load_label_categories(loader)
             self._categories[AnnotationType.label] = label_categories
             self._label_map = label_map
 
-        if person_kp_loader is not None:
-            person_kp_categories = \
-                self._load_person_kp_categories(person_kp_loader)
+        if self._task == CocoTask.person_keypoints:
+            person_kp_categories = self._load_person_kp_categories(loader)
             self._categories[AnnotationType.points] = person_kp_categories
 
     # pylint: disable=no-self-use
@@ -142,76 +129,47 @@ def _load_person_kp_categories(self, loader):
         categories = PointsCategories()
 
         for cat in cats:
-            label_id, _ = self._categories[AnnotationType.label].find(cat['name'])
+            label_id = self._label_map[cat['id']]
             categories.add(label_id=label_id,
                 labels=cat['keypoints'], adjacent=cat['skeleton'])
 
         return categories
 
-    def categories(self):
-        return self._categories
+    def _load_items(self, loader):
+        items = OrderedDict()
 
-    def __iter__(self):
-        for subset in self._subsets.values():
-            for item in subset:
-                yield item
+        for img_id in loader.getImgIds():
+            image_info = loader.loadImgs(img_id)[0]
+            image = self._find_image(image_info['file_name'])
 
-    def __len__(self):
-        length = 0
-        for subset in self._subsets.values():
-            length += len(subset)
-        return length
+            anns = loader.getAnnIds(imgIds=img_id)
+            anns = loader.loadAnns(anns)
+            anns = list(chain(*(
+                self._load_annotations(ann, image_info) for ann in anns)))
 
-    def subsets(self):
-        return list(self._subsets)
+            items[img_id] = DatasetItem(id=img_id, subset=self._subset,
+                image=image, annotations=anns)
 
-    def get_subset(self, name):
-        return self._subsets[name]
-
-    def _get(self, img_id, subset):
-        file_name = None
-        image_info = None
-        image = None
-        annotations = []
-        for ann_type, loader in self._subsets[subset].loaders.items():
-            if image is None:
-                image_info = loader.loadImgs(img_id)[0]
-                file_name = image_info['file_name']
-                if file_name != '':
-                    image_dir = osp.join(self._path, CocoPath.IMAGES_DIR)
-                    search_paths = [
-                        osp.join(image_dir, file_name),
-                        osp.join(image_dir, subset, file_name),
-                    ]
-                    for image_path in search_paths:
-                        if osp.exists(image_path):
-                            image = lazy_image(image_path)
-                            break
-
-            annIds = loader.getAnnIds(imgIds=img_id)
-            anns = loader.loadAnns(annIds)
-
-            for ann in anns:
-                self._parse_annotation(ann, ann_type, annotations, image_info)
-        return DatasetItem(id=img_id, subset=subset,
-            image=image, annotations=annotations)
-
-    def _parse_label(self, ann):
+        return items
+
+    def _get_label_id(self, ann):
         cat_id = ann.get('category_id')
         if cat_id in [0, None]:
            return None
         return self._label_map[cat_id]
 
-    def _parse_annotation(self, ann, ann_type, parsed_annotations,
-            image_info=None):
+    def _load_annotations(self, ann, image_info=None):
+        parsed_annotations = []
+
         ann_id = ann.get('id')
+
         attributes = {}
         if 'score' in ann:
             attributes['score'] = ann['score']
 
-        if ann_type is CocoTask.instances:
+        if self._task is CocoTask.instances:
             x, y, w, h = ann['bbox']
-            label_id = self._parse_label(ann)
+            label_id = self._get_label_id(ann)
             group = None
 
             is_crowd = bool(ann['iscrowd'])
@@ -253,18 +211,17 @@ def _parse_annotation(self, ann, ann_type, parsed_annotations,
                 BboxObject(x, y, w, h, label=label_id,
                     id=ann_id, attributes=attributes, group=group)
             )
-        elif ann_type is CocoTask.labels:
-            label_id = self._parse_label(ann)
+        elif self._task is CocoTask.labels:
+            label_id = self._get_label_id(ann)
             parsed_annotations.append(
-                LabelObject(label=label_id,
-                    id=ann_id, attributes=attributes)
+                LabelObject(label=label_id, id=ann_id, attributes=attributes)
             )
-        elif ann_type is CocoTask.person_keypoints:
+        elif self._task is CocoTask.person_keypoints:
             keypoints = ann['keypoints']
             points = [p for i, p in enumerate(keypoints) if i % 3 != 2]
             visibility = keypoints[2::3]
             bbox = ann.get('bbox')
-            label_id = self._parse_label(ann)
+            label_id = self._get_label_id(ann)
             group = None
             if bbox is not None:
                 group = ann_id
@@ -276,7 +233,7 @@ def _parse_annotation(self, ann, ann_type, parsed_annotations,
             parsed_annotations.append(
                 BboxObject(*bbox, label=label_id, group=group)
             )
-        elif ann_type is CocoTask.captions:
+        elif self._task is CocoTask.captions:
             caption = ann['caption']
             parsed_annotations.append(
                 CaptionObject(caption,
@@ -287,6 +244,16 @@ def _parse_annotation(self, ann, ann_type, parsed_annotations,
 
         return parsed_annotations
 
+    def _find_image(self, file_name):
+        images_dir = osp.join(self._path, CocoPath.IMAGES_DIR)
+        search_paths = [
+            osp.join(images_dir, file_name),
+            osp.join(images_dir, self._subset or DEFAULT_SUBSET_NAME, file_name),
+        ]
+        for image_path in search_paths:
+            if osp.exists(image_path):
+                return lazy_image(image_path)
+
 
 class CocoImageInfoExtractor(CocoExtractor):
     def __init__(self, path, **kwargs):
         super().__init__(path, task=CocoTask.image_info, **kwargs)
diff --git a/datumaro/datumaro/components/formats/cvat.py b/datumaro/datumaro/components/formats/cvat.py
new file mode 100644
index 000000000000..e0c7a10476ae
--- /dev/null
+++ b/datumaro/datumaro/components/formats/cvat.py
@@ -0,0 +1,10 @@
+
+# Copyright (C) 2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+class CvatPath:
+    IMAGES_DIR = 'images'
+    ANNOTATIONS_DIR = 'annotations'
+
+    IMAGE_EXT = '.jpg'
diff --git a/datumaro/datumaro/components/importers/__init__.py b/datumaro/datumaro/components/importers/__init__.py
index 5d2923b8141e..7c952d2cbbbf 100644
--- a/datumaro/datumaro/components/importers/__init__.py
+++ b/datumaro/datumaro/components/importers/__init__.py
@@ -4,19 +4,16 @@
 # SPDX-License-Identifier: MIT
 
 from datumaro.components.importers.datumaro import DatumaroImporter
-
-from datumaro.components.importers.ms_coco import (
-    CocoImporter,
-)
+from datumaro.components.importers.ms_coco import CocoImporter
 
 from datumaro.components.importers.voc import (
     VocImporter,
     VocResultsImporter,
 )
 
-from datumaro.components.importers.tfrecord import (
-    DetectionApiImporter,
-)
+from datumaro.components.importers.tfrecord import DetectionApiImporter
+from datumaro.components.importers.yolo import YoloImporter
+from datumaro.components.importers.cvat import CvatImporter
 
 
 items = [
@@ -27,5 +24,9 @@
     ('voc', VocImporter),
     ('voc_results', VocResultsImporter),
 
+    ('yolo', YoloImporter),
+
     ('tf_detection_api', DetectionApiImporter),
+
+    ('cvat', CvatImporter),
 ]
\ No newline at end of file
diff --git a/datumaro/datumaro/components/importers/cvat.py b/datumaro/datumaro/components/importers/cvat.py
new file mode 100644
index 000000000000..efdeff2963e7
--- /dev/null
+++ b/datumaro/datumaro/components/importers/cvat.py
@@ -0,0 +1,46 @@
+
+# Copyright (C) 2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from glob import glob
+import logging as log
+import os.path as osp
+
+from datumaro.components.formats.cvat import CvatPath
+
+
+class CvatImporter:
+    EXTRACTOR_NAME = 'cvat'
+
+    def __call__(self, path, **extra_params):
+        from datumaro.components.project import Project # cyclic import
+        project = Project()
+
+        if path.endswith('.xml') and osp.isfile(path):
+            subset_paths = [path]
+        else:
+            subset_paths = glob(osp.join(path, '*.xml'))
+
+            if osp.basename(osp.normpath(path)) != CvatPath.ANNOTATIONS_DIR:
+                path = osp.join(path, CvatPath.ANNOTATIONS_DIR)
+                subset_paths += glob(osp.join(path, '*.xml'))
+
+        if len(subset_paths) == 0:
+            raise Exception("Failed to find 'cvat' dataset at '%s'" % path)
+
+        for subset_path in subset_paths:
+            if not osp.isfile(subset_path):
+                continue
+
+            log.info("Found a dataset at '%s'" % subset_path)
+
+            subset_name = osp.splitext(osp.basename(subset_path))[0]
+
+            project.add_source(subset_name, {
+                'url': subset_path,
+                'format': self.EXTRACTOR_NAME,
+                'options': extra_params,
+            })
+
+        return project
diff --git a/datumaro/datumaro/components/importers/datumaro.py b/datumaro/datumaro/components/importers/datumaro.py
index 40939b90cf18..9c2a162b8cc8 100644
--- a/datumaro/datumaro/components/importers/datumaro.py
+++ b/datumaro/datumaro/components/importers/datumaro.py
@@ -3,23 +3,44 @@
 #
 # SPDX-License-Identifier: MIT
 
+from glob import glob
+import logging as log
 import os.path as osp
 
+from datumaro.components.formats.datumaro import DatumaroPath
+
 
 class DatumaroImporter:
     EXTRACTOR_NAME = 'datumaro'
 
-    def __call__(self, path):
+    def __call__(self, path, **extra_params):
         from datumaro.components.project import Project # cyclic import
         project = Project()
 
-        if not osp.exists(path):
+        if path.endswith('.json') and osp.isfile(path):
+            subset_paths = [path]
+        else:
+            subset_paths = glob(osp.join(path, '*.json'))
+
+            if osp.basename(osp.normpath(path)) != DatumaroPath.ANNOTATIONS_DIR:
+                path = osp.join(path, DatumaroPath.ANNOTATIONS_DIR)
+                subset_paths += glob(osp.join(path, '*.json'))
+
+        if len(subset_paths) == 0:
             raise Exception("Failed to find 'datumaro' dataset at '%s'" % path)
 
-        source_name = osp.splitext(osp.basename(path))[0]
-        project.add_source(source_name, {
-            'url': path,
-            'format': self.EXTRACTOR_NAME,
-        })
+        for subset_path in subset_paths:
+            if not osp.isfile(subset_path):
+                continue
+
+            log.info("Found a dataset at '%s'" % subset_path)
+
+            subset_name = osp.splitext(osp.basename(subset_path))[0]
+
+            project.add_source(subset_name, {
+                'url': subset_path,
+                'format': self.EXTRACTOR_NAME,
+                'options': extra_params,
+            })
 
-        return project
\ No newline at end of file
+        return project
diff --git a/datumaro/datumaro/components/importers/ms_coco.py b/datumaro/datumaro/components/importers/ms_coco.py
index 30d959b0024e..e7a0d26ca018 100644
--- a/datumaro/datumaro/components/importers/ms_coco.py
+++ b/datumaro/datumaro/components/importers/ms_coco.py
@@ -4,7 +4,8 @@
 # SPDX-License-Identifier: MIT
 
 from collections import defaultdict
-import os
+from glob import glob
+import logging as log
 import os.path as osp
 
 from datumaro.components.formats.ms_coco import CocoTask, CocoPath
@@ -19,9 +20,6 @@ class CocoImporter:
         CocoTask.image_info: 'coco_images',
     }
 
-    def __init__(self, task_filter=None):
-        self._task_filter = task_filter
-
     def __call__(self, path, **extra_params):
         from datumaro.components.project import Project # cyclic import
         project = Project()
@@ -33,6 +31,8 @@ def __call__(self, path, **extra_params):
 
         for ann_files in subsets.values():
             for ann_type, ann_file in ann_files.items():
+                log.info("Found a dataset at '%s'" % ann_file)
+
                 source_name = osp.splitext(osp.basename(ann_file))[0]
                 project.add_source(source_name, {
                     'url': ann_file,
@@ -43,28 +43,29 @@ def __call__(self, path, **extra_params):
         return project
 
     @staticmethod
-    def find_subsets(dataset_dir):
-        ann_dir = os.path.join(dataset_dir, CocoPath.ANNOTATIONS_DIR)
-        if not osp.isdir(ann_dir):
-            raise NotADirectoryError(
-                'COCO annotations directory not found at "%s"' % ann_dir)
+    def find_subsets(path):
+        if path.endswith('.json') and osp.isfile(path):
+            subset_paths = [path]
+        else:
+            subset_paths = glob(osp.join(path, '*_*.json'))
+
+            if osp.basename(osp.normpath(path)) != CocoPath.ANNOTATIONS_DIR:
+                path = osp.join(path, CocoPath.ANNOTATIONS_DIR)
+                subset_paths += glob(osp.join(path, '*_*.json'))
 
         subsets = defaultdict(dict)
-        for ann_file in os.listdir(ann_dir):
-            subset_path = osp.join(ann_dir, ann_file)
-            if not subset_path.endswith('.json'):
-                continue
+        for subset_path in subset_paths:
+            name_parts = osp.splitext(osp.basename(subset_path))[0] \
+                .rsplit('_', maxsplit=1)
 
-            name_parts = osp.splitext(ann_file)[0].rsplit('_', maxsplit=1)
             ann_type = name_parts[0]
             try:
                 ann_type = CocoTask[ann_type]
             except KeyError:
-                raise Exception(
-                    'Unknown subset type %s, only known are: %s' % \
-                    (ann_type,
-                        ', '.join([e.name for e in CocoTask])
-                    ))
+                log.warn("Skipping '%s': unknown subset "
+                    "type '%s', the only known are: %s" % \
+                    (subset_path, ann_type,
+                        ', '.join([e.name for e in CocoTask])))
             subset_name = name_parts[1]
             subsets[subset_name][ann_type] = subset_path
         return dict(subsets)
\ No newline at end of file
diff --git a/datumaro/datumaro/components/importers/tfrecord.py b/datumaro/datumaro/components/importers/tfrecord.py
index c42c2e174389..c1506211142d 100644
--- a/datumaro/datumaro/components/importers/tfrecord.py
+++ b/datumaro/datumaro/components/importers/tfrecord.py
@@ -4,32 +4,39 @@
 # SPDX-License-Identifier: MIT
 
 from glob import glob
+import logging as log
 import os.path as osp
 
 
 class DetectionApiImporter:
     EXTRACTOR_NAME = 'tf_detection_api'
 
-    def __call__(self, path):
+    def __call__(self, path, **extra_params):
         from datumaro.components.project import Project # cyclic import
         project = Project()
 
-        subset_paths = glob(osp.join(path, '*.tfrecord'))
+        if path.endswith('.tfrecord') and osp.isfile(path):
+            subset_paths = [path]
+        else:
+            subset_paths = glob(osp.join(path, '*.tfrecord'))
+
+        if len(subset_paths) == 0:
+            raise Exception(
+                "Failed to find 'tf_detection_api' dataset at '%s'" % path)
 
         for subset_path in subset_paths:
             if not osp.isfile(subset_path):
                 continue
 
+            log.info("Found a dataset at '%s'" % subset_path)
+
             subset_name = osp.splitext(osp.basename(subset_path))[0]
 
             project.add_source(subset_name, {
                 'url': subset_path,
                 'format': self.EXTRACTOR_NAME,
+                'options': extra_params,
             })
 
-        if len(project.config.sources) == 0:
-            raise Exception(
-                "Failed to find 'tf_detection_api' dataset at '%s'" % path)
-
         return project
diff --git a/datumaro/datumaro/components/importers/voc.py b/datumaro/datumaro/components/importers/voc.py
index 432cf374141e..e71327893b0e 100644
--- a/datumaro/datumaro/components/importers/voc.py
+++ b/datumaro/datumaro/components/importers/voc.py
@@ -19,7 +19,7 @@ class VocImporter:
         (VocTask.action_classification, 'voc_action', 'Action'),
     ]
 
-    def __call__(self, path):
+    def __call__(self, path, **extra_params):
         from datumaro.components.project import Project # cyclic import
         project = Project()
 
@@ -31,6 +31,7 @@ def __call__(self, path):
             project.add_source(task.name, {
                 'url': path,
                 'format': extractor_type,
+                'options': extra_params,
             })
 
         if len(project.config.sources) == 0:
@@ -53,7 +54,7 @@ class VocResultsImporter:
         ('comp10', 'voc_comp_9_10', 'Action'),
     ]
 
-    def __call__(self, path):
+    def __call__(self, path, **extra_params):
         from datumaro.components.project import Project # cyclic import
         project = Project()
 
@@ -68,6 +69,7 @@ def __call__(self, path):
             project.add_source(task_name, {
                 'url': task_dir,
                 'format': extractor_type,
+                'options': extra_params,
             })
 
         if len(project.config.sources) == 0:
diff --git a/datumaro/datumaro/components/importers/yolo.py b/datumaro/datumaro/components/importers/yolo.py
index 4254b803e14c..2a22117edd23 100644
--- a/datumaro/datumaro/components/importers/yolo.py
+++ b/datumaro/datumaro/components/importers/yolo.py
@@ -3,8 +3,9 @@
 #
 # SPDX-License-Identifier: MIT
 
+from glob import glob
+import logging as log
 import os.path as osp
-from datumaro.util import dir_items
 
 
 class YoloImporter:
@@ -15,13 +16,14 @@ def __call__(self, path, **extra_params):
         if not osp.exists(path):
             raise Exception("Failed to find 'yolo' dataset at '%s'" % path)
 
-        configs = []
-        if osp.isfile(path):
-            configs = path
-        elif osp.isdir(path):
-            configs = [osp.join(path, p) for p in dir_items(path, '.data')]
+        if path.endswith('.data') and osp.isfile(path):
+            config_paths = [path]
+        else:
+            config_paths = glob(osp.join(path, '*.data'))
+
+        for config_path in config_paths:
+            log.info("Found a dataset at '%s'" % config_path)
 
-        for config_path in configs:
             source_name = osp.splitext(osp.basename(config_path))[0]
             project.add_source(source_name, {
                 'url': config_path,
diff --git a/datumaro/datumaro/components/project.py b/datumaro/datumaro/components/project.py
index a648f461e593..e03aad631a5d 100644
--- a/datumaro/datumaro/components/project.py
+++ b/datumaro/datumaro/components/project.py
@@ -269,6 +269,8 @@ def categories(self):
 class DatasetItemWrapper(DatasetItem):
     def __init__(self, item, path, annotations, image=None):
         self._item = item
+        if path is None:
+            path = []
         self._path = path
         self._annotations = annotations
         self._image = image
@@ -334,7 +336,10 @@ def __init__(self, project):
         own_source = None
         own_source_dir = osp.join(config.project_dir, config.dataset_dir)
         if osp.isdir(own_source_dir):
-            own_source = env.make_extractor(DEFAULT_FORMAT, own_source_dir)
+            log.disable(log.INFO)
+            own_source = env.make_importer(DEFAULT_FORMAT)(own_source_dir) \
+                .make_dataset()
+            log.disable(log.NOTSET)
 
         # merge categories
         # TODO: implement properly with merging and annotations remapping
@@ -351,7 +356,7 @@ def __init__(self, project):
         # merge items
         subsets = defaultdict(lambda: Subset(self))
         for source_name, source in self._sources.items():
-            log.info("Loading '%s' source contents..." % source_name)
+            log.debug("Loading '%s' source contents..." % source_name)
             for item in source:
                 if dataset_filter and not dataset_filter(item):
                     continue
@@ -387,7 +392,7 @@ def __init__(self, project):
 
         # override with our items, fallback to existing images
         if own_source is not None:
-            log.info("Loading own dataset...")
+            log.debug("Loading own dataset...")
             for item in own_source:
                 if dataset_filter and not dataset_filter(item):
                     continue
diff --git a/datumaro/tests/test_coco_format.py b/datumaro/tests/test_coco_format.py
index 17d7155752f8..580136ef428d 100644
--- a/datumaro/tests/test_coco_format.py
+++ b/datumaro/tests/test_coco_format.py
@@ -7,9 +7,7 @@
 from unittest import TestCase
 
 from datumaro.components.project import Project
-from datumaro.components.extractor import (
-    DEFAULT_SUBSET_NAME,
-    Extractor, DatasetItem,
+from datumaro.components.extractor import (Extractor, DatasetItem,
     AnnotationType, LabelObject, MaskObject, PointsObject, PolygonObject,
     BboxObject, CaptionObject,
     LabelCategories, PointsCategories
@@ -150,10 +148,8 @@ def _test_save_and_load(self, source_dataset, converter, test_dir,
         if target_dataset is not None:
             source_dataset = target_dataset
 
-        source_subsets = [s if s else DEFAULT_SUBSET_NAME
-            for s in source_dataset.subsets()]
         self.assertListEqual(
-            sorted(source_subsets),
+            sorted(source_dataset.subsets()),
             sorted(parsed_dataset.subsets()),
         )
 
@@ -172,7 +168,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir,
     def test_can_save_and_load_captions(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, subset='train',
                         annotations=[
                             CaptionObject('hello', id=1),
@@ -188,11 +184,7 @@ def __iter__(self):
                             CaptionObject('word', id=1),
                         ]
                     ),
-                ]
-                return iter(items)
-
-            def subsets(self):
-                return ['train', 'val']
+                ])
 
         with TestDir() as test_dir:
             self._test_save_and_load(TestExtractor(),
@@ -201,7 +193,7 @@ def subsets(self):
     def test_can_save_and_load_instances(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, subset='train', image=np.ones((4, 4, 3)),
                         annotations=[
                             # Bbox + single polygon
@@ -234,11 +226,7 @@ def __iter__(self):
                                 attributes={ 'is_crowd': True },
                                 label=4, group=3, id=3),
                         ]),
-                ]
-                return iter(items)
-
-            def subsets(self):
-                return ['train', 'val']
+                ])
 
             def categories(self):
                 label_categories = LabelCategories()
@@ -255,7 +243,7 @@ def categories(self):
     def test_can_save_and_load_instances_with_mask_conversion(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, image=np.zeros((5, 5, 3)), subset='train',
                         annotations=[
                             BboxObject(0, 0, 5, 5, label=3, id=4, group=4,
@@ -276,11 +264,7 @@ def __iter__(self):
                                 label=3, id=4, group=4),
                         ]
                     ),
-                ]
-                return iter(items)
-
-            def subsets(self):
-                return ['train']
+                ])
 
             def categories(self):
                 label_categories = LabelCategories()
@@ -302,7 +286,7 @@ def test_can_merge_instance_polygons_to_mask_in_coverter(self):
 
         class SrcTestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, image=np.zeros((5, 10, 3)),
                         annotations=[
                             PolygonObject([0, 0, 4, 0, 4, 4],
@@ -313,15 +297,14 @@ def __iter__(self):
                                 attributes={ 'is_crowd': False }),
                         ]
                    ),
-                ]
-                return iter(items)
+                ])
 
             def categories(self):
                 return { AnnotationType.label: label_categories }
 
         class DstTestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, image=np.zeros((5, 10, 3)),
                         annotations=[
                             BboxObject(1, 0, 8, 4, label=3, id=4, group=4,
@@ -339,8 +322,7 @@ def __iter__(self):
                                 label=3, id=4, group=4),
                         ]
                     ),
-                ]
-                return iter(items)
+                ])
 
             def categories(self):
                 return { AnnotationType.label: label_categories }
@@ -353,7 +335,7 @@ def categories(self):
     def test_can_save_and_load_images(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, subset='train'),
                     DatasetItem(id=1, subset='train'),
 
@@ -362,11 +344,7 @@ def __iter__(self):
                     DatasetItem(id=4, subset='val'),
 
                     DatasetItem(id=5, subset='test'),
-                ]
-                return iter(items)
-
-            def subsets(self):
-                return ['train', 'val', 'test']
+                ])
 
         with TestDir() as test_dir:
             self._test_save_and_load(TestExtractor(),
@@ -375,7 +353,7 @@ def subsets(self):
     def test_can_save_and_load_labels(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=0, subset='train',
                         annotations=[
                             LabelObject(4, id=1),
@@ -390,11 +368,7 @@ def __iter__(self):
                         annotations=[
                             LabelObject(2, id=1),
                         ]),
-                ]
-                return iter(items)
-
-            def subsets(self):
-                return ['train', 'val']
+                ])
 
             def categories(self):
                 label_categories = LabelCategories()
@@ -411,7 +385,7 @@ def categories(self):
     def test_can_save_and_load_keypoints(self):
         class TestExtractor(Extractor):
            def __iter__(self):
-                items = [
+                return iter([
                    DatasetItem(id=0, subset='train',
                         annotations=[
                             PointsObject([1, 2, 0, 2, 4, 1], [0, 1, 2],
@@ -433,11 +407,7 @@ def __iter__(self):
                                 group=3, id=3),
                             BboxObject(0, 2, 4, 4, label=2, group=3),
                         ]),
-                ]
-                return iter(items)
-
-            def subsets(self):
-                return ['train', 'val']
+                ])
 
             def categories(self):
                 label_categories = LabelCategories()
@@ -458,7 +428,7 @@ def categories(self):
     def test_can_save_dataset_with_no_subsets(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=1, annotations=[
                         LabelObject(2, id=1),
                     ]),
@@ -470,10 +440,7 @@ def __iter__(self):
                         PolygonObject([0, 0, 4, 0, 4, 4],
                             label=3, id=4, group=4,
                             attributes={ 'is_crowd': False }),
                     ]),
-                ]
-
-                for item in items:
-                    yield item
+                ])
 
             def categories(self):
                 label_cat = LabelCategories()
diff --git a/datumaro/tests/test_cvat_format.py b/datumaro/tests/test_cvat_format.py
new file mode 100644
index 000000000000..bc06b0561396
--- /dev/null
+++ b/datumaro/tests/test_cvat_format.py
@@ -0,0 +1,148 @@
+import numpy as np
+import os
+import os.path as osp
+from xml.etree import ElementTree as ET
+
+from unittest import TestCase
+
+from datumaro.components.extractor import (Extractor, DatasetItem,
+    AnnotationType, PointsObject, PolygonObject, PolyLineObject, BboxObject,
+    LabelCategories,
+)
+from datumaro.components.importers.cvat import CvatImporter
+import datumaro.components.formats.cvat as Cvat
+from datumaro.util.image import save_image
+from datumaro.util.test_utils import TestDir
+
+
+class CvatExtractorTest(TestCase):
+    @staticmethod
+    def generate_dummy_cvat(path):
+        images_dir = osp.join(path, Cvat.CvatPath.IMAGES_DIR)
+        anno_dir = osp.join(path, Cvat.CvatPath.ANNOTATIONS_DIR)
+
+        os.makedirs(images_dir)
+        os.makedirs(anno_dir)
+
+        root_elem = ET.Element('annotations')
+        ET.SubElement(root_elem, 'version').text = '1.1'
+
+        meta_elem = ET.SubElement(root_elem, 'meta')
+        task_elem = ET.SubElement(meta_elem, 'task')
+        ET.SubElement(task_elem, 'z_order').text = 'True'
+        ET.SubElement(task_elem, 'mode').text = 'interpolation'
+
+        labels_elem = ET.SubElement(task_elem, 'labels')
+
+        label1_elem = ET.SubElement(labels_elem, 'label')
+        ET.SubElement(label1_elem, 'name').text = 'label1'
+        label1_attrs_elem = ET.SubElement(label1_elem, 'attributes')
+
+        label1_a1_elem = ET.SubElement(label1_attrs_elem, 'attribute')
+        ET.SubElement(label1_a1_elem, 'name').text = 'a1'
+        ET.SubElement(label1_a1_elem, 'input_type').text = 'checkbox'
+        ET.SubElement(label1_a1_elem, 'default_value').text = 'false'
+        ET.SubElement(label1_a1_elem, 'values').text = 'false\ntrue'
+
+        label1_a2_elem = ET.SubElement(label1_attrs_elem, 'attribute')
+        ET.SubElement(label1_a2_elem, 'name').text = 'a2'
+        ET.SubElement(label1_a2_elem, 'input_type').text = 'radio'
+        ET.SubElement(label1_a2_elem, 'default_value').text = 'v1'
+        ET.SubElement(label1_a2_elem, 'values').text = 'v1\nv2\nv3'
+
+        label2_elem = ET.SubElement(labels_elem, 'label')
+        ET.SubElement(label2_elem, 'name').text = 'label2'
+
+        # item 1
+        save_image(osp.join(images_dir, 'img0.jpg'), np.ones((8, 8, 3)))
+        item1_elem = ET.SubElement(root_elem, 'image')
+        item1_elem.attrib.update({
+            'id': '0', 'name': 'img0', 'width': '8', 'height': '8'
+        })
+
+        item1_ann1_elem = ET.SubElement(item1_elem, 'box')
+        item1_ann1_elem.attrib.update({
+            'label': 'label1', 'occluded': '1', 'z_order': '1',
+            'xtl': '0', 'ytl': '2', 'xbr': '4', 'ybr': '4'
+        })
+        item1_ann1_a1_elem = ET.SubElement(item1_ann1_elem, 'attribute')
+        item1_ann1_a1_elem.attrib['name'] = 'a1'
+        item1_ann1_a1_elem.text = 'true'
+        item1_ann1_a2_elem = ET.SubElement(item1_ann1_elem, 'attribute')
+        item1_ann1_a2_elem.attrib['name'] = 'a2'
+        item1_ann1_a2_elem.text = 'v3'
+
+        item1_ann2_elem = ET.SubElement(item1_elem, 'polyline')
+        item1_ann2_elem.attrib.update({
+            'label': '', 'points': '1.0,2;3,4;5,6;7,8'
+        })
+
+        # item 2
+        save_image(osp.join(images_dir, 'img1.jpg'), np.ones((10, 10, 3)))
+        item2_elem = ET.SubElement(root_elem, 'image')
+        item2_elem.attrib.update({
+            'id': '1', 'name': 'img1', 'width': '8', 'height': '8'
+        })
+
+        item2_ann1_elem = ET.SubElement(item2_elem, 'polygon')
+        item2_ann1_elem.attrib.update({
+            'label': '', 'points': '1,2;3,4;6,5', 'z_order': '1',
+        })
+
+        item2_ann2_elem = ET.SubElement(item2_elem, 'points')
+        item2_ann2_elem.attrib.update({
+            'label': 'label2', 'points': '1,2;3,4;5,6', 'z_order': '2',
+        })
+
+        with open(osp.join(anno_dir, 'train.xml'), 'w') as f:
+            f.write(ET.tostring(root_elem, encoding='unicode'))
+
+    def test_can_load(self):
+        class TestExtractor(Extractor):
+            def __iter__(self):
+                return iter([
+                    DatasetItem(id=1, subset='train', image=np.ones((8, 8, 3)),
+                        annotations=[
+                            BboxObject(0, 2, 4, 2, label=0,
+                                attributes={
+                                    'occluded': True, 'z_order': 1,
+                                    'a1': 'true', 'a2': 'v3'
+                                }),
+                            PolyLineObject([1, 2, 3, 4, 5, 6, 7, 8],
+                                attributes={'occluded': False, 'z_order': 0}),
+                        ]),
+                    DatasetItem(id=2, subset='train', image=np.ones((10, 10, 3)),
+                        annotations=[
+                            PolygonObject([1, 2, 3, 4, 6, 5],
+                                attributes={'occluded': False, 'z_order': 1}),
+                            PointsObject([1, 2, 3, 4, 5, 6], label=1,
+                                attributes={'occluded': False, 'z_order': 2}),
+                        ]),
+                ])
+
+            def categories(self):
+                label_categories = LabelCategories()
+                for i in range(10):
+                    label_categories.add('label_' + str(i))
+                return {
+                    AnnotationType.label: label_categories,
+                }
+
+        with TestDir() as test_dir:
+            self.generate_dummy_cvat(test_dir.path)
+            source_dataset = TestExtractor()
+
+            parsed_dataset = CvatImporter()(test_dir.path).make_dataset()
+
+            self.assertListEqual(
+                sorted(source_dataset.subsets()),
+                sorted(parsed_dataset.subsets()),
+            )
+            self.assertEqual(len(source_dataset), len(parsed_dataset))
+            for subset_name in source_dataset.subsets():
+                source_subset = source_dataset.get_subset(subset_name)
+                parsed_subset = parsed_dataset.get_subset(subset_name)
+                for item_a, item_b in zip(source_subset, parsed_subset):
+                    self.assertEqual(len(item_a.annotations), len(item_b.annotations))
+                    for ann_a, ann_b in zip(item_a.annotations, item_b.annotations):
+                        self.assertEqual(ann_a, ann_b)
\ No newline at end of file
diff --git a/datumaro/tests/test_datumaro_format.py b/datumaro/tests/test_datumaro_format.py
index 3402ccba22f3..3a83c424ca0e 100644
--- a/datumaro/tests/test_datumaro_format.py
+++ b/datumaro/tests/test_datumaro_format.py
@@ -1,4 +1,3 @@
-from itertools import zip_longest
 import numpy as np
 
 from unittest import TestCase
@@ -17,7 +16,7 @@ class DatumaroConverterTest(TestCase):
     class TestExtractor(Extractor):
         def __iter__(self):
-            items = [
+            return iter([
                 DatasetItem(id=100, subset='train', image=np.ones((10, 6, 3)),
                     annotations=[
                         CaptionObject('hello', id=1),
@@ -47,11 +46,10 @@ def __iter__(self):
                     ]),
 
                 DatasetItem(id=42, subset='test'),
-            ]
-            return iter(items)
 
-        def subsets(self):
-            return ['train', 'val', 'test']
+                DatasetItem(id=42),
+                DatasetItem(id=43),
+            ])
 
         def categories(self):
             label_categories = LabelCategories()
@@ -91,8 +89,9 @@ def test_can_save_and_load(self):
         for subset_name in source_dataset.subsets():
             source_subset = source_dataset.get_subset(subset_name)
             parsed_subset = parsed_dataset.get_subset(subset_name)
+            self.assertEqual(len(source_subset), len(parsed_subset))
             for idx, (item_a, item_b) in enumerate(
-                    zip_longest(source_subset, parsed_subset)):
+                    zip(source_subset, parsed_subset)):
                 self.assertEqual(item_a, item_b, str(idx))
 
         self.assertEqual(
diff --git a/datumaro/tests/test_project.py b/datumaro/tests/test_project.py
index 1d9df96f14ab..7f67e1d9e18f 100644
--- a/datumaro/tests/test_project.py
+++ b/datumaro/tests/test_project.py
@@ -137,9 +137,6 @@ def __iter__(self):
             for i in range(self.n):
                 yield DatasetItem(id=i, subset='train', image=i)
 
-        def subsets(self):
-            return ['train']
-
     class TestLauncher(Launcher):
         def __init__(self, **kwargs):
             pass
@@ -178,9 +175,6 @@ def __iter__(self):
                 yield DatasetItem(id=i, subset='train', image=i,
                     annotations=[ LabelObject(i) ])
 
-        def subsets(self):
-            return ['train']
-
     class TestLauncher(Launcher):
         def __init__(self, **kwargs):
             pass
@@ -207,17 +201,10 @@ def __iter__(self):
                     index = osp.splitext(osp.basename(path))[0]
                     subset = f.readline()[:-1]
                     label = int(f.readline()[:-1])
-                    assert(subset == 'train')
+                    assert subset == 'train'
                     yield DatasetItem(id=index, subset=subset,
                         annotations=[ LabelObject(label) ])
 
-            def __len__(self):
-                return len(self.items)
-
-            def subsets(self):
-                return ['train']
-
-
         model_name = 'model'
         launcher_name = 'custom_launcher'
         extractor_name = 'custom_extractor'
@@ -251,9 +238,6 @@ def __iter__(self):
             for i in range(self.n):
                 yield DatasetItem(id=self.s + i, subset='train')
 
-        def subsets(self):
-            return ['train']
-
     e_name1 = 'e1'
     e_name2 = 'e2'
     n1 = 2
@@ -279,9 +263,6 @@ def __iter__(self):
             for i in range(self.n):
                 yield DatasetItem(id=i, subset='train')
 
-        def subsets(self):
-            return ['train']
-
     e_type = 'type'
     project = Project()
     project.env.extractors.register(e_type, TestExtractor)
@@ -353,9 +334,6 @@ def __iter__(self):
             else:
                 yield v2_item
 
-        def subsets(self):
-            return ['train']
-
     project = Project()
     project.env.extractors.register('t1', lambda p: TestExtractor(p, v=1))
     project.env.extractors.register('t2', lambda p: TestExtractor(p, v=2))
@@ -379,9 +357,6 @@ def __iter__(self):
             for i in range(self.n):
                 yield DatasetItem(id=i, subset='train')
 
-        def subsets(self):
-            return ['train']
-
     def test_xpathfilter_can_be_applied(self):
         extractor = self.TestExtractor('', n=4)
         dataset_filter = XPathDatasetFilter('/item[id > 1]')
@@ -433,9 +408,6 @@ def __iter__(self):
                 DatasetItem(id=3, subset='test'),
             ])
 
-        def subsets(self):
-            return ['train', 'test']
-
         extractor_name = 'ext1'
         project = Project()
         project.env.extractors.register(extractor_name, CustomExtractor)
diff --git a/datumaro/tests/test_tfrecord_format.py b/datumaro/tests/test_tfrecord_format.py
index 8511dc14e9aa..e6ac60c4c84e 100644
--- a/datumaro/tests/test_tfrecord_format.py
+++ b/datumaro/tests/test_tfrecord_format.py
@@ -6,12 +6,8 @@
 from datumaro.components.extractor import (Extractor, DatasetItem,
     AnnotationType, BboxObject, LabelCategories
 )
-from datumaro.components.extractors.tfrecord import (
-    DetectionApiExtractor,
-)
-from datumaro.components.converters.tfrecord import (
-    DetectionApiConverter,
-)
+from datumaro.components.extractors.tfrecord import DetectionApiExtractor
+from datumaro.components.converters.tfrecord import DetectionApiConverter
 from datumaro.util import find
 from datumaro.util.test_utils import TestDir
 
@@ -47,7 +43,7 @@ def _test_can_save_and_load(self, source_dataset, converter, test_dir,
     def test_can_save_bboxes(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=1, subset='train',
                         image=np.ones((16, 16, 3)),
                         annotations=[
@@ -67,10 +63,7 @@ def __iter__(self):
                     DatasetItem(id=3, subset='test',
                         image=np.ones((5, 4, 3)) * 3,
                     ),
-                ]
-
-                for item in items:
-                    yield item
+                ])
 
             def categories(self):
                 label_cat = LabelCategories()
@@ -88,7 +81,7 @@ def categories(self):
     def test_can_save_dataset_with_no_subsets(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=1,
                         image=np.ones((16, 16, 3)),
                         annotations=[
@@ -107,10 +100,7 @@ def __iter__(self):
                     DatasetItem(id=3,
                         image=np.ones((8, 4, 3)) * 3,
                     ),
-                ]
-
-                for item in items:
-                    yield item
+                ])
 
             def categories(self):
                 label_cat = LabelCategories()
diff --git a/datumaro/tests/test_voc_format.py b/datumaro/tests/test_voc_format.py
index fac5c27e37ef..3d184ca5522f 100644
--- a/datumaro/tests/test_voc_format.py
+++ b/datumaro/tests/test_voc_format.py
@@ -28,6 +28,7 @@
     VocSegmentationConverter,
 )
 from datumaro.components.importers.voc import VocImporter
+from datumaro.components.project import Project
 from datumaro.util import find
 from datumaro.util.test_utils import TestDir
 
@@ -437,7 +438,7 @@ def test_can_save_voc_action(self):
     def test_can_save_dataset_with_no_subsets(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=1, annotations=[
                         BboxObject(2, 3, 4, 5, label=2, id=1),
                         BboxObject(2, 3, 4, 5, label=3, id=2),
@@ -446,10 +447,7 @@ def __iter__(self):
                     DatasetItem(id=2, annotations=[
                         BboxObject(5, 4, 6, 5, label=3, id=1),
                     ]),
-                ]
-
-                for item in items:
-                    yield item
+                ])
 
             def categories(self):
                 label_cat = LabelCategories()
@@ -480,7 +478,7 @@ def test_can_import(self):
             dummy_dir = osp.join(test_dir.path, 'dummy')
             subsets = generate_dummy_voc(dummy_dir)
 
-            dataset = VocImporter()(dummy_dir).make_dataset()
+            dataset = Project.import_from(dummy_dir, 'voc').make_dataset()
 
             self.assertEqual(len(VOC.VocTask), len(dataset.sources))
             self.assertEqual(set(subsets), set(dataset.subsets()))
diff --git a/datumaro/tests/test_yolo_format.py b/datumaro/tests/test_yolo_format.py
index 364c91a04b54..6b24ba5d927d 100644
--- a/datumaro/tests/test_yolo_format.py
+++ b/datumaro/tests/test_yolo_format.py
@@ -14,7 +14,7 @@ class YoloFormatTest(TestCase):
     def test_can_save_and_load(self):
         class TestExtractor(Extractor):
             def __iter__(self):
-                items = [
+                return iter([
                     DatasetItem(id=1, subset='train', image=np.ones((8, 8, 3)),
                         annotations=[
                             BboxObject(0, 2, 4, 2, label=2),
@@ -34,8 +34,7 @@ def __iter__(self):
                             BboxObject(0, 2, 4, 2, label=6),
                             BboxObject(0, 7, 3, 2, label=7),
                         ]),
-                ]
-                return iter(items)
+                ])
 
             def categories(self):
                 label_categories = LabelCategories()