Subformat importers for VOC and COCO #281

Merged (11 commits, Jun 9, 2021)
CHANGELOG.md (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
-
- Subformat importers for VOC and COCO (<https://github.com/openvinotoolkit/datumaro/pull/281>)

### Changed
-
datumaro/components/dataset.py (30 changes: 16 additions & 14 deletions)
@@ -97,22 +97,22 @@ def put(self, item):
return self._data.put(item)

def get(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self._data.get(id, subset)

def remove(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self._data.remove(id, subset)

def get_subset(self, name):
assert name or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (name or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self

def subsets(self):
return { self.name or DEFAULT_SUBSET_NAME: self }
return { self.name or DEFAULT_SUBSET_NAME : self }

def categories(self):
return self.parent.categories()
@@ -187,22 +187,24 @@ def put(self, item):
return self.parent.put(item, subset=self.name)

def get(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self.parent.get(id, subset=self.name)

def remove(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self.parent.remove(id, subset=self.name)

def get_subset(self, name):
assert name or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (name or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self

def subsets(self):
return { self.name or DEFAULT_SUBSET_NAME: self }
if (self.name or DEFAULT_SUBSET_NAME) == DEFAULT_SUBSET_NAME:
return self.parent.subsets()
return { self.name: self }

def categories(self):
return self.parent.categories()
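The assertion rewrites above fix an operator-precedence bug: without parentheses, `subset or DEFAULT_SUBSET_NAME == self.name or DEFAULT_SUBSET_NAME` parses as `subset or (DEFAULT_SUBSET_NAME == self.name) or DEFAULT_SUBSET_NAME`, which is truthy for any input, so the old asserts never fired. A minimal, self-contained sketch of the difference (the constant value below is only an illustrative stand-in for datumaro's actual default):

```python
DEFAULT_SUBSET_NAME = 'default'  # illustrative stand-in for datumaro's constant

def buggy_check(subset, name):
    # Parses as: subset or (DEFAULT_SUBSET_NAME == name) or DEFAULT_SUBSET_NAME,
    # so the trailing "or DEFAULT_SUBSET_NAME" makes the result always truthy.
    return bool(subset or DEFAULT_SUBSET_NAME == name or DEFAULT_SUBSET_NAME)

def fixed_check(subset, name):
    # Compares the two effective subset names, each falling back to the default.
    return (subset or DEFAULT_SUBSET_NAME) == (name or DEFAULT_SUBSET_NAME)

assert buggy_check('train', 'val')      # passes even though the subsets differ
assert not fixed_check('train', 'val')  # the fixed form reports the mismatch
assert fixed_check(None, 'default')     # None falls back to the default name
```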
datumaro/components/extractor.py (40 changes: 33 additions & 7 deletions)
@@ -5,7 +5,7 @@

from enum import Enum
from glob import iglob
from typing import Iterable, List, Dict, Optional
from typing import Callable, Iterable, List, Dict, Optional
import numpy as np
import os
import os.path as osp
@@ -672,13 +672,39 @@ def __call__(self, path, **extra_params):
return project

@classmethod
def _find_sources_recursive(cls, path, ext, extractor_name,
filename='*', dirname='', file_filter=None, max_depth=3):
def _find_sources_recursive(cls, path: str, ext: Optional[str],
extractor_name: str, filename: str = '*', dirname: str = '',
file_filter: Optional[Callable[[str], bool]] = None,
max_depth: int = 3):
"""
Finds sources in the specified location, using the matching pattern
to filter file names and directories.
Intended to be used as the only call in subclasses.

Parameters:
- path - a directory or file path, where sources need to be found.
- ext - file extension to match. To match directories,
set this parameter to None or ''. Comparison is case-independent,
a starting dot is not required.
- extractor_name - the name of the associated Extractor type
- filename - a glob pattern for file names
- dirname - a glob pattern for filename prefixes
- file_filter - a callable (abspath: str) -> bool, to filter paths found
- max_depth - the maximum depth for recursive search.

Returns: a list of source configurations
(i.e. Extractor type names and c-tor parameters)
"""

if ext:
if not ext.startswith('.'):
ext = '.' + ext
ext = ext.lower()

if (path.endswith(ext) and osp.isfile(path)) or \
(not ext and osp.isdir(path) and dirname and \
os.sep + osp.normpath(dirname) + os.sep in \
osp.abspath(path) + os.sep):
if (ext and path.lower().endswith(ext) and osp.isfile(path)) or \
(not ext and dirname and osp.isdir(path) and \
os.sep + osp.normpath(dirname.lower()) + os.sep in \
osp.abspath(path.lower()) + os.sep):
sources = [{'url': path, 'format': extractor_name}]
else:
sources = []
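Since the docstring above now spells out the `_find_sources_recursive()` contract, here is a hedged usage sketch of how a subclass might call it; `FooImporter`, the `.foo` extension, the `annotations` directory, and the `foo_format` extractor name are hypothetical placeholders, not part of this PR:

```python
import os.path as osp

from datumaro.components.extractor import Importer


class FooImporter(Importer):
    # Hypothetical importer whose annotations are '*.foo' files
    # stored under an 'annotations/' directory.

    @classmethod
    def find_sources(cls, path):
        # ext is matched case-insensitively and the leading dot is optional;
        # dirname is a glob pattern for the directory part of the match;
        # file_filter is an extra predicate applied to every candidate path.
        # The call returns a list of {'url': ..., 'format': ...} source configs.
        return cls._find_sources_recursive(path, '.foo', 'foo_format',
            dirname='annotations',
            file_filter=lambda p: not osp.basename(p).startswith('.'),
            max_depth=2)
```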
datumaro/plugins/coco_format/importer.py (58 changes: 42 additions & 16 deletions)
@@ -3,19 +3,19 @@
#
# SPDX-License-Identifier: MIT

from collections import defaultdict
from glob import glob
import logging as log
import os.path as osp

from datumaro.components.extractor import Importer
from datumaro.util import parse_str_enum_value
from datumaro.util.log_utils import logging_disabled

from .format import CocoTask


class CocoImporter(Importer):
_COCO_EXTRACTORS = {
_TASKS = {
CocoTask.instances: 'coco_instances',
CocoTask.person_keypoints: 'coco_person_keypoints',
CocoTask.captions: 'coco_captions',
@@ -65,34 +65,60 @@ def __call__(self, path, **extra_params):
source_name = osp.splitext(osp.basename(ann_file))[0]
project.add_source(source_name, {
'url': ann_file,
'format': self._COCO_EXTRACTORS[ann_type],
'format': self._TASKS[ann_type],
'options': dict(extra_params),
})

return project

@staticmethod
def find_sources(path):
@classmethod
def find_sources(cls, path):
if path.endswith('.json') and osp.isfile(path):
subset_paths = [path]
else:
subset_paths = glob(osp.join(path, '**', '*_*.json'),
recursive=True)

subsets = defaultdict(dict)
subsets = {}
for subset_path in subset_paths:
name_parts = osp.splitext(osp.basename(subset_path))[0] \
.rsplit('_', maxsplit=1)

ann_type = name_parts[0]
try:
ann_type = CocoTask[ann_type]
except KeyError:
log.warning("Skipping '%s': unknown subset "
"type '%s', the only known are: %s" % \
(subset_path, ann_type,
', '.join(e.name for e in CocoTask)))
ann_type = parse_str_enum_value(name_parts[0], CocoTask,
default=None)
if ann_type not in cls._TASKS:
continue

subset_name = name_parts[1]
subsets[subset_name][ann_type] = subset_path
return dict(subsets)
subsets.setdefault(subset_name, {})[ann_type] = subset_path

return subsets


class CocoImageInfoImporter(CocoImporter):
_TASK = CocoTask.image_info
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoCaptionsImporter(CocoImporter):
_TASK = CocoTask.captions
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoInstancesImporter(CocoImporter):
_TASK = CocoTask.instances
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoPersonKeypointsImporter(CocoImporter):
_TASK = CocoTask.person_keypoints
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoLabelsImporter(CocoImporter):
_TASK = CocoTask.labels
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoPanopticImporter(CocoImporter):
_TASK = CocoTask.panoptic
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoStuffImporter(CocoImporter):
_TASK = CocoTask.stuff
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }
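Each subformat class above only narrows the inherited `_TASKS` map, so it reuses `find_sources()` unchanged but skips annotation files whose task is filtered out by `if ann_type not in cls._TASKS`. As a hedged usage sketch (the dataset path is hypothetical, and it is assumed the new importers are registered under the extractor names listed in `_TASKS`):

```python
from datumaro.components.dataset import Dataset

# With the generic 'coco' importer, every *_<subset>.json task file found in
# the directory becomes a source. Naming a subformat narrows the import to a
# single task, e.g. only instance annotations:
dataset = Dataset.import_from('path/to/coco_dataset', 'coco_instances')
```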
datumaro/plugins/lfw_format.py (3 changes: 2 additions & 1 deletion)
@@ -164,7 +164,8 @@ def get_image_name(person, image_id):
class LfwImporter(Importer):
@classmethod
def find_sources(cls, path):
return cls._find_sources_recursive(path, LfwPath.PAIRS_FILE, 'lfw')
base, ext = osp.splitext(LfwPath.PAIRS_FILE)
return cls._find_sources_recursive(path, ext, 'lfw', filename=base)

class LfwConverter(Converter):
DEFAULT_IMAGE_EXT = LfwPath.IMAGE_EXT
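The LFW change adapts the call to the clarified `_find_sources_recursive()` contract: `ext` is expected to be an extension rather than a whole file name. Assuming `LfwPath.PAIRS_FILE` is a name such as `'pairs.txt'` (an assumption for illustration; the real value lives in the LFW format module), the split behaves like this:

```python
import os.path as osp

PAIRS_FILE = 'pairs.txt'  # assumed value of LfwPath.PAIRS_FILE

base, ext = osp.splitext(PAIRS_FILE)
assert (base, ext) == ('pairs', '.txt')
# The search now looks for files named 'pairs' with a '.txt' extension,
# instead of treating the whole 'pairs.txt' string as an extension.
```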
datumaro/plugins/voc_format/importer.py (110 changes: 64 additions & 46 deletions)
@@ -3,75 +3,93 @@
#
# SPDX-License-Identifier: MIT

from glob import glob
import os.path as osp

from datumaro.components.extractor import Importer

from .format import VocTask, VocPath

def find_path(root_path, path, depth=4):
level, is_found = 0, False
full_path = None
while level < depth and not is_found:
full_path = osp.join(root_path, path)
paths = glob(full_path)
if paths:
full_path = paths[0] # ignore all after the first one
is_found = osp.isdir(full_path)
else:
full_path = None

level += 1
root_path = osp.join(root_path, '*')

return full_path

class VocImporter(Importer):
_TASKS = [
(VocTask.classification, 'voc_classification', 'Main'),
(VocTask.detection, 'voc_detection', 'Main'),
(VocTask.segmentation, 'voc_segmentation', 'Segmentation'),
(VocTask.person_layout, 'voc_layout', 'Layout'),
(VocTask.action_classification, 'voc_action', 'Action'),
]
_TASKS = {
VocTask.classification: ('voc_classification', 'Main'),
VocTask.detection: ('voc_detection', 'Main'),
VocTask.segmentation: ('voc_segmentation', 'Segmentation'),
VocTask.person_layout: ('voc_layout', 'Layout'),
VocTask.action_classification: ('voc_action', 'Action'),
}

def __call__(self, path, **extra_params):
from datumaro.components.project import Project # cyclic import
project = Project()

subset_paths = self.find_sources(path)
if len(subset_paths) == 0:
subsets = self.find_sources(path)
if len(subsets) == 0:
raise Exception("Failed to find 'voc' dataset at '%s'" % path)

for task, extractor_type, subset_path in subset_paths:
for config in subsets:
subset_path = config['url']
extractor_type = config['format']

task = extractor_type.split('_')[1]

opts = dict(config.get('options') or {})
opts.update(extra_params)

project.add_source('%s-%s' %
(task.name, osp.splitext(osp.basename(subset_path))[0]),
(task, osp.splitext(osp.basename(subset_path))[0]),
{
'url': subset_path,
'format': extractor_type,
'options': dict(extra_params),
'options': opts,
})

return project

@classmethod
def find_sources(cls, path):
# find root path for the dataset
root_path = path
for task, extractor_type, task_dir in cls._TASKS:
task_path = find_path(root_path, osp.join(VocPath.SUBSETS_DIR, task_dir))
if task_path:
root_path = osp.dirname(osp.dirname(task_path))
break

subset_paths = []
for task, extractor_type, task_dir in cls._TASKS:
task_path = osp.join(root_path, VocPath.SUBSETS_DIR, task_dir)

if not osp.isdir(task_path):
subsets = []

# find root path for the dataset and use it for all tasks
root_path = None
for extractor_type, task_dir in cls._TASKS.values():
if osp.isfile(path) and \
not osp.basename(osp.dirname(path)) == task_dir:
continue

task_subsets = cls._find_sources_recursive(root_path or path,
'txt', extractor_type,
dirname=osp.join(VocPath.SUBSETS_DIR, task_dir),
file_filter=lambda p: '_' not in osp.basename(p),
max_depth=0 if root_path else 3)

if not task_subsets:
continue
task_subsets = [p for p in glob(osp.join(task_path, '*.txt'))
if '_' not in osp.basename(p)]
subset_paths += [(task, extractor_type, p) for p in task_subsets]
return subset_paths

subsets.extend(task_subsets)

if not root_path:
root_path = osp.dirname(osp.dirname(
osp.dirname(task_subsets[0]['url'])))

return subsets

class VocClassificationImporter(VocImporter):
_TASK = VocTask.classification
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocDetectionImporter(VocImporter):
_TASK = VocTask.detection
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocSegmentationImporter(VocImporter):
_TASK = VocTask.segmentation
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocLayoutImporter(VocImporter):
_TASK = VocTask.person_layout
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocActionImporter(VocImporter):
_TASK = VocTask.action_classification
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }
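The reworked VOC `find_sources()` above delegates to `_find_sources_recursive()`, matching `*.txt` split files (without underscores in their names) under the per-task subset directories, and the subformat classes again just narrow `_TASKS`. As a hedged usage sketch, assuming `VocPath.SUBSETS_DIR` is the usual `ImageSets` directory and using a made-up dataset path:

```python
from datumaro.components.dataset import Dataset

# A typical VOC layout keeps split files under ImageSets/, for example:
#   VOC2012/ImageSets/Main/train.txt
#   VOC2012/ImageSets/Segmentation/train.txt
#
# 'voc' picks up every supported task it finds; a subformat importer such as
# 'voc_detection' only searches its own task directory ('Main' here).
dataset = Dataset.import_from('path/to/VOC2012', 'voc_detection')
```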