Subformat importers for VOC and COCO #281

Merged (11 commits, Jun 9, 2021)
CHANGELOG.md (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
-
- Subformat importers for VOC and COCO (<https://github.com/openvinotoolkit/datumaro/pull/281>)

### Changed
-
datumaro/components/dataset.py (30 changes: 16 additions & 14 deletions)
@@ -97,22 +97,22 @@ def put(self, item):
return self._data.put(item)

def get(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self._data.get(id, subset)

def remove(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self._data.remove(id, subset)

def get_subset(self, name):
assert name or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (name or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self

def subsets(self):
return { self.name or DEFAULT_SUBSET_NAME: self }
return { self.name or DEFAULT_SUBSET_NAME : self }

def categories(self):
return self.parent.categories()
@@ -187,22 +187,24 @@ def put(self, item):
return self.parent.put(item, subset=self.name)

def get(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self.parent.get(id, subset=self.name)

def remove(self, id, subset=None):
assert subset or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (subset or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self.parent.remove(id, subset=self.name)

def get_subset(self, name):
assert name or DEFAULT_SUBSET_NAME == \
self.name or DEFAULT_SUBSET_NAME
assert (name or DEFAULT_SUBSET_NAME) == \
(self.name or DEFAULT_SUBSET_NAME)
return self

def subsets(self):
return { self.name or DEFAULT_SUBSET_NAME: self }
if (self.name or DEFAULT_SUBSET_NAME) == DEFAULT_SUBSET_NAME:
return self.parent.subsets()
return { self.name: self }

def categories(self):
return self.parent.categories()
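The assertion rewrites above fix an operator-precedence bug: without parentheses, `subset or DEFAULT_SUBSET_NAME == self.name or DEFAULT_SUBSET_NAME` parses as `subset or (DEFAULT_SUBSET_NAME == self.name) or DEFAULT_SUBSET_NAME`, which is truthy for any input, so the old asserts never fired. A minimal, self-contained sketch of the difference (the constant value below is only an illustrative stand-in for datumaro's actual default):

```python
DEFAULT_SUBSET_NAME = 'default'  # illustrative stand-in for datumaro's constant

def buggy_check(subset, name):
    # Parses as: subset or (DEFAULT_SUBSET_NAME == name) or DEFAULT_SUBSET_NAME,
    # so the trailing "or DEFAULT_SUBSET_NAME" makes the result always truthy.
    return bool(subset or DEFAULT_SUBSET_NAME == name or DEFAULT_SUBSET_NAME)

def fixed_check(subset, name):
    # Compares the two effective subset names, each falling back to the default.
    return (subset or DEFAULT_SUBSET_NAME) == (name or DEFAULT_SUBSET_NAME)

assert buggy_check('train', 'val')      # passes even though the subsets differ
assert not fixed_check('train', 'val')  # the fixed form reports the mismatch
assert fixed_check(None, 'default')     # None falls back to the default name
```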
datumaro/components/extractor.py (40 changes: 33 additions & 7 deletions)
@@ -5,7 +5,7 @@

from enum import Enum
from glob import iglob
from typing import Iterable, List, Dict, Optional
from typing import Callable, Iterable, List, Dict, Optional
import numpy as np
import os
import os.path as osp
@@ -672,13 +672,39 @@ def __call__(self, path, **extra_params):
return project

@classmethod
def _find_sources_recursive(cls, path, ext, extractor_name,
filename='*', dirname='', file_filter=None, max_depth=3):
def _find_sources_recursive(cls, path: str, ext: Optional[str],
extractor_name: str, filename: str = '*', dirname: str = '',
file_filter: Optional[Callable[[str], bool]] = None,
max_depth: int = 3):
"""
Finds sources in the specified location, using the matching pattern
to filter file names and directories.
Intended to be used as the only call in subclasses.

Parameters:
- path - a directory or file path, where sources need to be found.
- ext - file extension to match. To match directories,
set this parameter to None or ''. Comparison is case-independent,
a starting dot is not required.
- extractor_name - the name of the associated Extractor type
- filename - a glob pattern for file names
- dirname - a glob pattern for filename prefixes
- file_filter - a callable (abspath: str) -> bool, to filter paths found
- max_depth - the maximum depth for recursive search.

Returns: a list of source configurations
(i.e. Extractor type names and c-tor parameters)
"""

if ext:
if not ext.startswith('.'):
ext = '.' + ext
ext = ext.lower()

if (path.endswith(ext) and osp.isfile(path)) or \
(not ext and osp.isdir(path) and dirname and \
os.sep + osp.normpath(dirname) + os.sep in \
osp.abspath(path) + os.sep):
if (ext and path.lower().endswith(ext) and osp.isfile(path)) or \
(not ext and dirname and osp.isdir(path) and \
os.sep + osp.normpath(dirname.lower()) + os.sep in \
osp.abspath(path.lower()) + os.sep):
sources = [{'url': path, 'format': extractor_name}]
else:
sources = []
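Since the docstring above now spells out the `_find_sources_recursive()` contract, here is a hedged usage sketch of how a subclass might call it; `FooImporter`, the `.foo` extension, the `annotations` directory, and the `foo_format` extractor name are hypothetical placeholders, not part of this PR:

```python
import os.path as osp

from datumaro.components.extractor import Importer


class FooImporter(Importer):
    # Hypothetical importer whose annotations are '*.foo' files
    # stored under an 'annotations/' directory.

    @classmethod
    def find_sources(cls, path):
        # ext is matched case-insensitively and the leading dot is optional;
        # dirname is a glob pattern for the directory part of the match;
        # file_filter is an extra predicate applied to every candidate path.
        # The call returns a list of {'url': ..., 'format': ...} source configs.
        return cls._find_sources_recursive(path, '.foo', 'foo_format',
            dirname='annotations',
            file_filter=lambda p: not osp.basename(p).startswith('.'),
            max_depth=2)
```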
datumaro/plugins/coco_format/importer.py (58 changes: 42 additions & 16 deletions)
@@ -3,19 +3,19 @@
#
# SPDX-License-Identifier: MIT

from collections import defaultdict
from glob import glob
import logging as log
import os.path as osp

from datumaro.components.extractor import Importer
from datumaro.util import parse_str_enum_value
from datumaro.util.log_utils import logging_disabled

from .format import CocoTask


class CocoImporter(Importer):
_COCO_EXTRACTORS = {
_TASKS = {
CocoTask.instances: 'coco_instances',
CocoTask.person_keypoints: 'coco_person_keypoints',
CocoTask.captions: 'coco_captions',
@@ -65,34 +65,60 @@ def __call__(self, path, **extra_params):
source_name = osp.splitext(osp.basename(ann_file))[0]
project.add_source(source_name, {
'url': ann_file,
'format': self._COCO_EXTRACTORS[ann_type],
'format': self._TASKS[ann_type],
'options': dict(extra_params),
})

return project

@staticmethod
def find_sources(path):
@classmethod
def find_sources(cls, path):
if path.endswith('.json') and osp.isfile(path):
subset_paths = [path]
else:
subset_paths = glob(osp.join(path, '**', '*_*.json'),
recursive=True)

subsets = defaultdict(dict)
subsets = {}
for subset_path in subset_paths:
name_parts = osp.splitext(osp.basename(subset_path))[0] \
.rsplit('_', maxsplit=1)

ann_type = name_parts[0]
try:
ann_type = CocoTask[ann_type]
except KeyError:
log.warning("Skipping '%s': unknown subset "
"type '%s', the only known are: %s" % \
(subset_path, ann_type,
', '.join(e.name for e in CocoTask)))
ann_type = parse_str_enum_value(name_parts[0], CocoTask,
default=None)
if ann_type not in cls._TASKS:
continue

subset_name = name_parts[1]
subsets[subset_name][ann_type] = subset_path
return dict(subsets)
subsets.setdefault(subset_name, {})[ann_type] = subset_path

return subsets


class CocoImageInfoImporter(CocoImporter):
_TASK = CocoTask.image_info
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoCaptionsImporter(CocoImporter):
_TASK = CocoTask.captions
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoInstancesImporter(CocoImporter):
_TASK = CocoTask.instances
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoPersonKeypointsImporter(CocoImporter):
_TASK = CocoTask.person_keypoints
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoLabelsImporter(CocoImporter):
_TASK = CocoTask.labels
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoPanopticImporter(CocoImporter):
_TASK = CocoTask.panoptic
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }

class CocoStuffImporter(CocoImporter):
_TASK = CocoTask.stuff
_TASKS = { _TASK: CocoImporter._TASKS[_TASK] }
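Each subformat class above only narrows the inherited `_TASKS` map, so it reuses `find_sources()` unchanged but skips annotation files whose task is filtered out by `if ann_type not in cls._TASKS`. As a hedged usage sketch (the dataset path is hypothetical, and it is assumed the new importers are registered under the extractor names listed in `_TASKS`):

```python
from datumaro.components.dataset import Dataset

# With the generic 'coco' importer, every *_<subset>.json task file found in
# the directory becomes a source. Naming a subformat narrows the import to a
# single task, e.g. only instance annotations:
dataset = Dataset.import_from('path/to/coco_dataset', 'coco_instances')
```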
datumaro/plugins/lfw_format.py (3 changes: 2 additions & 1 deletion)
@@ -164,7 +164,8 @@ def get_image_name(person, image_id):
class LfwImporter(Importer):
@classmethod
def find_sources(cls, path):
return cls._find_sources_recursive(path, LfwPath.PAIRS_FILE, 'lfw')
base, ext = osp.splitext(LfwPath.PAIRS_FILE)
return cls._find_sources_recursive(path, ext, 'lfw', filename=base)

class LfwConverter(Converter):
DEFAULT_IMAGE_EXT = LfwPath.IMAGE_EXT
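The LFW change adapts the call to the clarified `_find_sources_recursive()` contract: `ext` is expected to be an extension rather than a whole file name. Assuming `LfwPath.PAIRS_FILE` is a name such as `'pairs.txt'` (an assumption for illustration; the real value lives in the LFW format module), the split behaves like this:

```python
import os.path as osp

PAIRS_FILE = 'pairs.txt'  # assumed value of LfwPath.PAIRS_FILE

base, ext = osp.splitext(PAIRS_FILE)
assert (base, ext) == ('pairs', '.txt')
# The search now looks for files named 'pairs' with a '.txt' extension,
# instead of treating the whole 'pairs.txt' string as an extension.
```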
datumaro/plugins/voc_format/importer.py (110 changes: 64 additions & 46 deletions)
@@ -3,75 +3,93 @@
#
# SPDX-License-Identifier: MIT

from glob import glob
import os.path as osp

from datumaro.components.extractor import Importer

from .format import VocTask, VocPath

def find_path(root_path, path, depth=4):
level, is_found = 0, False
full_path = None
while level < depth and not is_found:
full_path = osp.join(root_path, path)
paths = glob(full_path)
if paths:
full_path = paths[0] # ignore all after the first one
is_found = osp.isdir(full_path)
else:
full_path = None

level += 1
root_path = osp.join(root_path, '*')

return full_path

class VocImporter(Importer):
_TASKS = [
(VocTask.classification, 'voc_classification', 'Main'),
(VocTask.detection, 'voc_detection', 'Main'),
(VocTask.segmentation, 'voc_segmentation', 'Segmentation'),
(VocTask.person_layout, 'voc_layout', 'Layout'),
(VocTask.action_classification, 'voc_action', 'Action'),
]
_TASKS = {
VocTask.classification: ('voc_classification', 'Main'),
VocTask.detection: ('voc_detection', 'Main'),
VocTask.segmentation: ('voc_segmentation', 'Segmentation'),
VocTask.person_layout: ('voc_layout', 'Layout'),
VocTask.action_classification: ('voc_action', 'Action'),
}

def __call__(self, path, **extra_params):
from datumaro.components.project import Project # cyclic import
project = Project()

subset_paths = self.find_sources(path)
if len(subset_paths) == 0:
subsets = self.find_sources(path)
if len(subsets) == 0:
raise Exception("Failed to find 'voc' dataset at '%s'" % path)

for task, extractor_type, subset_path in subset_paths:
for config in subsets:
subset_path = config['url']
extractor_type = config['format']

task = extractor_type.split('_')[1]

opts = dict(config.get('options') or {})
opts.update(extra_params)

project.add_source('%s-%s' %
(task.name, osp.splitext(osp.basename(subset_path))[0]),
(task, osp.splitext(osp.basename(subset_path))[0]),
{
'url': subset_path,
'format': extractor_type,
'options': dict(extra_params),
'options': opts,
})

return project

@classmethod
def find_sources(cls, path):
# find root path for the dataset
root_path = path
for task, extractor_type, task_dir in cls._TASKS:
task_path = find_path(root_path, osp.join(VocPath.SUBSETS_DIR, task_dir))
if task_path:
root_path = osp.dirname(osp.dirname(task_path))
break

subset_paths = []
for task, extractor_type, task_dir in cls._TASKS:
task_path = osp.join(root_path, VocPath.SUBSETS_DIR, task_dir)

if not osp.isdir(task_path):
subsets = []

# find root path for the dataset and use it for all tasks
root_path = None
for extractor_type, task_dir in cls._TASKS.values():
if osp.isfile(path) and \
not osp.basename(osp.dirname(path)) == task_dir:
continue

task_subsets = cls._find_sources_recursive(root_path or path,
'txt', extractor_type,
dirname=osp.join(VocPath.SUBSETS_DIR, task_dir),
file_filter=lambda p: '_' not in osp.basename(p),
max_depth=0 if root_path else 3)

if not task_subsets:
continue
task_subsets = [p for p in glob(osp.join(task_path, '*.txt'))
if '_' not in osp.basename(p)]
subset_paths += [(task, extractor_type, p) for p in task_subsets]
return subset_paths

subsets.extend(task_subsets)

if not root_path:
root_path = osp.dirname(osp.dirname(
osp.dirname(task_subsets[0]['url'])))

return subsets

class VocClassificationImporter(VocImporter):
_TASK = VocTask.classification
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocDetectionImporter(VocImporter):
_TASK = VocTask.detection
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocSegmentationImporter(VocImporter):
_TASK = VocTask.segmentation
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocLayoutImporter(VocImporter):
_TASK = VocTask.person_layout
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }

class VocActionImporter(VocImporter):
_TASK = VocTask.action_classification
_TASKS = { _TASK: VocImporter._TASKS[_TASK] }
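The reworked VOC `find_sources()` above delegates to `_find_sources_recursive()`, matching `*.txt` split files (without underscores in their names) under the per-task subset directories, and the subformat classes again just narrow `_TASKS`. As a hedged usage sketch, assuming `VocPath.SUBSETS_DIR` is the usual `ImageSets` directory and using a made-up dataset path:

```python
from datumaro.components.dataset import Dataset

# A typical VOC layout keeps split files under ImageSets/, for example:
#   VOC2012/ImageSets/Main/train.txt
#   VOC2012/ImageSets/Segmentation/train.txt
#
# 'voc' picks up every supported task it finds; a subformat importer such as
# 'voc_detection' only searches its own task directory ('Main' here).
dataset = Dataset.import_from('path/to/VOC2012', 'voc_detection')
```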