openvinotoolkit · sooahleex · Apr 20, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
@@ -57,7 +57,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/826>)
 - Add DatumaroBinary format
   (<https://github.com/openvinotoolkit/datumaro/pull/828>, <https://github.com/openvinotoolkit/datumaro/pull/829>, <https://github.com/openvinotoolkit/datumaro/pull/830>, <https://github.com/openvinotoolkit/datumaro/pull/831>, <https://github.com/openvinotoolkit/datumaro/pull/880>, <https://github.com/openvinotoolkit/datumaro/pull/883>)
-- Add Searcher CLI documentation
+- Add Explorer CLI documentation
   (<https://github.com/openvinotoolkit/datumaro/pull/838>)
 - Add version to dataset exported as datumaro format
   (<https://github.com/openvinotoolkit/datumaro/pull/842>)

@@ -10,12 +10,12 @@
     diff,
     download,
     explain,
+    explore,
     filter,
     generate,
     info,
     merge,
     patch,
-    search,
     stats,
     transform,
     validate,
@@ -40,7 +40,7 @@ def get_non_project_commands():
         ("generate", generate, "Generate synthetic dataset"),
         ("merge", merge, "Merge datasets"),
         ("patch", patch, "Update dataset from another one"),
-        ("search", search, "Search similar datasetitems of query"),
+        ("explore", explore, "Explore similar datasetitems of query"),
         ("stats", stats, "Compute dataset statistics"),
         ("transform", transform, "Modify dataset items"),
         ("validate", validate, "Validate dataset"),

@@ -9,18 +9,18 @@
 import numpy as np
 
 from datumaro.components.errors import ProjectNotFoundError
-from datumaro.components.searcher import Searcher
+from datumaro.components.explorer import Explorer
 from datumaro.components.visualizer import Visualizer
 from datumaro.util.image import save_image
-from datumaro.util.scope import scope_add
+from datumaro.util.scope import scope_add, scoped
 
 from ..util import MultilineFormatter
 from ..util.project import load_project, parse_full_revpath
 
 
 def build_parser(parser_ctor=argparse.ArgumentParser):
     parser = parser_ctor(
-        help="Search similar data of query in dataset",
+        help="Explore similar data of query in dataset",
         description="""
         Applies data exploration to a dataset for image/text query.
         The command can be useful if you have to find similar data in dataset.
@@ -30,28 +30,28 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
         When not specified, the current project's working tree is used.|n
         |n
         Examples:|n
-        - Search top50 similar images of image query in COCO dataset:|n
+        - Explore top50 similar images of image query in COCO dataset:|n
         |s|s%(prog)s -q path/to/image.jpg -topk 50|n
-        - Search top50 similar images of text query, elephant, in COCO dataset:|n
+        - Explore top50 similar images of text query, elephant, in COCO dataset:|n
         |s|s%(prog)s -q elephant -topk 50|n
-        - Search top50 similar images of image query list in COCO dataset:|n
+        - Explore top50 similar images of image query list in COCO dataset:|n
         |s|s%(prog)s -q path/to/image1.jpg/ path/to/image2.jpg/ path/to/image3.jpg/ -topk 50|n
-        - Search top50 similar images of text query list in COCO dataset:|n
-        |s|s%(prog)s -q motorcycle/ bus/ train/ -topk 50|n
+        - Explore top50 similar images of text query list in COCO dataset:|n
+        |s|s%(prog)s -q motorcycle/ bus/ train/ -topk 50
         """,
         formatter_class=MultilineFormatter,
     )
 
     parser.add_argument(
         "_positionals", nargs=argparse.REMAINDER, help=argparse.SUPPRESS
     )  # workaround for -- eaten by positionals
-    parser.add_argument("target", nargs="?", default="project", help="Target dataset")
+    parser.add_argument("target", nargs="+", default="project", help="Target dataset")
     parser.add_argument(
         "-q",
         "--query",
         dest="query",
         required=True,
-        help="Image path or id of query to search similar data",
+        help="Image path or id of query to explore similar data",
     )
     parser.add_argument("-topk", type=int, dest="topk", help="Number of similar results")
     parser.add_argument(
@@ -61,47 +61,45 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
         help="Directory of the project to operate on (default: current dir)",
     )
     parser.add_argument(
-        "-s", "--save", dest="save", default=True, help="Save searcher result as png"
+        "-s", "--save", dest="save", default=True, help="Save explorer result as png"
     )
 
-    parser.set_defaults(command=search_command)
+    parser.set_defaults(command=explore_command)
 
     return parser
 
 
 def get_sensitive_args():
     return {
-        search_command: [
+        explore_command: [
             "target",
             "query",
             "topk",
-            "project_dir",
             "save",
         ]
     }
 
 
-def search_command(args):
+@scoped
+def explore_command(args):
     project = None
     try:
         project = scope_add(load_project(args.project_dir))
     except ProjectNotFoundError:
         if args.project_dir:
-            log.info(
-                f"Wrong argument: project_dir, {args.project_dir}, should be a path to project dir"
-            )
             raise
-    dataset, _ = parse_full_revpath(args.target, project)
 
-    searcher = Searcher(dataset)
+    dataset, _ = parse_full_revpath(args.target[0], project)
+
+    explorer = Explorer(dataset)
 
     # Get query datasetitem through query path
     if osp.exists(args.query):
         query_datasetitem = dataset.get_datasetitem_by_path(args.query)
     else:
         query_datasetitem = args.query
 
-    results = searcher.search_topk(query_datasetitem, args.topk)
+    results = explorer.explore_topk(query_datasetitem, args.topk)
 
     subset_list = []
     id_list = []
@@ -121,6 +119,6 @@ def search_command(args):
         fig.canvas.draw()
         data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
         data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-        save_image(osp.join("./searcher.png"), data, create_dir=True)
+        save_image(osp.join("./explorer.png"), data, create_dir=True)
 
     return 0
@@ -11,7 +11,7 @@
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.errors import MediaTypeError
 from datumaro.components.media import Image
-from datumaro.plugins.searcher import SearcherLauncher
+from datumaro.plugins.explorer import ExplorerLauncher
 
 
 def calculate_hamming(B1, B2):
@@ -25,24 +25,24 @@ def calculate_hamming(B1, B2):
     return distH
 
 
-class Searcher:
+class Explorer:
     def __init__(
         self,
         dataset: IDataset,
         topk: int = 10,
     ) -> None:
         """
-        Searcher for Datumaro dataitems
+        Explorer for Datumaro dataitems
 
         Parameters
         ----------
         dataset:
-            Datumaro dataset to search similar dataitem.
+            Datumaro dataset to explore similar dataitem.
         topk:
             Number of images.
         """
-        self._model = SearcherLauncher(model_name="clip_visual_ViT-B_32")
-        self._text_model = SearcherLauncher(model_name="clip_text_ViT-B_32")
+        self._model = ExplorerLauncher(model_name="clip_visual_ViT-B_32")
+        self._text_model = ExplorerLauncher(model_name="clip_text_ViT-B_32")
         inference = dataset.run_model(self._model, append_annotation=True)
         self._topk = topk
 
@@ -63,13 +63,13 @@ def __init__(
         self._database_keys = database_keys
         self._item_list = item_list
 
-    def search_topk(
+    def explore_topk(
         self,
         query: Union[DatasetItem, str, List[DatasetItem], List[str]],
         topk: Optional[int] = None,
     ):
         """
-        Search topk similar results based on hamming distance for query DatasetItem
+        Explore topk similar results based on hamming distance for query DatasetItem
         """
         if not topk:
             topk = self._topk

@@ -10,7 +10,7 @@
 from datumaro.plugins.openvino_plugin.launcher import OpenvinoLauncher
 
 
-class SearcherLauncher(OpenvinoLauncher):
+class ExplorerLauncher(OpenvinoLauncher):
     def __init__(
         self,
         description=None,

@@ -0,0 +1,48 @@
+# Explore
+
+## Explore datasets
+
+This command explores a dataset for data similar to a query. You can use any image file or text description as the query, or even pass a list of them. The result includes the top-k most similar items in the target dataset, and a visualization of the result is saved as a png file. This feature is intended to help users understand the properties of a dataset more easily.
+
+Explorer is a feature that operates on a hash basis. Once you provide a dataset to use as the database, Explorer calculates a hash for every dataset item in it. Currently, the hash of each item is computed with CLIP ([article](https://arxiv.org/abs/2103.00020)), which supports both image and text modalities. The supported model format is OpenVINO IR, and the models are uploaded to the [openvinotoolkit storage](https://storage.openvinotoolkit.org/repositories/datumaro/models/). When you create an Explorer instance, hash computation for the whole dataset starts. For the database, we use the hash of each dataset item's image. Through CLIP, we extract the image feature, convert it to binary values, and pack the elements into bits. Each hash is saved as a `HashKey` in the annotations. Hence, once you call Explorer for a dataset, every dataset item in it has a `HashKey` in its annotations.
+
+To explore similar data in a dataset, you need to set a query first. The query can be an image, a text, a list of images, a list of texts, or a list mixing images and texts. The query does not need to be an image that exists in the dataset; you can use any data for which you want to find similar items. You also need to set top-k, the number of similar items to find. The default value for top-k is 10, so if you want a different number of results, set top-k explicitly. For a single query, we compute the Hamming distance between the hash of the query and the hash of every item in the dataset, sort those distances, and select the top-k items with the shortest distance. For a list query, we repeat the distance computation for each query and select the top-k items based on distance across the whole dataset.
+
+The command can be applied to a dataset. If you want to use multiple datasets as the database, you can use a merged dataset. The current project (`-p/--project`) is also used as a context for plugins, so it can be useful for dataset paths having custom formats. When not specified, the current project's working tree is used. Saving the visualized result (`-s/--save`) is turned on by default. The visualized result is based on [Visualizer](../../jupyter_notebook_examples/visualizer).
+
+Usage:
+``` bash
+datum explore [-q <path/to/image.jpg> or <text_query>] [-topk TOPK]
+```
+
+Parameters:
+- `-q, --query` (string) - Image path or text to use as query.
+- `-topk` (int) - Number of similar items to find.
+- `-p, --project` (string) - Directory of the project to operate on (default: current directory).
+- `-s, --save` (bool) - Save visualized result of similar dataset.
+
+Examples:
+- Use image query
+```bash
+datum project create <...>
+datum project import -f datumaro <path/to/dataset/>
+datum explore -q path/to/image.jpg -topk 10
+```
+- Use text query
+```bash
+datum project create <...>
+datum project import -f datumaro <path/to/dataset/>
+datum explore -q elephant -topk 10
+```
+- Use list of images query
+```bash
+datum project create <...>
+datum project import -f datumaro <path/to/dataset/>
+datum explore -q path/to/image1.jpg path/to/image2.jpg path/to/image3.jpg -topk 50
+```
+- Use list of texts query
+```bash
+datum project create <...>
+datum project import -f datumaro <path/to/dataset/>
+datum explore -q motorcycle bus train -topk 50
+```
@@ -2,4 +2,60 @@
 Level 9: Dataset Exploration from a Query Image/Text
 =====================================================
 
-TBD
+
+Datumaro supports an exploration feature that finds data similar to a query within a dataset. Given a query, the exploration result includes the top-k most similar items in the dataset.
+Through this feature, you can figure out the properties of a dataset. You can check the visualized exploration result using `Visualizer`.
+
+More detailed descriptions of the explorer are given in :ref:`Explore`.
+The Python example for the usage of explorer is described in :doc:`here <../../jupyter_notebook_examples/notebooks/07_data_explorer>`.
+
+
+.. tab-set::
+
+    .. tab-item:: Python
+
+        With Python API, we can explore similar items as below
+
+        .. code-block:: python
+
+            from datumaro.components.dataset import Dataset
+            from datumaro.components.environment import Environment
+            from datumaro.components.explorer import Explorer
+
+            data_path = '/path/to/data'
+
+            env = Environment()
+            detected_formats = env.detect_dataset(data_path)
+
+            dataset = Dataset.import_from(data_path, detected_formats[0])
+
+            explorer = Explorer(dataset)
+            query = '/path/to/image/file'
+            topk = 20
+            topk_result = explorer.explore_topk(query, topk)
+
+    .. tab-item:: ProjectCLI
+
+        With the project-based CLI, we first need to create a project with
+
+        .. code-block:: bash
+
+            datum project create -o <path/to/project>
+
+        We now import data into the project through
+
+        .. code-block:: bash
+
+            datum project import --project <path/to/project> <path/to/data>
+
+        We can explore similar items for the query
+
+        .. code-block:: bash
+
+            datum explore -q QUERY -topk TOPK_NUM -p <path/to/project>
+
+        ``QUERY`` can be an image file path, a text description, or a list of them.
+
+        ``TOPK_NUM`` is an integer specifying the number of similar results to find for the query.
+
+        The exploration result is printed to the log, and the visualized result is saved as ``explorer.png``.
@@ -1,7 +1,7 @@
-Search module
+Explore module
 ==============
 
-.. automodule:: datumaro.cli.commands.search
+.. automodule:: datumaro.cli.commands.explore
    :members:
    :undoc-members:
    :show-inheritance:
@@ -1,7 +1,7 @@
-Searcher module
+Explorer module
 ===============
 
-.. automodule:: datumaro.components.searcher
+.. automodule:: datumaro.components.explorer
    :members:
    :undoc-members:
    :show-inheritance: