Clip (#219)

* wip: init image embeddings * new: add clip * new: fix clip text embedding * fix: fix image parallel * fix: add test images * fix: add PIL * fix: fix generics * fix: fix sparse worker * fix: fix image test path * fix: replace models repo * new: follow-up for onnx providers and local_files_only option * fix: add types, refactor a bit * refactoring: move onnxprovider type alias to types * fix: fix type alias import
qdrant · May 9, 2024 · 99164c9 · 99164c9
1 parent 6ecab6d
commit 99164c9
Show file tree

Hide file tree

Showing 28 changed files with 920 additions and 120 deletions.
diff --git a/fastembed/__init__.py b/fastembed/__init__.py
@@ -1,12 +1,15 @@
 import importlib.metadata
 
+from fastembed.image import ImageEmbedding
 from fastembed.text import TextEmbedding
 from fastembed.sparse import SparseTextEmbedding, SparseEmbedding
 
+
 try:
     version = importlib.metadata.version("fastembed")
 except importlib.metadata.PackageNotFoundError as _:
     version = importlib.metadata.version("fastembed-gpu")
 
 __version__ = version
-__all__ = ["TextEmbedding", "SparseTextEmbedding", "SparseEmbedding"]
+__all__ = ["TextEmbedding", "SparseTextEmbedding", "SparseEmbedding", "ImageEmbedding"]
+
diff --git a/fastembed/common/__init__.py b/fastembed/common/__init__.py
@@ -1,3 +1,3 @@
-from fastembed.common.onnx_model import OnnxProvider
+from fastembed.common.types import OnnxProvider, ImageInput, PathInput
 
-__all__ = ["OnnxProvider"]
+__all__ = ["OnnxProvider", "ImageInput", "PathInput"]
diff --git a/fastembed/common/model_management.py b/fastembed/common/model_management.py
@@ -108,6 +108,7 @@ def download_files_from_huggingface(
             "tokenizer.json",
             "tokenizer_config.json",
             "special_tokens_map.json",
+            "preprocessor_config.json",
         ]
         if extra_patterns is not None:
             allow_patterns.extend(extra_patterns)

diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py
@@ -1,33 +1,16 @@
-import os
-from multiprocessing import get_all_start_methods
 from pathlib import Path
-from typing import (
-    Any,
-    Dict,
-    Generic,
-    Iterable,
-    List,
-    Optional,
-    Tuple,
-    Type,
-    TypeVar,
-    Union,
-    Sequence,
-)
+from typing import Any, Dict, Generic, Iterable, Optional, Tuple, Type, TypeVar, Sequence
 
 import numpy as np
 import onnxruntime as ort
 
-from fastembed.common.models import load_tokenizer
-from fastembed.common.utils import iter_batch
-from fastembed.parallel_processor import ParallelWorkerPool, Worker
+from fastembed.common.types import OnnxProvider
+from fastembed.parallel_processor import Worker
 
 
 # Holds type of the embedding result
 T = TypeVar("T")
 
-OnnxProvider = Union[str, Tuple[str, Dict[Any, Any]]]
-
 
 class OnnxModel(Generic[T]):
     @classmethod
@@ -76,65 +59,12 @@ def load_onnx_model(
             so.intra_op_num_threads = threads
             so.inter_op_num_threads = threads
 
-        self.tokenizer = load_tokenizer(model_dir=model_dir)
         self.model = ort.InferenceSession(
             str(model_path), providers=onnx_providers, sess_options=so
         )
 
-    def onnx_embed(self, documents: List[str]) -> Tuple[np.ndarray, np.ndarray]:
-        encoded = self.tokenizer.encode_batch(documents)
-        input_ids = np.array([e.ids for e in encoded])
-        attention_mask = np.array([e.attention_mask for e in encoded])
-
-        onnx_input = {
-            "input_ids": np.array(input_ids, dtype=np.int64),
-            "attention_mask": np.array(attention_mask, dtype=np.int64),
-            "token_type_ids": np.array(
-                [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64
-            ),
-        }
-
-        onnx_input = self._preprocess_onnx_input(onnx_input)
-
-        model_output = self.model.run(None, onnx_input)
-        embeddings = model_output[0]
-        return embeddings, attention_mask
-
-    def _embed_documents(
-        self,
-        model_name: str,
-        cache_dir: str,
-        documents: Union[str, Iterable[str]],
-        batch_size: int = 256,
-        parallel: Optional[int] = None,
-    ) -> Iterable[T]:
-        is_small = False
-
-        if isinstance(documents, str):
-            documents = [documents]
-            is_small = True
-
-        if isinstance(documents, list):
-            if len(documents) < batch_size:
-                is_small = True
-
-        if parallel == 0:
-            parallel = os.cpu_count()
-
-        if parallel is None or is_small:
-            for batch in iter_batch(documents, batch_size):
-                yield from self._post_process_onnx_output(self.onnx_embed(batch))
-        else:
-            start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
-            params = {
-                "model_name": model_name,
-                "cache_dir": cache_dir,
-            }
-            pool = ParallelWorkerPool(
-                parallel, self._get_worker_class(), start_method=start_method
-            )
-            for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
-                yield from self._post_process_onnx_output(batch)
+    def onnx_embed(self, *args, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
+        raise NotImplementedError("Subclasses must implement this method")
 
 
 class EmbeddingWorker(Worker):
@@ -160,6 +90,4 @@ def start(cls, model_name: str, cache_dir: str, **kwargs: Any) -> "EmbeddingWork
         )
 
     def process(self, items: Iterable[Tuple[int, Any]]) -> Iterable[Tuple[int, Any]]:
-        for idx, batch in items:
-            embeddings, attn_mask = self.model.onnx_embed(batch)
-            yield idx, (embeddings, attn_mask)
+        raise NotImplementedError("Subclasses must implement this method")
diff --git a/fastembed/common/models.py → fastembed/common/preprocessor_utils.py b/fastembed/common/models.py → fastembed/common/preprocessor_utils.py
@@ -1,9 +1,10 @@
 import json
 from pathlib import Path
 
-import numpy as np
 from tokenizers import Tokenizer, AddedToken
 
+from fastembed.image.transform.operators import Compose
+
 
 def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
     config_path = model_dir / "config.json"
@@ -46,9 +47,12 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
     return tokenizer
 
 
-def normalize(input_array, p=2, dim=1, eps=1e-12) -> np.ndarray:
-    # Calculate the Lp norm along the specified dimension
-    norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
-    norm = np.maximum(norm, eps)  # Avoid division by zero
-    normalized_array = input_array / norm
-    return normalized_array
+def load_preprocessor(model_dir: Path) -> Compose:
+    preprocessor_config_path = model_dir / "preprocessor_config.json"
+    if not preprocessor_config_path.exists():
+        raise ValueError(f"Could not find preprocessor_config.json in {model_dir}")
+
+    with open(str(preprocessor_config_path)) as preprocessor_config_file:
+        preprocessor_config = json.load(preprocessor_config_file)
+        transforms = Compose.from_config(preprocessor_config)
+    return transforms
diff --git a/fastembed/common/types.py b/fastembed/common/types.py
@@ -0,0 +1,14 @@
+import os
+import sys
+from typing import Union, Iterable, Tuple, Dict, Any
+
+if sys.version_info >= (3, 10):
+    from typing import TypeAlias
+else:
+    from typing_extensions import TypeAlias
+
+
+PathInput: TypeAlias = Union[str, os.PathLike]
+ImageInput: TypeAlias = Union[PathInput, Iterable[PathInput]]
+
+OnnxProvider: TypeAlias = Union[str, Tuple[str, Dict[Any, Any]]]
diff --git a/fastembed/common/utils.py b/fastembed/common/utils.py
@@ -4,6 +4,16 @@
 from pathlib import Path
 from typing import Union, Iterable, Generator, Optional
 
+import numpy as np
+
+
+def normalize(input_array, p=2, dim=1, eps=1e-12) -> np.ndarray:
+    # Calculate the Lp norm along the specified dimension
+    norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
+    norm = np.maximum(norm, eps)  # Avoid division by zero
+    normalized_array = input_array / norm
+    return normalized_array
+
 
 def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
     """

diff --git a/fastembed/image/__init__.py b/fastembed/image/__init__.py
@@ -0,0 +1,4 @@
+from fastembed.image.image_embedding import ImageEmbedding
+
+
+__all__ = ["ImageEmbedding"]
diff --git a/fastembed/image/image_embedding.py b/fastembed/image/image_embedding.py
@@ -0,0 +1,87 @@
+from typing import Any, Dict, Iterable, List, Optional, Type, Sequence
+
+import numpy as np
+
+from fastembed.common import ImageInput, OnnxProvider
+from fastembed.image.image_embedding_base import ImageEmbeddingBase
+from fastembed.image.onnx_embedding import OnnxImageEmbedding
+
+
+class ImageEmbedding(ImageEmbeddingBase):
+    EMBEDDINGS_REGISTRY: List[Type[ImageEmbeddingBase]] = [OnnxImageEmbedding]
+
+    @classmethod
+    def list_supported_models(cls) -> List[Dict[str, Any]]:
+        """
+        Lists the supported models.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing the model information.
+
+            Example:
+                ```
+                [
+                    {
+                        "model": "Qdrant/clip-ViT-B-32-vision",
+                        "dim": 512,
+                        "description": "CLIP vision encoder based on ViT-B/32",
+                        "size_in_GB": 0.33,
+                        "sources": {
+                            "hf": "Qdrant/clip-ViT-B-32-vision",
+                        },
+                        "model_file": "model.onnx",
+                    }
+                ]
+                ```
+        """
+        result = []
+        for embedding in cls.EMBEDDINGS_REGISTRY:
+            result.extend(embedding.list_supported_models())
+        return result
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        providers: Optional[Sequence[OnnxProvider]] = None,
+        **kwargs,
+    ):
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+
+        for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
+            supported_models = EMBEDDING_MODEL_TYPE.list_supported_models()
+            if any(model_name.lower() == model["model"].lower() for model in supported_models):
+                self.model = EMBEDDING_MODEL_TYPE(
+                    model_name, cache_dir, threads, providers=providers, **kwargs
+                )
+                return
+
+        raise ValueError(
+            f"Model {model_name} is not supported in TextEmbedding."
+            "Please check the supported models using `TextEmbedding.list_supported_models()`"
+        )
+
+    def embed(
+        self,
+        images: ImageInput,
+        batch_size: int = 16,
+        parallel: Optional[int] = None,
+        **kwargs,
+    ) -> Iterable[np.ndarray]:
+        """
+        Encode a list of documents into list of embeddings.
+        We use mean pooling with attention so that the model can handle variable-length inputs.
+
+        Args:
+            images: Iterator of image paths or single image path to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+        Returns:
+            List of embeddings, one per document
+        """
+        yield from self.model.embed(images, batch_size, parallel, **kwargs)
diff --git a/fastembed/image/image_embedding_base.py b/fastembed/image/image_embedding_base.py
@@ -0,0 +1,39 @@
+from typing import Iterable, Optional
+
+import numpy as np
+
+from fastembed.common.model_management import ModelManagement
+from fastembed.common.types import ImageInput
+
+
+class ImageEmbeddingBase(ModelManagement):
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.cache_dir = cache_dir
+        self.threads = threads
+        self._local_files_only = kwargs.pop("local_files_only", False)
+
+    def embed(
+        self,
+        images: ImageInput,
+        batch_size: int = 16,
+        parallel: Optional[int] = None,
+        **kwargs,
+    ) -> Iterable[np.ndarray]:
+        """
+        Embeds a list of images into a list of embeddings.
+
+        Args:
+            images - The list of image paths to preprocess and embed.
+            **kwargs: Additional keyword argument to pass to the embed method.
+
+        Yields:
+            Iterable[np.ndarray]: The embeddings.
+        """
+        raise NotImplementedError()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		from fastembed.image.image_embedding import ImageEmbedding


		__all__ = ["ImageEmbedding"]