From 87bfae15bafafe0a83b517cc2b29980ca2a8765c Mon Sep 17 00:00:00 2001
From: George Panchuk
Date: Wed, 18 Dec 2024 17:58:08 +0100
Subject: [PATCH 01/17] wip: design draft

---
 fastembed/image/image_embedding.py            |   3 +-
 .../late_interaction_multimodal/colpali.py    | 266 ++++++++++++++++++
 .../late_interaction_multimodal_embedding.py  | 123 ++++++++
 ...e_interaction_multimodal_embedding_base.py |  65 +++++
 .../onnx_multimodal_model.py                  | 237 ++++++++++++++++
 5 files changed, 692 insertions(+), 2 deletions(-)
 create mode 100644 fastembed/late_interaction_multimodal/colpali.py
 create mode 100644 fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
 create mode 100644 fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
 create mode 100644 fastembed/late_interaction_multimodal/onnx_multimodal_model.py

diff --git a/fastembed/image/image_embedding.py b/fastembed/image/image_embedding.py
index aa4c91b4..23d39a3e 100644
--- a/fastembed/image/image_embedding.py
+++ b/fastembed/image/image_embedding.py
@@ -80,8 +80,7 @@ def embed(
         **kwargs,
     ) -> Iterable[np.ndarray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
+        Encode a list of images into list of embeddings.
 
         Args:
             images: Iterator of image paths or single image path to embed

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
new file mode 100644
index 00000000..d3508194
--- /dev/null
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -0,0 +1,266 @@
+from typing import Any, Iterable, Optional, Sequence, Type, Union
+
+import numpy as np
+from tokenizers import Encoding
+
+from fastembed.common import OnnxProvider, ImageInput
+from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.utils import define_cache_dir
+from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
+    LateInteractionMultimodalEmbeddingBase,
+)
+from fastembed.late_interaction_multimodal.onnx_multimodal_model import (
+    OnnxMultimodalModel,
+    TextEmbeddingWorker,
+    ImageEmbeddingWorker,
+)
+
+
+supported_colbert_models = [
+    {
+        "model": "colpali",
+        "dim": ...,
+        "description": "Late interaction model",
+        "license": "mit",
+        "size_in_GB": 6.06,
+        "sources": {
+            "hf": "colpali",
+        },
+        "model_file": "model.onnx",
+    },
+]
+
+
+class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.ndarray]):
+    DOCUMENT_MARKER_TOKEN_ID = 2
+    QUERY_PREFIX = "Query: "
+    BOS_TOKEN = ""
+    PAD_TOKEN = ""
+    QUERY_MARKER_TOKEN_ID = [2, 9413]
+    IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
+    EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
+    EVEN_ATTENTION_MASK = np.array([1] * 1030)
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        providers: Optional[Sequence[OnnxProvider]] = None,
+        cuda: bool = False,
+        device_ids: Optional[list[int]] = None,
+        lazy_load: bool = False,
+        device_id: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the cache directory.
+                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                Defaults to `fastembed_cache` in the system's temp directory.
+            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
+                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
+            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to False.
+            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
+            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+        self.providers = providers
+        self.lazy_load = lazy_load
+
+        # List of device ids, that can be used for data parallel processing in workers
+        self.device_ids = device_ids
+        self.cuda = cuda
+
+        # This device_id will be used if we need to load model in current process
+        if device_id is not None:
+            self.device_id = device_id
+        elif self.device_ids is not None:
+            self.device_id = self.device_ids[0]
+        else:
+            self.device_id = None
+
+        self.model_description = self._get_model_description(model_name)
+        self.cache_dir = define_cache_dir(cache_dir)
+
+        self._model_dir = self.download_model(
+            self.model_description, self.cache_dir, local_files_only=self._local_files_only
+        )
+        self.mask_token_id = None
+        self.pad_token_id = None
+        self.skip_list = set()
+
+        if not self.lazy_load:
+            self.load_onnx_model()
+
+    @classmethod
+    def list_supported_models(cls) -> list[dict[str, Any]]:
+        """Lists the supported models.
+
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing the model information.
+        """
+        return supported_colbert_models
+
+    def load_onnx_model(self) -> None:
+        self._load_onnx_model(
+            model_dir=self._model_dir,
+            model_file=self.model_description["model_file"],
+            threads=self.threads,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_id=self.device_id,
+        )
+
+    def _post_process_onnx_image_output(
+        self,
+        output: OnnxOutputContext,
+    ) -> Iterable[np.ndarray]:
+        """
+        Post-process the ONNX model output to convert it into a usable format.
+
+        Args:
+            output (OnnxOutputContext): The raw output from the ONNX model.
+
+        Returns:
+            Iterable[np.ndarray]: Post-processed output as NumPy arrays.
+        """
+        return output.model_output.astype(np.float32)
+
+    def _post_process_onnx_text_output(
+        self,
+        output: OnnxOutputContext,
+    ) -> Iterable[np.ndarray]:
+        """
+        Post-process the ONNX model output to convert it into a usable format.
+
+        Args:
+            output (OnnxOutputContext): The raw output from the ONNX model.
+
+        Returns:
+            Iterable[np.ndarray]: Post-processed output as NumPy arrays.
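+            One multivector (a vector per token) is kept for each document; late-interaction models apply no pooling.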
+ """ + return output.model_output.astype(np.float32) + + def tokenize(self, documents: list[str], **_) -> list[Encoding]: + texts_query: list[str] = [] + + for query in documents: + query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10 + query += "\n" + + texts_query.append(query) + encoded = self.tokenizer.encode_batch(documents) + return encoded + + def _preprocess_onnx_text_input( + self, onnx_input: dict[str, np.ndarray], **kwargs + ) -> dict[str, np.ndarray]: + onnx_input["input_ids"] = np.array( + [self.QUERY_MARKER_TOKEN_ID + input_ids[2:] for input_ids in onnx_input["input_ids"]] + ) + return onnx_input + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + + Args: + documents: Iterator of documents or single document to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self._embed_documents( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + documents=documents, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + **kwargs, + ) + + def embed_images( + self, + images: ImageInput, + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of images into list of embeddings. + + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. 
+ + Returns: + List of embeddings, one per document + """ + yield from self._embed_images( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + images=images, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + **kwargs, + ) + + @classmethod + def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker]: + return ColPaliTextEmbeddingWorker + + @classmethod + def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker]: + return ColPaliImageEmbeddingWorker + + +class ColPaliTextEmbeddingWorker(TextEmbeddingWorker): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs) -> ColPali: + return ColPali( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) + + +class ColPaliImageEmbeddingWorker(ImageEmbeddingWorker): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs) -> ColPali: + return ColPali( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py new file mode 100644 index 00000000..3d35c52f --- /dev/null +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -0,0 +1,123 @@ +from typing import Any, Iterable, Optional, Sequence, Type, Union + +import numpy as np + +from fastembed.common import OnnxProvider, ImageInput +from fastembed.late_interaction_multimodal.colpali import ColPali + +from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( + LateInteractionMultimodalEmbeddingBase, +) + + +class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): + EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali] + + @classmethod + def list_supported_models(cls) -> list[dict[str, Any]]: + """ + Lists the supported models. + + Returns: + list[dict[str, Any]]: A list of dictionaries containing the model information. + + Example: + ``` + [ + { + "model": "colpali", + "dim": ..., + "description": "Late interaction model", + "license": "mit", + "size_in_GB": 6.06, + "sources": { + "hf": "colpali", + }, + "model_file": "model.onnx", + }, + ] + ``` + """ + result = [] + for embedding in cls.EMBEDDINGS_REGISTRY: + result.extend(embedding.list_supported_models()) + return result + + def __init__( + self, + model_name: str, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_ids: Optional[list[int]] = None, + lazy_load: bool = False, + **kwargs, + ): + super().__init__(model_name, cache_dir, threads, **kwargs) + for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: + supported_models = EMBEDDING_MODEL_TYPE.list_supported_models() + if any(model_name.lower() == model["model"].lower() for model in supported_models): + self.model = EMBEDDING_MODEL_TYPE( + model_name, + cache_dir, + threads=threads, + providers=providers, + cuda=cuda, + device_ids=device_ids, + lazy_load=lazy_load, + **kwargs, + ) + return + + raise ValueError( + f"Model {model_name} is not supported in LateInteractionMultimodalEmbedding." 
+ "Please check the supported models using `LateInteractionMultimodalEmbedding.list_supported_models()`" + ) + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + + Args: + documents: Iterator of documents or single document to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self.model.embed_text(documents, batch_size, parallel, **kwargs) + + def embed_image( + self, + images: ImageInput, + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + We use mean pooling with attention so that the model can handle variable-length inputs. + + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self.model.embed_image(images, batch_size, parallel, **kwargs) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py new file mode 100644 index 00000000..cc1a929b --- /dev/null +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py @@ -0,0 +1,65 @@ +from typing import Iterable, Optional, Union + +import numpy as np + +from fastembed.common import ImageInput +from fastembed.common.model_management import ModelManagement + + +class LateInteractionMultimodalEmbeddingBase(ModelManagement): + def __init__( + self, + model_name: str, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + **kwargs, + ): + self.model_name = model_name + self.cache_dir = cache_dir + self.threads = threads + self._local_files_only = kwargs.pop("local_files_only", False) + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Embeds a list of documents into a list of embeddings. + + Args: + documents (Iterable[str]): The list of texts to embed. + batch_size (int) - ... + parallel (Optional[int]) - ... + **kwargs: Additional keyword argument to pass to the embed method. + + Yields: + Iterable[np.ndarray]: The embeddings. + """ + raise NotImplementedError() + + def embed_image( + self, + images: ImageInput, + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + We use mean pooling with attention so that the model can handle variable-length inputs. 
+ + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + raise NotImplementedError() diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py new file mode 100644 index 00000000..0557a92e --- /dev/null +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -0,0 +1,237 @@ +import contextlib +import os +from multiprocessing import get_all_start_methods +from pathlib import Path +from typing import Any, Iterable, Optional, Sequence, Type, Union + +import numpy as np +from PIL import Image +from tokenizers import Encoding + +from fastembed.common import OnnxProvider, ImageInput +from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T +from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor +from fastembed.common.utils import iter_batch +from fastembed.parallel_processor import ParallelWorkerPool + + +class OnnxMultimodalModel(OnnxModel[T]): + ONNX_OUTPUT_NAMES: Optional[list[str]] = None + + def __init__(self) -> None: + super().__init__() + self.tokenizer = None + self.processor = None + self.special_token_to_id = {} + + def _preprocess_onnx_text_input( + self, onnx_input: dict[str, np.ndarray], **kwargs + ) -> dict[str, np.ndarray]: + """ + Preprocess the onnx input. + """ + return onnx_input + + def _preprocess_onnx_image_input( + self, onnx_input: dict[str, np.ndarray], **kwargs + ) -> dict[str, np.ndarray]: + """ + Preprocess the onnx input. 
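+        Override in subclasses to inject model-specific inputs; ColPali uses this hook later in the series to add text placeholders for image-only batches.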
+ """ + return onnx_input + + @classmethod + def _get_text_worker_class(cls) -> Type["TextEmbeddingWorker"]: + raise NotImplementedError("Subclasses must implement this method") + + @classmethod + def _get_image_worker_class(cls) -> Type["ImageEmbeddingWorker"]: + raise NotImplementedError("Subclasses must implement this method") + + def _post_process_onnx_image_output(self, output: OnnxOutputContext) -> Iterable[T]: + raise NotImplementedError("Subclasses must implement this method") + + def _post_process_onnx_text_output(self, output: OnnxOutputContext) -> Iterable[T]: + raise NotImplementedError("Subclasses must implement this method") + + def _load_onnx_model( + self, + model_dir: Path, + model_file: str, + threads: Optional[int], + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_id: Optional[int] = None, + ) -> None: + super()._load_onnx_model( + model_dir=model_dir, + model_file=model_file, + threads=threads, + providers=providers, + cuda=cuda, + device_id=device_id, + ) + self.tokenizer, self.special_token_to_id = load_tokenizer(model_dir=model_dir) + self.processor = load_preprocessor(model_dir=model_dir) + + def load_onnx_model(self) -> None: + raise NotImplementedError("Subclasses must implement this method") + + def tokenize(self, documents: list[str], **kwargs) -> list[Encoding]: + return self.tokenizer.encode_batch(documents) + + def onnx_embed_text( + self, + documents: list[str], + **kwargs, + ) -> OnnxOutputContext: + encoded = self.tokenize(documents, **kwargs) + input_ids = np.array([e.ids for e in encoded]) + attention_mask = np.array([e.attention_mask for e in encoded]) + input_names = {node.name for node in self.model.get_inputs()} + onnx_input = { + "input_ids": np.array(input_ids, dtype=np.int64), + } + if "attention_mask" in input_names: + onnx_input["attention_mask"] = np.array(attention_mask, dtype=np.int64) + if "token_type_ids" in input_names: + onnx_input["token_type_ids"] = np.array( + [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64 + ) + + onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs) + + model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=onnx_input.get("attention_mask", attention_mask), + input_ids=onnx_input.get("input_ids", input_ids), + ) + + def _embed_documents( + self, + model_name: str, + cache_dir: str, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_ids: Optional[list[int]] = None, + **kwargs, + ) -> Iterable[T]: + is_small = False + + if isinstance(documents, str): + documents = [documents] + is_small = True + + if isinstance(documents, list): + if len(documents) < batch_size: + is_small = True + + if parallel is None or is_small: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() + for batch in iter_batch(documents, batch_size): + yield from self._post_process_onnx_text_output(self.onnx_embed_text(batch)) + else: + if parallel == 0: + parallel = os.cpu_count() + + start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" + params = { + "model_name": model_name, + "cache_dir": cache_dir, + "providers": providers, + **kwargs, + } + + pool = ParallelWorkerPool( + num_workers=parallel or 1, + worker=self._get_text_worker_class(), + cuda=cuda, + device_ids=device_ids, + start_method=start_method, 
+            )
+            for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
+                yield from self._post_process_onnx_text_output(batch)
+
+    def _build_onnx_image_input(self, encoded: np.ndarray) -> dict[str, np.ndarray]:
+        return {node.name: encoded for node in self.model.get_inputs()}
+
+    def onnx_embed_image(self, images: list[ImageInput], **kwargs) -> OnnxOutputContext:
+        with contextlib.ExitStack():
+            image_files = [
+                Image.open(image) if not isinstance(image, Image.Image) else image
+                for image in images
+            ]
+            encoded = self.processor(image_files)
+        onnx_input = self._build_onnx_image_input(encoded)
+        onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs)
+        model_output = self.model.run(None, onnx_input)
+        embeddings = model_output[0].reshape(len(images), -1)
+        return OnnxOutputContext(model_output=embeddings)
+
+    def _embed_images(
+        self,
+        model_name: str,
+        cache_dir: str,
+        images: ImageInput,
+        batch_size: int = 256,
+        parallel: Optional[int] = None,
+        providers: Optional[Sequence[OnnxProvider]] = None,
+        cuda: bool = False,
+        device_ids: Optional[list[int]] = None,
+        **kwargs,
+    ) -> Iterable[T]:
+        is_small = False
+
+        if isinstance(images, (str, Path, Image.Image)):
+            images = [images]
+            is_small = True
+
+        if isinstance(images, list) and len(images) < batch_size:
+            is_small = True
+
+        if parallel is None or is_small:
+            if not hasattr(self, "model") or self.model is None:
+                self.load_onnx_model()
+
+            for batch in iter_batch(images, batch_size):
+                yield from self._post_process_onnx_image_output(self.onnx_embed_image(batch))
+        else:
+            if parallel == 0:
+                parallel = os.cpu_count()
+
+            start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
+            params = {
+                "model_name": model_name,
+                "cache_dir": cache_dir,
+                "providers": providers,
+                **kwargs,
+            }
+
+            pool = ParallelWorkerPool(
+                num_workers=parallel or 1,
+                worker=self._get_image_worker_class(),
+                cuda=cuda,
+                device_ids=device_ids,
+                start_method=start_method,
+            )
+            for batch in pool.ordered_map(iter_batch(images, batch_size), **params):
+                yield from self._post_process_onnx_image_output(batch)
+
+
+class TextEmbeddingWorker(EmbeddingWorker):
+    def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
+        for idx, batch in items:
+            onnx_output = self.model.onnx_embed_text(batch)
+            yield idx, onnx_output
+
+
+class ImageEmbeddingWorker(EmbeddingWorker):
+    def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
+        for idx, batch in items:
+            embeddings = self.model.onnx_embed_image(batch)
+            yield idx, embeddings

From b8cda6828c8b287c1783fec1dc4acc41c0c0b0b8 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Thu, 19 Dec 2024 23:21:06 +0100
Subject: [PATCH 02/17] Operators fix

---
 fastembed/image/transform/operators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py
index bac65e08..9701e2ae 100644
--- a/fastembed/image/transform/operators.py
+++ b/fastembed/image/transform/operators.py
@@ -139,7 +139,7 @@ def _get_convert_to_rgb(transforms: list[Transform], config: dict[str, Any]):
     @classmethod
     def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]):
         mode = config.get("image_processor_type", "CLIPImageProcessor")
-        if mode == "CLIPImageProcessor":
+        if mode in ("CLIPImageProcessor", "SiglipImageProcessor"):
             if config.get("do_resize", False):
                 size = config["size"]
                 if "shortest_edge" in size:
@@ -202,7 +202,7 @@ def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]):
     @staticmethod
     def _get_center_crop(transforms: list[Transform], config: dict[str, Any]):
         mode = config.get("image_processor_type", "CLIPImageProcessor")
-        if mode == "CLIPImageProcessor":
+        if mode in ("CLIPImageProcessor", "SiglipImageProcessor"):
             if config.get("do_center_crop", False):
                 crop_size = config["crop_size"]
                 if isinstance(crop_size, int):

From a7aa9c335c6dacd5c85b55fed3dd8b479074bbf6 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 20 Dec 2024 12:35:17 +0100
Subject: [PATCH 03/17] Fix model inputs

---
 .../late_interaction_multimodal/colpali.py | 52 ++++++++++++++-----
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index d3508194..f8780cb9 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -16,18 +16,24 @@
 )
 
 
-supported_colbert_models = [
+supported_colpali_models = [
     {
-        "model": "colpali",
-        "dim": ...,
-        "description": "Late interaction model",
+        "model": "akshayballal/colpali-v1.2-merged",
+        "dim": 128,
+        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
         "license": "mit",
-        "size_in_GB": 6.06,
+        "size_in_GB": 6.08,
         "sources": {
-            "hf": "colpali",
+            "hf": "akshayballal/colpali-v1.2-merged-onnx",
         },
+        "additional_files": [
+            "model.onnx_data",
+            "tokenizer.json",
+            "tokenizer_config.json",
+            "config.json",
+        ],
         "model_file": "model.onnx",
-    },
+    }
 ]
@@ -110,7 +116,7 @@ def list_supported_models(cls) -> list[dict[str, Any]]:
         Returns:
             list[dict[str, Any]]: A list of dictionaries containing the model information.
         """
-        return supported_colbert_models
+        return supported_colpali_models
 
     def load_onnx_model(self) -> None:
         self._load_onnx_model(
@@ -135,7 +141,7 @@ def _post_process_onnx_image_output(
         Returns:
             Iterable[np.ndarray]: Post-processed output as NumPy arrays.
         """
-        return output.model_output.astype(np.float32)
+        return output.model_output.reshape(output.model_output.shape[0], -1, self.model_description['dim']).astype(np.float32)
 
     def _post_process_onnx_text_output(
         self,
@@ -154,20 +160,42 @@ def _post_process_onnx_text_output(
 
     def tokenize(self, documents: list[str], **_) -> list[Encoding]:
         texts_query: list[str] = []
-
         for query in documents:
             query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
             query += "\n"
 
             texts_query.append(query)
-        encoded = self.tokenizer.encode_batch(documents)
+        encoded = self.tokenizer.encode_batch(texts_query)
         return encoded
 
     def _preprocess_onnx_text_input(
         self, onnx_input: dict[str, np.ndarray], **kwargs
     ) -> dict[str, np.ndarray]:
         onnx_input["input_ids"] = np.array(
-            [self.QUERY_MARKER_TOKEN_ID + input_ids[2:] for input_ids in onnx_input["input_ids"]]
+            [self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() for input_ids in onnx_input["input_ids"]]
+        )
+        empty_image_placeholder = np.zeros(self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32)
+        onnx_input["pixel_values"] = np.array(
+            [empty_image_placeholder for _ in onnx_input["input_ids"]]
         )
         return onnx_input
 
+    def _preprocess_onnx_image_input(
+        self, onnx_input: dict[str, np.ndarray], **kwargs
+    ) -> dict[str, np.ndarray]:
+        """
+        Add placeholders for text input when processing image data for ONNX.
+        Args:
+            onnx_input (Dict[str, np.ndarray]): Preprocessed image inputs.
+            **kwargs: Additional arguments.
+        Returns:
+            Dict[str, np.ndarray]: ONNX input with text placeholders.
+        """
+        onnx_input["input_ids"] = np.array(
+            [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["input_ids"]]
+        )
+        onnx_input["attention_mask"] = np.array(
+            [self.EVEN_ATTENTION_MASK for _ in onnx_input["input_ids"]]
+        )
+        return onnx_input

From ade31c5b709fff669ead51fd4c6f36d8cc1aa633 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 20 Dec 2024 12:56:33 +0100
Subject: [PATCH 04/17] Import from fastembed.late_interaction_multimodal

---
 fastembed/late_interaction_multimodal/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 fastembed/late_interaction_multimodal/__init__.py

diff --git a/fastembed/late_interaction_multimodal/__init__.py b/fastembed/late_interaction_multimodal/__init__.py
new file mode 100644
index 00000000..e23c1e28
--- /dev/null
+++ b/fastembed/late_interaction_multimodal/__init__.py
@@ -0,0 +1,5 @@
+from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding import (
+    LateInteractionMultimodalEmbedding,
+)
+
+__all__ = ["LateInteractionMultimodalEmbedding"]
\ No newline at end of file

From 5cc0a7eb83e78384d823e9675aed1241a64f1d06 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 20 Dec 2024 13:02:18 +0100
Subject: [PATCH 05/17] Fixed method misspelling

---
 fastembed/late_interaction_multimodal/colpali.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index f8780cb9..222e843b 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -141,7 +141,9 @@ def _post_process_onnx_image_output(
 
         Returns:
             Iterable[np.ndarray]: Post-processed output as NumPy arrays.
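+            The flat model output is reshaped to (batch, n_tokens, dim) before casting.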
""" - return output.model_output.reshape(output.model_output.shape[0], -1, self.model_description['dim']).astype(np.float32) + return output.model_output.reshape( + output.model_output.shape[0], -1, self.model_description["dim"] + ).astype(np.float32) def _post_process_onnx_text_output( self, @@ -172,7 +174,10 @@ def _preprocess_onnx_text_input( self, onnx_input: dict[str, np.ndarray], **kwargs ) -> dict[str, np.ndarray]: onnx_input["input_ids"] = np.array( - [self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() for input_ids in onnx_input["input_ids"]] + [ + self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() + for input_ids in onnx_input["input_ids"] + ] ) empty_image_placeholder = np.zeros(self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32) onnx_input["pixel_values"] = np.array( @@ -232,7 +237,7 @@ def embed_text( **kwargs, ) - def embed_images( + def embed_image( self, images: ImageInput, batch_size: int = 16, From 566d2450d22357275870cfccb800887f3a7ddfa9 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Mon, 23 Dec 2024 13:23:30 +0100 Subject: [PATCH 06/17] Tests, which do not run in CI Docstring improvements --- .../late_interaction_multimodal/colpali.py | 3 - ...e_interaction_multimodal_embedding_base.py | 7 +- tests/test_late_interaction_multimodal.py | 142 ++++++++++++++++++ 3 files changed, 147 insertions(+), 5 deletions(-) create mode 100644 tests/test_late_interaction_multimodal.py diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 222e843b..93d65083 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -28,9 +28,6 @@ }, "additional_files": [ "model.onnx_data", - "tokenizer.json", - "tokenizer_config.json", - "config.json", ], "model_file": "model.onnx", } diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py index cc1a929b..ec908a1b 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py @@ -31,8 +31,11 @@ def embed_text( Args: documents (Iterable[str]): The list of texts to embed. - batch_size (int) - ... - parallel (Optional[int]) - ... + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. **kwargs: Additional keyword argument to pass to the embed method. 
Yields: diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py new file mode 100644 index 00000000..baf44e7f --- /dev/null +++ b/tests/test_late_interaction_multimodal.py @@ -0,0 +1,142 @@ +import os + +import numpy as np +import pytest + +from fastembed.late_interaction_multimodal.colpali import ColPali +from tests.utils import delete_model_cache +from tests.config import TEST_MISC_DIR +from PIL import Image + +# vectors are abridged and rounded for brevity +CANONICAL_COLUMN_VALUES = { + "akshayballal/colpali-v1.2-merged": np.array( + [ + [ + [0.015, 0.051, 0.059, 0.026, -0.061, -0.027, -0.014], + [-0.22, -0.111, 0.046, 0.081, -0.048, -0.052, -0.086], + [-0.184, -0.131, 0.004, 0.062, -0.038, -0.059, -0.127], + [-0.209, -0.113, 0.015, 0.059, -0.035, -0.035, -0.072], + [-0.031, -0.044, 0.092, -0.005, 0.006, -0.057, -0.061], + [-0.18, -0.039, 0.031, 0.003, 0.083, -0.041, 0.088], + [-0.091, 0.023, 0.116, -0.02, 0.039, -0.064, -0.026], + ] + ] + ), +} + +CANONICAL_QUERY_VALUES = { + "akshayballal/colpali-v1.2-merged": np.array( + [ + [0.158, -0.02, 0.1, -0.023, 0.045, 0.031, 0.071], + [-0.074, -0.111, 0.065, -0.0, -0.089, -0.003, -0.099], + [-0.034, -0.014, 0.174, -0.063, -0.09, -0.036, 0.064], + [-0.07, -0.014, 0.186, -0.013, -0.021, -0.062, 0.107], + [-0.085, 0.025, 0.179, -0.101, 0.036, -0.089, 0.098], + [-0.058, 0.031, 0.18, -0.078, 0.023, -0.119, 0.131], + [-0.067, 0.038, 0.188, -0.079, -0.001, -0.123, 0.127], + [-0.063, 0.037, 0.204, -0.069, 0.003, -0.118, 0.134], + [-0.054, 0.036, 0.212, -0.072, -0.001, -0.117, 0.133], + [-0.044, 0.03, 0.218, -0.077, -0.003, -0.107, 0.139], + [-0.037, 0.033, 0.22, -0.088, 0.0, -0.095, 0.146], + [-0.031, 0.041, 0.213, -0.092, 0.001, -0.088, 0.147], + [-0.026, 0.047, 0.204, -0.089, -0.002, -0.084, 0.144], + [-0.027, 0.051, 0.199, -0.084, -0.007, -0.083, 0.14], + [-0.031, 0.056, 0.19, -0.082, -0.011, -0.086, 0.135], + [-0.008, 0.108, 0.144, -0.095, -0.018, -0.086, 0.085], + ] + ), +} + +queries = ["hello world", "flag embedding"] +images = [ + TEST_MISC_DIR / "image.jpeg", + str(TEST_MISC_DIR / "small_image.jpeg"), + Image.open((TEST_MISC_DIR / "small_image.jpeg")), +] + + +def test_batch_embedding(): + is_ci = os.getenv("CI") + docs_to_embed = images + + for model_name, expected_result in CANONICAL_COLUMN_VALUES.items(): + print("evaluating", model_name) + model = ColPali(model_name=model_name) + result = list(model.embed_images(docs_to_embed, batch_size=2)) + + for value in result: + batch_size, token_num, abridged_dim = expected_result.shape + assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=1e-3) + break + + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_single_embedding(): + is_ci = os.getenv("CI") + if not is_ci: + docs_to_embed = images + + for model_name, expected_result in CANONICAL_COLUMN_VALUES.items(): + print("evaluating", model_name) + model = ColPali(model_name=model_name) + result = next(iter(model.embed_images(docs_to_embed, batch_size=6))) + batch_size, token_num, abridged_dim = expected_result.shape + assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) + + +def test_single_embedding_query(): + is_ci = os.getenv("CI") + if not is_ci: + queries_to_embed = queries + + for model_name, expected_result in CANONICAL_QUERY_VALUES.items(): + print("evaluating", model_name) + model = ColPali(model_name=model_name) + result = next(iter(model.embed_text(queries_to_embed))) + token_num, abridged_dim = expected_result.shape + 
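+            # the canonical values are abridged, so only the leading token/dim slice is compared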
+            assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
+
+
+def test_parallel_processing():
+    is_ci = os.getenv("CI")
+    if not is_ci:
+        model = ColPali(model_name="akshayballal/colpali-v1.2-merged")
+
+        token_dim = 128
+        docs = ["hello world", "flag embedding"] * 100
+        embeddings = list(model.embed_text(docs, batch_size=10, parallel=2))
+        embeddings = np.stack(embeddings, axis=0)
+
+        embeddings_2 = list(model.embed_text(docs, batch_size=10, parallel=None))
+        embeddings_2 = np.stack(embeddings_2, axis=0)
+
+        embeddings_3 = list(model.embed_text(docs, batch_size=10, parallel=0))
+        embeddings_3 = np.stack(embeddings_3, axis=0)
+
+        assert embeddings.shape[0] == len(docs) and embeddings.shape[-1] == token_dim
+        assert np.allclose(embeddings, embeddings_2, atol=1e-3)
+        assert np.allclose(embeddings, embeddings_3, atol=1e-3)
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    ["akshayballal/colpali-v1.2-merged"],
+)
+def test_lazy_load(model_name):
+    is_ci = os.getenv("CI")
+    if not is_ci:
+        model = ColPali(model_name=model_name, lazy_load=True)
+        assert not hasattr(model.model, "model")
+
+        docs = ["hello world", "flag embedding"]
+        list(model.embed_text(docs))
+        assert hasattr(model.model, "model")
+
+        model = ColPali(model_name=model_name, lazy_load=True)
+        list(model.embed_text(docs))
+
+        model = ColPali(model_name=model_name, lazy_load=True)
+        list(model.embed_text(docs))

From d0b1f8615ec2f079cca8e5e788ee27a0929b5b55 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 27 Dec 2024 13:11:28 +0100
Subject: [PATCH 07/17] Fix tests

---
 tests/test_late_interaction_multimodal.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
index baf44e7f..65b76bbb 100644
--- a/tests/test_late_interaction_multimodal.py
+++ b/tests/test_late_interaction_multimodal.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-from fastembed.late_interaction_multimodal.colpali import ColPali
+from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding
 from tests.utils import delete_model_cache
 from tests.config import TEST_MISC_DIR
 from PIL import Image
@@ -62,8 +62,8 @@ def test_batch_embedding():
 
     for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
         print("evaluating", model_name)
-        model = ColPali(model_name=model_name)
-        result = list(model.embed_images(docs_to_embed, batch_size=2))
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = list(model.embed_image(docs_to_embed, batch_size=2))
 
         for value in result:
             batch_size, token_num, abridged_dim = expected_result.shape
@@ -81,7 +81,7 @@ def test_single_embedding():
 
         for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
             print("evaluating", model_name)
-            model = ColPali(model_name=model_name)
+            model = LateInteractionMultimodalEmbedding(model_name=model_name)
             result = next(iter(model.embed_images(docs_to_embed, batch_size=6)))
             batch_size, token_num, abridged_dim = expected_result.shape
             assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
@@ -94,7 +94,7 @@ def test_single_embedding_query():
 
         for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
             print("evaluating", model_name)
-            model = ColPali(model_name=model_name)
+            model = LateInteractionMultimodalEmbedding(model_name=model_name)
             result = next(iter(model.embed_text(queries_to_embed)))
             token_num, abridged_dim = expected_result.shape
             assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
@@ -103,7 +103,7 @@ def test_parallel_processing():
     is_ci = os.getenv("CI")
     if not is_ci:
-        model = ColPali(model_name="akshayballal/colpali-v1.2-merged")
+        model = LateInteractionMultimodalEmbedding(model_name="akshayballal/colpali-v1.2-merged")
 
         token_dim = 128
         docs = ["hello world", "flag embedding"] * 100
         embeddings = list(model.embed_text(docs, batch_size=10, parallel=2))
@@ -128,15 +128,15 @@ def test_lazy_load(model_name):
     is_ci = os.getenv("CI")
     if not is_ci:
-        model = ColPali(model_name=model_name, lazy_load=True)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
         assert not hasattr(model.model, "model")
 
         docs = ["hello world", "flag embedding"]
         list(model.embed_text(docs))
         assert hasattr(model.model, "model")
 
-        model = ColPali(model_name=model_name, lazy_load=True)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
         list(model.embed_text(docs))
 
-        model = ColPali(model_name=model_name, lazy_load=True)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
         list(model.embed_text(docs))

From b60f3721ef528e9e8f7b091e5db583df6daa8600 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:15:19 +0100
Subject: [PATCH 08/17] Bump colpali to version v1.3

---
 .../late_interaction_multimodal/colpali.py | 18 ++++-
 .../onnx_multimodal_model.py               |  1 -
 tests/test_late_interaction_multimodal.py  | 73 +++++-------
 3 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 93d65083..8e052f71 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -30,7 +30,21 @@
             "model.onnx_data",
         ],
         "model_file": "model.onnx",
-    }
+    },
+    {
+        "model": "AndrewOgn/colpali-v1.3-merged-onnx",
+        "dim": 128,
+        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
+        "license": "mit",
+        "size_in_GB": 10.08,
+        "sources": {
+            "hf": "AndrewOgn/colpali-v1.3-merged-onnx",
+        },
+        "additional_files": [
+            "model.onnx_data",
+        ],
+        "model_file": "model.onnx",
+    },
 ]
@@ -39,7 +53,7 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda
     QUERY_PREFIX = "Query: "
     BOS_TOKEN = ""
     PAD_TOKEN = ""
-    QUERY_MARKER_TOKEN_ID = [2, 9413]
+    QUERY_MARKER_TOKEN_ID = [2, 5098]
     IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
     EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
     EVEN_ATTENTION_MASK = np.array([1] * 1030)

diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
index 0557a92e..c9a12b22 100644
--- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
+++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -100,7 +100,6 @@ def onnx_embed_text(
             )
 
         onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs)
-
         model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)
         return OnnxOutputContext(
             model_output=model_output[0],

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
index 65b76bbb..9786847d 100644
--- a/tests/test_late_interaction_multimodal.py
+++ b/tests/test_late_interaction_multimodal.py
@@ -1,7 +1,6 @@
 import os
 
 import numpy as np
-import pytest
 
 from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding
 from tests.utils import delete_model_cache
@@ -9,7 +8,7 @@
 from PIL import Image
 
 # vectors are abridged and rounded for brevity
-CANONICAL_COLUMN_VALUES = {
+CANONICAL_IMAGE_VALUES = {
     "akshayballal/colpali-v1.2-merged": np.array(
         [
             [
@@ -23,6 +22,19 @@
             ]
         ]
     ),
+    "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
+        [
+            [
+                [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738],
+                [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021],
+                [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666],
+                [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087],
+                [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064],
+                [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301],
+                [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122],
+            ]
+        ]
+    ),
 }
 
 CANONICAL_QUERY_VALUES = {
@@ -46,6 +58,17 @@
         ]
     ),
+    "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
+        [
+            [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567],
+            [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537],
+            [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593],
+            [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098],
+            [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708],
+            [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022],
+            [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137],
+        ]
+    ),
 }
 
 queries = ["hello world", "flag embedding"]
@@ -60,7 +83,7 @@ def test_batch_embedding():
     is_ci = os.getenv("CI")
     docs_to_embed = images
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+    for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
         print("evaluating", model_name)
         model = LateInteractionMultimodalEmbedding(model_name=model_name)
         result = list(model.embed_image(docs_to_embed, batch_size=2))
@@ -79,7 +102,7 @@ def test_single_embedding():
     if not is_ci:
         docs_to_embed = images
 
-        for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+        for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
             print("evaluating", model_name)
             model = LateInteractionMultimodalEmbedding(model_name=model_name)
             result = next(iter(model.embed_images(docs_to_embed, batch_size=6)))
@@ -98,45 +121,3 @@ def test_single_embedding_query():
             result = next(iter(model.embed_text(queries_to_embed)))
             token_num, abridged_dim = expected_result.shape
             assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
-
-
-def test_parallel_processing():
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        model = LateInteractionMultimodalEmbedding(model_name="akshayballal/colpali-v1.2-merged")
-
-        token_dim = 128
-        docs = ["hello world", "flag embedding"] * 100
-        embeddings = list(model.embed_text(docs, batch_size=10, parallel=2))
-        embeddings = np.stack(embeddings, axis=0)
-
-        embeddings_2 = list(model.embed_text(docs, batch_size=10, parallel=None))
-        embeddings_2 = np.stack(embeddings_2, axis=0)
-
-        embeddings_3 = list(model.embed_text(docs, batch_size=10, parallel=0))
-        embeddings_3 = np.stack(embeddings_3, axis=0)
-
-        assert embeddings.shape[0] == len(docs) and embeddings.shape[-1] == token_dim
-        assert np.allclose(embeddings, embeddings_2, atol=1e-3)
-        assert np.allclose(embeddings, embeddings_3, atol=1e-3)
-
-
-@pytest.mark.parametrize(
-    "model_name",
-    ["akshayballal/colpali-v1.2-merged"],
-)
-def test_lazy_load(model_name):
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
-        assert not hasattr(model.model, "model")
-
-        docs = ["hello world", "flag embedding"]
-        list(model.embed_text(docs))
-        assert hasattr(model.model, "model")
-
-        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
-        list(model.embed_text(docs))
-
-        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
-        list(model.embed_text(docs))

From 013a4621fcb551848b67c4b7028bf2136ab40ae3 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:16:23 +0100
Subject: [PATCH 09/17] Remove colpali v1.2

---
 fastembed/late_interaction_multimodal/colpali.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 8e052f71..5d8e118d 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -17,20 +17,6 @@
 supported_colpali_models = [
-    {
-        "model": "akshayballal/colpali-v1.2-merged",
-        "dim": 128,
-        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
-        "license": "mit",
-        "size_in_GB": 6.08,
-        "sources": {
-            "hf": "akshayballal/colpali-v1.2-merged-onnx",
-        },
-        "additional_files": [
-            "model.onnx_data",
-        ],
-        "model_file": "model.onnx",
-    },
     {
         "model": "AndrewOgn/colpali-v1.3-merged-onnx",
         "dim": 128,

From 939a1c0a4234cd3ecda3f24f87f0eaa8187eb695 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:16:38 +0100
Subject: [PATCH 10/17] Remove colpali v1.2 from tests

---
 tests/test_late_interaction_multimodal.py | 33 -----------------------
 1 file changed, 33 deletions(-)

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
index 9786847d..9d7c0b90 100644
--- a/tests/test_late_interaction_multimodal.py
+++ b/tests/test_late_interaction_multimodal.py
@@ -9,19 +9,6 @@
 # vectors are abridged and rounded for brevity
 CANONICAL_IMAGE_VALUES = {
-    "akshayballal/colpali-v1.2-merged": np.array(
-        [
-            [
-                [0.015, 0.051, 0.059, 0.026, -0.061, -0.027, -0.014],
-                [-0.22, -0.111, 0.046, 0.081, -0.048, -0.052, -0.086],
-                [-0.184, -0.131, 0.004, 0.062, -0.038, -0.059, -0.127],
-                [-0.209, -0.113, 0.015, 0.059, -0.035, -0.035, -0.072],
-                [-0.031, -0.044, 0.092, -0.005, 0.006, -0.057, -0.061],
-                [-0.18, -0.039, 0.031, 0.003, 0.083, -0.041, 0.088],
-                [-0.091, 0.023, 0.116, -0.02, 0.039, -0.064, -0.026],
-            ]
-        ]
-    ),
     "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
         [
             [
@@ -38,26 +25,6 @@
 }
 
 CANONICAL_QUERY_VALUES = {
-    "akshayballal/colpali-v1.2-merged": np.array(
-        [
-            [0.158, -0.02, 0.1, -0.023, 0.045, 0.031, 0.071],
-            [-0.074, -0.111, 0.065, -0.0, -0.089, -0.003, -0.099],
-            [-0.034, -0.014, 0.174, -0.063, -0.09, -0.036, 0.064],
-            [-0.07, -0.014, 0.186, -0.013, -0.021, -0.062, 0.107],
-            [-0.085, 0.025, 0.179, -0.101, 0.036, -0.089, 0.098],
-            [-0.058, 0.031, 0.18, -0.078, 0.023, -0.119, 0.131],
-            [-0.067, 0.038, 0.188, -0.079, -0.001, -0.123, 0.127],
-            [-0.063, 0.037, 0.204, -0.069, 0.003, -0.118, 0.134],
-            [-0.054, 0.036, 0.212, -0.072, -0.001, -0.117, 0.133],
-            [-0.044, 0.03, 0.218, -0.077, -0.003, -0.107, 0.139],
-            [-0.037, 0.033, 0.22, -0.088, 0.0, -0.095, 0.146],
-            [-0.031, 0.041, 0.213, -0.092, 0.001, -0.088, 0.147],
-            [-0.026, 0.047, 0.204, -0.089, -0.002, -0.084, 0.144],
-            [-0.027, 0.051, 0.199, -0.084, -0.007, -0.083, 0.14],
-            [-0.031, 0.056, 0.19, -0.082, -0.011, -0.086, 0.135],
-            [-0.008, 0.108, 0.144, -0.095, -0.018, -0.086, 0.085],
-        ]
-    ),
     "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
         [
             [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567],

From a57fd3c79131bc2a736b9ec0e18285002ce1fadb Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:34:57 +0100
Subject: [PATCH 11/17] partial fix of change requests: descriptions docs black

---
 fastembed/late_interaction_multimodal/__init__.py |  2 +-
 fastembed/late_interaction_multimodal/colpali.py  |  7 ++---
 .../late_interaction_multimodal_embedding.py      | 26 ++++++++++---------
 ...e_interaction_multimodal_embedding_base.py     |  6 ++---
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/__init__.py b/fastembed/late_interaction_multimodal/__init__.py
index e23c1e28..50588cde 100644
--- a/fastembed/late_interaction_multimodal/__init__.py
+++ b/fastembed/late_interaction_multimodal/__init__.py
@@ -2,4 +2,4 @@
     LateInteractionMultimodalEmbedding,
 )
 
-__all__ = ["LateInteractionMultimodalEmbedding"]
\ No newline at end of file
+__all__ = ["LateInteractionMultimodalEmbedding"]

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 5d8e118d..be7256b9 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -20,7 +20,7 @@
     {
         "model": "AndrewOgn/colpali-v1.3-merged-onnx",
         "dim": 128,
-        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
+        "description": "Text embeddings, Bimodal (text/image), Aligned to image latent space, fp16 quantized, 2024.",
         "license": "mit",
         "size_in_GB": 10.08,
         "sources": {
             "hf": "AndrewOgn/colpali-v1.3-merged-onnx",
@@ -97,11 +97,12 @@ def __init__(
         self.cache_dir = define_cache_dir(cache_dir)
 
         self._model_dir = self.download_model(
-            self.model_description, self.cache_dir, local_files_only=self._local_files_only
+            self.model_description,
+            self.cache_dir,
+            local_files_only=self._local_files_only,
         )
         self.mask_token_id = None
         self.pad_token_id = None
-        self.skip_list = set()
 
         if not self.lazy_load:
             self.load_onnx_model()

diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
index 3d35c52f..33104bf7 100644
--- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
+++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -25,15 +25,18 @@ def list_supported_models(cls) -> list[dict[str, Any]]:
             ```
             [
                 {
-                    "model": "colpali",
-                    "dim": ...,
-                    "description": "Late interaction model",
-                    "license": "mit",
-                    "size_in_GB": 6.06,
-                    "sources": {
-                        "hf": "colpali",
-                    },
-                    "model_file": "model.onnx",
+                    "model": "AndrewOgn/colpali-v1.3-merged-onnx",
+                    "dim": 128,
+                    "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
+                    "license": "mit",
+                    "size_in_GB": 6.06,
+                    "sources": {
+                        "hf": "AndrewOgn/colpali-v1.3-merged-onnx",
+                    },
+                    "additional_files": [
+                        "model.onnx_data",
+                    ],
+                    "model_file": "model.onnx",
                 },
             ]
             ```
@@ -106,8 +109,7 @@ def embed_image(
         **kwargs,
     ) -> Iterable[np.ndarray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
+        Encode a list of images into list of embeddings.
 
         Args:
             images: Iterator of image paths or single image path to embed
@@ -118,6 +120,6 @@ def embed_image(
             If None, don't use data-parallel processing, use default onnxruntime threading instead.
 
         Returns:
-            List of embeddings, one per document
+            List of embeddings, one per image
         """
         yield from self.model.embed_image(images, batch_size, parallel, **kwargs)

diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
index cc1a929b..2ae0e34f 100644
--- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
+++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
@@ -51,9 +51,7 @@ def embed_image(
         **kwargs,
     ) -> Iterable[np.ndarray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Encode a list of images into list of embeddings.
         Args:
             images: Iterator of image paths or single image path to embed
             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
@@ -63,6 +61,6 @@ def embed_image(
             If None, don't use data-parallel processing, use default onnxruntime threading instead.
 
         Returns:
-            List of embeddings, one per document
+            List of embeddings, one per image
         """
         raise NotImplementedError()

From 78dcc33cd999476f0a66b1fb6f4cdce9696bae46 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 14:00:51 +0100
Subject: [PATCH 12/17] query_max_length

---
 fastembed/late_interaction_multimodal/colpali.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index be7256b9..974dc7b0 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -43,6 +43,7 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda
     IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
     EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
     EVEN_ATTENTION_MASK = np.array([1] * 1030)
+    QUERY_MAX_LENGTH = 50
 
     def __init__(
@@ -165,7 +166,7 @@ def tokenize(self, documents: list[str], **_) -> list[Encoding]:
             query += "\n"
 
             texts_query.append(query)
-        encoded = self.tokenizer.encode_batch(texts_query)
+        encoded = self.tokenizer.encode_batch(texts_query, max_length=self.QUERY_MAX_LENGTH)
         return encoded

From 17fa789cc7c160cf9ef57e2b1064bf3fd0103068 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Wed, 15 Jan 2025 11:12:03 +0100
Subject: [PATCH 13/17] black colpali

---
 fastembed/late_interaction_multimodal/colpali.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 974dc7b0..772df5cf 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -44,6 +44,7 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda
     EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
     EVEN_ATTENTION_MASK = np.array([1] * 1030)
     QUERY_MAX_LENGTH = 50
+    VISUAL_PROMPT_PREFIX = "Describe the image."
 
     def __init__(
@@ -195,6 +196,7 @@ def _preprocess_onnx_image_input(
         Returns:
             Dict[str, np.ndarray]: ONNX input with text placeholders.
""" + onnx_input["input_ids"] = np.array( [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["input_ids"]] ) From 5518a88095b614352adf184a15728d65a18e8c0d Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 16 Jan 2025 17:54:20 +0100 Subject: [PATCH 14/17] Added comment for EMPTY_TEXT_PLACEHOLDER --- fastembed/late_interaction_multimodal/colpali.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 772df5cf..300504e7 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -41,7 +41,9 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda PAD_TOKEN = "" QUERY_MARKER_TOKEN_ID = [2, 5098] IMAGE_PLACEHOLDER_SIZE = (3, 448, 448) - EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108]) + EMPTY_TEXT_PLACEHOLDER = np.array( + [257152] * 1024 + [2, 50721, 573, 2416, 235265, 108] + ) # This is a tokenization of '' * 1024 + 'Describe the image.\n' line which is used as placeholder while processing just image EVEN_ATTENTION_MASK = np.array([1] * 1030) QUERY_MAX_LENGTH = 50 VISUAL_PROMPT_PREFIX = "Describe the image." From 3a0b570dbe2ff422d7453d008ccc0e32b7da5d8a Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 16 Jan 2025 19:43:45 +0100 Subject: [PATCH 15/17] Review fixes --- .../late_interaction_multimodal/onnx_multimodal_model.py | 4 ++-- tests/test_late_interaction_multimodal.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index c9a12b22..7f084fb8 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -2,7 +2,7 @@ import os from multiprocessing import get_all_start_methods from pathlib import Path -from typing import Any, Iterable, Optional, Sequence, Type, Union +from typing import Any, Iterable, Optional, Sequence, Type, Union, get_args import numpy as np from PIL import Image @@ -186,7 +186,7 @@ def _embed_images( ) -> Iterable[T]: is_small = False - if isinstance(images, (str, Path, Image.Image)): + if isinstance(images, get_args(ImageInput)): images = [images] is_small = True diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 9d7c0b90..dc75d9df 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -58,7 +58,6 @@ def test_batch_embedding(): for value in result: batch_size, token_num, abridged_dim = expected_result.shape assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=1e-3) - break if is_ci: delete_model_cache(model.model._model_dir) From d9f53fbe92e6dd4f47ee13ff578128627f525d3d Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Mon, 20 Jan 2025 15:42:29 +0100 Subject: [PATCH 16/17] Removed redundant VISUAL_PROMPT_PREFIX --- fastembed/late_interaction_multimodal/colpali.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 300504e7..348966dc 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -46,7 +46,6 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda ) # This is a 
tokenization of '' * 1024 + 'Describe the image.\n' line which is used as placeholder while processing just image EVEN_ATTENTION_MASK = np.array([1] * 1030) QUERY_MAX_LENGTH = 50 - VISUAL_PROMPT_PREFIX = "Describe the image." def __init__( self, From 40b322b4d3203863d5a82a507f9b6bd49f7d469c Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 23 Jan 2025 16:55:36 +0100 Subject: [PATCH 17/17] type fix + model info --- fastembed/late_interaction_multimodal/colpali.py | 3 +-- .../late_interaction_multimodal/onnx_multimodal_model.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 348966dc..f0bbcdfe 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -15,14 +15,13 @@ ImageEmbeddingWorker, ) - supported_colpali_models = [ { "model": "AndrewOgn/colpali-v1.3-merged-onnx", "dim": 128, "description": "Text embeddings, Bimodal (text/image), Aligned to image latent space, fp16 quantized, 2024.", "license": "mit", - "size_in_GB": 10.08, + "size_in_GB": 6.5, "sources": { "hf": "AndrewOgn/colpali-v1.3-merged-onnx", }, diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 7f084fb8..719dfd70 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -2,7 +2,7 @@ import os from multiprocessing import get_all_start_methods from pathlib import Path -from typing import Any, Iterable, Optional, Sequence, Type, Union, get_args +from typing import Any, Iterable, Optional, Sequence, Type, Union import numpy as np from PIL import Image @@ -186,7 +186,7 @@ def _embed_images( ) -> Iterable[T]: is_small = False - if isinstance(images, get_args(ImageInput)): + if isinstance(images, Image.Image): images = [images] is_small = True
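For reviewers who want to try the branch end to end, here is a minimal usage sketch of the API this series introduces, assuming the final state of patch 17. The image path is illustrative, and the MaxSim scoring helper at the end is not part of this PR; it only shows how the returned multivectors are typically consumed:

    from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding
    import numpy as np

    model = LateInteractionMultimodalEmbedding(model_name="AndrewOgn/colpali-v1.3-merged-onnx")

    # Each input yields a multivector with one 128-dim row per text token
    # or per image patch.
    doc_embeddings = list(model.embed_image(["page.jpeg"]))  # illustrative path
    query_embeddings = list(model.embed_text(["what is late interaction?"]))

    def maxsim_score(query: np.ndarray, doc: np.ndarray) -> float:
        # Late-interaction relevance: match every query token against its
        # best-scoring document token and sum the similarities.
        return float((query @ doc.T).max(axis=1).sum())

    print(maxsim_score(query_embeddings[0], doc_embeddings[0]))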