From 87bfae15bafafe0a83b517cc2b29980ca2a8765c Mon Sep 17 00:00:00 2001
From: George Panchuk
Date: Wed, 18 Dec 2024 17:58:08 +0100
Subject: [PATCH 01/17] wip: design draft

---
 fastembed/image/image_embedding.py            |   3 +-
 .../late_interaction_multimodal/colpali.py    | 266 ++++++++++++++++++
 .../late_interaction_multimodal_embedding.py  | 123 ++++++++
 ...e_interaction_multimodal_embedding_base.py |  65 +++++
 .../onnx_multimodal_model.py                  | 237 ++++++++++++++++
 5 files changed, 692 insertions(+), 2 deletions(-)
 create mode 100644 fastembed/late_interaction_multimodal/colpali.py
 create mode 100644 fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
 create mode 100644 fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
 create mode 100644 fastembed/late_interaction_multimodal/onnx_multimodal_model.py

diff --git a/fastembed/image/image_embedding.py b/fastembed/image/image_embedding.py
index aa4c91b4..23d39a3e 100644
--- a/fastembed/image/image_embedding.py
+++ b/fastembed/image/image_embedding.py
@@ -80,8 +80,7 @@ def embed(
         **kwargs,
     ) -> Iterable[np.ndarray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
+        Encode a list of images into list of embeddings.
 
         Args:
             images: Iterator of image paths or single image path to embed

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
new file mode 100644
index 00000000..d3508194
--- /dev/null
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -0,0 +1,266 @@
+from typing import Any, Iterable, Optional, Sequence, Type, Union
+
+import numpy as np
+from tokenizers import Encoding
+
+from fastembed.common import OnnxProvider, ImageInput
+from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.utils import define_cache_dir
+from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
+    LateInteractionMultimodalEmbeddingBase,
+)
+from fastembed.late_interaction_multimodal.onnx_multimodal_model import (
+    OnnxMultimodalModel,
+    TextEmbeddingWorker,
+    ImageEmbeddingWorker,
+)
+
+
+supported_colbert_models = [
+    {
+        "model": "colpali",
+        "dim": ...,
+        "description": "Late interaction model",
+        "license": "mit",
+        "size_in_GB": 6.06,
+        "sources": {
+            "hf": "colpali",
+        },
+        "model_file": "model.onnx",
+    },
+]
+
+
+class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.ndarray]):
+    DOCUMENT_MARKER_TOKEN_ID = 2
+    QUERY_PREFIX = "Query: "
+    BOS_TOKEN = ""
+    PAD_TOKEN = ""
+    QUERY_MARKER_TOKEN_ID = [2, 9413]
+    IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
+    EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
+    EVEN_ATTENTION_MASK = np.array([1] * 1030)
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        providers: Optional[Sequence[OnnxProvider]] = None,
+        cuda: bool = False,
+        device_ids: Optional[list[int]] = None,
+        lazy_load: bool = False,
+        device_id: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the cache directory.
+                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                Defaults to `fastembed_cache` in the system's temp directory.
+            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
+                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
+            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to False.
+            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
+            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+        self.providers = providers
+        self.lazy_load = lazy_load
+
+        # List of device ids, that can be used for data parallel processing in workers
+        self.device_ids = device_ids
+        self.cuda = cuda
+
+        # This device_id will be used if we need to load model in current process
+        if device_id is not None:
+            self.device_id = device_id
+        elif self.device_ids is not None:
+            self.device_id = self.device_ids[0]
+        else:
+            self.device_id = None
+
+        self.model_description = self._get_model_description(model_name)
+        self.cache_dir = define_cache_dir(cache_dir)
+
+        self._model_dir = self.download_model(
+            self.model_description, self.cache_dir, local_files_only=self._local_files_only
+        )
+        self.mask_token_id = None
+        self.pad_token_id = None
+        self.skip_list = set()
+
+        if not self.lazy_load:
+            self.load_onnx_model()
+
+    @classmethod
+    def list_supported_models(cls) -> list[dict[str, Any]]:
+        """Lists the supported models.
+
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing the model information.
+        """
+        return supported_colbert_models
+
+    def load_onnx_model(self) -> None:
+        self._load_onnx_model(
+            model_dir=self._model_dir,
+            model_file=self.model_description["model_file"],
+            threads=self.threads,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_id=self.device_id,
+        )
+
+    def _post_process_onnx_image_output(
+        self,
+        output: OnnxOutputContext,
+    ) -> Iterable[np.ndarray]:
+        """
+        Post-process the ONNX model output to convert it into a usable format.
+
+        Args:
+            output (OnnxOutputContext): The raw output from the ONNX model.
+
+        Returns:
+            Iterable[np.ndarray]: Post-processed output as NumPy arrays.
+        """
+        return output.model_output.astype(np.float32)
+
+    def _post_process_onnx_text_output(
+        self,
+        output: OnnxOutputContext,
+    ) -> Iterable[np.ndarray]:
+        """
+        Post-process the ONNX model output to convert it into a usable format.
+
+        Args:
+            output (OnnxOutputContext): The raw output from the ONNX model.
+
+        Returns:
+            Iterable[np.ndarray]: Post-processed output as NumPy arrays.
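+            One multivector (a vector per token) is kept for each document; late-interaction models apply no pooling.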
+ """ + return output.model_output.astype(np.float32) + + def tokenize(self, documents: list[str], **_) -> list[Encoding]: + texts_query: list[str] = [] + + for query in documents: + query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10 + query += "\n" + + texts_query.append(query) + encoded = self.tokenizer.encode_batch(documents) + return encoded + + def _preprocess_onnx_text_input( + self, onnx_input: dict[str, np.ndarray], **kwargs + ) -> dict[str, np.ndarray]: + onnx_input["input_ids"] = np.array( + [self.QUERY_MARKER_TOKEN_ID + input_ids[2:] for input_ids in onnx_input["input_ids"]] + ) + return onnx_input + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + + Args: + documents: Iterator of documents or single document to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self._embed_documents( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + documents=documents, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + **kwargs, + ) + + def embed_images( + self, + images: ImageInput, + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of images into list of embeddings. + + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. 
+ + Returns: + List of embeddings, one per document + """ + yield from self._embed_images( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + images=images, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + **kwargs, + ) + + @classmethod + def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker]: + return ColPaliTextEmbeddingWorker + + @classmethod + def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker]: + return ColPaliImageEmbeddingWorker + + +class ColPaliTextEmbeddingWorker(TextEmbeddingWorker): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs) -> ColPali: + return ColPali( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) + + +class ColPaliImageEmbeddingWorker(ImageEmbeddingWorker): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs) -> ColPali: + return ColPali( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py new file mode 100644 index 00000000..3d35c52f --- /dev/null +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -0,0 +1,123 @@ +from typing import Any, Iterable, Optional, Sequence, Type, Union + +import numpy as np + +from fastembed.common import OnnxProvider, ImageInput +from fastembed.late_interaction_multimodal.colpali import ColPali + +from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( + LateInteractionMultimodalEmbeddingBase, +) + + +class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): + EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali] + + @classmethod + def list_supported_models(cls) -> list[dict[str, Any]]: + """ + Lists the supported models. + + Returns: + list[dict[str, Any]]: A list of dictionaries containing the model information. + + Example: + ``` + [ + { + "model": "colpali", + "dim": ..., + "description": "Late interaction model", + "license": "mit", + "size_in_GB": 6.06, + "sources": { + "hf": "colpali", + }, + "model_file": "model.onnx", + }, + ] + ``` + """ + result = [] + for embedding in cls.EMBEDDINGS_REGISTRY: + result.extend(embedding.list_supported_models()) + return result + + def __init__( + self, + model_name: str, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_ids: Optional[list[int]] = None, + lazy_load: bool = False, + **kwargs, + ): + super().__init__(model_name, cache_dir, threads, **kwargs) + for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: + supported_models = EMBEDDING_MODEL_TYPE.list_supported_models() + if any(model_name.lower() == model["model"].lower() for model in supported_models): + self.model = EMBEDDING_MODEL_TYPE( + model_name, + cache_dir, + threads=threads, + providers=providers, + cuda=cuda, + device_ids=device_ids, + lazy_load=lazy_load, + **kwargs, + ) + return + + raise ValueError( + f"Model {model_name} is not supported in LateInteractionMultimodalEmbedding." 
+ "Please check the supported models using `LateInteractionMultimodalEmbedding.list_supported_models()`" + ) + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + + Args: + documents: Iterator of documents or single document to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self.model.embed_text(documents, batch_size, parallel, **kwargs) + + def embed_image( + self, + images: ImageInput, + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + We use mean pooling with attention so that the model can handle variable-length inputs. + + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self.model.embed_image(images, batch_size, parallel, **kwargs) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py new file mode 100644 index 00000000..cc1a929b --- /dev/null +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py @@ -0,0 +1,65 @@ +from typing import Iterable, Optional, Union + +import numpy as np + +from fastembed.common import ImageInput +from fastembed.common.model_management import ModelManagement + + +class LateInteractionMultimodalEmbeddingBase(ModelManagement): + def __init__( + self, + model_name: str, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + **kwargs, + ): + self.model_name = model_name + self.cache_dir = cache_dir + self.threads = threads + self._local_files_only = kwargs.pop("local_files_only", False) + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Embeds a list of documents into a list of embeddings. + + Args: + documents (Iterable[str]): The list of texts to embed. + batch_size (int) - ... + parallel (Optional[int]) - ... + **kwargs: Additional keyword argument to pass to the embed method. + + Yields: + Iterable[np.ndarray]: The embeddings. + """ + raise NotImplementedError() + + def embed_image( + self, + images: ImageInput, + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs, + ) -> Iterable[np.ndarray]: + """ + Encode a list of documents into list of embeddings. + We use mean pooling with attention so that the model can handle variable-length inputs. 
+ + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + raise NotImplementedError() diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py new file mode 100644 index 00000000..0557a92e --- /dev/null +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -0,0 +1,237 @@ +import contextlib +import os +from multiprocessing import get_all_start_methods +from pathlib import Path +from typing import Any, Iterable, Optional, Sequence, Type, Union + +import numpy as np +from PIL import Image +from tokenizers import Encoding + +from fastembed.common import OnnxProvider, ImageInput +from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T +from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor +from fastembed.common.utils import iter_batch +from fastembed.parallel_processor import ParallelWorkerPool + + +class OnnxMultimodalModel(OnnxModel[T]): + ONNX_OUTPUT_NAMES: Optional[list[str]] = None + + def __init__(self) -> None: + super().__init__() + self.tokenizer = None + self.processor = None + self.special_token_to_id = {} + + def _preprocess_onnx_text_input( + self, onnx_input: dict[str, np.ndarray], **kwargs + ) -> dict[str, np.ndarray]: + """ + Preprocess the onnx input. + """ + return onnx_input + + def _preprocess_onnx_image_input( + self, onnx_input: dict[str, np.ndarray], **kwargs + ) -> dict[str, np.ndarray]: + """ + Preprocess the onnx input. 
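+        Override in subclasses to inject model-specific inputs; ColPali uses this hook later in the series to add text placeholders for image-only batches.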
+ """ + return onnx_input + + @classmethod + def _get_text_worker_class(cls) -> Type["TextEmbeddingWorker"]: + raise NotImplementedError("Subclasses must implement this method") + + @classmethod + def _get_image_worker_class(cls) -> Type["ImageEmbeddingWorker"]: + raise NotImplementedError("Subclasses must implement this method") + + def _post_process_onnx_image_output(self, output: OnnxOutputContext) -> Iterable[T]: + raise NotImplementedError("Subclasses must implement this method") + + def _post_process_onnx_text_output(self, output: OnnxOutputContext) -> Iterable[T]: + raise NotImplementedError("Subclasses must implement this method") + + def _load_onnx_model( + self, + model_dir: Path, + model_file: str, + threads: Optional[int], + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_id: Optional[int] = None, + ) -> None: + super()._load_onnx_model( + model_dir=model_dir, + model_file=model_file, + threads=threads, + providers=providers, + cuda=cuda, + device_id=device_id, + ) + self.tokenizer, self.special_token_to_id = load_tokenizer(model_dir=model_dir) + self.processor = load_preprocessor(model_dir=model_dir) + + def load_onnx_model(self) -> None: + raise NotImplementedError("Subclasses must implement this method") + + def tokenize(self, documents: list[str], **kwargs) -> list[Encoding]: + return self.tokenizer.encode_batch(documents) + + def onnx_embed_text( + self, + documents: list[str], + **kwargs, + ) -> OnnxOutputContext: + encoded = self.tokenize(documents, **kwargs) + input_ids = np.array([e.ids for e in encoded]) + attention_mask = np.array([e.attention_mask for e in encoded]) + input_names = {node.name for node in self.model.get_inputs()} + onnx_input = { + "input_ids": np.array(input_ids, dtype=np.int64), + } + if "attention_mask" in input_names: + onnx_input["attention_mask"] = np.array(attention_mask, dtype=np.int64) + if "token_type_ids" in input_names: + onnx_input["token_type_ids"] = np.array( + [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64 + ) + + onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs) + + model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=onnx_input.get("attention_mask", attention_mask), + input_ids=onnx_input.get("input_ids", input_ids), + ) + + def _embed_documents( + self, + model_name: str, + cache_dir: str, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_ids: Optional[list[int]] = None, + **kwargs, + ) -> Iterable[T]: + is_small = False + + if isinstance(documents, str): + documents = [documents] + is_small = True + + if isinstance(documents, list): + if len(documents) < batch_size: + is_small = True + + if parallel is None or is_small: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() + for batch in iter_batch(documents, batch_size): + yield from self._post_process_onnx_text_output(self.onnx_embed_text(batch)) + else: + if parallel == 0: + parallel = os.cpu_count() + + start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" + params = { + "model_name": model_name, + "cache_dir": cache_dir, + "providers": providers, + **kwargs, + } + + pool = ParallelWorkerPool( + num_workers=parallel or 1, + worker=self._get_text_worker_class(), + cuda=cuda, + device_ids=device_ids, + start_method=start_method, 
+            )
+            for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
+                yield from self._post_process_onnx_text_output(batch)
+
+    def _build_onnx_image_input(self, encoded: np.ndarray) -> dict[str, np.ndarray]:
+        return {node.name: encoded for node in self.model.get_inputs()}
+
+    def onnx_embed_image(self, images: list[ImageInput], **kwargs) -> OnnxOutputContext:
+        with contextlib.ExitStack():
+            image_files = [
+                Image.open(image) if not isinstance(image, Image.Image) else image
+                for image in images
+            ]
+            encoded = self.processor(image_files)
+        onnx_input = self._build_onnx_image_input(encoded)
+        onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs)
+        model_output = self.model.run(None, onnx_input)
+        embeddings = model_output[0].reshape(len(images), -1)
+        return OnnxOutputContext(model_output=embeddings)
+
+    def _embed_images(
+        self,
+        model_name: str,
+        cache_dir: str,
+        images: ImageInput,
+        batch_size: int = 256,
+        parallel: Optional[int] = None,
+        providers: Optional[Sequence[OnnxProvider]] = None,
+        cuda: bool = False,
+        device_ids: Optional[list[int]] = None,
+        **kwargs,
+    ) -> Iterable[T]:
+        is_small = False
+
+        if isinstance(images, (str, Path, Image.Image)):
+            images = [images]
+            is_small = True
+
+        if isinstance(images, list) and len(images) < batch_size:
+            is_small = True
+
+        if parallel is None or is_small:
+            if not hasattr(self, "model") or self.model is None:
+                self.load_onnx_model()
+
+            for batch in iter_batch(images, batch_size):
+                yield from self._post_process_onnx_image_output(self.onnx_embed_image(batch))
+        else:
+            if parallel == 0:
+                parallel = os.cpu_count()
+
+            start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
+            params = {
+                "model_name": model_name,
+                "cache_dir": cache_dir,
+                "providers": providers,
+                **kwargs,
+            }
+
+            pool = ParallelWorkerPool(
+                num_workers=parallel or 1,
+                worker=self._get_image_worker_class(),
+                cuda=cuda,
+                device_ids=device_ids,
+                start_method=start_method,
+            )
+            for batch in pool.ordered_map(iter_batch(images, batch_size), **params):
+                yield from self._post_process_onnx_image_output(batch)
+
+
+class TextEmbeddingWorker(EmbeddingWorker):
+    def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
+        for idx, batch in items:
+            onnx_output = self.model.onnx_embed_text(batch)
+            yield idx, onnx_output
+
+
+class ImageEmbeddingWorker(EmbeddingWorker):
+    def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
+        for idx, batch in items:
+            embeddings = self.model.onnx_embed_image(batch)
+            yield idx, embeddings

From b8cda6828c8b287c1783fec1dc4acc41c0c0b0b8 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Thu, 19 Dec 2024 23:21:06 +0100
Subject: [PATCH 02/17] Operators fix

---
 fastembed/image/transform/operators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py
index bac65e08..9701e2ae 100644
--- a/fastembed/image/transform/operators.py
+++ b/fastembed/image/transform/operators.py
@@ -139,7 +139,7 @@ def _get_convert_to_rgb(transforms: list[Transform], config: dict[str, Any]):
     @classmethod
     def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]):
         mode = config.get("image_processor_type", "CLIPImageProcessor")
-        if mode == "CLIPImageProcessor":
+        if mode in ("CLIPImageProcessor", "SiglipImageProcessor"):
             if config.get("do_resize", False):
                 size = config["size"]
                 if "shortest_edge" in size:
@@ -202,7 +202,7 @@ def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]):
     @staticmethod
     def _get_center_crop(transforms: list[Transform], config: dict[str, Any]):
         mode = config.get("image_processor_type", "CLIPImageProcessor")
-        if mode == "CLIPImageProcessor":
+        if mode in ("CLIPImageProcessor", "SiglipImageProcessor"):
             if config.get("do_center_crop", False):
                 crop_size = config["crop_size"]
                 if isinstance(crop_size, int):

From a7aa9c335c6dacd5c85b55fed3dd8b479074bbf6 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 20 Dec 2024 12:35:17 +0100
Subject: [PATCH 03/17] Fix model inputs

---
 .../late_interaction_multimodal/colpali.py | 52 ++++++++++++++-----
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index d3508194..f8780cb9 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -16,18 +16,24 @@
 )
 
 
-supported_colbert_models = [
+supported_colpali_models = [
     {
-        "model": "colpali",
-        "dim": ...,
-        "description": "Late interaction model",
+        "model": "akshayballal/colpali-v1.2-merged",
+        "dim": 128,
+        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
         "license": "mit",
-        "size_in_GB": 6.06,
+        "size_in_GB": 6.08,
         "sources": {
-            "hf": "colpali",
+            "hf": "akshayballal/colpali-v1.2-merged-onnx",
         },
+        "additional_files": [
+            "model.onnx_data",
+            "tokenizer.json",
+            "tokenizer_config.json",
+            "config.json",
+        ],
         "model_file": "model.onnx",
-    },
+    }
 ]
@@ -110,7 +116,7 @@ def list_supported_models(cls) -> list[dict[str, Any]]:
         Returns:
             list[dict[str, Any]]: A list of dictionaries containing the model information.
         """
-        return supported_colbert_models
+        return supported_colpali_models
 
     def load_onnx_model(self) -> None:
         self._load_onnx_model(
@@ -135,7 +141,7 @@ def _post_process_onnx_image_output(
         Returns:
             Iterable[np.ndarray]: Post-processed output as NumPy arrays.
         """
-        return output.model_output.astype(np.float32)
+        return output.model_output.reshape(output.model_output.shape[0], -1, self.model_description['dim']).astype(np.float32)
 
     def _post_process_onnx_text_output(
         self,
@@ -154,20 +160,42 @@ def _post_process_onnx_text_output(
 
     def tokenize(self, documents: list[str], **_) -> list[Encoding]:
         texts_query: list[str] = []
-
         for query in documents:
             query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
             query += "\n"
 
             texts_query.append(query)
-        encoded = self.tokenizer.encode_batch(documents)
+        encoded = self.tokenizer.encode_batch(texts_query)
         return encoded
 
     def _preprocess_onnx_text_input(
         self, onnx_input: dict[str, np.ndarray], **kwargs
     ) -> dict[str, np.ndarray]:
         onnx_input["input_ids"] = np.array(
-            [self.QUERY_MARKER_TOKEN_ID + input_ids[2:] for input_ids in onnx_input["input_ids"]]
+            [self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() for input_ids in onnx_input["input_ids"]]
+        )
+        empty_image_placeholder = np.zeros(self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32)
+        onnx_input["pixel_values"] = np.array(
+            [empty_image_placeholder for _ in onnx_input["input_ids"]]
         )
         return onnx_input
 
+    def _preprocess_onnx_image_input(
+        self, onnx_input: dict[str, np.ndarray], **kwargs
+    ) -> dict[str, np.ndarray]:
+        """
+        Add placeholders for text input when processing image data for ONNX.
+        Args:
+            onnx_input (Dict[str, np.ndarray]): Preprocessed image inputs.
+            **kwargs: Additional arguments.
+        Returns:
+            Dict[str, np.ndarray]: ONNX input with text placeholders.
+        """
+        onnx_input["input_ids"] = np.array(
+            [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["input_ids"]]
+        )
+        onnx_input["attention_mask"] = np.array(
+            [self.EVEN_ATTENTION_MASK for _ in onnx_input["input_ids"]]
+        )
+        return onnx_input

From ade31c5b709fff669ead51fd4c6f36d8cc1aa633 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 20 Dec 2024 12:56:33 +0100
Subject: [PATCH 04/17] Import from fastembed.late_interaction_multimodal

---
 fastembed/late_interaction_multimodal/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 fastembed/late_interaction_multimodal/__init__.py

diff --git a/fastembed/late_interaction_multimodal/__init__.py b/fastembed/late_interaction_multimodal/__init__.py
new file mode 100644
index 00000000..e23c1e28
--- /dev/null
+++ b/fastembed/late_interaction_multimodal/__init__.py
@@ -0,0 +1,5 @@
+from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding import (
+    LateInteractionMultimodalEmbedding,
+)
+
+__all__ = ["LateInteractionMultimodalEmbedding"]
\ No newline at end of file

From 5cc0a7eb83e78384d823e9675aed1241a64f1d06 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 20 Dec 2024 13:02:18 +0100
Subject: [PATCH 05/17] Fixed method misspelling

---
 fastembed/late_interaction_multimodal/colpali.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index f8780cb9..222e843b 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -141,7 +141,9 @@ def _post_process_onnx_image_output(
 
         Returns:
             Iterable[np.ndarray]: Post-processed output as NumPy arrays.
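+            The flat model output is reshaped to (batch, n_tokens, dim) before casting.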
""" - return output.model_output.reshape(output.model_output.shape[0], -1, self.model_description['dim']).astype(np.float32) + return output.model_output.reshape( + output.model_output.shape[0], -1, self.model_description["dim"] + ).astype(np.float32) def _post_process_onnx_text_output( self, @@ -172,7 +174,10 @@ def _preprocess_onnx_text_input( self, onnx_input: dict[str, np.ndarray], **kwargs ) -> dict[str, np.ndarray]: onnx_input["input_ids"] = np.array( - [self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() for input_ids in onnx_input["input_ids"]] + [ + self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() + for input_ids in onnx_input["input_ids"] + ] ) empty_image_placeholder = np.zeros(self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32) onnx_input["pixel_values"] = np.array( @@ -232,7 +237,7 @@ def embed_text( **kwargs, ) - def embed_images( + def embed_image( self, images: ImageInput, batch_size: int = 16, From 566d2450d22357275870cfccb800887f3a7ddfa9 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Mon, 23 Dec 2024 13:23:30 +0100 Subject: [PATCH 06/17] Tests, which do not run in CI Docstring improvements --- .../late_interaction_multimodal/colpali.py | 3 - ...e_interaction_multimodal_embedding_base.py | 7 +- tests/test_late_interaction_multimodal.py | 142 ++++++++++++++++++ 3 files changed, 147 insertions(+), 5 deletions(-) create mode 100644 tests/test_late_interaction_multimodal.py diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 222e843b..93d65083 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -28,9 +28,6 @@ }, "additional_files": [ "model.onnx_data", - "tokenizer.json", - "tokenizer_config.json", - "config.json", ], "model_file": "model.onnx", } diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py index cc1a929b..ec908a1b 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py @@ -31,8 +31,11 @@ def embed_text( Args: documents (Iterable[str]): The list of texts to embed. - batch_size (int) - ... - parallel (Optional[int]) - ... + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. **kwargs: Additional keyword argument to pass to the embed method. 
Yields: diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py new file mode 100644 index 00000000..baf44e7f --- /dev/null +++ b/tests/test_late_interaction_multimodal.py @@ -0,0 +1,142 @@ +import os + +import numpy as np +import pytest + +from fastembed.late_interaction_multimodal.colpali import ColPali +from tests.utils import delete_model_cache +from tests.config import TEST_MISC_DIR +from PIL import Image + +# vectors are abridged and rounded for brevity +CANONICAL_COLUMN_VALUES = { + "akshayballal/colpali-v1.2-merged": np.array( + [ + [ + [0.015, 0.051, 0.059, 0.026, -0.061, -0.027, -0.014], + [-0.22, -0.111, 0.046, 0.081, -0.048, -0.052, -0.086], + [-0.184, -0.131, 0.004, 0.062, -0.038, -0.059, -0.127], + [-0.209, -0.113, 0.015, 0.059, -0.035, -0.035, -0.072], + [-0.031, -0.044, 0.092, -0.005, 0.006, -0.057, -0.061], + [-0.18, -0.039, 0.031, 0.003, 0.083, -0.041, 0.088], + [-0.091, 0.023, 0.116, -0.02, 0.039, -0.064, -0.026], + ] + ] + ), +} + +CANONICAL_QUERY_VALUES = { + "akshayballal/colpali-v1.2-merged": np.array( + [ + [0.158, -0.02, 0.1, -0.023, 0.045, 0.031, 0.071], + [-0.074, -0.111, 0.065, -0.0, -0.089, -0.003, -0.099], + [-0.034, -0.014, 0.174, -0.063, -0.09, -0.036, 0.064], + [-0.07, -0.014, 0.186, -0.013, -0.021, -0.062, 0.107], + [-0.085, 0.025, 0.179, -0.101, 0.036, -0.089, 0.098], + [-0.058, 0.031, 0.18, -0.078, 0.023, -0.119, 0.131], + [-0.067, 0.038, 0.188, -0.079, -0.001, -0.123, 0.127], + [-0.063, 0.037, 0.204, -0.069, 0.003, -0.118, 0.134], + [-0.054, 0.036, 0.212, -0.072, -0.001, -0.117, 0.133], + [-0.044, 0.03, 0.218, -0.077, -0.003, -0.107, 0.139], + [-0.037, 0.033, 0.22, -0.088, 0.0, -0.095, 0.146], + [-0.031, 0.041, 0.213, -0.092, 0.001, -0.088, 0.147], + [-0.026, 0.047, 0.204, -0.089, -0.002, -0.084, 0.144], + [-0.027, 0.051, 0.199, -0.084, -0.007, -0.083, 0.14], + [-0.031, 0.056, 0.19, -0.082, -0.011, -0.086, 0.135], + [-0.008, 0.108, 0.144, -0.095, -0.018, -0.086, 0.085], + ] + ), +} + +queries = ["hello world", "flag embedding"] +images = [ + TEST_MISC_DIR / "image.jpeg", + str(TEST_MISC_DIR / "small_image.jpeg"), + Image.open((TEST_MISC_DIR / "small_image.jpeg")), +] + + +def test_batch_embedding(): + is_ci = os.getenv("CI") + docs_to_embed = images + + for model_name, expected_result in CANONICAL_COLUMN_VALUES.items(): + print("evaluating", model_name) + model = ColPali(model_name=model_name) + result = list(model.embed_images(docs_to_embed, batch_size=2)) + + for value in result: + batch_size, token_num, abridged_dim = expected_result.shape + assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=1e-3) + break + + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_single_embedding(): + is_ci = os.getenv("CI") + if not is_ci: + docs_to_embed = images + + for model_name, expected_result in CANONICAL_COLUMN_VALUES.items(): + print("evaluating", model_name) + model = ColPali(model_name=model_name) + result = next(iter(model.embed_images(docs_to_embed, batch_size=6))) + batch_size, token_num, abridged_dim = expected_result.shape + assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) + + +def test_single_embedding_query(): + is_ci = os.getenv("CI") + if not is_ci: + queries_to_embed = queries + + for model_name, expected_result in CANONICAL_QUERY_VALUES.items(): + print("evaluating", model_name) + model = ColPali(model_name=model_name) + result = next(iter(model.embed_text(queries_to_embed))) + token_num, abridged_dim = expected_result.shape + 
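+            # the canonical values are abridged, so only the leading token/dim slice is compared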
+            assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
+
+
+def test_parallel_processing():
+    is_ci = os.getenv("CI")
+    if not is_ci:
+        model = ColPali(model_name="akshayballal/colpali-v1.2-merged")
+
+        token_dim = 128
+        docs = ["hello world", "flag embedding"] * 100
+        embeddings = list(model.embed_text(docs, batch_size=10, parallel=2))
+        embeddings = np.stack(embeddings, axis=0)
+
+        embeddings_2 = list(model.embed_text(docs, batch_size=10, parallel=None))
+        embeddings_2 = np.stack(embeddings_2, axis=0)
+
+        embeddings_3 = list(model.embed_text(docs, batch_size=10, parallel=0))
+        embeddings_3 = np.stack(embeddings_3, axis=0)
+
+        assert embeddings.shape[0] == len(docs) and embeddings.shape[-1] == token_dim
+        assert np.allclose(embeddings, embeddings_2, atol=1e-3)
+        assert np.allclose(embeddings, embeddings_3, atol=1e-3)
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    ["akshayballal/colpali-v1.2-merged"],
+)
+def test_lazy_load(model_name):
+    is_ci = os.getenv("CI")
+    if not is_ci:
+        model = ColPali(model_name=model_name, lazy_load=True)
+        assert not hasattr(model.model, "model")
+
+        docs = ["hello world", "flag embedding"]
+        list(model.embed_text(docs))
+        assert hasattr(model.model, "model")
+
+        model = ColPali(model_name=model_name, lazy_load=True)
+        list(model.embed_text(docs))
+
+        model = ColPali(model_name=model_name, lazy_load=True)
+        list(model.embed_text(docs))

From d0b1f8615ec2f079cca8e5e788ee27a0929b5b55 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Fri, 27 Dec 2024 13:11:28 +0100
Subject: [PATCH 07/17] Fix tests

---
 tests/test_late_interaction_multimodal.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
index baf44e7f..65b76bbb 100644
--- a/tests/test_late_interaction_multimodal.py
+++ b/tests/test_late_interaction_multimodal.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-from fastembed.late_interaction_multimodal.colpali import ColPali
+from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding
 from tests.utils import delete_model_cache
 from tests.config import TEST_MISC_DIR
 from PIL import Image
@@ -62,8 +62,8 @@ def test_batch_embedding():
 
     for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
         print("evaluating", model_name)
-        model = ColPali(model_name=model_name)
-        result = list(model.embed_images(docs_to_embed, batch_size=2))
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = list(model.embed_image(docs_to_embed, batch_size=2))
 
         for value in result:
             batch_size, token_num, abridged_dim = expected_result.shape
@@ -81,7 +81,7 @@ def test_single_embedding():
 
         for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
             print("evaluating", model_name)
-            model = ColPali(model_name=model_name)
+            model = LateInteractionMultimodalEmbedding(model_name=model_name)
             result = next(iter(model.embed_images(docs_to_embed, batch_size=6)))
             batch_size, token_num, abridged_dim = expected_result.shape
             assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
@@ -94,7 +94,7 @@ def test_single_embedding_query():
 
         for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
             print("evaluating", model_name)
-            model = ColPali(model_name=model_name)
+            model = LateInteractionMultimodalEmbedding(model_name=model_name)
             result = next(iter(model.embed_text(queries_to_embed)))
             token_num, abridged_dim = expected_result.shape
             assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
@@ -103,7 +103,7 @@ def test_parallel_processing():
     is_ci = os.getenv("CI")
     if not is_ci:
-        model = ColPali(model_name="akshayballal/colpali-v1.2-merged")
+        model = LateInteractionMultimodalEmbedding(model_name="akshayballal/colpali-v1.2-merged")
 
         token_dim = 128
         docs = ["hello world", "flag embedding"] * 100
         embeddings = list(model.embed_text(docs, batch_size=10, parallel=2))
@@ -128,15 +128,15 @@ def test_lazy_load(model_name):
     is_ci = os.getenv("CI")
     if not is_ci:
-        model = ColPali(model_name=model_name, lazy_load=True)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
         assert not hasattr(model.model, "model")
 
         docs = ["hello world", "flag embedding"]
         list(model.embed_text(docs))
         assert hasattr(model.model, "model")
 
-        model = ColPali(model_name=model_name, lazy_load=True)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
         list(model.embed_text(docs))
 
-        model = ColPali(model_name=model_name, lazy_load=True)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
         list(model.embed_text(docs))

From b60f3721ef528e9e8f7b091e5db583df6daa8600 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:15:19 +0100
Subject: [PATCH 08/17] Bump colpali to version v1.3

---
 .../late_interaction_multimodal/colpali.py | 18 ++++-
 .../onnx_multimodal_model.py               |  1 -
 tests/test_late_interaction_multimodal.py  | 73 +++++-------
 3 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 93d65083..8e052f71 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -30,7 +30,21 @@
             "model.onnx_data",
         ],
         "model_file": "model.onnx",
-    }
+    },
+    {
+        "model": "AndrewOgn/colpali-v1.3-merged-onnx",
+        "dim": 128,
+        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
+        "license": "mit",
+        "size_in_GB": 10.08,
+        "sources": {
+            "hf": "AndrewOgn/colpali-v1.3-merged-onnx",
+        },
+        "additional_files": [
+            "model.onnx_data",
+        ],
+        "model_file": "model.onnx",
+    },
 ]
@@ -39,7 +53,7 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda
     QUERY_PREFIX = "Query: "
     BOS_TOKEN = ""
     PAD_TOKEN = ""
-    QUERY_MARKER_TOKEN_ID = [2, 9413]
+    QUERY_MARKER_TOKEN_ID = [2, 5098]
     IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
     EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
     EVEN_ATTENTION_MASK = np.array([1] * 1030)

diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
index 0557a92e..c9a12b22 100644
--- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
+++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -100,7 +100,6 @@ def onnx_embed_text(
             )
 
         onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs)
-
         model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)
         return OnnxOutputContext(
             model_output=model_output[0],

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
index 65b76bbb..9786847d 100644
--- a/tests/test_late_interaction_multimodal.py
+++ b/tests/test_late_interaction_multimodal.py
@@ -1,7 +1,6 @@
 import os
 
 import numpy as np
-import pytest
 
 from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding
 from tests.utils import delete_model_cache
@@ -9,7 +8,7 @@
 from PIL import Image
 
 # vectors are abridged and rounded for brevity
-CANONICAL_COLUMN_VALUES = {
+CANONICAL_IMAGE_VALUES = {
     "akshayballal/colpali-v1.2-merged": np.array(
         [
             [
@@ -23,6 +22,19 @@
             ]
         ]
     ),
+    "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
+        [
+            [
+                [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738],
+                [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021],
+                [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666],
+                [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087],
+                [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064],
+                [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301],
+                [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122],
+            ]
+        ]
+    ),
 }
 
 CANONICAL_QUERY_VALUES = {
@@ -46,6 +58,17 @@
         ]
     ),
+    "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
+        [
+            [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567],
+            [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537],
+            [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593],
+            [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098],
+            [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708],
+            [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022],
+            [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137],
+        ]
+    ),
 }
 
 queries = ["hello world", "flag embedding"]
@@ -60,7 +83,7 @@ def test_batch_embedding():
     is_ci = os.getenv("CI")
     docs_to_embed = images
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+    for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
         print("evaluating", model_name)
         model = LateInteractionMultimodalEmbedding(model_name=model_name)
         result = list(model.embed_image(docs_to_embed, batch_size=2))
@@ -79,7 +102,7 @@ def test_single_embedding():
     if not is_ci:
         docs_to_embed = images
 
-        for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+        for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
             print("evaluating", model_name)
             model = LateInteractionMultimodalEmbedding(model_name=model_name)
             result = next(iter(model.embed_images(docs_to_embed, batch_size=6)))
@@ -98,45 +121,3 @@ def test_single_embedding_query():
             result = next(iter(model.embed_text(queries_to_embed)))
             token_num, abridged_dim = expected_result.shape
             assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
-
-
-def test_parallel_processing():
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        model = LateInteractionMultimodalEmbedding(model_name="akshayballal/colpali-v1.2-merged")
-
-        token_dim = 128
-        docs = ["hello world", "flag embedding"] * 100
-        embeddings = list(model.embed_text(docs, batch_size=10, parallel=2))
-        embeddings = np.stack(embeddings, axis=0)
-
-        embeddings_2 = list(model.embed_text(docs, batch_size=10, parallel=None))
-        embeddings_2 = np.stack(embeddings_2, axis=0)
-
-        embeddings_3 = list(model.embed_text(docs, batch_size=10, parallel=0))
-        embeddings_3 = np.stack(embeddings_3, axis=0)
-
-        assert embeddings.shape[0] == len(docs) and embeddings.shape[-1] == token_dim
-        assert np.allclose(embeddings, embeddings_2, atol=1e-3)
-        assert np.allclose(embeddings, embeddings_3, atol=1e-3)
-
-
-@pytest.mark.parametrize(
-    "model_name",
-    ["akshayballal/colpali-v1.2-merged"],
-)
-def test_lazy_load(model_name):
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
-        assert not hasattr(model.model, "model")
-
-        docs = ["hello world", "flag embedding"]
-        list(model.embed_text(docs))
-        assert hasattr(model.model, "model")
-
-        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
-        list(model.embed_text(docs))
-
-        model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True)
-        list(model.embed_text(docs))

From 013a4621fcb551848b67c4b7028bf2136ab40ae3 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:16:23 +0100
Subject: [PATCH 09/17] Remove colpali v1.2

---
 fastembed/late_interaction_multimodal/colpali.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 8e052f71..5d8e118d 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -17,20 +17,6 @@
 supported_colpali_models = [
-    {
-        "model": "akshayballal/colpali-v1.2-merged",
-        "dim": 128,
-        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
-        "license": "mit",
-        "size_in_GB": 6.08,
-        "sources": {
-            "hf": "akshayballal/colpali-v1.2-merged-onnx",
-        },
-        "additional_files": [
-            "model.onnx_data",
-        ],
-        "model_file": "model.onnx",
-    },
     {
         "model": "AndrewOgn/colpali-v1.3-merged-onnx",
         "dim": 128,

From 939a1c0a4234cd3ecda3f24f87f0eaa8187eb695 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:16:38 +0100
Subject: [PATCH 10/17] Remove colpali v1.2 from tests

---
 tests/test_late_interaction_multimodal.py | 33 -----------------------
 1 file changed, 33 deletions(-)

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
index 9786847d..9d7c0b90 100644
--- a/tests/test_late_interaction_multimodal.py
+++ b/tests/test_late_interaction_multimodal.py
@@ -9,19 +9,6 @@
 # vectors are abridged and rounded for brevity
 CANONICAL_IMAGE_VALUES = {
-    "akshayballal/colpali-v1.2-merged": np.array(
-        [
-            [
-                [0.015, 0.051, 0.059, 0.026, -0.061, -0.027, -0.014],
-                [-0.22, -0.111, 0.046, 0.081, -0.048, -0.052, -0.086],
-                [-0.184, -0.131, 0.004, 0.062, -0.038, -0.059, -0.127],
-                [-0.209, -0.113, 0.015, 0.059, -0.035, -0.035, -0.072],
-                [-0.031, -0.044, 0.092, -0.005, 0.006, -0.057, -0.061],
-                [-0.18, -0.039, 0.031, 0.003, 0.083, -0.041, 0.088],
-                [-0.091, 0.023, 0.116, -0.02, 0.039, -0.064, -0.026],
-            ]
-        ]
-    ),
     "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
         [
             [
@@ -38,26 +25,6 @@
 }
 
 CANONICAL_QUERY_VALUES = {
-    "akshayballal/colpali-v1.2-merged": np.array(
-        [
-            [0.158, -0.02, 0.1, -0.023, 0.045, 0.031, 0.071],
-            [-0.074, -0.111, 0.065, -0.0, -0.089, -0.003, -0.099],
-            [-0.034, -0.014, 0.174, -0.063, -0.09, -0.036, 0.064],
-            [-0.07, -0.014, 0.186, -0.013, -0.021, -0.062, 0.107],
-            [-0.085, 0.025, 0.179, -0.101, 0.036, -0.089, 0.098],
-            [-0.058, 0.031, 0.18, -0.078, 0.023, -0.119, 0.131],
-            [-0.067, 0.038, 0.188, -0.079, -0.001, -0.123, 0.127],
-            [-0.063, 0.037, 0.204, -0.069, 0.003, -0.118, 0.134],
-            [-0.054, 0.036, 0.212, -0.072, -0.001, -0.117, 0.133],
-            [-0.044, 0.03, 0.218, -0.077, -0.003, -0.107, 0.139],
-            [-0.037, 0.033, 0.22, -0.088, 0.0, -0.095, 0.146],
-            [-0.031, 0.041, 0.213, -0.092, 0.001, -0.088, 0.147],
-            [-0.026, 0.047, 0.204, -0.089, -0.002, -0.084, 0.144],
-            [-0.027, 0.051, 0.199, -0.084, -0.007, -0.083, 0.14],
-            [-0.031, 0.056, 0.19, -0.082, -0.011, -0.086, 0.135],
-            [-0.008, 0.108, 0.144, -0.095, -0.018, -0.086, 0.085],
-        ]
-    ),
     "AndrewOgn/colpali-v1.3-merged-onnx": np.array(
         [
             [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567],

From a57fd3c79131bc2a736b9ec0e18285002ce1fadb Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 13:34:57 +0100
Subject: [PATCH 11/17] partial fix of change requests: descriptions docs black

---
 fastembed/late_interaction_multimodal/__init__.py |  2 +-
 fastembed/late_interaction_multimodal/colpali.py  |  7 ++---
 .../late_interaction_multimodal_embedding.py      | 26 ++++++++++---------
 ...e_interaction_multimodal_embedding_base.py     |  6 ++---
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/fastembed/late_interaction_multimodal/__init__.py b/fastembed/late_interaction_multimodal/__init__.py
index e23c1e28..50588cde 100644
--- a/fastembed/late_interaction_multimodal/__init__.py
+++ b/fastembed/late_interaction_multimodal/__init__.py
@@ -2,4 +2,4 @@
     LateInteractionMultimodalEmbedding,
 )
 
-__all__ = ["LateInteractionMultimodalEmbedding"]
\ No newline at end of file
+__all__ = ["LateInteractionMultimodalEmbedding"]

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 5d8e118d..be7256b9 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -20,7 +20,7 @@
     {
         "model": "AndrewOgn/colpali-v1.3-merged-onnx",
         "dim": 128,
-        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
+        "description": "Text embeddings, Bimodal (text/image), Aligned to image latent space, fp16 quantized, 2024.",
         "license": "mit",
         "size_in_GB": 10.08,
         "sources": {
             "hf": "AndrewOgn/colpali-v1.3-merged-onnx",
@@ -97,11 +97,12 @@ def __init__(
         self.cache_dir = define_cache_dir(cache_dir)
 
         self._model_dir = self.download_model(
-            self.model_description, self.cache_dir, local_files_only=self._local_files_only
+            self.model_description,
+            self.cache_dir,
+            local_files_only=self._local_files_only,
         )
         self.mask_token_id = None
         self.pad_token_id = None
-        self.skip_list = set()
 
         if not self.lazy_load:
             self.load_onnx_model()

diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
index 3d35c52f..33104bf7 100644
--- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
+++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -25,15 +25,18 @@ def list_supported_models(cls) -> list[dict[str, Any]]:
             ```
             [
                 {
-                    "model": "colpali",
-                    "dim": ...,
-                    "description": "Late interaction model",
-                    "license": "mit",
-                    "size_in_GB": 6.06,
-                    "sources": {
-                        "hf": "colpali",
-                    },
-                    "model_file": "model.onnx",
+                    "model": "AndrewOgn/colpali-v1.3-merged-onnx",
+                    "dim": 128,
+                    "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
+                    "license": "mit",
+                    "size_in_GB": 6.06,
+                    "sources": {
+                        "hf": "AndrewOgn/colpali-v1.3-merged-onnx",
+                    },
+                    "additional_files": [
+                        "model.onnx_data",
+                    ],
+                    "model_file": "model.onnx",
                 },
             ]
             ```
@@ -106,8 +109,7 @@ def embed_image(
         **kwargs,
     ) -> Iterable[np.ndarray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
+        Encode a list of images into list of embeddings.
 
         Args:
             images: Iterator of image paths or single image path to embed
@@ -118,6 +120,6 @@ def embed_image(
             If None, don't use data-parallel processing, use default onnxruntime threading instead.
 
         Returns:
-            List of embeddings, one per document
+            List of embeddings, one per image
         """
         yield from self.model.embed_image(images, batch_size, parallel, **kwargs)

diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
index cc1a929b..2ae0e34f 100644
--- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
+++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
@@ -51,9 +51,7 @@ def embed_image(
         **kwargs,
     ) -> Iterable[np.ndarray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Encode a list of images into list of embeddings.
         Args:
             images: Iterator of image paths or single image path to embed
             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
@@ -63,6 +61,6 @@ def embed_image(
             If None, don't use data-parallel processing, use default onnxruntime threading instead.
 
         Returns:
-            List of embeddings, one per document
+            List of embeddings, one per image
         """
         raise NotImplementedError()

From 78dcc33cd999476f0a66b1fb6f4cdce9696bae46 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Mon, 13 Jan 2025 14:00:51 +0100
Subject: [PATCH 12/17] query_max_length

---
 fastembed/late_interaction_multimodal/colpali.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index be7256b9..974dc7b0 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -43,6 +43,7 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda
     IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
     EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
     EVEN_ATTENTION_MASK = np.array([1] * 1030)
+    QUERY_MAX_LENGTH = 50
 
     def __init__(
@@ -165,7 +166,7 @@ def tokenize(self, documents: list[str], **_) -> list[Encoding]:
             query += "\n"
 
             texts_query.append(query)
-        encoded = self.tokenizer.encode_batch(texts_query)
+        encoded = self.tokenizer.encode_batch(texts_query, max_length=self.QUERY_MAX_LENGTH)
         return encoded

From 17fa789cc7c160cf9ef57e2b1064bf3fd0103068 Mon Sep 17 00:00:00 2001
From: "d.rudenko"
Date: Wed, 15 Jan 2025 11:12:03 +0100
Subject: [PATCH 13/17] black colpali

---
 fastembed/late_interaction_multimodal/colpali.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 974dc7b0..772df5cf 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -44,6 +44,7 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda
     EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
     EVEN_ATTENTION_MASK = np.array([1] * 1030)
     QUERY_MAX_LENGTH = 50
+    VISUAL_PROMPT_PREFIX = "Describe the image."
 
     def __init__(
@@ -195,6 +196,7 @@ def _preprocess_onnx_image_input(
         Returns:
             Dict[str, np.ndarray]: ONNX input with text placeholders.
""" + onnx_input["input_ids"] = np.array( [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["input_ids"]] ) From 5518a88095b614352adf184a15728d65a18e8c0d Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 16 Jan 2025 17:54:20 +0100 Subject: [PATCH 14/17] Added comment for EMPTY_TEXT_PLACEHOLDER --- fastembed/late_interaction_multimodal/colpali.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 772df5cf..300504e7 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -41,7 +41,9 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda PAD_TOKEN = "" QUERY_MARKER_TOKEN_ID = [2, 5098] IMAGE_PLACEHOLDER_SIZE = (3, 448, 448) - EMPTY_TEXT_PLACEHOLDER = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108]) + EMPTY_TEXT_PLACEHOLDER = np.array( + [257152] * 1024 + [2, 50721, 573, 2416, 235265, 108] + ) # This is a tokenization of '' * 1024 + 'Describe the image.\n' line which is used as placeholder while processing just image EVEN_ATTENTION_MASK = np.array([1] * 1030) QUERY_MAX_LENGTH = 50 VISUAL_PROMPT_PREFIX = "Describe the image." From 3a0b570dbe2ff422d7453d008ccc0e32b7da5d8a Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 16 Jan 2025 19:43:45 +0100 Subject: [PATCH 15/17] Review fixes --- .../late_interaction_multimodal/onnx_multimodal_model.py | 4 ++-- tests/test_late_interaction_multimodal.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index c9a12b22..7f084fb8 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -2,7 +2,7 @@ import os from multiprocessing import get_all_start_methods from pathlib import Path -from typing import Any, Iterable, Optional, Sequence, Type, Union +from typing import Any, Iterable, Optional, Sequence, Type, Union, get_args import numpy as np from PIL import Image @@ -186,7 +186,7 @@ def _embed_images( ) -> Iterable[T]: is_small = False - if isinstance(images, (str, Path, Image.Image)): + if isinstance(images, get_args(ImageInput)): images = [images] is_small = True diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 9d7c0b90..dc75d9df 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -58,7 +58,6 @@ def test_batch_embedding(): for value in result: batch_size, token_num, abridged_dim = expected_result.shape assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=1e-3) - break if is_ci: delete_model_cache(model.model._model_dir) From d9f53fbe92e6dd4f47ee13ff578128627f525d3d Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Mon, 20 Jan 2025 15:42:29 +0100 Subject: [PATCH 16/17] Removed redundant VISUAL_PROMPT_PREFIX --- fastembed/late_interaction_multimodal/colpali.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 300504e7..348966dc 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -46,7 +46,6 @@ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[np.nda ) # This is a 
tokenization of '' * 1024 + 'Describe the image.\n' line which is used as placeholder while processing just image EVEN_ATTENTION_MASK = np.array([1] * 1030) QUERY_MAX_LENGTH = 50 - VISUAL_PROMPT_PREFIX = "Describe the image." def __init__( self, From 40b322b4d3203863d5a82a507f9b6bd49f7d469c Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 23 Jan 2025 16:55:36 +0100 Subject: [PATCH 17/17] type fix + model info --- fastembed/late_interaction_multimodal/colpali.py | 3 +-- .../late_interaction_multimodal/onnx_multimodal_model.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 348966dc..f0bbcdfe 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -15,14 +15,13 @@ ImageEmbeddingWorker, ) - supported_colpali_models = [ { "model": "AndrewOgn/colpali-v1.3-merged-onnx", "dim": 128, "description": "Text embeddings, Bimodal (text/image), Aligned to image latent space, fp16 quantized, 2024.", "license": "mit", - "size_in_GB": 10.08, + "size_in_GB": 6.5, "sources": { "hf": "AndrewOgn/colpali-v1.3-merged-onnx", }, diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 7f084fb8..719dfd70 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -2,7 +2,7 @@ import os from multiprocessing import get_all_start_methods from pathlib import Path -from typing import Any, Iterable, Optional, Sequence, Type, Union, get_args +from typing import Any, Iterable, Optional, Sequence, Type, Union import numpy as np from PIL import Image @@ -186,7 +186,7 @@ def _embed_images( ) -> Iterable[T]: is_small = False - if isinstance(images, get_args(ImageInput)): + if isinstance(images, Image.Image): images = [images] is_small = True
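For reviewers who want to try the branch end to end, here is a minimal usage sketch of the API this series introduces, assuming the final state of patch 17. The image path is illustrative, and the MaxSim scoring helper at the end is not part of this PR; it only shows how the returned multivectors are typically consumed:

    from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding
    import numpy as np

    model = LateInteractionMultimodalEmbedding(model_name="AndrewOgn/colpali-v1.3-merged-onnx")

    # Each input yields a multivector with one 128-dim row per text token
    # or per image patch.
    doc_embeddings = list(model.embed_image(["page.jpeg"]))  # illustrative path
    query_embeddings = list(model.embed_text(["what is late interaction?"]))

    def maxsim_score(query: np.ndarray, doc: np.ndarray) -> float:
        # Late-interaction relevance: match every query token against its
        # best-scoring document token and sum the similarities.
        return float((query @ doc.T).max(axis=1).sum())

    print(maxsim_score(query_embeddings[0], doc_embeddings[0]))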