Skip to content

Commit

Permalink
Clip (#219)
Browse files Browse the repository at this point in the history
* wip: init image embeddings

* new: add clip

* new: fix clip text embedding

* fix: fix image parallel

* fix: add test images

* fix: add PIL

* fix: fix generics

* fix: fix sparse worker

* fix: fix image test path

* fix: replace models repo

* new: follow-up for onnx providers and local_files_only option

* fix: add types, refactor a bit

* refactoring: move onnxprovider type alias to types

* fix: fix type alias import
  • Loading branch information
joein authored May 9, 2024
1 parent 6ecab6d commit 99164c9
Show file tree
Hide file tree
Showing 28 changed files with 920 additions and 120 deletions.
5 changes: 4 additions & 1 deletion fastembed/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import importlib.metadata

from fastembed.image import ImageEmbedding
from fastembed.text import TextEmbedding
from fastembed.sparse import SparseTextEmbedding, SparseEmbedding


try:
version = importlib.metadata.version("fastembed")
except importlib.metadata.PackageNotFoundError as _:
version = importlib.metadata.version("fastembed-gpu")

__version__ = version
__all__ = ["TextEmbedding", "SparseTextEmbedding", "SparseEmbedding"]
__all__ = ["TextEmbedding", "SparseTextEmbedding", "SparseEmbedding", "ImageEmbedding"]

4 changes: 2 additions & 2 deletions fastembed/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from fastembed.common.onnx_model import OnnxProvider
from fastembed.common.types import OnnxProvider, ImageInput, PathInput

__all__ = ["OnnxProvider"]
__all__ = ["OnnxProvider", "ImageInput", "PathInput"]
1 change: 1 addition & 0 deletions fastembed/common/model_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def download_files_from_huggingface(
"tokenizer.json",
"tokenizer_config.json",
"special_tokens_map.json",
"preprocessor_config.json",
]
if extra_patterns is not None:
allow_patterns.extend(extra_patterns)
Expand Down
84 changes: 6 additions & 78 deletions fastembed/common/onnx_model.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,16 @@
import os
from multiprocessing import get_all_start_methods
from pathlib import Path
from typing import (
Any,
Dict,
Generic,
Iterable,
List,
Optional,
Tuple,
Type,
TypeVar,
Union,
Sequence,
)
from typing import Any, Dict, Generic, Iterable, Optional, Tuple, Type, TypeVar, Sequence

import numpy as np
import onnxruntime as ort

from fastembed.common.models import load_tokenizer
from fastembed.common.utils import iter_batch
from fastembed.parallel_processor import ParallelWorkerPool, Worker
from fastembed.common.types import OnnxProvider
from fastembed.parallel_processor import Worker


# Holds type of the embedding result
T = TypeVar("T")

OnnxProvider = Union[str, Tuple[str, Dict[Any, Any]]]


class OnnxModel(Generic[T]):
@classmethod
Expand Down Expand Up @@ -76,65 +59,12 @@ def load_onnx_model(
so.intra_op_num_threads = threads
so.inter_op_num_threads = threads

self.tokenizer = load_tokenizer(model_dir=model_dir)
self.model = ort.InferenceSession(
str(model_path), providers=onnx_providers, sess_options=so
)

def onnx_embed(self, documents: List[str]) -> Tuple[np.ndarray, np.ndarray]:
encoded = self.tokenizer.encode_batch(documents)
input_ids = np.array([e.ids for e in encoded])
attention_mask = np.array([e.attention_mask for e in encoded])

onnx_input = {
"input_ids": np.array(input_ids, dtype=np.int64),
"attention_mask": np.array(attention_mask, dtype=np.int64),
"token_type_ids": np.array(
[np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64
),
}

onnx_input = self._preprocess_onnx_input(onnx_input)

model_output = self.model.run(None, onnx_input)
embeddings = model_output[0]
return embeddings, attention_mask

def _embed_documents(
self,
model_name: str,
cache_dir: str,
documents: Union[str, Iterable[str]],
batch_size: int = 256,
parallel: Optional[int] = None,
) -> Iterable[T]:
is_small = False

if isinstance(documents, str):
documents = [documents]
is_small = True

if isinstance(documents, list):
if len(documents) < batch_size:
is_small = True

if parallel == 0:
parallel = os.cpu_count()

if parallel is None or is_small:
for batch in iter_batch(documents, batch_size):
yield from self._post_process_onnx_output(self.onnx_embed(batch))
else:
start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
params = {
"model_name": model_name,
"cache_dir": cache_dir,
}
pool = ParallelWorkerPool(
parallel, self._get_worker_class(), start_method=start_method
)
for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
yield from self._post_process_onnx_output(batch)
def onnx_embed(self, *args, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
raise NotImplementedError("Subclasses must implement this method")


class EmbeddingWorker(Worker):
Expand All @@ -160,6 +90,4 @@ def start(cls, model_name: str, cache_dir: str, **kwargs: Any) -> "EmbeddingWork
)

def process(self, items: Iterable[Tuple[int, Any]]) -> Iterable[Tuple[int, Any]]:
for idx, batch in items:
embeddings, attn_mask = self.model.onnx_embed(batch)
yield idx, (embeddings, attn_mask)
raise NotImplementedError("Subclasses must implement this method")
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import json
from pathlib import Path

import numpy as np
from tokenizers import Tokenizer, AddedToken

from fastembed.image.transform.operators import Compose


def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
config_path = model_dir / "config.json"
Expand Down Expand Up @@ -46,9 +47,12 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
return tokenizer


def normalize(input_array, p=2, dim=1, eps=1e-12) -> np.ndarray:
# Calculate the Lp norm along the specified dimension
norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
norm = np.maximum(norm, eps) # Avoid division by zero
normalized_array = input_array / norm
return normalized_array
def load_preprocessor(model_dir: Path) -> Compose:
preprocessor_config_path = model_dir / "preprocessor_config.json"
if not preprocessor_config_path.exists():
raise ValueError(f"Could not find preprocessor_config.json in {model_dir}")

with open(str(preprocessor_config_path)) as preprocessor_config_file:
preprocessor_config = json.load(preprocessor_config_file)
transforms = Compose.from_config(preprocessor_config)
return transforms
14 changes: 14 additions & 0 deletions fastembed/common/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os
import sys
from typing import Union, Iterable, Tuple, Dict, Any

if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias


PathInput: TypeAlias = Union[str, os.PathLike]
ImageInput: TypeAlias = Union[PathInput, Iterable[PathInput]]

OnnxProvider: TypeAlias = Union[str, Tuple[str, Dict[Any, Any]]]
10 changes: 10 additions & 0 deletions fastembed/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@
from pathlib import Path
from typing import Union, Iterable, Generator, Optional

import numpy as np


def normalize(input_array, p=2, dim=1, eps=1e-12) -> np.ndarray:
# Calculate the Lp norm along the specified dimension
norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
norm = np.maximum(norm, eps) # Avoid division by zero
normalized_array = input_array / norm
return normalized_array


def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
"""
Expand Down
4 changes: 4 additions & 0 deletions fastembed/image/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from fastembed.image.image_embedding import ImageEmbedding


__all__ = ["ImageEmbedding"]
87 changes: 87 additions & 0 deletions fastembed/image/image_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import Any, Dict, Iterable, List, Optional, Type, Sequence

import numpy as np

from fastembed.common import ImageInput, OnnxProvider
from fastembed.image.image_embedding_base import ImageEmbeddingBase
from fastembed.image.onnx_embedding import OnnxImageEmbedding


class ImageEmbedding(ImageEmbeddingBase):
EMBEDDINGS_REGISTRY: List[Type[ImageEmbeddingBase]] = [OnnxImageEmbedding]

@classmethod
def list_supported_models(cls) -> List[Dict[str, Any]]:
"""
Lists the supported models.
Returns:
List[Dict[str, Any]]: A list of dictionaries containing the model information.
Example:
```
[
{
"model": "Qdrant/clip-ViT-B-32-vision",
"dim": 512,
"description": "CLIP vision encoder based on ViT-B/32",
"size_in_GB": 0.33,
"sources": {
"hf": "Qdrant/clip-ViT-B-32-vision",
},
"model_file": "model.onnx",
}
]
```
"""
result = []
for embedding in cls.EMBEDDINGS_REGISTRY:
result.extend(embedding.list_supported_models())
return result

def __init__(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence[OnnxProvider]] = None,
**kwargs,
):
super().__init__(model_name, cache_dir, threads, **kwargs)

for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
supported_models = EMBEDDING_MODEL_TYPE.list_supported_models()
if any(model_name.lower() == model["model"].lower() for model in supported_models):
self.model = EMBEDDING_MODEL_TYPE(
model_name, cache_dir, threads, providers=providers, **kwargs
)
return

raise ValueError(
f"Model {model_name} is not supported in TextEmbedding."
"Please check the supported models using `TextEmbedding.list_supported_models()`"
)

def embed(
self,
images: ImageInput,
batch_size: int = 16,
parallel: Optional[int] = None,
**kwargs,
) -> Iterable[np.ndarray]:
"""
Encode a list of documents into list of embeddings.
We use mean pooling with attention so that the model can handle variable-length inputs.
Args:
images: Iterator of image paths or single image path to embed
batch_size: Batch size for encoding -- higher values will use more memory, but be faster
parallel:
If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
If 0, use all available cores.
If None, don't use data-parallel processing, use default onnxruntime threading instead.
Returns:
List of embeddings, one per document
"""
yield from self.model.embed(images, batch_size, parallel, **kwargs)
39 changes: 39 additions & 0 deletions fastembed/image/image_embedding_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Iterable, Optional

import numpy as np

from fastembed.common.model_management import ModelManagement
from fastembed.common.types import ImageInput


class ImageEmbeddingBase(ModelManagement):
def __init__(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
**kwargs,
):
self.model_name = model_name
self.cache_dir = cache_dir
self.threads = threads
self._local_files_only = kwargs.pop("local_files_only", False)

def embed(
self,
images: ImageInput,
batch_size: int = 16,
parallel: Optional[int] = None,
**kwargs,
) -> Iterable[np.ndarray]:
"""
Embeds a list of images into a list of embeddings.
Args:
images - The list of image paths to preprocess and embed.
**kwargs: Additional keyword argument to pass to the embed method.
Yields:
Iterable[np.ndarray]: The embeddings.
"""
raise NotImplementedError()
Loading

0 comments on commit 99164c9

Please sign in to comment.