Commit

feat: Quantized models

Anush008 committed Apr 16, 2024
1 parent 9bad443 commit 49edfa3
Showing 7 changed files with 97 additions and 54 deletions.
12 changes: 6 additions & 6 deletions fastembed/common/model_management.py
@@ -2,14 +2,16 @@
 import shutil
 import tarfile
 from pathlib import Path
-from typing import List, Optional, Dict, Any
+from typing import List, Literal, Optional, Dict, Any, Tuple
 
 import requests
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import RepositoryNotFoundError
 from tqdm import tqdm
 from loguru import logger
 
+SOURCE = Literal["hf", "gcs"]
+
 
 def locate_model_file(model_dir: Path, file_names: List[str]) -> Path:
     """
@@ -118,8 +120,6 @@ def download_files_from_huggingface(
         return snapshot_download(
             repo_id=hf_source_repo,
             allow_patterns=[
-                "*.onnx",
-                "*.onnx_data",
                 "config.json",
                 "tokenizer.json",
                 "tokenizer_config.json",
@@ -200,7 +200,7 @@ def retrieve_model_gcs(cls, model_name: str, source_url: str, cache_dir: str) -> Path:
         return model_dir
 
     @classmethod
-    def download_model(cls, model: Dict[str, Any], cache_dir: Path) -> Path:
+    def download_repo_files(cls, model: Dict[str, Any], cache_dir: Path) -> Tuple[Path, SOURCE]:
         """
         Downloads a model from HuggingFace Hub or Google Cloud Storage.
@@ -232,14 +232,14 @@ def download_model(cls, model: Dict[str, Any], cache_dir: Path) -> Path:
             try:
                 return Path(
                     cls.download_files_from_huggingface(hf_source, cache_dir=str(cache_dir))
-                )
+                ), "hf"
             except (EnvironmentError, RepositoryNotFoundError, ValueError) as e:
                 logger.error(
                     f"Could not download model from HuggingFace: {e}"
                     "Falling back to other sources."
                 )
 
         if url_source:
-            return cls.retrieve_model_gcs(model["model"], url_source, str(cache_dir))
+            return cls.retrieve_model_gcs(model["model"], url_source, str(cache_dir)), "gcs"
 
         raise ValueError(f"Could not download model {model['model']} from any source.")
35 changes: 31 additions & 4 deletions fastembed/common/onnx_model.py
@@ -6,11 +6,13 @@
 import numpy as np
 import onnxruntime as ort
 
-from fastembed.common.model_management import locate_model_file
+from fastembed.common.model_management import SOURCE, locate_model_file
 from fastembed.common.models import load_tokenizer
 from fastembed.common.utils import iter_batch
 from fastembed.parallel_processor import ParallelWorkerPool, Worker
 
+from huggingface_hub import hf_hub_download
+
 # Holds type of the embedding result
 T = TypeVar("T")

@@ -34,8 +36,33 @@ def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
         """
         return onnx_input
 
-    def load_onnx_model(self, model_dir: Path, threads: Optional[int], max_length: int) -> None:
-        model_path = locate_model_file(model_dir, ["model.onnx", "model_optimized.onnx"])
+    def load_onnx_model(
+        self,
+        model_dir: Path,
+        threads: Optional[int],
+        cache_dir: Path,
+        model_description: dict,
+        source: SOURCE,
+    ) -> None:
+        if source == "gcs":
+            model_path = locate_model_file(model_dir, ["model.onnx", "model_optimized.onnx"])
+        elif source == "hf":
+            # For HuggingFace sources, the model file is conditionally downloaded
+            repo_id = model_description["sources"]["hf"]
+            model_file = model_description["model_file"]
+
+            # Some models require additional repo files.
+            # For eg: intfloat/multilingual-e5-large requires the model.onnx_data file.
+            # These can be specified within the "additional_files" option when describing the model properties
+            if additional_files := model_description.get("additional_files"):
+                for file in additional_files:
+                    hf_hub_download(repo_id=repo_id, filename=file, cache_dir=str(cache_dir))
+
+            model_path = hf_hub_download(
+                repo_id=repo_id, filename=model_file, cache_dir=str(cache_dir)
+            )
+        else:
+            raise ValueError(f"Unknown source: {source}")
 
         # List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
         onnx_providers = ["CPUExecutionProvider"]
@@ -50,7 +77,7 @@ def load_onnx_model(self, model_dir: Path, threads: Optional[int], max_length: int) -> None:
             so.intra_op_num_threads = threads
             so.inter_op_num_threads = threads
 
-        self.tokenizer = load_tokenizer(model_dir=model_dir, max_length=max_length)
+        self.tokenizer = load_tokenizer(model_dir=model_dir)
         self.model = ort.InferenceSession(
             str(model_path), providers=onnx_providers, sess_options=so
         )
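A self-contained sketch of the per-file HuggingFace path introduced above. hf_hub_download is the real huggingface_hub API; the repo id and file names mirror the multilingual-e5-large entry changed later in this commit and stand in for any registry entry:

from huggingface_hub import hf_hub_download

cache_dir = "model_cache"  # hypothetical local cache
description = {
    "sources": {"hf": "qdrant/multilingual-e5-large-onnx"},
    "model_file": "model.onnx",
    "additional_files": ["model.onnx_data"],  # external ONNX weights
}

repo_id = description["sources"]["hf"]

# Pull companion files first so ONNX Runtime finds them next to the graph.
for extra in description.get("additional_files", []):
    hf_hub_download(repo_id=repo_id, filename=extra, cache_dir=cache_dir)

# Fetch only the requested .onnx variant instead of snapshotting the repo.
model_path = hf_hub_download(
    repo_id=repo_id, filename=description["model_file"], cache_dir=cache_dir
)
print(model_path)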
26 changes: 16 additions & 10 deletions fastembed/sparse/splade_pp.py
@@ -15,6 +15,7 @@
         "sources": {
             "hf": "Qdrant/SPLADE_PP_en_v1",
         },
+        "model_file": "model.onnx",
     },
     {
         "model": "prithivida/Splade_PP_en_v1",
@@ -24,6 +25,7 @@
         "sources": {
             "hf": "Qdrant/SPLADE_PP_en_v1",
         },
+        "model_file": "model.onnx",
     },
 ]
 
@@ -76,15 +78,19 @@ def __init__(
         """
 
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
-        self.model_name = model_name
-        self._model_description = self._get_model_description(model_name)
-
-        self._cache_dir = define_cache_dir(cache_dir)
-        self._model_dir = self.download_model(self._model_description, self._cache_dir)
-        self._max_length = 512
-
-        self.load_onnx_model(self._model_dir, self.threads, self._max_length)
+        model_description = self._get_model_description(model_name)
+        cache_dir = define_cache_dir(cache_dir)
+
+        model_dir, source = self.download_repo_files(model_description, cache_dir)
+
+        self.load_onnx_model(
+            model_dir,
+            threads,
+            cache_dir,
+            model_description,
+            source,
+        )
 
     def embed(
         self,
@@ -110,7 +116,7 @@
         """
         yield from self._embed_documents(
             model_name=self.model_name,
-            cache_dir=str(self._cache_dir),
+            cache_dir=str(self.cache_dir),
             documents=documents,
             batch_size=batch_size,
             parallel=parallel,
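For context, a usage sketch of the SPLADE model whose registry entries gained the model_file key, assuming the public SparseTextEmbedding wrapper from the released fastembed package:

from fastembed import SparseTextEmbedding

# The registry entry above pins the exact ONNX file (model.onnx) to fetch from HF.
model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")

for emb in model.embed(["fastembed now pins model files per registry entry"]):
    # SPLADE output is sparse: vocabulary indices paired with activation weights.
    print(emb.indices[:5], emb.values[:5])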
3 changes: 3 additions & 0 deletions fastembed/text/e5_onnx_embedding.py
@@ -15,6 +15,8 @@
             "url": "https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz",
             "hf": "qdrant/multilingual-e5-large-onnx",
         },
+        "model_file": "model.onnx",
+        "additional_files": ["model.onnx_data"],
     },
     {
         "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
@@ -24,6 +26,7 @@
         "sources": {
             "hf": "xenova/paraphrase-multilingual-mpnet-base-v2",
         },
+        "model_file": "onnx/model.onnx",
     },
 ]
 
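The multilingual-e5-large graph stores its tensors in ONNX external data, which is why its entry lists model.onnx_data under additional_files. A sketch of why both files must land in the same directory; the local layout shown is hypothetical:

import onnxruntime as ort

# Hypothetical layout after download; both files in one directory:
#   model_cache/model.onnx       <- graph, references external weight storage
#   model_cache/model.onnx_data  <- tensor data, resolved relative to the graph
session = ort.InferenceSession(
    "model_cache/model.onnx", providers=["CPUExecutionProvider"]
)
print([inp.name for inp in session.get_inputs()])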
2 changes: 2 additions & 0 deletions fastembed/text/jina_onnx_embedding.py
@@ -13,13 +13,15 @@
         "description": "English embedding model supporting 8192 sequence length",
         "size_in_GB": 0.52,
         "sources": {"hf": "xenova/jina-embeddings-v2-base-en"},
+        "model_file": "onnx/model.onnx",
     },
     {
         "model": "jinaai/jina-embeddings-v2-small-en",
         "dim": 512,
         "description": "English embedding model supporting 8192 sequence length",
         "size_in_GB": 0.12,
         "sources": {"hf": "xenova/jina-embeddings-v2-small-en"},
+        "model_file": "onnx/model.onnx",
     },
 ]
 
70 changes: 36 additions & 34 deletions fastembed/text/onnx_embedding.py
@@ -16,6 +16,7 @@
         "sources": {
             "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz",
         },
+        "model_file": "model.onnx",
     },
     {
         "model": "BAAI/bge-base-en-v1.5",
@@ -26,6 +27,7 @@
             "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz",
             "hf": "qdrant/bge-base-en-v1.5-onnx-q",
         },
+        "model_file": "model_optimized.onnx",
     },
     {
         "model": "BAAI/bge-large-en-v1.5",
@@ -35,6 +37,7 @@
         "sources": {
             "hf": "qdrant/bge-large-en-v1.5-onnx",
         },
+        "model_file": "model.onnx",
     },
     {
         "model": "BAAI/bge-small-en",
@@ -44,18 +47,8 @@
         "sources": {
             "url": "https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz",
         },
+        "model_file": "onnx/model.onnx",
     },
-    # {
-    # "model": "BAAI/bge-small-en",
-    # "dim": 384,
-    # "description": "Fast English model",
-    # "size_in_GB": 0.2,
-    # "hf_sources": [],
-    # "compressed_url_sources": [
-    # "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en.tar.gz",
-    # "https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz"
-    # ]
-    # },
     {
         "model": "BAAI/bge-small-en-v1.5",
         "dim": 384,
@@ -64,6 +57,7 @@
         "sources": {
             "hf": "qdrant/bge-small-en-v1.5-onnx-q",
         },
+        "model_file": "model_optimized.onnx",
     },
     {
         "model": "BAAI/bge-small-zh-v1.5",
@@ -73,6 +67,7 @@
         "sources": {
             "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
         },
+        "model_file": "onnx/model.onnx",
     },
     {
         "model": "sentence-transformers/all-MiniLM-L6-v2",
@@ -83,6 +78,7 @@
             "url": "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz",
             "hf": "qdrant/all-MiniLM-L6-v2-onnx",
         },
+        "model_file": "model.onnx",
     },
     {
         "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
@@ -92,6 +88,7 @@
         "sources": {
             "hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
         },
+        "model_file": "model_optimized.onnx",
     },
     {
         "model": "nomic-ai/nomic-embed-text-v1",
@@ -101,6 +98,7 @@
         "sources": {
             "hf": "nomic-ai/nomic-embed-text-v1",
         },
+        "model_file": "onnx/model.onnx",
     },
     {
         "model": "nomic-ai/nomic-embed-text-v1.5",
@@ -110,6 +108,17 @@
         "sources": {
             "hf": "nomic-ai/nomic-embed-text-v1.5",
         },
+        "model_file": "onnx/model.onnx",
+    },
+    {
+        "model": "nomic-ai/nomic-embed-text-v1.5-Q",
+        "dim": 768,
+        "description": "Quantized 8192 context length english model",
+        "size_in_GB": 0.13,
+        "sources": {
+            "hf": "nomic-ai/nomic-embed-text-v1.5",
+        },
+        "model_file": "onnx/model_quantized.onnx",
     },
     {
         "model": "thenlper/gte-large",
@@ -119,20 +128,8 @@
         "sources": {
             "hf": "qdrant/gte-large-onnx",
         },
+        "model_file": "model_optimized.onnx",
     },
-    # {
-    # "model": "sentence-transformers/all-MiniLM-L6-v2",
-    # "dim": 384,
-    # "description": "Sentence Transformer model, MiniLM-L6-v2",
-    # "size_in_GB": 0.09,
-    # "hf_sources": [
-    # "qdrant/all-MiniLM-L6-v2-onnx"
-    # ],
-    # "compressed_url_sources": [
-    # "https://storage.googleapis.com/qdrant-fastembed/fast-all-MiniLM-L6-v2.tar.gz",
-    # "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz"
-    # ]
-    # }
     {
         "model": "mixedbread-ai/mxbai-embed-large-v1",
         "dim": 1024,
@@ -141,6 +138,7 @@
         "sources": {
             "hf": "mixedbread-ai/mxbai-embed-large-v1",
         },
+        "model_file": "onnx/model.onnx",
     },
 ]

@@ -178,15 +176,19 @@ def __init__(
         """
 
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
-        self.model_name = model_name
-        self._model_description = self._get_model_description(model_name)
-
-        self._cache_dir = define_cache_dir(cache_dir)
-        self._model_dir = self.download_model(self._model_description, self._cache_dir)
-        self._max_length = 512
-
-        self.load_onnx_model(self._model_dir, self.threads, self._max_length)
+        model_description = self._get_model_description(model_name)
+        cache_dir = define_cache_dir(cache_dir)
+
+        model_dir, source = self.download_repo_files(model_description, cache_dir)
+
+        self.load_onnx_model(
+            model_dir,
+            threads,
+            cache_dir,
+            model_description,
+            source,
+        )
 
     def embed(
         self,
@@ -212,7 +214,7 @@
         """
         yield from self._embed_documents(
             model_name=self.model_name,
-            cache_dir=str(self._cache_dir),
+            cache_dir=str(self.cache_dir),
            documents=documents,
             batch_size=batch_size,
             parallel=parallel,
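A usage sketch for the quantized entry added above, assuming fastembed's public TextEmbedding wrapper; it reuses the nomic-embed-text-v1.5 HF repo while model_file selects the quantized graph:

from fastembed import TextEmbedding

# Same HF repo as nomic-embed-text-v1.5; model_file picks the quantized
# onnx/model_quantized.onnx graph, a 0.13 GB download per the entry above.
model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5-Q")

embeddings = list(model.embed(["quantized models shrink the download"]))
print(len(embeddings[0]))  # 768, matching the entry's dim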
3 changes: 3 additions & 0 deletions tests/test_text_onnx_embeddings.py
@@ -28,6 +28,9 @@
     ),
     "thenlper/gte-large": np.array([-0.01920587, 0.00113156, -0.00708992, -0.00632304, -0.04025577]),
     "mixedbread-ai/mxbai-embed-large-v1": np.array([0.02295546, 0.03196154, 0.016512, -0.04031524, -0.0219634]),
+    "nomic-ai/nomic-embed-text-v1.5-Q": np.array(
+        [-0.04514299, -0.00462366, -0.18909897, -0.0071826, 0.00678478]
+    ),
 }
 
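The canonical values above are typically compared against the leading dimensions of a freshly computed embedding. A hedged sketch of such a check; the probe text, tolerance, and test name are assumptions, not the repository's actual test:

import numpy as np
from fastembed import TextEmbedding

CANONICAL_VECTOR_VALUES = {
    "nomic-ai/nomic-embed-text-v1.5-Q": np.array(
        [-0.04514299, -0.00462366, -0.18909897, -0.0071826, 0.00678478]
    ),
}


def test_quantized_canonical_values():
    model_name = "nomic-ai/nomic-embed-text-v1.5-Q"
    model = TextEmbedding(model_name=model_name)
    # Embed a fixed probe document and compare the first few dimensions.
    embedding = next(iter(model.embed(["hello world"])))
    expected = CANONICAL_VECTOR_VALUES[model_name]
    assert np.allclose(embedding[: expected.shape[0]], expected, atol=1e-3)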
