From 5adf3f6e8e28e680fc8b3e4b32f9630f0baa9362 Mon Sep 17 00:00:00 2001
From: Anush008 <anushshetty90@gmail.com>
Date: Tue, 30 Jan 2024 16:32:30 +0530
Subject: [PATCH 1/3] chore: xenova jina

---
 fastembed/embedding.py | 13 +++++++++++++
 fastembed/models.json  |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/fastembed/embedding.py b/fastembed/embedding.py
index 52cf4207..41e2958f 100644
--- a/fastembed/embedding.py
+++ b/fastembed/embedding.py
@@ -95,6 +95,9 @@ def __init__(
         model_path = self.path / "model.onnx"
         optimized_model_path = self.path / "model_optimized.onnx"
 
+        xenova_model_path = self.path / "onnx" / "model.onnx"
+        xenova_optimized_model_path = self.path / "onnx" / "model_optimized.onnx"
+
         # List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
         onnx_providers = ["CPUExecutionProvider"]
 
@@ -102,6 +105,16 @@ def __init__(
             # Rename file model_optimized.onnx to model.onnx if it exists
             if optimized_model_path.exists():
                 optimized_model_path.rename(model_path)
+
+            # Patch for inconsistent repo structure at
+            # - https://huggingface.co/Xenova/jina-embeddings-v2-small-en
+            # - https://huggingface.co/Xenova/jina-embeddings-v2-base-en
+            elif xenova_model_path.exists():
+                model_path = xenova_model_path
+
+            elif xenova_optimized_model_path.exists():
+                model_path = xenova_optimized_model_path
+
             else:
                 raise ValueError(f"Could not find model.onnx in {self.path}")
 
diff --git a/fastembed/models.json b/fastembed/models.json
index 1c4e7b5d..7aea39ad 100644
--- a/fastembed/models.json
+++ b/fastembed/models.json
@@ -83,7 +83,7 @@
         "description": " English embedding model supporting 8192 sequence length",
         "size_in_GB": 0.55,
         "hf_sources": [
-            "jinaai/jina-embeddings-v2-base-en"
+            "xenova/jina-embeddings-v2-base-en"
         ],
         "compressed_url_sources": []
     },
@@ -93,7 +93,7 @@
         "description": " English embedding model supporting 8192 sequence length",
         "size_in_GB": 0.13,
         "hf_sources": [
-            "jinaai/jina-embeddings-v2-small-en"
+            "xenova/jina-embeddings-v2-small-en"
         ],
         "compressed_url_sources": []
     },

From 0bd761f14881c8772f88b7fd7f6bace0c47e596b Mon Sep 17 00:00:00 2001
From: Anush008 <anushshetty90@gmail.com>
Date: Tue, 30 Jan 2024 17:14:17 +0530
Subject: [PATCH 2/3] chore: try recursive model location

---
 fastembed/embedding.py | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/fastembed/embedding.py b/fastembed/embedding.py
index 41e2958f..f84e189f 100644
--- a/fastembed/embedding.py
+++ b/fastembed/embedding.py
@@ -35,6 +35,18 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
         yield b
 
 
+def locate_model_file(model_dir: Path, file_names: list):
+    if not model_dir.is_dir():
+        raise ValueError(f"Provided model path '{model_dir}' is not a directory.")
+
+    for path in model_dir.rglob("*"):
+        for file_name in file_names:
+            if path.is_file() and path.name == file_name:
+                return path
+
+    raise ValueError(f"Could not find model file in {model_dir}")
+
+
 def normalize(input_array, p=2, dim=1, eps=1e-12):
     # Calculate the Lp norm along the specified dimension
     norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
@@ -92,32 +104,11 @@ def __init__(
     ):
         self.path = path
         self.model_name = model_name
-        model_path = self.path / "model.onnx"
-        optimized_model_path = self.path / "model_optimized.onnx"
-
-        xenova_model_path = self.path / "onnx" / "model.onnx"
-        xenova_optimized_model_path = self.path / "onnx" / "model_optimized.onnx"
+        model_path = locate_model_file(self.path, ["model.onnx", "model_optimized.onnx"])
 
         # List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
         onnx_providers = ["CPUExecutionProvider"]
 
-        if not model_path.exists():
-            # Rename file model_optimized.onnx to model.onnx if it exists
-            if optimized_model_path.exists():
-                optimized_model_path.rename(model_path)
-
-            # Patch for inconsistent repo structure at
-            # - https://huggingface.co/Xenova/jina-embeddings-v2-small-en
-            # - https://huggingface.co/Xenova/jina-embeddings-v2-base-en
-            elif xenova_model_path.exists():
-                model_path = xenova_model_path
-
-            elif xenova_optimized_model_path.exists():
-                model_path = xenova_optimized_model_path
-
-            else:
-                raise ValueError(f"Could not find model.onnx in {self.path}")
-
         # Hacky support for multilingual model
         self.exclude_token_type_ids = False
         if model_name == "intfloat/multilingual-e5-large":

From fd6b9f9378536d6193cdef378603225885c9ab1a Mon Sep 17 00:00:00 2001
From: Anush008 <anushshetty90@gmail.com>
Date: Tue, 30 Jan 2024 17:22:32 +0530
Subject: [PATCH 3/3] chore: updated docstring, glob pattern

---
 fastembed/embedding.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fastembed/embedding.py b/fastembed/embedding.py
index f84e189f..620b5d5e 100644
--- a/fastembed/embedding.py
+++ b/fastembed/embedding.py
@@ -35,16 +35,19 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
         yield b
 
 
-def locate_model_file(model_dir: Path, file_names: list):
+def locate_model_file(model_dir: Path, file_names: List[str]):
+    """
+    Find the model path for both the Transformers.js-style `onnx` subdirectory structure and the direct model weights structure used by Optimum and Qdrant
+    """
     if not model_dir.is_dir():
         raise ValueError(f"Provided model path '{model_dir}' is not a directory.")
 
-    for path in model_dir.rglob("*"):
+    for path in model_dir.rglob("*.onnx"):
         for file_name in file_names:
             if path.is_file() and path.name == file_name:
                 return path
 
-    raise ValueError(f"Could not find model file in {model_dir}")
+    raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}")
 
 
 def normalize(input_array, p=2, dim=1, eps=1e-12):