Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: patch for new Xenova Jina source #102

Merged
merged 3 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions fastembed/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
yield b


def locate_model_file(model_dir: Path, file_names: list):
    """Return the path of the first matching model file found under ``model_dir``.

    Candidates are tried in the order given by ``file_names``, so earlier
    entries take priority regardless of filesystem traversal order (the
    original scanned ``rglob("*")`` and returned whichever candidate the
    traversal reached first, which was nondeterministic across platforms).

    Args:
        model_dir: Directory to search recursively.
        file_names: Candidate file names, in priority order.

    Returns:
        Path: The located model file.

    Raises:
        ValueError: If ``model_dir`` is not a directory, or no candidate
            file exists anywhere beneath it.
    """
    if not model_dir.is_dir():
        raise ValueError(f"Provided model path '{model_dir}' is not a directory.")

    # Search one candidate name at a time so e.g. "model.onnx" wins over
    # "model_optimized.onnx" even if the optimized file would be
    # encountered first during a full directory walk.
    for file_name in file_names:
        for path in model_dir.rglob(file_name):
            if path.is_file():
                return path

    raise ValueError(f"Could not find model file in {model_dir}")


def normalize(input_array, p=2, dim=1, eps=1e-12):
# Calculate the Lp norm along the specified dimension
norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
Expand Down Expand Up @@ -92,19 +104,11 @@ def __init__(
):
self.path = path
self.model_name = model_name
model_path = self.path / "model.onnx"
optimized_model_path = self.path / "model_optimized.onnx"
model_path = locate_model_file(self.path, ["model.onnx", "model_optimized.onnx"])

# List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
onnx_providers = ["CPUExecutionProvider"]

if not model_path.exists():
# Rename file model_optimized.onnx to model.onnx if it exists
if optimized_model_path.exists():
optimized_model_path.rename(model_path)
else:
raise ValueError(f"Could not find model.onnx in {self.path}")

# Hacky support for multilingual model
self.exclude_token_type_ids = False
if model_name == "intfloat/multilingual-e5-large":
Expand Down
4 changes: 2 additions & 2 deletions fastembed/models.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
"description": " English embedding model supporting 8192 sequence length",
"size_in_GB": 0.55,
"hf_sources": [
"jinaai/jina-embeddings-v2-base-en"
"xenova/jina-embeddings-v2-base-en"
],
"compressed_url_sources": []
},
Expand All @@ -93,7 +93,7 @@
"description": " English embedding model supporting 8192 sequence length",
"size_in_GB": 0.13,
"hf_sources": [
"jinaai/jina-embeddings-v2-small-en"
"xenova/jina-embeddings-v2-small-en"
],
"compressed_url_sources": []
},
Expand Down
Loading