Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix paraphrase minilm #436

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
11 changes: 0 additions & 11 deletions fastembed/text/onnx_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,6 @@
},
"model_file": "model_optimized.onnx",
},
{
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
"dim": 384,
"description": "Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",
"license": "apache-2.0",
"size_in_GB": 0.22,
"sources": {
"hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
},
"model_file": "model_optimized.onnx",
},
{
"model": "thenlper/gte-large",
"dim": 1024,
Expand Down
11 changes: 11 additions & 0 deletions fastembed/text/pooled_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@
},
"model_file": "onnx/model.onnx",
},
{
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
"dim": 384,
"description": "Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",
"license": "apache-2.0",
"size_in_GB": 0.22,
"sources": {
"hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
},
"model_file": "model_optimized.onnx",
},
]


Expand Down
9 changes: 9 additions & 0 deletions fastembed/text/text_embedding.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from typing import Any, Iterable, Optional, Sequence, Type, Union

import numpy as np
Expand Down Expand Up @@ -62,6 +63,14 @@ def __init__(
**kwargs,
):
super().__init__(model_name, cache_dir, threads, **kwargs)
if model_name == "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2":
warnings.warn(
"The model 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' has been updated to include a mean pooling layer. "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add the fastembed version in which this is going to change? (0.5.2)

"Please ensure your usage aligns with the new functionality. "
"Support for the previous version without mean pooling is removed.",
UserWarning,
stacklevel=2,
)
for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
supported_models = EMBEDDING_MODEL_TYPE.list_supported_models()
if any(model_name.lower() == model["model"].lower() for model in supported_models):
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ keywords = ["vector", "embedding", "neural", "search", "qdrant", "sentence-trans

[tool.poetry.dependencies]
python = ">=3.9.0"
onnx = ">=1.15.0"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change should not be in this PR.

numpy = [
{ version = ">=1.21,<2.1.0", python = "<3.10" },
{ version = ">=1.21", python = ">=3.10,<3.12" },
Expand Down
2 changes: 1 addition & 1 deletion tests/test_text_onnx_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
[-0.034478, 0.03102, 0.00673, 0.02611, -0.039362]
),
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array(
[0.0094, 0.0184, 0.0328, 0.0072, -0.0351]
[0.0361, 0.1862, 0.2776, 0.2461, -0.1904]
),
"intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array(
Expand Down
Loading