From d715ab3d59210ac1e37f825d9e10fa4f28078faf Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 20 Feb 2024 21:36:59 +0500 Subject: [PATCH 1/3] feat: Support sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 --- fastembed/models.json | 10 ++++++++++ fastembed/text/onnx_embedding.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/fastembed/models.json b/fastembed/models.json index e370c74a..f10bd3af 100644 --- a/fastembed/models.json +++ b/fastembed/models.json @@ -110,6 +110,16 @@ "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz" ] }, + { + "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + "dim": 384, + "description": "Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2", + "size_in_GB": 0.46, + "hf_sources": [ + "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q" + ], + "compressed_url_sources": [] + }, { "model": "xenova/multilingual-e5-large", "dim": 1024, diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index 165f6eb4..1d194450 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -98,6 +98,15 @@ "hf": "qdrant/all-MiniLM-L6-v2-onnx", }, }, + { + "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + "dim": 384, + "description": "Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2", + "size_in_GB": 0.46, + "sources": { + "hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q", + }, + }, { "model": "nomic-ai/nomic-embed-text-v1", "dim": 768, From fa4fb9ac0eaa5d75cc874e9807e86d1e3a4daa43 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 20 Feb 2024 21:37:32 +0500 Subject: [PATCH 2/3] test: Include sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 --- tests/test_onnx_embeddings.py | 1 + tests/test_text_onnx_embeddings.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py index 7f580e6b..ea255af2 100644 --- a/tests/test_onnx_embeddings.py +++ b/tests/test_onnx_embeddings.py @@ -13,6 +13,7 @@ "BAAI/bge-base-en-v1.5": np.array([0.01129394, 0.05493144, 0.02615099, 0.00328772, 0.02996045]), "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]), + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]), "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]), "xenova/multilingual-e5-large": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]), "xenova/paraphrase-multilingual-mpnet-base-v2": np.array( diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index 8f523adf..81bcbbd3 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -14,6 +14,7 @@ "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "BAAI/bge-large-en-v1.5-quantized": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]), + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]), "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]), "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array( [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299] From de1fc8a48c12ec64b362518b8155f9d72ab79bc5 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 20 Feb 2024 21:38:04 +0500 Subject: [PATCH 3/3] docs: supported models update --- docs/examples/Supported_Models.ipynb | 66 ++++++++++++++++------------ 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/docs/examples/Supported_Models.ipynb b/docs/examples/Supported_Models.ipynb index c180b9ea..8c100d12 100644 --- a/docs/examples/Supported_Models.ipynb +++ b/docs/examples/Supported_Models.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -110,14 +110,22 @@ " \n", " \n", " 8\n", + " sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\n", + " 384\n", + " Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2\n", + " 0.46\n", + " {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'}\n", + " \n", + " \n", + " 9\n", " nomic-ai/nomic-embed-text-v1\n", " 768\n", " 8192 context length english model\n", " 0.54\n", - " {'hf': 'xenova/nomic-embed-text-v1'}\n", + " {'hf': 'nomic-ai/nomic-embed-text-v1'}\n", " \n", " \n", - " 9\n", + " 10\n", " intfloat/multilingual-e5-large\n", " 1024\n", " Multilingual model, e5-large. Recommend using this model for non-English languages\n", @@ -125,7 +133,7 @@ " {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'}\n", " \n", " \n", - " 10\n", + " 11\n", " sentence-transformers/paraphrase-multilingual-mpnet-base-v2\n", " 768\n", " Sentence-transformers model for tasks like clustering or semantic search\n", @@ -133,7 +141,7 @@ " {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'}\n", " \n", " \n", - " 11\n", + " 12\n", " jinaai/jina-embeddings-v2-base-en\n", " 768\n", " English embedding model supporting 8192 sequence length\n", @@ -141,7 +149,7 @@ " {'hf': 'xenova/jina-embeddings-v2-base-en'}\n", " \n", " \n", - " 12\n", + " 13\n", " jinaai/jina-embeddings-v2-small-en\n", " 512\n", " English embedding model supporting 8192 sequence length\n", @@ -162,11 +170,12 @@ "5 BAAI/bge-small-en-v1.5 384 \n", "6 BAAI/bge-small-zh-v1.5 512 \n", "7 sentence-transformers/all-MiniLM-L6-v2 384 \n", - "8 nomic-ai/nomic-embed-text-v1 768 \n", - "9 intfloat/multilingual-e5-large 1024 \n", - "10 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n", - "11 jinaai/jina-embeddings-v2-base-en 768 \n", - "12 jinaai/jina-embeddings-v2-small-en 512 \n", + "8 sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 384 \n", + "9 nomic-ai/nomic-embed-text-v1 768 \n", + "10 intfloat/multilingual-e5-large 1024 \n", + "11 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n", + "12 jinaai/jina-embeddings-v2-base-en 768 \n", + "13 jinaai/jina-embeddings-v2-small-en 512 \n", "\n", " description \\\n", "0 Base English model \n", @@ -177,11 +186,12 @@ "5 Fast and Default English model \n", "6 Fast and recommended Chinese model \n", "7 Sentence Transformer model, MiniLM-L6-v2 \n", - "8 8192 context length english model \n", - "9 Multilingual model, e5-large. Recommend using this model for non-English languages \n", - "10 Sentence-transformers model for tasks like clustering or semantic search \n", - "11 English embedding model supporting 8192 sequence length \n", + "8 Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2 \n", + "9 8192 context length english model \n", + "10 Multilingual model, e5-large. Recommend using this model for non-English languages \n", + "11 Sentence-transformers model for tasks like clustering or semantic search \n", "12 English embedding model supporting 8192 sequence length \n", + "13 English embedding model supporting 8192 sequence length \n", "\n", " size_in_GB \\\n", "0 0.50 \n", @@ -192,11 +202,12 @@ "5 0.13 \n", "6 0.10 \n", "7 0.09 \n", - "8 0.54 \n", - "9 2.24 \n", - "10 1.11 \n", - "11 0.55 \n", - "12 0.13 \n", + "8 0.46 \n", + "9 0.54 \n", + "10 2.24 \n", + "11 1.11 \n", + "12 0.55 \n", + "13 0.13 \n", "\n", " sources \n", "0 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz'} \n", @@ -207,14 +218,15 @@ "5 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en-v1.5.tar.gz', 'hf': 'qdrant/bge-small-en-v1.5-onnx-q'} \n", "6 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz'} \n", "7 {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz', 'hf': 'qdrant/all-MiniLM-L6-v2-onnx'} \n", - "8 {'hf': 'xenova/nomic-embed-text-v1'} \n", - "9 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n", - "10 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n", - "11 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n", - "12 {'hf': 'xenova/jina-embeddings-v2-small-en'} " + "8 {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'} \n", + "9 {'hf': 'nomic-ai/nomic-embed-text-v1'} \n", + "10 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n", + "11 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n", + "12 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n", + "13 {'hf': 'xenova/jina-embeddings-v2-small-en'} " ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -244,7 +256,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.11.4" }, "orig_nbformat": 4 },