Skip to content

Commit

Permalink
Add truncation support in evaluators (#2582)
Browse files Browse the repository at this point in the history
* Add truncation support in evaluators

* Add truncation to logs; support truncation in ParaphraseMiningEvaluator

* Use Matryoshka evaluators for matryoshka training scripts

---------

Co-authored-by: Tom Aarsen <[email protected]>
  • Loading branch information
kddubey and tomaarsen authored Apr 11, 2024
1 parent 53a2470 commit 99674c7
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 173 deletions.
47 changes: 30 additions & 17 deletions examples/training/matryoshka/matryoshka_nli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are used as positive pairs, and the contradictions in the AllNLI dataset are added as hard negatives.
At every 10% training steps, the model is evaluated on the STS benchmark dataset
At every 10% training steps, the model is evaluated on the STS benchmark dataset at the different output dimensions.
Usage:
python matryoshka_nli.py
Expand All @@ -15,7 +15,7 @@
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
Expand All @@ -34,6 +34,7 @@
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1
matryoshka_dims = [768, 512, 256, 128, 64]

# Save path of the model
model_save_path = (
Expand Down Expand Up @@ -97,16 +98,22 @@ def add_to_samples(sent1, sent2, label):

# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
train_loss = losses.MatryoshkaLoss(model, train_loss, matryoshka_dims=matryoshka_dims)

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-dev-{dim}",
truncate_dim=dim,
)
)
dev_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[0])

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
Expand Down Expand Up @@ -134,13 +141,19 @@ def add_to_samples(sent1, sent2, label):

model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-test-{dim}",
truncate_dim=dim,
)
)
test_evaluator = SequentialEvaluator(evaluators)
test_evaluator(model, output_path=model_save_path)


Expand Down
48 changes: 31 additions & 17 deletions examples/training/matryoshka/matryoshka_nli_reduced_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are used as positive pairs, and the contradictions in the AllNLI dataset are added as hard negatives.
At every 10% training steps, the model is evaluated on the STS benchmark dataset
At every 10% training steps, the model is evaluated on the STS benchmark dataset at the different output dimensions.
The difference between this script and matryoshka_nli.py is that this script uses a reduced dimensionality of the base
model by adding a Dense layer with `reduced_dim=256` output dimensions. This might be useful when your desired output
Expand All @@ -19,7 +19,7 @@
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
Expand All @@ -39,6 +39,7 @@
max_seq_length = 75
num_epochs = 1
reduced_dim = 256
matryoshka_dims = [256, 128, 64, 32, 16]

# Save path of the model
model_save_path = (
Expand Down Expand Up @@ -103,16 +104,22 @@ def add_to_samples(sent1, sent2, label):

# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [256, 128, 64, 32, 16])
train_loss = losses.MatryoshkaLoss(model, train_loss, matryoshka_dims=matryoshka_dims)

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-dev-{dim}",
truncate_dim=dim,
)
)
dev_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[0])

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
Expand Down Expand Up @@ -140,15 +147,22 @@ def add_to_samples(sent1, sent2, label):

model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-test-{dim}",
truncate_dim=dim,
)
)
test_evaluator = SequentialEvaluator(evaluators)
test_evaluator(model, output_path=model_save_path)


# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
Expand Down
61 changes: 37 additions & 24 deletions sentence_transformers/evaluation/BinaryClassificationEvaluator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from contextlib import nullcontext
from . import SentenceEvaluator
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics import average_precision_score
import numpy as np
from typing import List
from typing import List, Optional
from ..readers import InputExample


Expand All @@ -30,6 +31,8 @@ class BinaryClassificationEvaluator(SentenceEvaluator):
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If true, prints a progress bar
:param write_csv: Write results to a CSV file
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` uses the model's current truncation
dimension. Defaults to None.
"""

def __init__(
Expand All @@ -41,10 +44,12 @@ def __init__(
batch_size: int = 32,
show_progress_bar: bool = False,
write_csv: bool = True,
truncate_dim: Optional[int] = None,
):
self.sentences1 = sentences1
self.sentences2 = sentences2
self.labels = labels
self.truncate_dim = truncate_dim

assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
Expand Down Expand Up @@ -109,13 +114,15 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs):
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = f" after epoch {epoch}:"
out_txt = f" after epoch {epoch}"
else:
out_txt = f" in epoch {epoch} after {steps} steps:"
out_txt = f" in epoch {epoch} after {steps} steps"
else:
out_txt = ":"
out_txt = ""
if self.truncate_dim is not None:
out_txt += f" (truncated to {self.truncate_dim})"

logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
logger.info(f"Binary Accuracy Evaluation of the model on the {self.name} dataset{out_txt}:")

scores = self.compute_metrices(model)

Expand Down Expand Up @@ -144,25 +151,31 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =
return main_score

def compute_metrices(self, model):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple times
sentences = list(set(self.sentences1 + self.sentences2))
embeddings = model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple
# times
sentences = list(set(self.sentences1 + self.sentences2))
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
else:
embeddings = model.encode(
sentences,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]

cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
Expand Down
10 changes: 6 additions & 4 deletions sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,15 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs):
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
out_txt = f" after epoch {epoch}"
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
out_txt = f" in epoch {epoch} after {steps} steps"
else:
out_txt = ":"
out_txt = ""
if self.truncate_dim is not None:
out_txt += f" (truncated to {self.truncate_dim})"

logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
logger.info(f"EmbeddingSimilarityEvaluator: Evaluating the model on the {self.name} dataset{out_txt}:")

with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
embeddings1 = model.encode(
Expand Down
45 changes: 28 additions & 17 deletions sentence_transformers/evaluation/InformationRetrievalEvaluator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from contextlib import nullcontext
from . import SentenceEvaluator
import torch
from torch import Tensor
Expand All @@ -6,7 +7,7 @@
from ..util import cos_sim, dot_score
import os
import numpy as np
from typing import List, Dict, Set, Callable
from typing import List, Dict, Optional, Set, Callable
import heapq


Expand Down Expand Up @@ -36,6 +37,7 @@ def __init__(
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
truncate_dim: Optional[int] = None,
score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = {
"cos_sim": cos_sim,
"dot_score": dot_score,
Expand Down Expand Up @@ -67,6 +69,7 @@ def __init__(
self.score_functions = score_functions
self.score_function_names = sorted(list(self.score_functions.keys()))
self.main_score_function = main_score_function
self.truncate_dim = truncate_dim

if name:
name = "_" + name
Expand All @@ -93,15 +96,16 @@ def __init__(

def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float:
if epoch != -1:
out_txt = (
" after epoch {}:".format(epoch)
if steps == -1
else " in epoch {} after {} steps:".format(epoch, steps)
)
if steps == -1:
out_txt = f" after epoch {epoch}"
else:
out_txt = f" in epoch {epoch} after {steps} steps"
else:
out_txt = ":"
out_txt = ""
if self.truncate_dim is not None:
out_txt += f" (truncated to {self.truncate_dim})"

logger.info("Information Retrieval Evaluation on " + self.name + " dataset" + out_txt)
logger.info(f"Information Retrieval Evaluation of the model on the {self.name} dataset{out_txt}:")

scores = self.compute_metrices(model, *args, **kwargs)

Expand Down Expand Up @@ -156,9 +160,13 @@ def compute_metrices(self, model, corpus_model=None, corpus_embeddings: Tensor =
)

# Compute embedding for the queries
query_embeddings = model.encode(
self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True
)
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
query_embeddings = model.encode(
self.queries,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_tensor=True,
)

queries_result_list = {}
for name in self.score_functions:
Expand All @@ -172,12 +180,15 @@ def compute_metrices(self, model, corpus_model=None, corpus_embeddings: Tensor =

# Encode chunk of corpus
if corpus_embeddings is None:
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
)
with nullcontext() if self.truncate_dim is None else corpus_model.truncate_sentence_embeddings(
self.truncate_dim
):
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
)
else:
sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]

Expand Down
Loading

0 comments on commit 99674c7

Please sign in to comment.