Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: rollback BUCC revision #1706

Merged
merged 5 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions mteb/evaluation/evaluators/BitextMiningEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,13 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):

def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any] = {}):
pair_elements = {p for pair in self.pairs for p in pair}
subsets = [
col for col in self.sentences.features.keys() if col in pair_elements
]
if isinstance(self.sentences, Dataset):
subsets = [
col for col in self.sentences.features.keys() if col in pair_elements
]
else:
# BUCC outputs a dict instead of a Dataset
subsets = list(pair_elements)
Comment on lines +52 to +53
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we change the dataset instead?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to change it, but doing so produced a different number of sentence1 and sentence2 entries, which prevented me from creating a `datasets.Dataset`.

n_subsets = len(subsets)

embeddings = {}
Expand Down
12 changes: 9 additions & 3 deletions mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata
Expand All @@ -14,14 +16,17 @@

_SPLITS = ["test"]

logger = logging.getLogger(__name__)


class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
superseded_by = "BUCC.v2"
metadata = TaskMetadata(
name="BUCC",
dataset={
"path": "mteb/bucc-bitext-mining",
"revision": "1739dc11ffe9b7bfccd7f3d585aeb4c544fc6677",
"revision": "d51519689f32196a32af33b075a01d0e7c51e252",
isaac-chung marked this conversation as resolved.
Show resolved Hide resolved
"trust_remote_code": True,
},
description="BUCC bitext mining dataset",
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
Expand Down Expand Up @@ -70,8 +75,9 @@ def dataset_transform(self):
sentence1 = data["sentence1"][0]
sentence2 = data["sentence2"][0]
sentence1 = [sentence1[i] for (i, j) in gold]
print(lang, len(gold))
print(len(sentence1), len(sentence2))
logger.info(f"Lang {lang} num gold {len(gold)}")
isaac-chung marked this conversation as resolved.
Show resolved Hide resolved
logger.info(f"Lang {lang} num sentence1 {len(sentence1)}")
logger.info(f"Lang {lang} num sentence2 {len(sentence2)}")
dataset[lang][split] = {
"sentence1": sentence1,
"sentence2": sentence2,
Expand Down
Loading