Skip to content

Commit

Permalink
fix: rollback BUCC revision (#1706)
Browse files: browse the repository at this point in the history
* fix bucc
* fix logger
* update evaluator
* add comment
* lint
  • Loading branch information
Samoed authored Jan 8, 2025
1 parent ab8805c commit 9bcb52f
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 6 deletions.
10 changes: 7 additions & 3 deletions mteb/evaluation/evaluators/BitextMiningEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,13 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):

def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any] = {}):
pair_elements = {p for pair in self.pairs for p in pair}
subsets = [
col for col in self.sentences.features.keys() if col in pair_elements
]
if isinstance(self.sentences, Dataset):
subsets = [
col for col in self.sentences.features.keys() if col in pair_elements
]
else:
# BUCC outputs a dict instead of a Dataset
subsets = list(pair_elements)
n_subsets = len(subsets)

embeddings = {}
Expand Down
12 changes: 9 additions & 3 deletions mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata
Expand All @@ -14,14 +16,17 @@

_SPLITS = ["test"]

logger = logging.getLogger(__name__)


class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
superseded_by = "BUCC.v2"
metadata = TaskMetadata(
name="BUCC",
dataset={
"path": "mteb/bucc-bitext-mining",
"revision": "1739dc11ffe9b7bfccd7f3d585aeb4c544fc6677",
"revision": "d51519689f32196a32af33b075a01d0e7c51e252",
"trust_remote_code": True,
},
description="BUCC bitext mining dataset",
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
Expand Down Expand Up @@ -70,8 +75,9 @@ def dataset_transform(self):
sentence1 = data["sentence1"][0]
sentence2 = data["sentence2"][0]
sentence1 = [sentence1[i] for (i, j) in gold]
print(lang, len(gold))
print(len(sentence1), len(sentence2))
logger.info(f"Lang {lang} num gold {len(gold)}")
logger.info(f"Lang {lang} num sentence1 {len(sentence1)}")
logger.info(f"Lang {lang} num sentence2 {len(sentence2)}")
dataset[lang][split] = {
"sentence1": sentence1,
"sentence2": sentence2,
Expand Down

0 comments on commit 9bcb52f

Please sign in to comment.