diff --git a/parlai/core/metrics.py b/parlai/core/metrics.py index 1e79ec43c17..d2fa8681e47 100644 --- a/parlai/core/metrics.py +++ b/parlai/core/metrics.py @@ -535,7 +535,9 @@ def _prec_recall_f1_score(pred_items, gold_items): return precision, recall, f1 @staticmethod - def compute(guess: str, answers: List[str]) -> F1Metric: + def compute( + guess: str, answers: List[str], expose_p_and_r: bool = False + ) -> Union[F1Metric, Tuple[F1Metric, F1Metric, F1Metric]]: if guess is None or answers is None: return AverageMetric(0, 0) g_tokens = normalize_answer(guess).split() @@ -546,7 +548,10 @@ def compute(guess: str, answers: List[str]) -> F1Metric: max_p, max_r, max_f1 = 0, 0, 0 for p, r, f1 in scores: max_p, max_r, max_f1 = max(max_p, p), max(max_r, r), max(f1, max_f1) - return (F1Metric(max_p, 1), F1Metric(max_r, 1), F1Metric(max_f1, 1)) + if expose_p_and_r: + return (F1Metric(max_p, 1), F1Metric(max_r, 1), F1Metric(max_f1, 1)) + else: + return F1Metric(max_f1, 1) class ExactMatchMetric(AverageMetric): @@ -730,7 +735,9 @@ def compute_many( :return: (rouge-1, rouge-2, rouge-L) """ measure = measure.lower() - assert measure in ROUGE_METRICS_MEASURES, "Use one of recall 'r' (default), f1 'f', or precision 'p'." + assert ( + measure in ROUGE_METRICS_MEASURES + ), "Use one of recall 'r' (default), f1 'f', or precision 'p'." # possible global initialization try: @@ -1031,7 +1038,9 @@ def evaluate_response(self, observation: Message, labels: List[str]) -> None: if prediction is not None: self.add('accuracy', ExactMatchMetric.compute(prediction, labels)) - precision, recall, f1 = F1Metric.compute(prediction, labels) + precision, recall, f1 = F1Metric.compute( + prediction, labels, expose_p_and_r=True + ) self.add('precision', precision) self.add('recall', recall) self.add('f1', f1) diff --git a/tests/nightly/gpu/test_bb3.py b/tests/nightly/gpu/test_bb3.py index f5c68ac980a..f1ff4d70943 100644 --- a/tests/nightly/gpu/test_bb3.py +++ b/tests/nightly/gpu/test_bb3.py @@ -19,6 +19,9 @@ from projects.bb3.tests.opt_presets import INIT_OPT +LOCAL = False + + def _self_memory(text): return f'{PROMPT.SELF_MEMORY_PREFIX}: {text}' @@ -115,7 +118,7 @@ class TestOptFtBase(unittest.TestCase): def setUp(self): self.opt = INIT_OPT for k, v in self.opt.items(): - if 'BB3OPTAgent' in v: + if 'BB3OPTAgent' in str(v): self.opt[k] = 'projects.bb3.agents.opt_api_agent:MockOptAgent' self.opt['search_server'] = 'test' @@ -340,8 +343,8 @@ def setUp(self): opt = copy.deepcopy(self.opt) overrides = { 'knowledge_conditioning': 'combined', - 'opt_server': 'http://18.117.126.138:3000', - 'search_server': 'bing_cc', + 'opt_server': 'http://localhost:6000', + 'search_server': 'test', } for k, v in overrides.items(): opt[k] = v @@ -351,6 +354,7 @@ def setUp(self): self.batch_agent = create_agent(opt) +@unittest.skipUnless(LOCAL, "must be local to specify opt server") class TestOptMainServerBatching(TestOptMainServerBase): def test_batching(self): self.batch_agent = create_agent(self.opt)