diff --git a/continuous_eval/metrics/generation_LLM_based_metrics.py b/continuous_eval/metrics/generation_LLM_based_metrics.py
index a1d5bfb..58e712e 100644
--- a/continuous_eval/metrics/generation_LLM_based_metrics.py
+++ b/continuous_eval/metrics/generation_LLM_based_metrics.py
@@ -88,7 +88,7 @@ def __str__(self):
 
     def calculate(self, question, answer, ground_truths, **kwargs):
         """
-        Calculate the faithfulness score for the given datapoint.
+        Calculate the correctness score for the given datapoint.
         """
         gt_answers = "\n".join(ground_truths)
         if self.use_few_shot:
@@ -145,7 +145,7 @@ def __str__(self):
 
     def calculate(self, question, answer, **kwargs):
         """
-        Calculate the faithfulness score for the given datapoint.
+        Calculate the answer relevance score for the given datapoint.
         """
         if self.use_few_shot:
             few_shot_prompt = """
@@ -191,8 +191,8 @@ def calculate(self, question, answer, **kwargs):
 
 class LLMBasedStyleConsistency(LLMBasedMetric):
     """
-    The LLM based answer relevance metric.
-    Measures whether the generated answer is relevant to the question.
+    The LLM based style consistency metric.
+    Measures whether the generated answer is consistent in style to the ground truth answer.
     """
 
     def __init__(self, model: Optional[LLMInterface] = None, use_few_shot: bool = True):
@@ -204,7 +204,7 @@ def __str__(self):
 
     def calculate(self, answer, ground_truths, **kwargs):
         """
-        Calculate the faithfulness score for the given datapoint.
+        Calculate the style consistency score for the given datapoint.
         """
         gt_answers = "\n".join(ground_truths)
         if self.use_few_shot:
@@ -224,7 +224,7 @@ def calculate(self, answer, ground_truths, **kwargs):
             "system_prompt": (
                 """
 You are an expert evaluator system for a question answering system.
-You need to evaluate the style of the generated answer based on some reference answers.
+You only need to evaluate the style of the generated answer based on some reference answers, regardless of whether the answer is correct or not.
 Assess style aspects such as tone, verbosity, formality, complexity, use of terminology, etc.
 Output a score and the reasoning for your score in a new line.
 Use the following guidelines for evaluation:
diff --git a/continuous_eval/metrics/retrieval_LLM_based_metrics.py b/continuous_eval/metrics/retrieval_LLM_based_metrics.py
index dc82af3..f7d77dc 100644
--- a/continuous_eval/metrics/retrieval_LLM_based_metrics.py
+++ b/continuous_eval/metrics/retrieval_LLM_based_metrics.py
@@ -21,7 +21,7 @@ def __str__(self):
 
     def calculate(self, question, retrieved_contexts, **kwargs):
         """
-        Calculate the context relevance score for the given datapoint.
+        Calculate the context precision score for the given datapoint.
         """
         scores = []
         for context in retrieved_contexts:
@@ -87,7 +87,7 @@ def __str__(self):
 
     def calculate(self, question, retrieved_contexts, answer, **kwargs):
         """
-        Calculate the context relevance score for the given datapoint.
+        Calculate the context coverage score for the given datapoint.
         """
         context = "\n".join(retrieved_contexts)
diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx
index b499460..2c31836 100644
--- a/docs/src/content/docs/index.mdx
+++ b/docs/src/content/docs/index.mdx
@@ -27,12 +27,12 @@ import { Icon } from '@astrojs/starlight/components';
diff --git a/docs/src/content/docs/metrics/ensembling/ensembling_classifier.md b/docs/src/content/docs/metrics/ensembling/ensembling_classifier.md
index 5f739b2..2155784 100644
--- a/docs/src/content/docs/metrics/ensembling/ensembling_classifier.md
+++ b/docs/src/content/docs/metrics/ensembling/ensembling_classifier.md
@@ -153,7 +153,7 @@ def judicator(idx):
     # and in this case, since we are computing the correctness of the sample,
     # it returns True if the example is correct and False otherwise
     datum = datasplit.test_full.X.iloc[idx].to_dict()
-    return llm_metric.calculate(**datum)["LLM_based_answer_correctness"] >= 3
+    return llm_metric.calculate(**datum)["LLM_based_answer_correctness"] >= 0.5
 ```
 
 To use the judicator we simply pass it to the `predict` method:
diff --git a/examples/ensemble_metric_with_judicator.py b/examples/ensemble_metric_with_judicator.py
index 9a9ae11..5041a86 100644
--- a/examples/ensemble_metric_with_judicator.py
+++ b/examples/ensemble_metric_with_judicator.py
@@ -45,7 +45,7 @@ def judicator(idx):
     # and in this case, since we are computing the correctness of the sample,
     # it returns True if the example is correct and False otherwise
     datum = datasplit.test_full.X.iloc[idx].to_dict()
-    return llm_metric.calculate(**datum)["LLM_based_answer_correctness"] >= 3
+    return llm_metric.calculate(**datum)["LLM_based_answer_correctness"] >= 0.5
 
 
 # Let's train a metric ensamble classifier