
adds aime24, 25 and math500 (#586)
* commit

* Apply suggestions from code review

* commit

* add prompt to math 500

* add prompt to math 500
NathanHB authored Feb 25, 2025
1 parent 066f84f commit 4c9af85
Showing 6 changed files with 125 additions and 16 deletions.
5 changes: 2 additions & 3 deletions examples/model_configs/vllm_model_config.yaml
@@ -1,6 +1,6 @@
 model:
   base_params:
-    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main,dtype=bfloat16" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
+    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B-Instruct,revision=main,dtype=bfloat16"
   generation:
     temperature: 0.3
     repetition_penalty: 1.0
@@ -10,5 +10,4 @@ model:
     top_k: -1
     min_p: 0.0
     top_p: 0.9
-    max_new_tokens: 256
-    stop_tokens: ["<EOS>", "<PAD>"]
+    max_new_tokens: 2048
4 changes: 2 additions & 2 deletions src/lighteval/main_endpoint.py
@@ -468,8 +468,8 @@ def litellm(
     if model_args.endswith(".yaml"):
         model_config = LiteLLMModelConfig.from_path(model_args)
     else:
-        model_name = model_args.split(",")[0].strip()
-        model_config = LiteLLMModelConfig(model=model_name)
+        model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
+        model_config = LiteLLMModelConfig(**model_args_dict)
 
     pipeline_params = PipelineParameters(
         launcher_type=parallelism_manager,
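Note: the litellm entry point now accepts an arbitrary comma-separated list of key=value pairs instead of only a model name. A minimal sketch of that parsing, not part of the commit; the keys are illustrative and not a claim about LiteLLMModelConfig's actual fields:

# Hypothetical argument string of the kind the CLI would now accept.
model_args = "model=gpt-4o-mini,api_base=http://localhost:4000,drop_params"

# Same comprehension as in the diff: "key=value" becomes an entry and a bare
# token with no "=" becomes True. Values stay strings, and a value that itself
# contains "=" would be truncated at the second "=".
model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}

print(model_args_dict)
# {'model': 'gpt-4o-mini', 'api_base': 'http://localhost:4000', 'drop_params': True}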
20 changes: 20 additions & 0 deletions src/lighteval/metrics/metrics.py
@@ -25,7 +25,9 @@
 from aenum import Enum
 
 from lighteval.metrics.dynamic_metrics import (
+    ExprExtractionConfig,
     IndicesExtractionConfig,
+    LatexExtractionConfig,
     multilingual_extractive_match_metric,
 )
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
@@ -178,6 +180,15 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    expr_gold_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        fallback_mode="first_match",
+        precision=5,
+        gold_extraction_target=(ExprExtractionConfig(),),
+        # Match boxed first before trying other regexes
+        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+        aggregation_function=max,
+    )
     extractiveness = SampleLevelMetricGrouping(
         metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
         sample_level_fn=Extractiveness(
@@ -238,6 +249,15 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    latex_gold_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        fallback_mode="first_match",
+        precision=5,
+        gold_extraction_target=(LatexExtractionConfig(),),
+        # Match boxed first before trying other regexes
+        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+        aggregation_function=max,
+    )
     loglikelihood_acc = SampleLevelMetric(
         metric_name="acc",
         sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute,
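Note: the two new Metrics entries differ only in how the gold is extracted. expr_gold_metric reads the gold as a plain expression (the AIME answer field is a bare number), while latex_gold_metric reads it as LaTeX (the MATH-500 solution field); both try a \boxed{} match first when extracting the model's prediction, which is why the prompt templates added below ask for a boxed final answer. A minimal sketch of that verification step, not part of the commit, assuming the standalone math-verify package (the source of these extraction configs) and its documented parse/verify helpers:

from math_verify import parse, verify

# Gold side: MATH-500 golds are LaTeX solutions, AIME golds plain numbers.
gold = parse("$\\frac{1}{2}$")

# Prediction side: extraction tries the \boxed{...} span first,
# mirroring boxed_match_priority=0 above.
prediction = parse("Therefore, the final answer is: $\\boxed{\\frac{1}{2}}$. I hope it is correct")

print(verify(gold, prediction))  # expected: True, the boxed value matches the gold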
10 changes: 6 additions & 4 deletions src/lighteval/models/vllm/vllm_model.py
@@ -247,7 +247,11 @@ def greedy_until(
             # the case! Because of that we only use batch size of 1
             stop_tokens = dataset[0].stop_sequence
 
-            max_new_tokens = dataset[0].generation_size  # could be none
+            max_new_tokens = (
+                dataset[0].generation_size
+                if self.sampling_params.max_tokens is None
+                else self.sampling_params.max_tokens
+            )
             returns_logits = dataset[0].use_logits
             num_samples = dataset[0].num_samples
 
@@ -321,9 +325,7 @@ def _generate(
         sampling_params = self.sampling_params.clone() or SamplingParams()
         if generate:
             sampling_params.n = num_samples
-            sampling_params.max_tokens = (
-                max_new_tokens if sampling_params.max_tokens is None else sampling_params.max_tokens
-            )
+            sampling_params.max_tokens = max_new_tokens
             sampling_params.stop = stop_tokens
             sampling_params.logprobs = 1 if returns_logits else 0
 
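Note: the net effect of these two hunks is that the precedence between the model config's max_tokens (for example max_new_tokens: 2048 in the YAML above) and the task's generation_size is now resolved once in greedy_until rather than inside _generate, with the config value still winning when it is set. A small sketch of that resolution, not part of the commit:

def resolve_max_new_tokens(task_generation_size, config_max_tokens):
    # sampling_params.max_tokens from the model config wins when it is set;
    # otherwise fall back to the task's generation_size.
    return task_generation_size if config_max_tokens is None else config_max_tokens

assert resolve_max_new_tokens(32768, None) == 32768  # task default (e.g. aime24) applies
assert resolve_max_new_tokens(32768, 2048) == 2048   # yaml max_new_tokens takes precedence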
60 changes: 53 additions & 7 deletions src/lighteval/tasks/default_prompts.py
@@ -43,6 +43,24 @@
 # fmt: on
 
 
+def aime_prompt_fn(line, task_name: str = None):
+    # Prompt template adapted from
+    # - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17
+    # - Llama 3: https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details
+    # Note that it is important to have the final answer in a box for math-verify to work correctly
+    MATH_QUERY_TEMPLATE = """
+Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+
+{Question}
+""".strip()
+    return Doc(
+        task_name=task_name,
+        query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]),
+        choices=[line["answer"]],
+        gold_index=0,
+    )
+
+
 def anli(line, task_name: str = None):
     return Doc(
         task_name=task_name,
@@ -710,22 +728,31 @@ def ethics_virtue(line, task_name: str = None):
 
 
 def gpqa(line, task_name: str = None):
+    # Prompt template from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14
+    GPQA_QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
     gold_index = random.randint(0, 3)
     choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
     choices.insert(gold_index, line["Correct Answer"])
 
-    instruction = "Select the correct answer to the following questions.\n\n"
-
-    query = f"Question: {line['Question']}\n"
-    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
-    query += "Answer: "
+    query = GPQA_QUERY_TEMPLATE.format(
+        A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"]
+    )
 
     return Doc(
         task_name=task_name,
-        query=f"{instruction}{query}",
+        query=query,
         choices=LETTER_INDICES[: len(choices)],
         gold_index=gold_index,
-        instruction=instruction,
+        instruction=query,
     )
 
 
@@ -1257,6 +1284,25 @@ def lsat_qa(line, task_name: str = None):
     )
 
 
+def math_500(line, task_name: str = None):
+    # Prompt template adapted from
+    # - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17
+    # - Llama 3: https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details
+    # Note that it is important to have the final answer in a box for math-verify to work correctly
+    MATH_QUERY_TEMPLATE = """
+Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+
+{Question}
+""".strip()
+
+    return Doc(
+        task_name=task_name,
+        query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]),
+        gold_index=0,
+        choices=[line["solution"]],
+    )
+
+
 def math(line, task_name: str = None):
     return Doc(
         task_name=task_name,
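Note: a hedged usage sketch of the new prompt function, not part of the commit. The dataset row and task name are hypothetical; only the "problem"/"answer" fields mirror what aime_prompt_fn reads above:

row = {"problem": "What is $1 + 1$?", "answer": "2"}

doc = aime_prompt_fn(row, task_name="lighteval|aime24")
print(doc.query)       # MATH_QUERY_TEMPLATE with the problem substituted in
print(doc.choices)     # ['2'], the gold that expr_gold_metric compares against
print(doc.gold_index)  # 0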
42 changes: 42 additions & 0 deletions src/lighteval/tasks/default_tasks.py
@@ -312,6 +312,34 @@
     trust_dataset=True,
     version=0,
 )
+aime24 = LightevalTaskConfig(
+    name="aime24",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,
+    hf_repo="HuggingFaceH4/aime_2024",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metric=[Metrics.expr_gold_metric],
+    version=1,
+)
+aime25 = LightevalTaskConfig(
+    name="aime25",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,
+    hf_repo="yentinglin/aime_2025",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=10000,
+    metric=[Metrics.expr_gold_metric],
+    version=1,
+)
 anachronisms_bigbench = LightevalTaskConfig(
     name="anachronisms",
     suite=["bigbench", "bigbench_json"],
@@ -9597,6 +9625,20 @@
     trust_dataset=True,
     version=0,
 )
+math_500 = LightevalTaskConfig(
+    name="math_500",
+    suite=["lighteval"],
+    prompt_function=prompt.math_500,
+    hf_repo="HuggingFaceH4/MATH-500",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metric=[Metrics.latex_gold_metric],
+    version=1,
+)
 math_algebra_lighteval = LightevalTaskConfig(
     name="math:algebra",
     suite=["lighteval", "math"],
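Note: once registered here, the new benchmarks would typically be selected through lighteval's usual "suite|task|num_fewshot|truncate_fewshots" task string; a hedged sketch, since the CLI wiring itself is not part of this commit:

# Task strings built from the suite/name pairs defined above; the trailing
# "|0|0" (zero few-shot, no truncation) is an illustrative default.
TASKS = [
    "lighteval|aime24|0|0",
    "lighteval|aime25|0|0",
    "lighteval|math_500|0|0",
]
print(",".join(TASKS))  # the comma-separated form usually passed on the command line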
