
Commit

Merged main
MiguelAFH committed Feb 4, 2025
2 parents 2703207 + 5a50569 commit dcca97a
Showing 35 changed files with 1,946 additions and 639 deletions.
2 changes: 1 addition & 1 deletion helm-frontend/package.json
@@ -50,7 +50,7 @@
"prettier": "^3.0.3",
"tailwindcss": "^3.3.3",
"typescript": "^5.0.2",
"vite": "^4.5.5",
"vite": "^4.5.6",
"vitest": "^0.33.0"
}
}
8 changes: 4 additions & 4 deletions helm-frontend/yarn.lock
@@ -5270,10 +5270,10 @@ [email protected]:
picocolors "^1.0.0"
vite "^3.0.0 || ^4.0.0"

"vite@^3.0.0 || ^4.0.0", vite@^4.5.5:
version "4.5.5"
resolved "https://registry.yarnpkg.com/vite/-/vite-4.5.5.tgz#639b9feca5c0a3bfe3c60cb630ef28bf219d742e"
integrity sha512-ifW3Lb2sMdX+WU91s3R0FyQlAyLxOzCSCP37ujw0+r5POeHPwe6udWVIElKQq8gk3t7b8rkmvqC6IHBpCff4GQ==
"vite@^3.0.0 || ^4.0.0", vite@^4.5.6:
version "4.5.6"
resolved "https://registry.yarnpkg.com/vite/-/vite-4.5.6.tgz#48bbd97fe06e8241df2e625b31c581707e10b57d"
integrity sha512-ElBNuVvJKslxcfY2gMmae5IjaKGqCYGicCNZ+8R56sAznobeE3pI9ctzI17cBS/6OJh5YuQNMSN4BP4dRjugBg==
dependencies:
esbuild "^0.18.10"
postcss "^8.4.27"
39 changes: 30 additions & 9 deletions scripts/redact_scenario_states.py
@@ -22,7 +22,7 @@
from helm.benchmark.scenarios.scenario import Instance, Reference
from helm.common.codec import from_json, to_json
from helm.common.hierarchical_logger import hlog
from helm.common.request import Request
from helm.common.request import Request, RequestResult, GeneratedOutput, Token


SCENARIO_STATE_FILE_NAME = "scenario_state.json"
@@ -62,28 +62,47 @@ def redact_output_mapping(output_mapping: Optional[Dict[str, str]]) -> Optional[
return {key: REDACTED_STRING for key in output_mapping}


def redact_request_state(request_state: RequestState) -> RequestState:
def redact_token(token: Token) -> Token:
return dataclasses.replace(token, text=REDACTED_STRING)


def redact_completion(completion: GeneratedOutput) -> GeneratedOutput:
# Replace the tokens with an empty list, in case the number of tokens reveals information about the prompt
return dataclasses.replace(completion, text=REDACTED_STRING, tokens=[])


def redact_result(result: RequestResult) -> RequestResult:
redacted_completions = [redact_completion(completion) for completion in result.completions]
return dataclasses.replace(result, completions=redacted_completions)


def redact_request_state(request_state: RequestState, redact_output: bool) -> RequestState:
assert request_state.result is not None
result = redact_result(request_state.result) if redact_output else request_state.result
return dataclasses.replace(
request_state,
instance=redact_instance(request_state.instance),
request=redact_request(request_state.request),
output_mapping=redact_output_mapping(request_state.output_mapping),
result=result,
)


def redact_scenario_state(scenario_state: ScenarioState) -> ScenarioState:
redacted_request_states = [redact_request_state(request_state) for request_state in scenario_state.request_states]
def redact_scenario_state(scenario_state: ScenarioState, redact_output: bool) -> ScenarioState:
redacted_request_states = [
redact_request_state(request_state, redact_output) for request_state in scenario_state.request_states
]
return dataclasses.replace(scenario_state, request_states=redacted_request_states)


def modify_scenario_state_for_run(run_path: str) -> None:
def modify_scenario_state_for_run(run_path: str, redact_output: bool) -> None:
scenario_state_path = os.path.join(run_path, SCENARIO_STATE_FILE_NAME)
scenario_state = read_scenario_state(scenario_state_path)
redacted_scenario_state = redact_scenario_state(scenario_state)
redacted_scenario_state = redact_scenario_state(scenario_state, redact_output)
write_scenario_state(scenario_state_path, redacted_scenario_state)


def modify_scenario_states_for_suite(run_suite_path: str) -> None:
def modify_scenario_states_for_suite(run_suite_path: str, redact_output: bool) -> None:
"""Load the runs in the run suite path."""
# run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
# so filter them out.
@@ -100,7 +119,7 @@ def modify_scenario_states_for_suite(run_suite_path: str) -> None:
hlog(f"WARNING: {run_dir_name} doesn't have {SCENARIO_STATE_FILE_NAME}, skipping")
continue
run_path: str = os.path.join(run_suite_path, run_dir_name)
modify_scenario_state_for_run(run_path)
modify_scenario_state_for_run(run_path, redact_output)


def main():
@@ -113,11 +132,13 @@ def main():
type=str,
help="Name of the suite this summarization should go under.",
)
parser.add_argument("--redact-output", action="store_true", help="Whether to redact the generated outputs.")
args = parser.parse_args()
output_path = args.output_path
suite = args.suite
redact_output = args.redact_output
run_suite_path = os.path.join(output_path, "runs", suite)
modify_scenario_states_for_suite(run_suite_path)
modify_scenario_states_for_suite(run_suite_path, redact_output)


if __name__ == "__main__":
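For context on the new flag, here is a minimal sketch of how redact_output threads through the functions changed above. The import path and the "benchmark_output"/"my-suite" values are assumptions for illustration only, not taken from this diff.

import os

# Assumption: run from a HELM checkout where the script can be imported as a module.
from scripts.redact_scenario_states import modify_scenario_states_for_suite

output_path = "benchmark_output"  # hypothetical benchmark output directory
suite = "my-suite"                # hypothetical suite name
run_suite_path = os.path.join(output_path, "runs", suite)

# With redact_output=True, each completion's text is replaced by the redaction
# placeholder and its tokens are dropped, in addition to the existing redaction
# of instances, requests, and output mappings; False keeps the previous behavior.
modify_scenario_states_for_suite(run_suite_path, redact_output=True)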
1 change: 1 addition & 0 deletions setup.cfg
@@ -285,6 +285,7 @@ audiolm =
# For HuggingFace audio datasets
soundfile~=0.12
librosa~=0.10
einops~=0.7.0

# For LLaMA-Omni
openai-whisper==20240930
@@ -0,0 +1,96 @@
import re
from typing import Any

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.clients.auto_client import AutoClient
from helm.common.request import Request
from helm.proxy.retry import NonRetriableException


class AnnotatorResponseParseFailure(NonRetriableException):
def __init__(self, response_text: str, **kwargs):
self.response_text = response_text
super().__init__(kwargs)


class HelpdeskCallSummarizationAnnotator(Annotator):
"""The Helpdesk Call Summarization autograder."""

name = "helpdesk_call_center_summarization"

PROMPT_TEMPLATE = """You are an expert evaluator. Your task is to evaluate the quality of a model-generated summary of a helpdesk call transcript.
The helpdesk call transcript and summary are provided below, delineated with start and end tags:
<call_transcript>
{{QUESTION}}
</call_transcript>
<summary>
{{PRED}}
</summary>
Evaluate the summary based on the following criteria:
- Conciseness: A high-quality summary should effectively convey the most important information from the original source while keeping the length brief.
- Relevance: The information presented in the summary should be relevant to the main topic.
- Coherence: A good summary should have a clear structure and flow of ideas that make it easy to understand and follow.
- Accuracy: The summary's information should be factually correct and should not contain false or misleading information.
Think step by step, then score the summary. Your reasoning should be less than 100 words. The score should be a single number between 1 and 10 inclusive.
Please respond with your reasoning and score in the following format: your reasoning within <reasoning></reasoning> tags and your score within <score></score> tags, with no other output:
<reasoning>INSERT_YOUR_REASONING_HERE</reasoning>
<score>INSERT_YOUR_SCORE_HERE</score>
""" # noqa: E501

PATTERN = r"^\s*reason:(.*)##(.*)"

def __init__(self, auto_client: AutoClient):
self._auto_client = auto_client

def annotate(self, request_state: RequestState) -> Any:
assert request_state.result
assert len(request_state.result.completions) == 1
prediction_text = request_state.result.completions[0].text

question_text = request_state.instance.input.text

annotator_prompt = self.PROMPT_TEMPLATE.replace("{{QUESTION}}", question_text).replace(
"{{PRED}}", prediction_text
)
annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
model_deployment="openai/gpt-4o-2024-05-13",
prompt=annotator_prompt,
temperature=0.0,
max_tokens=512,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
raise Exception(f"Annotation request failed: {annotator_response.error}")
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
# fuzzy match regex check, allows for different casing, or forgetting / in end tag
reasoning_match = re.search(
r"<\s*reasoning\s*>(.*?)<\/?\s*reasoning\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
)
score_match = re.search(
r"<\s*score\s*>(.*?)<\/?\s*score\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
)
if not reasoning_match or not score_match:
raise AnnotatorResponseParseFailure(
message=f"Could not parse markup in raw response: '{annotator_response_text}'",
response_text=annotator_response_text,
)
reasoning = reasoning_match.group(1).strip()
try:
score = float(score_match.group(1).strip())
except ValueError:
raise AnnotatorResponseParseFailure(
message=f"Could not parse score as float from raw request: '{annotator_response_text}'",
response_text=annotator_response_text,
)

return {"reasoning": reasoning, "score": score}
8 changes: 7 additions & 1 deletion src/helm/benchmark/annotation/omni_math_annotator.py
@@ -44,7 +44,12 @@ def annotate(self, request_state: RequestState) -> Any:
.replace("{{Solution}}", model_output_text)
)
if not model_output_text.strip():
return {"prompt_text": annotator_prompt, "correctness": 0.0}
return {
"prompt_text": "",
"student_final_answer": "N/A",
"equivalence_judgement": "FALSE",
"justification": "The model output is empty.",
}

annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
@@ -66,6 +71,7 @@ def annotate(self, request_state: RequestState) -> Any:
justification = info.get("Justification", "").strip().removesuffix("=== report over ===").strip()

return {
"prompt_text": annotator_prompt,
"student_final_answer": student_final_answer,
"equivalence_judgement": equivalence_judgement,
"justification": justification,
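With the fallback above now returning the same keys as the normal path, downstream code can read the judgement uniformly. A minimal sketch of a hypothetical consumer (the aggregation function is illustrative, not part of this diff):

from typing import Any, Dict

def omni_math_correctness(annotation: Dict[str, Any]) -> float:
    # Map the annotator's equivalence judgement to a 0/1 correctness score.
    judgement = str(annotation.get("equivalence_judgement", "FALSE")).strip().upper()
    return 1.0 if judgement == "TRUE" else 0.0

print(omni_math_correctness({"equivalence_judgement": "TRUE"}))  # 1.0
print(omni_math_correctness({
    "prompt_text": "",
    "student_final_answer": "N/A",
    "equivalence_judgement": "FALSE",
    "justification": "The model output is empty.",
}))  # 0.0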
83 changes: 59 additions & 24 deletions src/helm/benchmark/annotation/wildbench_annotator.py
@@ -1,9 +1,11 @@
import re
from typing import Any
from importlib.resources import files
from typing import Dict

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.benchmark.annotation.model_as_judge import _AnnotatorModelInfo
from helm.clients.auto_client import AutoClient
from helm.common.request import Request

@@ -29,7 +31,12 @@ def annotate(self, request_state: RequestState) -> Any:
model_output_text = request_state.result.completions[0].text
if not model_output_text.strip():
# Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
return {"prompt_text": "", "strengths": "N/A", "weaknesses": "The model output is empty.", "score": 1.0}
return {
"prompt_text": "",
"strengths": ["N/A"],
"weaknesses": ["The model output is empty."],
"score": [1.0],
}
prompt_template = self._score_template

annotator_prompt = (
@@ -38,28 +45,56 @@ def annotate(self, request_state: RequestState) -> Any:
.replace("{$model_output}", model_output_text)
.replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
)
annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
model_deployment="openai/gpt-4o-2024-05-13",
prompt=annotator_prompt,
temperature=0.0,
max_tokens=2000,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
raise Exception(f"Annotation request failed: {annotator_response.error}")
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
annotator_response_parts = self._pattern.search(annotator_response_text)
if not annotator_response_parts:
raise ValueError(f"Malformed annotator response: {annotator_response_text}")

strengths = annotator_response_parts[1].strip()
weaknesses = annotator_response_parts[2].strip()
score_text = annotator_response_parts[3].strip().strip('"')
try:
score = float(score_text)
except ValueError:
raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")
SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
"gpt": _AnnotatorModelInfo(
model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"
),
"llama": _AnnotatorModelInfo(
model_name="meta/llama-3.1-405b-instruct-turbo",
model_deployment="together/llama-3.1-405b-instruct-turbo",
),
"claude": _AnnotatorModelInfo(
model_name="anthropic/claude-3-5-sonnet-20241022",
model_deployment="anthropic/claude-3-5-sonnet-20241022",
),
}
all_strengths = []
all_weaknesses = []
all_scores = []
for annotator_model in SHORT_NAME_TO_MODEL_INFO:
annotator_model_info = SHORT_NAME_TO_MODEL_INFO[annotator_model]
annotator_request = Request(
model=annotator_model_info.model_name,
model_deployment=annotator_model_info.model_deployment,
prompt=annotator_prompt,
temperature=0.0,
max_tokens=2000,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
continue # skip this annotator if the request failed
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
annotator_response_parts = self._pattern.search(annotator_response_text)
if not annotator_response_parts:
continue # skip this annotator if the response is malformed

strengths = annotator_response_parts[1].strip()
weaknesses = annotator_response_parts[2].strip()
score_text = annotator_response_parts[3].strip().strip('"')
try:
score = float(score_text)
except ValueError:
continue # skip this annotator if the score is not a number

all_strengths.append(strengths)
all_weaknesses.append(weaknesses)
all_scores.append(score)

return {"strengths": strengths, "weaknesses": weaknesses, "score": score}
return {
"prompt_text": annotator_prompt,
"strengths": all_strengths,
"weaknesses": all_weaknesses,
"score": all_scores,
}
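
Because the annotator now returns per-judge lists instead of single values, downstream metrics need to aggregate them. A minimal sketch of one possible consumer; the averaging behavior is an assumption, not part of this diff:

from statistics import mean
from typing import Any, Dict

def aggregate_wildbench_score(annotation: Dict[str, Any]) -> float:
    # Average the per-judge scores, tolerating the case where every judge was
    # skipped (failed request, malformed response, or non-numeric score).
    scores = annotation.get("score", [])
    return mean(scores) if scores else float("nan")

print(aggregate_wildbench_score({"score": [7.0, 8.5, 6.0]}))  # approx. 7.17
print(aggregate_wildbench_score({"score": []}))               # nan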