
Commit

Merged main
MiguelAFH committed Feb 4, 2025
2 parents 2703207 + 5a50569 commit dcca97a
Showing 35 changed files with 1,946 additions and 639 deletions.
2 changes: 1 addition & 1 deletion helm-frontend/package.json
@@ -50,7 +50,7 @@
"prettier": "^3.0.3",
"tailwindcss": "^3.3.3",
"typescript": "^5.0.2",
"vite": "^4.5.5",
"vite": "^4.5.6",
"vitest": "^0.33.0"
}
}
8 changes: 4 additions & 4 deletions helm-frontend/yarn.lock
@@ -5270,10 +5270,10 @@ [email protected]:
picocolors "^1.0.0"
vite "^3.0.0 || ^4.0.0"

"vite@^3.0.0 || ^4.0.0", vite@^4.5.5:
version "4.5.5"
resolved "https://registry.yarnpkg.com/vite/-/vite-4.5.5.tgz#639b9feca5c0a3bfe3c60cb630ef28bf219d742e"
integrity sha512-ifW3Lb2sMdX+WU91s3R0FyQlAyLxOzCSCP37ujw0+r5POeHPwe6udWVIElKQq8gk3t7b8rkmvqC6IHBpCff4GQ==
"vite@^3.0.0 || ^4.0.0", vite@^4.5.6:
version "4.5.6"
resolved "https://registry.yarnpkg.com/vite/-/vite-4.5.6.tgz#48bbd97fe06e8241df2e625b31c581707e10b57d"
integrity sha512-ElBNuVvJKslxcfY2gMmae5IjaKGqCYGicCNZ+8R56sAznobeE3pI9ctzI17cBS/6OJh5YuQNMSN4BP4dRjugBg==
dependencies:
esbuild "^0.18.10"
postcss "^8.4.27"
39 changes: 30 additions & 9 deletions scripts/redact_scenario_states.py
@@ -22,7 +22,7 @@
from helm.benchmark.scenarios.scenario import Instance, Reference
from helm.common.codec import from_json, to_json
from helm.common.hierarchical_logger import hlog
from helm.common.request import Request
from helm.common.request import Request, RequestResult, GeneratedOutput, Token


SCENARIO_STATE_FILE_NAME = "scenario_state.json"
@@ -62,28 +62,47 @@ def redact_output_mapping(output_mapping: Optional[Dict[str, str]]) -> Optional[
return {key: REDACTED_STRING for key in output_mapping}


def redact_request_state(request_state: RequestState) -> RequestState:
def redact_token(token: Token) -> Token:
return dataclasses.replace(token, text=REDACTED_STRING)


def redact_completion(completion: GeneratedOutput) -> GeneratedOutput:
# Replace the tokens with an empty list, in case the number of tokens reveals information about the prompt
return dataclasses.replace(completion, text=REDACTED_STRING, tokens=[])


def redact_result(result: RequestResult) -> RequestResult:
redacted_completions = [redact_completion(completion) for completion in result.completions]
return dataclasses.replace(result, completions=redacted_completions)


def redact_request_state(request_state: RequestState, redact_output: bool) -> RequestState:
assert request_state.result is not None
result = redact_result(request_state.result) if redact_output else request_state.result
return dataclasses.replace(
request_state,
instance=redact_instance(request_state.instance),
request=redact_request(request_state.request),
output_mapping=redact_output_mapping(request_state.output_mapping),
result=result,
)


def redact_scenario_state(scenario_state: ScenarioState) -> ScenarioState:
redacted_request_states = [redact_request_state(request_state) for request_state in scenario_state.request_states]
def redact_scenario_state(scenario_state: ScenarioState, redact_output: bool) -> ScenarioState:
redacted_request_states = [
redact_request_state(request_state, redact_output) for request_state in scenario_state.request_states
]
return dataclasses.replace(scenario_state, request_states=redacted_request_states)


def modify_scenario_state_for_run(run_path: str) -> None:
def modify_scenario_state_for_run(run_path: str, redact_output: bool) -> None:
scenario_state_path = os.path.join(run_path, SCENARIO_STATE_FILE_NAME)
scenario_state = read_scenario_state(scenario_state_path)
redacted_scenario_state = redact_scenario_state(scenario_state)
redacted_scenario_state = redact_scenario_state(scenario_state, redact_output)
write_scenario_state(scenario_state_path, redacted_scenario_state)


def modify_scenario_states_for_suite(run_suite_path: str) -> None:
def modify_scenario_states_for_suite(run_suite_path: str, redact_output: bool) -> None:
"""Load the runs in the run suite path."""
# run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
# so filter them out.
@@ -100,7 +119,7 @@ def modify_scenario_states_for_suite(run_suite_path: str) -> None:
hlog(f"WARNING: {run_dir_name} doesn't have {SCENARIO_STATE_FILE_NAME}, skipping")
continue
run_path: str = os.path.join(run_suite_path, run_dir_name)
modify_scenario_state_for_run(run_path)
modify_scenario_state_for_run(run_path, redact_output)


def main():
@@ -113,11 +132,13 @@ def main():
type=str,
help="Name of the suite this summarization should go under.",
)
parser.add_argument("--redact-output", action="store_true", help="Whether to redact the generated outputs.")
args = parser.parse_args()
output_path = args.output_path
suite = args.suite
redact_output = args.redact_output
run_suite_path = os.path.join(output_path, "runs", suite)
modify_scenario_states_for_suite(run_suite_path)
modify_scenario_states_for_suite(run_suite_path, redact_output)


if __name__ == "__main__":
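For context on the new flag, here is a minimal sketch of how redact_output threads through the functions changed above. The import path and the "benchmark_output"/"my-suite" values are assumptions for illustration only, not taken from this diff.

import os

# Assumption: run from a HELM checkout where the script can be imported as a module.
from scripts.redact_scenario_states import modify_scenario_states_for_suite

output_path = "benchmark_output"  # hypothetical benchmark output directory
suite = "my-suite"                # hypothetical suite name
run_suite_path = os.path.join(output_path, "runs", suite)

# With redact_output=True, each completion's text is replaced by the redaction
# placeholder and its tokens are dropped, in addition to the existing redaction
# of instances, requests, and output mappings; False keeps the previous behavior.
modify_scenario_states_for_suite(run_suite_path, redact_output=True)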
1 change: 1 addition & 0 deletions setup.cfg
@@ -285,6 +285,7 @@ audiolm =
# For HuggingFace audio datasets
soundfile~=0.12
librosa~=0.10
einops~=0.7.0

# For LLaMA-Omni
openai-whisper==20240930
@@ -0,0 +1,96 @@
import re
from typing import Any

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.clients.auto_client import AutoClient
from helm.common.request import Request
from helm.proxy.retry import NonRetriableException


class AnnotatorResponseParseFailure(NonRetriableException):
def __init__(self, response_text: str, **kwargs):
self.response_text = response_text
super().__init__(kwargs)


class HelpdeskCallSummarizationAnnotator(Annotator):
"""The Helpdesk Call Summarization autograder."""

name = "helpdesk_call_center_summarization"

PROMPT_TEMPLATE = """You are an expert evaluator. Your task is to evaluate the quality of a model-generated summary of a helpdesk call transcript.
The helpdesk call transcript and summary are provided below, delineated with start and end tags:
<call_transcript>
{{QUESTION}}
</call_transcript>
<summary>
{{PRED}}
</summary>
Evaluate the summary based on the following criteria:
- Conciseness: A high-quality summary should effectively convey the most important information from the original source while keeping the length brief.
- Relevance: The information presented in the summary should be relevant to the main topic.
- Coherence: A good summary should have a clear structure and flow of ideas that make it easy to understand and follow.
- Accuracy: The summary's information should be factually correct and should not contain false or misleading information.
Think step by step, then score the summary. Your reasoning should be less than 100 words. The score should be a single number between 1 and 10 inclusive.
Please respond with your reasoning and score in the following format: your reasoning within <reasoning></reasoning> tags and your score within <score></score> tags, with no other output:
<reasoning>INSERT_YOUR_REASONING_HERE</reasoning>
<score>INSERT_YOUR_SCORE_HERE</score>
""" # noqa: E501

PATTERN = r"^\s*reason:(.*)##(.*)"

def __init__(self, auto_client: AutoClient):
self._auto_client = auto_client

def annotate(self, request_state: RequestState) -> Any:
assert request_state.result
assert len(request_state.result.completions) == 1
prediction_text = request_state.result.completions[0].text

question_text = request_state.instance.input.text

annotator_prompt = self.PROMPT_TEMPLATE.replace("{{QUESTION}}", question_text).replace(
"{{PRED}}", prediction_text
)
annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
model_deployment="openai/gpt-4o-2024-05-13",
prompt=annotator_prompt,
temperature=0.0,
max_tokens=512,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
raise Exception(f"Annotation request failed: {annotator_response.error}")
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
# fuzzy match regex check, allows for different casing, or forgetting / in end tag
reasoning_match = re.search(
r"<\s*reasoning\s*>(.*?)<\/?\s*reasoning\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
)
score_match = re.search(
r"<\s*score\s*>(.*?)<\/?\s*score\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
)
if not reasoning_match or not score_match:
raise AnnotatorResponseParseFailure(
message=f"Could not parse markup in raw response: '{annotator_response_text}'",
response_text=annotator_response_text,
)
reasoning = reasoning_match.group(1).strip()
try:
score = float(score_match.group(1).strip())
except ValueError:
raise AnnotatorResponseParseFailure(
message=f"Could not parse score as float from raw request: '{annotator_response_text}'",
response_text=annotator_response_text,
)

return {"reasoning": reasoning, "score": score}
8 changes: 7 additions & 1 deletion src/helm/benchmark/annotation/omni_math_annotator.py
@@ -44,7 +44,12 @@ def annotate(self, request_state: RequestState) -> Any:
.replace("{{Solution}}", model_output_text)
)
if not model_output_text.strip():
return {"prompt_text": annotator_prompt, "correctness": 0.0}
return {
"prompt_text": "",
"student_final_answer": "N/A",
"equivalence_judgement": "FALSE",
"justification": "The model output is empty.",
}

annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
@@ -66,6 +71,7 @@ def annotate(self, request_state: RequestState) -> Any:
justification = info.get("Justification", "").strip().removesuffix("=== report over ===").strip()

return {
"prompt_text": annotator_prompt,
"student_final_answer": student_final_answer,
"equivalence_judgement": equivalence_judgement,
"justification": justification,
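With the fallback above now returning the same keys as the normal path, downstream code can read the judgement uniformly. A minimal sketch of a hypothetical consumer (the aggregation function is illustrative, not part of this diff):

from typing import Any, Dict

def omni_math_correctness(annotation: Dict[str, Any]) -> float:
    # Map the annotator's equivalence judgement to a 0/1 correctness score.
    judgement = str(annotation.get("equivalence_judgement", "FALSE")).strip().upper()
    return 1.0 if judgement == "TRUE" else 0.0

print(omni_math_correctness({"equivalence_judgement": "TRUE"}))  # 1.0
print(omni_math_correctness({
    "prompt_text": "",
    "student_final_answer": "N/A",
    "equivalence_judgement": "FALSE",
    "justification": "The model output is empty.",
}))  # 0.0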
83 changes: 59 additions & 24 deletions src/helm/benchmark/annotation/wildbench_annotator.py
@@ -1,9 +1,11 @@
import re
from typing import Any
from importlib.resources import files
from typing import Dict

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.benchmark.annotation.model_as_judge import _AnnotatorModelInfo
from helm.clients.auto_client import AutoClient
from helm.common.request import Request

@@ -29,7 +31,12 @@ def annotate(self, request_state: RequestState) -> Any:
model_output_text = request_state.result.completions[0].text
if not model_output_text.strip():
# Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
return {"prompt_text": "", "strengths": "N/A", "weaknesses": "The model output is empty.", "score": 1.0}
return {
"prompt_text": "",
"strengths": ["N/A"],
"weaknesses": ["The model output is empty."],
"score": [1.0],
}
prompt_template = self._score_template

annotator_prompt = (
@@ -38,28 +45,56 @@ def annotate(self, request_state: RequestState) -> Any:
.replace("{$model_output}", model_output_text)
.replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
)
annotator_request = Request(
model="openai/gpt-4o-2024-05-13",
model_deployment="openai/gpt-4o-2024-05-13",
prompt=annotator_prompt,
temperature=0.0,
max_tokens=2000,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
raise Exception(f"Annotation request failed: {annotator_response.error}")
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
annotator_response_parts = self._pattern.search(annotator_response_text)
if not annotator_response_parts:
raise ValueError(f"Malformed annotator response: {annotator_response_text}")

strengths = annotator_response_parts[1].strip()
weaknesses = annotator_response_parts[2].strip()
score_text = annotator_response_parts[3].strip().strip('"')
try:
score = float(score_text)
except ValueError:
raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")
SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
"gpt": _AnnotatorModelInfo(
model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"
),
"llama": _AnnotatorModelInfo(
model_name="meta/llama-3.1-405b-instruct-turbo",
model_deployment="together/llama-3.1-405b-instruct-turbo",
),
"claude": _AnnotatorModelInfo(
model_name="anthropic/claude-3-5-sonnet-20241022",
model_deployment="anthropic/claude-3-5-sonnet-20241022",
),
}
all_strengths = []
all_weaknesses = []
all_scores = []
for annotator_model in SHORT_NAME_TO_MODEL_INFO:
annotator_model_info = SHORT_NAME_TO_MODEL_INFO[annotator_model]
annotator_request = Request(
model=annotator_model_info.model_name,
model_deployment=annotator_model_info.model_deployment,
prompt=annotator_prompt,
temperature=0.0,
max_tokens=2000,
)
annotator_response = self._auto_client.make_request(annotator_request)
if not annotator_response.success:
continue # skip this annotator if the request failed
assert len(annotator_response.completions) == 1
annotator_response_text = annotator_response.completions[0].text
annotator_response_parts = self._pattern.search(annotator_response_text)
if not annotator_response_parts:
continue # skip this annotator if the response is malformed

strengths = annotator_response_parts[1].strip()
weaknesses = annotator_response_parts[2].strip()
score_text = annotator_response_parts[3].strip().strip('"')
try:
score = float(score_text)
except ValueError:
continue # skip this annotator if the score is not a number

all_strengths.append(strengths)
all_weaknesses.append(weaknesses)
all_scores.append(score)

return {"strengths": strengths, "weaknesses": weaknesses, "score": score}
return {
"prompt_text": annotator_prompt,
"strengths": all_strengths,
"weaknesses": all_weaknesses,
"score": all_scores,
}
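
Because the annotator now returns per-judge lists instead of single values, downstream metrics need to aggregate them. A minimal sketch of one possible consumer; the averaging behavior is an assumption, not part of this diff:

from statistics import mean
from typing import Any, Dict

def aggregate_wildbench_score(annotation: Dict[str, Any]) -> float:
    # Average the per-judge scores, tolerating the case where every judge was
    # skipped (failed request, malformed response, or non-numeric score).
    scores = annotation.get("score", [])
    return mean(scores) if scores else float("nan")

print(aggregate_wildbench_score({"score": [7.0, 8.5, 6.0]}))  # approx. 7.17
print(aggregate_wildbench_score({"score": []}))               # nan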