diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 9dd01b2443..dadc8eaefc 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -120,6 +120,32 @@ def get_casehold_spec() -> RunSpec: ) +@run_spec_function("echr_judgment_classification") +def get_echr_judgment_classification_spec() -> RunSpec: + """Binary-classification run spec for ECHR judgments; an alternative to lex_glue_fixed:subset=ecthr_a.""" + from helm.benchmark.scenarios.echr_judgment_classification_scenario import EchrJudgeScenario + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.echr_judgment_classification_scenario.EchrJudgeScenario", + args={"filter_max_length": 600}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX, + input_noun=EchrJudgeScenario.PROMPT_INPUT, + output_noun=EchrJudgeScenario.PROMPT_OUTPUT, + max_tokens=1, + ) + + return RunSpec( + name="echr_judgment_classification", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]), + groups=["echr_judgment_classification"], + ) + + # Climate diff --git a/src/helm/benchmark/scenarios/echr_judgment_classification_scenario.py b/src/helm/benchmark/scenarios/echr_judgment_classification_scenario.py new file mode 100644 index 0000000000..adafbe1486 --- /dev/null +++ b/src/helm/benchmark/scenarios/echr_judgment_classification_scenario.py @@ -0,0 +1,113 @@ +from typing import List, Optional +import json +import os +import re + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + VALID_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from helm.common.general import ensure_file_downloaded, ensure_directory_exists + + +class EchrJudgeScenario(Scenario): + """The "Binary 
Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf). + + The task is to analyze the description of a legal case from the European Court of Human Rights (ECHR), + and classify it as positive if any human rights article or protocol has been violated and negative otherwise. + + The case text can be very long, which in many cases results in incorrect model output + when using zero-shot prediction. + Therefore, two trivial cases have been added to the instructions part. + + Example Prompt: + Is the following case a violation of human rights? (Instructions) + + Case: Human rights have not been violated. (Trivial No case in instructions) + Answer: No + + Case: Human rights have been violated. (Trivial Yes case in instructions) + Answer: Yes + + Case: (In-context examples, if possible) + Answer: