Add ECHR Judgment Classification scenario #3311

Merged: 2 commits, Feb 5, 2025
26 changes: 26 additions & 0 deletions src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -120,6 +120,32 @@ def get_casehold_spec() -> RunSpec:
)


@run_spec_function("echr_judgment_classification")
def get_echr_judgment_classification_spec() -> RunSpec:
"""A different implementation (binary classification) of lex_glue_fixed:subset=ecthr_a"""
from helm.benchmark.scenarios.echr_judgment_classification_scenario import EchrJudgeScenario

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.echr_judgment_classification_scenario.EchrJudgeScenario",
args={"filter_max_length": 600},
)

adapter_spec = get_generation_adapter_spec(
instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX,
input_noun=EchrJudgeScenario.PROMPT_INPUT,
output_noun=EchrJudgeScenario.PROMPT_OUTPUT,
max_tokens=1,
)

return RunSpec(
name="echr_judgment_classification",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
groups=["echr_judgment_classification"],
)


# Climate


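For a quick sanity check, the new run spec can be inspected directly in a Python session. The snippet below is a minimal sketch, assuming a development install of HELM that includes this change and that the @run_spec_function decorator leaves the function directly callable; it just echoes the values wired together above.

# Minimal sketch: inspect the RunSpec produced by the new run spec function.
from helm.benchmark.run_specs.enterprise_run_specs import get_echr_judgment_classification_spec

run_spec = get_echr_judgment_classification_spec()
print(run_spec.name)                      # echr_judgment_classification
print(run_spec.scenario_spec.class_name)  # ...echr_judgment_classification_scenario.EchrJudgeScenario
print(run_spec.adapter_spec.max_tokens)   # 1 -- only a one-word "Yes"/"No" completion is needed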
113 changes: 113 additions & 0 deletions src/helm/benchmark/scenarios/echr_judgment_classification_scenario.py
@@ -0,0 +1,113 @@
from typing import List, Optional
import json
import os
import re

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TRAIN_SPLIT,
VALID_SPLIT,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from helm.common.general import ensure_file_downloaded, ensure_directory_exists


class EchrJudgeScenario(Scenario):
"""The "Binary Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf).

The task is to analyze the description of a legal case from the European Court of Human Rights (ECHR),
and classify it as positive if any human rights article or protocol has been violated and negative otherwise.

    The case text can be very long, which often leads to incorrect model output
    under zero-shot prediction.
    Therefore, two trivial cases are added to the instructions as examples.

Example Prompt:
Is the following case a violation of human rights? (Instructions)

Case: Human rights have not been violated. (Trivial No case in instructions)
Answer: No

Case: Human rights have been violated. (Trivial Yes case in instructions)
Answer: Yes

Case: <TEXT> (In-context examples, if possible)
Answer: <Label> (Label is correct answer, Yes or No)

...
Case: <TEXT> (Target input text)
Answer: <Output> (Output ::= Yes | No)
""" # noqa: E501

# Names of the tasks we support
name = "echr_judgment_classification"
description = 'The "Binary Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf).' # noqa: E501
tags = ["classification", "judgement", "legal"]

    # Dataset source URL
_DATASET_URL = "https://archive.org/download/ECHR-ACL2019/ECHR_Dataset.zip"

# Answer labels
YES_ANSWER = "Yes"
NO_ANSWER = "No"

# Prompt constants (used in adapter)
PROMPT_INPUT = "Case"
PROMPT_OUTPUT = "Answer"

YES_EX = f"\n\n{PROMPT_INPUT}: Human rights have been violated.\n{PROMPT_OUTPUT}: {YES_ANSWER}"
NO_EX = f"\n\n{PROMPT_INPUT}: Human rights have not been violated.\n{PROMPT_OUTPUT}: {NO_ANSWER}"
INST_EX = f"{NO_EX}{YES_EX}"

PROMPT_INST = "Is the following case a violation of human rights?" # Prompt for instructions
PROMPT_INST_WITH_EX = f"{PROMPT_INST}{INST_EX}" # Prompt for instructions with trivial examples

def __init__(self, filter_max_length: Optional[int] = None):
"""
Args:
            filter_max_length: Int indicating the maximum document length. Documents longer
                than filter_max_length words (as counted by count_words) will be
                filtered out.
"""
super().__init__()
self.filter_max_length = filter_max_length

def count_words(self, text: str) -> int:
"""Returns the number of words in the text"""
return len(re.split(r"\W+", text))

def get_instances(self, output_path: str) -> List[Instance]:
data_dir = os.path.join(output_path, "data")
ensure_directory_exists(data_dir)
file_name = self._DATASET_URL.split("/")[-1]
        download_directory_path = os.path.join(data_dir, file_name)
ensure_file_downloaded(
source_url=self._DATASET_URL,
target_path=download_directory_path,
unpack=True,
unpack_type="unzip",
)

source_split_to_helm_split = {"EN_train": TRAIN_SPLIT, "EN_dev": VALID_SPLIT, "EN_test": TEST_SPLIT}

instances: List[Instance] = []
for source_split, helm_split in source_split_to_helm_split.items():
target_data_dir = os.path.join(download_directory_path, source_split)
for file_name in os.listdir(target_data_dir):
if not file_name.endswith(".json"):
continue
file_path = os.path.join(target_data_dir, file_name)
with open(file_path, "r") as f:
raw_data = json.load(f)
input_text = " ".join(raw_data["TEXT"])
if self.filter_max_length is not None and self.count_words(input_text) > self.filter_max_length:
continue
answer = self.YES_ANSWER if len(raw_data["VIOLATED_ARTICLES"]) > 0 else self.NO_ANSWER
correct_reference = Reference(Output(text=answer), tags=[CORRECT_TAG])
instances.append(Instance(input=Input(input_text), references=[correct_reference], split=helm_split))
return instances
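To make the labeling rule in get_instances concrete, here is a small self-contained sketch. The record below is hypothetical (made up for illustration) but mirrors the structure of the ECHR JSON files read above: a list of paragraphs under "TEXT" and the violated articles under "VIOLATED_ARTICLES".

import re

def count_words(text: str) -> int:
    # Same splitting rule as EchrJudgeScenario.count_words above.
    return len(re.split(r"\W+", text))

# Hypothetical record mimicking the structure read in get_instances.
raw_data = {
    "TEXT": ["The applicant complained under Article 6.", "The Court finds a violation."],
    "VIOLATED_ARTICLES": ["6"],
}

input_text = " ".join(raw_data["TEXT"])
answer = "Yes" if len(raw_data["VIOLATED_ARTICLES"]) > 0 else "No"
keep = count_words(input_text) <= 600  # mirrors filter_max_length=600 from the run spec

print(answer, keep)  # -> Yes True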
17 changes: 17 additions & 0 deletions src/helm/benchmark/static/schema_enterprise.yaml
@@ -121,6 +121,7 @@ run_groups:
subgroups:
- legal_contract_summarization
- casehold
- echr_judgment_classification
- legal_opinion_sentiment_classification

- name: climate_scenarios
@@ -188,6 +189,22 @@
when: before 2021
language: English

- name: echr_judgment_classification
display_name: ECHR Judgment Classification
description: The "Binary Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf). The task is to analyze the description of a legal case from the European Court of Human Rights (ECHR), and classify it as positive if any human rights article or protocol has been violated and negative otherwise.
metric_groups:
- accuracy
- general_information
environment:
main_name: classification_weighted_f1
main_split: test
taxonomy:
task: text classification
      what: cases from the European Court of Human Rights
who: judiciary of the European Court of Human Rights
when: 2014-2018 (train) and 2014-2018 (test)
language: English

- name: legal_opinion_sentiment_classification
display_name: Legal Opinion Sentiment Classification
description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
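Finally, the main metric for this run group is classification_weighted_f1 over the "yes"/"no" labels. The snippet below is not HELM's metric code; it is only a scikit-learn illustration of what a support-weighted F1 measures on toy binary predictions.

# Illustration only: support-weighted F1 on made-up binary predictions.
from sklearn.metrics import f1_score

y_true = ["yes", "yes", "no", "no", "yes"]
y_pred = ["yes", "no", "no", "no", "yes"]

print(f1_score(y_true, y_pred, average="weighted", labels=["yes", "no"]))  # 0.8 for this toy example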