Add ECHR Judgment Classification scenario #3311

Merged: 2 commits, Feb 5, 2025
26 changes: 26 additions & 0 deletions src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -120,6 +120,32 @@ def get_casehold_spec() -> RunSpec:
)


@run_spec_function("echr_judgment_classification")
def get_echr_judgment_classification_spec() -> RunSpec:
"""A different implementation (binary classification) of lex_glue_fixed:subset=ecthr_a"""
from helm.benchmark.scenarios.echr_judgment_classification_scenario import EchrJudgeScenario

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.echr_judgment_classification_scenario.EchrJudgeScenario",
args={"filter_max_length": 600},
)

adapter_spec = get_generation_adapter_spec(
instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX,
input_noun=EchrJudgeScenario.PROMPT_INPUT,
output_noun=EchrJudgeScenario.PROMPT_OUTPUT,
max_tokens=1,
)

return RunSpec(
name="echr_judgment_classification",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
groups=["echr_judgment_classification"],
)


# Climate


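For a quick sanity check, the new run spec can be inspected directly in a Python session. The snippet below is a minimal sketch, assuming a development install of HELM that includes this change and that the @run_spec_function decorator leaves the function directly callable; it just echoes the values wired together above.

# Minimal sketch: inspect the RunSpec produced by the new run spec function.
from helm.benchmark.run_specs.enterprise_run_specs import get_echr_judgment_classification_spec

run_spec = get_echr_judgment_classification_spec()
print(run_spec.name)                      # echr_judgment_classification
print(run_spec.scenario_spec.class_name)  # ...echr_judgment_classification_scenario.EchrJudgeScenario
print(run_spec.adapter_spec.max_tokens)   # 1 -- only a one-word "Yes"/"No" completion is needed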
113 changes: 113 additions & 0 deletions src/helm/benchmark/scenarios/echr_judgment_classification_scenario.py
@@ -0,0 +1,113 @@
from typing import List, Optional
import json
import os
import re

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TRAIN_SPLIT,
VALID_SPLIT,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from helm.common.general import ensure_file_downloaded, ensure_directory_exists


class EchrJudgeScenario(Scenario):
"""The "Binary Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf).

The task is to analyze the description of a legal case from the European Court of Human Rights (ECHR),
and classify it as positive if any human rights article or protocol has been violated and negative otherwise.

    The case text can be very long, which often leads to incorrect model output
    under zero-shot prediction.
    Therefore, two trivial cases are added to the instructions as examples.

Example Prompt:
Is the following case a violation of human rights? (Instructions)

Case: Human rights have not been violated. (Trivial No case in instructions)
Answer: No

Case: Human rights have been violated. (Trivial Yes case in instructions)
Answer: Yes

Case: <TEXT> (In-context examples, if possible)
Answer: <Label> (Label is correct answer, Yes or No)

...
Case: <TEXT> (Target input text)
Answer: <Output> (Output ::= Yes | No)
""" # noqa: E501

# Names of the tasks we support
name = "echr_judgment_classification"
description = 'The "Binary Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf).' # noqa: E501
tags = ["classification", "judgement", "legal"]

    # Dataset source URL
_DATASET_URL = "https://archive.org/download/ECHR-ACL2019/ECHR_Dataset.zip"

# Answer labels
YES_ANSWER = "Yes"
NO_ANSWER = "No"

# Prompt constants (used in adapter)
PROMPT_INPUT = "Case"
PROMPT_OUTPUT = "Answer"

YES_EX = f"\n\n{PROMPT_INPUT}: Human rights have been violated.\n{PROMPT_OUTPUT}: {YES_ANSWER}"
NO_EX = f"\n\n{PROMPT_INPUT}: Human rights have not been violated.\n{PROMPT_OUTPUT}: {NO_ANSWER}"
INST_EX = f"{NO_EX}{YES_EX}"

PROMPT_INST = "Is the following case a violation of human rights?" # Prompt for instructions
PROMPT_INST_WITH_EX = f"{PROMPT_INST}{INST_EX}" # Prompt for instructions with trivial examples

def __init__(self, filter_max_length: Optional[int] = None):
"""
Args:
            filter_max_length: Int indicating the maximum document length. Documents longer
                than filter_max_length words (as counted by count_words) will be
                filtered out.
"""
super().__init__()
self.filter_max_length = filter_max_length

def count_words(self, text: str) -> int:
"""Returns the number of words in the text"""
return len(re.split(r"\W+", text))

def get_instances(self, output_path: str) -> List[Instance]:
data_dir = os.path.join(output_path, "data")
ensure_directory_exists(data_dir)
file_name = self._DATASET_URL.split("/")[-1]
        download_directory_path = os.path.join(data_dir, file_name)
ensure_file_downloaded(
source_url=self._DATASET_URL,
target_path=download_directory_path,
unpack=True,
unpack_type="unzip",
)

source_split_to_helm_split = {"EN_train": TRAIN_SPLIT, "EN_dev": VALID_SPLIT, "EN_test": TEST_SPLIT}

instances: List[Instance] = []
for source_split, helm_split in source_split_to_helm_split.items():
target_data_dir = os.path.join(download_directory_path, source_split)
for file_name in os.listdir(target_data_dir):
if not file_name.endswith(".json"):
continue
file_path = os.path.join(target_data_dir, file_name)
with open(file_path, "r") as f:
raw_data = json.load(f)
input_text = " ".join(raw_data["TEXT"])
if self.filter_max_length is not None and self.count_words(input_text) > self.filter_max_length:
continue
answer = self.YES_ANSWER if len(raw_data["VIOLATED_ARTICLES"]) > 0 else self.NO_ANSWER
correct_reference = Reference(Output(text=answer), tags=[CORRECT_TAG])
instances.append(Instance(input=Input(input_text), references=[correct_reference], split=helm_split))
return instances
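To make the labeling rule in get_instances concrete, here is a small self-contained sketch. The record below is hypothetical (made up for illustration) but mirrors the structure of the ECHR JSON files read above: a list of paragraphs under "TEXT" and the violated articles under "VIOLATED_ARTICLES".

import re

def count_words(text: str) -> int:
    # Same splitting rule as EchrJudgeScenario.count_words above.
    return len(re.split(r"\W+", text))

# Hypothetical record mimicking the structure read in get_instances.
raw_data = {
    "TEXT": ["The applicant complained under Article 6.", "The Court finds a violation."],
    "VIOLATED_ARTICLES": ["6"],
}

input_text = " ".join(raw_data["TEXT"])
answer = "Yes" if len(raw_data["VIOLATED_ARTICLES"]) > 0 else "No"
keep = count_words(input_text) <= 600  # mirrors filter_max_length=600 from the run spec

print(answer, keep)  # -> Yes True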
17 changes: 17 additions & 0 deletions src/helm/benchmark/static/schema_enterprise.yaml
@@ -121,6 +121,7 @@ run_groups:
subgroups:
- legal_contract_summarization
- casehold
- echr_judgment_classification
- legal_opinion_sentiment_classification

- name: climate_scenarios
@@ -188,6 +189,22 @@
when: before 2021
language: English

- name: echr_judgment_classification
display_name: ECHR Judgment Classification
description: The "Binary Violation" Classification task from the paper Neural Legal Judgment Prediction in English [(Chalkidis et al., 2019)](https://arxiv.org/pdf/1906.02059.pdf). The task is to analyze the description of a legal case from the European Court of Human Rights (ECHR), and classify it as positive if any human rights article or protocol has been violated and negative otherwise.
metric_groups:
- accuracy
- general_information
environment:
main_name: classification_weighted_f1
main_split: test
taxonomy:
task: text classification
      what: cases from the European Court of Human Rights
who: judiciary of the European Court of Human Rights
when: 2014-2018 (train) and 2014-2018 (test)
language: English

- name: legal_opinion_sentiment_classification
display_name: Legal Opinion Sentiment Classification
description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
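Finally, the main metric for this run group is classification_weighted_f1 over the "yes"/"no" labels. The snippet below is not HELM's metric code; it is only a scikit-learn illustration of what a support-weighted F1 measures on toy binary predictions.

# Illustration only: support-weighted F1 on made-up binary predictions.
from sklearn.metrics import f1_score

y_true = ["yes", "yes", "no", "no", "yes"]
y_pred = ["yes", "no", "no", "no", "yes"]

print(f1_score(y_true, y_pred, average="weighted", labels=["yes", "no"]))  # 0.8 for this toy example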