[MODULE] Capstone project on evaluation #97

Draft · wants to merge 4 commits into base: main

Empty file added 8_capstone_project/README.md
Empty file.
190 changes: 190 additions & 0 deletions 8_capstone_project/process_results.py
@@ -0,0 +1,190 @@
"""
Process and aggregate evaluation results from multiple model runs and optionally upload them to the HuggingFace Hub.

This script handles:
- Extracting results from JSON files
- Processing results into a structured format
- Uploading aggregated results to HuggingFace Hub (optional)
- Logging results to console
"""

import argparse
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from datasets import Dataset

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def extract_results(eval_results: Dict) -> Dict:
"""
Extract relevant scores from evaluation results dictionary.

Args:
eval_results: Dictionary containing model evaluation results

Returns:
Dictionary containing model configuration and task scores
"""
try:
model_results = eval_results["config_general"]
for task_name, task_score in eval_results["results"]["all"].items():
model_results[task_name] = task_score
return model_results
except KeyError as e:
logger.error(f"Missing required key in evaluation results: {e}")
raise


def get_results_from_dir(results_dir: Path) -> List[Dict]:
"""
Recursively process all result files from the given directory.

Args:
results_dir: Path to directory containing evaluation results

Returns:
List of processed result dictionaries
"""
if not results_dir.exists():
raise FileNotFoundError(f"Results directory not found: {results_dir}")

results = []
try:
for author_dir in results_dir.iterdir():
if not author_dir.is_dir():
continue

for model_dir in author_dir.iterdir():
if not model_dir.is_dir():
continue

for file in model_dir.iterdir():
if not file.suffix == ".json":
continue

try:
results.append(process_result_file(file))
except Exception as e:
logger.error(f"Error processing file {file}: {e}")
continue

if not results:
logger.warning("No valid result files found in the specified directory")

return results

except Exception as e:
logger.error(f"Error reading results directory: {e}")
raise


def process_result_file(file_path: Path) -> Dict:
"""
Process a single result file.

Args:
file_path: Path to the result file

Returns:
Processed result dictionary
"""
try:
with file_path.open() as f:
results = json.load(f)
return extract_results(results)
except json.JSONDecodeError:
logger.error(f"Invalid JSON in file: {file_path}")
raise
except Exception as e:
logger.error(f"Error processing file {file_path}: {e}")
raise


def push_results_to_hub(model_results: List[Dict], repo_id: str) -> None:
"""
Upload processed results to HuggingFace Hub.

Args:
model_results: List of processed result dictionaries
repo_id: HuggingFace Hub repository ID to upload to
"""
try:
dataset = Dataset.from_list(model_results)
dataset.push_to_hub(repo_id=repo_id)
logger.info(f"Successfully pushed results to {repo_id}")
except Exception as e:
logger.error(f"Error pushing results to hub: {e}")
raise


def display_results(results: List[Dict]) -> None:
"""
Display results as a formatted table.

Args:
results: List of processed result dictionaries
"""
try:
df = pd.DataFrame(results)
logger.info("\nResults Summary:")
logger.info("\n" + str(df))

# Log some basic statistics
logger.info("\nSummary Statistics:")
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
logger.info("\n" + str(df[numeric_cols].describe()))

except Exception as e:
logger.error(f"Error displaying results: {e}")
raise


def main(results_dir: str, repo_id: Optional[str] = None) -> None:
"""
Main function to process results and optionally upload to HuggingFace Hub.

Args:
results_dir: Directory containing evaluation results
repo_id: Optional HuggingFace Hub repository ID to upload results to
"""
try:
results_path = Path(results_dir)
results = get_results_from_dir(results_path)

display_results(results)

if repo_id:
push_results_to_hub(results, repo_id)

except Exception as e:
logger.error(f"Error in main execution: {e}")
raise


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process model evaluation results and optionally upload to HuggingFace Hub"
)
parser.add_argument(
"--results-dir",
type=str,
required=True,
help="Directory containing evaluation results",
)
parser.add_argument(
"--repo-id",
type=str,
help="HuggingFace Hub repository ID to upload results to (optional)",
)

args = parser.parse_args()
main(args.results_dir, args.repo_id)
92 changes: 92 additions & 0 deletions 8_capstone_project/run_evaluation.sh
@@ -0,0 +1,92 @@
#!/bin/bash
export HF_CACHE="$HOME/.cache/huggingface"
export TRANSFORMERS_CACHE="$HF_CACHE"
export HF_HUB_CACHE="$HF_CACHE"
export HF_HOME="$HF_CACHE"

# Default values
num_fewshots=0
truncate_fewshots=0

# Function to print usage
print_usage() {
echo "Usage: $0 -m MODEL_ID [-f NUM_FEWSHOTS] [-x]"
echo " -m MODEL_ID : HuggingFace model ID (required)"
echo " -f NUM_FEWSHOTS : Number of few-shot examples (default: 0)"
echo " -x : Truncate few-shot examples (default: false)"
echo " -h : Show this help message"
}
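
# Example (model ID matches the one used in test.sh; flag values are illustrative):
#   ./run_evaluation.sh -m HuggingFaceTB/SmolLM2-135M-Instruct -f 5 -x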

# Parse command line arguments
while getopts "m:f:xh" opt; do
case $opt in
m) model_id="$OPTARG";;
f) num_fewshots="$OPTARG";;
        x) truncate_fewshots=1;;  # lighteval expects 0 or 1 for the truncate flag
h) print_usage; exit 0;;
?) print_usage; exit 1;;
esac
done

# Check if model_id is provided
if [ -z "$model_id" ]; then
echo "Error: Model ID is required"
print_usage
exit 1
fi

# Get the directory of the script
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tasks_dir="$script_dir/submitted_tasks"

# Create output directory if it doesn't exist
output_dir="$script_dir/results/$model_id"
if [ ! -d "$output_dir" ]; then
mkdir -p "$output_dir"
fi

# Collect all Python task files from the submitted_tasks directory
task_files=()
task_names=()
while IFS= read -r -d '' file; do
task_files+=("${file}")
    task_names+=("$(basename "$file" .py)")
done < <(find "$tasks_dir" -name "*.py" -print0)

# Check if any task files were found
if [ ${#task_files[@]} -eq 0 ]; then
echo "Error: No Python task files found in $tasks_dir"
exit 1
fi

echo "----------------------------------------"
echo "Running evaluation for model: $model_id"
echo "Found tasks: ${task_names[*]}"
echo "Number of few-shots: $num_fewshots"
echo "Truncate few-shots: $truncate_fewshots"

# Build the tasks parameter string
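# Each entry follows lighteval's task spec: suite|task_name|num_fewshots|truncate_flag (0 or 1)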
tasks_param=""
for task_name in "${task_names[@]}"; do
tasks_param+="community|${task_name}|$num_fewshots|$truncate_fewshots,"
done
tasks_param=${tasks_param%,} # Remove trailing comma

# Build the custom-tasks parameter string
custom_tasks_param=$(IFS=,; echo "${task_files[*]}")

# Run the evaluation. lighteval is expected to write its JSON results under
# $script_dir/results/<author>/<model>/, the layout that process_results.py consumes.
lighteval accelerate "pretrained=$model_id" \
    "$tasks_param" \
    --custom-tasks "$custom_tasks_param" \
    --output-dir "$script_dir" \
    --override-batch-size 512

exit_code=$?

if [ $exit_code -eq 0 ]; then
echo "Successfully completed evaluation of all tasks"
echo "Results saved in: $output_dir"
else
echo "Error running evaluation (exit code: $exit_code)"
exit $exit_code
fi
80 changes: 80 additions & 0 deletions 8_capstone_project/submitted_tasks/example.py
@@ -0,0 +1,80 @@
import numpy as np

from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.metrics.utils.metric_utils import (
SampleLevelMetric,
MetricCategory,
MetricUseCase,
)

################################################################################
# Define the prompt function based on the structure of the dataset
################################################################################
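#
# The evaluation dataset is expected to provide the columns: question, answer_a,
# answer_b, answer_c, answer_d, and correct_answer (one of "answer_a".."answer_d").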


def prompt_fn(line, task_name: str = None):
"""Converts a dataset line to a Doc object for evaluation."""
instruction = "Choose the correct answer for the following exam question:"
return Doc(
task_name=task_name,
query=f"{instruction} {line['question']}",
choices=[
f" {line['answer_a']}",
f" {line['answer_b']}",
f" {line['answer_c']}",
f" {line['answer_d']}",
],
gold_index=["answer_a", "answer_b", "answer_c", "answer_d"].index(
line["correct_answer"]
),
instruction=instruction,
)


################################################################################
# Define the custom metric based on the guide here: https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric
# Or use an existing metric based on the guide here: https://github.com/huggingface/lighteval/wiki/Metric-List
# Existing metrics can be imported from lighteval.metrics.metrics
################################################################################


def sample_level_fn(formatted_doc: Doc, **kwargs) -> bool:
    # The model's predicted choice is the one with the highest log-probability.
    response = np.argmax(kwargs["choices_logprob"])
return response == formatted_doc.gold_index


custom_metric = SampleLevelMetric(
metric_name="exam_question_accuracy",
higher_is_better=True,
category=MetricCategory.MULTICHOICE,
use_case=MetricUseCase.NONE,
sample_level_fn=sample_level_fn,
corpus_level_fn=np.mean,
)

################################################################################
# Define the task based on the prompt function and the custom metric
# Based on the guide here: https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task
################################################################################
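#
# hf_repo/hf_subset point at the evaluation dataset on the Hub;
# evaluation_splits selects which split the task is scored on.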

task = LightevalTaskConfig(
name="example",
prompt_function=prompt_fn,
suite=["community"],
hf_repo="burtenshaw/exam_questions",
hf_subset="default",
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split=None,
few_shots_select=None,
metric=[custom_metric],
)

# Add the task to TASKS_TABLE
TASKS_TABLE = [task]

# MODULE LOGIC
if __name__ == "__main__":
print([t.name for t in TASKS_TABLE])
print(len(TASKS_TABLE))
6 changes: 6 additions & 0 deletions 8_capstone_project/test.sh
@@ -0,0 +1,6 @@
#!/bin/bash
export HF_CACHE="$HOME/.cache/huggingface"
export TRANSFORMERS_CACHE="$HF_CACHE"
export HF_HUB_CACHE="$HF_CACHE"
export HF_HOME="$HF_CACHE"

# Smoke test: evaluate SmolLM2-135M-Instruct on the example task (task spec: suite|task|num_fewshots|truncate_flag)
lighteval accelerate "pretrained=HuggingFaceTB/SmolLM2-135M-Instruct" "community|example|20|1" --custom-tasks "submitted_tasks/example.py" --output-dir "results" --override-batch-size 512