diff --git a/8_capstone_project/README.md b/8_capstone_project/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/8_capstone_project/process_results.py b/8_capstone_project/process_results.py
new file mode 100644
index 00000000..808d1415
--- /dev/null
+++ b/8_capstone_project/process_results.py
@@ -0,0 +1,190 @@
+"""
+Process and aggregate evaluation results from multiple model runs and upload them to the HuggingFace Hub.
+
+This script handles:
+- Extracting results from JSON files
+- Processing results into a structured format
+- Uploading aggregated results to HuggingFace Hub (optional)
+- Logging results to console
+"""
+
+import argparse
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+import pandas as pd
+from datasets import Dataset
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+def extract_results(eval_results: Dict) -> Dict:
+    """
+    Extract relevant scores from evaluation results dictionary.
+
+    Args:
+        eval_results: Dictionary containing model evaluation results
+
+    Returns:
+        Dictionary containing model configuration and task scores
+    """
+    try:
+        model_results = eval_results["config_general"]
+        for task_name, task_score in eval_results["results"]["all"].items():
+            model_results[task_name] = task_score
+        return model_results
+    except KeyError as e:
+        logger.error(f"Missing required key in evaluation results: {e}")
+        raise
+
+
+def get_results_from_dir(results_dir: Path) -> List[Dict]:
+    """
+    Recursively process all result files from the given directory.
+
+    Args:
+        results_dir: Path to directory containing evaluation results
+
+    Returns:
+        List of processed result dictionaries
+    """
+    if not results_dir.exists():
+        raise FileNotFoundError(f"Results directory not found: {results_dir}")
+
+    results = []
+    try:
+        for author_dir in results_dir.iterdir():
+            if not author_dir.is_dir():
+                continue
+
+            for model_dir in author_dir.iterdir():
+                if not model_dir.is_dir():
+                    continue
+
+                for file in model_dir.iterdir():
+                    if not file.suffix == ".json":
+                        continue
+
+                    try:
+                        results.append(process_result_file(file))
+                    except Exception as e:
+                        logger.error(f"Error processing file {file}: {e}")
+                        continue
+
+        if not results:
+            logger.warning("No valid result files found in the specified directory")
+
+        return results
+
+    except Exception as e:
+        logger.error(f"Error reading results directory: {e}")
+        raise
+
+
+def process_result_file(file_path: Path) -> Dict:
+    """
+    Process a single result file.
+
+    Args:
+        file_path: Path to the result file
+
+    Returns:
+        Processed result dictionary
+    """
+    try:
+        with file_path.open() as f:
+            results = json.load(f)
+        return extract_results(results)
+    except json.JSONDecodeError:
+        logger.error(f"Invalid JSON in file: {file_path}")
+        raise
+    except Exception as e:
+        logger.error(f"Error processing file {file_path}: {e}")
+        raise
+
+
+def push_results_to_hub(model_results: List[Dict], repo_id: str) -> None:
+    """
+    Upload processed results to HuggingFace Hub.
+
+    Args:
+        model_results: List of processed result dictionaries
+        repo_id: HuggingFace Hub repository ID to upload to
+    """
+    try:
+        dataset = Dataset.from_list(model_results)
+        dataset.push_to_hub(repo_id=repo_id)
+        logger.info(f"Successfully pushed results to {repo_id}")
+    except Exception as e:
+        logger.error(f"Error pushing results to hub: {e}")
+        raise
+
+
+def display_results(results: List[Dict]) -> None:
+    """
+    Display results as a formatted table.
+
+    Args:
+        results: List of processed result dictionaries
+    """
+    try:
+        df = pd.DataFrame(results)
+        logger.info("\nResults Summary:")
+        logger.info("\n" + str(df))
+
+        # Log some basic statistics
+        logger.info("\nSummary Statistics:")
+        numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
+        logger.info("\n" + str(df[numeric_cols].describe()))
+
+    except Exception as e:
+        logger.error(f"Error displaying results: {e}")
+        raise
+
+
+def main(results_dir: str, repo_id: str = None) -> None:
+    """
+    Main function to process results and optionally upload to HuggingFace Hub.
+
+    Args:
+        results_dir: Directory containing evaluation results
+        repo_id: Optional HuggingFace Hub repository ID to upload results to
+    """
+    try:
+        results_path = Path(results_dir)
+        results = get_results_from_dir(results_path)
+
+        display_results(results)
+
+        if repo_id:
+            push_results_to_hub(results, repo_id)
+
+    except Exception as e:
+        logger.error(f"Error in main execution: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Process model evaluation results and optionally upload to HuggingFace Hub"
+    )
+    parser.add_argument(
+        "--results-dir",
+        type=str,
+        required=True,
+        help="Directory containing evaluation results",
+    )
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        help="HuggingFace Hub repository ID to upload results to (optional)",
+    )
+
+    args = parser.parse_args()
+    main(args.results_dir, args.repo_id)
diff --git a/8_capstone_project/run_evaluation.sh b/8_capstone_project/run_evaluation.sh
new file mode 100755
index 00000000..e152764c
--- /dev/null
+++ b/8_capstone_project/run_evaluation.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+export HF_CACHE="$HOME/.cache/huggingface"
+export TRANSFORMERS_CACHE="$HF_CACHE"
+export HF_HUB_CACHE="$HF_CACHE"
+export HF_HOME="$HF_CACHE"
+
+# Default values
+num_fewshots=0
+truncate_fewshots=0
+
+# Function to print usage
+print_usage() {
+    echo "Usage: $0 -m MODEL_ID [-f NUM_FEWSHOTS] [-x]"
+    echo "  -m MODEL_ID     : HuggingFace model ID (required)"
+    echo "  -f NUM_FEWSHOTS : Number of few-shot examples (default: 0)"
+    echo "  -x              : Truncate few-shot examples (default: disabled)"
+    echo "  -h              : Show this help message"
+}
+
+# Parse command line arguments
+while getopts "m:f:xh" opt; do
+    case $opt in
+        m) model_id="$OPTARG";;
+        f) num_fewshots="$OPTARG";;
+        x) truncate_fewshots=1;;  # task spec expects 0/1
+        h) print_usage; exit 0;;
+        ?) print_usage; exit 1;;
+    esac
+done
+
+# Check if model_id is provided
+if [ -z "$model_id" ]; then
+    echo "Error: Model ID is required"
+    print_usage
+    exit 1
+fi
+
+# Get the directory of the script
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+tasks_dir="$script_dir/submitted_tasks"
+
+# Create output directory if it doesn't exist
+output_dir="$script_dir/results/$model_id"
+if [ ! -d "$output_dir" ]; then
+    mkdir -p "$output_dir"
+fi
+
+# Collect all Python task files from the submitted_tasks directory
+task_files=()
+task_names=()
+while IFS= read -r -d '' file; do
+    task_files+=("${file}")
+    task_names+=("$(basename "$file" .py)")
+done < <(find "$tasks_dir" -name "*.py" -print0)
+
+# Check if any task files were found
+if [ ${#task_files[@]} -eq 0 ]; then
+    echo "Error: No Python task files found in $tasks_dir"
+    exit 1
+fi
+
+echo "----------------------------------------"
+echo "Running evaluation for model: $model_id"
+echo "Found tasks: ${task_names[*]}"
+echo "Number of few-shots: $num_fewshots"
+echo "Truncate few-shots: $truncate_fewshots"
+
+# Build the tasks parameter string
+tasks_param=""
+for task_name in "${task_names[@]}"; do
+    tasks_param+="community|${task_name}|$num_fewshots|$truncate_fewshots,"
+done
+tasks_param=${tasks_param%,} # Remove trailing comma
+
+# Build the custom-tasks parameter string
+custom_tasks_param=$(IFS=,; echo "${task_files[*]}")
+
+lighteval accelerate "pretrained=$model_id" \
+    "$tasks_param" \
+    --custom-tasks "$custom_tasks_param" \
+    --output-dir "$script_dir" \
+    --override-batch-size 512
+
+exit_code=$?
+
+if [ $exit_code -eq 0 ]; then
+    echo "Successfully completed evaluation of all tasks"
+    echo "Results saved in: $output_dir"
+else
+    echo "Error running evaluation (exit code: $exit_code)"
+    exit $exit_code
+fi
\ No newline at end of file
diff --git a/8_capstone_project/submitted_tasks/example.py b/8_capstone_project/submitted_tasks/example.py
new file mode 100644
index 00000000..63ce865a
--- /dev/null
+++ b/8_capstone_project/submitted_tasks/example.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+from lighteval.metrics.utils.metric_utils import (
+    SampleLevelMetric,
+    MetricCategory,
+    MetricUseCase,
+)
+
+################################################################################
+# Define the prompt function based on the structure of the dataset
+################################################################################
+
+
+def prompt_fn(line, task_name: str = None):
+    """Converts a dataset line to a Doc object for evaluation."""
+    instruction = "Choose the correct answer for the following exam question:"
+    return Doc(
+        task_name=task_name,
+        query=f"{instruction} {line['question']}",
+        choices=[
+            f" {line['answer_a']}",
+            f" {line['answer_b']}",
+            f" {line['answer_c']}",
+            f" {line['answer_d']}",
+        ],
+        gold_index=["answer_a", "answer_b", "answer_c", "answer_d"].index(
+            line["correct_answer"]
+        ),
+        instruction=instruction,
+    )
+
+
+################################################################################
+# Define the custom metric based on guide here https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric
+# Or use an existing metric based on the guide here: https://github.com/huggingface/lighteval/wiki/Metric-List
+# Existing metrics can be imported from lighteval.metrics.metrics
+################################################################################
+
+
+def sample_level_fn(formatted_doc: Doc, **kwargs) -> bool:
+    response = np.argmax(kwargs["choices_logprob"])  # most likely choice
+    return response == formatted_doc.gold_index
+
+
+custom_metric = SampleLevelMetric(
+    metric_name="exam_question_accuracy",
+    higher_is_better=True,
+    category=MetricCategory.MULTICHOICE,
+    use_case=MetricUseCase.NONE,
+    sample_level_fn=sample_level_fn,
+    corpus_level_fn=np.mean,
+)
+
+################################################################################
+# Define the task based on the prompt function and the custom metric
+# Based on the guide here: https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task
+################################################################################
+
+task = LightevalTaskConfig(
+    name="example",
+    prompt_function=prompt_fn,
+    suite=["community"],
+    hf_repo="burtenshaw/exam_questions",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    metric=[custom_metric],
+)
+
+# Add the task to TASKS_TABLE
+TASKS_TABLE = [task]
+
+# MODULE LOGIC
+if __name__ == "__main__":
+    print([t.name for t in TASKS_TABLE])
+    print(len(TASKS_TABLE))
diff --git a/8_capstone_project/test.sh b/8_capstone_project/test.sh
new file mode 100755
index 00000000..0f494bec
--- /dev/null
+++ b/8_capstone_project/test.sh
@@ -0,0 +1,6 @@
+export HF_CACHE="$HOME/.cache/huggingface"
+export TRANSFORMERS_CACHE="$HF_CACHE"
+export HF_HUB_CACHE="$HF_CACHE"
+export HF_HOME="$HF_CACHE"
+
+lighteval accelerate "pretrained=HuggingFaceTB/SmolLM2-135M-Instruct" "community|example|20|1" --custom-tasks "submitted_tasks/example.py" --output-dir "results" --override-batch-size 512
\ No newline at end of file
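
For reference, a minimal end-to-end run of the scripts added above might look like the sketch below. This is an illustration rather than part of the diff: it assumes the commands are run from the repository root, that you are already logged in to the Hub (for example via `huggingface-cli login`), and that `your-username/exam-eval-results` is a placeholder dataset repo ID.

    # Evaluate a model on every task file in 8_capstone_project/submitted_tasks/
    # (20 few-shot examples, truncated via -x)
    ./8_capstone_project/run_evaluation.sh -m HuggingFaceTB/SmolLM2-135M-Instruct -f 20 -x

    # Aggregate the per-model JSON results and optionally push them to the Hub
    python 8_capstone_project/process_results.py \
        --results-dir 8_capstone_project/results \
        --repo-id your-username/exam-eval-results  # placeholder repo ID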