[MODULE] Capstone project on evaluation #97

Draft · wants to merge 4 commits into base: main

Empty file added 8_capstone_project/README.md
Empty file.
190 changes: 190 additions & 0 deletions 8_capstone_project/process_results.py
@@ -0,0 +1,190 @@
"""
Process and aggregate evaluation results from multiple model runs and optionally upload them to the HuggingFace Hub.

This script handles:
- Extracting results from JSON files
- Processing results into a structured format
- Uploading aggregated results to HuggingFace Hub (optional)
- Logging results to console
"""

import argparse
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from datasets import Dataset

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def extract_results(eval_results: Dict) -> Dict:
"""
Extract relevant scores from evaluation results dictionary.

Args:
eval_results: Dictionary containing model evaluation results

Returns:
Dictionary containing model configuration and task scores
"""
try:
model_results = eval_results["config_general"]
for task_name, task_score in eval_results["results"]["all"].items():
model_results[task_name] = task_score
return model_results
except KeyError as e:
logger.error(f"Missing required key in evaluation results: {e}")
raise


def get_results_from_dir(results_dir: Path) -> List[Dict]:
"""
Recursively process all result files from the given directory.

Args:
results_dir: Path to directory containing evaluation results

Returns:
List of processed result dictionaries
"""
if not results_dir.exists():
raise FileNotFoundError(f"Results directory not found: {results_dir}")

results = []
try:
for author_dir in results_dir.iterdir():
if not author_dir.is_dir():
continue

for model_dir in author_dir.iterdir():
if not model_dir.is_dir():
continue

for file in model_dir.iterdir():
if not file.suffix == ".json":
continue

try:
results.append(process_result_file(file))
except Exception as e:
logger.error(f"Error processing file {file}: {e}")
continue

if not results:
logger.warning("No valid result files found in the specified directory")

return results

except Exception as e:
logger.error(f"Error reading results directory: {e}")
raise


def process_result_file(file_path: Path) -> Dict:
"""
Process a single result file.

Args:
file_path: Path to the result file

Returns:
Processed result dictionary
"""
try:
with file_path.open() as f:
results = json.load(f)
return extract_results(results)
except json.JSONDecodeError:
logger.error(f"Invalid JSON in file: {file_path}")
raise
except Exception as e:
logger.error(f"Error processing file {file_path}: {e}")
raise


def push_results_to_hub(model_results: List[Dict], repo_id: str) -> None:
"""
Upload processed results to HuggingFace Hub.

Args:
model_results: List of processed result dictionaries
repo_id: HuggingFace Hub repository ID to upload to
"""
try:
dataset = Dataset.from_list(model_results)
dataset.push_to_hub(repo_id=repo_id)
logger.info(f"Successfully pushed results to {repo_id}")
except Exception as e:
logger.error(f"Error pushing results to hub: {e}")
raise


def display_results(results: List[Dict]) -> None:
"""
Display results as a formatted table.

Args:
results: List of processed result dictionaries
"""
try:
df = pd.DataFrame(results)
logger.info("\nResults Summary:")
logger.info("\n" + str(df))

# Log some basic statistics
logger.info("\nSummary Statistics:")
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
logger.info("\n" + str(df[numeric_cols].describe()))

except Exception as e:
logger.error(f"Error displaying results: {e}")
raise


def main(results_dir: str, repo_id: Optional[str] = None) -> None:
"""
Main function to process results and optionally upload to HuggingFace Hub.

Args:
results_dir: Directory containing evaluation results
repo_id: Optional HuggingFace Hub repository ID to upload results to
"""
try:
results_path = Path(results_dir)
results = get_results_from_dir(results_path)

display_results(results)

if repo_id:
push_results_to_hub(results, repo_id)

except Exception as e:
logger.error(f"Error in main execution: {e}")
raise


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process model evaluation results and optionally upload to HuggingFace Hub"
)
parser.add_argument(
"--results-dir",
type=str,
required=True,
help="Directory containing evaluation results",
)
parser.add_argument(
"--repo-id",
type=str,
help="HuggingFace Hub repository ID to upload results to (optional)",
)

args = parser.parse_args()
main(args.results_dir, args.repo_id)
92 changes: 92 additions & 0 deletions 8_capstone_project/run_evaluation.sh
@@ -0,0 +1,92 @@
#!/bin/bash
export HF_CACHE="$HOME/.cache/huggingface"
export TRANSFORMERS_CACHE="$HF_CACHE"
export HF_HUB_CACHE="$HF_CACHE"
export HF_HOME="$HF_CACHE"

# Default values
num_fewshots=0
truncate_fewshots=0

# Function to print usage
print_usage() {
echo "Usage: $0 -m MODEL_ID [-f NUM_FEWSHOTS] [-x]"
echo " -m MODEL_ID : HuggingFace model ID (required)"
echo " -f NUM_FEWSHOTS : Number of few-shot examples (default: 0)"
echo " -x : Truncate few-shot examples (default: false)"
echo " -h : Show this help message"
}
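
# Example (model ID matches the one used in test.sh; flag values are illustrative):
#   ./run_evaluation.sh -m HuggingFaceTB/SmolLM2-135M-Instruct -f 5 -x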

# Parse command line arguments
while getopts "m:f:xh" opt; do
case $opt in
m) model_id="$OPTARG";;
f) num_fewshots="$OPTARG";;
        x) truncate_fewshots=1;;  # lighteval expects 0 or 1 for the truncate flag
h) print_usage; exit 0;;
?) print_usage; exit 1;;
esac
done

# Check if model_id is provided
if [ -z "$model_id" ]; then
echo "Error: Model ID is required"
print_usage
exit 1
fi

# Get the directory of the script
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tasks_dir="$script_dir/submitted_tasks"

# Create output directory if it doesn't exist
output_dir="$script_dir/results/$model_id"
if [ ! -d "$output_dir" ]; then
mkdir -p "$output_dir"
fi

# Collect all Python task files from the submitted_tasks directory
task_files=()
task_names=()
while IFS= read -r -d '' file; do
task_files+=("${file}")
    task_names+=("$(basename "$file" .py)")
done < <(find "$tasks_dir" -name "*.py" -print0)

# Check if any task files were found
if [ ${#task_files[@]} -eq 0 ]; then
echo "Error: No Python task files found in $tasks_dir"
exit 1
fi

echo "----------------------------------------"
echo "Running evaluation for model: $model_id"
echo "Found tasks: ${task_names[*]}"
echo "Number of few-shots: $num_fewshots"
echo "Truncate few-shots: $truncate_fewshots"

# Build the tasks parameter string
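# Each entry follows lighteval's task spec: suite|task_name|num_fewshots|truncate_flag (0 or 1)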
tasks_param=""
for task_name in "${task_names[@]}"; do
tasks_param+="community|${task_name}|$num_fewshots|$truncate_fewshots,"
done
tasks_param=${tasks_param%,} # Remove trailing comma

# Build the custom-tasks parameter string
custom_tasks_param=$(IFS=,; echo "${task_files[*]}")

# Run the evaluation. lighteval is expected to write its JSON results under
# $script_dir/results/<author>/<model>/, the layout that process_results.py consumes.
lighteval accelerate "pretrained=$model_id" \
    "$tasks_param" \
    --custom-tasks "$custom_tasks_param" \
    --output-dir "$script_dir" \
    --override-batch-size 512

exit_code=$?

if [ $exit_code -eq 0 ]; then
echo "Successfully completed evaluation of all tasks"
echo "Results saved in: $output_dir"
else
echo "Error running evaluation (exit code: $exit_code)"
exit $exit_code
fi
80 changes: 80 additions & 0 deletions 8_capstone_project/submitted_tasks/example.py
@@ -0,0 +1,80 @@
import numpy as np

from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.metrics.utils.metric_utils import (
SampleLevelMetric,
MetricCategory,
MetricUseCase,
)

################################################################################
# Define the prompt function based on the structure of the dataset
################################################################################
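#
# The evaluation dataset is expected to provide the columns: question, answer_a,
# answer_b, answer_c, answer_d, and correct_answer (one of "answer_a".."answer_d").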


def prompt_fn(line, task_name: str = None):
"""Converts a dataset line to a Doc object for evaluation."""
instruction = "Choose the correct answer for the following exam question:"
return Doc(
task_name=task_name,
query=f"{instruction} {line['question']}",
choices=[
f" {line['answer_a']}",
f" {line['answer_b']}",
f" {line['answer_c']}",
f" {line['answer_d']}",
],
gold_index=["answer_a", "answer_b", "answer_c", "answer_d"].index(
line["correct_answer"]
),
instruction=instruction,
)


################################################################################
# Define the custom metric based on the guide here: https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric
# Or use an existing metric based on the guide here: https://github.com/huggingface/lighteval/wiki/Metric-List
# Existing metrics can be imported from lighteval.metrics.metrics
################################################################################


def sample_level_fn(formatted_doc: Doc, **kwargs) -> bool:
    # The model's predicted choice is the one with the highest log-probability.
    response = np.argmax(kwargs["choices_logprob"])
return response == formatted_doc.gold_index


custom_metric = SampleLevelMetric(
metric_name="exam_question_accuracy",
higher_is_better=True,
category=MetricCategory.MULTICHOICE,
use_case=MetricUseCase.NONE,
sample_level_fn=sample_level_fn,
corpus_level_fn=np.mean,
)

################################################################################
# Define the task based on the prompt function and the custom metric
# Based on the guide here: https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task
################################################################################
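#
# hf_repo/hf_subset point at the evaluation dataset on the Hub;
# evaluation_splits selects which split the task is scored on.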

task = LightevalTaskConfig(
name="example",
prompt_function=prompt_fn,
suite=["community"],
hf_repo="burtenshaw/exam_questions",
hf_subset="default",
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split=None,
few_shots_select=None,
metric=[custom_metric],
)

# Add the task to TASKS_TABLE
TASKS_TABLE = [task]

# MODULE LOGIC
if __name__ == "__main__":
print([t.name for t in TASKS_TABLE])
print(len(TASKS_TABLE))
6 changes: 6 additions & 0 deletions 8_capstone_project/test.sh
@@ -0,0 +1,6 @@
#!/bin/bash
export HF_CACHE="$HOME/.cache/huggingface"
export TRANSFORMERS_CACHE="$HF_CACHE"
export HF_HUB_CACHE="$HF_CACHE"
export HF_HOME="$HF_CACHE"

# Smoke test: evaluate SmolLM2-135M-Instruct on the example task (task spec: suite|task|num_fewshots|truncate_flag)
lighteval accelerate "pretrained=HuggingFaceTB/SmolLM2-135M-Instruct" "community|example|20|1" --custom-tasks "submitted_tasks/example.py" --output-dir "results" --override-batch-size 512