
Commit

Separate files for eval logs (fixie-ai#61)
* write each eval log to its own separate file
farzadab authored Jul 30, 2024
1 parent 0648662 commit e606ff9
Showing 2 changed files with 25 additions and 13 deletions.
34 changes: 22 additions & 12 deletions ultravox/training/evaluation.py
@@ -1,6 +1,7 @@
 import concurrent.futures
 import dataclasses
 import functools
+import json
 import os
 from typing import List, Optional
 
@@ -113,13 +114,17 @@ def evaluate(
     num_procs: int = 8,
     max_new_tokens: Optional[int] = None,
     temperature: Optional[float] = None,
-    verbose: bool = False,
+    log_dir: Optional[str] = None,
 ):
     metrics = {}
 
     world_size = int(os.environ.get("WORLD_SIZE", 1))
     local_rank = int(os.environ.get("LOCAL_RANK", 0))
 
+    if log_dir:
+        log_dir = os.path.join(log_dir, "evals")
+        os.makedirs(log_dir, exist_ok=True)
+
     for task in EVAL_SCENARIOS:
         ds_args = datasets.VoiceDatasetArgs(
             data_dir=data_dir,
@@ -156,21 +161,26 @@ def evaluate(
 
         scores = [x for x in possibly_non_scores if x is not None]
 
-        if verbose:
-            print(f"Eval for {task.dataset}:")
-            for sample, score in zip(output_samples, scores):
-                print("-" * 20)
-                print(f"Q: {sample.question}")
-                print(f"A: {sample.generated_answer}")
-                print(f"X: {sample.expected_answer} [score: {score:.2f}]")
-
         average = np.mean(scores)
         std = np.std(scores) / np.sqrt(len(scores))
         metrics[f"eval_{task.name}"] = average
         metrics[f"eval_{task.name}_std"] = std
 
-        print(
-            f"Aggregate {task.metric} score for {task.dataset}: {average:.2f} ± {std:.2f}"
-        )
+        has_audio_str = "with" if task.include_audio else "without"
+        agg_score_str = f"Aggregate {task.metric} score for {task.dataset} ({has_audio_str} audio): {average:.2f} ± {std:.2f}"
+        print(agg_score_str)
+
+        if log_dir:
+            eval_details = {
+                "score": average,
+                "confidence_interval": std,
+                "task_info": dataclasses.asdict(task),
+                "samples": [
+                    {**dataclasses.asdict(sample), "score": score}
+                    for sample, score in zip(output_samples, scores)
+                ],
+            }
+            with open(os.path.join(log_dir, f"{task.name}.json"), "w") as f:
+                json.dump(eval_details, f, indent=1)
 
     return metrics
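
Not part of the commit: with this change, each eval task gets its own standalone JSON file at <log_dir>/evals/<task_name>.json. Below is a minimal sketch of loading one of these logs after a run; the runs/my_exp directory and boolq task name are placeholder assumptions, while the JSON keys come from the eval_details dict written above.

import json
import os

# Placeholder run directory and task name (not from this commit).
run_dir = "runs/my_exp"
log_path = os.path.join(run_dir, "evals", "boolq.json")

with open(log_path) as f:
    eval_details = json.load(f)

# Keys written by evaluate(): score, confidence_interval, task_info, samples.
print(f"{eval_details['task_info']['name']}: "
      f"{eval_details['score']:.2f} ± {eval_details['confidence_interval']:.2f}")

# Each entry in samples is dataclasses.asdict(sample) plus its per-sample score.
for s in eval_details["samples"][:3]:
    print(s.get("question"), "->", s.get("generated_answer"), s.get("score"))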
4 changes: 3 additions & 1 deletion ultravox/training/train.py
@@ -15,6 +15,7 @@
 import torch.distributed
 import transformers
 import wandb
+import wandb.sdk
 from torch.distributed.elastic.multiprocessing.errors import record
 from torch.utils import data
 
@@ -131,6 +132,7 @@ def main() -> None:
             name=args.exp_name,
             dir="runs",
             tags=args.run_tags,
+            save_code=True,
         )
 
     if args.model_load_dir:
@@ -302,7 +304,7 @@ def main() -> None:
             num_procs=args.eval_num_procs,
             num_samples=args.eval_num_samples,
             max_new_tokens=args.eval_max_new_tokens,
-            verbose=True,
+            log_dir=wandb.run.dir if wandb.run else str(args.logs_dir),
         )
         if is_master:
             trainer.log(metrics)
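
Note on the train.py side (reader's note, not from the commit message): log_dir is set to wandb.run.dir when a W&B run is active, presumably so the per-task JSON files land in the run directory and are uploaded alongside the run; otherwise it falls back to str(args.logs_dir). The added save_code=True asks W&B to also save the launching script with the run.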
