
fix(weave): Allow back-compat for "old-style" scorers and evaluations using model_output kwarg #2806

Merged
merged 12 commits on Oct 30, 2024
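Roughly, the back-compat this PR preserves (a sketch distilled from the tests added below, not an official example): an old-style scorer that declares the deprecated `model_output` kwarg keeps working, and the evaluation summary stays keyed by `model_output` instead of `output`.

```python
import asyncio

import weave
from weave import Evaluation

dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]


# Old-style scorer: declares `model_output` rather than the newer `output` kwarg.
@weave.op()
def score_oldstyle(model_output, target):
    return model_output == target


@weave.op()
async def model_predict(input) -> int:
    return eval(input)


evaluation = Evaluation(dataset=dataset_rows, scorers=[score_oldstyle])
result = asyncio.run(evaluation.evaluate(model_predict))
# result is keyed by "model_output" rather than "output", e.g.
# {"model_output": {"mean": 9.5}, "score_oldstyle": {"true_count": 1, "true_fraction": 0.5}, ...}
```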
27 changes: 0 additions & 27 deletions tests/trace/test_evaluate.py
@@ -4,7 +4,6 @@

import weave
from weave import Dataset, Evaluation, Model
from weave.scorers import MultiTaskBinaryClassificationF1

dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
dataset = Dataset(rows=dataset_rows)
@@ -153,29 +152,3 @@ def score(self, target, output):
"mean": pytest.approx(0, abs=1),
},
}


def test_multiclass_f1_score(client):
Collaborator Author:

This test is moved to test_evaluate_oldstyle

    evaluation = Evaluation(
        dataset=[{"target": {"a": False, "b": True}, "pred": {"a": True, "b": False}}],
        scorers=[MultiTaskBinaryClassificationF1(class_names=["a", "b"])],
    )

    @weave.op()
    def return_pred(pred):
        return pred

    result = asyncio.run(evaluation.evaluate(return_pred))
    assert result == {
        "output": {
            "a": {"true_count": 1, "true_fraction": 1.0},
            "b": {"true_count": 0, "true_fraction": 0.0},
        },
        "MultiTaskBinaryClassificationF1": {
            "a": {"f1": 0, "precision": 0.0, "recall": 0},
            "b": {"f1": 0, "precision": 0, "recall": 0.0},
        },
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }
181 changes: 181 additions & 0 deletions tests/trace/test_evaluate_oldstyle.py
Collaborator Author:

This file re-implements the old-style tests, specifically:

  1. outputs containing model_output; and
  2. scorers that take a model_output param

@@ -0,0 +1,181 @@
import asyncio

import pytest

import weave
from weave import Dataset, Evaluation, Model
from weave.scorers import MultiTaskBinaryClassificationF1

dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
dataset = Dataset(rows=dataset_rows)


expected_eval_result = {
    "model_output": {"mean": 9.5},
    "score_oldstyle": {"true_count": 1, "true_fraction": 0.5},
    "model_latency": {"mean": pytest.approx(0, abs=1)},
}


class EvalModel(Model):
    @weave.op()
    async def predict(self, input) -> str:
        return eval(input)


@weave.op()
def score_oldstyle(model_output, target):
    return model_output == target


@weave.op()
def example_to_model_input(example):
    return {"input": example["input"]}


def test_evaluate_callable_as_model(client):
    @weave.op()
    async def model_predict(input) -> str:
        return eval(input)

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    result = asyncio.run(evaluation.evaluate(model_predict))
    assert result == expected_eval_result


def test_predict_can_receive_other_params(client):
    @weave.op()
    async def model_predict(input, target) -> str:
        return eval(input) + target

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    result = asyncio.run(evaluation.evaluate(model_predict))
    assert result == {
        "model_output": {"mean": 18.5},
        "score_oldstyle": {"true_count": 0, "true_fraction": 0.0},
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }


def test_can_preprocess_model_input(client):
    @weave.op()
    async def model_predict(x) -> str:
        return eval(x)

    @weave.op()
    def preprocess(example):
        return {"x": example["input"]}

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
        preprocess_model_input=preprocess,
    )
    result = asyncio.run(evaluation.evaluate(model_predict))
    assert result == expected_eval_result


def test_evaluate_rows_only(client):
    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == expected_eval_result


def test_evaluate_other_model_method_names():
    class EvalModel(Model):
        @weave.op()
        async def infer(self, input) -> str:
            return eval(input)

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == expected_eval_result


def test_score_as_class(client):
    class MyScorerOldstyle(weave.Scorer):
        @weave.op()
        def score(self, model_output, target):
            return model_output == target

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[MyScorerOldstyle()],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == {
        "model_output": {"mean": 9.5},
        "MyScorerOldstyle": {"true_count": 1, "true_fraction": 0.5},
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }


def test_score_with_custom_summarize(client):
    class MyScorerOldstyle(weave.Scorer):
        @weave.op()
        def summarize(self, score_rows):
            assert list(score_rows) == [True, False]
            return {"awesome": 3}

        @weave.op()
        def score(self, model_output, target):
            return model_output == target

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[MyScorerOldstyle()],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == {
        "model_output": {"mean": 9.5},
        "MyScorerOldstyle": {"awesome": 3},
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }


def test_multiclass_f1_score(client):
    evaluation = Evaluation(
        dataset=[{"target": {"a": False, "b": True}, "pred": {"a": True, "b": False}}],
        scorers=[MultiTaskBinaryClassificationF1(class_names=["a", "b"])],
    )

    @weave.op()
    def return_pred(pred):
        return pred

    result = asyncio.run(evaluation.evaluate(return_pred))
    assert result == {
        "model_output": {
            "a": {"true_count": 1, "true_fraction": 1.0},
            "b": {"true_count": 0, "true_fraction": 0.0},
        },
        "MultiTaskBinaryClassificationF1": {
            "a": {"f1": 0, "precision": 0.0, "recall": 0},
            "b": {"f1": 0, "precision": 0, "recall": 0.0},
        },
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }
24 changes: 16 additions & 8 deletions weave/flow/eval.py
@@ -1,10 +1,12 @@
import asyncio
import inspect
import logging
import textwrap
import time
import traceback
from typing import Any, Callable, Coroutine, Optional, Union, cast

from pydantic import PrivateAttr
from rich import print
from rich.console import Console

@@ -28,7 +30,7 @@
from weave.trace.weave_client import Call, get_ref

console = Console()

logger = logging.getLogger(__name__)

INVALID_MODEL_ERROR = (
"`Evaluation.evaluate` requires a `Model` or `Op` instance as the `model` argument. "
@@ -96,7 +98,7 @@ def function_to_evaluate(question: str):

# Score your examples using scoring functions
evaluation = Evaluation(
dataset=examples, scorers=[match_score1]
dataset=examples, scorers=[match_score1], output_key="generated_text"
)

# Start tracking the evaluation
@@ -111,6 +113,8 @@ def function_to_evaluate(question: str):
preprocess_model_input: Optional[Callable] = None
trials: int = 1

_output_key: str = PrivateAttr("output")

def model_post_init(self, __context: Any) -> None:
scorers: list[Union[Callable, Scorer, Op]] = []
for scorer in self.scorers or []:
@@ -339,7 +343,12 @@ async def predict_and_score(
raise ValueError(
f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
)
score_args["output"] = model_output
if "model_output" in score_arg_names:
util.warn_once(
logger, "model_output is deprecated, please use output instead"
)
self._output_key = "model_output"
Collaborator:

what happens if some scorers use the new output and some use the old? I don't think this should be stored on the eval. just return output from predict and score

Collaborator Author:

are you saying for all scorers to return output instead of model_output?

Collaborator:

i'm saying that self._output_key does not make sense. The output key for the arg is per-scorer, not global to the eval

Collaborator Author:

I think it does make sense to keep the output key consistent per eval though. Otherwise if you mix and match scorers, should you get both output and model_output keys? That doesn't seem right to me.

Collaborator:

i think it is worth being clear here - all we are talking about is the key for the output of predict and score (i think?). Which, if you mix and match, is going to flip around given the current implementation. the call stack is

  • predict_and_score
    • predict
    • score 1
    • score 2
    • score 3

the output of predict_and_score should not be keyed based on the last scorer param
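To make the suggestion concrete, a rough sketch (hypothetical helper names, not code from this PR) of resolving the kwarg per scorer while keeping the `predict_and_score` result keyed as `"output"`:

```python
import inspect
from typing import Any, Callable


def resolve_output_arg_name(score_fn: Callable) -> str:
    """Pick the kwarg name this particular scorer expects for the model output."""
    # Old-style scorers declare `model_output`; new-style scorers declare `output`.
    params = inspect.signature(score_fn).parameters
    return "model_output" if "model_output" in params else "output"


def build_score_args(score_fn: Callable, example: dict, model_output: Any) -> dict:
    # Per-scorer resolution: each scorer receives the kwarg it declared, so mixing
    # old- and new-style scorers in one eval never flips a global key around.
    arg_names = set(inspect.signature(score_fn).parameters)
    score_args = {k: v for k, v in example.items() if k in arg_names}
    score_args[resolve_output_arg_name(score_fn)] = model_output
    return score_args


# ...while the dict returned by predict_and_score stays keyed as "output"
# regardless of which style the scorers use.
```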

score_args[self._output_key] = model_output

try:
if is_op(score_fn) and model_call:
@@ -370,7 +379,7 @@ async def predict_and_score(
for param in score_signature.parameters.values()
if param.default == inspect.Parameter.empty
]
required_arg_names.remove("output")
required_arg_names.remove(self._output_key)
Collaborator:

this should be dynamic based on the scorer

Contributor:

I think this should be required_arg_names.remove(score_output_name) instead or we could just remove this line.


message = textwrap.dedent(
f"""
@@ -397,7 +406,7 @@ async def predict_and_score(
scores[scorer_name] = result

return {
"output": model_output,
self._output_key: model_output,
Collaborator:

this should always be "output" even if you have some scorers that use "model_output"

"scores": scores,
"model_latency": model_latency,
}
@@ -421,7 +430,6 @@ async def summarize(self, eval_table: EvaluationResults) -> dict:
model_output_summary = auto_summarize(vals)
if model_output_summary:
summary[name] = model_output_summary

return summary

async def get_eval_results(
@@ -441,7 +449,7 @@ async def eval_example(example: dict) -> dict:
except Exception as e:
print("Predict and score failed")
traceback.print_exc()
return {"output": None, "scores": {}}
return {self._output_key: None, "scores": {}}
Collaborator:

this should always be "output" even if you have some scorers that use "model_output"

return eval_row

n_complete = 0
@@ -458,7 +466,7 @@
# f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]" # type:ignore
# )
if eval_row is None:
eval_row = {"output": None, "scores": {}}
eval_row = {self._output_key: None, "scores": {}}
Collaborator:

this should always be "output" even if you have some scorers that use "model_output"

else:
eval_row["scores"] = eval_row.get("scores", {})
for scorer in self.scorers or []:
10 changes: 10 additions & 0 deletions weave/flow/util.py
@@ -1,10 +1,13 @@
import asyncio
import logging
import multiprocessing
from typing import Any, AsyncIterator, Awaitable, Callable, Iterable, Tuple, TypeVar

T = TypeVar("T")
U = TypeVar("U")

_shown_warnings = set()


async def async_foreach(
sequence: Iterable[T],
@@ -70,3 +73,10 @@ async def run_in_process_with_timeout(
raise ValueError(
"Unhandled exception in subprocess. Exitcode: " + str(process.exitcode)
)


def warn_once(logger: logging.Logger, message: str) -> None:
"""Display a warning message only once. If the message has already been shown, do nothing."""
if message not in _shown_warnings:
logger.warning(message)
_shown_warnings.add(message)
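A small usage sketch (hypothetical caller, mirroring how `eval.py` invokes it above): repeated calls with the same message log only once.

```python
import logging

from weave.flow import util

logger = logging.getLogger(__name__)

util.warn_once(logger, "model_output is deprecated, please use output instead")
# Second call with the same message is a no-op: it was already shown once.
util.warn_once(logger, "model_output is deprecated, please use output instead")
```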
2 changes: 1 addition & 1 deletion weave/scorers/base_scorer.py
@@ -16,7 +16,7 @@ class Scorer(Object):
        description="A mapping from column names in the dataset to the names expected by the scorer",
    )

    def score(self, input: Any, target: Any, output: Any) -> Any:
    def score(self, *, output: Any, **kwargs: Any) -> Any:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more representative of the actual score func signature. Some scorers don't take input, or may take other args

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree, and input is not necessary an available col.

        raise NotImplementedError

    @weave.op()
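To illustrate why the looser base signature helps (a hypothetical scorer, not part of this PR): a new-style scorer declares only the columns it actually needs, with no `input` argument.

```python
import weave


class ExactMatch(weave.Scorer):  # hypothetical scorer, for illustration only
    @weave.op()
    def score(self, output: str, target: str) -> dict:
        # Only the columns this scorer cares about; `input` is not required.
        return {"correct": output == target}
```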
7 changes: 5 additions & 2 deletions weave/scorers/classification_scorer.py
@@ -37,12 +37,15 @@ def summarize(self, score_rows: list) -> Optional[dict]:

        return result

    # NOTE: This is an old-style scorer that uses `model_output` instead of `output` for
    # backwards compatibility. In future, this behaviour may change to use the newer `output` key.
    # You can still pass a `column_map` to map to the new `output` key if you prefer.
    @weave.op()
    def score(self, target: dict, output: Optional[dict]) -> dict:
    def score(self, target: dict, model_output: Optional[dict]) -> dict:
        result = {}
        for class_name in self.class_names:
            class_label = target.get(class_name)
            class_output = output.get(class_name) if output else None
            class_output = model_output.get(class_name) if model_output else None
            result[class_name] = {
                "correct": class_label == class_output,
                "negative": not class_output,