fix(weave): Allow back-compat for "old-style" scorers and evaluations using model_output kwarg #2806
Review comment on this file: This file re-implements the old-style tests, specifically: …
@@ -0,0 +1,181 @@
import asyncio

import pytest

import weave
from weave import Dataset, Evaluation, Model
from weave.scorers import MultiTaskBinaryClassificationF1

dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
dataset = Dataset(rows=dataset_rows)


expected_eval_result = {
    "model_output": {"mean": 9.5},
    "score_oldstyle": {"true_count": 1, "true_fraction": 0.5},
    "model_latency": {"mean": pytest.approx(0, abs=1)},
}


class EvalModel(Model):
    @weave.op()
    async def predict(self, input) -> str:
        return eval(input)


@weave.op()
def score_oldstyle(model_output, target):
    return model_output == target


@weave.op()
def example_to_model_input(example):
    return {"input": example["input"]}


def test_evaluate_callable_as_model(client):
    @weave.op()
    async def model_predict(input) -> str:
        return eval(input)

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    result = asyncio.run(evaluation.evaluate(model_predict))
    assert result == expected_eval_result


def test_predict_can_receive_other_params(client):
    @weave.op()
    async def model_predict(input, target) -> str:
        return eval(input) + target

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    result = asyncio.run(evaluation.evaluate(model_predict))
    assert result == {
        "model_output": {"mean": 18.5},
        "score_oldstyle": {"true_count": 0, "true_fraction": 0.0},
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }


def test_can_preprocess_model_input(client):
    @weave.op()
    async def model_predict(x) -> str:
        return eval(x)

    @weave.op()
    def preprocess(example):
        return {"x": example["input"]}

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
        preprocess_model_input=preprocess,
    )
    result = asyncio.run(evaluation.evaluate(model_predict))
    assert result == expected_eval_result


def test_evaluate_rows_only(client):
    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == expected_eval_result


def test_evaluate_other_model_method_names():
    class EvalModel(Model):
        @weave.op()
        async def infer(self, input) -> str:
            return eval(input)

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[score_oldstyle],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == expected_eval_result


def test_score_as_class(client):
    class MyScorerOldstyle(weave.Scorer):
        @weave.op()
        def score(self, model_output, target):
            return model_output == target

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[MyScorerOldstyle()],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == {
        "model_output": {"mean": 9.5},
        "MyScorerOldstyle": {"true_count": 1, "true_fraction": 0.5},
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }


def test_score_with_custom_summarize(client):
    class MyScorerOldstyle(weave.Scorer):
        @weave.op()
        def summarize(self, score_rows):
            assert list(score_rows) == [True, False]
            return {"awesome": 3}

        @weave.op()
        def score(self, model_output, target):
            return model_output == target

    evaluation = Evaluation(
        dataset=dataset_rows,
        scorers=[MyScorerOldstyle()],
    )
    model = EvalModel()
    result = asyncio.run(evaluation.evaluate(model))
    assert result == {
        "model_output": {"mean": 9.5},
        "MyScorerOldstyle": {"awesome": 3},
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }


def test_multiclass_f1_score(client):
    evaluation = Evaluation(
        dataset=[{"target": {"a": False, "b": True}, "pred": {"a": True, "b": False}}],
        scorers=[MultiTaskBinaryClassificationF1(class_names=["a", "b"])],
    )

    @weave.op()
    def return_pred(pred):
        return pred

    result = asyncio.run(evaluation.evaluate(return_pred))
    assert result == {
        "model_output": {
            "a": {"true_count": 1, "true_fraction": 1.0},
            "b": {"true_count": 0, "true_fraction": 0.0},
        },
        "MultiTaskBinaryClassificationF1": {
            "a": {"f1": 0, "precision": 0.0, "recall": 0},
            "b": {"f1": 0, "precision": 0, "recall": 0.0},
        },
        "model_latency": {
            "mean": pytest.approx(0, abs=1),
        },
    }
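
To make the compatibility surface concrete, here is a hypothetical extra test (not part of this PR) that mixes a new-style scorer keyed on "output" with the old-style score_oldstyle above. This is the mixed case debated in the review thread further down; with the diff as written, the result key appears to depend on scorer order, so the sketch keeps its assertions deliberately loose. The names test_mixed_scorer_styles and score_newstyle are made up; everything else reuses definitions from this file.

def test_mixed_scorer_styles(client):
    @weave.op()
    def score_newstyle(output, target):
        # New-style scorer: the model result arrives under the "output" kwarg.
        return output == target

    evaluation = Evaluation(
        dataset=dataset_rows,
        # New-style first, then old-style; reversing the order exercises the
        # ambiguity the reviewers point out below.
        scorers=[score_newstyle, score_oldstyle],
    )
    result = asyncio.run(evaluation.evaluate(EvalModel()))
    # Each scorer should still be summarized under its own name, whichever key
    # the model output ends up under ("output" vs "model_output").
    assert "score_newstyle" in result
    assert "score_oldstyle" in result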
@@ -1,10 +1,12 @@
import asyncio
import inspect
+import logging
import textwrap
import time
import traceback
from typing import Any, Callable, Coroutine, Optional, Union, cast

+from pydantic import PrivateAttr
from rich import print
from rich.console import Console

@@ -28,7 +30,7 @@
from weave.trace.weave_client import Call, get_ref

console = Console()

logger = logging.getLogger(__name__)

INVALID_MODEL_ERROR = (
    "`Evaluation.evaluate` requires a `Model` or `Op` instance as the `model` argument. "

@@ -96,7 +98,7 @@ def function_to_evaluate(question: str):

        # Score your examples using scoring functions
        evaluation = Evaluation(
-            dataset=examples, scorers=[match_score1]
+            dataset=examples, scorers=[match_score1], output_key="generated_text"
        )

        # Start tracking the evaluation

@@ -111,6 +113,8 @@ def function_to_evaluate(question: str):
    preprocess_model_input: Optional[Callable] = None
    trials: int = 1

+    _output_key: str = PrivateAttr("output")

    def model_post_init(self, __context: Any) -> None:
        scorers: list[Union[Callable, Scorer, Op]] = []
        for scorer in self.scorers or []:

@@ -339,7 +343,12 @@ async def predict_and_score(
                raise ValueError(
                    f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
                )
-            score_args["output"] = model_output
+            if "model_output" in score_arg_names:
+                util.warn_once(
+                    logger, "model_output is deprecated, please use output instead"
+                )
+                self._output_key = "model_output"
Review thread on this line:
- What happens if some scorers use the new "output" and some use the old "model_output"? I don't think this should be stored on the eval; just return …
- Are you saying for all scorers to return …?
- I'm saying that …
- I think it does make sense to keep the output key consistent per eval, though. Otherwise, if you mix and match scorers, should you get both "output" and "model_output"?
- I think it is worth being clear here: all we are talking about is the key for the output of predict_and_score (I think?), which, if you mix and match, is going to flip around given the current implementation. The call stack is … the output of …
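
A minimal sketch of the per-scorer alternative the reviewers describe, assuming a hypothetical helper name (resolve_output_arg) and the signature-inspection approach the surrounding code already uses; nothing would be stored on the Evaluation, and the row returned by predict_and_score could keep a stable "output" key:

import inspect
from typing import Callable


def resolve_output_arg(score_fn: Callable) -> str:
    # Pick the kwarg name this particular scorer expects for the model result:
    # prefer the new-style "output" and fall back to the deprecated
    # "model_output" only when the scorer explicitly declares it.
    params = inspect.signature(score_fn).parameters
    if "output" in params:
        return "output"
    if "model_output" in params:
        return "model_output"
    return "output"


# Usage sketch inside predict_and_score (names as in the diff above):
#     score_args[resolve_output_arg(score_fn)] = model_output
# while the returned row always uses the stable key:
#     return {"output": model_output, "scores": scores, "model_latency": model_latency}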
+            score_args[self._output_key] = model_output

            try:
                if is_op(score_fn) and model_call:

@@ -370,7 +379,7 @@ async def predict_and_score(
                for param in score_signature.parameters.values()
                if param.default == inspect.Parameter.empty
            ]
-            required_arg_names.remove("output")
+            required_arg_names.remove(self._output_key)
Review thread on this line:
- This should be dynamic based on the scorer.
- I think this should be …
            message = textwrap.dedent(
                f"""

@@ -397,7 +406,7 @@ async def predict_and_score(
            scores[scorer_name] = result

        return {
-            "output": model_output,
+            self._output_key: model_output,
Review comment on this line: this should always be "output" even if you have some scorers that use "model_output".
            "scores": scores,
            "model_latency": model_latency,
        }

@@ -421,7 +430,6 @@ async def summarize(self, eval_table: EvaluationResults) -> dict:
            model_output_summary = auto_summarize(vals)
            if model_output_summary:
                summary[name] = model_output_summary
        return summary

    async def get_eval_results(

@@ -441,7 +449,7 @@ async def eval_example(example: dict) -> dict:
            except Exception as e:
                print("Predict and score failed")
                traceback.print_exc()
-                return {"output": None, "scores": {}}
+                return {self._output_key: None, "scores": {}}
Review comment on this line: this should always be "output" even if you have some scorers that use "model_output".
            return eval_row

        n_complete = 0

@@ -458,7 +466,7 @@ async def eval_example(example: dict) -> dict:
                #     f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]"  # type:ignore
                # )
                if eval_row is None:
-                    eval_row = {"output": None, "scores": {}}
+                    eval_row = {self._output_key: None, "scores": {}}
Review comment on this line: this should always be "output" even if you have some scorers that use "model_output".
                else:
                    eval_row["scores"] = eval_row.get("scores", {})
                for scorer in self.scorers or []:
@@ -16,7 +16,7 @@ class Scorer(Object):
        description="A mapping from column names in the dataset to the names expected by the scorer",
    )

-    def score(self, input: Any, target: Any, output: Any) -> Any:
+    def score(self, *, output: Any, **kwargs: Any) -> Any:
Review thread on this line:
- This is more representative of the actual score func signature. Some scorers don't take input, or may take other args.
- Agree, and …
        raise NotImplementedError

    @weave.op()
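
For illustration, a scorer written against the new keyword-only base signature might look like the sketch below. "ExactMatch" is a made-up name; only weave.Scorer, weave.op, and the "output"/"target" argument names are taken from this diff and the tests above.

import weave


class ExactMatch(weave.Scorer):
    # Hypothetical subclass: overrides the new-style base method
    # Scorer.score(self, *, output, **kwargs) with just the columns it needs.
    @weave.op()
    def score(self, output: str, target: str) -> dict:
        # "output" is the model result; "target" comes from the dataset row.
        return {"correct": output == target}


# Usage sketch: Evaluation(dataset=rows, scorers=[ExactMatch()])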
Review comment: This test is moved to test_evaluate_oldstyle.