From 7de96324bef807686f6a9f872680fbce25ea794f Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Tue, 3 Dec 2024 14:31:10 -0500 Subject: [PATCH 01/11] test --- weave/flow/eval.py | 10 +++++- weave/flow/util.py | 88 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 5f4a961f9045..a8e12935e06d 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -16,6 +16,7 @@ from weave.flow.dataset import Dataset from weave.flow.model import Model, get_infer_method from weave.flow.obj import Object +from weave.flow.util import make_memorable_name from weave.scorers import ( Scorer, _has_oldstyle_scorers, @@ -115,11 +116,18 @@ def function_to_evaluate(question: str): scorers: Optional[list[Union[Callable, Op, Scorer]]] = None preprocess_model_input: Optional[Callable] = None trials: int = 1 + # evaluation_name: Optional[Union[str, CallDisplayNameFunc]] = None # internal attr to track whether to use the new `output` or old `model_output` key for outputs _output_key: Literal["output", "model_output"] = PrivateAttr("output") def model_post_init(self, __context: Any) -> None: + # if self.evaluation_name: + # base_f = self.evaluate.resolve_fn + # self.__dict__["evaluate"] = weave.op( + # base_f, call_display_name=self.evaluation_name + # ) + scorers: list[Union[Callable, Scorer, Op]] = [] for scorer in self.scorers or []: if isinstance(scorer, Scorer): @@ -486,7 +494,7 @@ async def eval_example(example: dict) -> dict: eval_rows.append(eval_row) return EvaluationResults(rows=weave.Table(eval_rows)) - @weave.op() + @weave.op(call_display_name=make_memorable_name) async def evaluate(self, model: Union[Callable, Model]) -> dict: # The need for this pattern is quite unfortunate and highlights a gap in our # data model. As a user, I just want to pass a list of data `eval_rows` to diff --git a/weave/flow/util.py b/weave/flow/util.py index 4d89e777d883..16115d81dbb8 100644 --- a/weave/flow/util.py +++ b/weave/flow/util.py @@ -4,6 +4,8 @@ from collections.abc import AsyncIterator, Awaitable, Iterable from typing import Any, Callable, TypeVar +from weave.trace.weave_client import Call + T = TypeVar("T") U = TypeVar("U") @@ -81,3 +83,89 @@ def warn_once(logger: logging.Logger, message: str) -> None: if message not in _shown_warnings: logger.warning(message) _shown_warnings.add(message) + + +def make_memorable_name(call: Call) -> str: + adjectives = [ + "jubilant", + "eager", + "calm", + "bright", + "clever", + "dazzling", + "elegant", + "fierce", + "gentle", + "happy", + "innocent", + "kind", + "lively", + "merry", + "nice", + "proud", + "quiet", + "rich", + "sweet", + "tender", + "unique", + "wise", + "zealous", + "brave", + "charming", + "daring", + "eloquent", + "friendly", + "graceful", + "honest", + "imaginative", + "joyful", + "keen", + "loyal", + "noble", + "optimistic", + ] + + nouns = [ + "sun", + "moon", + "star", + "cloud", + "rain", + "wind", + "tree", + "flower", + "river", + "mountain", + "ocean", + "forest", + "meadow", + "bird", + "wolf", + "bear", + "tiger", + "lion", + "eagle", + "fish", + "whale", + "dolphin", + "rose", + "daisy", + "oak", + "pine", + "maple", + "cedar", + "valley", + "hill", + "lake", + "stream", + "breeze", + "dawn", + "dusk", + "horizon", + "island", + "plateau", + ] + + adj = random.choice(adjectives) + noun = random.choice(nouns) + return f"{adj}-{noun}" From 3932621541c7b2485446d2b8c5521c91d8c237ce Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Tue, 3 Dec 2024 14:36:54 -0500 Subject: [PATCH 02/11] test --- weave/flow/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weave/flow/util.py b/weave/flow/util.py index 16115d81dbb8..e528af1ad26d 100644 --- a/weave/flow/util.py +++ b/weave/flow/util.py @@ -1,6 +1,7 @@ import asyncio import logging import multiprocessing +import random from collections.abc import AsyncIterator, Awaitable, Iterable from typing import Any, Callable, TypeVar From c46a7cffbbbc701a4c65f31309b4b798a3c1094b Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Tue, 3 Dec 2024 15:02:40 -0500 Subject: [PATCH 03/11] test --- tests/trace/test_evaluations.py | 18 ++++++++++++++++++ weave/flow/eval.py | 21 ++++++++++++++------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index f4993e9227dc..047df235c641 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -1029,3 +1029,21 @@ def my_second_scorer(text, output, model_output): with pytest.raises(ValueError, match="Both 'output' and 'model_output'"): evaluation = weave.Evaluation(dataset=ds, scorers=[my_second_scorer]) + + +@pytest.mark.asyncio +async def test_evaluation_with_custom_name(client): + dataset = weave.Dataset(rows=[{"input": "hi", "output": "hello"}]) + evaluation = weave.Evaluation(dataset=dataset, evaluation_name="wow-custom!") + + @weave.op() + def model(input: str) -> str: + return "hmmm" + + await evaluation.evaluate(model) + + calls = list(client.get_calls(filter=tsi.CallsFilter(trace_roots_only=True))) + assert len(calls) == 1 + + call = calls[0] + assert call.display_name == "wow-custom!" diff --git a/weave/flow/eval.py b/weave/flow/eval.py index a8e12935e06d..1fec9db3c713 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -5,6 +5,7 @@ import time import traceback from collections.abc import Coroutine +from types import MethodType from typing import Any, Callable, Literal, Optional, Union, cast from pydantic import PrivateAttr @@ -29,7 +30,7 @@ from weave.trace.env import get_weave_parallelism from weave.trace.errors import OpCallError from weave.trace.isinstance import weave_isinstance -from weave.trace.op import Op, as_op, is_op +from weave.trace.op import CallDisplayNameFunc, Op, as_op, is_op from weave.trace.vals import WeaveObject from weave.trace.weave_client import Call, get_ref @@ -116,17 +117,23 @@ def function_to_evaluate(question: str): scorers: Optional[list[Union[Callable, Op, Scorer]]] = None preprocess_model_input: Optional[Callable] = None trials: int = 1 - # evaluation_name: Optional[Union[str, CallDisplayNameFunc]] = None + + # Custom evaluation name for display in the UI. This is the same API as passing a + # custom `call_display_name` to `weave.op` (see that for more details). + evaluation_name: Optional[Union[str, CallDisplayNameFunc]] = None # internal attr to track whether to use the new `output` or old `model_output` key for outputs _output_key: Literal["output", "model_output"] = PrivateAttr("output") def model_post_init(self, __context: Any) -> None: - # if self.evaluation_name: - # base_f = self.evaluate.resolve_fn - # self.__dict__["evaluate"] = weave.op( - # base_f, call_display_name=self.evaluation_name - # ) + if self.evaluation_name: + original_op = self.evaluate + new_op = weave.op( + original_op.resolve_fn, + call_display_name=self.evaluation_name, + ) + bound_method = MethodType(new_op, self) + self.__dict__["evaluate"] = bound_method scorers: list[Union[Callable, Scorer, Op]] = [] for scorer in self.scorers or []: From 0277d4893b6ed34a77587684978789b6d39adc80 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Tue, 3 Dec 2024 15:09:36 -0500 Subject: [PATCH 04/11] test --- tests/trace/test_evaluations.py | 4 ++-- tests/trace/util.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 047df235c641..e5c38ef0140e 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -7,7 +7,7 @@ from PIL import Image import weave -from tests.trace.util import AnyIntMatcher +from tests.trace.util import AnyIntMatcher, AnyStrMatcher from weave import Evaluation, Model from weave.scorers import Scorer from weave.trace.refs import CallRef @@ -504,8 +504,8 @@ async def test_evaluation_data_topology(client): } }, "weave": { + "display_name": AnyStrMatcher(), "latency_ms": AnyIntMatcher(), - "trace_name": "Evaluation.evaluate", "status": "success", }, } diff --git a/tests/trace/util.py b/tests/trace/util.py index eb4c6002beb5..beb651722e36 100644 --- a/tests/trace/util.py +++ b/tests/trace/util.py @@ -8,6 +8,13 @@ def client_is_sqlite(client): return isinstance(client.server._internal_trace_server, SqliteTraceServer) +class AnyStrMatcher: + """Matches any string.""" + + def __eq__(self, other): + return isinstance(other, str) + + class AnyIntMatcher: """Matches any integer.""" From 118b14d59e7136b3fdc5ddaed5132404a4debd09 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 5 Dec 2024 00:22:39 -0500 Subject: [PATCH 05/11] test --- weave/flow/eval.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 1fec9db3c713..d3645f1ab025 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -5,7 +5,6 @@ import time import traceback from collections.abc import Coroutine -from types import MethodType from typing import Any, Callable, Literal, Optional, Union, cast from pydantic import PrivateAttr @@ -127,13 +126,7 @@ def function_to_evaluate(question: str): def model_post_init(self, __context: Any) -> None: if self.evaluation_name: - original_op = self.evaluate - new_op = weave.op( - original_op.resolve_fn, - call_display_name=self.evaluation_name, - ) - bound_method = MethodType(new_op, self) - self.__dict__["evaluate"] = bound_method + self.evaluate.call_display_name = self.evaluation_name scorers: list[Union[Callable, Scorer, Op]] = [] for scorer in self.scorers or []: From b546a4f0ea57f2a2a8d39665124afd957dd2cecf Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 5 Dec 2024 00:28:01 -0500 Subject: [PATCH 06/11] test --- weave/flow/eval.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index d3645f1ab025..ee5578610687 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -7,7 +7,7 @@ from collections.abc import Coroutine from typing import Any, Callable, Literal, Optional, Union, cast -from pydantic import PrivateAttr +from pydantic import PrivateAttr, model_validator from rich import print from rich.console import Console @@ -124,10 +124,13 @@ def function_to_evaluate(question: str): # internal attr to track whether to use the new `output` or old `model_output` key for outputs _output_key: Literal["output", "model_output"] = PrivateAttr("output") - def model_post_init(self, __context: Any) -> None: + @model_validator(mode="after") + def _udpate_display_name(self) -> "Evaluation": if self.evaluation_name: self.evaluate.call_display_name = self.evaluation_name + return self + def model_post_init(self, __context: Any) -> None: scorers: list[Union[Callable, Scorer, Op]] = [] for scorer in self.scorers or []: if isinstance(scorer, Scorer): From 6156d258a9ac12c071dce9e92108e6ca7101aeb2 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 5 Dec 2024 00:28:55 -0500 Subject: [PATCH 07/11] test --- weave/flow/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index ee5578610687..6575c5026acd 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -125,7 +125,8 @@ def function_to_evaluate(question: str): _output_key: Literal["output", "model_output"] = PrivateAttr("output") @model_validator(mode="after") - def _udpate_display_name(self) -> "Evaluation": + def _update_display_name(self) -> "Evaluation": + # Keep the evaluate op's `call_display_name` in sync with `evaluation_name` if self.evaluation_name: self.evaluate.call_display_name = self.evaluation_name return self From b3df39cd1342078af7eda986bb4f5d4c3a322451 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 5 Dec 2024 02:23:43 -0500 Subject: [PATCH 08/11] test --- weave/flow/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 6575c5026acd..57f7166ddb5a 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -128,7 +128,8 @@ def function_to_evaluate(question: str): def _update_display_name(self) -> "Evaluation": # Keep the evaluate op's `call_display_name` in sync with `evaluation_name` if self.evaluation_name: - self.evaluate.call_display_name = self.evaluation_name + eval_op = cast(Op, self.evaluate) + eval_op.call_display_name = self.evaluation_name return self def model_post_init(self, __context: Any) -> None: From 32b5bcf6337657b4f35a06f8fc2f809d542b1097 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 5 Dec 2024 13:58:01 -0500 Subject: [PATCH 09/11] test --- weave/flow/eval.py | 9 ++++++++- weave/flow/util.py | 4 +--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 57f7166ddb5a..9042d91d9de0 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -1,4 +1,5 @@ import asyncio +import datetime import inspect import logging import textwrap @@ -42,6 +43,12 @@ ) +def default_evaluation_display_name(call: Call) -> str: + date = datetime.now().strftime("%Y-%m-%d") + unique_name = make_memorable_name() + return f"eval-{date}-{unique_name}" + + def async_call(func: Union[Callable, Op], *args: Any, **kwargs: Any) -> Coroutine: is_async = False if is_op(func): @@ -499,7 +506,7 @@ async def eval_example(example: dict) -> dict: eval_rows.append(eval_row) return EvaluationResults(rows=weave.Table(eval_rows)) - @weave.op(call_display_name=make_memorable_name) + @weave.op(call_display_name=default_evaluation_display_name) async def evaluate(self, model: Union[Callable, Model]) -> dict: # The need for this pattern is quite unfortunate and highlights a gap in our # data model. As a user, I just want to pass a list of data `eval_rows` to diff --git a/weave/flow/util.py b/weave/flow/util.py index e528af1ad26d..b41d651b6cf0 100644 --- a/weave/flow/util.py +++ b/weave/flow/util.py @@ -5,8 +5,6 @@ from collections.abc import AsyncIterator, Awaitable, Iterable from typing import Any, Callable, TypeVar -from weave.trace.weave_client import Call - T = TypeVar("T") U = TypeVar("U") @@ -86,7 +84,7 @@ def warn_once(logger: logging.Logger, message: str) -> None: _shown_warnings.add(message) -def make_memorable_name(call: Call) -> str: +def make_memorable_name() -> str: adjectives = [ "jubilant", "eager", From acf33e24e08da48c3d874c2bd46e4bc09d73cefe Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 5 Dec 2024 14:30:15 -0500 Subject: [PATCH 10/11] test --- weave/flow/eval.py | 6 +++--- weave/trace/op.py | 52 ++++++++++++++++++---------------------------- 2 files changed, 23 insertions(+), 35 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 9042d91d9de0..bf78dc06d85d 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -1,11 +1,11 @@ import asyncio -import datetime import inspect import logging import textwrap import time import traceback from collections.abc import Coroutine +from datetime import datetime from typing import Any, Callable, Literal, Optional, Union, cast from pydantic import PrivateAttr, model_validator @@ -133,9 +133,9 @@ def function_to_evaluate(question: str): @model_validator(mode="after") def _update_display_name(self) -> "Evaluation": - # Keep the evaluate op's `call_display_name` in sync with `evaluation_name` if self.evaluation_name: - eval_op = cast(Op, self.evaluate) + # Treat user-specified `evaluation_name` as the name for `Evaluation.evaluate` + eval_op = as_op(self.evaluate) eval_op.call_display_name = self.evaluation_name return self diff --git a/weave/trace/op.py b/weave/trace/op.py index 45147789dee5..2b5835474d88 100644 --- a/weave/trace/op.py +++ b/weave/trace/op.py @@ -34,8 +34,6 @@ logger = logging.getLogger(__name__) -WEAVE_KWARGS_KEY = "__weave" - if TYPE_CHECKING: from weave.trace.weave_client import Call, CallsIter @@ -54,17 +52,6 @@ except ImportError: ANTHROPIC_NOT_GIVEN = None -try: - # https://github.com/search?q=repo:mistralai/client-python%20Final&type=code - from mistralai.types.basemodel import UNSET # type: ignore - - MISTRAL_NOT_GIVEN = UNSET # type: ignore -except ImportError: - MISTRAL_NOT_GIVEN = None - -MISTRAL_NOT_GIVEN = None - - try: from cerebras.cloud.sdk._types import NOT_GIVEN as CEREBRAS_NOT_GIVEN except ImportError: @@ -105,14 +92,13 @@ class ProcessedInputs: def _value_is_sentinel(param: Any) -> bool: - return ( - param.default is None - or param.default is OPENAI_NOT_GIVEN - or param.default is COHERE_NOT_GIVEN - or param.default is ANTHROPIC_NOT_GIVEN - or param.default is MISTRAL_NOT_GIVEN - or param.default is CEREBRAS_NOT_GIVEN - or param.default is Ellipsis + return param.default in ( + None, + Ellipsis, + OPENAI_NOT_GIVEN, + COHERE_NOT_GIVEN, + ANTHROPIC_NOT_GIVEN, + CEREBRAS_NOT_GIVEN, ) @@ -121,16 +107,15 @@ def _apply_fn_defaults_to_inputs( ) -> dict[str, Any]: inputs = {**inputs} sig = inspect.signature(fn) - for param_name, param in sig.parameters.items(): - if param_name not in inputs: - if param.default != inspect.Parameter.empty and not _value_is_sentinel( - param - ): - inputs[param_name] = param.default - if param.kind == inspect.Parameter.VAR_POSITIONAL: - inputs[param_name] = () - elif param.kind == inspect.Parameter.VAR_KEYWORD: - inputs[param_name] = {} + for name, param in sig.parameters.items(): + if name in inputs: + continue + if param.default != inspect.Parameter.empty and not _value_is_sentinel(param): + inputs[name] = param.default + if param.kind == inspect.Parameter.VAR_POSITIONAL: + inputs[name] = () + if param.kind == inspect.Parameter.VAR_KEYWORD: + inputs[name] = {} return inputs @@ -230,6 +215,7 @@ def _default_on_input_handler(func: Op, args: tuple, kwargs: dict) -> ProcessedI inputs = sig.bind(*args, **kwargs).arguments except TypeError as e: raise OpCallError(f"Error calling {func.name}: {e}") + inputs_with_defaults = _apply_fn_defaults_to_inputs(func, inputs) return ProcessedInputs( original_args=args, @@ -750,7 +736,9 @@ def as_op(fn: Callable) -> Op: if not is_op(fn): raise ValueError("fn must be a weave.op() decorated function") - return cast(Op, fn) + # The unbinding is necessary for methods because `MethodType` is applied after the + # func is decorated into an Op. + return maybe_unbind_method(cast(Op, fn)) __docspec__ = [call, calls] From 5b571685d87172e03c8ebbc8ba2ee585a506e7b1 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Sun, 8 Dec 2024 21:26:44 -0500 Subject: [PATCH 11/11] test --- weave/flow/util.py | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/weave/flow/util.py b/weave/flow/util.py index b41d651b6cf0..ba35d5ebe4ab 100644 --- a/weave/flow/util.py +++ b/weave/flow/util.py @@ -86,21 +86,34 @@ def warn_once(logger: logging.Logger, message: str) -> None: def make_memorable_name() -> str: adjectives = [ - "jubilant", - "eager", - "calm", + "brave", "bright", + "calm", + "charming", "clever", + "daring", "dazzling", + "eager", "elegant", + "eloquent", "fierce", + "friendly", "gentle", + "graceful", "happy", + "honest", + "imaginative", "innocent", + "joyful", + "jubilant", + "keen", "kind", "lively", + "loyal", "merry", "nice", + "noble", + "optimistic", "proud", "quiet", "rich", @@ -109,60 +122,47 @@ def make_memorable_name() -> str: "unique", "wise", "zealous", - "brave", - "charming", - "daring", - "eloquent", - "friendly", - "graceful", - "honest", - "imaginative", - "joyful", - "keen", - "loyal", - "noble", - "optimistic", ] nouns = [ - "sun", - "moon", - "star", - "cloud", - "rain", - "wind", - "tree", - "flower", - "river", - "mountain", - "ocean", - "forest", - "meadow", - "bird", - "wolf", "bear", - "tiger", - "lion", - "eagle", - "fish", - "whale", - "dolphin", - "rose", - "daisy", - "oak", - "pine", - "maple", - "cedar", - "valley", - "hill", - "lake", - "stream", + "bird", "breeze", + "cedar", + "cloud", + "daisy", "dawn", + "dolphin", "dusk", + "eagle", + "fish", + "flower", + "forest", + "hill", "horizon", "island", + "lake", + "lion", + "maple", + "meadow", + "moon", + "mountain", + "oak", + "ocean", + "pine", "plateau", + "rain", + "river", + "rose", + "star", + "stream", + "sun", + "tiger", + "tree", + "valley", + "whale", + "wind", + "wolf", ] adj = random.choice(adjectives)