wandb · andrewtruong · Dec 9, 2024 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
@@ -7,7 +7,7 @@
 from PIL import Image
 
 import weave
-from tests.trace.util import AnyIntMatcher
+from tests.trace.util import AnyIntMatcher, AnyStrMatcher
 from weave import Evaluation, Model
 from weave.scorers import Scorer
 from weave.trace.refs import CallRef
@@ -504,8 +504,8 @@ async def test_evaluation_data_topology(client):
             }
         },
         "weave": {
+            "display_name": AnyStrMatcher(),
             "latency_ms": AnyIntMatcher(),
-            "trace_name": "Evaluation.evaluate",
             "status": "success",
         },
     }
@@ -1029,3 +1029,21 @@ def my_second_scorer(text, output, model_output):
 
     with pytest.raises(ValueError, match="Both 'output' and 'model_output'"):
         evaluation = weave.Evaluation(dataset=ds, scorers=[my_second_scorer])
+
+
+@pytest.mark.asyncio
+async def test_evaluation_with_custom_name(client):
+    """An `evaluation_name` given to `Evaluation` is the root call's display name."""
+    custom_name = "wow-custom!"
+    rows = [{"input": "hi", "output": "hello"}]
+    evaluation = weave.Evaluation(
+        dataset=weave.Dataset(rows=rows),
+        evaluation_name=custom_name,
+    )
+
+    @weave.op()
+    def model(input: str) -> str:
+        return "hmmm"
+
+    await evaluation.evaluate(model)
+
+    # Fetch only trace roots; exactly one is expected after a single evaluation.
+    roots_only = tsi.CallsFilter(trace_roots_only=True)
+    root_calls = list(client.get_calls(filter=roots_only))
+    assert len(root_calls) == 1
+
+    assert root_calls[0].display_name == custom_name
@@ -8,6 +8,13 @@ def client_is_sqlite(client):
     return isinstance(client.server._internal_trace_server, SqliteTraceServer)
 
 
+class AnyStrMatcher:
+    """Matches any string.
+
+    An instance compares equal to every `str` and unequal to any other value,
+    so it can stand in for a string whose exact content is not known up front.
+    """
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, str)
+
+    def __repr__(self) -> str:
+        # Show a readable placeholder in assertion failure output instead of
+        # the default `<... object at 0x...>`.
+        return "AnyStrMatcher()"
+
+
 class AnyIntMatcher:
     """Matches any integer."""
 

@@ -5,9 +5,10 @@
 import time
 import traceback
 from collections.abc import Coroutine
+from datetime import datetime
 from typing import Any, Callable, Literal, Optional, Union, cast
 
-from pydantic import PrivateAttr
+from pydantic import PrivateAttr, model_validator
 from rich import print
 from rich.console import Console
 
@@ -16,6 +17,7 @@
 from weave.flow.dataset import Dataset
 from weave.flow.model import Model, get_infer_method
 from weave.flow.obj import Object
+from weave.flow.util import make_memorable_name
 from weave.scorers import (
     Scorer,
     _has_oldstyle_scorers,
@@ -28,7 +30,7 @@
 from weave.trace.env import get_weave_parallelism
 from weave.trace.errors import OpCallError
 from weave.trace.isinstance import weave_isinstance
-from weave.trace.op import Op, as_op, is_op
+from weave.trace.op import CallDisplayNameFunc, Op, as_op, is_op
 from weave.trace.vals import WeaveObject
 from weave.trace.weave_client import Call, get_ref
 
@@ -41,6 +43,12 @@
 )
 
 
+def default_evaluation_display_name(call: Call) -> str:
+    """Build the default display name for an `Evaluation.evaluate` call.
+
+    The name has the form `eval-<YYYY-MM-DD>-<memorable name>`, combining the
+    date reported by `datetime.now()` with a value from `make_memorable_name()`.
+
+    Args:
+        call: The call being named.  It is not read here; the parameter exists
+            because this function is passed to `weave.op` as `call_display_name`.
+
+    Returns:
+        The generated display name.
+    """
+    today = datetime.now()
+    return f"eval-{today:%Y-%m-%d}-{make_memorable_name()}"
+
+
 def async_call(func: Union[Callable, Op], *args: Any, **kwargs: Any) -> Coroutine:
     is_async = False
     if is_op(func):
@@ -116,9 +124,21 @@ def function_to_evaluate(question: str):
     preprocess_model_input: Optional[Callable] = None
     trials: int = 1
 
+    # Custom evaluation name for display in the UI.  This is the same API as passing a
+    # custom `call_display_name` to `weave.op` (see that for more details).
+    evaluation_name: Optional[Union[str, CallDisplayNameFunc]] = None
+
     # internal attr to track whether to use the new `output` or old `model_output` key for outputs
     _output_key: Literal["output", "model_output"] = PrivateAttr("output")
 
+    @model_validator(mode="after")
+    def _update_display_name(self) -> "Evaluation":
+        """Apply a user-specified `evaluation_name` to the `evaluate` op.
+
+        Runs as a pydantic "after" validator.  A falsy `evaluation_name` leaves
+        everything untouched; otherwise the value is stored as the
+        `call_display_name` of the op returned by `as_op(self.evaluate)`.
+
+        Returns:
+            This instance.
+        """
+        custom_name = self.evaluation_name
+        if not custom_name:
+            return self
+
+        # Treat the user-specified name as the name for `Evaluation.evaluate`.
+        # NOTE(review): `as_op` unbinds methods, so this presumably mutates an op
+        # object shared by every `Evaluation` instance -- confirm that a custom
+        # name cannot leak into evaluations created without one.
+        as_op(self.evaluate).call_display_name = custom_name
+        return self
+
     def model_post_init(self, __context: Any) -> None:
         scorers: list[Union[Callable, Scorer, Op]] = []
         for scorer in self.scorers or []:
@@ -486,7 +506,7 @@ async def eval_example(example: dict) -> dict:
             eval_rows.append(eval_row)
         return EvaluationResults(rows=weave.Table(eval_rows))
 
-    @weave.op()
+    @weave.op(call_display_name=default_evaluation_display_name)
     async def evaluate(self, model: Union[Callable, Model]) -> dict:
         # The need for this pattern is quite unfortunate and highlights a gap in our
         # data model. As a user, I just want to pass a list of data `eval_rows` to

@@ -1,6 +1,7 @@
 import asyncio
 import logging
 import multiprocessing
+import random
 from collections.abc import AsyncIterator, Awaitable, Iterable
 from typing import Any, Callable, TypeVar
 
@@ -81,3 +82,89 @@ def warn_once(logger: logging.Logger, message: str) -> None:
     if message not in _shown_warnings:
         logger.warning(message)
         _shown_warnings.add(message)
+
+
+def make_memorable_name() -> str:
+    """Return a random, human-friendly name of the form `<adjective>-<noun>`.
+
+    Each word is picked independently with `random.choice` from a fixed pool,
+    so the same name can be returned more than once.
+    """
+    adjective_pool = (
+        "jubilant",
+        "eager",
+        "calm",
+        "bright",
+        "clever",
+        "dazzling",
+        "elegant",
+        "fierce",
+        "gentle",
+        "happy",
+        "innocent",
+        "kind",
+        "lively",
+        "merry",
+        "nice",
+        "proud",
+        "quiet",
+        "rich",
+        "sweet",
+        "tender",
+        "unique",
+        "wise",
+        "zealous",
+        "brave",
+        "charming",
+        "daring",
+        "eloquent",
+        "friendly",
+        "graceful",
+        "honest",
+        "imaginative",
+        "joyful",
+        "keen",
+        "loyal",
+        "noble",
+        "optimistic",
+    )
+    noun_pool = (
+        "sun",
+        "moon",
+        "star",
+        "cloud",
+        "rain",
+        "wind",
+        "tree",
+        "flower",
+        "river",
+        "mountain",
+        "ocean",
+        "forest",
+        "meadow",
+        "bird",
+        "wolf",
+        "bear",
+        "tiger",
+        "lion",
+        "eagle",
+        "fish",
+        "whale",
+        "dolphin",
+        "rose",
+        "daisy",
+        "oak",
+        "pine",
+        "maple",
+        "cedar",
+        "valley",
+        "hill",
+        "lake",
+        "stream",
+        "breeze",
+        "dawn",
+        "dusk",
+        "horizon",
+        "island",
+        "plateau",
+    )
+
+    # The adjective is drawn first, then the noun.
+    picked = (random.choice(adjective_pool), random.choice(noun_pool))
+    return "-".join(picked)
@@ -107,16 +107,15 @@ def _apply_fn_defaults_to_inputs(
 ) -> dict[str, Any]:
     inputs = {**inputs}
     sig = inspect.signature(fn)
-    for param_name, param in sig.parameters.items():
-        if param_name not in inputs:
-            if param.default != inspect.Parameter.empty and not _value_is_sentinel(
-                param
-            ):
-                inputs[param_name] = param.default
-            if param.kind == inspect.Parameter.VAR_POSITIONAL:
-                inputs[param_name] = ()
-            elif param.kind == inspect.Parameter.VAR_KEYWORD:
-                inputs[param_name] = {}
+    for name, param in sig.parameters.items():
+        if name in inputs:
+            continue
+        if param.default != inspect.Parameter.empty and not _value_is_sentinel(param):
+            inputs[name] = param.default
+        if param.kind == inspect.Parameter.VAR_POSITIONAL:
+            inputs[name] = ()
+        if param.kind == inspect.Parameter.VAR_KEYWORD:
+            inputs[name] = {}
     return inputs
 
 
@@ -216,6 +215,7 @@ def _default_on_input_handler(func: Op, args: tuple, kwargs: dict) -> ProcessedI
         inputs = sig.bind(*args, **kwargs).arguments
     except TypeError as e:
         raise OpCallError(f"Error calling {func.name}: {e}")
+
     inputs_with_defaults = _apply_fn_defaults_to_inputs(func, inputs)
     return ProcessedInputs(
         original_args=args,
@@ -736,7 +736,9 @@ def as_op(fn: Callable) -> Op:
     if not is_op(fn):
         raise ValueError("fn must be a weave.op() decorated function")
 
-    return cast(Op, fn)
+    # The unbinding is necessary for methods because `MethodType` is applied after the
+    # func is decorated into an Op.
+    return maybe_unbind_method(cast(Op, fn))
 
 
 __docspec__ = [call, calls]