
Commit

add column_map warnings, fix docs, make create and embed available
morganmcg1 committed Oct 17, 2024
1 parent 873b90c commit a847395
Showing 3 changed files with 91 additions and 28 deletions.
48 changes: 34 additions & 14 deletions docs/docs/guides/evaluation/scorers.md
@@ -36,29 +36,32 @@ Example:

```python
+import weave
from weave import Scorer
-from weave.scorers import create

+llm_client = ...

class SummarizationScorer(Scorer):
    model_id: str = "the LLM model to use"
    system_prompt: str = "Evaluate whether the summary is good."

    @weave.op
-    def some_complicated_preprocessing(text):
+    def some_complicated_preprocessing(self, text: str) -> str:
        ...
        return text

    @weave.op
-    def llm_call(summary, text):
-        res = create(self.system_prompt, summary, text)
+    def call_llm(self, summary: str, text: str) -> dict:
+        res = llm_client.create(self.system_prompt, summary, text)
        return {"summary_quality": res}

    @weave.op
-    def score(output, text)
-        '''
-        output: The summary generated by an AI system
-        text: The original text being summarised
-        '''
+    def score(self, output: str, text: str) -> dict:
+        """
+        output: The summary generated by an AI system
+        text: The original text being summarised
+        """
        text = self.some_complicated_preprocessing(text)
-        eval_result = call_llm(summary, text, self.prompt)
+        eval_result = self.call_llm(output, text)
        return {"summary_quality": eval_result}

summarization_scorer = SummarizationScorer(model_id="o2")
@@ -67,14 +70,23 @@
eval = weave.Evaluation(..., scorers=[summarization_scorer])
```

This class evaluates how good a summary is by comparing it to the original text.

## How Scorers Work
-### Keyword Arguments
-Scorers can access both the output from your AI system and the input data.
+### Scorer Keyword Arguments
+Scorers can access both the output from your AI system and the input data from the dataset row.

- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output.
-- **Input:** Add parameters that match the names of the columns in your dataset to access input data.
+- **Input:** If you would like your scorer to use data from your dataset row, such as a "label" or "target" column, you can make this available to the scorer by adding a `label` or `target` keyword argument to your scorer definition.
-For example, if your dataset has a "news_article" column, you can access it in the scorer by adding a `news_article` parameter to your scorer's signature.
+
+For example, if you wanted to use a column called "label" from your dataset, your scorer function (or `score` class method) would have a parameter list like this:
+
+```python
+@weave.op
+def my_custom_scorer(output: str, label: int):
+    ...
+```
+
+When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer parameter names to your dataset columns. If customizing your scorer parameters or dataset columns is not feasible, you can use column mapping - see below for more.
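To make the automatic matching concrete, here is a minimal end-to-end sketch (the model and dataset are hypothetical, and a prior `weave.init(...)` call is assumed):

```python
import asyncio
import weave

@weave.op
def sentiment_model(input_text: str) -> str:
    # The "input_text" dataset column is auto-matched to this parameter.
    return "positive"

dataset = [{"input_text": "great movie", "label": 1}]

# `output` receives the model's return value; `label` auto-matches the column.
evaluation = weave.Evaluation(dataset=dataset, scorers=[my_custom_scorer])
# asyncio.run(evaluation.evaluate(sentiment_model))  # assumes weave.init(...) was called
```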

### Mapping Column Names
Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`.
@@ -111,6 +123,14 @@
Here, the `text` parameter in the score method will receive data from the `news_article` dataset column.
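To illustrate the mapping direction, a short sketch (the scorer class and column names are hypothetical; `column_map` keys are scorer argument names and values are dataset column names):

```python
import weave
from weave import Scorer

class SummaryScorer(Scorer):  # hypothetical scorer
    @weave.op
    def score(self, output: str, text: str) -> dict:
        return {"output_is_shorter": len(output) < len(text)}

# The "news_article" dataset column feeds the scorer's `text` parameter.
scorer = SummaryScorer(column_map={"text": "news_article"})
```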
## Predefined Scorers
**Installation**
To use Weave's predefined scorers you need to install some additional dependencies:
```bash
pip install weave[scorers]
```
**LLM-evaluators**
The predefined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use weave's `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them.
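As a sketch of what that dependency implies (assumed wiring; how the client is passed into a given scorer may differ):

```python
# Assumption: LLM-backed scorers consume an instructor-wrapped client built
# from one of the supported SDKs. OpenAI is shown here as an example.
import instructor
from openai import OpenAI

llm_client = instructor.from_openai(OpenAI())
```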
65 changes: 51 additions & 14 deletions weave/flow/eval.py
@@ -223,7 +223,7 @@ async def predict_and_score(
        model_output = None
        model_latency = time.time() - model_start_time

-        scores = {}
+        scores = {}  # TODO: Consider moving scorer setup and checks out of `predict_and_score`
        scorers = cast(list[Union[Op, Scorer]], self.scorers or [])
        for scorer in scorers:
            scorer_self = None
@@ -245,14 +245,8 @@
            ):
                message = textwrap.dedent(
                    f"""
-                    Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function.
-                    You can also set the `scorer.column_map` attribute to map dataset columns to the expected parameter names in the scorer.
-                    For example, if the scorer expects "input" and "ground_truth" and we have a dataset
-                    with columns "question" and "answer", column_map should be defined as follows:
-                    {{"input": "question", "ground_truth": "answer"}}
-                    scorer.column_map: {getattr(scorer, 'column_map', None)}
-                    score_arg_names: {score_arg_names}
-                    example: {example}
+                    Scorer {scorer_name} must have an `output` or `model_output` argument, to receive the
+                    output of the model function.
                    """
                )
                raise OpCallError(message)
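To illustrate what this check catches, a minimal (hypothetical) scorer that would trigger the `OpCallError`:

```python
import weave

@weave.op
def summary_scorer(summary: str, text: str) -> dict:
    # Neither `output` nor `model_output` appears in the signature,
    # so predict_and_score raises OpCallError before scoring.
    return {"contains_summary": summary in text}
```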
@@ -272,11 +266,43 @@
            # input: is the full row, we have access to it via example
            # output: is the model output, we have access to it via model_output
            if isinstance(scorer, Scorer) and scorer.column_map is not None:
-                score_args = {
-                    arg: example[scorer.column_map.get(arg, arg)]
-                    for arg in score_arg_names
-                    if scorer.column_map.get(arg, arg) in example
-                }
+                score_args = {}
+                for arg in score_arg_names:
+                    # Check column_map validity: every scorer argument must
+                    # appear as a key in column_map
+                    if arg not in scorer.column_map:
+                        message = textwrap.dedent(
+                            f"""
+                            Scorer {scorer_name} expects argument {arg} to be in `scorer.column_map` keys.
+                            Available scorer keyword argument names: {score_arg_names}
+                            scorer.column_map keys: {scorer.column_map.keys()}
+                            Hint:
+                            - column_map should follow the format: {{scorer arg name: dataset column name}}
+                            - Check that your scorer.column_map keys and values are not reversed.
+                            """
+                        )
+                        raise ValueError(message)
+
+                    # Try to map the scorer argument to a dataset column
+                    example_column_name = scorer.column_map.get(arg)
+                    if example_column_name in example:
+                        score_args[arg] = example[example_column_name]
+                    else:
+                        message = textwrap.dedent(
+                            f"""
+                            There is an issue with `scorer.column_map`: {scorer.column_map}.
+                            The value for column_map key {arg} is {example_column_name}, but
+                            {example_column_name} is not found in the dataset columns.
+                            Available dataset columns: {example.keys()}
+                            Hint:
+                            - column_map should follow the format: {{scorer arg name: dataset column name}}
+                            """
+                        )
+                        raise ValueError(message)
            else:
                score_args = {
                    k: v for k, v in example.items() if k in score_arg_names
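As a usage-level sketch of the new contract (hypothetical scorer and dataset names, mirroring the error-message example): `column_map` keys must be scorer argument names and values must be dataset column names, so a reversed mapping now fails fast with a `ValueError`:

```python
import weave
from weave import Scorer

class QAScorer(Scorer):  # hypothetical
    @weave.op
    def score(self, output: str, input: str, ground_truth: str) -> dict:
        return {"correct": output == ground_truth}

# Correct orientation: {scorer arg name: dataset column name}
scorer = QAScorer(column_map={"input": "question", "ground_truth": "answer"})

# Reversed orientation, e.g. {"question": "input", "answer": "ground_truth"},
# is now caught by the validation above instead of silently dropping arguments.
```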
Expand Down Expand Up @@ -326,6 +352,17 @@ async def predict_and_score(
f"""
Call error: {e}
If using the `Scorer` weave class, you can set the `scorer.column_map`
attribute to map scorer parameter names to dataset columns.
For example, if the scorer expects "output", "input" and "ground_truth" and we have a dataset
with columns "question" and "answer", `column_map` can be used to map the non-output parameter to like so:
{{"input": "question", "ground_truth": "answer"}}
scorer argument names: {score_arg_names}
dataset keys: {example.keys()}
scorer.column_map: {getattr(scorer, 'column_map', None)}
Options for resolving:
a. change {scorer_name} argument names to match a subset of dataset column names ({dataset_column_names_str})
b. change dataset column names to match expected {scorer_name} argument names: {required_arg_names}
6 changes: 6 additions & 0 deletions weave/flow/scorers/__init__.py
@@ -3,6 +3,10 @@
    auto_summarize,
    get_scorer_attributes,
)
+from weave.flow.scorers.llm_utils import (
+    create,
+    embed,
+)
from weave.flow.scorers.classification_scorer import (
    MultiTaskBinaryClassificationF1,
    transpose,
@@ -29,6 +33,8 @@

__all__ = [
    "auto_summarize",
+    "create",
+    "embed",
    "ContextEntityRecallScorer",
    "ContextRelevancyScorer",
    "EmbeddingSimilarityScorer",
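With these re-exports, the helpers become importable from the package root. A sketch under assumptions - the exact signatures of `create` and `embed` live in `weave/flow/scorers/llm_utils.py` and are not shown in this diff:

```python
# Import-level sketch only: the call signatures of `create` and `embed`
# are assumed to match whatever llm_utils defines.
from weave.flow.scorers import create, embed
```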
