
Commit

add column_map warnings, fix docs, make create and embed available
morganmcg1 committed Oct 17, 2024
1 parent 873b90c commit a847395
Showing 3 changed files with 91 additions and 28 deletions.
48 changes: 34 additions & 14 deletions docs/docs/guides/evaluation/scorers.md
@@ -36,29 +36,32 @@ Example:

```python
+import weave
from weave import Scorer
-from weave.scorers import create

+llm_client = ...

class SummarizationScorer(Scorer):
    model_id: str = "the LLM model to use"
    system_prompt: str = "Evaluate whether the summary is good."

    @weave.op
-    def some_complicated_preprocessing(text):
+    def some_complicated_preprocessing(self, text: str) -> str:
        ...
        return text

    @weave.op
-    def llm_call(summary, text):
-        res = create(self.system_prompt, summary, text)
+    def call_llm(self, summary: str, text: str) -> dict:
+        res = llm_client.create(self.system_prompt, summary, text)
        return {"summary_quality": res}

    @weave.op
-    def score(output, text)
-        '''
-        output: The summary generated by an AI system
-        text: The original text being summarised
-        '''
+    def score(self, output: str, text: str) -> dict:
+        """
+        output: The summary generated by an AI system
+        text: The original text being summarised
+        """
        text = self.some_complicated_preprocessing(text)
-        eval_result = call_llm(summary, text, self.prompt)
+        eval_result = self.call_llm(output, text)
        return {"summary_quality": eval_result}

summarization_scorer = SummarizationScorer(model_id="o2")
@@ -67,14 +70,23 @@
eval = weave.Evaluation(..., scorers=[summarization_scorer])
```

This class evaluates how good a summary is by comparing it to the original text.

## How Scorers Work
-### Keyword Arguments
-Scorers can access both the output from your AI system and the input data.
+### Scorer Keyword Arguments
+Scorers can access both the output from your AI system and the input data from the dataset row.

- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output.
-- **Input:** Add parameters that match the names of the columns in your dataset to access input data.
+- **Input:** If you would like your scorer to use data from your dataset row, such as a "label" or "target" column, you can make this available to the scorer by adding a `label` or `target` keyword argument to your scorer definition.
-For example, if your dataset has a "news_article" column, you can access it in the scorer by adding a `news_article` parameter to your scorer's signature.
+
+For example, if you wanted to use a column called "label" from your dataset, your scorer function (or `score` class method) would have a parameter list like this:
+
+```python
+@weave.op
+def my_custom_scorer(output: str, label: int):
+    ...
+```
+
+When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer parameter names to your dataset columns. If customizing your scorer parameters or dataset columns is not feasible, you can use column mapping - see below for more.
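To make the automatic matching concrete, here is a minimal end-to-end sketch (the model and dataset are hypothetical, and a prior `weave.init(...)` call is assumed):

```python
import asyncio
import weave

@weave.op
def sentiment_model(input_text: str) -> str:
    # The "input_text" dataset column is auto-matched to this parameter.
    return "positive"

dataset = [{"input_text": "great movie", "label": 1}]

# `output` receives the model's return value; `label` auto-matches the column.
evaluation = weave.Evaluation(dataset=dataset, scorers=[my_custom_scorer])
# asyncio.run(evaluation.evaluate(sentiment_model))  # assumes weave.init(...) was called
```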

### Mapping Column Names
Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`.
@@ -111,6 +123,14 @@
Here, the `text` parameter in the score method will receive data from the `news_article` dataset column.
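To illustrate the mapping direction, a short sketch (the scorer class and column names are hypothetical; `column_map` keys are scorer argument names and values are dataset column names):

```python
import weave
from weave import Scorer

class SummaryScorer(Scorer):  # hypothetical scorer
    @weave.op
    def score(self, output: str, text: str) -> dict:
        return {"output_is_shorter": len(output) < len(text)}

# The "news_article" dataset column feeds the scorer's `text` parameter.
scorer = SummaryScorer(column_map={"text": "news_article"})
```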
## Predefined Scorers
**Installation**
To use Weave's predefined scorers you need to install some additional dependencies:
```bash
pip install weave[scorers]
```
**LLM-evaluators**
The predefined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use weave's `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them.
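As a sketch of what that dependency implies (assumed wiring; how the client is passed into a given scorer may differ):

```python
# Assumption: LLM-backed scorers consume an instructor-wrapped client built
# from one of the supported SDKs. OpenAI is shown here as an example.
import instructor
from openai import OpenAI

llm_client = instructor.from_openai(OpenAI())
```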
65 changes: 51 additions & 14 deletions weave/flow/eval.py
@@ -223,7 +223,7 @@ async def predict_and_score(
        model_output = None
        model_latency = time.time() - model_start_time

-        scores = {}
+        scores = {}  # TODO: Consider moving scorer setup and checks out of `predict_and_score`
        scorers = cast(list[Union[Op, Scorer]], self.scorers or [])
        for scorer in scorers:
            scorer_self = None
@@ -245,14 +245,8 @@
            ):
                message = textwrap.dedent(
                    f"""
-                    Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function.
-                    You can also set the `scorer.column_map` attribute to map dataset columns to the expected parameter names in the scorer.
-                    For example, if the scorer expects "input" and "ground_truth" and we have a dataset
-                    with columns "question" and "answer", column_map should be defined as follows:
-                    {{"input": "question", "ground_truth": "answer"}}
-                    scorer.column_map: {getattr(scorer, 'column_map', None)}
-                    score_arg_names: {score_arg_names}
-                    example: {example}
+                    Scorer {scorer_name} must have an `output` or `model_output` argument, to receive the
+                    output of the model function.
                    """
                )
                raise OpCallError(message)
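To illustrate what this check catches, a minimal (hypothetical) scorer that would trigger the `OpCallError`:

```python
import weave

@weave.op
def summary_scorer(summary: str, text: str) -> dict:
    # Neither `output` nor `model_output` appears in the signature,
    # so predict_and_score raises OpCallError before scoring.
    return {"contains_summary": summary in text}
```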
@@ -272,11 +266,43 @@
            # input: is the full row, we have access to it via example
            # output: is the model output, we have access to it via model_output
            if isinstance(scorer, Scorer) and scorer.column_map is not None:
-                score_args = {
-                    arg: example[scorer.column_map.get(arg, arg)]
-                    for arg in score_arg_names
-                    if scorer.column_map.get(arg, arg) in example
-                }
+                score_args = {}
+                for arg in score_arg_names:
+                    # Check column_map validity: every scorer argument must
+                    # appear as a key in column_map
+                    if arg not in scorer.column_map:
+                        message = textwrap.dedent(
+                            f"""
+                            Scorer {scorer_name} expects argument {arg} to be in `scorer.column_map` keys.
+                            Available scorer keyword argument names: {score_arg_names}
+                            scorer.column_map keys: {scorer.column_map.keys()}
+                            Hint:
+                            - column_map should follow the format: {{scorer arg name: dataset column name}}
+                            - Check that your scorer.column_map keys and values are not reversed.
+                            """
+                        )
+                        raise ValueError(message)
+
+                    # Try to map the scorer argument to a dataset column
+                    example_column_name = scorer.column_map.get(arg)
+                    if example_column_name in example:
+                        score_args[arg] = example[example_column_name]
+                    else:
+                        message = textwrap.dedent(
+                            f"""
+                            There is an issue with `scorer.column_map`: {scorer.column_map}.
+                            The value for column_map key {arg} is {example_column_name}, but
+                            {example_column_name} is not found in the dataset columns.
+                            Available dataset columns: {example.keys()}
+                            Hint:
+                            - column_map should follow the format: {{scorer arg name: dataset column name}}
+                            """
+                        )
+                        raise ValueError(message)
            else:
                score_args = {
                    k: v for k, v in example.items() if k in score_arg_names
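As a usage-level sketch of the new contract (hypothetical scorer and dataset names, mirroring the error-message example): `column_map` keys must be scorer argument names and values must be dataset column names, so a reversed mapping now fails fast with a `ValueError`:

```python
import weave
from weave import Scorer

class QAScorer(Scorer):  # hypothetical
    @weave.op
    def score(self, output: str, input: str, ground_truth: str) -> dict:
        return {"correct": output == ground_truth}

# Correct orientation: {scorer arg name: dataset column name}
scorer = QAScorer(column_map={"input": "question", "ground_truth": "answer"})

# Reversed orientation, e.g. {"question": "input", "answer": "ground_truth"},
# is now caught by the validation above instead of silently dropping arguments.
```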
Expand Down Expand Up @@ -326,6 +352,17 @@ async def predict_and_score(
f"""
Call error: {e}
If using the `Scorer` weave class, you can set the `scorer.column_map`
attribute to map scorer parameter names to dataset columns.
For example, if the scorer expects "output", "input" and "ground_truth" and we have a dataset
with columns "question" and "answer", `column_map` can be used to map the non-output parameter to like so:
{{"input": "question", "ground_truth": "answer"}}
scorer argument names: {score_arg_names}
dataset keys: {example.keys()}
scorer.column_map: {getattr(scorer, 'column_map', None)}
Options for resolving:
a. change {scorer_name} argument names to match a subset of dataset column names ({dataset_column_names_str})
b. change dataset column names to match expected {scorer_name} argument names: {required_arg_names}
6 changes: 6 additions & 0 deletions weave/flow/scorers/__init__.py
@@ -3,6 +3,10 @@
    auto_summarize,
    get_scorer_attributes,
)
+from weave.flow.scorers.llm_utils import (
+    create,
+    embed,
+)
from weave.flow.scorers.classification_scorer import (
    MultiTaskBinaryClassificationF1,
    transpose,
@@ -29,6 +33,8 @@

__all__ = [
    "auto_summarize",
+    "create",
+    "embed",
    "ContextEntityRecallScorer",
    "ContextRelevancyScorer",
    "EmbeddingSimilarityScorer",
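With these re-exports, the helpers become importable from the package root. A sketch under assumptions - the exact signatures of `create` and `embed` live in `weave/flow/scorers/llm_utils.py` and are not shown in this diff:

```python
# Import-level sketch only: the call signatures of `create` and `embed`
# are assumed to match whatever llm_utils defines.
from weave.flow.scorers import create, embed
```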
