From c7ac51a7ffdabddcb93c9f762848e11835c1791e Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Thu, 10 Oct 2024 15:16:33 +0100 Subject: [PATCH 001/150] add inital scorere, refactor --- weave/flow/scorer/__init__.py | 1 + .../flow/{scorer.py => scorer/base_scorer.py} | 0 weave/flow/scorer/regex_scorer.py | 41 +++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 weave/flow/scorer/__init__.py rename weave/flow/{scorer.py => scorer/base_scorer.py} (100%) create mode 100644 weave/flow/scorer/regex_scorer.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py new file mode 100644 index 000000000000..75628913f604 --- /dev/null +++ b/weave/flow/scorer/__init__.py @@ -0,0 +1 @@ +from .base_scorer import * \ No newline at end of file diff --git a/weave/flow/scorer.py b/weave/flow/scorer/base_scorer.py similarity index 100% rename from weave/flow/scorer.py rename to weave/flow/scorer/base_scorer.py diff --git a/weave/flow/scorer/regex_scorer.py b/weave/flow/scorer/regex_scorer.py new file mode 100644 index 000000000000..b80f1143f1da --- /dev/null +++ b/weave/flow/scorer/regex_scorer.py @@ -0,0 +1,41 @@ +from weave.flow.scorer.base_scorer import Scorer + +from typing import Union +import re +from pydantic import Field +import weave + +class RegexScorer(Scorer): + patterns: Union[str, list[str]] = Field(default_factory=list, description="The patterns or keywords to match") + ignore_case: bool = True + ignore_whitespace: bool = False + use_regex: bool = False # Use regex patterns if True + target_column: str = Field(default=["target"], description="The class names to match") + + @weave.op + def score(self, model_output: Union[dict, str], target: dict ={}) -> dict: + if isinstance(model_output, str): + model_output = {"output": model_output} + + if target is not None: + patterns = target + + flags = re.IGNORECASE if self.ignore_case else 0 + + patterns = [self.patterns] if isinstance(self.patterns, str) else self.patterns + compiled_patterns = [] + for pattern in patterns: + if not self.use_regex: + pattern = re.escape(pattern) + if self.ignore_whitespace: + pattern = ''.join(pattern.split()) + compiled_patterns.append(re.compile(pattern, flags=flags)) + + # for class_name in self.class_names: + text_to_search = model_output.get("output") if model_output else "" + if self.ignore_whitespace: + text_to_search = ''.join(text_to_search.split()) + + match_found = any(pattern.search(text_to_search) for pattern in compiled_patterns) + + return {"string_match": match_found} \ No newline at end of file From 08c83bb9a1721c15751be01d4f92efe2408035cd Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Thu, 10 Oct 2024 15:34:18 +0100 Subject: [PATCH 002/150] fixes --- weave/flow/scorer/regex_scorer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/weave/flow/scorer/regex_scorer.py b/weave/flow/scorer/regex_scorer.py index b80f1143f1da..56d9c74d1003 100644 --- a/weave/flow/scorer/regex_scorer.py +++ b/weave/flow/scorer/regex_scorer.py @@ -10,28 +10,30 @@ class RegexScorer(Scorer): ignore_case: bool = True ignore_whitespace: bool = False use_regex: bool = False # Use regex patterns if True - target_column: str = Field(default=["target"], description="The class names to match") + match_full_string: bool = False # Match the entire string if True + target_column: str = Field(default="target", description="The class name to match") @weave.op - def score(self, model_output: Union[dict, str], target: dict ={}) -> dict: + def score(self, 
model_output: Union[dict, str], target: Union[str, list[str], None] = None) -> dict: if isinstance(model_output, str): model_output = {"output": model_output} - if target is not None: - patterns = target + # Use target patterns if provided + patterns = target if target else self.patterns + if isinstance(patterns, str): + patterns = [patterns] flags = re.IGNORECASE if self.ignore_case else 0 - - patterns = [self.patterns] if isinstance(self.patterns, str) else self.patterns compiled_patterns = [] for pattern in patterns: if not self.use_regex: pattern = re.escape(pattern) if self.ignore_whitespace: pattern = ''.join(pattern.split()) + if self.match_full_string: + pattern = f'^{pattern}$' compiled_patterns.append(re.compile(pattern, flags=flags)) - # for class_name in self.class_names: text_to_search = model_output.get("output") if model_output else "" if self.ignore_whitespace: text_to_search = ''.join(text_to_search.split()) From 792827000e3b642670493453e6502f07f06708d2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 17:54:26 +0200 Subject: [PATCH 003/150] add json and xml scorers --- weave/flow/scorer/__init__.py | 4 +++- weave/flow/scorer/json_scorer.py | 24 ++++++++++++++++++++++++ weave/flow/scorer/regex_scorer.py | 2 ++ weave/flow/scorer/xml_scorer.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 weave/flow/scorer/json_scorer.py create mode 100644 weave/flow/scorer/xml_scorer.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 75628913f604..c034d7f51957 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1 +1,3 @@ -from .base_scorer import * \ No newline at end of file +from weave.flow.scorer.base_scorer import * +from weave.flow.scorer.regex_scorer import RegexScorer +from weave.flow.scorer.json_scorer import JSONScorer diff --git a/weave/flow/scorer/json_scorer.py b/weave/flow/scorer/json_scorer.py new file mode 100644 index 000000000000..38dbbda6a4ae --- /dev/null +++ b/weave/flow/scorer/json_scorer.py @@ -0,0 +1,24 @@ +import json +from typing import Any + +from weave.flow.scorer.base_scorer import Scorer + +class JSONScorer(Scorer): + """ + Score a JSON string. + """ + def score(self, model_output: Any) -> Any: + try: + result = json.loads(model_output) + + if isinstance(result, dict) or isinstance(result, list): + return True + + except json.JSONDecodeError: + pass + return False + + +if __name__ == "__main__": + scorer = JSONScorer() + print(scorer.score("{\"city\": \"San Francisco\", \"country\": \"USA\", \"column2\": \"Santiago\"}")) \ No newline at end of file diff --git a/weave/flow/scorer/regex_scorer.py b/weave/flow/scorer/regex_scorer.py index 56d9c74d1003..0fdd7527017b 100644 --- a/weave/flow/scorer/regex_scorer.py +++ b/weave/flow/scorer/regex_scorer.py @@ -5,6 +5,8 @@ from pydantic import Field import weave +from weave.flow.scorer.base_scorer import Scorer + class RegexScorer(Scorer): patterns: Union[str, list[str]] = Field(default_factory=list, description="The patterns or keywords to match") ignore_case: bool = True diff --git a/weave/flow/scorer/xml_scorer.py b/weave/flow/scorer/xml_scorer.py new file mode 100644 index 000000000000..539a16d110d5 --- /dev/null +++ b/weave/flow/scorer/xml_scorer.py @@ -0,0 +1,29 @@ +import xml.etree.ElementTree as ET +from typing import Any, Union + +from weave.flow.scorer.base_scorer import Scorer + + +class XMLScorer(Scorer): + """ + Score an XML string. 
+ """ + def score(self, model_output: Union[str, dict]) -> dict: + if isinstance(model_output, dict): + xml_string = model_output.get("output", "") + else: + xml_string = model_output + + try: + ET.fromstring(xml_string) + return True + except ET.ParseError: + return False + + +if __name__ == "__main__": + scorer = XMLScorer() + print(scorer.score(""" + San Francisco + USA + """)) \ No newline at end of file From f2c69cd3a212da942e67bb05819a06593c0a3622 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 18:46:00 +0200 Subject: [PATCH 004/150] add keys --- weave/flow/scorer/json_scorer.py | 4 ++-- weave/flow/scorer/xml_scorer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/weave/flow/scorer/json_scorer.py b/weave/flow/scorer/json_scorer.py index 38dbbda6a4ae..96c3d4d2e6c7 100644 --- a/weave/flow/scorer/json_scorer.py +++ b/weave/flow/scorer/json_scorer.py @@ -12,11 +12,11 @@ def score(self, model_output: Any) -> Any: result = json.loads(model_output) if isinstance(result, dict) or isinstance(result, list): - return True + return {"json_valid": True} except json.JSONDecodeError: pass - return False + return {"json_valid": False} if __name__ == "__main__": diff --git a/weave/flow/scorer/xml_scorer.py b/weave/flow/scorer/xml_scorer.py index 539a16d110d5..512445fa4367 100644 --- a/weave/flow/scorer/xml_scorer.py +++ b/weave/flow/scorer/xml_scorer.py @@ -16,9 +16,9 @@ def score(self, model_output: Union[str, dict]) -> dict: try: ET.fromstring(xml_string) - return True + return {"xml_valid": True} except ET.ParseError: - return False + return {"xml_valid": False} if __name__ == "__main__": From 3c398f91ced52e6ace5b10f83d29c63d45eb9a1e Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 18:46:09 +0200 Subject: [PATCH 005/150] Embed Scorer --- weave/flow/scorer/__init__.py | 1 + weave/flow/scorer/llm_scorer.py | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 weave/flow/scorer/llm_scorer.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index c034d7f51957..a374d84c842d 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,3 +1,4 @@ from weave.flow.scorer.base_scorer import * from weave.flow.scorer.regex_scorer import RegexScorer from weave.flow.scorer.json_scorer import JSONScorer +from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingScorer diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py new file mode 100644 index 000000000000..5c62ef9b77d4 --- /dev/null +++ b/weave/flow/scorer/llm_scorer.py @@ -0,0 +1,87 @@ +from pydantic import Field, field_validator +from typing import Any, Union, Type +import numpy as np + +from weave.flow.scorer.base_scorer import Scorer + +_LLM_CLIENT_TYPES = [] + +try: + from openai import OpenAI, AsyncOpenAI + _LLM_CLIENT_TYPES.append(OpenAI) + _LLM_CLIENT_TYPES.append(AsyncOpenAI) +except: + pass +try: + from anthropic import Anthropic, AsyncAnthropic + _LLM_CLIENT_TYPES.append(Anthropic) + _LLM_CLIENT_TYPES.append(AsyncAnthropic) +except: + pass +try: + from mistralai import Mistral + _LLM_CLIENT_TYPES.append(Mistral) +except: + pass + +_LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] + +_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" + +class LLMScorer(Scorer): + """ + Score an LLM output. 
+ """ + client: Any = Field(description="The LLM client to use, has to be instantiated with an api_key") + model: str = Field(description="The model to use") + + @field_validator('client') + def validate_client(cls, v): + if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES): + raise ValueError(f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}") + return v + +class EmbeddingScorer(LLMScorer): + """ + Check the embedding distance between the model output and the target. + """ + def score(self, model_output: Any, target: Any) -> Any: + if not isinstance(self.client, (OpenAI, AsyncOpenAI)): + raise ValueError("Embedding scoring only works with OpenAI or AsyncOpenAI") + + # Use AsyncOpenAI if available, otherwise use OpenAI + client = self.client if isinstance(self.client, AsyncOpenAI) else self.client + + model_embedding = client.embeddings.create( + input=model_output, model=self.model).data[0].embedding + target_embedding = client.embeddings.create( + input=target, model=self.model).data[0].embedding + + return self.cosine_similarity(model_embedding, target_embedding) + + + def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: + """ + Compute the cosine similarity between two vectors. + """ + vec1 = np.array(vec1) + vec2 = np.array(vec2) + cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) + + # cast to float + return float(cosine_sim) + + + +if __name__ == "__main__": + try: + import openai + client = openai.OpenAI() + scorer = EmbeddingScorer( + client=client, + model="text-embedding-3-small") + print(scorer.score("I don't know", "I don't know")) + except Exception as e: + print("Install openai to run this script") + + From 07502df03a82ae6e0a4fbbcf178d3ffd17fe7d10 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 19:00:30 +0200 Subject: [PATCH 006/150] add openai moderation --- weave/flow/scorer/llm_scorer.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 5c62ef9b77d4..d3cfbde83b5c 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -72,6 +72,20 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: return float(cosine_sim) +class OpenAIModerationScorer(LLMScorer): + "Use OpenAI moderation API to check if the model output is safe" + + def score(self, model_output: Any) -> Any: + if not isinstance(self.client, (OpenAI, AsyncOpenAI)): + raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") + + response = self.client.moderations.create( + model=self.model, + input=model_output, + ) + + return response.results[0] + if __name__ == "__main__": try: @@ -83,5 +97,15 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: print(scorer.score("I don't know", "I don't know")) except Exception as e: print("Install openai to run this script") + + try: + import openai + client = openai.OpenAI() + scorer = OpenAIModerationScorer( + client=client, + model="omni-moderation-latest") + print(scorer.score("I should kill myself")) + except Exception as e: + print("Install openai to run this script") From 3d2e35212b2bf7a1c7f928b272ee46e248c64ec2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 19:01:10 +0200 Subject: [PATCH 007/150] missing import --- weave/flow/scorer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/flow/scorer/__init__.py 
b/weave/flow/scorer/__init__.py index a374d84c842d..747422bfdb04 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,4 +1,4 @@ from weave.flow.scorer.base_scorer import * from weave.flow.scorer.regex_scorer import RegexScorer from weave.flow.scorer.json_scorer import JSONScorer -from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingScorer +from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingScorer, OpenAIModerationScorer From f1f604a40d96683f3cdded91b5fbff2ba5533513 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 19:30:03 +0200 Subject: [PATCH 008/150] re-invent the wheel --- weave/flow/scorer/lightllm.py | 192 ++++++++++++++++++++++++++++++++ weave/flow/scorer/llm_scorer.py | 37 +----- 2 files changed, 198 insertions(+), 31 deletions(-) create mode 100644 weave/flow/scorer/lightllm.py diff --git a/weave/flow/scorer/lightllm.py b/weave/flow/scorer/lightllm.py new file mode 100644 index 000000000000..cbe49fe73df4 --- /dev/null +++ b/weave/flow/scorer/lightllm.py @@ -0,0 +1,192 @@ +from abc import ABC, abstractmethod +from typing import List, Dict, Union, Any + + +OPENAI_DEFAULT_MODEL = "gpt-4o" +OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" + +ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20240620" + +MISTRAL_DEFAULT_MODEL = "mistral-large-latest" +MISTRAL_DEFAULT_EMBEDDING_MODEL = "mistral-embed" + + +_LLM_CLIENT_TYPES = [] + +try: + from openai import OpenAI, AsyncOpenAI + _LLM_CLIENT_TYPES.append(OpenAI) + _LLM_CLIENT_TYPES.append(AsyncOpenAI) +except: + pass +try: + from anthropic import Anthropic, AsyncAnthropic + _LLM_CLIENT_TYPES.append(Anthropic) + _LLM_CLIENT_TYPES.append(AsyncAnthropic) +except: + pass +try: + from mistralai import Mistral + _LLM_CLIENT_TYPES.append(Mistral) +except: + pass + +_LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] + +class BaseLLM(ABC): + def __init__(self, client: Any, model_id: str): + self.client = client + self.model_id = model_id + + @abstractmethod + def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: + pass + + @abstractmethod + async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: + pass + + @abstractmethod + def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + pass + + @abstractmethod + async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + pass + +class MistralLLM(BaseLLM): + def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: + response = self.client.chat.complete(model=self.model_id, messages=messages, **kwargs) + return response.choices[0].message.content + + async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: + response = await self.client.chat.complete(model=self.model_id, messages=messages, **kwargs) + return response.choices[0].message.content + + def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + if isinstance(texts, str): + texts = [texts] + response = self.client.embeddings.create(model=self.model_id, inputs=texts) + return [embedding.embedding for embedding in response.data] + + async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + if isinstance(texts, str): + texts = [texts] + response = await self.client.embeddings.create(model=self.model_id, inputs=texts) + return [embedding.embedding for embedding in response.data] + +class OpenAILLM(BaseLLM): + def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: + response = 
self.client.chat.completions.create(model=self.model_id, messages=messages, **kwargs) + return response.choices[0].message.content + + async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: + response = await self.client.chat.completions.create(model=self.model_id, messages=messages, **kwargs) + return response.choices[0].message.content + + def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + if isinstance(texts, str): + texts = [texts] + response = self.client.embeddings.create(input=texts, model=self.model_id) + return [data.embedding for data in response.data] + + async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + if isinstance(texts, str): + texts = [texts] + response = await self.client.embeddings.create(input=texts, model=self.model_id) + return [data.embedding for data in response.data] + +class AnthropicLLM(BaseLLM): + def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: + response = self.client.messages.create(model=self.model_id, messages=messages, max_tokens=2048, **kwargs) + return response.content + + async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: + response = await self.client.messages.create(model=self.model_id, messages=messages, max_tokens=2048, **kwargs) + return response.content + + def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + return [[0.0]] # Anthropic doesn't support embeddings + + async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + return [[0.0]] # Anthropic doesn't support embeddings + +class LLMFactory: + @staticmethod + def create(client: Any, model_id: str) -> BaseLLM: + client_type = type(client).__name__.lower() + if "mistral" in client_type: + return MistralLLM(client, model_id) + elif "openai" in client_type: + return OpenAILLM(client, model_id) + elif "anthropic" in client_type: + return AnthropicLLM(client, model_id) + else: + raise ValueError(f"Unsupported client type: {client_type}") + +# Helper function for dynamic imports +def import_client(provider: str): + try: + if provider == "mistral": + from mistralai import Mistral + return Mistral + elif provider == "openai": + from openai import OpenAI + return OpenAI + elif provider == "anthropic": + import anthropic + return anthropic.Anthropic + except ImportError: + return None + +# Example usage: +if __name__ == "__main__": + import os + import asyncio + + # Mistral example + MistralClient = import_client("mistral") + if MistralClient: + mistral_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) + mistral_llm = LLMFactory.create(mistral_client, MISTRAL_DEFAULT_MODEL) + mistral_response = mistral_llm.chat([{"role": "user", "content": "What is the best French cheese?"}]) + print("Mistral response:", mistral_response) + + # OpenAI example + OpenAIClient = import_client("openai") + if OpenAIClient: + openai_client = OpenAIClient() + openai_llm = LLMFactory.create(openai_client, OPENAI_DEFAULT_MODEL) + openai_response = openai_llm.chat([ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Write a haiku about recursion in programming."} + ]) + print("OpenAI response:", openai_response) + + # Anthropic example + AnthropicClient = import_client("anthropic") + if AnthropicClient: + anthropic_client = AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")) + anthropic_llm = LLMFactory.create(anthropic_client, ANTHROPIC_DEFAULT_MODEL) + anthropic_response = 
anthropic_llm.chat([{"role": "user", "content": "Hello, Claude"}]) + print("Anthropic response:", anthropic_response) + + # Embedding example + if MistralClient: + mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) + mistral_embed_llm = LLMFactory.create(mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL) + mistral_embeddings = mistral_embed_llm.embed(["Embed this sentence.", "As well as this one."]) + print("Mistral embeddings:", mistral_embeddings) + + # Async example + async def async_example(): + if OpenAIClient: + from openai import AsyncOpenAI + openai_async_client = AsyncOpenAI() + openai_async_llm = LLMFactory.create(openai_async_client, OPENAI_DEFAULT_MODEL) + openai_async_response = await openai_async_llm.achat([ + {"role": "user", "content": "What's the meaning of life?"} + ]) + print("OpenAI async response:", openai_async_response) + + asyncio.run(async_example()) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index d3cfbde83b5c..7b7038fc16d9 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -3,30 +3,12 @@ import numpy as np from weave.flow.scorer.base_scorer import Scorer - -_LLM_CLIENT_TYPES = [] +from weave.flow.scorer.lightllm import LLMFactory, _LLM_CLIENT_TYPES try: from openai import OpenAI, AsyncOpenAI - _LLM_CLIENT_TYPES.append(OpenAI) - _LLM_CLIENT_TYPES.append(AsyncOpenAI) -except: - pass -try: - from anthropic import Anthropic, AsyncAnthropic - _LLM_CLIENT_TYPES.append(Anthropic) - _LLM_CLIENT_TYPES.append(AsyncAnthropic) except: pass -try: - from mistralai import Mistral - _LLM_CLIENT_TYPES.append(Mistral) -except: - pass - -_LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] - -_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" class LLMScorer(Scorer): """ @@ -46,20 +28,13 @@ class EmbeddingScorer(LLMScorer): Check the embedding distance between the model output and the target. """ def score(self, model_output: Any, target: Any) -> Any: - if not isinstance(self.client, (OpenAI, AsyncOpenAI)): - raise ValueError("Embedding scoring only works with OpenAI or AsyncOpenAI") - - # Use AsyncOpenAI if available, otherwise use OpenAI - client = self.client if isinstance(self.client, AsyncOpenAI) else self.client - - model_embedding = client.embeddings.create( - input=model_output, model=self.model).data[0].embedding - target_embedding = client.embeddings.create( - input=target, model=self.model).data[0].embedding - + model_embedding, target_embedding = self._compute_embeddings(model_output, target) return self.cosine_similarity(model_embedding, target_embedding) - + def _compute_embeddings(self, model_output: str, target: str) -> list[float]: + llm = LLMFactory.create(self.client, self.model) + return llm.embed([model_output, target]) + def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: """ Compute the cosine similarity between two vectors. 
From 291363fbd37d159577bb9e66d20d18506ef1ae1e Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 21:03:25 +0200 Subject: [PATCH 009/150] simplify moderation output --- weave/flow/scorer/llm_scorer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 7b7038fc16d9..4ff15cdf996f 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -57,9 +57,10 @@ def score(self, model_output: Any) -> Any: response = self.client.moderations.create( model=self.model, input=model_output, - ) + ).results[0] + categories = {k: v for k, v in response.categories.dict().items() if v} + return {"flagged": response.flagged, "categories": categories} - return response.results[0] if __name__ == "__main__": From 97242ec6a3884965e70d50b62679161dffc55cf3 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 21:14:59 +0200 Subject: [PATCH 010/150] handle system message --- weave/flow/scorer/lightllm.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/weave/flow/scorer/lightllm.py b/weave/flow/scorer/lightllm.py index cbe49fe73df4..a656daec1dc9 100644 --- a/weave/flow/scorer/lightllm.py +++ b/weave/flow/scorer/lightllm.py @@ -98,11 +98,27 @@ async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[floa class AnthropicLLM(BaseLLM): def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = self.client.messages.create(model=self.model_id, messages=messages, max_tokens=2048, **kwargs) + system_message = next((msg['content'] for msg in messages if msg['role'] == 'system'), None) + user_messages = [msg for msg in messages if msg['role'] != 'system'] + response = self.client.messages.create( + model=self.model_id, + messages=user_messages, + system=system_message, + max_tokens=2048, + **kwargs + ) return response.content async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = await self.client.messages.create(model=self.model_id, messages=messages, max_tokens=2048, **kwargs) + system_message = next((msg['content'] for msg in messages if msg['role'] == 'system'), None) + user_messages = [msg for msg in messages if msg['role'] != 'system'] + response = await self.client.messages.create( + model=self.model_id, + messages=user_messages, + system=system_message, + max_tokens=2048, + **kwargs + ) return response.content def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: @@ -152,23 +168,26 @@ def import_client(provider: str): mistral_response = mistral_llm.chat([{"role": "user", "content": "What is the best French cheese?"}]) print("Mistral response:", mistral_response) - # OpenAI example + # OpenAI example with system message OpenAIClient = import_client("openai") if OpenAIClient: openai_client = OpenAIClient() openai_llm = LLMFactory.create(openai_client, OPENAI_DEFAULT_MODEL) openai_response = openai_llm.chat([ - {"role": "system", "content": "You are a helpful assistant."}, + {"role": "system", "content": "You are a helpful assistant specialized in writing poetry."}, {"role": "user", "content": "Write a haiku about recursion in programming."} ]) print("OpenAI response:", openai_response) - # Anthropic example + # Anthropic example with system message AnthropicClient = import_client("anthropic") if AnthropicClient: anthropic_client = AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")) anthropic_llm = 
LLMFactory.create(anthropic_client, ANTHROPIC_DEFAULT_MODEL) - anthropic_response = anthropic_llm.chat([{"role": "user", "content": "Hello, Claude"}]) + anthropic_response = anthropic_llm.chat([ + {"role": "system", "content": "You are Claude, an AI assistant created by Anthropic."}, + {"role": "user", "content": "Hello, Claude"} + ]) print("Anthropic response:", anthropic_response) # Embedding example @@ -178,13 +197,14 @@ def import_client(provider: str): mistral_embeddings = mistral_embed_llm.embed(["Embed this sentence.", "As well as this one."]) print("Mistral embeddings:", mistral_embeddings) - # Async example + # Async example with system message async def async_example(): if OpenAIClient: from openai import AsyncOpenAI openai_async_client = AsyncOpenAI() openai_async_llm = LLMFactory.create(openai_async_client, OPENAI_DEFAULT_MODEL) openai_async_response = await openai_async_llm.achat([ + {"role": "system", "content": "You are a philosopher AI assistant."}, {"role": "user", "content": "What's the meaning of life?"} ]) print("OpenAI async response:", openai_async_response) From ad5f02167950166b4220f9d4b4bcdad77017c13b Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 21:25:05 +0200 Subject: [PATCH 011/150] simple prompt scorer --- weave/flow/scorer/llm_scorer.py | 65 +++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 4ff15cdf996f..4abf3b7493e9 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -23,6 +23,28 @@ def validate_client(cls, v): raise ValueError(f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}") return v +class PromptScorer(LLMScorer): + """ + Score an LLM output based on the prompt. + """ + system_prompt: str = Field(default="You are a helpful assistant.", description="The system prompt to use") + user_prompt: str = Field(description="The user prompt to use") + + @field_validator('user_prompt') + def validate_user_prompt(cls, v): + "The user prompt must contain the `model_output` variable." + if "{model_output}" not in v: + raise ValueError("The user prompt must contain the `model_output` variable.") + return v + + def score(self, model_output: Any) -> Any: + llm = LLMFactory.create(self.client, self.model) + messages = [ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": self.user_prompt.format(model_output=model_output)}, + ] + return llm.chat(messages=messages) + class EmbeddingScorer(LLMScorer): """ Check the embedding distance between the model output and the target. 
@@ -64,24 +86,35 @@ def score(self, model_output: Any) -> Any: if __name__ == "__main__": - try: - import openai - client = openai.OpenAI() - scorer = EmbeddingScorer( - client=client, - model="text-embedding-3-small") - print(scorer.score("I don't know", "I don't know")) - except Exception as e: - print("Install openai to run this script") + # try: + # import openai + # client = openai.OpenAI() + # scorer = EmbeddingScorer( + # client=client, + # model="text-embedding-3-small") + # print(scorer.score("I don't know", "I don't know")) + # except Exception as e: + # print("Install openai to run this script") - try: + # try: + # import openai + # client = openai.OpenAI() + # scorer = OpenAIModerationScorer( + # client=client, + # model="omni-moderation-latest") + # print(scorer.score("I should kill myself")) + # except Exception as e: + # print("Install openai to run this script") + + # try: import openai client = openai.OpenAI() - scorer = OpenAIModerationScorer( + scorer = PromptScorer( client=client, - model="omni-moderation-latest") - print(scorer.score("I should kill myself")) - except Exception as e: - print("Install openai to run this script") - + model="gpt-4o", + system_prompt="You are a helpful assistant.", + user_prompt="Extract the entity from this phrase: \m {model_output}") + print(scorer.score("The cat is happy")) + # except Exception as e: + # print("Install openai to run this script") From 125d583e8e19ee9a0447c8d3f8d9c9832988343e Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 21:30:11 +0200 Subject: [PATCH 012/150] clean test --- weave/flow/scorer/llm_scorer.py | 47 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 4abf3b7493e9..2c4b96960010 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -32,9 +32,8 @@ class PromptScorer(LLMScorer): @field_validator('user_prompt') def validate_user_prompt(cls, v): - "The user prompt must contain the `model_output` variable." 
if "{model_output}" not in v: - raise ValueError("The user prompt must contain the `model_output` variable.") + raise ValueError("The user prompt must contain the `{model_output}` variable.") return v def score(self, model_output: Any) -> Any: @@ -86,35 +85,35 @@ def score(self, model_output: Any) -> Any: if __name__ == "__main__": - # try: - # import openai - # client = openai.OpenAI() - # scorer = EmbeddingScorer( - # client=client, - # model="text-embedding-3-small") - # print(scorer.score("I don't know", "I don't know")) - # except Exception as e: - # print("Install openai to run this script") + try: + import openai + client = openai.OpenAI() + scorer = EmbeddingScorer( + client=client, + model="text-embedding-3-small") + print(scorer.score("I don't know", "I don't know")) + except Exception as e: + print("Install openai to run this script") - # try: - # import openai - # client = openai.OpenAI() - # scorer = OpenAIModerationScorer( - # client=client, - # model="omni-moderation-latest") - # print(scorer.score("I should kill myself")) - # except Exception as e: - # print("Install openai to run this script") + try: + import openai + client = openai.OpenAI() + scorer = OpenAIModerationScorer( + client=client, + model="omni-moderation-latest") + print(scorer.score("I should kill myself")) + except Exception as e: + print("Install openai to run this script") - # try: + try: import openai client = openai.OpenAI() scorer = PromptScorer( client=client, model="gpt-4o", system_prompt="You are a helpful assistant.", - user_prompt="Extract the entity from this phrase: \m {model_output}") + user_prompt="Extract the entity from this phrase: \n {model_output}") print(scorer.score("The cat is happy")) - # except Exception as e: - # print("Install openai to run this script") + except Exception as e: + print("Install openai to run this script") From 200c1154e11cd38d7b7e452e7f73848ff8f8d40f Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 21:39:10 +0200 Subject: [PATCH 013/150] pydantic validator --- weave/flow/scorer/pydantic_scorer.py | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 weave/flow/scorer/pydantic_scorer.py diff --git a/weave/flow/scorer/pydantic_scorer.py b/weave/flow/scorer/pydantic_scorer.py new file mode 100644 index 000000000000..254e6ca243e9 --- /dev/null +++ b/weave/flow/scorer/pydantic_scorer.py @@ -0,0 +1,40 @@ +from pydantic import BaseModel, ValidationError +from typing import Any, Type + +from weave.flow.scorer.base_scorer import Scorer + +class PydanticScorer(Scorer): + """ + Validate the model output against a pydantic model. 
+ """ + model: Type[BaseModel] + + def score(self, model_output: Any): + if isinstance(model_output, str): + try: + self.model.model_validate_json(model_output) + return True + except ValidationError: + return False + else: + try: + self.model.model_validate(model_output) + return True + except ValidationError: + return False + + +if __name__ == "__main__": + from pydantic import BaseModel + + class User(BaseModel): + name: str + age: int + + scorer = PydanticScorer(model=User) + + model_output = "{\"name\": \"John\", \"age\": 30}" + print(scorer.score(model_output)) + + model_output = {"name": "John", "age": 30} + print(scorer.score(model_output)) From 18d3d81173db2442bc074e2b3778aacea19725e6 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 10 Oct 2024 21:44:41 +0200 Subject: [PATCH 014/150] move classification out --- weave/flow/scorer/__init__.py | 20 +++++++++- weave/flow/scorer/base_scorer.py | 56 +--------------------------- weave/flow/scorer/classification.py | 58 +++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 56 deletions(-) create mode 100644 weave/flow/scorer/classification.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 747422bfdb04..7cac801b21c0 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,4 +1,22 @@ -from weave.flow.scorer.base_scorer import * +from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes +from weave.flow.scorer.classification import MultiTaskBinaryClassificationF1 from weave.flow.scorer.regex_scorer import RegexScorer from weave.flow.scorer.json_scorer import JSONScorer from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingScorer, OpenAIModerationScorer +from weave.flow.scorer.pydantic_scorer import PydanticScorer +from weave.flow.scorer.hallucination import HallucinationScorer + + +__all__ = [ + "Scorer", + "auto_summarize", + "get_scorer_attributes", + "MultiTaskBinaryClassificationF1", + "RegexScorer", + "JSONScorer", + "LLMScorer", + "EmbeddingScorer", + "OpenAIModerationScorer", + "PydanticScorer", + "HallucinationScorer", +] diff --git a/weave/flow/scorer/base_scorer.py b/weave/flow/scorer/base_scorer.py index e69f3afeb3f1..511cd32a047e 100644 --- a/weave/flow/scorer/base_scorer.py +++ b/weave/flow/scorer/base_scorer.py @@ -1,4 +1,3 @@ -from collections import defaultdict from numbers import Number from typing import Any, Callable, Optional, Sequence, Tuple, Union @@ -102,57 +101,4 @@ def get_scorer_attributes( summarize_fn = auto_summarize # type: ignore else: raise ValueError(f"Unknown scorer type: {scorer}") - return (scorer_name, score_fn, summarize_fn) # type: ignore - - -def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: - # if any denom is zero, then zero. could use NaN instead... 
- precision: float = 0 - if tp or fp: - precision = tp / (tp + fp) - recall: float = 0 - if tp or fn: - recall = tp / (tp + fn) - f1: float = 0 - if precision or recall: - f1 = 2 * (precision * recall) / (precision + recall) - return precision, recall, f1 - - -class MultiTaskBinaryClassificationF1(Scorer): - class_names: list[str] - - @weave.op() - def summarize(self, score_rows: list) -> Optional[dict]: - result = {} - cols = transpose(score_rows) - - for class_name in self.class_names: - col = cols[class_name] - tp = sum(r["correct"] and not r["negative"] for r in col) - fp = sum(not r["correct"] and not r["negative"] for r in col) - fn = sum(not r["correct"] and r["negative"] for r in col) - precision, recall, f1 = p_r_f1(tp, fp, fn) - result[class_name] = {"f1": f1, "precision": precision, "recall": recall} - - return result - - @weave.op() - def score(self, target: dict, model_output: Optional[dict]) -> dict: - result = {} - for class_name in self.class_names: - class_label = target.get(class_name) - class_model_output = model_output.get(class_name) if model_output else None - result[class_name] = { - "correct": class_label == class_model_output, - "negative": not class_model_output, - } - return result - - -def transpose(rows: list[dict]) -> dict[str, list]: - cols = defaultdict(list) - for row in rows: - for k, v in row.items(): - cols[k].append(v) - return dict(cols) + return (scorer_name, score_fn, summarize_fn) # type: ignore \ No newline at end of file diff --git a/weave/flow/scorer/classification.py b/weave/flow/scorer/classification.py new file mode 100644 index 000000000000..b86fb890d5e4 --- /dev/null +++ b/weave/flow/scorer/classification.py @@ -0,0 +1,58 @@ +from collections import defaultdict +from typing import Optional, Tuple + +import weave +from weave.flow.scorer.base_scorer import Scorer + + +def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: + # if any denom is zero, then zero. could use NaN instead... 
+ precision: float = 0 + if tp or fp: + precision = tp / (tp + fp) + recall: float = 0 + if tp or fn: + recall = tp / (tp + fn) + f1: float = 0 + if precision or recall: + f1 = 2 * (precision * recall) / (precision + recall) + return precision, recall, f1 + + +class MultiTaskBinaryClassificationF1(Scorer): + class_names: list[str] + + @weave.op() + def summarize(self, score_rows: list) -> Optional[dict]: + result = {} + cols = transpose(score_rows) + + for class_name in self.class_names: + col = cols[class_name] + tp = sum(r["correct"] and not r["negative"] for r in col) + fp = sum(not r["correct"] and not r["negative"] for r in col) + fn = sum(not r["correct"] and r["negative"] for r in col) + precision, recall, f1 = p_r_f1(tp, fp, fn) + result[class_name] = {"f1": f1, "precision": precision, "recall": recall} + + return result + + @weave.op() + def score(self, target: dict, model_output: Optional[dict]) -> dict: + result = {} + for class_name in self.class_names: + class_label = target.get(class_name) + class_model_output = model_output.get(class_name) if model_output else None + result[class_name] = { + "correct": class_label == class_model_output, + "negative": not class_model_output, + } + return result + + +def transpose(rows: list[dict]) -> dict[str, list]: + cols = defaultdict(list) + for row in rows: + for k, v in row.items(): + cols[k].append(v) + return dict(cols) From ddd1dfdee368ab587354e83200aba45fb5b55b53 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 09:41:59 +0200 Subject: [PATCH 015/150] rename embedding --- weave/flow/scorer/__init__.py | 4 ++-- weave/flow/scorer/llm_scorer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 7cac801b21c0..b727f3db4bbb 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,8 +1,8 @@ from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes -from weave.flow.scorer.classification import MultiTaskBinaryClassificationF1 +from weave.flow.scorer.classification import MultiTaskBinaryClassificationF1, transpose from weave.flow.scorer.regex_scorer import RegexScorer from weave.flow.scorer.json_scorer import JSONScorer -from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingScorer, OpenAIModerationScorer +from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingSimilarityScorer, OpenAIModerationScorer from weave.flow.scorer.pydantic_scorer import PydanticScorer from weave.flow.scorer.hallucination import HallucinationScorer diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 2c4b96960010..ae4f62f65f22 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -44,7 +44,7 @@ def score(self, model_output: Any) -> Any: ] return llm.chat(messages=messages) -class EmbeddingScorer(LLMScorer): +class EmbeddingSimilarityScorer(LLMScorer): """ Check the embedding distance between the model output and the target. 
""" @@ -88,7 +88,7 @@ def score(self, model_output: Any) -> Any: try: import openai client = openai.OpenAI() - scorer = EmbeddingScorer( + scorer = EmbeddingSimilarityScorer( client=client, model="text-embedding-3-small") print(scorer.score("I don't know", "I don't know")) From 49cb13f88a9ef71f1bc4d0458de1db066a048a9d Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 09:42:04 +0200 Subject: [PATCH 016/150] add ragas support --- weave/flow/scorer/ragas.py | 219 +++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 weave/flow/scorer/ragas.py diff --git a/weave/flow/scorer/ragas.py b/weave/flow/scorer/ragas.py new file mode 100644 index 000000000000..5bbc448b903a --- /dev/null +++ b/weave/flow/scorer/ragas.py @@ -0,0 +1,219 @@ +from typing import Any, List + +from weave.flow.scorer.lightllm import LLMFactory +from weave.flow.scorer.llm_scorer import EmbeddingSimilarityScorer, LLMScorer + + +class ContextEntityRecallScorer(LLMScorer): + """ + Estimates context recall by extracting entities from the expected answer and + the provided context, then computes the recall. + """ + + extraction_prompt: str = """ + Extract unique entities from the following text without repetition. + + Text: {text} + Entities: + """ + + def extract_entities(self, text: str) -> List[str]: + # Use LLM to extract entities + llm = LLMFactory.create(self.client, self.model) + prompt = self.extraction_prompt.format(text=text) + response = llm.chat(messages=[{"role": "user", "content": prompt}]) + # Assume entities are returned as a comma-separated list + entities = [e.strip() for e in response.split(",")] + return entities + + def score(self, model_output: Any, expected: str, context: str) -> float: + # Extract entities + expected_entities = self.extract_entities(expected) + context_entities = self.extract_entities(context) + # Calculate recall + if not expected_entities: + return 0.0 + matches = set(expected_entities) & set(context_entities) + recall = len(matches) / len(expected_entities) + return recall + + +class ContextRelevancyScorer(LLMScorer): + """Evaluates the relevancy of the provided context to the input question.""" + + relevancy_prompt: str = """ + Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. + + Question: {question} + Context: {context} + Relevancy Score (0-1): + """ + + def score(self, model_output: Any, input_text: str, context: str) -> float: + llm = LLMFactory.create(self.client, self.model) + prompt = self.relevancy_prompt.format(question=input_text, context=context) + response = llm.chat(messages=[{"role": "user", "content": prompt}]) + # Parse the response to get the relevancy score + try: + score = float(response.strip()) + return max(0.0, min(score, 1.0)) # Ensure the score is between 0 and 1 + except ValueError: + return 0.0 # Return 0 if parsing fails + + +class ContextPrecisionScorer(LLMScorer): + """Determines whether the provided context was useful in arriving at the given answer.""" + + precision_prompt: str = """ + Given the question, answer, and context, determine if the context was useful in arriving at the answer. + Respond with 1 if useful, 0 if not. 
+ + Question: {question} + Answer: {answer} + Context: {context} + Verdict (1 for useful, 0 for not useful): + """ + + def score( + self, model_output: Any, input_text: str, expected: str, context: str + ) -> float: + llm = LLMFactory.create(self.client, self.model) + prompt = self.precision_prompt.format( + question=input_text, answer=expected, context=context + ) + response = llm.chat(messages=[{"role": "user", "content": prompt}]) + # Parse the response to get the verdict + try: + verdict = int(response.strip()) + return float(verdict) + except ValueError: + return 0.0 # Return 0 if parsing fails + + +class FaithfulnessScorer(LLMScorer): + """Measures the factual consistency of the generated answer against the provided context.""" + + faithfulness_prompt: str = """ + Compare the following answer and context for factual consistency. Rate the faithfulness on a scale from 0 to 1. + + Answer: {answer} + Context: {context} + Faithfulness Score (0-1): + """ + + def score(self, model_output: Any, expected: str, context: str) -> float: + llm = LLMFactory.create(self.client, self.model) + answer = model_output.get("answer", "") + prompt = self.faithfulness_prompt.format(answer=answer, context=context) + response = llm.chat(messages=[{"role": "user", "content": prompt}]) + # Parse the response to get the faithfulness score + try: + score = float(response.strip()) + return max(0.0, min(score, 1.0)) + except ValueError: + return 0.0 # Return 0 if parsing fails + + +class AnswerSimilarityScorer(EmbeddingSimilarityScorer): + """Measures the similarity between the generated answer and the expected answer.""" + + def score(self, model_output: Any, expected: str) -> float: + generated_answer = model_output.get("answer", "") + return super().score(generated_answer, expected) + + +from typing import Any + +from weave.flow.scorer.llm_scorer import LLMScorer + + +class AnswerCorrectnessScorer(LLMScorer): + """Evaluates the correctness of the answer based on the ground truth.""" + + correctness_prompt: str = """ + Given the question, generated answer, and ground truth, rate the correctness of the answer on a scale from 0 to 1. 
+ + Question: {question} + Generated Answer: {generated_answer} + Ground Truth: {ground_truth} + Correctness Score (0-1): + """ + + def score(self, model_output: Any, input_text: str, expected: str) -> float: + llm = LLMFactory.create(self.client, self.model) + generated_answer = model_output.get("answer", "") + prompt = self.correctness_prompt.format( + question=input_text, + generated_answer=generated_answer, + ground_truth=expected, + ) + response = llm.chat(messages=[{"role": "user", "content": prompt}]) + # Parse the response to get the correctness score + try: + score = float(response.strip()) + return max(0.0, min(score, 1.0)) + except ValueError: + return 0.0 # Return 0 if parsing fails + + +if __name__ == "__main__": + import os + import weave + try: + from weave.flow.scorer.lightllm import import_client + + # Instantiate your LLM client + OpenAIClient = import_client("openai") + if OpenAIClient: + llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"]) # Replace with your API key + else: + raise ImportError("OpenAI client not available") + + # Instantiate scorers + context_entity_recall_scorer = ContextEntityRecallScorer( + client=llm_client, model="gpt-4o" + ) + context_relevancy_scorer = ContextRelevancyScorer(client=llm_client, model="gpt-4") + context_precision_scorer = ContextPrecisionScorer(client=llm_client, model="gpt-4") + faithfulness_scorer = FaithfulnessScorer(client=llm_client, model="gpt-4") + answer_similarity_scorer = AnswerSimilarityScorer( + client=llm_client, model="text-embedding-ada-002" + ) + answer_correctness_scorer = AnswerCorrectnessScorer( + client=llm_client, model="gpt-4o" + ) + + # Create your dataset of examples + examples = [ + {"question": "What is the capital of France?", "expected": "Paris", "context": "Paris is the capital of France."}, + {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee", "context": "Harper Lee is the author of 'To Kill a Mockingbird'."}, + # Add more examples as needed + ] + + scorers = [ + context_entity_recall_scorer, + context_relevancy_scorer, + context_precision_scorer, + faithfulness_scorer, + answer_similarity_scorer, + answer_correctness_scorer, + ] + + for example in examples: + model_output = {"answer": example["expected"]} # Simulate model output + for scorer in scorers: + if isinstance(scorer, ContextEntityRecallScorer): + score = scorer.score(model_output, example["expected"], example["context"]) + elif isinstance(scorer, ContextRelevancyScorer): + score = scorer.score(model_output, example["question"], example["context"]) + elif isinstance(scorer, ContextPrecisionScorer): + score = scorer.score(model_output, example["question"], example["expected"], example["context"]) + elif isinstance(scorer, FaithfulnessScorer): + score = scorer.score(model_output, example["expected"], example["context"]) + elif isinstance(scorer, AnswerSimilarityScorer): + score = scorer.score(model_output, example["expected"]) + elif isinstance(scorer, AnswerCorrectnessScorer): + score = scorer.score(model_output, example["question"], example["expected"]) + print(f"{scorer.__class__.__name__} score for '{example['question']}': {score}") + except Exception as e: + print(e) \ No newline at end of file From 9a224902b70506fa786905811b77fddf2d725d6c Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 09:44:42 +0200 Subject: [PATCH 017/150] ref --- weave/flow/scorer/ragas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weave/flow/scorer/ragas.py b/weave/flow/scorer/ragas.py index 
5bbc448b903a..c849ae73c6c1 100644 --- a/weave/flow/scorer/ragas.py +++ b/weave/flow/scorer/ragas.py @@ -1,3 +1,5 @@ +# implememting metrics from ragas: https://github.com/explodinggradients/ragas + from typing import Any, List from weave.flow.scorer.lightllm import LLMFactory From 674080b4a33393696aecc87dd8bc258460268f0c Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 09:46:28 +0200 Subject: [PATCH 018/150] update init --- weave/flow/scorer/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index b727f3db4bbb..ccf0d84ef35e 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -5,6 +5,7 @@ from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingSimilarityScorer, OpenAIModerationScorer from weave.flow.scorer.pydantic_scorer import PydanticScorer from weave.flow.scorer.hallucination import HallucinationScorer +from weave.flow.scorer.ragas import AnswerCorrectnessScorer, ContextEntityRecallScorer, ContextPrecisionScorer, ContextRelevancyScorer, FaithfulnessScorer, AnswerSimilarityScorer __all__ = [ @@ -12,11 +13,18 @@ "auto_summarize", "get_scorer_attributes", "MultiTaskBinaryClassificationF1", + "transpose", "RegexScorer", "JSONScorer", "LLMScorer", - "EmbeddingScorer", + "EmbeddingSimilarityScorer", "OpenAIModerationScorer", "PydanticScorer", "HallucinationScorer", + "AnswerCorrectnessScorer", + "ContextEntityRecallScorer", + "ContextPrecisionScorer", + "ContextRelevancyScorer", + "FaithfulnessScorer", + "AnswerSimilarityScorer", ] From 0f16c194baf6d5b3be6415256015310b70610561 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 09:48:00 +0200 Subject: [PATCH 019/150] lint --- weave/flow/scorer/__init__.py | 20 +++-- weave/flow/scorer/base_scorer.py | 2 +- weave/flow/scorer/hallucination.py | 6 ++ weave/flow/scorer/json_scorer.py | 14 ++-- weave/flow/scorer/lightllm.py | 121 +++++++++++++++++++-------- weave/flow/scorer/llm_scorer.py | 92 +++++++++++--------- weave/flow/scorer/pydantic_scorer.py | 11 +-- weave/flow/scorer/ragas.py | 56 ++++++++++--- weave/flow/scorer/regex_scorer.py | 28 ++++--- weave/flow/scorer/xml_scorer.py | 15 ++-- 10 files changed, 245 insertions(+), 120 deletions(-) create mode 100644 weave/flow/scorer/hallucination.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index ccf0d84ef35e..946f6967be6c 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,12 +1,22 @@ from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes from weave.flow.scorer.classification import MultiTaskBinaryClassificationF1, transpose -from weave.flow.scorer.regex_scorer import RegexScorer +from weave.flow.scorer.hallucination import HallucinationScorer from weave.flow.scorer.json_scorer import JSONScorer -from weave.flow.scorer.llm_scorer import LLMScorer, EmbeddingSimilarityScorer, OpenAIModerationScorer +from weave.flow.scorer.llm_scorer import ( + EmbeddingSimilarityScorer, + LLMScorer, + OpenAIModerationScorer, +) from weave.flow.scorer.pydantic_scorer import PydanticScorer -from weave.flow.scorer.hallucination import HallucinationScorer -from weave.flow.scorer.ragas import AnswerCorrectnessScorer, ContextEntityRecallScorer, ContextPrecisionScorer, ContextRelevancyScorer, FaithfulnessScorer, AnswerSimilarityScorer - +from weave.flow.scorer.ragas import ( + AnswerCorrectnessScorer, + AnswerSimilarityScorer, + ContextEntityRecallScorer, + 
ContextPrecisionScorer, + ContextRelevancyScorer, + FaithfulnessScorer, +) +from weave.flow.scorer.regex_scorer import RegexScorer __all__ = [ "Scorer", diff --git a/weave/flow/scorer/base_scorer.py b/weave/flow/scorer/base_scorer.py index 511cd32a047e..b51c63d51245 100644 --- a/weave/flow/scorer/base_scorer.py +++ b/weave/flow/scorer/base_scorer.py @@ -101,4 +101,4 @@ def get_scorer_attributes( summarize_fn = auto_summarize # type: ignore else: raise ValueError(f"Unknown scorer type: {scorer}") - return (scorer_name, score_fn, summarize_fn) # type: ignore \ No newline at end of file + return (scorer_name, score_fn, summarize_fn) # type: ignore diff --git a/weave/flow/scorer/hallucination.py b/weave/flow/scorer/hallucination.py new file mode 100644 index 000000000000..0e3fda5d71f8 --- /dev/null +++ b/weave/flow/scorer/hallucination.py @@ -0,0 +1,6 @@ +from weave.flow.scorer.llm_scorer import PromptScorer + + +class HallucinationScorer(PromptScorer): + def score(self, model_output, target): + return super().score(model_output, target) diff --git a/weave/flow/scorer/json_scorer.py b/weave/flow/scorer/json_scorer.py index 96c3d4d2e6c7..4b2bcac14f03 100644 --- a/weave/flow/scorer/json_scorer.py +++ b/weave/flow/scorer/json_scorer.py @@ -3,10 +3,10 @@ from weave.flow.scorer.base_scorer import Scorer + class JSONScorer(Scorer): - """ - Score a JSON string. - """ + """Score a JSON string.""" + def score(self, model_output: Any) -> Any: try: result = json.loads(model_output) @@ -17,8 +17,12 @@ def score(self, model_output: Any) -> Any: except json.JSONDecodeError: pass return {"json_valid": False} - + if __name__ == "__main__": scorer = JSONScorer() - print(scorer.score("{\"city\": \"San Francisco\", \"country\": \"USA\", \"column2\": \"Santiago\"}")) \ No newline at end of file + print( + scorer.score( + '{"city": "San Francisco", "country": "USA", "column2": "Santiago"}' + ) + ) diff --git a/weave/flow/scorer/lightllm.py b/weave/flow/scorer/lightllm.py index a656daec1dc9..92e97c47913b 100644 --- a/weave/flow/scorer/lightllm.py +++ b/weave/flow/scorer/lightllm.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Dict, Union, Any - +from typing import Any, Dict, List, Union OPENAI_DEFAULT_MODEL = "gpt-4o" OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" @@ -14,25 +13,29 @@ _LLM_CLIENT_TYPES = [] try: - from openai import OpenAI, AsyncOpenAI + from openai import AsyncOpenAI, OpenAI + _LLM_CLIENT_TYPES.append(OpenAI) _LLM_CLIENT_TYPES.append(AsyncOpenAI) except: - pass + pass try: from anthropic import Anthropic, AsyncAnthropic + _LLM_CLIENT_TYPES.append(Anthropic) _LLM_CLIENT_TYPES.append(AsyncAnthropic) except: - pass + pass try: from mistralai import Mistral + _LLM_CLIENT_TYPES.append(Mistral) except: - pass + pass _LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] + class BaseLLM(ABC): def __init__(self, client: Any, model_id: str): self.client = client @@ -54,13 +57,18 @@ def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: pass + class MistralLLM(BaseLLM): def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = self.client.chat.complete(model=self.model_id, messages=messages, **kwargs) + response = self.client.chat.complete( + model=self.model_id, messages=messages, **kwargs + ) return response.choices[0].message.content async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = await 
self.client.chat.complete(model=self.model_id, messages=messages, **kwargs) + response = await self.client.chat.complete( + model=self.model_id, messages=messages, **kwargs + ) return response.choices[0].message.content def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: @@ -72,16 +80,23 @@ def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: if isinstance(texts, str): texts = [texts] - response = await self.client.embeddings.create(model=self.model_id, inputs=texts) + response = await self.client.embeddings.create( + model=self.model_id, inputs=texts + ) return [embedding.embedding for embedding in response.data] + class OpenAILLM(BaseLLM): def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = self.client.chat.completions.create(model=self.model_id, messages=messages, **kwargs) + response = self.client.chat.completions.create( + model=self.model_id, messages=messages, **kwargs + ) return response.choices[0].message.content async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = await self.client.chat.completions.create(model=self.model_id, messages=messages, **kwargs) + response = await self.client.chat.completions.create( + model=self.model_id, messages=messages, **kwargs + ) return response.choices[0].message.content def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: @@ -96,28 +111,33 @@ async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[floa response = await self.client.embeddings.create(input=texts, model=self.model_id) return [data.embedding for data in response.data] + class AnthropicLLM(BaseLLM): def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - system_message = next((msg['content'] for msg in messages if msg['role'] == 'system'), None) - user_messages = [msg for msg in messages if msg['role'] != 'system'] + system_message = next( + (msg["content"] for msg in messages if msg["role"] == "system"), None + ) + user_messages = [msg for msg in messages if msg["role"] != "system"] response = self.client.messages.create( model=self.model_id, messages=user_messages, system=system_message, max_tokens=2048, - **kwargs + **kwargs, ) return response.content async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - system_message = next((msg['content'] for msg in messages if msg['role'] == 'system'), None) - user_messages = [msg for msg in messages if msg['role'] != 'system'] + system_message = next( + (msg["content"] for msg in messages if msg["role"] == "system"), None + ) + user_messages = [msg for msg in messages if msg["role"] != "system"] response = await self.client.messages.create( model=self.model_id, messages=user_messages, system=system_message, max_tokens=2048, - **kwargs + **kwargs, ) return response.content @@ -127,6 +147,7 @@ def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: return [[0.0]] # Anthropic doesn't support embeddings + class LLMFactory: @staticmethod def create(client: Any, model_id: str) -> BaseLLM: @@ -140,32 +161,39 @@ def create(client: Any, model_id: str) -> BaseLLM: else: raise ValueError(f"Unsupported client type: {client_type}") + # Helper function for dynamic imports def import_client(provider: str): try: if provider == "mistral": from mistralai import Mistral + return Mistral 
elif provider == "openai": from openai import OpenAI + return OpenAI elif provider == "anthropic": import anthropic + return anthropic.Anthropic except ImportError: return None + # Example usage: if __name__ == "__main__": - import os import asyncio + import os # Mistral example MistralClient = import_client("mistral") if MistralClient: mistral_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) mistral_llm = LLMFactory.create(mistral_client, MISTRAL_DEFAULT_MODEL) - mistral_response = mistral_llm.chat([{"role": "user", "content": "What is the best French cheese?"}]) + mistral_response = mistral_llm.chat( + [{"role": "user", "content": "What is the best French cheese?"}] + ) print("Mistral response:", mistral_response) # OpenAI example with system message @@ -173,10 +201,18 @@ def import_client(provider: str): if OpenAIClient: openai_client = OpenAIClient() openai_llm = LLMFactory.create(openai_client, OPENAI_DEFAULT_MODEL) - openai_response = openai_llm.chat([ - {"role": "system", "content": "You are a helpful assistant specialized in writing poetry."}, - {"role": "user", "content": "Write a haiku about recursion in programming."} - ]) + openai_response = openai_llm.chat( + [ + { + "role": "system", + "content": "You are a helpful assistant specialized in writing poetry.", + }, + { + "role": "user", + "content": "Write a haiku about recursion in programming.", + }, + ] + ) print("OpenAI response:", openai_response) # Anthropic example with system message @@ -184,29 +220,46 @@ def import_client(provider: str): if AnthropicClient: anthropic_client = AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")) anthropic_llm = LLMFactory.create(anthropic_client, ANTHROPIC_DEFAULT_MODEL) - anthropic_response = anthropic_llm.chat([ - {"role": "system", "content": "You are Claude, an AI assistant created by Anthropic."}, - {"role": "user", "content": "Hello, Claude"} - ]) + anthropic_response = anthropic_llm.chat( + [ + { + "role": "system", + "content": "You are Claude, an AI assistant created by Anthropic.", + }, + {"role": "user", "content": "Hello, Claude"}, + ] + ) print("Anthropic response:", anthropic_response) # Embedding example if MistralClient: mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) - mistral_embed_llm = LLMFactory.create(mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL) - mistral_embeddings = mistral_embed_llm.embed(["Embed this sentence.", "As well as this one."]) + mistral_embed_llm = LLMFactory.create( + mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL + ) + mistral_embeddings = mistral_embed_llm.embed( + ["Embed this sentence.", "As well as this one."] + ) print("Mistral embeddings:", mistral_embeddings) # Async example with system message async def async_example(): if OpenAIClient: from openai import AsyncOpenAI + openai_async_client = AsyncOpenAI() - openai_async_llm = LLMFactory.create(openai_async_client, OPENAI_DEFAULT_MODEL) - openai_async_response = await openai_async_llm.achat([ - {"role": "system", "content": "You are a philosopher AI assistant."}, - {"role": "user", "content": "What's the meaning of life?"} - ]) + openai_async_llm = LLMFactory.create( + openai_async_client, OPENAI_DEFAULT_MODEL + ) + openai_async_response = await openai_async_llm.achat( + [ + { + "role": "system", + "content": "You are a philosopher AI assistant.", + }, + {"role": "user", "content": "What's the meaning of life?"}, + ] + ) print("OpenAI async response:", openai_async_response) asyncio.run(async_example()) diff --git 
a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index ae4f62f65f22..e8022dda662e 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -1,68 +1,80 @@ -from pydantic import Field, field_validator -from typing import Any, Union, Type +from typing import Any + import numpy as np +from pydantic import Field, field_validator from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.lightllm import LLMFactory, _LLM_CLIENT_TYPES +from weave.flow.scorer.lightllm import _LLM_CLIENT_TYPES, LLMFactory try: - from openai import OpenAI, AsyncOpenAI + from openai import AsyncOpenAI, OpenAI except: - pass + pass + class LLMScorer(Scorer): - """ - Score an LLM output. - """ - client: Any = Field(description="The LLM client to use, has to be instantiated with an api_key") + """Score an LLM output.""" + + client: Any = Field( + description="The LLM client to use, has to be instantiated with an api_key" + ) model: str = Field(description="The model to use") - @field_validator('client') + @field_validator("client") def validate_client(cls, v): if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES): - raise ValueError(f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}") + raise ValueError( + f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" + ) return v + class PromptScorer(LLMScorer): - """ - Score an LLM output based on the prompt. - """ - system_prompt: str = Field(default="You are a helpful assistant.", description="The system prompt to use") + """Score an LLM output based on the prompt.""" + + system_prompt: str = Field( + default="You are a helpful assistant.", description="The system prompt to use" + ) user_prompt: str = Field(description="The user prompt to use") - @field_validator('user_prompt') + @field_validator("user_prompt") def validate_user_prompt(cls, v): if "{model_output}" not in v: - raise ValueError("The user prompt must contain the `{model_output}` variable.") + raise ValueError( + "The user prompt must contain the `{model_output}` variable." + ) return v - + def score(self, model_output: Any) -> Any: llm = LLMFactory.create(self.client, self.model) messages = [ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": self.user_prompt.format(model_output=model_output)}, + { + "role": "user", + "content": self.user_prompt.format(model_output=model_output), + }, ] return llm.chat(messages=messages) + class EmbeddingSimilarityScorer(LLMScorer): - """ - Check the embedding distance between the model output and the target. - """ + """Check the embedding distance between the model output and the target.""" + def score(self, model_output: Any, target: Any) -> Any: - model_embedding, target_embedding = self._compute_embeddings(model_output, target) + model_embedding, target_embedding = self._compute_embeddings( + model_output, target + ) return self.cosine_similarity(model_embedding, target_embedding) - + def _compute_embeddings(self, model_output: str, target: str) -> list[float]: llm = LLMFactory.create(self.client, self.model) return llm.embed([model_output, target]) - + def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: - """ - Compute the cosine similarity between two vectors. 
- """ + """Compute the cosine similarity between two vectors.""" vec1 = np.array(vec1) vec2 = np.array(vec2) - cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) + cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) # cast to float return float(cosine_sim) @@ -74,46 +86,46 @@ class OpenAIModerationScorer(LLMScorer): def score(self, model_output: Any) -> Any: if not isinstance(self.client, (OpenAI, AsyncOpenAI)): raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") - + response = self.client.moderations.create( model=self.model, input=model_output, ).results[0] categories = {k: v for k, v in response.categories.dict().items() if v} return {"flagged": response.flagged, "categories": categories} - if __name__ == "__main__": try: import openai + client = openai.OpenAI() scorer = EmbeddingSimilarityScorer( - client=client, - model="text-embedding-3-small") + client=client, model="text-embedding-3-small" + ) print(scorer.score("I don't know", "I don't know")) except Exception as e: print("Install openai to run this script") try: import openai + client = openai.OpenAI() - scorer = OpenAIModerationScorer( - client=client, - model="omni-moderation-latest") + scorer = OpenAIModerationScorer(client=client, model="omni-moderation-latest") print(scorer.score("I should kill myself")) except Exception as e: print("Install openai to run this script") - + try: import openai + client = openai.OpenAI() scorer = PromptScorer( - client=client, + client=client, model="gpt-4o", system_prompt="You are a helpful assistant.", - user_prompt="Extract the entity from this phrase: \n {model_output}") + user_prompt="Extract the entity from this phrase: \n {model_output}", + ) print(scorer.score("The cat is happy")) except Exception as e: print("Install openai to run this script") - diff --git a/weave/flow/scorer/pydantic_scorer.py b/weave/flow/scorer/pydantic_scorer.py index 254e6ca243e9..21bd0c6de357 100644 --- a/weave/flow/scorer/pydantic_scorer.py +++ b/weave/flow/scorer/pydantic_scorer.py @@ -1,12 +1,13 @@ -from pydantic import BaseModel, ValidationError from typing import Any, Type +from pydantic import BaseModel, ValidationError + from weave.flow.scorer.base_scorer import Scorer + class PydanticScorer(Scorer): - """ - Validate the model output against a pydantic model. 
- """ + """Validate the model output against a pydantic model.""" + model: Type[BaseModel] def score(self, model_output: Any): @@ -33,7 +34,7 @@ class User(BaseModel): scorer = PydanticScorer(model=User) - model_output = "{\"name\": \"John\", \"age\": 30}" + model_output = '{"name": "John", "age": 30}' print(scorer.score(model_output)) model_output = {"name": "John", "age": 30} diff --git a/weave/flow/scorer/ragas.py b/weave/flow/scorer/ragas.py index c849ae73c6c1..cff8fbea12ad 100644 --- a/weave/flow/scorer/ragas.py +++ b/weave/flow/scorer/ragas.py @@ -160,14 +160,17 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: if __name__ == "__main__": import os - import weave + + try: from weave.flow.scorer.lightllm import import_client # Instantiate your LLM client OpenAIClient = import_client("openai") if OpenAIClient: - llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"]) # Replace with your API key + llm_client = OpenAIClient( + api_key=os.environ["OPENAI_API_KEY"] + ) # Replace with your API key else: raise ImportError("OpenAI client not available") @@ -175,8 +178,12 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: context_entity_recall_scorer = ContextEntityRecallScorer( client=llm_client, model="gpt-4o" ) - context_relevancy_scorer = ContextRelevancyScorer(client=llm_client, model="gpt-4") - context_precision_scorer = ContextPrecisionScorer(client=llm_client, model="gpt-4") + context_relevancy_scorer = ContextRelevancyScorer( + client=llm_client, model="gpt-4" + ) + context_precision_scorer = ContextPrecisionScorer( + client=llm_client, model="gpt-4" + ) faithfulness_scorer = FaithfulnessScorer(client=llm_client, model="gpt-4") answer_similarity_scorer = AnswerSimilarityScorer( client=llm_client, model="text-embedding-ada-002" @@ -187,8 +194,16 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: # Create your dataset of examples examples = [ - {"question": "What is the capital of France?", "expected": "Paris", "context": "Paris is the capital of France."}, - {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee", "context": "Harper Lee is the author of 'To Kill a Mockingbird'."}, + { + "question": "What is the capital of France?", + "expected": "Paris", + "context": "Paris is the capital of France.", + }, + { + "question": "Who wrote 'To Kill a Mockingbird'?", + "expected": "Harper Lee", + "context": "Harper Lee is the author of 'To Kill a Mockingbird'.", + }, # Add more examples as needed ] @@ -205,17 +220,32 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: model_output = {"answer": example["expected"]} # Simulate model output for scorer in scorers: if isinstance(scorer, ContextEntityRecallScorer): - score = scorer.score(model_output, example["expected"], example["context"]) + score = scorer.score( + model_output, example["expected"], example["context"] + ) elif isinstance(scorer, ContextRelevancyScorer): - score = scorer.score(model_output, example["question"], example["context"]) + score = scorer.score( + model_output, example["question"], example["context"] + ) elif isinstance(scorer, ContextPrecisionScorer): - score = scorer.score(model_output, example["question"], example["expected"], example["context"]) + score = scorer.score( + model_output, + example["question"], + example["expected"], + example["context"], + ) elif isinstance(scorer, FaithfulnessScorer): - score = scorer.score(model_output, example["expected"], example["context"]) 
+ score = scorer.score( + model_output, example["expected"], example["context"] + ) elif isinstance(scorer, AnswerSimilarityScorer): score = scorer.score(model_output, example["expected"]) elif isinstance(scorer, AnswerCorrectnessScorer): - score = scorer.score(model_output, example["question"], example["expected"]) - print(f"{scorer.__class__.__name__} score for '{example['question']}': {score}") + score = scorer.score( + model_output, example["question"], example["expected"] + ) + print( + f"{scorer.__class__.__name__} score for '{example['question']}': {score}" + ) except Exception as e: - print(e) \ No newline at end of file + print(e) diff --git a/weave/flow/scorer/regex_scorer.py b/weave/flow/scorer/regex_scorer.py index 0fdd7527017b..e860f5d7561a 100644 --- a/weave/flow/scorer/regex_scorer.py +++ b/weave/flow/scorer/regex_scorer.py @@ -1,14 +1,16 @@ -from weave.flow.scorer.base_scorer import Scorer - -from typing import Union import re +from typing import Union + from pydantic import Field -import weave +import weave from weave.flow.scorer.base_scorer import Scorer + class RegexScorer(Scorer): - patterns: Union[str, list[str]] = Field(default_factory=list, description="The patterns or keywords to match") + patterns: Union[str, list[str]] = Field( + default_factory=list, description="The patterns or keywords to match" + ) ignore_case: bool = True ignore_whitespace: bool = False use_regex: bool = False # Use regex patterns if True @@ -16,7 +18,9 @@ class RegexScorer(Scorer): target_column: str = Field(default="target", description="The class name to match") @weave.op - def score(self, model_output: Union[dict, str], target: Union[str, list[str], None] = None) -> dict: + def score( + self, model_output: Union[dict, str], target: Union[str, list[str], None] = None + ) -> dict: if isinstance(model_output, str): model_output = {"output": model_output} @@ -31,15 +35,17 @@ def score(self, model_output: Union[dict, str], target: Union[str, list[str], No if not self.use_regex: pattern = re.escape(pattern) if self.ignore_whitespace: - pattern = ''.join(pattern.split()) + pattern = "".join(pattern.split()) if self.match_full_string: - pattern = f'^{pattern}$' + pattern = f"^{pattern}$" compiled_patterns.append(re.compile(pattern, flags=flags)) text_to_search = model_output.get("output") if model_output else "" if self.ignore_whitespace: - text_to_search = ''.join(text_to_search.split()) + text_to_search = "".join(text_to_search.split()) - match_found = any(pattern.search(text_to_search) for pattern in compiled_patterns) + match_found = any( + pattern.search(text_to_search) for pattern in compiled_patterns + ) - return {"string_match": match_found} \ No newline at end of file + return {"string_match": match_found} diff --git a/weave/flow/scorer/xml_scorer.py b/weave/flow/scorer/xml_scorer.py index 512445fa4367..04a161d53525 100644 --- a/weave/flow/scorer/xml_scorer.py +++ b/weave/flow/scorer/xml_scorer.py @@ -1,13 +1,12 @@ import xml.etree.ElementTree as ET -from typing import Any, Union +from typing import Union from weave.flow.scorer.base_scorer import Scorer class XMLScorer(Scorer): - """ - Score an XML string. 
- """ + """Score an XML string.""" + def score(self, model_output: Union[str, dict]) -> dict: if isinstance(model_output, dict): xml_string = model_output.get("output", "") @@ -23,7 +22,11 @@ def score(self, model_output: Union[str, dict]) -> dict: if __name__ == "__main__": scorer = XMLScorer() - print(scorer.score(""" + print( + scorer.score( + """ San Francisco USA - """)) \ No newline at end of file + """ + ) + ) From a22c76bf028fe97a7507080b0eba0ec0e86fc667 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 11:50:18 +0200 Subject: [PATCH 020/150] pass through dataset row --- weave/flow/eval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 281bb98bec15..759215a00d92 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -204,6 +204,8 @@ async def predict_and_score( score_signature = inspect.signature(score_fn) score_arg_names = list(score_signature.parameters.keys()) + # TODO: Check for input columns parameters in the signature of the scorer + if "model_output" not in score_arg_names: raise OpCallError( f"Scorer {scorer_name} must have a 'model_output' argument, to receive the output of the model function." @@ -211,6 +213,7 @@ async def predict_and_score( if isinstance(example, dict): score_args = {k: v for k, v in example.items() if k in score_arg_names} + score_args.update({"dataset_row": example}) # TODO: investigate deduplication of dataset_row for performance else: if len(score_arg_names) == 2: score_args = {score_arg_names[0]: example} From 8050e8b1d2ff61f363d63a6e9aa202620de2b7d8 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 11:50:51 +0200 Subject: [PATCH 021/150] add string match --- weave/flow/scorer/regex_scorer.py | 34 +++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/weave/flow/scorer/regex_scorer.py b/weave/flow/scorer/regex_scorer.py index e860f5d7561a..1a4e743fe0b8 100644 --- a/weave/flow/scorer/regex_scorer.py +++ b/weave/flow/scorer/regex_scorer.py @@ -1,11 +1,20 @@ import re -from typing import Union +from typing import Union, List, Any from pydantic import Field import weave from weave.flow.scorer.base_scorer import Scorer +class StringScorer(Scorer): + """ + Scorer that checks if the model output string is found in the search columns of the dataset row. 
+ """ + target_columns: List[str] = Field(default_factory=list, description="The names of the columns that are used as input to the scorer") + + def score(self, model_output: Any, dataset_row: dict) -> dict: + string_in_input = any([model_output.lower() in input.lower() for k, input in dataset_row.items() if k in self.target_columns]) + return {"string_in_input": string_in_input} class RegexScorer(Scorer): patterns: Union[str, list[str]] = Field( @@ -13,7 +22,6 @@ class RegexScorer(Scorer): ) ignore_case: bool = True ignore_whitespace: bool = False - use_regex: bool = False # Use regex patterns if True match_full_string: bool = False # Match the entire string if True target_column: str = Field(default="target", description="The class name to match") @@ -49,3 +57,25 @@ def score( ) return {"string_match": match_found} + + + +if __name__ == "__main__": + import asyncio + + scorer = StringScorer(target_columns=["col1", "col2"]) + + @weave.op + def f(col1, col2): + return "Hello" + + model_output = f(col1="hello", col2="world") + dataset_row = {"col1": "Hello my name is Morgan", "col2": "I am an engineer"} + print(scorer.score(model_output=model_output, dataset_row=dataset_row)) + + dataset = [{"col1": "Hello my name is Morgan", "col2": "I am an engineer", "target": "Morgan"}, + {"col1": "Hello my name is John", "col2": "I am a doctor", "target": "John"}] + + evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer]) + + eval_out = asyncio.run(evaluation.evaluate(f)) \ No newline at end of file From bed6017529fd3c4aa869be99246d19c313443468 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 13:54:23 +0200 Subject: [PATCH 022/150] hallucination and llm refactor --- weave/flow/scorer/__init__.py | 2 +- weave/flow/scorer/hallucination.py | 6 - weave/flow/scorer/hallucination_scorer.py | 87 +++++++ weave/flow/scorer/lightllm.py | 265 ---------------------- weave/flow/scorer/llm.py | 264 +++++++++++++++++++++ weave/flow/scorer/llm_scorer.py | 72 ++---- weave/flow/scorer/ragas.py | 56 +++-- weave/flow/scorer/utils.py | 35 +++ 8 files changed, 438 insertions(+), 349 deletions(-) delete mode 100644 weave/flow/scorer/hallucination.py create mode 100644 weave/flow/scorer/hallucination_scorer.py delete mode 100644 weave/flow/scorer/lightllm.py create mode 100644 weave/flow/scorer/llm.py create mode 100644 weave/flow/scorer/utils.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 946f6967be6c..209d520d126a 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,6 +1,6 @@ from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes from weave.flow.scorer.classification import MultiTaskBinaryClassificationF1, transpose -from weave.flow.scorer.hallucination import HallucinationScorer +from weave.flow.scorer.hallucination_scorer import HallucinationScorer from weave.flow.scorer.json_scorer import JSONScorer from weave.flow.scorer.llm_scorer import ( EmbeddingSimilarityScorer, diff --git a/weave/flow/scorer/hallucination.py b/weave/flow/scorer/hallucination.py deleted file mode 100644 index 0e3fda5d71f8..000000000000 --- a/weave/flow/scorer/hallucination.py +++ /dev/null @@ -1,6 +0,0 @@ -from weave.flow.scorer.llm_scorer import PromptScorer - - -class HallucinationScorer(PromptScorer): - def score(self, model_output, target): - return super().score(model_output, target) diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py new file mode 100644 index 
000000000000..ceacad26a4c0 --- /dev/null +++ b/weave/flow/scorer/hallucination_scorer.py @@ -0,0 +1,87 @@ +from pydantic import BaseModel, Field + + +import weave +from weave.flow.scorer.utils import stringify +from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorer.llm import OPENAI_DEFAULT_MODEL + + +DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" +DEFAULT_USER_PROMPT = """Given some input_data and a model_output, determine if the model_output is a hallucination of the input_data. +## Input data + +{input_data} + + +## Model output + +{model_output} + + +## Instructions +Think step by step before answering. Is the model_output an factually and logically consistent with the input_data? +""" + +class HallucinationResponse(BaseModel): + chain_of_thought: str = Field(description="Think step by step about whether the model_output is a hallucination of the dataset_row") + is_hallucination: bool = Field(description="Whether the model output is a hallucination of the dataset row") + +class HallucinationScorer(LLMScorer): + """ + Scorer that checks if the model output is a hallucination of the dataset row. + """ + system_prompt: str = DEFAULT_SYSTEM_PROMPT + user_prompt: str = DEFAULT_USER_PROMPT + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 4096 + input_data_columns: list[str] = Field(description="The columns of the input data to use as ground truth") + + @weave.op + def score(self, model_output: str, dataset_row: dict) -> HallucinationResponse: + + model_output = stringify(model_output) + + input_data = {k: stringify(v) for k, v in dataset_row.items() if k in self.input_data_columns} + + response = self.client.chat.completions.create( + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": self.user_prompt.format(input_data=input_data, model_output=model_output)}, + ], + model=self.model_id, + response_model=HallucinationResponse, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + return response + + +if __name__ == "__main__": + try: + import openai, os, weave, asyncio + + weave.init("hallucination-scorer-2") + + openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + scorer = HallucinationScorer(client=openai_client, input_data_columns=["text"]) + + model_output = "John favorite cheese is camembert" + dataset_row = {"text": "John doesn't like cheese"} + response = scorer.score(model_output, dataset_row) + print(response) + + @weave.op + def model(): + return "John favorite food is apples" + + dataset = [{"text": "John doesn't like cheese"}, + {"text": "John likes pizza"}] + + evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer]) + asyncio.run(evaluation.evaluate(model)) + + except Exception as e: + print(e) + \ No newline at end of file diff --git a/weave/flow/scorer/lightllm.py b/weave/flow/scorer/lightllm.py deleted file mode 100644 index 92e97c47913b..000000000000 --- a/weave/flow/scorer/lightllm.py +++ /dev/null @@ -1,265 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Union - -OPENAI_DEFAULT_MODEL = "gpt-4o" -OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" - -ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20240620" - 
-MISTRAL_DEFAULT_MODEL = "mistral-large-latest" -MISTRAL_DEFAULT_EMBEDDING_MODEL = "mistral-embed" - - -_LLM_CLIENT_TYPES = [] - -try: - from openai import AsyncOpenAI, OpenAI - - _LLM_CLIENT_TYPES.append(OpenAI) - _LLM_CLIENT_TYPES.append(AsyncOpenAI) -except: - pass -try: - from anthropic import Anthropic, AsyncAnthropic - - _LLM_CLIENT_TYPES.append(Anthropic) - _LLM_CLIENT_TYPES.append(AsyncAnthropic) -except: - pass -try: - from mistralai import Mistral - - _LLM_CLIENT_TYPES.append(Mistral) -except: - pass - -_LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] - - -class BaseLLM(ABC): - def __init__(self, client: Any, model_id: str): - self.client = client - self.model_id = model_id - - @abstractmethod - def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - pass - - @abstractmethod - async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - pass - - @abstractmethod - def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - pass - - @abstractmethod - async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - pass - - -class MistralLLM(BaseLLM): - def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = self.client.chat.complete( - model=self.model_id, messages=messages, **kwargs - ) - return response.choices[0].message.content - - async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = await self.client.chat.complete( - model=self.model_id, messages=messages, **kwargs - ) - return response.choices[0].message.content - - def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - if isinstance(texts, str): - texts = [texts] - response = self.client.embeddings.create(model=self.model_id, inputs=texts) - return [embedding.embedding for embedding in response.data] - - async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - if isinstance(texts, str): - texts = [texts] - response = await self.client.embeddings.create( - model=self.model_id, inputs=texts - ) - return [embedding.embedding for embedding in response.data] - - -class OpenAILLM(BaseLLM): - def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = self.client.chat.completions.create( - model=self.model_id, messages=messages, **kwargs - ) - return response.choices[0].message.content - - async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - response = await self.client.chat.completions.create( - model=self.model_id, messages=messages, **kwargs - ) - return response.choices[0].message.content - - def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - if isinstance(texts, str): - texts = [texts] - response = self.client.embeddings.create(input=texts, model=self.model_id) - return [data.embedding for data in response.data] - - async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - if isinstance(texts, str): - texts = [texts] - response = await self.client.embeddings.create(input=texts, model=self.model_id) - return [data.embedding for data in response.data] - - -class AnthropicLLM(BaseLLM): - def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: - system_message = next( - (msg["content"] for msg in messages if msg["role"] == "system"), None - ) - user_messages = [msg for msg in messages if msg["role"] != "system"] - response = self.client.messages.create( - model=self.model_id, - messages=user_messages, - system=system_message, - max_tokens=2048, - **kwargs, 
- ) - return response.content - - async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: - system_message = next( - (msg["content"] for msg in messages if msg["role"] == "system"), None - ) - user_messages = [msg for msg in messages if msg["role"] != "system"] - response = await self.client.messages.create( - model=self.model_id, - messages=user_messages, - system=system_message, - max_tokens=2048, - **kwargs, - ) - return response.content - - def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - return [[0.0]] # Anthropic doesn't support embeddings - - async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: - return [[0.0]] # Anthropic doesn't support embeddings - - -class LLMFactory: - @staticmethod - def create(client: Any, model_id: str) -> BaseLLM: - client_type = type(client).__name__.lower() - if "mistral" in client_type: - return MistralLLM(client, model_id) - elif "openai" in client_type: - return OpenAILLM(client, model_id) - elif "anthropic" in client_type: - return AnthropicLLM(client, model_id) - else: - raise ValueError(f"Unsupported client type: {client_type}") - - -# Helper function for dynamic imports -def import_client(provider: str): - try: - if provider == "mistral": - from mistralai import Mistral - - return Mistral - elif provider == "openai": - from openai import OpenAI - - return OpenAI - elif provider == "anthropic": - import anthropic - - return anthropic.Anthropic - except ImportError: - return None - - -# Example usage: -if __name__ == "__main__": - import asyncio - import os - - # Mistral example - MistralClient = import_client("mistral") - if MistralClient: - mistral_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) - mistral_llm = LLMFactory.create(mistral_client, MISTRAL_DEFAULT_MODEL) - mistral_response = mistral_llm.chat( - [{"role": "user", "content": "What is the best French cheese?"}] - ) - print("Mistral response:", mistral_response) - - # OpenAI example with system message - OpenAIClient = import_client("openai") - if OpenAIClient: - openai_client = OpenAIClient() - openai_llm = LLMFactory.create(openai_client, OPENAI_DEFAULT_MODEL) - openai_response = openai_llm.chat( - [ - { - "role": "system", - "content": "You are a helpful assistant specialized in writing poetry.", - }, - { - "role": "user", - "content": "Write a haiku about recursion in programming.", - }, - ] - ) - print("OpenAI response:", openai_response) - - # Anthropic example with system message - AnthropicClient = import_client("anthropic") - if AnthropicClient: - anthropic_client = AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")) - anthropic_llm = LLMFactory.create(anthropic_client, ANTHROPIC_DEFAULT_MODEL) - anthropic_response = anthropic_llm.chat( - [ - { - "role": "system", - "content": "You are Claude, an AI assistant created by Anthropic.", - }, - {"role": "user", "content": "Hello, Claude"}, - ] - ) - print("Anthropic response:", anthropic_response) - - # Embedding example - if MistralClient: - mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) - mistral_embed_llm = LLMFactory.create( - mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL - ) - mistral_embeddings = mistral_embed_llm.embed( - ["Embed this sentence.", "As well as this one."] - ) - print("Mistral embeddings:", mistral_embeddings) - - # Async example with system message - async def async_example(): - if OpenAIClient: - from openai import AsyncOpenAI - - openai_async_client = AsyncOpenAI() - 
openai_async_llm = LLMFactory.create( - openai_async_client, OPENAI_DEFAULT_MODEL - ) - openai_async_response = await openai_async_llm.achat( - [ - { - "role": "system", - "content": "You are a philosopher AI assistant.", - }, - {"role": "user", "content": "What's the meaning of life?"}, - ] - ) - print("OpenAI async response:", openai_async_response) - - asyncio.run(async_example()) diff --git a/weave/flow/scorer/llm.py b/weave/flow/scorer/llm.py new file mode 100644 index 000000000000..a38ef4e174ef --- /dev/null +++ b/weave/flow/scorer/llm.py @@ -0,0 +1,264 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Union + +import instructor + +from weave.trace.autopatch import autopatch + +autopatch() # fix instrucor tracing + +# TODO: Gemini + +OPENAI_DEFAULT_MODEL = "gpt-4o" +OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" + +ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20240620" + +MISTRAL_DEFAULT_MODEL = "mistral-large-latest" +MISTRAL_DEFAULT_EMBEDDING_MODEL = "mistral-embed" + +DEFAULT_MAX_TOKENS = 4096 + + +_LLM_CLIENT_TYPES = [] + +try: + from openai import AsyncOpenAI, OpenAI + + _LLM_CLIENT_TYPES.append(OpenAI) + _LLM_CLIENT_TYPES.append(AsyncOpenAI) +except: + pass +try: + from anthropic import Anthropic, AsyncAnthropic + + _LLM_CLIENT_TYPES.append(Anthropic) + _LLM_CLIENT_TYPES.append(AsyncAnthropic) +except: + pass +try: + from mistralai import Mistral + + _LLM_CLIENT_TYPES.append(Mistral) +except: + pass + +_LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] + + +# class EmbeddingLLM(ABC): +# def __init__(self, client: Any, model_id: str): +# self.client = client +# self.model_id = model_id + +# @abstractmethod +# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# pass + +# @abstractmethod +# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# pass + +def instruct_client(client: _LLM_CLIENTS): + client_type = type(client).__name__.lower() + if "mistral" in client_type: + return instructor.from_mistral(client) + elif "openai" in client_type: + return instructor.from_openai(client) + elif "anthropic" in client_type: + return instructor.from_anthropic(client) + else: + raise ValueError(f"Unsupported client type: {client_type}") + + + +# class MistralLLM(LLM): +# def model_post_init(self): +# try: +# import instructor +# self.llm = instructor.from_mistral(self) +# except ImportError: +# raise ImportError("instructor is required to use InstructorMistralLLM\nYou can install it with `pip install instructor`") + +# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# if isinstance(texts, str): +# texts = [texts] +# response = self.client.embeddings.create(model=self.model_id, inputs=texts) +# return [embedding.embedding for embedding in response.data] + +# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# if isinstance(texts, str): +# texts = [texts] +# response = await self.client.embeddings.create( +# model=self.model_id, inputs=texts +# ) +# return [embedding.embedding for embedding in response.data] + +# class OpenAILLM(BaseLLM): +# def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: +# response = self.client.chat.completions.create( +# model=self.model_id, messages=messages, **kwargs +# ) +# return response.choices[0].message.content + +# async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: +# response = await self.client.chat.completions.create( +# model=self.model_id, messages=messages, 
**kwargs +# ) +# return response.choices[0].message.content + +# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# if isinstance(texts, str): +# texts = [texts] +# response = self.client.embeddings.create(input=texts, model=self.model_id) +# return [data.embedding for data in response.data] + +# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# if isinstance(texts, str): +# texts = [texts] +# response = await self.client.embeddings.create(input=texts, model=self.model_id) +# return [data.embedding for data in response.data] + + +# class AnthropicLLM(BaseLLM): +# def chat(self, messages: List[Dict[str, str]], max_tokens=4096, **kwargs) -> str: +# system_message = next( +# (msg["content"] for msg in messages if msg["role"] == "system"), None +# ) +# user_messages = [msg for msg in messages if msg["role"] != "system"] +# response = self.client.messages.create( +# model=self.model_id, +# messages=user_messages, +# system=system_message, +# max_tokens=max_tokens, +# **kwargs, +# ) +# return response.content + +# async def achat(self, messages: List[Dict[str, str]], max_tokens=4096, **kwargs) -> str: +# system_message = next( +# (msg["content"] for msg in messages if msg["role"] == "system"), None +# ) +# user_messages = [msg for msg in messages if msg["role"] != "system"] +# response = await self.client.messages.create( +# model=self.model_id, +# messages=user_messages, +# system=system_message, +# max_tokens=max_tokens, +# **kwargs, +# ) +# return response.content + +# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# return [[0.0]] # Anthropic doesn't support embeddings + +# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: +# return [[0.0]] # Anthropic doesn't support embeddings + +# Helper function for dynamic imports +def import_client(provider: str): + try: + if provider == "mistral": + from mistralai import Mistral + + return Mistral + elif provider == "openai": + from openai import OpenAI + + return OpenAI + elif provider == "anthropic": + import anthropic + + return anthropic.Anthropic + except ImportError: + return None + + +# Example usage: +if __name__ == "__main__": + import asyncio + import os + + # Mistral example + MistralClient = import_client("mistral") + if MistralClient: + mistral_client = instruct_client(Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))) + mistral_response = mistral_client.chat.completions.create( + messages=[{"role": "user", "content": "What is the best French cheese?"}], + model=MISTRAL_DEFAULT_MODEL, + max_tokens=DEFAULT_MAX_TOKENS, + response_model=str, + ) + print("Mistral response:", mistral_response) + + # OpenAI example with system message + OpenAIClient = import_client("openai") + if OpenAIClient: + openai_client = instruct_client(OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))) + openai_response = openai_client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant specialized in writing poetry.", + }, + { + "role": "user", + "content": "Write a haiku about recursion in programming.", + }, + ], + model=OPENAI_DEFAULT_MODEL, + max_tokens=DEFAULT_MAX_TOKENS, + response_model=str, + ) + print("OpenAI response:", openai_response) + + # Anthropic example with system message + AnthropicClient = import_client("anthropic") + if AnthropicClient: + anthropic_client = instruct_client(AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY"))) + anthropic_response = 
anthropic_client.messages.create( + messages=[ + { + "role": "system", + "content": "You are Claude, an AI assistant created by Anthropic.", + }, + {"role": "user", "content": "Hello, Claude"}, + ], + model=ANTHROPIC_DEFAULT_MODEL, + max_tokens=DEFAULT_MAX_TOKENS, + response_model=str, + ) + print("Anthropic response:", anthropic_response) + + # Embedding example + # if MistralClient: + # mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) + # mistral_embed_llm = LLMFactory.create( + # mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL + # ) + # mistral_embeddings = mistral_embed_llm.embed( + # ["Embed this sentence.", "As well as this one."] + # ) + # print("Mistral embeddings:", mistral_embeddings) + + # # Async example with system message + # async def async_example(): + # if OpenAIClient: + # from openai import AsyncOpenAI + + # openai_async_client = AsyncOpenAI() + # openai_async_llm = LLMFactory.create( + # openai_async_client, OPENAI_DEFAULT_MODEL + # ) + # openai_async_response = await openai_async_llm.achat( + # [ + # { + # "role": "system", + # "content": "You are a philosopher AI assistant.", + # }, + # {"role": "user", "content": "What's the meaning of life?"}, + # ] + # ) + # print("OpenAI async response:", openai_async_response) + + # asyncio.run(async_example()) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index e8022dda662e..46eae4b85ac9 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -1,24 +1,25 @@ -from typing import Any +import json +from typing import Any, Type import numpy as np -from pydantic import Field, field_validator +from pydantic import BaseModel, Field, field_validator from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.lightllm import _LLM_CLIENT_TYPES, LLMFactory +from weave.flow.scorer.llm import instruct_client, OPENAI_DEFAULT_MODEL, _LLM_CLIENT_TYPES try: from openai import AsyncOpenAI, OpenAI except: pass - - class LLMScorer(Scorer): """Score an LLM output.""" client: Any = Field( description="The LLM client to use, has to be instantiated with an api_key" ) - model: str = Field(description="The model to use") + model_id: str = Field(description="The model to use") + temperature: float = Field(..., description="The temperature to use for the response") + max_tokens: int = Field(..., description="The maximum number of tokens in the response") @field_validator("client") def validate_client(cls, v): @@ -26,35 +27,7 @@ def validate_client(cls, v): raise ValueError( f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" ) - return v - - -class PromptScorer(LLMScorer): - """Score an LLM output based on the prompt.""" - - system_prompt: str = Field( - default="You are a helpful assistant.", description="The system prompt to use" - ) - user_prompt: str = Field(description="The user prompt to use") - - @field_validator("user_prompt") - def validate_user_prompt(cls, v): - if "{model_output}" not in v: - raise ValueError( - "The user prompt must contain the `{model_output}` variable." 
- ) - return v - - def score(self, model_output: Any) -> Any: - llm = LLMFactory.create(self.client, self.model) - messages = [ - {"role": "system", "content": self.system_prompt}, - { - "role": "user", - "content": self.user_prompt.format(model_output=model_output), - }, - ] - return llm.chat(messages=messages) + return instruct_client(v) class EmbeddingSimilarityScorer(LLMScorer): @@ -66,9 +39,10 @@ def score(self, model_output: Any, target: Any) -> Any: ) return self.cosine_similarity(model_embedding, target_embedding) - def _compute_embeddings(self, model_output: str, target: str) -> list[float]: - llm = LLMFactory.create(self.client, self.model) - return llm.embed([model_output, target]) + def _compute_embeddings(self, model_output: str, target: str) -> tuple[list[float], list[float]]: + llm = instruct_client(self.client) + embeddings = llm.embed([model_output, target]) + return embeddings[0], embeddings[1] def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: """Compute the cosine similarity between two vectors.""" @@ -88,7 +62,7 @@ def score(self, model_output: Any) -> Any: raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") response = self.client.moderations.create( - model=self.model, + model=self.model_id, input=model_output, ).results[0] categories = {k: v for k, v in response.categories.dict().items() if v} @@ -101,7 +75,7 @@ def score(self, model_output: Any) -> Any: client = openai.OpenAI() scorer = EmbeddingSimilarityScorer( - client=client, model="text-embedding-3-small" + client=client, model_id="text-embedding-3-small" ) print(scorer.score("I don't know", "I don't know")) except Exception as e: @@ -111,21 +85,7 @@ def score(self, model_output: Any) -> Any: import openai client = openai.OpenAI() - scorer = OpenAIModerationScorer(client=client, model="omni-moderation-latest") + scorer = OpenAIModerationScorer(client=client, model_id="omni-moderation-latest") print(scorer.score("I should kill myself")) except Exception as e: - print("Install openai to run this script") - - try: - import openai - - client = openai.OpenAI() - scorer = PromptScorer( - client=client, - model="gpt-4o", - system_prompt="You are a helpful assistant.", - user_prompt="Extract the entity from this phrase: \n {model_output}", - ) - print(scorer.score("The cat is happy")) - except Exception as e: - print("Install openai to run this script") + print("Install openai to run this script") \ No newline at end of file diff --git a/weave/flow/scorer/ragas.py b/weave/flow/scorer/ragas.py index cff8fbea12ad..f8ff1263e69a 100644 --- a/weave/flow/scorer/ragas.py +++ b/weave/flow/scorer/ragas.py @@ -1,11 +1,14 @@ -# implememting metrics from ragas: https://github.com/explodinggradients/ragas +# implementing metrics from ragas: https://github.com/explodinggradients/ragas from typing import Any, List - -from weave.flow.scorer.lightllm import LLMFactory +from pydantic import BaseModel +from weave.flow.scorer.llm import instruct_client from weave.flow.scorer.llm_scorer import EmbeddingSimilarityScorer, LLMScorer +class EntityExtractionResponse(BaseModel): + entities: List[str] + class ContextEntityRecallScorer(LLMScorer): """ Estimates context recall by extracting entities from the expected answer and @@ -17,13 +20,15 @@ class ContextEntityRecallScorer(LLMScorer): Text: {text} Entities: - """ - + """ def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities - llm = LLMFactory.create(self.client, self.model) + llm = instruct_client(self.client) 
prompt = self.extraction_prompt.format(text=text) - response = llm.chat(messages=[{"role": "user", "content": prompt}]) + response = llm.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + response_model=str + ) # Assume entities are returned as a comma-separated list entities = [e.strip() for e in response.split(",")] return entities @@ -52,9 +57,12 @@ class ContextRelevancyScorer(LLMScorer): """ def score(self, model_output: Any, input_text: str, context: str) -> float: - llm = LLMFactory.create(self.client, self.model) + llm = instruct_client(self.client) prompt = self.relevancy_prompt.format(question=input_text, context=context) - response = llm.chat(messages=[{"role": "user", "content": prompt}]) + response = llm.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + response_model=str + ) # Parse the response to get the relevancy score try: score = float(response.strip()) @@ -79,11 +87,14 @@ class ContextPrecisionScorer(LLMScorer): def score( self, model_output: Any, input_text: str, expected: str, context: str ) -> float: - llm = LLMFactory.create(self.client, self.model) + llm = instruct_client(self.client) prompt = self.precision_prompt.format( question=input_text, answer=expected, context=context ) - response = llm.chat(messages=[{"role": "user", "content": prompt}]) + response = llm.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + response_model=str + ) # Parse the response to get the verdict try: verdict = int(response.strip()) @@ -104,10 +115,13 @@ class FaithfulnessScorer(LLMScorer): """ def score(self, model_output: Any, expected: str, context: str) -> float: - llm = LLMFactory.create(self.client, self.model) + llm = instruct_client(self.client) answer = model_output.get("answer", "") prompt = self.faithfulness_prompt.format(answer=answer, context=context) - response = llm.chat(messages=[{"role": "user", "content": prompt}]) + response = llm.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + response_model=str + ) # Parse the response to get the faithfulness score try: score = float(response.strip()) @@ -142,14 +156,17 @@ class AnswerCorrectnessScorer(LLMScorer): """ def score(self, model_output: Any, input_text: str, expected: str) -> float: - llm = LLMFactory.create(self.client, self.model) + llm = instruct_client(self.client) generated_answer = model_output.get("answer", "") prompt = self.correctness_prompt.format( question=input_text, generated_answer=generated_answer, ground_truth=expected, ) - response = llm.chat(messages=[{"role": "user", "content": prompt}]) + response = llm.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + response_model=str + ) # Parse the response to get the correctness score try: score = float(response.strip()) @@ -161,16 +178,13 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: if __name__ == "__main__": import os - try: - from weave.flow.scorer.lightllm import import_client + from weave.flow.scorer.llm import import_client # Instantiate your LLM client OpenAIClient = import_client("openai") if OpenAIClient: - llm_client = OpenAIClient( - api_key=os.environ["OPENAI_API_KEY"] - ) # Replace with your API key + llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"]) else: raise ImportError("OpenAI client not available") @@ -248,4 +262,4 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: f"{scorer.__class__.__name__} score for '{example['question']}': 
{score}" ) except Exception as e: - print(e) + print(e) \ No newline at end of file diff --git a/weave/flow/scorer/utils.py b/weave/flow/scorer/utils.py new file mode 100644 index 000000000000..3c70a1a023ed --- /dev/null +++ b/weave/flow/scorer/utils.py @@ -0,0 +1,35 @@ +import json +from typing import Any + +from pydantic import BaseModel + + +def stringify(model_output: Any) -> str: + if isinstance(model_output, str): + return model_output + elif isinstance(model_output, (list, tuple)): + return json.dumps(model_output, indent=2) + elif isinstance(model_output, dict): + return json.dumps(model_output, indent=2) + elif isinstance(model_output, BaseModel): + return model_output.model_dump_json(indent=2) + else: + raise ValueError(f"Unsupported model output type: {type(model_output)}") + +if __name__ == "__main__": + # test + model_output = "hey" + print(stringify(model_output)) + + model_output = [1, 2, 3] + print(stringify(model_output)) + + model_output = {"a": 1, "b": 2} + print(stringify(model_output)) + + class TestModel(BaseModel): + a: int + b: str + + model_output = TestModel(a=1, b="test") + print(stringify(model_output)) From 5b7d7e55474acb3905c22c6b24728e219c90837c Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 15:20:09 +0200 Subject: [PATCH 023/150] fix embed --- weave/flow/scorer/llm.py | 150 ++++++--------------------------------- 1 file changed, 20 insertions(+), 130 deletions(-) diff --git a/weave/flow/scorer/llm.py b/weave/flow/scorer/llm.py index a38ef4e174ef..cc648868cf2b 100644 --- a/weave/flow/scorer/llm.py +++ b/weave/flow/scorer/llm.py @@ -1,5 +1,4 @@ -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Union +from typing import List, Union, TypeVar import instructor @@ -43,21 +42,7 @@ except: pass -_LLM_CLIENTS = Union[tuple(_LLM_CLIENT_TYPES)] - - -# class EmbeddingLLM(ABC): -# def __init__(self, client: Any, model_id: str): -# self.client = client -# self.model_id = model_id - -# @abstractmethod -# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# pass - -# @abstractmethod -# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# pass +_LLM_CLIENTS = TypeVar(Union[tuple(_LLM_CLIENT_TYPES)]) def instruct_client(client: _LLM_CLIENTS): client_type = type(client).__name__.lower() @@ -70,91 +55,17 @@ def instruct_client(client: _LLM_CLIENTS): else: raise ValueError(f"Unsupported client type: {client_type}") +def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + client_type = type(client).__name__.lower() + if "mistral" in client_type: + response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) + return [embedding.embedding for embedding in response.data] + elif "openai" in client_type: + response = client.embeddings.create(model=model_id, input=texts, **kwargs) + return [embedding.embedding for embedding in response.data] + else: + raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") - -# class MistralLLM(LLM): -# def model_post_init(self): -# try: -# import instructor -# self.llm = instructor.from_mistral(self) -# except ImportError: -# raise ImportError("instructor is required to use InstructorMistralLLM\nYou can install it with `pip install instructor`") - -# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# if isinstance(texts, str): -# texts = [texts] -# response = self.client.embeddings.create(model=self.model_id, inputs=texts) 
-# return [embedding.embedding for embedding in response.data] - -# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# if isinstance(texts, str): -# texts = [texts] -# response = await self.client.embeddings.create( -# model=self.model_id, inputs=texts -# ) -# return [embedding.embedding for embedding in response.data] - -# class OpenAILLM(BaseLLM): -# def chat(self, messages: List[Dict[str, str]], **kwargs) -> str: -# response = self.client.chat.completions.create( -# model=self.model_id, messages=messages, **kwargs -# ) -# return response.choices[0].message.content - -# async def achat(self, messages: List[Dict[str, str]], **kwargs) -> str: -# response = await self.client.chat.completions.create( -# model=self.model_id, messages=messages, **kwargs -# ) -# return response.choices[0].message.content - -# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# if isinstance(texts, str): -# texts = [texts] -# response = self.client.embeddings.create(input=texts, model=self.model_id) -# return [data.embedding for data in response.data] - -# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# if isinstance(texts, str): -# texts = [texts] -# response = await self.client.embeddings.create(input=texts, model=self.model_id) -# return [data.embedding for data in response.data] - - -# class AnthropicLLM(BaseLLM): -# def chat(self, messages: List[Dict[str, str]], max_tokens=4096, **kwargs) -> str: -# system_message = next( -# (msg["content"] for msg in messages if msg["role"] == "system"), None -# ) -# user_messages = [msg for msg in messages if msg["role"] != "system"] -# response = self.client.messages.create( -# model=self.model_id, -# messages=user_messages, -# system=system_message, -# max_tokens=max_tokens, -# **kwargs, -# ) -# return response.content - -# async def achat(self, messages: List[Dict[str, str]], max_tokens=4096, **kwargs) -> str: -# system_message = next( -# (msg["content"] for msg in messages if msg["role"] == "system"), None -# ) -# user_messages = [msg for msg in messages if msg["role"] != "system"] -# response = await self.client.messages.create( -# model=self.model_id, -# messages=user_messages, -# system=system_message, -# max_tokens=max_tokens, -# **kwargs, -# ) -# return response.content - -# def embed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# return [[0.0]] # Anthropic doesn't support embeddings - -# async def aembed(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: -# return [[0.0]] # Anthropic doesn't support embeddings - # Helper function for dynamic imports def import_client(provider: str): try: @@ -231,34 +142,13 @@ def import_client(provider: str): print("Anthropic response:", anthropic_response) # Embedding example - # if MistralClient: - # mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) - # mistral_embed_llm = LLMFactory.create( - # mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL - # ) - # mistral_embeddings = mistral_embed_llm.embed( - # ["Embed this sentence.", "As well as this one."] - # ) - # print("Mistral embeddings:", mistral_embeddings) - - # # Async example with system message - # async def async_example(): - # if OpenAIClient: - # from openai import AsyncOpenAI + if OpenAIClient: + openai_embed_client = OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")) + openai_embeddings = embed(openai_embed_client, OPENAI_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as 
this one."]) + print("OpenAI embeddings:", openai_embeddings) - # openai_async_client = AsyncOpenAI() - # openai_async_llm = LLMFactory.create( - # openai_async_client, OPENAI_DEFAULT_MODEL - # ) - # openai_async_response = await openai_async_llm.achat( - # [ - # { - # "role": "system", - # "content": "You are a philosopher AI assistant.", - # }, - # {"role": "user", "content": "What's the meaning of life?"}, - # ] - # ) - # print("OpenAI async response:", openai_async_response) + if MistralClient: + mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) + mistral_embeddings = embed(mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."]) + print("Mistral embeddings:", mistral_embeddings) - # asyncio.run(async_example()) From 8cd5957bd6e98d01e634888f96e7b873b9506e6e Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 16:54:32 +0200 Subject: [PATCH 024/150] fix ragas --- weave/flow/scorer/__init__.py | 12 +- weave/flow/scorer/ragas.py | 250 ++++++++-------------------------- 2 files changed, 61 insertions(+), 201 deletions(-) diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 209d520d126a..7cf64581bf53 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -3,18 +3,14 @@ from weave.flow.scorer.hallucination_scorer import HallucinationScorer from weave.flow.scorer.json_scorer import JSONScorer from weave.flow.scorer.llm_scorer import ( - EmbeddingSimilarityScorer, LLMScorer, - OpenAIModerationScorer, ) +from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer from weave.flow.scorer.pydantic_scorer import PydanticScorer from weave.flow.scorer.ragas import ( - AnswerCorrectnessScorer, - AnswerSimilarityScorer, ContextEntityRecallScorer, - ContextPrecisionScorer, ContextRelevancyScorer, - FaithfulnessScorer, ) from weave.flow.scorer.regex_scorer import RegexScorer @@ -31,10 +27,6 @@ "OpenAIModerationScorer", "PydanticScorer", "HallucinationScorer", - "AnswerCorrectnessScorer", "ContextEntityRecallScorer", - "ContextPrecisionScorer", "ContextRelevancyScorer", - "FaithfulnessScorer", - "AnswerSimilarityScorer", ] diff --git a/weave/flow/scorer/ragas.py b/weave/flow/scorer/ragas.py index f8ff1263e69a..18f59af36b8c 100644 --- a/weave/flow/scorer/ragas.py +++ b/weave/flow/scorer/ragas.py @@ -1,185 +1,92 @@ # implementing metrics from ragas: https://github.com/explodinggradients/ragas -from typing import Any, List -from pydantic import BaseModel -from weave.flow.scorer.llm import instruct_client -from weave.flow.scorer.llm_scorer import EmbeddingSimilarityScorer, LLMScorer +from typing import List +from pydantic import BaseModel, Field +from textwrap import dedent +import weave +from weave.flow.scorer.llm_utils import instructor_client +from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer class EntityExtractionResponse(BaseModel): - entities: List[str] + entities: List[str] = Field(description="A list of unique entities extracted from the text") class ContextEntityRecallScorer(LLMScorer): """ - Estimates context recall by extracting entities from the expected answer and - the provided context, then computes the recall. + Estimates context recall by extracting entities from the model output + and the expected answer, then computes the recall. 
""" - extraction_prompt: str = """ + extraction_prompt: str = dedent(""" Extract unique entities from the following text without repetition. Text: {text} Entities: - """ + """) + answer_column: str = Field(description="The column in the dataset that contains the expected answer") + def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities - llm = instruct_client(self.client) + client = instructor_client(self.client) prompt = self.extraction_prompt.format(text=text) - response = llm.chat.completions.create( + response = client.chat.completions.create( messages=[{"role": "user", "content": prompt}], - response_model=str + response_model=EntityExtractionResponse, + model=self.model_id ) # Assume entities are returned as a comma-separated list - entities = [e.strip() for e in response.split(",")] + entities = [e.strip() for e in response.entities] return entities - - def score(self, model_output: Any, expected: str, context: str) -> float: + + @weave.op + def score(self, model_output: str, dataset_row: dict) -> float: # Extract entities - expected_entities = self.extract_entities(expected) - context_entities = self.extract_entities(context) + if self.answer_column not in dataset_row: + raise ValueError(f"Answer column {self.answer_column} not found in dataset_row") + expected_entities = self.extract_entities(model_output) + context_entities = self.extract_entities(dataset_row[self.answer_column]) # Calculate recall if not expected_entities: return 0.0 matches = set(expected_entities) & set(context_entities) recall = len(matches) / len(expected_entities) - return recall - + return {"recall": recall} +class RelevancyResponse(BaseModel): + reasoning: str = Field(description="Think step by step about whether the context is relevant to the question") + relevancy_score: int = Field(ge=0, le=1, description="The relevancy score of the context to the question (0 for not relevant, 1 for relevant)") class ContextRelevancyScorer(LLMScorer): - """Evaluates the relevancy of the provided context to the input question.""" + """Evaluates the relevancy of the provided context to the model output.""" - relevancy_prompt: str = """ + relevancy_prompt: str = dedent(""" Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. Question: {question} Context: {context} Relevancy Score (0-1): - """ - - def score(self, model_output: Any, input_text: str, context: str) -> float: - llm = instruct_client(self.client) - prompt = self.relevancy_prompt.format(question=input_text, context=context) - response = llm.chat.completions.create( - messages=[{"role": "user", "content": prompt}], - response_model=str - ) - # Parse the response to get the relevancy score - try: - score = float(response.strip()) - return max(0.0, min(score, 1.0)) # Ensure the score is between 0 and 1 - except ValueError: - return 0.0 # Return 0 if parsing fails - - -class ContextPrecisionScorer(LLMScorer): - """Determines whether the provided context was useful in arriving at the given answer.""" - - precision_prompt: str = """ - Given the question, answer, and context, determine if the context was useful in arriving at the answer. - Respond with 1 if useful, 0 if not. 
- - Question: {question} - Answer: {answer} - Context: {context} - Verdict (1 for useful, 0 for not useful): - """ - - def score( - self, model_output: Any, input_text: str, expected: str, context: str - ) -> float: - llm = instruct_client(self.client) - prompt = self.precision_prompt.format( - question=input_text, answer=expected, context=context - ) - response = llm.chat.completions.create( - messages=[{"role": "user", "content": prompt}], - response_model=str - ) - # Parse the response to get the verdict - try: - verdict = int(response.strip()) - return float(verdict) - except ValueError: - return 0.0 # Return 0 if parsing fails - - -class FaithfulnessScorer(LLMScorer): - """Measures the factual consistency of the generated answer against the provided context.""" - - faithfulness_prompt: str = """ - Compare the following answer and context for factual consistency. Rate the faithfulness on a scale from 0 to 1. - - Answer: {answer} - Context: {context} - Faithfulness Score (0-1): - """ - - def score(self, model_output: Any, expected: str, context: str) -> float: - llm = instruct_client(self.client) - answer = model_output.get("answer", "") - prompt = self.faithfulness_prompt.format(answer=answer, context=context) + """) + context_column: str = Field(description="The column in the dataset that contains the context") + + @weave.op + def score(self, model_output: str, dataset_row: dict) -> float: + if self.context_column not in dataset_row: + raise ValueError(f"Context column {self.context_column} not found in dataset_row") + context = dataset_row[self.context_column] + llm = instructor_client(self.client) + prompt = self.relevancy_prompt.format(question=model_output, context=context) response = llm.chat.completions.create( messages=[{"role": "user", "content": prompt}], - response_model=str + response_model=RelevancyResponse, + model=self.model_id ) - # Parse the response to get the faithfulness score - try: - score = float(response.strip()) - return max(0.0, min(score, 1.0)) - except ValueError: - return 0.0 # Return 0 if parsing fails - - -class AnswerSimilarityScorer(EmbeddingSimilarityScorer): - """Measures the similarity between the generated answer and the expected answer.""" - - def score(self, model_output: Any, expected: str) -> float: - generated_answer = model_output.get("answer", "") - return super().score(generated_answer, expected) - - -from typing import Any - -from weave.flow.scorer.llm_scorer import LLMScorer - - -class AnswerCorrectnessScorer(LLMScorer): - """Evaluates the correctness of the answer based on the ground truth.""" - - correctness_prompt: str = """ - Given the question, generated answer, and ground truth, rate the correctness of the answer on a scale from 0 to 1. 
- - Question: {question} - Generated Answer: {generated_answer} - Ground Truth: {ground_truth} - Correctness Score (0-1): - """ - - def score(self, model_output: Any, input_text: str, expected: str) -> float: - llm = instruct_client(self.client) - generated_answer = model_output.get("answer", "") - prompt = self.correctness_prompt.format( - question=input_text, - generated_answer=generated_answer, - ground_truth=expected, - ) - response = llm.chat.completions.create( - messages=[{"role": "user", "content": prompt}], - response_model=str - ) - # Parse the response to get the correctness score - try: - score = float(response.strip()) - return max(0.0, min(score, 1.0)) - except ValueError: - return 0.0 # Return 0 if parsing fails - - + return {"relevancy_score": response.relevancy_score} + if __name__ == "__main__": import os try: - from weave.flow.scorer.llm import import_client + from weave.flow.scorer.llm_utils import import_client # Instantiate your LLM client OpenAIClient = import_client("openai") @@ -190,22 +97,13 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: # Instantiate scorers context_entity_recall_scorer = ContextEntityRecallScorer( - client=llm_client, model="gpt-4o" + client=llm_client, model_id="gpt-4o", + answer_column="expected" ) context_relevancy_scorer = ContextRelevancyScorer( - client=llm_client, model="gpt-4" + client=llm_client, model_id="gpt-4o", + context_column="context" ) - context_precision_scorer = ContextPrecisionScorer( - client=llm_client, model="gpt-4" - ) - faithfulness_scorer = FaithfulnessScorer(client=llm_client, model="gpt-4") - answer_similarity_scorer = AnswerSimilarityScorer( - client=llm_client, model="text-embedding-ada-002" - ) - answer_correctness_scorer = AnswerCorrectnessScorer( - client=llm_client, model="gpt-4o" - ) - # Create your dataset of examples examples = [ { @@ -221,45 +119,15 @@ def score(self, model_output: Any, input_text: str, expected: str) -> float: # Add more examples as needed ] - scorers = [ - context_entity_recall_scorer, - context_relevancy_scorer, - context_precision_scorer, - faithfulness_scorer, - answer_similarity_scorer, - answer_correctness_scorer, - ] - for example in examples: model_output = {"answer": example["expected"]} # Simulate model output - for scorer in scorers: - if isinstance(scorer, ContextEntityRecallScorer): - score = scorer.score( - model_output, example["expected"], example["context"] - ) - elif isinstance(scorer, ContextRelevancyScorer): - score = scorer.score( - model_output, example["question"], example["context"] - ) - elif isinstance(scorer, ContextPrecisionScorer): - score = scorer.score( - model_output, - example["question"], - example["expected"], - example["context"], - ) - elif isinstance(scorer, FaithfulnessScorer): - score = scorer.score( - model_output, example["expected"], example["context"] - ) - elif isinstance(scorer, AnswerSimilarityScorer): - score = scorer.score(model_output, example["expected"]) - elif isinstance(scorer, AnswerCorrectnessScorer): - score = scorer.score( - model_output, example["question"], example["expected"] - ) - print( - f"{scorer.__class__.__name__} score for '{example['question']}': {score}" - ) + score = context_entity_recall_scorer.score( + model_output, example + ) + print(f"Context Entity Recall Score: {score}") + score = context_relevancy_scorer.score( + model_output, example + ) + print(f"Context Relevancy Score: {score}") except Exception as e: print(e) \ No newline at end of file From 
5dcde73583b31cf17afb6df63ea06e374eab1453 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 16:54:43 +0200 Subject: [PATCH 025/150] rename --- weave/flow/scorer/{llm.py => llm_utils.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename weave/flow/scorer/{llm.py => llm_utils.py} (93%) diff --git a/weave/flow/scorer/llm.py b/weave/flow/scorer/llm_utils.py similarity index 93% rename from weave/flow/scorer/llm.py rename to weave/flow/scorer/llm_utils.py index cc648868cf2b..5c910b645f13 100644 --- a/weave/flow/scorer/llm.py +++ b/weave/flow/scorer/llm_utils.py @@ -44,7 +44,7 @@ _LLM_CLIENTS = TypeVar(Union[tuple(_LLM_CLIENT_TYPES)]) -def instruct_client(client: _LLM_CLIENTS): +def instructor_client(client: _LLM_CLIENTS): client_type = type(client).__name__.lower() if "mistral" in client_type: return instructor.from_mistral(client) @@ -93,7 +93,7 @@ def import_client(provider: str): # Mistral example MistralClient = import_client("mistral") if MistralClient: - mistral_client = instruct_client(Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))) + mistral_client = instructor_client(Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))) mistral_response = mistral_client.chat.completions.create( messages=[{"role": "user", "content": "What is the best French cheese?"}], model=MISTRAL_DEFAULT_MODEL, @@ -105,7 +105,7 @@ def import_client(provider: str): # OpenAI example with system message OpenAIClient = import_client("openai") if OpenAIClient: - openai_client = instruct_client(OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))) + openai_client = instructor_client(OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))) openai_response = openai_client.chat.completions.create( messages=[ { @@ -126,7 +126,7 @@ def import_client(provider: str): # Anthropic example with system message AnthropicClient = import_client("anthropic") if AnthropicClient: - anthropic_client = instruct_client(AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY"))) + anthropic_client = instructor_client(AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY"))) anthropic_response = anthropic_client.messages.create( messages=[ { From a367096485fb34eaa9929c2cfee7b012c933f198 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 16:55:20 +0200 Subject: [PATCH 026/150] refactor LLMScorer, move stuff around --- weave/flow/scorer/hallucination_scorer.py | 2 +- weave/flow/scorer/llm_scorer.py | 81 +++++------------------ weave/flow/scorer/moderation_scorer.py | 41 ++++++++++++ weave/flow/scorer/similarity_score.py | 62 +++++++++++++++++ 4 files changed, 122 insertions(+), 64 deletions(-) create mode 100644 weave/flow/scorer/moderation_scorer.py create mode 100644 weave/flow/scorer/similarity_score.py diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py index ceacad26a4c0..6c5f8890d51f 100644 --- a/weave/flow/scorer/hallucination_scorer.py +++ b/weave/flow/scorer/hallucination_scorer.py @@ -4,7 +4,7 @@ import weave from weave.flow.scorer.utils import stringify from weave.flow.scorer.llm_scorer import LLMScorer -from weave.flow.scorer.llm import OPENAI_DEFAULT_MODEL +from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. 
If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 46eae4b85ac9..4edccadeaece 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -5,21 +5,19 @@ from pydantic import BaseModel, Field, field_validator from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm import instruct_client, OPENAI_DEFAULT_MODEL, _LLM_CLIENT_TYPES +from weave.flow.scorer.llm_utils import embed, instructor_client, OPENAI_DEFAULT_MODEL, _LLM_CLIENT_TYPES try: from openai import AsyncOpenAI, OpenAI except: pass class LLMScorer(Scorer): - """Score an LLM output.""" + """Score a model output using an LLM""" client: Any = Field( description="The LLM client to use, has to be instantiated with an api_key" ) model_id: str = Field(description="The model to use") - temperature: float = Field(..., description="The temperature to use for the response") - max_tokens: int = Field(..., description="The maximum number of tokens in the response") @field_validator("client") def validate_client(cls, v): @@ -27,65 +25,22 @@ def validate_client(cls, v): raise ValueError( f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" ) - return instruct_client(v) - - -class EmbeddingSimilarityScorer(LLMScorer): - """Check the embedding distance between the model output and the target.""" - - def score(self, model_output: Any, target: Any) -> Any: - model_embedding, target_embedding = self._compute_embeddings( - model_output, target - ) - return self.cosine_similarity(model_embedding, target_embedding) - - def _compute_embeddings(self, model_output: str, target: str) -> tuple[list[float], list[float]]: - llm = instruct_client(self.client) - embeddings = llm.embed([model_output, target]) - return embeddings[0], embeddings[1] - - def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: - """Compute the cosine similarity between two vectors.""" - vec1 = np.array(vec1) - vec2 = np.array(vec2) - cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) - - # cast to float - return float(cosine_sim) - - -class OpenAIModerationScorer(LLMScorer): - "Use OpenAI moderation API to check if the model output is safe" - - def score(self, model_output: Any) -> Any: - if not isinstance(self.client, (OpenAI, AsyncOpenAI)): - raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") - - response = self.client.moderations.create( - model=self.model_id, - input=model_output, - ).results[0] - categories = {k: v for k, v in response.categories.dict().items() if v} - return {"flagged": response.flagged, "categories": categories} - + return v -if __name__ == "__main__": - try: - import openai +class InstructorLLMScorer(Scorer): + """Score a model output using an LLM""" - client = openai.OpenAI() - scorer = EmbeddingSimilarityScorer( - client=client, model_id="text-embedding-3-small" - ) - print(scorer.score("I don't know", "I don't know")) - except Exception as e: - print("Install openai to run this script") - - try: - import openai + client: Any = Field( + description="The LLM client to use, has to be instantiated with an api_key" + ) + model_id: str = Field(description="The model to use") + temperature: float = Field(..., description="The temperature to use for the response") + max_tokens: int = Field(..., description="The maximum number of tokens in the response") - client = openai.OpenAI() - scorer = 
OpenAIModerationScorer(client=client, model_id="omni-moderation-latest") - print(scorer.score("I should kill myself")) - except Exception as e: - print("Install openai to run this script") \ No newline at end of file + @field_validator("client") + def validate_client(cls, v): + if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES): + raise ValueError( + f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" + ) + return instructor_client(v) \ No newline at end of file diff --git a/weave/flow/scorer/moderation_scorer.py b/weave/flow/scorer/moderation_scorer.py new file mode 100644 index 000000000000..22c1014bd577 --- /dev/null +++ b/weave/flow/scorer/moderation_scorer.py @@ -0,0 +1,41 @@ +from typing import Any +from pydantic import field_validator + +import weave +from weave.flow.scorer.llm_scorer import LLMScorer + + +class OpenAIModerationScorer(LLMScorer): + """Use OpenAI moderation API to check if the model output is safe""" + + @field_validator("client") + def validate_openai_client(cls, v): + try: + from openai import AsyncOpenAI, OpenAI # Ensure these are the correct imports + except ImportError: + raise ValueError("Install openai to use this scorer") + + if not isinstance(v, (OpenAI, AsyncOpenAI)): + raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") + return v + + @weave.op + def score(self, model_output: Any) -> Any: + response = self.client.moderations.create( + model=self.model_id, + input=model_output, + ).results[0] + categories = {k: v for k, v in response.categories.dict().items() if v} + return {"flagged": response.flagged, "categories": categories} + + +if __name__ == "__main__": + try: + import openai + + client = openai.OpenAI() + scorer = OpenAIModerationScorer(client=client, model_id="omni-moderation-latest") + print(scorer.score("I should kill someone")) + except Exception as e: + print("Error:", e) + raise e \ No newline at end of file diff --git a/weave/flow/scorer/similarity_score.py b/weave/flow/scorer/similarity_score.py new file mode 100644 index 000000000000..9929545d078b --- /dev/null +++ b/weave/flow/scorer/similarity_score.py @@ -0,0 +1,62 @@ +from typing import Any + +from pydantic import Field +import numpy as np + +import weave +from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorer.llm_utils import embed + + +class EmbeddingSimilarityScorer(LLMScorer): + """Check the cosine similarity distance between the model output and the target. + + The threshold is the minimum cosine similarity score that is considered similar. + + Args: + target_column: The column to compare the model output to. Defaults to "text". + threshold: The minimum cosine similarity score that is considered similar. 
Defaults to 0.5 + """ + target_column: str = Field(..., description="The column to compare the model output to") + threshold: float = Field(0.5, description="The threshold for the similarity score") + + @weave.op + def score(self, model_output: Any, dataset_row: dict) -> Any: + if self.target_column not in dataset_row: + raise ValueError(f"Target column {self.target_column} not found in dataset_row") + + target = str(dataset_row[self.target_column]) # TODO: handle if it is not a string + model_embedding, target_embedding = self._compute_embeddings( + model_output, target + ) + return self.cosine_similarity(model_embedding, target_embedding) + + def _compute_embeddings(self, model_output: str, target: str) -> tuple[list[float], list[float]]: + embeddings = embed(self.client, self.model_id, [model_output, target]) + return embeddings[0], embeddings[1] + + def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: + """Compute the cosine similarity between two vectors.""" + vec1 = np.array(vec1) + vec2 = np.array(vec2) + cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) + # TODO: check if this can be negative + + # cast to float + score = float(cosine_sim) + return {"similarity_score": score, "is_similar": score >= self.threshold} + + +if __name__ == "__main__": + try: + import openai, os + + client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + scorer = EmbeddingSimilarityScorer( + client=client, model_id="text-embedding-3-small", target_column="text" + ) + + dataset_row = {"text": "Whales are mammals that live in the ocean."} + print(scorer.score(model_output="Dolphins are animals that live in the sea.", dataset_row=dataset_row)) + except Exception as e: + print("Error running script:", e) \ No newline at end of file From d78f7cf085ad48bedb9dcc7d3e0e536de98b17d5 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 16:57:39 +0200 Subject: [PATCH 027/150] rename ragas --- weave/flow/scorer/__init__.py | 2 +- weave/flow/scorer/{ragas.py => ragas_scorer.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename weave/flow/scorer/{ragas.py => ragas_scorer.py} (100%) diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 7cf64581bf53..6020b5dd2eda 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -8,7 +8,7 @@ from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer from weave.flow.scorer.pydantic_scorer import PydanticScorer -from weave.flow.scorer.ragas import ( +from weave.flow.scorer.ragas_scorer import ( ContextEntityRecallScorer, ContextRelevancyScorer, ) diff --git a/weave/flow/scorer/ragas.py b/weave/flow/scorer/ragas_scorer.py similarity index 100% rename from weave/flow/scorer/ragas.py rename to weave/flow/scorer/ragas_scorer.py From 3fdaade42ac9cc722a4bb29379fe792af9c18d28 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 18:14:24 +0200 Subject: [PATCH 028/150] add summarization (sort of) --- weave/flow/scorer/summarization_scorer.py | 84 +++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 weave/flow/scorer/summarization_scorer.py diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorer/summarization_scorer.py new file mode 100644 index 000000000000..bebab30c4711 --- /dev/null +++ b/weave/flow/scorer/summarization_scorer.py @@ -0,0 +1,84 @@ +from pydantic import BaseModel, Field +from typing import List +from 
textwrap import dedent + +import weave +from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorer.llm_utils import instructor_client + + +class EntityExtractionResponse(BaseModel): + entities: List[str] = Field(description="A list of unique entities extracted from the text") + +class SummarizationScorer(LLMScorer): + """ + Estimates summary quality by computing the recall of entities in the model output compared to the input. + """ + + extraction_prompt: str = dedent(""" + Extract unique entities from the following text without repetition. + + Text: {text} + Entities: + """) + input_column: str = Field(description="The column in the dataset that contains the input text") + + def extract_entities(self, text: str) -> List[str]: + # Use LLM to extract entities + client = instructor_client(self.client) + prompt = self.extraction_prompt.format(text=text) + response = client.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + response_model=EntityExtractionResponse, + model=self.model_id + ) + entities = [e.strip().lower() for e in response.entities] + return entities + + @weave.op + def score(self, model_output: str, dataset_row: dict) -> float: + # Extract entities + if self.input_column not in dataset_row: + raise ValueError(f"Answer column {self.input_column} not found in dataset_row") + output_entities = self.extract_entities(model_output) + input_entities = self.extract_entities(dataset_row[self.input_column]) + # Calculate recall + if not output_entities: + return 0.0 + matches = set(output_entities) & set(input_entities) + recall = len(matches) / len(input_entities) + return {"recall": recall} + + + +if __name__ == "__main__": + import os + + try: + from weave.flow.scorer.llm_utils import import_client + + # Instantiate your LLM client + OpenAIClient = import_client("openai") + if OpenAIClient: + llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"]) + else: + raise ImportError("OpenAI client not available") + + # Instantiate scorers + summarization_scorer = SummarizationScorer( + client=llm_client, model_id="gpt-4o", input_column="input" + ) + + # Create your dataset of examples + examples = [ + {"input":"Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. 
Their enemy is Voldemort, a dark wizard who is trying to kill them.", + "model_output":"Harry Potter, Ron Weasley, and Voldemort are wizards.", + "relevancy_score":1} + ] + + for example in examples: + score = summarization_scorer.score(example["model_output"], example) + print(f"Summarization Score: {score}") + + except Exception as e: + print(f"Error: {e}") From 4fd3c22239b9171986a7538135d660badb04e51f Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 18:33:56 +0200 Subject: [PATCH 029/150] levenshtein --- weave/flow/scorer/classification.py | 58 ------------------- .../{regex_scorer.py => string_scorer.py} | 15 ++++- 2 files changed, 14 insertions(+), 59 deletions(-) delete mode 100644 weave/flow/scorer/classification.py rename weave/flow/scorer/{regex_scorer.py => string_scorer.py} (84%) diff --git a/weave/flow/scorer/classification.py b/weave/flow/scorer/classification.py deleted file mode 100644 index b86fb890d5e4..000000000000 --- a/weave/flow/scorer/classification.py +++ /dev/null @@ -1,58 +0,0 @@ -from collections import defaultdict -from typing import Optional, Tuple - -import weave -from weave.flow.scorer.base_scorer import Scorer - - -def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: - # if any denom is zero, then zero. could use NaN instead... - precision: float = 0 - if tp or fp: - precision = tp / (tp + fp) - recall: float = 0 - if tp or fn: - recall = tp / (tp + fn) - f1: float = 0 - if precision or recall: - f1 = 2 * (precision * recall) / (precision + recall) - return precision, recall, f1 - - -class MultiTaskBinaryClassificationF1(Scorer): - class_names: list[str] - - @weave.op() - def summarize(self, score_rows: list) -> Optional[dict]: - result = {} - cols = transpose(score_rows) - - for class_name in self.class_names: - col = cols[class_name] - tp = sum(r["correct"] and not r["negative"] for r in col) - fp = sum(not r["correct"] and not r["negative"] for r in col) - fn = sum(not r["correct"] and r["negative"] for r in col) - precision, recall, f1 = p_r_f1(tp, fp, fn) - result[class_name] = {"f1": f1, "precision": precision, "recall": recall} - - return result - - @weave.op() - def score(self, target: dict, model_output: Optional[dict]) -> dict: - result = {} - for class_name in self.class_names: - class_label = target.get(class_name) - class_model_output = model_output.get(class_name) if model_output else None - result[class_name] = { - "correct": class_label == class_model_output, - "negative": not class_model_output, - } - return result - - -def transpose(rows: list[dict]) -> dict[str, list]: - cols = defaultdict(list) - for row in rows: - for k, v in row.items(): - cols[k].append(v) - return dict(cols) diff --git a/weave/flow/scorer/regex_scorer.py b/weave/flow/scorer/string_scorer.py similarity index 84% rename from weave/flow/scorer/regex_scorer.py rename to weave/flow/scorer/string_scorer.py index 1a4e743fe0b8..3f6893e1d033 100644 --- a/weave/flow/scorer/regex_scorer.py +++ b/weave/flow/scorer/string_scorer.py @@ -1,7 +1,7 @@ import re from typing import Union, List, Any -from pydantic import Field +from pydantic import Field, model_validator import weave from weave.flow.scorer.base_scorer import Scorer @@ -59,6 +59,19 @@ def score( return {"string_match": match_found} +class LevenshteinScorer(Scorer): + @model_validator(mode='after') + def check_levenshtein(self): + try: + from Levenshtein import distance + except ImportError: + raise ValueError("Levenshtein package not found. 
Please install it with `pip install Levenshtein`") + + @weave.op + def score(self, model_output: str, target: str) -> dict: + distance = distance(model_output, target) + return {"levenshtein_distance": distance} + if __name__ == "__main__": import asyncio From 3df1839b40b18d27b8f1c47a30dea7b0511ac004 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 18:34:29 +0200 Subject: [PATCH 030/150] rename --- weave/flow/scorer/__init__.py | 4 +- weave/flow/scorer/classification_scorer.py | 58 ++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 weave/flow/scorer/classification_scorer.py diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 6020b5dd2eda..46a60b3d0dde 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,5 +1,5 @@ from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes -from weave.flow.scorer.classification import MultiTaskBinaryClassificationF1, transpose +from weave.flow.scorer.classification_scorer import MultiTaskBinaryClassificationF1, transpose from weave.flow.scorer.hallucination_scorer import HallucinationScorer from weave.flow.scorer.json_scorer import JSONScorer from weave.flow.scorer.llm_scorer import ( @@ -12,7 +12,7 @@ ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorer.regex_scorer import RegexScorer +from weave.flow.scorer.string_scorer import RegexScorer __all__ = [ "Scorer", diff --git a/weave/flow/scorer/classification_scorer.py b/weave/flow/scorer/classification_scorer.py new file mode 100644 index 000000000000..b86fb890d5e4 --- /dev/null +++ b/weave/flow/scorer/classification_scorer.py @@ -0,0 +1,58 @@ +from collections import defaultdict +from typing import Optional, Tuple + +import weave +from weave.flow.scorer.base_scorer import Scorer + + +def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: + # if any denom is zero, then zero. could use NaN instead... 
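+    # Hypothetical sanity check (illustrative values only, not part of the patch):
+    #   tp=2, fp=1, fn=1  ->  precision = 2/3, recall = 2/3, f1 = 2/3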
+ precision: float = 0 + if tp or fp: + precision = tp / (tp + fp) + recall: float = 0 + if tp or fn: + recall = tp / (tp + fn) + f1: float = 0 + if precision or recall: + f1 = 2 * (precision * recall) / (precision + recall) + return precision, recall, f1 + + +class MultiTaskBinaryClassificationF1(Scorer): + class_names: list[str] + + @weave.op() + def summarize(self, score_rows: list) -> Optional[dict]: + result = {} + cols = transpose(score_rows) + + for class_name in self.class_names: + col = cols[class_name] + tp = sum(r["correct"] and not r["negative"] for r in col) + fp = sum(not r["correct"] and not r["negative"] for r in col) + fn = sum(not r["correct"] and r["negative"] for r in col) + precision, recall, f1 = p_r_f1(tp, fp, fn) + result[class_name] = {"f1": f1, "precision": precision, "recall": recall} + + return result + + @weave.op() + def score(self, target: dict, model_output: Optional[dict]) -> dict: + result = {} + for class_name in self.class_names: + class_label = target.get(class_name) + class_model_output = model_output.get(class_name) if model_output else None + result[class_name] = { + "correct": class_label == class_model_output, + "negative": not class_model_output, + } + return result + + +def transpose(rows: list[dict]) -> dict[str, list]: + cols = defaultdict(list) + for row in rows: + for k, v in row.items(): + cols[k].append(v) + return dict(cols) From 672eed8a814808db75b7cfb63360810071ece24a Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 19:27:08 +0200 Subject: [PATCH 031/150] model_output -> output --- weave/flow/eval.py | 15 +++++++-- weave/flow/scorer/base_scorer.py | 5 +-- weave/flow/scorer/classification_scorer.py | 8 ++--- weave/flow/scorer/hallucination_scorer.py | 22 ++++++------- weave/flow/scorer/json_scorer.py | 4 +-- weave/flow/scorer/moderation_scorer.py | 4 +-- weave/flow/scorer/pydantic_scorer.py | 16 +++++----- weave/flow/scorer/ragas_scorer.py | 14 ++++----- weave/flow/scorer/similarity_score.py | 10 +++--- weave/flow/scorer/string_scorer.py | 20 ++++++------ weave/flow/scorer/summarization_scorer.py | 8 ++--- weave/flow/scorer/utils.py | 36 +++++++++++----------- weave/flow/scorer/xml_scorer.py | 8 ++--- 13 files changed, 90 insertions(+), 80 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 759215a00d92..e9a14225a0e2 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -206,14 +206,23 @@ async def predict_and_score( # TODO: Check for input columns parameters in the signature of the scorer - if "model_output" not in score_arg_names: + if "model_output" not in score_arg_names and "output" not in score_arg_names: raise OpCallError( - f"Scorer {scorer_name} must have a 'model_output' argument, to receive the output of the model function." + f"Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function." 
) if isinstance(example, dict): score_args = {k: v for k, v in example.items() if k in score_arg_names} - score_args.update({"dataset_row": example}) # TODO: investigate deduplication of dataset_row for performance + # If we get a column_map from the scorer, it means that the scorer expects the input to have different names than the dataset columns + # So we need to remap the input names to the expected names in the scorer + # For instance, if the scorer expects "input" and "target" and we have a dataset with columns "question" and "expected" + # we need to remap {"question": "input", "expected": "target"} + # and pass those to the scorer + # input: is the full row, we have access to it via example + # output: is the model output, we have access to it via model_output + + if scorer.column_map is not None: + score_args = {scorer.column_map[k]: v for k, v in score_args.items()} else: if len(score_arg_names) == 2: score_args = {score_arg_names[0]: example} diff --git a/weave/flow/scorer/base_scorer.py b/weave/flow/scorer/base_scorer.py index b51c63d51245..08445e85856f 100644 --- a/weave/flow/scorer/base_scorer.py +++ b/weave/flow/scorer/base_scorer.py @@ -2,7 +2,7 @@ from typing import Any, Callable, Optional, Sequence, Tuple, Union import numpy as np -from pydantic import BaseModel +from pydantic import BaseModel, Field import weave from weave.flow.obj import Object @@ -11,7 +11,8 @@ class Scorer(Object): - def score(self, target: Any, model_output: Any) -> Any: + column_map: Optional[dict[str, str]] = Field(default=None, description="A mapping from column names in the dataset to the names expected by the scorer") + def score(self, input: Any, target: Any, output: Any) -> Any: raise NotImplementedError @weave.op() diff --git a/weave/flow/scorer/classification_scorer.py b/weave/flow/scorer/classification_scorer.py index b86fb890d5e4..622f576e6788 100644 --- a/weave/flow/scorer/classification_scorer.py +++ b/weave/flow/scorer/classification_scorer.py @@ -38,14 +38,14 @@ def summarize(self, score_rows: list) -> Optional[dict]: return result @weave.op() - def score(self, target: dict, model_output: Optional[dict]) -> dict: + def score(self, target: dict, output: Optional[dict]) -> dict: result = {} for class_name in self.class_names: class_label = target.get(class_name) - class_model_output = model_output.get(class_name) if model_output else None + class_output = output.get(class_name) if output else None result[class_name] = { - "correct": class_label == class_model_output, - "negative": not class_model_output, + "correct": class_label == class_output, + "negative": not class_output, } return result diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py index 6c5f8890d51f..a6a359fe8793 100644 --- a/weave/flow/scorer/hallucination_scorer.py +++ b/weave/flow/scorer/hallucination_scorer.py @@ -8,23 +8,23 @@ DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" -DEFAULT_USER_PROMPT = """Given some input_data and a model_output, determine if the model_output is a hallucination of the input_data. +DEFAULT_USER_PROMPT = """Given some input_data and a output, determine if the output is a hallucination of the input_data. 
## Input data {input_data} ## Model output - -{model_output} - + +{output} + ## Instructions -Think step by step before answering. Is the model_output an factually and logically consistent with the input_data? +Think step by step before answering. Is the output an factually and logically consistent with the input_data? """ class HallucinationResponse(BaseModel): - chain_of_thought: str = Field(description="Think step by step about whether the model_output is a hallucination of the dataset_row") + chain_of_thought: str = Field(description="Think step by step about whether the output is a hallucination of the dataset_row") is_hallucination: bool = Field(description="Whether the model output is a hallucination of the dataset row") class HallucinationScorer(LLMScorer): @@ -39,16 +39,16 @@ class HallucinationScorer(LLMScorer): input_data_columns: list[str] = Field(description="The columns of the input data to use as ground truth") @weave.op - def score(self, model_output: str, dataset_row: dict) -> HallucinationResponse: + def score(self, output: str, dataset_row: dict) -> HallucinationResponse: - model_output = stringify(model_output) + output = stringify(output) input_data = {k: stringify(v) for k, v in dataset_row.items() if k in self.input_data_columns} response = self.client.chat.completions.create( messages=[ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": self.user_prompt.format(input_data=input_data, model_output=model_output)}, + {"role": "user", "content": self.user_prompt.format(input_data=input_data, output=output)}, ], model=self.model_id, response_model=HallucinationResponse, @@ -67,9 +67,9 @@ def score(self, model_output: str, dataset_row: dict) -> HallucinationResponse: openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) scorer = HallucinationScorer(client=openai_client, input_data_columns=["text"]) - model_output = "John favorite cheese is camembert" + output = "John favorite cheese is camembert" dataset_row = {"text": "John doesn't like cheese"} - response = scorer.score(model_output, dataset_row) + response = scorer.score(output, dataset_row) print(response) @weave.op diff --git a/weave/flow/scorer/json_scorer.py b/weave/flow/scorer/json_scorer.py index 4b2bcac14f03..7e1fbedc6800 100644 --- a/weave/flow/scorer/json_scorer.py +++ b/weave/flow/scorer/json_scorer.py @@ -7,9 +7,9 @@ class JSONScorer(Scorer): """Score a JSON string.""" - def score(self, model_output: Any) -> Any: + def score(self, output: Any) -> Any: try: - result = json.loads(model_output) + result = json.loads(output) if isinstance(result, dict) or isinstance(result, list): return {"json_valid": True} diff --git a/weave/flow/scorer/moderation_scorer.py b/weave/flow/scorer/moderation_scorer.py index 22c1014bd577..6d747740723c 100644 --- a/weave/flow/scorer/moderation_scorer.py +++ b/weave/flow/scorer/moderation_scorer.py @@ -20,10 +20,10 @@ def validate_openai_client(cls, v): return v @weave.op - def score(self, model_output: Any) -> Any: + def score(self, output: Any) -> Any: response = self.client.moderations.create( model=self.model_id, - input=model_output, + input=output, ).results[0] categories = {k: v for k, v in response.categories.dict().items() if v} return {"flagged": response.flagged, "categories": categories} diff --git a/weave/flow/scorer/pydantic_scorer.py b/weave/flow/scorer/pydantic_scorer.py index 21bd0c6de357..c1cccf463b0a 100644 --- a/weave/flow/scorer/pydantic_scorer.py +++ b/weave/flow/scorer/pydantic_scorer.py @@ -10,16 +10,16 @@ class 
PydanticScorer(Scorer): model: Type[BaseModel] - def score(self, model_output: Any): - if isinstance(model_output, str): + def score(self, output: Any): + if isinstance(output, str): try: - self.model.model_validate_json(model_output) + self.model.model_validate_json(output) return True except ValidationError: return False else: try: - self.model.model_validate(model_output) + self.model.model_validate(output) return True except ValidationError: return False @@ -34,8 +34,8 @@ class User(BaseModel): scorer = PydanticScorer(model=User) - model_output = '{"name": "John", "age": 30}' - print(scorer.score(model_output)) + output = '{"name": "John", "age": 30}' + print(scorer.score(output)) - model_output = {"name": "John", "age": 30} - print(scorer.score(model_output)) + output = {"name": "John", "age": 30} + print(scorer.score(output)) diff --git a/weave/flow/scorer/ragas_scorer.py b/weave/flow/scorer/ragas_scorer.py index 18f59af36b8c..77d2f5a9c02e 100644 --- a/weave/flow/scorer/ragas_scorer.py +++ b/weave/flow/scorer/ragas_scorer.py @@ -40,11 +40,11 @@ def extract_entities(self, text: str) -> List[str]: return entities @weave.op - def score(self, model_output: str, dataset_row: dict) -> float: + def score(self, output: str, dataset_row: dict) -> float: # Extract entities if self.answer_column not in dataset_row: raise ValueError(f"Answer column {self.answer_column} not found in dataset_row") - expected_entities = self.extract_entities(model_output) + expected_entities = self.extract_entities(output) context_entities = self.extract_entities(dataset_row[self.answer_column]) # Calculate recall if not expected_entities: @@ -69,12 +69,12 @@ class ContextRelevancyScorer(LLMScorer): context_column: str = Field(description="The column in the dataset that contains the context") @weave.op - def score(self, model_output: str, dataset_row: dict) -> float: + def score(self, output: str, dataset_row: dict) -> float: if self.context_column not in dataset_row: raise ValueError(f"Context column {self.context_column} not found in dataset_row") context = dataset_row[self.context_column] llm = instructor_client(self.client) - prompt = self.relevancy_prompt.format(question=model_output, context=context) + prompt = self.relevancy_prompt.format(question=output, context=context) response = llm.chat.completions.create( messages=[{"role": "user", "content": prompt}], response_model=RelevancyResponse, @@ -120,13 +120,13 @@ def score(self, model_output: str, dataset_row: dict) -> float: ] for example in examples: - model_output = {"answer": example["expected"]} # Simulate model output + output = {"answer": example["expected"]} # Simulate model output score = context_entity_recall_scorer.score( - model_output, example + output, example ) print(f"Context Entity Recall Score: {score}") score = context_relevancy_scorer.score( - model_output, example + output, example ) print(f"Context Relevancy Score: {score}") except Exception as e: diff --git a/weave/flow/scorer/similarity_score.py b/weave/flow/scorer/similarity_score.py index 9929545d078b..542b7962e7ee 100644 --- a/weave/flow/scorer/similarity_score.py +++ b/weave/flow/scorer/similarity_score.py @@ -21,18 +21,18 @@ class EmbeddingSimilarityScorer(LLMScorer): threshold: float = Field(0.5, description="The threshold for the similarity score") @weave.op - def score(self, model_output: Any, dataset_row: dict) -> Any: + def score(self, output: Any, dataset_row: dict) -> Any: if self.target_column not in dataset_row: raise ValueError(f"Target column {self.target_column} not 
found in dataset_row") target = str(dataset_row[self.target_column]) # TODO: handle if it is not a string model_embedding, target_embedding = self._compute_embeddings( - model_output, target + output, target ) return self.cosine_similarity(model_embedding, target_embedding) - def _compute_embeddings(self, model_output: str, target: str) -> tuple[list[float], list[float]]: - embeddings = embed(self.client, self.model_id, [model_output, target]) + def _compute_embeddings(self, output: str, target: str) -> tuple[list[float], list[float]]: + embeddings = embed(self.client, self.model_id, [output, target]) return embeddings[0], embeddings[1] def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: @@ -57,6 +57,6 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: ) dataset_row = {"text": "Whales are mammals that live in the ocean."} - print(scorer.score(model_output="Dolphins are animals that live in the sea.", dataset_row=dataset_row)) + print(scorer.score(output="Dolphins are animals that live in the sea.", dataset_row=dataset_row)) except Exception as e: print("Error running script:", e) \ No newline at end of file diff --git a/weave/flow/scorer/string_scorer.py b/weave/flow/scorer/string_scorer.py index 3f6893e1d033..6cdc344c524f 100644 --- a/weave/flow/scorer/string_scorer.py +++ b/weave/flow/scorer/string_scorer.py @@ -12,8 +12,8 @@ class StringScorer(Scorer): """ target_columns: List[str] = Field(default_factory=list, description="The names of the columns that are used as input to the scorer") - def score(self, model_output: Any, dataset_row: dict) -> dict: - string_in_input = any([model_output.lower() in input.lower() for k, input in dataset_row.items() if k in self.target_columns]) + def score(self, output: Any, dataset_row: dict) -> dict: + string_in_input = any([output.lower() in input.lower() for k, input in dataset_row.items() if k in self.target_columns]) return {"string_in_input": string_in_input} class RegexScorer(Scorer): @@ -27,10 +27,10 @@ class RegexScorer(Scorer): @weave.op def score( - self, model_output: Union[dict, str], target: Union[str, list[str], None] = None + self, output: Union[dict, str], target: Union[str, list[str], None] = None ) -> dict: - if isinstance(model_output, str): - model_output = {"output": model_output} + if isinstance(output, str): + output = {"output": output} # Use target patterns if provided patterns = target if target else self.patterns @@ -48,7 +48,7 @@ def score( pattern = f"^{pattern}$" compiled_patterns.append(re.compile(pattern, flags=flags)) - text_to_search = model_output.get("output") if model_output else "" + text_to_search = output.get("output") if output else "" if self.ignore_whitespace: text_to_search = "".join(text_to_search.split()) @@ -68,8 +68,8 @@ def check_levenshtein(self): raise ValueError("Levenshtein package not found. 
Please install it with `pip install Levenshtein`") @weave.op - def score(self, model_output: str, target: str) -> dict: - distance = distance(model_output, target) + def score(self, output: str, target: str) -> dict: + distance = distance(output, target) return {"levenshtein_distance": distance} @@ -82,9 +82,9 @@ def score(self, model_output: str, target: str) -> dict: def f(col1, col2): return "Hello" - model_output = f(col1="hello", col2="world") + output = f(col1="hello", col2="world") dataset_row = {"col1": "Hello my name is Morgan", "col2": "I am an engineer"} - print(scorer.score(model_output=model_output, dataset_row=dataset_row)) + print(scorer.score(output=output, dataset_row=dataset_row)) dataset = [{"col1": "Hello my name is Morgan", "col2": "I am an engineer", "target": "Morgan"}, {"col1": "Hello my name is John", "col2": "I am a doctor", "target": "John"}] diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorer/summarization_scorer.py index bebab30c4711..214512cd078c 100644 --- a/weave/flow/scorer/summarization_scorer.py +++ b/weave/flow/scorer/summarization_scorer.py @@ -36,11 +36,11 @@ def extract_entities(self, text: str) -> List[str]: return entities @weave.op - def score(self, model_output: str, dataset_row: dict) -> float: + def score(self, output: str, dataset_row: dict) -> float: # Extract entities if self.input_column not in dataset_row: raise ValueError(f"Answer column {self.input_column} not found in dataset_row") - output_entities = self.extract_entities(model_output) + output_entities = self.extract_entities(output) input_entities = self.extract_entities(dataset_row[self.input_column]) # Calculate recall if not output_entities: @@ -72,12 +72,12 @@ def score(self, model_output: str, dataset_row: dict) -> float: # Create your dataset of examples examples = [ {"input":"Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. 
Their enemy is Voldemort, a dark wizard who is trying to kill them.", - "model_output":"Harry Potter, Ron Weasley, and Voldemort are wizards.", + "output":"Harry Potter, Ron Weasley, and Voldemort are wizards.", "relevancy_score":1} ] for example in examples: - score = summarization_scorer.score(example["model_output"], example) + score = summarization_scorer.score(example["output"], example) print(f"Summarization Score: {score}") except Exception as e: diff --git a/weave/flow/scorer/utils.py b/weave/flow/scorer/utils.py index 3c70a1a023ed..19db05748978 100644 --- a/weave/flow/scorer/utils.py +++ b/weave/flow/scorer/utils.py @@ -4,32 +4,32 @@ from pydantic import BaseModel -def stringify(model_output: Any) -> str: - if isinstance(model_output, str): - return model_output - elif isinstance(model_output, (list, tuple)): - return json.dumps(model_output, indent=2) - elif isinstance(model_output, dict): - return json.dumps(model_output, indent=2) - elif isinstance(model_output, BaseModel): - return model_output.model_dump_json(indent=2) +def stringify(output: Any) -> str: + if isinstance(output, str): + return output + elif isinstance(output, (list, tuple)): + return json.dumps(output, indent=2) + elif isinstance(output, dict): + return json.dumps(output, indent=2) + elif isinstance(output, BaseModel): + return output.model_dump_json(indent=2) else: - raise ValueError(f"Unsupported model output type: {type(model_output)}") + raise ValueError(f"Unsupported model output type: {type(output)}") if __name__ == "__main__": # test - model_output = "hey" - print(stringify(model_output)) + output = "hey" + print(stringify(output)) - model_output = [1, 2, 3] - print(stringify(model_output)) + output = [1, 2, 3] + print(stringify(output)) - model_output = {"a": 1, "b": 2} - print(stringify(model_output)) + output = {"a": 1, "b": 2} + print(stringify(output)) class TestModel(BaseModel): a: int b: str - model_output = TestModel(a=1, b="test") - print(stringify(model_output)) + output = TestModel(a=1, b="test") + print(stringify(output)) diff --git a/weave/flow/scorer/xml_scorer.py b/weave/flow/scorer/xml_scorer.py index 04a161d53525..31f202636bf3 100644 --- a/weave/flow/scorer/xml_scorer.py +++ b/weave/flow/scorer/xml_scorer.py @@ -7,11 +7,11 @@ class XMLScorer(Scorer): """Score an XML string.""" - def score(self, model_output: Union[str, dict]) -> dict: - if isinstance(model_output, dict): - xml_string = model_output.get("output", "") + def score(self, output: Union[str, dict]) -> dict: + if isinstance(output, dict): + xml_string = output.get("output", "") else: - xml_string = model_output + xml_string = output try: ET.fromstring(xml_string) From 5fb442a9af09d19858d154a91ef3776465f3c1ec Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 19:32:09 +0200 Subject: [PATCH 032/150] model_output -> output --- weave/flow/eval.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index e9a14225a0e2..37e2687b6dde 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -212,7 +212,6 @@ async def predict_and_score( ) if isinstance(example, dict): - score_args = {k: v for k, v in example.items() if k in score_arg_names} # If we get a column_map from the scorer, it means that the scorer expects the input to have different names than the dataset columns # So we need to remap the input names to the expected names in the scorer # For instance, if the scorer expects "input" and "target" and we have a dataset with columns 
"question" and "expected" @@ -220,9 +219,10 @@ async def predict_and_score( # and pass those to the scorer # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output - - if scorer.column_map is not None: - score_args = {scorer.column_map[k]: v for k, v in score_args.items()} + if scorer.column_map is None: + score_args = {k: v for k, v in example.items() if k in score_arg_names} + else: + score_args = {scorer.column_map[k]: v for k, v in example.items() if k in score_arg_names} else: if len(score_arg_names) == 2: score_args = {score_arg_names[0]: example} @@ -230,7 +230,7 @@ async def predict_and_score( raise ValueError( f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys." ) - score_args["model_output"] = model_output + score_args["output"] = model_output try: result = await async_call(score_fn, **score_args) @@ -244,7 +244,7 @@ async def predict_and_score( for param in score_signature.parameters.values() if param.default == inspect.Parameter.empty ] - required_arg_names.remove("model_output") + required_arg_names.remove("output") message = textwrap.dedent( f""" @@ -259,7 +259,7 @@ async def predict_and_score( scores[scorer_name] = result return { - "model_output": model_output, + "output": model_output, "scores": scores, "model_latency": model_latency, } @@ -302,7 +302,7 @@ async def eval_example(example: dict) -> dict: except Exception as e: print("Predict and score failed") traceback.print_exc() - return {"model_output": None, "scores": {}} + return {"output": None, "scores": {}} return eval_row n_complete = 0 @@ -320,7 +320,7 @@ async def eval_example(example: dict) -> dict: # f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]" # type:ignore # ) if eval_row is None: - eval_row = {"model_output": None, "scores": {}} + eval_row = {"output": None, "scores": {}} else: eval_row["scores"] = eval_row.get("scores", {}) for scorer in self.scorers or []: From 4b903e3a0cc0e7407399618c7c49d905a36e7c7f Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 20:10:07 +0200 Subject: [PATCH 033/150] unify naming --- weave/flow/eval.py | 7 ++-- weave/flow/scorer/hallucination_scorer.py | 18 ++++----- weave/flow/scorer/ragas_scorer.py | 18 ++------- weave/flow/scorer/string_scorer.py | 24 ++++++------ weave/flow/scorer/summarization_scorer.py | 48 ++++++++++++++--------- 5 files changed, 56 insertions(+), 59 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 37e2687b6dde..7da8bab9afe9 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -219,10 +219,11 @@ async def predict_and_score( # and pass those to the scorer # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output - if scorer.column_map is None: - score_args = {k: v for k, v in example.items() if k in score_arg_names} + if isinstance(scorer, Scorer) and scorer.column_map is not None: + score_args = {scorer.column_map.get(k, k): v for k, v in example.items() if scorer.column_map.get(k, k) in score_arg_names} else: - score_args = {scorer.column_map[k]: v for k, v in example.items() if k in score_arg_names} + score_args = {k: v for k, v in example.items() if k in score_arg_names} + else: if len(score_arg_names) == 2: score_args = {score_arg_names[0]: example} diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py index 
a6a359fe8793..2a3aa8b3a1ae 100644 --- a/weave/flow/scorer/hallucination_scorer.py +++ b/weave/flow/scorer/hallucination_scorer.py @@ -3,7 +3,7 @@ import weave from weave.flow.scorer.utils import stringify -from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorer.llm_scorer import InstructorLLMScorer from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL @@ -27,7 +27,7 @@ class HallucinationResponse(BaseModel): chain_of_thought: str = Field(description="Think step by step about whether the output is a hallucination of the dataset_row") is_hallucination: bool = Field(description="Whether the model output is a hallucination of the dataset row") -class HallucinationScorer(LLMScorer): +class HallucinationScorer(InstructorLLMScorer): """ Scorer that checks if the model output is a hallucination of the dataset row. """ @@ -36,19 +36,15 @@ class HallucinationScorer(LLMScorer): model_id: str = OPENAI_DEFAULT_MODEL temperature: float = 0.7 max_tokens: int = 4096 - input_data_columns: list[str] = Field(description="The columns of the input data to use as ground truth") @weave.op - def score(self, output: str, dataset_row: dict) -> HallucinationResponse: + def score(self, output: str, context: str) -> HallucinationResponse: output = stringify(output) - - input_data = {k: stringify(v) for k, v in dataset_row.items() if k in self.input_data_columns} - response = self.client.chat.completions.create( messages=[ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": self.user_prompt.format(input_data=input_data, output=output)}, + {"role": "user", "content": self.user_prompt.format(input_data=context, output=output)}, ], model=self.model_id, response_model=HallucinationResponse, @@ -62,14 +58,14 @@ def score(self, output: str, dataset_row: dict) -> HallucinationResponse: try: import openai, os, weave, asyncio - weave.init("hallucination-scorer-2") + # weave.init("hallucination-scorer-2") openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - scorer = HallucinationScorer(client=openai_client, input_data_columns=["text"]) + scorer = HallucinationScorer(client=openai_client, column_map={"text": "context"}) output = "John favorite cheese is camembert" dataset_row = {"text": "John doesn't like cheese"} - response = scorer.score(output, dataset_row) + response = scorer.score(output, context=dataset_row) print(response) @weave.op diff --git a/weave/flow/scorer/ragas_scorer.py b/weave/flow/scorer/ragas_scorer.py index 77d2f5a9c02e..d29674299c6d 100644 --- a/weave/flow/scorer/ragas_scorer.py +++ b/weave/flow/scorer/ragas_scorer.py @@ -15,7 +15,7 @@ class EntityExtractionResponse(BaseModel): class ContextEntityRecallScorer(LLMScorer): """ Estimates context recall by extracting entities from the model output - and the expected answer, then computes the recall. + and the context, then computes the recall. 
""" extraction_prompt: str = dedent(""" @@ -24,7 +24,6 @@ class ContextEntityRecallScorer(LLMScorer): Text: {text} Entities: """) - answer_column: str = Field(description="The column in the dataset that contains the expected answer") def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities @@ -40,12 +39,9 @@ def extract_entities(self, text: str) -> List[str]: return entities @weave.op - def score(self, output: str, dataset_row: dict) -> float: - # Extract entities - if self.answer_column not in dataset_row: - raise ValueError(f"Answer column {self.answer_column} not found in dataset_row") + def score(self, output: str, context: str) -> float: expected_entities = self.extract_entities(output) - context_entities = self.extract_entities(dataset_row[self.answer_column]) + context_entities = self.extract_entities(context) # Calculate recall if not expected_entities: return 0.0 @@ -66,13 +62,9 @@ class ContextRelevancyScorer(LLMScorer): Context: {context} Relevancy Score (0-1): """) - context_column: str = Field(description="The column in the dataset that contains the context") @weave.op - def score(self, output: str, dataset_row: dict) -> float: - if self.context_column not in dataset_row: - raise ValueError(f"Context column {self.context_column} not found in dataset_row") - context = dataset_row[self.context_column] + def score(self, output: str, context: str) -> float: llm = instructor_client(self.client) prompt = self.relevancy_prompt.format(question=output, context=context) response = llm.chat.completions.create( @@ -98,11 +90,9 @@ def score(self, output: str, dataset_row: dict) -> float: # Instantiate scorers context_entity_recall_scorer = ContextEntityRecallScorer( client=llm_client, model_id="gpt-4o", - answer_column="expected" ) context_relevancy_scorer = ContextRelevancyScorer( client=llm_client, model_id="gpt-4o", - context_column="context" ) # Create your dataset of examples examples = [ diff --git a/weave/flow/scorer/string_scorer.py b/weave/flow/scorer/string_scorer.py index 6cdc344c524f..8904959392eb 100644 --- a/weave/flow/scorer/string_scorer.py +++ b/weave/flow/scorer/string_scorer.py @@ -1,19 +1,17 @@ import re -from typing import Union, List, Any +from typing import Union, Callable from pydantic import Field, model_validator import weave from weave.flow.scorer.base_scorer import Scorer -class StringScorer(Scorer): +class StringMatchScorer(Scorer): """ Scorer that checks if the model output string is found in the search columns of the dataset row. """ - target_columns: List[str] = Field(default_factory=list, description="The names of the columns that are used as input to the scorer") - - def score(self, output: Any, dataset_row: dict) -> dict: - string_in_input = any([output.lower() in input.lower() for k, input in dataset_row.items() if k in self.target_columns]) + def score(self, output: str, target: str) -> dict: + string_in_input = output.lower() in target.lower() return {"string_in_input": string_in_input} class RegexScorer(Scorer): @@ -60,35 +58,35 @@ def score( class LevenshteinScorer(Scorer): + distance: Callable[[str, str], int] = Field(default=None, description="The Levenshtein distance function") @model_validator(mode='after') def check_levenshtein(self): try: from Levenshtein import distance + self.distance = distance except ImportError: raise ValueError("Levenshtein package not found. 
Please install it with `pip install Levenshtein`") @weave.op def score(self, output: str, target: str) -> dict: - distance = distance(output, target) + distance = self.distance(output, target) return {"levenshtein_distance": distance} if __name__ == "__main__": import asyncio - scorer = StringScorer(target_columns=["col1", "col2"]) + match_scorer = StringMatchScorer(column_map={"output": "col1"}) + levenshtein_scorer = LevenshteinScorer(column_map={"output": "col2"}) + @weave.op def f(col1, col2): return "Hello" - output = f(col1="hello", col2="world") - dataset_row = {"col1": "Hello my name is Morgan", "col2": "I am an engineer"} - print(scorer.score(output=output, dataset_row=dataset_row)) - dataset = [{"col1": "Hello my name is Morgan", "col2": "I am an engineer", "target": "Morgan"}, {"col1": "Hello my name is John", "col2": "I am a doctor", "target": "John"}] - evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer]) + evaluation = weave.Evaluation(dataset=dataset, scorers=[match_scorer, levenshtein_scorer]) eval_out = asyncio.run(evaluation.evaluate(f)) \ No newline at end of file diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorer/summarization_scorer.py index 214512cd078c..9ce7464c0d65 100644 --- a/weave/flow/scorer/summarization_scorer.py +++ b/weave/flow/scorer/summarization_scorer.py @@ -3,14 +3,14 @@ from textwrap import dedent import weave -from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorer.llm_scorer import InstructorLLMScorer from weave.flow.scorer.llm_utils import instructor_client class EntityExtractionResponse(BaseModel): entities: List[str] = Field(description="A list of unique entities extracted from the text") -class SummarizationScorer(LLMScorer): +class SummarizationScorer(InstructorLLMScorer): """ Estimates summary quality by computing the recall of entities in the model output compared to the input. 
""" @@ -21,27 +21,28 @@ class SummarizationScorer(LLMScorer): Text: {text} Entities: """) - input_column: str = Field(description="The column in the dataset that contains the input text") + + temperature: float = 0.7 + max_tokens: int = 1024 def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities - client = instructor_client(self.client) prompt = self.extraction_prompt.format(text=text) - response = client.chat.completions.create( + response = self.client.chat.completions.create( messages=[{"role": "user", "content": prompt}], response_model=EntityExtractionResponse, - model=self.model_id + model=self.model_id, + temperature=self.temperature, + max_tokens=self.max_tokens, ) entities = [e.strip().lower() for e in response.entities] return entities @weave.op - def score(self, output: str, dataset_row: dict) -> float: + def score(self, input: str, output: str, **kwargs) -> float: # Extract entities - if self.input_column not in dataset_row: - raise ValueError(f"Answer column {self.input_column} not found in dataset_row") output_entities = self.extract_entities(output) - input_entities = self.extract_entities(dataset_row[self.input_column]) + input_entities = self.extract_entities(input) # Calculate recall if not output_entities: return 0.0 @@ -52,7 +53,7 @@ def score(self, output: str, dataset_row: dict) -> float: if __name__ == "__main__": - import os + import os, asyncio try: from weave.flow.scorer.llm_utils import import_client @@ -66,19 +67,30 @@ def score(self, output: str, dataset_row: dict) -> float: # Instantiate scorers summarization_scorer = SummarizationScorer( - client=llm_client, model_id="gpt-4o", input_column="input" + client=llm_client, model_id="gpt-4o", column_map={"text": "input"} ) + @weave.op + def f(summary: str): + return summary + # Create your dataset of examples examples = [ - {"input":"Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. Their enemy is Voldemort, a dark wizard who is trying to kill them.", - "output":"Harry Potter, Ron Weasley, and Voldemort are wizards.", - "relevancy_score":1} + {"text":"Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. Their enemy is Voldemort, a dark wizard who is trying to kill them.", + "summary":"Harry Potter, Ron Weasley, and Voldemort are wizards.", + "relevancy_score":1}, ] + evaluation = weave.Evaluation(dataset=examples, scorers=[summarization_scorer]) + asyncio.run(evaluation.evaluate(f)) + + # good naming: + def summarization_scorer2(text: str, output: str): + scorer = SummarizationScorer(client=llm_client, model_id="gpt-4o") + return scorer.score(input=text, output=output) + + evaluation = weave.Evaluation(dataset=examples, scorers=[summarization_scorer2]) + asyncio.run(evaluation.evaluate(f)) - for example in examples: - score = summarization_scorer.score(example["output"], example) - print(f"Summarization Score: {score}") except Exception as e: print(f"Error: {e}") From 9e6e3be217fbce78cc3f46a980ec7b4e54e596b7 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 20:31:39 +0200 Subject: [PATCH 034/150] let's go with tests! 
--- tests/scorers/test_json_scorer.py | 43 ++++++++++++++++++++ tests/scorers/test_pydantic_scorer.py | 44 ++++++++++++++++++++ tests/scorers/test_string_scorer.py | 58 +++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 tests/scorers/test_json_scorer.py create mode 100644 tests/scorers/test_pydantic_scorer.py create mode 100644 tests/scorers/test_string_scorer.py diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py new file mode 100644 index 000000000000..d77293b0ad49 --- /dev/null +++ b/tests/scorers/test_json_scorer.py @@ -0,0 +1,43 @@ +from weave.flow.scorer.json_scorer import JSONScorer + +def test_json_scorer_valid_json(): + scorer = JSONScorer() + output = '{"city": "San Francisco", "country": "USA"}' + result = scorer.score(output) + assert result["json_valid"] is True + +def test_json_scorer_invalid_json(): + scorer = JSONScorer() + output = '{"city": "San Francisco", "country": "USA"' + result = scorer.score(output) + assert result["json_valid"] is False + +def test_json_scorer_non_json_string(): + scorer = JSONScorer() + output = "Just a plain string." + result = scorer.score(output) + assert result["json_valid"] is False + +def test_json_scorer_valid_json_list(): + scorer = JSONScorer() + output = '[1, 2, 3, 4, 5]' + result = scorer.score(output) + assert result["json_valid"] is True + +def test_json_scorer_nested_json(): + scorer = JSONScorer() + output = '{"person": {"name": "John", "age": 30}, "city": "New York"}' + result = scorer.score(output) + assert result["json_valid"] is True + +def test_json_scorer_empty_object(): + scorer = JSONScorer() + output = '{}' + result = scorer.score(output) + assert result["json_valid"] is True + +def test_json_scorer_empty_list(): + scorer = JSONScorer() + output = '[]' + result = scorer.score(output) + assert result["json_valid"] is True diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py new file mode 100644 index 000000000000..3732829136b8 --- /dev/null +++ b/tests/scorers/test_pydantic_scorer.py @@ -0,0 +1,44 @@ +import pytest +from pydantic import BaseModel +from weave.flow.scorer.pydantic_scorer import PydanticScorer + +class User(BaseModel): + name: str + age: int + +@pytest.fixture +def user_scorer(): + return PydanticScorer(model=User) + +def test_pydantic_scorer_initialization(): + scorer = PydanticScorer(model=User) + assert isinstance(scorer, PydanticScorer) + assert scorer.model == User + +def test_pydantic_scorer_valid_json_string(user_scorer): + valid_json = '{"name": "John", "age": 30}' + assert user_scorer.score(valid_json) == True + +def test_pydantic_scorer_valid_dict(user_scorer): + valid_dict = {"name": "John", "age": 30} + assert user_scorer.score(valid_dict) == True + +def test_pydantic_scorer_invalid_json_string(user_scorer): + invalid_json = '{"name": "John", "age": "thirty"}' + assert user_scorer.score(invalid_json) == False + +def test_pydantic_scorer_invalid_dict(user_scorer): + invalid_dict = {"name": "John", "age": "thirty"} + assert user_scorer.score(invalid_dict) == False + +def test_pydantic_scorer_missing_field(user_scorer): + missing_field = '{"name": "John"}' + assert user_scorer.score(missing_field) == False + +def test_pydantic_scorer_extra_field(user_scorer): + extra_field = '{"name": "John", "age": 30, "city": "New York"}' + assert user_scorer.score(extra_field) == True + +def test_pydantic_scorer_invalid_input_type(user_scorer): + invalid_input = 123 # Neither a string nor a dict + assert 
user_scorer.score(invalid_input) == False \ No newline at end of file diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py new file mode 100644 index 000000000000..a542b7b3e805 --- /dev/null +++ b/tests/scorers/test_string_scorer.py @@ -0,0 +1,58 @@ +from weave.flow.scorer.string_scorer import ( + StringMatchScorer, + RegexScorer, + LevenshteinScorer, +) + +def test_string_match_scorer(): + scorer = StringMatchScorer() + output = "Morgan" + target = "Hello my name is Morgan" + result = scorer.score(output, target) + assert result["string_in_input"] is True + +def test_string_match_scorer_false(): + scorer = StringMatchScorer() + output = "Alice" + target = "Hello my name is Bob" + result = scorer.score(output, target) + assert result["string_in_input"] is False + +# def test_regex_scorer(): +# scorer = RegexScorer(patterns="engineer") +# output = "I am an engineer" +# result = scorer.score(output) +# assert result["string_match"] is True + +# def test_regex_scorer_case_insensitive(): +# scorer = RegexScorer(patterns="Engineer", ignore_case=True) +# output = "I am an engineer" +# result = scorer.score(output) +# assert result["string_match"] is True + +# def test_regex_scorer_no_match(): +# scorer = RegexScorer(patterns="doctor") +# output = "I am an engineer" +# result = scorer.score(output) +# assert result["string_match"] is False + +def test_levenshtein_scorer(): + scorer = LevenshteinScorer() + output = "Hello" + target = "Hallo" + result = scorer.score(output, target) + assert result["levenshtein_distance"] == 1 + +def test_levenshtein_scorer_same_strings(): + scorer = LevenshteinScorer() + output = "Hello" + target = "Hello" + result = scorer.score(output, target) + assert result["levenshtein_distance"] == 0 + +def test_levenshtein_scorer_completely_different(): + scorer = LevenshteinScorer() + output = "Hello" + target = "World" + result = scorer.score(output, target) + assert result["levenshtein_distance"] == 4 \ No newline at end of file From dddb6cf58289a65d209ef742da3436b8ebcd5b60 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 20:37:06 +0200 Subject: [PATCH 035/150] rename model_output to output --- tests/trace/test_evaluations.py | 91 ++++++++++++++++----------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index dbba17f5d880..a9402050da0e 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -43,7 +43,6 @@ class MyModel(Model): @weave.op() def predict(self, question: str): - # Here's where you would add your LLM call and return the output return {"generated_text": "Hello, " + question + self.prompt} @@ -56,12 +55,12 @@ async def do_quickstart(): ] @weave.op() - def match_score1(expected: str, model_output: dict) -> dict: - return {"match": expected == model_output["generated_text"]} + def match_score1(expected: str, output: dict) -> dict: + return {"match": expected == output["generated_text"]} @weave.op() - def match_score2(expected: dict, model_output: dict) -> dict: - return {"match": expected == model_output["generated_text"]} + def match_score2(expected: dict, output: dict) -> dict: + return {"match": expected == output["generated_text"]} model = MyModel(prompt="World") evaluation = Evaluation(dataset=examples, scorers=[match_score1, match_score2]) @@ -177,32 +176,32 @@ def predict(self, question: str): return {"response": res["response"], "confidence": 1 / (len(res) + 1)} -def score_int(expected: 
str, model_output: dict) -> int: +def score_int(expected: str, output: dict) -> int: matches = 0 - for i in range(min(len(expected), len(model_output["response"]))): - if expected[i] == model_output["response"][i]: + for i in range(min(len(expected), len(output["response"]))): + if expected[i] == output["response"][i]: matches += 1 return matches -def score_float(expected: str, model_output: dict) -> float: - matches = score_int(expected, model_output) - return matches / max(len(expected), len(model_output["response"])) +def score_float(expected: str, output: dict) -> float: + matches = score_int(expected, output) + return matches / max(len(expected), len(output["response"])) -def score_bool(expected: str, model_output: dict) -> bool: - return score_float(expected, model_output) == 1.0 +def score_bool(expected: str, output: dict) -> bool: + return score_float(expected, output) == 1.0 -def score_dict(expected: str, model_output: dict) -> dict: +def score_dict(expected: str, output: dict) -> dict: return { - "d_int": score_int(expected, model_output), - "d_float": score_float(expected, model_output), - "d_bool": score_bool(expected, model_output), + "d_int": score_int(expected, output), + "d_float": score_float(expected, output), + "d_bool": score_bool(expected, output), "d_nested": { - "d_int": score_int(expected, model_output), - "d_float": score_float(expected, model_output), - "d_bool": score_bool(expected, model_output), + "d_int": score_int(expected, output), + "d_float": score_float(expected, output), + "d_bool": score_bool(expected, output), }, "reason": "This is a test reason", } @@ -210,32 +209,32 @@ def score_dict(expected: str, model_output: dict) -> dict: class MyIntScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> int: - return score_int(expected, model_output) + def score(self, expected: str, output: dict) -> int: + return score_int(expected, output) class MyFloatScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> float: - return score_float(expected, model_output) + def score(self, expected: str, output: dict) -> float: + return score_float(expected, output) class MyBoolScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> bool: - return score_bool(expected, model_output) + def score(self, expected: str, output: dict) -> bool: + return score_bool(expected, output) class MyDictScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) class MyDictScorerWithCustomFloatSummary(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) @weave.op() def summarize(self, score_rows: list) -> Optional[dict]: @@ -245,8 +244,8 @@ def summarize(self, score_rows: list) -> Optional[dict]: class MyDictScorerWithCustomBoolSummary(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) @weave.op() def summarize(self, score_rows: list) -> Optional[dict]: @@ -256,8 +255,8 @@ def summarize(self, score_rows: list) -> Optional[dict]: class 
MyDictScorerWithCustomDictSummary(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) @weave.op() def summarize(self, score_rows: list) -> Optional[dict]: @@ -367,7 +366,7 @@ async def test_evaluation_data_topology(client): # Prediction Section confidence = 1 / 4 - model_output = { + output = { "response": "A", "confidence": confidence, } @@ -406,7 +405,7 @@ async def test_evaluation_data_topology(client): } # Prediction - assert predict_call.output == model_output + assert predict_call.output == output assert predict_call.summary == predict_usage # Prediction Scores @@ -429,7 +428,7 @@ async def test_evaluation_data_topology(client): # Predict And Score Group assert predict_and_score_call.output == { - "model_output": model_output, + "output": output, "scores": { "score_int": score_int_score, "score_float": score_float_score, @@ -443,7 +442,7 @@ async def test_evaluation_data_topology(client): } # Summary section - model_output_summary = { + output_summary = { "confidence": {"mean": confidence}, } score_int_auto_summary = {"mean": 1.5} @@ -516,20 +515,20 @@ async def test_evaluation_data_topology(client): "MyDictScorerWithCustomBoolSummary": dict_scorer_bool_summary, "MyDictScorerWithCustomDictSummary": dict_scorer_dict_summary, "model_latency": model_latency, - "model_output": model_output_summary, + "output": output_summary, } ) assert evaluate_call.summary == predict_usage_summary def make_test_eval(): - def function_score(target: dict, model_output: dict) -> dict: - return {"correct": target == model_output} + def function_score(expected: str, output: dict) -> dict: + return {"correct": expected == output["generated_text"]} evaluation = weave.Evaluation( name="fruit_eval", dataset=[ - {"id": "0", "sentence": "a", "target": "b"}, + {"id": "0", "sentence": "a", "expected": "b"}, ], scorers=[function_score], ) @@ -622,7 +621,7 @@ async def test_eval_is_robust_to_missing_values(client): def model_func(model_res) -> dict: return resp[model_res] - def function_score(scorer_res, model_output) -> dict: + def function_score(scorer_res, output) -> dict: return resp[scorer_res] evaluation = weave.Evaluation( @@ -633,7 +632,7 @@ def function_score(scorer_res, model_output) -> dict: res = await evaluation.evaluate(model_func) assert res == { - "model_output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}}, + "output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}}, "function_score": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}}, "model_latency": {"mean": pytest.approx(0, abs=1)}, } @@ -672,7 +671,7 @@ def model_func( return text - def function_score(image, dc, model, obj, text, model_output) -> bool: + def function_score(image, dc, model, obj, text, output) -> bool: assert isinstance(image, Image.Image) # Note: when we start recursively saving dataset rows, this will @@ -685,7 +684,7 @@ def function_score(image, dc, model, obj, text, model_output) -> bool: assert isinstance(model, MyModel) assert isinstance(obj, MyObj) assert isinstance(text, str) - assert isinstance(model_output, str) + assert isinstance(output, str) return True From aa4f588d215eddafe6d82b1d6a454c51ced9da18 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 21:49:18 +0200 Subject: [PATCH 036/150] fix eval tests --- tests/trace/test_evaluations.py | 97 ++++++++++++++++++++++++++++++ weave/flow/eval.py | 9 ++- 
weave/flow/scorer/string_scorer.py | 4 +- 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index a9402050da0e..caea97bba21b 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -9,6 +9,7 @@ import weave from tests.trace.util import AnyIntMatcher from weave import Evaluation, Model +from weave.flow.scorer import Scorer from weave.trace_server import trace_server_interface as tsi @@ -733,3 +734,99 @@ def function_score(image, dc, model, obj, text, output) -> bool: assert "table_query" in access_log assert "obj_read" in access_log assert "file_content_read" in access_log + + + +@pytest.mark.asyncio +async def test_evaluation_with_column_map(): + + # Define a dummy scorer that uses column_map + class DummyScorer(Scorer): + @weave.op() + def score(self, foo: str, bar: str, output: str, target: str) -> dict: + # Return whether foo + bar equals output + return {"match": (foo + bar) == output == target} + + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col2' + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) + + @weave.op() + def model_function(col1, col2): + # For testing, return the concatenation of col1 and col2 + return col1 + col2 + + dataset = [ + {"col1": "Hello", "col2": "World", "target": "HelloWorld"}, + {"col1": "Hi", "col2": "There", "target": "HiThere"}, + {"col1": "Good", "col2": "Morning", "target": "GoodMorning"}, + {"col1": "Bad", "col2": "Evening", "target": "GoodEvening"}, + ] + + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + + # Run the evaluation + eval_out = await evaluation.evaluate(model_function) + + # Check that 'DummyScorer' is in the results + assert 'DummyScorer' in eval_out + + # The expected summary should show that 3 out of 4 predictions matched + expected_results = {"true_count": 3, "true_fraction": 0.75} + assert eval_out['DummyScorer']["match"] == expected_results, "The summary should reflect the correct number of matches" + + + +# Define another dummy scorer + + +@pytest.mark.asyncio +async def test_evaluation_with_multiple_column_maps(): + class DummyScorer(Scorer): + @weave.op() + def score(self, foo: str, bar: str, output: str, target: str) -> dict: + # Return whether foo + bar equals output + return {"match": (foo + bar) == output == target} + class AnotherDummyScorer(Scorer): + @weave.op() + def score(self, input1: str, input2: str, output: str) -> dict: + # Return whether input1 == output reversed + return {"match": input1 == output[::-1]} + # First scorer maps 'foo'->'col1', 'bar'->'col2' + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) + + # Second scorer maps 'input1'->'col2', 'input2'->'col1' + another_dummy_scorer = AnotherDummyScorer(column_map={"input1": "col2", "input2": "col1"}) + + @weave.op() + def model_function(col1, col2): + # For testing, return the concatenation of col1 and col2 + return col1 + col2 + + dataset = [ + {"col1": "abc", "col2": "def", "target": "abcdef"}, + {"col1": "123", "col2": "456", "target": "1111"}, + {"col1": "xyz", "col2": "zyx", "target": "zzzzzz"}, + ] + + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer]) + + # Run the evaluation + eval_out = await evaluation.evaluate(model_function) + + # Check that both scorers are in the results + assert 'DummyScorer' in eval_out + assert 'AnotherDummyScorer' in eval_out + + # Assertions for the first scorer + expected_results_dummy = 
{"true_count": 1, "true_fraction": 1.0/3} + assert eval_out['DummyScorer']["match"] == expected_results_dummy, "All concatenations should match the target" + + # Assertions for the second scorer + # Since input1 == col2, and output is col1 + col2, we check if col2 == (col1 + col2)[::-1] + # Evaluate manually: + # First row: col2 = "def", output = "abcdef", output[::-1] = "fedcba" -> "def" != "fedcba" + # Second row: col2 = "456", output = "123456", output[::-1] = "654321" -> "456" != "654321" + # Third row: col2 = "zyx", output = "xyzzyx", output[::-1] = "xyzzyx" -> "zyx" == "xyzzyx" is False + # So all matches are False + expected_results_another_dummy = {"true_count": 0, "true_fraction": 0.0} + assert eval_out['AnotherDummyScorer']["match"] == expected_results_another_dummy, "No matches should be found for AnotherDummyScorer" diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 7da8bab9afe9..4b4853d3463c 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -220,7 +220,14 @@ async def predict_and_score( # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output if isinstance(scorer, Scorer) and scorer.column_map is not None: - score_args = {scorer.column_map.get(k, k): v for k, v in example.items() if scorer.column_map.get(k, k) in score_arg_names} + print(f"scorer.column_map: {scorer.column_map}") + print(f"score_arg_names: {score_arg_names}") + print(f"example: {example}") + score_args = { + arg: example[scorer.column_map.get(arg, arg)] + for arg in score_arg_names + if scorer.column_map.get(arg, arg) in example + } else: score_args = {k: v for k, v in example.items() if k in score_arg_names} diff --git a/weave/flow/scorer/string_scorer.py b/weave/flow/scorer/string_scorer.py index 8904959392eb..956b422231e2 100644 --- a/weave/flow/scorer/string_scorer.py +++ b/weave/flow/scorer/string_scorer.py @@ -76,8 +76,8 @@ def score(self, output: str, target: str) -> dict: if __name__ == "__main__": import asyncio - match_scorer = StringMatchScorer(column_map={"output": "col1"}) - levenshtein_scorer = LevenshteinScorer(column_map={"output": "col2"}) + match_scorer = StringMatchScorer(column_map={"target": "col1"}) + levenshtein_scorer = LevenshteinScorer(column_map={"target": "col2"}) @weave.op From 3e13e579aedbc63c53d95f2cd0251eb14542dac6 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 22:42:47 +0200 Subject: [PATCH 037/150] add LLM services tests --- tests/scorers/test_hallucination_scorer.py | 37 +++++++++++++ tests/scorers/test_ragas_scorer.py | 62 ++++++++++++++++++++++ tests/scorers/test_summarization_scorer.py | 44 +++++++++++++++ weave/flow/scorer/__init__.py | 10 +++- weave/flow/scorer/hallucination_scorer.py | 5 +- weave/flow/scorer/llm_scorer.py | 12 ++--- weave/flow/scorer/llm_utils.py | 3 ++ weave/flow/scorer/ragas_scorer.py | 17 +++--- weave/flow/scorer/summarization_scorer.py | 5 +- 9 files changed, 172 insertions(+), 23 deletions(-) create mode 100644 tests/scorers/test_hallucination_scorer.py create mode 100644 tests/scorers/test_ragas_scorer.py create mode 100644 tests/scorers/test_summarization_scorer.py diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py new file mode 100644 index 000000000000..2a30e3ac82ea --- /dev/null +++ b/tests/scorers/test_hallucination_scorer.py @@ -0,0 +1,37 @@ +import pytest +from weave.flow.scorer.hallucination_scorer import HallucinationScorer, HallucinationResponse +from openai import OpenAI + +# 
Mock the OpenAI client +class MockOpenAI(OpenAI): + pass + +# mock the create function +@pytest.fixture +def mock_create(monkeypatch): + def _mock_create(*args, **kwargs): + return HallucinationResponse( + chain_of_thought="The output is consistent with the input data.", + is_hallucination=False + ) + monkeypatch.setattr('weave.flow.scorer.hallucination_scorer.create', _mock_create) + +@pytest.fixture +def hallucination_scorer(mock_create): + return HallucinationScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=4096) + +def test_hallucination_scorer_initialization(hallucination_scorer): + assert isinstance(hallucination_scorer, HallucinationScorer) + assert hallucination_scorer.model_id == "gpt-4o" + assert hallucination_scorer.temperature == 0.7 + assert hallucination_scorer.max_tokens == 4096 + +def test_hallucination_scorer_score(hallucination_scorer, mock_create): + output = "John's favorite cheese is cheddar." + context = "John likes various types of cheese." + result = hallucination_scorer.score(output, context) + assert isinstance(result, HallucinationResponse) + assert not result.is_hallucination + assert "The output is consistent with the input data." == result.chain_of_thought + +# Add more tests as needed diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py new file mode 100644 index 000000000000..33333b1d83d7 --- /dev/null +++ b/tests/scorers/test_ragas_scorer.py @@ -0,0 +1,62 @@ +import pytest +from weave.flow.scorer.ragas_scorer import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, + EntityExtractionResponse, + RelevancyResponse +) +from openai import OpenAI + +# Mock the OpenAI client +class MockOpenAI(OpenAI): + pass + +# Mock the create function +@pytest.fixture +def mock_create(monkeypatch): + def _mock_create(*args, **kwargs): + # Retrieve the response_model to return appropriate mock responses + response_model = kwargs.get('response_model') + if response_model == EntityExtractionResponse: + return EntityExtractionResponse(entities=["Paris"]) + elif response_model == RelevancyResponse: + return RelevancyResponse( + reasoning="The context directly answers the question.", + relevancy_score=1 + ) + else: + return None + monkeypatch.setattr('weave.flow.scorer.ragas_scorer.create', _mock_create) + +@pytest.fixture +def context_entity_recall_scorer(mock_create): + return ContextEntityRecallScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + +@pytest.fixture +def context_relevancy_scorer(mock_create): + return ContextRelevancyScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + +def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer): + assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer) + assert context_entity_recall_scorer.model_id == "gpt-4o" + +def test_context_entity_recall_scorer_score(context_entity_recall_scorer): + output = "Paris is the capital of France." + context = "The capital city of France is Paris." 
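+ # The mocked create() returns ["Paris"] for both texts, so every output entity is found in the context and recall is 1.0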
+ result = context_entity_recall_scorer.score(output, context) + assert isinstance(result, dict) + assert "recall" in result + assert result["recall"] == 1.0 # Assuming full recall in mock response + +def test_context_relevancy_scorer_initialization(context_relevancy_scorer): + assert isinstance(context_relevancy_scorer, ContextRelevancyScorer) + assert context_relevancy_scorer.model_id == "gpt-4o" + +def test_context_relevancy_scorer_score(context_relevancy_scorer): + output = "What is the capital of France?" + context = "Paris is the capital city of France." + result = context_relevancy_scorer.score(output, context) + assert isinstance(result, dict) + assert "relevancy_score" in result + assert result["relevancy_score"] == 1 # Assuming relevancy in mock response + diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py new file mode 100644 index 000000000000..5c011c393926 --- /dev/null +++ b/tests/scorers/test_summarization_scorer.py @@ -0,0 +1,44 @@ +import pytest +from weave.flow.scorer.summarization_scorer import SummarizationScorer, EntityExtractionResponse +from openai import OpenAI + +# Mock the OpenAI client +class MockOpenAI(OpenAI): + pass + +# mock the create function +@pytest.fixture +def mock_create(monkeypatch): + def _mock_create(*args, **kwargs): + return EntityExtractionResponse( + entities=["entity1", "entity2"] + ) + monkeypatch.setattr('weave.flow.scorer.summarization_scorer.create', _mock_create) + +@pytest.fixture +def summarization_scorer(mock_create): + return SummarizationScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + +def test_summarization_scorer_initialization(summarization_scorer, mock_create): + assert isinstance(summarization_scorer, SummarizationScorer) + assert summarization_scorer.model_id == "gpt-4o" + assert summarization_scorer.temperature == 0.7 + assert summarization_scorer.max_tokens == 1024 + +def test_summarization_scorer_extract_entities(summarization_scorer, mock_create): + text = "This is a sample text with entities." + entities = summarization_scorer.extract_entities(text) + assert isinstance(entities, list) + assert len(entities) == 2 + assert "entity1" in entities + assert "entity2" in entities + +def test_summarization_scorer_score(summarization_scorer): + input_text = "This is the original text with entities." + output_text = "This is a summary with some entities." 
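+ # The mocked extractor yields the same two entities for input and output, so the expected recall is 1.0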
+ result = summarization_scorer.score(input=input_text, output=output_text) + assert isinstance(result, dict) + assert "recall" in result + assert 0 <= result["recall"] <= 1 + +# Add more tests as needed diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index 46a60b3d0dde..ee990ae7051b 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -4,6 +4,7 @@ from weave.flow.scorer.json_scorer import JSONScorer from weave.flow.scorer.llm_scorer import ( LLMScorer, + InstructorLLMScorer, ) from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer @@ -12,7 +13,9 @@ ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorer.string_scorer import RegexScorer +from weave.flow.scorer.string_scorer import RegexScorer, StringMatchScorer, LevenshteinScorer +from weave.flow.scorer.summarization_scorer import SummarizationScorer +from weave.flow.scorer.xml_scorer import XMLScorer __all__ = [ "Scorer", @@ -21,12 +24,17 @@ "MultiTaskBinaryClassificationF1", "transpose", "RegexScorer", + "StringMatchScorer", + "LevenshteinScorer", "JSONScorer", "LLMScorer", + "InstructorLLMScorer", "EmbeddingSimilarityScorer", "OpenAIModerationScorer", "PydanticScorer", "HallucinationScorer", "ContextEntityRecallScorer", "ContextRelevancyScorer", + "SummarizationScorer", + "XMLScorer", ] diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py index 2a3aa8b3a1ae..2d540295ab82 100644 --- a/weave/flow/scorer/hallucination_scorer.py +++ b/weave/flow/scorer/hallucination_scorer.py @@ -4,7 +4,7 @@ import weave from weave.flow.scorer.utils import stringify from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL +from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. 
If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" @@ -41,7 +41,8 @@ class HallucinationScorer(InstructorLLMScorer): def score(self, output: str, context: str) -> HallucinationResponse: output = stringify(output) - response = self.client.chat.completions.create( + response = create( + self.client, messages=[ {"role": "system", "content": self.system_prompt}, {"role": "user", "content": self.user_prompt.format(input_data=context, output=output)}, diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index 4edccadeaece..facbb5bac540 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -1,16 +1,10 @@ -import json -from typing import Any, Type +from typing import Any -import numpy as np -from pydantic import BaseModel, Field, field_validator +from pydantic import Field, field_validator from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm_utils import embed, instructor_client, OPENAI_DEFAULT_MODEL, _LLM_CLIENT_TYPES +from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENT_TYPES -try: - from openai import AsyncOpenAI, OpenAI -except: - pass class LLMScorer(Scorer): """Score a model output using an LLM""" diff --git a/weave/flow/scorer/llm_utils.py b/weave/flow/scorer/llm_utils.py index 5c910b645f13..8f24fccfe76a 100644 --- a/weave/flow/scorer/llm_utils.py +++ b/weave/flow/scorer/llm_utils.py @@ -54,6 +54,9 @@ def instructor_client(client: _LLM_CLIENTS): return instructor.from_anthropic(client) else: raise ValueError(f"Unsupported client type: {client_type}") + +def create(client: _LLM_CLIENTS, *args, **kwargs): + return client.chat.completions.create(*args, **kwargs) def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: client_type = type(client).__name__.lower() diff --git a/weave/flow/scorer/ragas_scorer.py b/weave/flow/scorer/ragas_scorer.py index d29674299c6d..1d232c053357 100644 --- a/weave/flow/scorer/ragas_scorer.py +++ b/weave/flow/scorer/ragas_scorer.py @@ -5,14 +5,13 @@ from textwrap import dedent import weave -from weave.flow.scorer.llm_utils import instructor_client -from weave.flow.scorer.llm_scorer import LLMScorer -from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorer.llm_utils import instructor_client, create +from weave.flow.scorer.llm_scorer import InstructorLLMScorer class EntityExtractionResponse(BaseModel): entities: List[str] = Field(description="A list of unique entities extracted from the text") -class ContextEntityRecallScorer(LLMScorer): +class ContextEntityRecallScorer(InstructorLLMScorer): """ Estimates context recall by extracting entities from the model output and the context, then computes the recall. 
@@ -27,9 +26,9 @@ class ContextEntityRecallScorer(LLMScorer): def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities - client = instructor_client(self.client) prompt = self.extraction_prompt.format(text=text) - response = client.chat.completions.create( + response = create( + self.client, messages=[{"role": "user", "content": prompt}], response_model=EntityExtractionResponse, model=self.model_id @@ -52,7 +51,7 @@ def score(self, output: str, context: str) -> float: class RelevancyResponse(BaseModel): reasoning: str = Field(description="Think step by step about whether the context is relevant to the question") relevancy_score: int = Field(ge=0, le=1, description="The relevancy score of the context to the question (0 for not relevant, 1 for relevant)") -class ContextRelevancyScorer(LLMScorer): +class ContextRelevancyScorer(InstructorLLMScorer): """Evaluates the relevancy of the provided context to the model output.""" relevancy_prompt: str = dedent(""" @@ -65,9 +64,9 @@ class ContextRelevancyScorer(LLMScorer): @weave.op def score(self, output: str, context: str) -> float: - llm = instructor_client(self.client) prompt = self.relevancy_prompt.format(question=output, context=context) - response = llm.chat.completions.create( + response = create( + self.client, messages=[{"role": "user", "content": prompt}], response_model=RelevancyResponse, model=self.model_id diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorer/summarization_scorer.py index 9ce7464c0d65..225ad71385bf 100644 --- a/weave/flow/scorer/summarization_scorer.py +++ b/weave/flow/scorer/summarization_scorer.py @@ -4,7 +4,7 @@ import weave from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import instructor_client +from weave.flow.scorer.llm_utils import create class EntityExtractionResponse(BaseModel): @@ -28,7 +28,8 @@ class SummarizationScorer(InstructorLLMScorer): def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities prompt = self.extraction_prompt.format(text=text) - response = self.client.chat.completions.create( + response = create( + self.client, messages=[{"role": "user", "content": prompt}], response_model=EntityExtractionResponse, model=self.model_id, From 04496ccdcf6ffdc88bde7099af24017be5640c01 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 22:47:57 +0200 Subject: [PATCH 038/150] enable test --- .github/workflows/test.yaml | 1 + noxfile.py | 2 ++ pyproject.toml | 1 + 3 files changed, 4 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b7f03683551f..8f7aaa346d13 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -222,6 +222,7 @@ jobs: 'mistral0', 'mistral1', 'openai', + 'scorers', ] fail-fast: false services: diff --git a/noxfile.py b/noxfile.py index d9efabaa165e..5fe1a27d13d1 100644 --- a/noxfile.py +++ b/noxfile.py @@ -30,6 +30,7 @@ def lint(session): "mistral0", "mistral1", "openai", + "scorers", ], ) def tests(session, shard): @@ -56,6 +57,7 @@ def tests(session, shard): "trace_server": ["trace_server/"], "mistral0": ["integrations/mistral/v0/"], "mistral1": ["integrations/mistral/v1/"], + "scorers": ["flow/"], } test_dirs = test_dirs_dict.get(shard, default_test_dirs) diff --git a/pyproject.toml b/pyproject.toml index a2c44afd291d..2ac4dbf0c1c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ litellm = ["litellm>=1.36.1"] llamaindex = ["llama-index>=0.10.35"] mistral0 = 
["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] +scorers = ["openai>=1.0.0", "instructor>=1.5.2"] openai = ["openai>=1.0.0"] modal = ["modal", "python-dotenv"] test = [ From 8bafa5d4f5d6e9b677d693866385f318d8356714 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 22:50:39 +0200 Subject: [PATCH 039/150] lint --- weave/flow/scorer/__init__.py | 15 +++++-- weave/flow/scorer/base_scorer.py | 6 ++- weave/flow/scorer/hallucination_scorer.py | 54 ++++++++++++++--------- weave/flow/scorer/llm_scorer.py | 14 ++++-- weave/flow/scorer/llm_utils.py | 40 ++++++++++++----- weave/flow/scorer/moderation_scorer.py | 16 ++++--- weave/flow/scorer/ragas_scorer.py | 53 +++++++++++++--------- weave/flow/scorer/similarity_score.py | 44 +++++++++++------- weave/flow/scorer/string_scorer.py | 46 ++++++++++++------- weave/flow/scorer/summarization_scorer.py | 35 ++++++++------- weave/flow/scorer/utils.py | 1 + 11 files changed, 211 insertions(+), 113 deletions(-) diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorer/__init__.py index ee990ae7051b..d5702549ef73 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorer/__init__.py @@ -1,19 +1,26 @@ from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes -from weave.flow.scorer.classification_scorer import MultiTaskBinaryClassificationF1, transpose +from weave.flow.scorer.classification_scorer import ( + MultiTaskBinaryClassificationF1, + transpose, +) from weave.flow.scorer.hallucination_scorer import HallucinationScorer from weave.flow.scorer.json_scorer import JSONScorer from weave.flow.scorer.llm_scorer import ( - LLMScorer, InstructorLLMScorer, + LLMScorer, ) -from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer from weave.flow.scorer.pydantic_scorer import PydanticScorer from weave.flow.scorer.ragas_scorer import ( ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorer.string_scorer import RegexScorer, StringMatchScorer, LevenshteinScorer +from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorer.string_scorer import ( + LevenshteinScorer, + RegexScorer, + StringMatchScorer, +) from weave.flow.scorer.summarization_scorer import SummarizationScorer from weave.flow.scorer.xml_scorer import XMLScorer diff --git a/weave/flow/scorer/base_scorer.py b/weave/flow/scorer/base_scorer.py index 08445e85856f..a0eec1ac09cf 100644 --- a/weave/flow/scorer/base_scorer.py +++ b/weave/flow/scorer/base_scorer.py @@ -11,7 +11,11 @@ class Scorer(Object): - column_map: Optional[dict[str, str]] = Field(default=None, description="A mapping from column names in the dataset to the names expected by the scorer") + column_map: Optional[dict[str, str]] = Field( + default=None, + description="A mapping from column names in the dataset to the names expected by the scorer", + ) + def score(self, input: Any, target: Any, output: Any) -> Any: raise NotImplementedError diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py index 2d540295ab82..f1ec1dbc75f6 100644 --- a/weave/flow/scorer/hallucination_scorer.py +++ b/weave/flow/scorer/hallucination_scorer.py @@ -1,14 +1,12 @@ from pydantic import BaseModel, Field - import weave -from weave.flow.scorer.utils import stringify from weave.flow.scorer.llm_scorer import InstructorLLMScorer from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create +from 
weave.flow.scorer.utils import stringify - -DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" -DEFAULT_USER_PROMPT = """Given some input_data and a output, determine if the output is a hallucination of the input_data. +DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" +DEFAULT_USER_PROMPT = """Given some input_data and a output, determine if the output is a hallucination of the input_data. ## Input data {input_data} @@ -20,17 +18,22 @@ ## Instructions -Think step by step before answering. Is the output an factually and logically consistent with the input_data? +Think step by step before answering. Is the output an factually and logically consistent with the input_data? """ + class HallucinationResponse(BaseModel): - chain_of_thought: str = Field(description="Think step by step about whether the output is a hallucination of the dataset_row") - is_hallucination: bool = Field(description="Whether the model output is a hallucination of the dataset row") + chain_of_thought: str = Field( + description="Think step by step about whether the output is a hallucination of the dataset_row" + ) + is_hallucination: bool = Field( + description="Whether the model output is a hallucination of the dataset row" + ) + class HallucinationScorer(InstructorLLMScorer): - """ - Scorer that checks if the model output is a hallucination of the dataset row. 
- """ + """Scorer that checks if the model output is a hallucination of the dataset row.""" + system_prompt: str = DEFAULT_SYSTEM_PROMPT user_prompt: str = DEFAULT_USER_PROMPT model_id: str = OPENAI_DEFAULT_MODEL @@ -39,13 +42,17 @@ class HallucinationScorer(InstructorLLMScorer): @weave.op def score(self, output: str, context: str) -> HallucinationResponse: - output = stringify(output) response = create( self.client, messages=[ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": self.user_prompt.format(input_data=context, output=output)}, + { + "role": "user", + "content": self.user_prompt.format( + input_data=context, output=output + ), + }, ], model=self.model_id, response_model=HallucinationResponse, @@ -57,28 +64,33 @@ def score(self, output: str, context: str) -> HallucinationResponse: if __name__ == "__main__": try: - import openai, os, weave, asyncio + import asyncio + import os + + import openai + + import weave # weave.init("hallucination-scorer-2") openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - scorer = HallucinationScorer(client=openai_client, column_map={"text": "context"}) + scorer = HallucinationScorer( + client=openai_client, column_map={"text": "context"} + ) output = "John favorite cheese is camembert" dataset_row = {"text": "John doesn't like cheese"} response = scorer.score(output, context=dataset_row) print(response) - + @weave.op def model(): return "John favorite food is apples" - dataset = [{"text": "John doesn't like cheese"}, - {"text": "John likes pizza"}] - + dataset = [{"text": "John doesn't like cheese"}, {"text": "John likes pizza"}] + evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer]) asyncio.run(evaluation.evaluate(model)) - + except Exception as e: print(e) - \ No newline at end of file diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index facbb5bac540..a20ff67525cc 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -3,7 +3,8 @@ from pydantic import Field, field_validator from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENT_TYPES +from weave.flow.scorer.llm_utils import _LLM_CLIENT_TYPES, instructor_client + class LLMScorer(Scorer): """Score a model output using an LLM""" @@ -21,6 +22,7 @@ def validate_client(cls, v): ) return v + class InstructorLLMScorer(Scorer): """Score a model output using an LLM""" @@ -28,8 +30,12 @@ class InstructorLLMScorer(Scorer): description="The LLM client to use, has to be instantiated with an api_key" ) model_id: str = Field(description="The model to use") - temperature: float = Field(..., description="The temperature to use for the response") - max_tokens: int = Field(..., description="The maximum number of tokens in the response") + temperature: float = Field( + ..., description="The temperature to use for the response" + ) + max_tokens: int = Field( + ..., description="The maximum number of tokens in the response" + ) @field_validator("client") def validate_client(cls, v): @@ -37,4 +43,4 @@ def validate_client(cls, v): raise ValueError( f"Invalid client type. 
Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" ) - return instructor_client(v) \ No newline at end of file + return instructor_client(v) diff --git a/weave/flow/scorer/llm_utils.py b/weave/flow/scorer/llm_utils.py index 8f24fccfe76a..dcae67267821 100644 --- a/weave/flow/scorer/llm_utils.py +++ b/weave/flow/scorer/llm_utils.py @@ -1,10 +1,10 @@ -from typing import List, Union, TypeVar +from typing import List, TypeVar, Union import instructor from weave.trace.autopatch import autopatch -autopatch() # fix instrucor tracing +autopatch() # fix instrucor tracing # TODO: Gemini @@ -44,6 +44,7 @@ _LLM_CLIENTS = TypeVar(Union[tuple(_LLM_CLIENT_TYPES)]) + def instructor_client(client: _LLM_CLIENTS): client_type = type(client).__name__.lower() if "mistral" in client_type: @@ -54,11 +55,15 @@ def instructor_client(client: _LLM_CLIENTS): return instructor.from_anthropic(client) else: raise ValueError(f"Unsupported client type: {client_type}") - + + def create(client: _LLM_CLIENTS, *args, **kwargs): return client.chat.completions.create(*args, **kwargs) -def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs) -> List[List[float]]: + +def embed( + client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs +) -> List[List[float]]: client_type = type(client).__name__.lower() if "mistral" in client_type: response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) @@ -69,6 +74,7 @@ def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **k else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") + # Helper function for dynamic imports def import_client(provider: str): try: @@ -90,13 +96,14 @@ def import_client(provider: str): # Example usage: if __name__ == "__main__": - import asyncio import os # Mistral example MistralClient = import_client("mistral") if MistralClient: - mistral_client = instructor_client(Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))) + mistral_client = instructor_client( + Mistral(api_key=os.environ.get("MISTRAL_API_KEY")) + ) mistral_response = mistral_client.chat.completions.create( messages=[{"role": "user", "content": "What is the best French cheese?"}], model=MISTRAL_DEFAULT_MODEL, @@ -108,7 +115,9 @@ def import_client(provider: str): # OpenAI example with system message OpenAIClient = import_client("openai") if OpenAIClient: - openai_client = instructor_client(OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))) + openai_client = instructor_client( + OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")) + ) openai_response = openai_client.chat.completions.create( messages=[ { @@ -129,7 +138,9 @@ def import_client(provider: str): # Anthropic example with system message AnthropicClient = import_client("anthropic") if AnthropicClient: - anthropic_client = instructor_client(AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY"))) + anthropic_client = instructor_client( + AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")) + ) anthropic_response = anthropic_client.messages.create( messages=[ { @@ -147,11 +158,18 @@ def import_client(provider: str): # Embedding example if OpenAIClient: openai_embed_client = OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")) - openai_embeddings = embed(openai_embed_client, OPENAI_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."]) + openai_embeddings = embed( + openai_embed_client, + OPENAI_DEFAULT_EMBEDDING_MODEL, + ["Embed this sentence.", "As well as this one."], + ) print("OpenAI 
embeddings:", openai_embeddings) if MistralClient: mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) - mistral_embeddings = embed(mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."]) + mistral_embeddings = embed( + mistral_embed_client, + MISTRAL_DEFAULT_EMBEDDING_MODEL, + ["Embed this sentence.", "As well as this one."], + ) print("Mistral embeddings:", mistral_embeddings) - diff --git a/weave/flow/scorer/moderation_scorer.py b/weave/flow/scorer/moderation_scorer.py index 6d747740723c..e116290bc0ae 100644 --- a/weave/flow/scorer/moderation_scorer.py +++ b/weave/flow/scorer/moderation_scorer.py @@ -1,4 +1,5 @@ from typing import Any + from pydantic import field_validator import weave @@ -11,14 +12,17 @@ class OpenAIModerationScorer(LLMScorer): @field_validator("client") def validate_openai_client(cls, v): try: - from openai import AsyncOpenAI, OpenAI # Ensure these are the correct imports + from openai import ( # Ensure these are the correct imports + AsyncOpenAI, + OpenAI, + ) except ImportError: raise ValueError("Install openai to use this scorer") - + if not isinstance(v, (OpenAI, AsyncOpenAI)): raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") return v - + @weave.op def score(self, output: Any) -> Any: response = self.client.moderations.create( @@ -34,8 +38,10 @@ def score(self, output: Any) -> Any: import openai client = openai.OpenAI() - scorer = OpenAIModerationScorer(client=client, model_id="omni-moderation-latest") + scorer = OpenAIModerationScorer( + client=client, model_id="omni-moderation-latest" + ) print(scorer.score("I should kill someone")) except Exception as e: print("Error:", e) - raise e \ No newline at end of file + raise e diff --git a/weave/flow/scorer/ragas_scorer.py b/weave/flow/scorer/ragas_scorer.py index 1d232c053357..811d337fc6f6 100644 --- a/weave/flow/scorer/ragas_scorer.py +++ b/weave/flow/scorer/ragas_scorer.py @@ -1,19 +1,24 @@ # implementing metrics from ragas: https://github.com/explodinggradients/ragas +from textwrap import dedent from typing import List + from pydantic import BaseModel, Field -from textwrap import dedent import weave -from weave.flow.scorer.llm_utils import instructor_client, create from weave.flow.scorer.llm_scorer import InstructorLLMScorer +from weave.flow.scorer.llm_utils import create + class EntityExtractionResponse(BaseModel): - entities: List[str] = Field(description="A list of unique entities extracted from the text") + entities: List[str] = Field( + description="A list of unique entities extracted from the text" + ) + class ContextEntityRecallScorer(InstructorLLMScorer): """ - Estimates context recall by extracting entities from the model output + Estimates context recall by extracting entities from the model output and the context, then computes the recall. 
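The recall computed by `ContextEntityRecallScorer.score` (shown just below) reduces to a set intersection over extracted entities. A self-contained sketch of that arithmetic, with hypothetical entity lists standing in for the LLM extraction step:

```python
# Hand-worked version of the recall in ContextEntityRecallScorer.score:
# recall = |entities(output) ∩ entities(context)| / |entities(output)|
expected_entities = ["harper lee", "to kill a mockingbird"]  # extracted from the model output
context_entities = ["harper lee", "1960", "novel"]           # extracted from the context

matches = set(expected_entities) & set(context_entities)
recall = len(matches) / len(expected_entities) if expected_entities else 0.0
print({"recall": recall})  # -> {'recall': 0.5}
```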
""" @@ -23,7 +28,7 @@ class ContextEntityRecallScorer(InstructorLLMScorer): Text: {text} Entities: """) - + def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities prompt = self.extraction_prompt.format(text=text) @@ -31,12 +36,12 @@ def extract_entities(self, text: str) -> List[str]: self.client, messages=[{"role": "user", "content": prompt}], response_model=EntityExtractionResponse, - model=self.model_id + model=self.model_id, ) # Assume entities are returned as a comma-separated list entities = [e.strip() for e in response.entities] return entities - + @weave.op def score(self, output: str, context: str) -> float: expected_entities = self.extract_entities(output) @@ -48,9 +53,18 @@ def score(self, output: str, context: str) -> float: recall = len(matches) / len(expected_entities) return {"recall": recall} + class RelevancyResponse(BaseModel): - reasoning: str = Field(description="Think step by step about whether the context is relevant to the question") - relevancy_score: int = Field(ge=0, le=1, description="The relevancy score of the context to the question (0 for not relevant, 1 for relevant)") + reasoning: str = Field( + description="Think step by step about whether the context is relevant to the question" + ) + relevancy_score: int = Field( + ge=0, + le=1, + description="The relevancy score of the context to the question (0 for not relevant, 1 for relevant)", + ) + + class ContextRelevancyScorer(InstructorLLMScorer): """Evaluates the relevancy of the provided context to the model output.""" @@ -69,10 +83,11 @@ def score(self, output: str, context: str) -> float: self.client, messages=[{"role": "user", "content": prompt}], response_model=RelevancyResponse, - model=self.model_id + model=self.model_id, ) return {"relevancy_score": response.relevancy_score} - + + if __name__ == "__main__": import os @@ -88,10 +103,12 @@ def score(self, output: str, context: str) -> float: # Instantiate scorers context_entity_recall_scorer = ContextEntityRecallScorer( - client=llm_client, model_id="gpt-4o", + client=llm_client, + model_id="gpt-4o", ) context_relevancy_scorer = ContextRelevancyScorer( - client=llm_client, model_id="gpt-4o", + client=llm_client, + model_id="gpt-4o", ) # Create your dataset of examples examples = [ @@ -110,13 +127,9 @@ def score(self, output: str, context: str) -> float: for example in examples: output = {"answer": example["expected"]} # Simulate model output - score = context_entity_recall_scorer.score( - output, example - ) + score = context_entity_recall_scorer.score(output, example) print(f"Context Entity Recall Score: {score}") - score = context_relevancy_scorer.score( - output, example - ) + score = context_relevancy_scorer.score(output, example) print(f"Context Relevancy Score: {score}") except Exception as e: - print(e) \ No newline at end of file + print(e) diff --git a/weave/flow/scorer/similarity_score.py b/weave/flow/scorer/similarity_score.py index 542b7962e7ee..75aebe423e52 100644 --- a/weave/flow/scorer/similarity_score.py +++ b/weave/flow/scorer/similarity_score.py @@ -1,7 +1,7 @@ from typing import Any -from pydantic import Field import numpy as np +from pydantic import Field import weave from weave.flow.scorer.llm_scorer import LLMScorer @@ -10,28 +10,35 @@ class EmbeddingSimilarityScorer(LLMScorer): """Check the cosine similarity distance between the model output and the target. - + The threshold is the minimum cosine similarity score that is considered similar. - + Args: target_column: The column to compare the model output to. 
Defaults to "text". threshold: The minimum cosine similarity score that is considered similar. Defaults to 0.5 """ - target_column: str = Field(..., description="The column to compare the model output to") + + target_column: str = Field( + ..., description="The column to compare the model output to" + ) threshold: float = Field(0.5, description="The threshold for the similarity score") @weave.op def score(self, output: Any, dataset_row: dict) -> Any: if self.target_column not in dataset_row: - raise ValueError(f"Target column {self.target_column} not found in dataset_row") - - target = str(dataset_row[self.target_column]) # TODO: handle if it is not a string - model_embedding, target_embedding = self._compute_embeddings( - output, target - ) + raise ValueError( + f"Target column {self.target_column} not found in dataset_row" + ) + + target = str( + dataset_row[self.target_column] + ) # TODO: handle if it is not a string + model_embedding, target_embedding = self._compute_embeddings(output, target) return self.cosine_similarity(model_embedding, target_embedding) - def _compute_embeddings(self, output: str, target: str) -> tuple[list[float], list[float]]: + def _compute_embeddings( + self, output: str, target: str + ) -> tuple[list[float], list[float]]: embeddings = embed(self.client, self.model_id, [output, target]) return embeddings[0], embeddings[1] @@ -39,7 +46,7 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: """Compute the cosine similarity between two vectors.""" vec1 = np.array(vec1) vec2 = np.array(vec2) - cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) + cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) # TODO: check if this can be negative # cast to float @@ -49,7 +56,9 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: if __name__ == "__main__": try: - import openai, os + import os + + import openai client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]) scorer = EmbeddingSimilarityScorer( @@ -57,6 +66,11 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: ) dataset_row = {"text": "Whales are mammals that live in the ocean."} - print(scorer.score(output="Dolphins are animals that live in the sea.", dataset_row=dataset_row)) + print( + scorer.score( + output="Dolphins are animals that live in the sea.", + dataset_row=dataset_row, + ) + ) except Exception as e: - print("Error running script:", e) \ No newline at end of file + print("Error running script:", e) diff --git a/weave/flow/scorer/string_scorer.py b/weave/flow/scorer/string_scorer.py index 956b422231e2..8fcb59dae1c6 100644 --- a/weave/flow/scorer/string_scorer.py +++ b/weave/flow/scorer/string_scorer.py @@ -1,19 +1,20 @@ import re -from typing import Union, Callable +from typing import Callable, Union from pydantic import Field, model_validator import weave from weave.flow.scorer.base_scorer import Scorer + class StringMatchScorer(Scorer): - """ - Scorer that checks if the model output string is found in the search columns of the dataset row. 
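A small numeric illustration of the cosine-similarity-and-threshold logic used by `EmbeddingSimilarityScorer`; the two vectors are stand-ins for the embeddings returned by `embed`:

```python
import numpy as np

# Stand-in embeddings; in the scorer these come from embed(client, model_id, [output, target]).
model_embedding = np.array([0.1, 0.3, 0.8])
target_embedding = np.array([0.2, 0.25, 0.9])

cosine_sim = float(
    np.dot(model_embedding, target_embedding)
    / (np.linalg.norm(model_embedding) * np.linalg.norm(target_embedding))
)
threshold = 0.5  # the scorer's default threshold
print({"similarity_score": cosine_sim, "is_similar": cosine_sim >= threshold})
```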
- """ + """Scorer that checks if the model output string is found in the search columns of the dataset row.""" + def score(self, output: str, target: str) -> dict: string_in_input = output.lower() in target.lower() return {"string_in_input": string_in_input} + class RegexScorer(Scorer): patterns: Union[str, list[str]] = Field( default_factory=list, description="The patterns or keywords to match" @@ -58,14 +59,20 @@ def score( class LevenshteinScorer(Scorer): - distance: Callable[[str, str], int] = Field(default=None, description="The Levenshtein distance function") - @model_validator(mode='after') + distance: Callable[[str, str], int] = Field( + default=None, description="The Levenshtein distance function" + ) + + @model_validator(mode="after") def check_levenshtein(self): try: from Levenshtein import distance + self.distance = distance except ImportError: - raise ValueError("Levenshtein package not found. Please install it with `pip install Levenshtein`") + raise ValueError( + "Levenshtein package not found. Please install it with `pip install Levenshtein`" + ) @weave.op def score(self, output: str, target: str) -> dict: @@ -79,14 +86,21 @@ def score(self, output: str, target: str) -> dict: match_scorer = StringMatchScorer(column_map={"target": "col1"}) levenshtein_scorer = LevenshteinScorer(column_map={"target": "col2"}) - @weave.op - def f(col1, col2): - return "Hello" - - dataset = [{"col1": "Hello my name is Morgan", "col2": "I am an engineer", "target": "Morgan"}, - {"col1": "Hello my name is John", "col2": "I am a doctor", "target": "John"}] - - evaluation = weave.Evaluation(dataset=dataset, scorers=[match_scorer, levenshtein_scorer]) + def f(col1, col2): + return "Hello" + + dataset = [ + { + "col1": "Hello my name is Morgan", + "col2": "I am an engineer", + "target": "Morgan", + }, + {"col1": "Hello my name is John", "col2": "I am a doctor", "target": "John"}, + ] + + evaluation = weave.Evaluation( + dataset=dataset, scorers=[match_scorer, levenshtein_scorer] + ) - eval_out = asyncio.run(evaluation.evaluate(f)) \ No newline at end of file + eval_out = asyncio.run(evaluation.evaluate(f)) diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorer/summarization_scorer.py index 225ad71385bf..a4a01144f810 100644 --- a/weave/flow/scorer/summarization_scorer.py +++ b/weave/flow/scorer/summarization_scorer.py @@ -1,6 +1,7 @@ -from pydantic import BaseModel, Field -from typing import List from textwrap import dedent +from typing import List + +from pydantic import BaseModel, Field import weave from weave.flow.scorer.llm_scorer import InstructorLLMScorer @@ -8,12 +9,13 @@ class EntityExtractionResponse(BaseModel): - entities: List[str] = Field(description="A list of unique entities extracted from the text") + entities: List[str] = Field( + description="A list of unique entities extracted from the text" + ) + class SummarizationScorer(InstructorLLMScorer): - """ - Estimates summary quality by computing the recall of entities in the model output compared to the input. - """ + """Estimates summary quality by computing the recall of entities in the model output compared to the input.""" extraction_prompt: str = dedent(""" Extract unique entities from the following text without repetition. 
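The string scorers above can also be exercised directly, outside of an evaluation. A short sketch; the `LevenshteinScorer` call assumes the optional `Levenshtein` package is installed:

```python
from weave.flow.scorer.string_scorer import LevenshteinScorer, StringMatchScorer

match_scorer = StringMatchScorer()
print(match_scorer.score(output="Morgan", target="Hello my name is Morgan"))
# -> {"string_in_input": True}

# Requires the optional dependency: pip install Levenshtein
levenshtein_scorer = LevenshteinScorer()
print(levenshtein_scorer.score(output="Hello", target="Hallo"))
# -> {"levenshtein_distance": 1}
```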
@@ -24,7 +26,7 @@ class SummarizationScorer(InstructorLLMScorer): temperature: float = 0.7 max_tokens: int = 1024 - + def extract_entities(self, text: str) -> List[str]: # Use LLM to extract entities prompt = self.extraction_prompt.format(text=text) @@ -38,7 +40,7 @@ def extract_entities(self, text: str) -> List[str]: ) entities = [e.strip().lower() for e in response.entities] return entities - + @weave.op def score(self, input: str, output: str, **kwargs) -> float: # Extract entities @@ -50,11 +52,11 @@ def score(self, input: str, output: str, **kwargs) -> float: matches = set(output_entities) & set(input_entities) recall = len(matches) / len(input_entities) return {"recall": recall} - if __name__ == "__main__": - import os, asyncio + import asyncio + import os try: from weave.flow.scorer.llm_utils import import_client @@ -72,26 +74,27 @@ def score(self, input: str, output: str, **kwargs) -> float: ) @weave.op - def f(summary: str): + def f(summary: str): return summary # Create your dataset of examples examples = [ - {"text":"Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. Their enemy is Voldemort, a dark wizard who is trying to kill them.", - "summary":"Harry Potter, Ron Weasley, and Voldemort are wizards.", - "relevancy_score":1}, + { + "text": "Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. Their enemy is Voldemort, a dark wizard who is trying to kill them.", + "summary": "Harry Potter, Ron Weasley, and Voldemort are wizards.", + "relevancy_score": 1, + }, ] evaluation = weave.Evaluation(dataset=examples, scorers=[summarization_scorer]) asyncio.run(evaluation.evaluate(f)) # good naming: def summarization_scorer2(text: str, output: str): - scorer = SummarizationScorer(client=llm_client, model_id="gpt-4o") + scorer = SummarizationScorer(client=llm_client, model_id="gpt-4o") return scorer.score(input=text, output=output) evaluation = weave.Evaluation(dataset=examples, scorers=[summarization_scorer2]) asyncio.run(evaluation.evaluate(f)) - except Exception as e: print(f"Error: {e}") diff --git a/weave/flow/scorer/utils.py b/weave/flow/scorer/utils.py index 19db05748978..8bd587ad8d41 100644 --- a/weave/flow/scorer/utils.py +++ b/weave/flow/scorer/utils.py @@ -16,6 +16,7 @@ def stringify(output: Any) -> str: else: raise ValueError(f"Unsupported model output type: {type(output)}") + if __name__ == "__main__": # test output = "hey" From 06dfb7f393d549428ba9b64d30dc5dbea430d2c7 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 22:55:19 +0200 Subject: [PATCH 040/150] ruff --- tests/scorers/test_hallucination_scorer.py | 7 ++++++- tests/scorers/test_json_scorer.py | 1 + tests/scorers/test_pydantic_scorer.py | 2 ++ tests/scorers/test_ragas_scorer.py | 7 ++++--- tests/scorers/test_string_scorer.py | 4 ++-- tests/scorers/test_summarization_scorer.py | 7 ++++++- 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 2a30e3ac82ea..89f26ac3b7ff 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -1,7 +1,12 @@ import pytest -from weave.flow.scorer.hallucination_scorer import HallucinationScorer, HallucinationResponse from openai import OpenAI +from weave.flow.scorer.hallucination_scorer import ( + HallucinationResponse, + HallucinationScorer, +) + + # Mock 
the OpenAI client class MockOpenAI(OpenAI): pass diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py index d77293b0ad49..443e8d108728 100644 --- a/tests/scorers/test_json_scorer.py +++ b/tests/scorers/test_json_scorer.py @@ -1,5 +1,6 @@ from weave.flow.scorer.json_scorer import JSONScorer + def test_json_scorer_valid_json(): scorer = JSONScorer() output = '{"city": "San Francisco", "country": "USA"}' diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py index 3732829136b8..e0574ebbf90e 100644 --- a/tests/scorers/test_pydantic_scorer.py +++ b/tests/scorers/test_pydantic_scorer.py @@ -1,7 +1,9 @@ import pytest from pydantic import BaseModel + from weave.flow.scorer.pydantic_scorer import PydanticScorer + class User(BaseModel): name: str age: int diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 33333b1d83d7..77af51cf5193 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -1,11 +1,13 @@ import pytest +from openai import OpenAI + from weave.flow.scorer.ragas_scorer import ( ContextEntityRecallScorer, ContextRelevancyScorer, EntityExtractionResponse, - RelevancyResponse + RelevancyResponse, ) -from openai import OpenAI + # Mock the OpenAI client class MockOpenAI(OpenAI): @@ -59,4 +61,3 @@ def test_context_relevancy_scorer_score(context_relevancy_scorer): assert isinstance(result, dict) assert "relevancy_score" in result assert result["relevancy_score"] == 1 # Assuming relevancy in mock response - diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index a542b7b3e805..3c460cb04dba 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -1,9 +1,9 @@ from weave.flow.scorer.string_scorer import ( - StringMatchScorer, - RegexScorer, LevenshteinScorer, + StringMatchScorer, ) + def test_string_match_scorer(): scorer = StringMatchScorer() output = "Morgan" diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 5c011c393926..e99a32e15337 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -1,7 +1,12 @@ import pytest -from weave.flow.scorer.summarization_scorer import SummarizationScorer, EntityExtractionResponse from openai import OpenAI +from weave.flow.scorer.summarization_scorer import ( + EntityExtractionResponse, + SummarizationScorer, +) + + # Mock the OpenAI client class MockOpenAI(OpenAI): pass From 20d164ab64a01e304ab5efdfc7f44528ddeec34e Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 11 Oct 2024 23:15:53 +0200 Subject: [PATCH 041/150] fix most tests and linting --- tests/scorers/test_pydantic_scorer.py | 14 +-- weave/flow/scorer/hallucination_scorer.py | 34 ------ weave/flow/scorer/json_scorer.py | 11 +- weave/flow/scorer/llm_scorer.py | 14 +-- weave/flow/scorer/llm_utils.py | 122 +++------------------- weave/flow/scorer/moderation_scorer.py | 16 +-- weave/flow/scorer/pydantic_scorer.py | 26 +---- weave/flow/scorer/ragas_scorer.py | 53 +--------- weave/flow/scorer/similarity_score.py | 30 +----- weave/flow/scorer/string_scorer.py | 35 +------ weave/flow/scorer/summarization_scorer.py | 52 +-------- weave/flow/scorer/utils.py | 19 ---- weave/flow/scorer/xml_scorer.py | 14 +-- 13 files changed, 49 insertions(+), 391 deletions(-) diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py index e0574ebbf90e..1a4112fc3f65 100644 --- 
a/tests/scorers/test_pydantic_scorer.py +++ b/tests/scorers/test_pydantic_scorer.py @@ -19,28 +19,28 @@ def test_pydantic_scorer_initialization(): def test_pydantic_scorer_valid_json_string(user_scorer): valid_json = '{"name": "John", "age": 30}' - assert user_scorer.score(valid_json) == True + assert user_scorer.score(valid_json) == {"valid_pydantic": True} def test_pydantic_scorer_valid_dict(user_scorer): valid_dict = {"name": "John", "age": 30} - assert user_scorer.score(valid_dict) == True + assert user_scorer.score(valid_dict) == {"valid_pydantic": True} def test_pydantic_scorer_invalid_json_string(user_scorer): invalid_json = '{"name": "John", "age": "thirty"}' - assert user_scorer.score(invalid_json) == False + assert user_scorer.score(invalid_json) == {"valid_pydantic": False} def test_pydantic_scorer_invalid_dict(user_scorer): invalid_dict = {"name": "John", "age": "thirty"} - assert user_scorer.score(invalid_dict) == False + assert user_scorer.score(invalid_dict) == {"valid_pydantic": False} def test_pydantic_scorer_missing_field(user_scorer): missing_field = '{"name": "John"}' - assert user_scorer.score(missing_field) == False + assert user_scorer.score(missing_field) == {"valid_pydantic": False} def test_pydantic_scorer_extra_field(user_scorer): extra_field = '{"name": "John", "age": 30, "city": "New York"}' - assert user_scorer.score(extra_field) == True + assert user_scorer.score(extra_field) == {"valid_pydantic": True} def test_pydantic_scorer_invalid_input_type(user_scorer): invalid_input = 123 # Neither a string nor a dict - assert user_scorer.score(invalid_input) == False \ No newline at end of file + assert user_scorer.score(invalid_input) == {"valid_pydantic": False} \ No newline at end of file diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorer/hallucination_scorer.py index f1ec1dbc75f6..a2e4d38fa41d 100644 --- a/weave/flow/scorer/hallucination_scorer.py +++ b/weave/flow/scorer/hallucination_scorer.py @@ -60,37 +60,3 @@ def score(self, output: str, context: str) -> HallucinationResponse: max_tokens=self.max_tokens, ) return response - - -if __name__ == "__main__": - try: - import asyncio - import os - - import openai - - import weave - - # weave.init("hallucination-scorer-2") - - openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - scorer = HallucinationScorer( - client=openai_client, column_map={"text": "context"} - ) - - output = "John favorite cheese is camembert" - dataset_row = {"text": "John doesn't like cheese"} - response = scorer.score(output, context=dataset_row) - print(response) - - @weave.op - def model(): - return "John favorite food is apples" - - dataset = [{"text": "John doesn't like cheese"}, {"text": "John likes pizza"}] - - evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer]) - asyncio.run(evaluation.evaluate(model)) - - except Exception as e: - print(e) diff --git a/weave/flow/scorer/json_scorer.py b/weave/flow/scorer/json_scorer.py index 7e1fbedc6800..598b7a4f0023 100644 --- a/weave/flow/scorer/json_scorer.py +++ b/weave/flow/scorer/json_scorer.py @@ -7,7 +7,7 @@ class JSONScorer(Scorer): """Score a JSON string.""" - def score(self, output: Any) -> Any: + def score(self, output: Any, **kwargs: Any) -> dict: # type: ignore try: result = json.loads(output) @@ -17,12 +17,3 @@ def score(self, output: Any) -> Any: except json.JSONDecodeError: pass return {"json_valid": False} - - -if __name__ == "__main__": - scorer = JSONScorer() - print( - scorer.score( - '{"city": "San Francisco", "country": 
"USA", "column2": "Santiago"}' - ) - ) diff --git a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorer/llm_scorer.py index a20ff67525cc..7bcf9cf9af67 100644 --- a/weave/flow/scorer/llm_scorer.py +++ b/weave/flow/scorer/llm_scorer.py @@ -3,7 +3,7 @@ from pydantic import Field, field_validator from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm_utils import _LLM_CLIENT_TYPES, instructor_client +from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENTS class LLMScorer(Scorer): @@ -15,10 +15,10 @@ class LLMScorer(Scorer): model_id: str = Field(description="The model to use") @field_validator("client") - def validate_client(cls, v): - if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES): + def validate_client(cls, v): # type: ignore + if not isinstance(v, _LLM_CLIENTS): raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" + f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" ) return v @@ -38,9 +38,9 @@ class InstructorLLMScorer(Scorer): ) @field_validator("client") - def validate_client(cls, v): - if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES): + def validate_client(cls, v): # type: ignore + if not isinstance(v, _LLM_CLIENTS): raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}" + f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" ) return instructor_client(v) diff --git a/weave/flow/scorer/llm_utils.py b/weave/flow/scorer/llm_utils.py index dcae67267821..069134d1452a 100644 --- a/weave/flow/scorer/llm_utils.py +++ b/weave/flow/scorer/llm_utils.py @@ -1,4 +1,4 @@ -from typing import List, TypeVar, Union +from typing import TYPE_CHECKING, List, Optional, Union import instructor @@ -18,34 +18,17 @@ DEFAULT_MAX_TOKENS = 4096 - -_LLM_CLIENT_TYPES = [] - -try: - from openai import AsyncOpenAI, OpenAI - - _LLM_CLIENT_TYPES.append(OpenAI) - _LLM_CLIENT_TYPES.append(AsyncOpenAI) -except: - pass -try: +if TYPE_CHECKING: from anthropic import Anthropic, AsyncAnthropic - - _LLM_CLIENT_TYPES.append(Anthropic) - _LLM_CLIENT_TYPES.append(AsyncAnthropic) -except: - pass -try: from mistralai import Mistral + from openai import AsyncOpenAI, OpenAI - _LLM_CLIENT_TYPES.append(Mistral) -except: - pass - -_LLM_CLIENTS = TypeVar(Union[tuple(_LLM_CLIENT_TYPES)]) + _LLM_CLIENTS = Union[OpenAI, AsyncOpenAI, Anthropic, AsyncAnthropic, Mistral] +else: + _LLM_CLIENTS = object -def instructor_client(client: _LLM_CLIENTS): +def instructor_client(client: _LLM_CLIENTS) -> instructor.client: # type: ignore client_type = type(client).__name__.lower() if "mistral" in client_type: return instructor.from_mistral(client) @@ -57,26 +40,26 @@ def instructor_client(client: _LLM_CLIENTS): raise ValueError(f"Unsupported client type: {client_type}") -def create(client: _LLM_CLIENTS, *args, **kwargs): +def create(client: _LLM_CLIENTS, *args, **kwargs): # type: ignore return client.chat.completions.create(*args, **kwargs) def embed( client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs -) -> List[List[float]]: +) -> List[List[float]]: # type: ignore client_type = type(client).__name__.lower() if "mistral" in client_type: - response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) + response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) # type: ignore return [embedding.embedding for embedding in response.data] elif "openai" in client_type: - response = 
client.embeddings.create(model=model_id, input=texts, **kwargs) + response = client.embeddings.create(model=model_id, input=texts, **kwargs) # type: ignore return [embedding.embedding for embedding in response.data] else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") # Helper function for dynamic imports -def import_client(provider: str): +def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore try: if provider == "mistral": from mistralai import Mistral @@ -92,84 +75,3 @@ def import_client(provider: str): return anthropic.Anthropic except ImportError: return None - - -# Example usage: -if __name__ == "__main__": - import os - - # Mistral example - MistralClient = import_client("mistral") - if MistralClient: - mistral_client = instructor_client( - Mistral(api_key=os.environ.get("MISTRAL_API_KEY")) - ) - mistral_response = mistral_client.chat.completions.create( - messages=[{"role": "user", "content": "What is the best French cheese?"}], - model=MISTRAL_DEFAULT_MODEL, - max_tokens=DEFAULT_MAX_TOKENS, - response_model=str, - ) - print("Mistral response:", mistral_response) - - # OpenAI example with system message - OpenAIClient = import_client("openai") - if OpenAIClient: - openai_client = instructor_client( - OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")) - ) - openai_response = openai_client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "You are a helpful assistant specialized in writing poetry.", - }, - { - "role": "user", - "content": "Write a haiku about recursion in programming.", - }, - ], - model=OPENAI_DEFAULT_MODEL, - max_tokens=DEFAULT_MAX_TOKENS, - response_model=str, - ) - print("OpenAI response:", openai_response) - - # Anthropic example with system message - AnthropicClient = import_client("anthropic") - if AnthropicClient: - anthropic_client = instructor_client( - AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")) - ) - anthropic_response = anthropic_client.messages.create( - messages=[ - { - "role": "system", - "content": "You are Claude, an AI assistant created by Anthropic.", - }, - {"role": "user", "content": "Hello, Claude"}, - ], - model=ANTHROPIC_DEFAULT_MODEL, - max_tokens=DEFAULT_MAX_TOKENS, - response_model=str, - ) - print("Anthropic response:", anthropic_response) - - # Embedding example - if OpenAIClient: - openai_embed_client = OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")) - openai_embeddings = embed( - openai_embed_client, - OPENAI_DEFAULT_EMBEDDING_MODEL, - ["Embed this sentence.", "As well as this one."], - ) - print("OpenAI embeddings:", openai_embeddings) - - if MistralClient: - mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY")) - mistral_embeddings = embed( - mistral_embed_client, - MISTRAL_DEFAULT_EMBEDDING_MODEL, - ["Embed this sentence.", "As well as this one."], - ) - print("Mistral embeddings:", mistral_embeddings) diff --git a/weave/flow/scorer/moderation_scorer.py b/weave/flow/scorer/moderation_scorer.py index e116290bc0ae..51b92b9f85b7 100644 --- a/weave/flow/scorer/moderation_scorer.py +++ b/weave/flow/scorer/moderation_scorer.py @@ -10,7 +10,7 @@ class OpenAIModerationScorer(LLMScorer): """Use OpenAI moderation API to check if the model output is safe""" @field_validator("client") - def validate_openai_client(cls, v): + def validate_openai_client(cls, v): # type: ignore try: from openai import ( # Ensure these are the correct imports AsyncOpenAI, @@ -31,17 +31,3 @@ def score(self, output: Any) -> Any: 
).results[0] categories = {k: v for k, v in response.categories.dict().items() if v} return {"flagged": response.flagged, "categories": categories} - - -if __name__ == "__main__": - try: - import openai - - client = openai.OpenAI() - scorer = OpenAIModerationScorer( - client=client, model_id="omni-moderation-latest" - ) - print(scorer.score("I should kill someone")) - except Exception as e: - print("Error:", e) - raise e diff --git a/weave/flow/scorer/pydantic_scorer.py b/weave/flow/scorer/pydantic_scorer.py index c1cccf463b0a..90fdffd63788 100644 --- a/weave/flow/scorer/pydantic_scorer.py +++ b/weave/flow/scorer/pydantic_scorer.py @@ -10,32 +10,16 @@ class PydanticScorer(Scorer): model: Type[BaseModel] - def score(self, output: Any): + def score(self, output: Any) -> dict: # type: ignore if isinstance(output, str): try: self.model.model_validate_json(output) - return True + return {"valid_pydantic": True} except ValidationError: - return False + return {"valid_pydantic": False} else: try: self.model.model_validate(output) - return True + return {"valid_pydantic": True} except ValidationError: - return False - - -if __name__ == "__main__": - from pydantic import BaseModel - - class User(BaseModel): - name: str - age: int - - scorer = PydanticScorer(model=User) - - output = '{"name": "John", "age": 30}' - print(scorer.score(output)) - - output = {"name": "John", "age": 30} - print(scorer.score(output)) + return {"valid_pydantic": False} diff --git a/weave/flow/scorer/ragas_scorer.py b/weave/flow/scorer/ragas_scorer.py index 811d337fc6f6..6180697f59f4 100644 --- a/weave/flow/scorer/ragas_scorer.py +++ b/weave/flow/scorer/ragas_scorer.py @@ -43,12 +43,12 @@ def extract_entities(self, text: str) -> List[str]: return entities @weave.op - def score(self, output: str, context: str) -> float: + def score(self, output: str, context: str) -> dict: expected_entities = self.extract_entities(output) context_entities = self.extract_entities(context) # Calculate recall if not expected_entities: - return 0.0 + return {"recall": 0.0} matches = set(expected_entities) & set(context_entities) recall = len(matches) / len(expected_entities) return {"recall": recall} @@ -77,7 +77,7 @@ class ContextRelevancyScorer(InstructorLLMScorer): """) @weave.op - def score(self, output: str, context: str) -> float: + def score(self, output: str, context: str) -> dict: prompt = self.relevancy_prompt.format(question=output, context=context) response = create( self.client, @@ -86,50 +86,3 @@ def score(self, output: str, context: str) -> float: model=self.model_id, ) return {"relevancy_score": response.relevancy_score} - - -if __name__ == "__main__": - import os - - try: - from weave.flow.scorer.llm_utils import import_client - - # Instantiate your LLM client - OpenAIClient = import_client("openai") - if OpenAIClient: - llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"]) - else: - raise ImportError("OpenAI client not available") - - # Instantiate scorers - context_entity_recall_scorer = ContextEntityRecallScorer( - client=llm_client, - model_id="gpt-4o", - ) - context_relevancy_scorer = ContextRelevancyScorer( - client=llm_client, - model_id="gpt-4o", - ) - # Create your dataset of examples - examples = [ - { - "question": "What is the capital of France?", - "expected": "Paris", - "context": "Paris is the capital of France.", - }, - { - "question": "Who wrote 'To Kill a Mockingbird'?", - "expected": "Harper Lee", - "context": "Harper Lee is the author of 'To Kill a Mockingbird'.", - }, - # Add more examples as 
needed - ] - - for example in examples: - output = {"answer": example["expected"]} # Simulate model output - score = context_entity_recall_scorer.score(output, example) - print(f"Context Entity Recall Score: {score}") - score = context_relevancy_scorer.score(output, example) - print(f"Context Relevancy Score: {score}") - except Exception as e: - print(e) diff --git a/weave/flow/scorer/similarity_score.py b/weave/flow/scorer/similarity_score.py index 75aebe423e52..722c16e98c2a 100644 --- a/weave/flow/scorer/similarity_score.py +++ b/weave/flow/scorer/similarity_score.py @@ -42,35 +42,13 @@ def _compute_embeddings( embeddings = embed(self.client, self.model_id, [output, target]) return embeddings[0], embeddings[1] - def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float: + def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> dict: """Compute the cosine similarity between two vectors.""" - vec1 = np.array(vec1) - vec2 = np.array(vec2) - cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) + arr1 = np.array(vec1) + arr2 = np.array(vec2) + cosine_sim = np.dot(arr1, arr2) / (np.linalg.norm(arr1) * np.linalg.norm(arr2)) # TODO: check if this can be negative # cast to float score = float(cosine_sim) return {"similarity_score": score, "is_similar": score >= self.threshold} - - -if __name__ == "__main__": - try: - import os - - import openai - - client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]) - scorer = EmbeddingSimilarityScorer( - client=client, model_id="text-embedding-3-small", target_column="text" - ) - - dataset_row = {"text": "Whales are mammals that live in the ocean."} - print( - scorer.score( - output="Dolphins are animals that live in the sea.", - dataset_row=dataset_row, - ) - ) - except Exception as e: - print("Error running script:", e) diff --git a/weave/flow/scorer/string_scorer.py b/weave/flow/scorer/string_scorer.py index 8fcb59dae1c6..a34fa5619603 100644 --- a/weave/flow/scorer/string_scorer.py +++ b/weave/flow/scorer/string_scorer.py @@ -10,12 +10,12 @@ class StringMatchScorer(Scorer): """Scorer that checks if the model output string is found in the search columns of the dataset row.""" - def score(self, output: str, target: str) -> dict: + def score(self, output: str, target: str) -> dict: # type: ignore string_in_input = output.lower() in target.lower() return {"string_in_input": string_in_input} -class RegexScorer(Scorer): +class RegexScorer(Scorer): # type: ignore patterns: Union[str, list[str]] = Field( default_factory=list, description="The patterns or keywords to match" ) @@ -49,7 +49,8 @@ def score( text_to_search = output.get("output") if output else "" if self.ignore_whitespace: - text_to_search = "".join(text_to_search.split()) + if text_to_search: + text_to_search = "".join(text_to_search.split()) match_found = any( pattern.search(text_to_search) for pattern in compiled_patterns @@ -64,7 +65,7 @@ class LevenshteinScorer(Scorer): ) @model_validator(mode="after") - def check_levenshtein(self): + def check_levenshtein(self): # type: ignore try: from Levenshtein import distance @@ -78,29 +79,3 @@ def check_levenshtein(self): def score(self, output: str, target: str) -> dict: distance = self.distance(output, target) return {"levenshtein_distance": distance} - - -if __name__ == "__main__": - import asyncio - - match_scorer = StringMatchScorer(column_map={"target": "col1"}) - levenshtein_scorer = LevenshteinScorer(column_map={"target": "col2"}) - - @weave.op - def f(col1, col2): - return "Hello" - - 
dataset = [ - { - "col1": "Hello my name is Morgan", - "col2": "I am an engineer", - "target": "Morgan", - }, - {"col1": "Hello my name is John", "col2": "I am a doctor", "target": "John"}, - ] - - evaluation = weave.Evaluation( - dataset=dataset, scorers=[match_scorer, levenshtein_scorer] - ) - - eval_out = asyncio.run(evaluation.evaluate(f)) diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorer/summarization_scorer.py index a4a01144f810..ee43a7f48b10 100644 --- a/weave/flow/scorer/summarization_scorer.py +++ b/weave/flow/scorer/summarization_scorer.py @@ -1,5 +1,5 @@ from textwrap import dedent -from typing import List +from typing import Any, List from pydantic import BaseModel, Field @@ -42,59 +42,13 @@ def extract_entities(self, text: str) -> List[str]: return entities @weave.op - def score(self, input: str, output: str, **kwargs) -> float: + def score(self, input: str, output: str, **kwargs: Any) -> dict: # Extract entities output_entities = self.extract_entities(output) input_entities = self.extract_entities(input) # Calculate recall if not output_entities: - return 0.0 + return {"recall": 0.0} matches = set(output_entities) & set(input_entities) recall = len(matches) / len(input_entities) return {"recall": recall} - - -if __name__ == "__main__": - import asyncio - import os - - try: - from weave.flow.scorer.llm_utils import import_client - - # Instantiate your LLM client - OpenAIClient = import_client("openai") - if OpenAIClient: - llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"]) - else: - raise ImportError("OpenAI client not available") - - # Instantiate scorers - summarization_scorer = SummarizationScorer( - client=llm_client, model_id="gpt-4o", column_map={"text": "input"} - ) - - @weave.op - def f(summary: str): - return summary - - # Create your dataset of examples - examples = [ - { - "text": "Harry Potter is a wizard. He is friends with Ron Weasley. They all go to Hogwarts to learn magic. They have been doing this for years. 
Their enemy is Voldemort, a dark wizard who is trying to kill them.", - "summary": "Harry Potter, Ron Weasley, and Voldemort are wizards.", - "relevancy_score": 1, - }, - ] - evaluation = weave.Evaluation(dataset=examples, scorers=[summarization_scorer]) - asyncio.run(evaluation.evaluate(f)) - - # good naming: - def summarization_scorer2(text: str, output: str): - scorer = SummarizationScorer(client=llm_client, model_id="gpt-4o") - return scorer.score(input=text, output=output) - - evaluation = weave.Evaluation(dataset=examples, scorers=[summarization_scorer2]) - asyncio.run(evaluation.evaluate(f)) - - except Exception as e: - print(f"Error: {e}") diff --git a/weave/flow/scorer/utils.py b/weave/flow/scorer/utils.py index 8bd587ad8d41..175d6ac12d67 100644 --- a/weave/flow/scorer/utils.py +++ b/weave/flow/scorer/utils.py @@ -15,22 +15,3 @@ def stringify(output: Any) -> str: return output.model_dump_json(indent=2) else: raise ValueError(f"Unsupported model output type: {type(output)}") - - -if __name__ == "__main__": - # test - output = "hey" - print(stringify(output)) - - output = [1, 2, 3] - print(stringify(output)) - - output = {"a": 1, "b": 2} - print(stringify(output)) - - class TestModel(BaseModel): - a: int - b: str - - output = TestModel(a=1, b="test") - print(stringify(output)) diff --git a/weave/flow/scorer/xml_scorer.py b/weave/flow/scorer/xml_scorer.py index 31f202636bf3..7bd42516e699 100644 --- a/weave/flow/scorer/xml_scorer.py +++ b/weave/flow/scorer/xml_scorer.py @@ -7,7 +7,7 @@ class XMLScorer(Scorer): """Score an XML string.""" - def score(self, output: Union[str, dict]) -> dict: + def score(self, output: Union[str, dict]) -> dict: # type: ignore if isinstance(output, dict): xml_string = output.get("output", "") else: @@ -18,15 +18,3 @@ def score(self, output: Union[str, dict]) -> dict: return {"xml_valid": True} except ET.ParseError: return {"xml_valid": False} - - -if __name__ == "__main__": - scorer = XMLScorer() - print( - scorer.score( - """ - San Francisco - USA - """ - ) - ) From 73594694e34035417d61010a4cfcac24e062cccf Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Sat, 12 Oct 2024 12:55:26 +0200 Subject: [PATCH 042/150] missing distance --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0f2ada576f67..ca1ab49d09ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ litellm = ["litellm>=1.36.1"] llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] -scorers = ["openai>=1.0.0", "instructor>=1.5.2"] +scorers = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0"] notdiamond = ["notdiamond>=0.3.21"] openai = ["openai>=1.0.0"] modal = ["modal", "python-dotenv"] From 831db8539328fa1c8890c23edb325a619d404f76 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Sat, 12 Oct 2024 13:01:48 +0200 Subject: [PATCH 043/150] check instructor instal --- weave/flow/scorer/llm_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/weave/flow/scorer/llm_utils.py b/weave/flow/scorer/llm_utils.py index 069134d1452a..c2c1875b6d4c 100644 --- a/weave/flow/scorer/llm_utils.py +++ b/weave/flow/scorer/llm_utils.py @@ -1,7 +1,5 @@ from typing import TYPE_CHECKING, List, Optional, Union -import instructor - from weave.trace.autopatch import autopatch autopatch() # fix instrucor tracing @@ -19,6 +17,7 @@ DEFAULT_MAX_TOKENS = 4096 if TYPE_CHECKING: + import instructor from anthropic import Anthropic, AsyncAnthropic 
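With the inline `__main__` demos removed from the format scorers, a standalone sketch of how `JSONScorer` and `XMLScorer` are intended to be called (standard library only; import paths as they stand at this point in the series):

```python
from weave.flow.scorer.json_scorer import JSONScorer
from weave.flow.scorer.xml_scorer import XMLScorer

json_scorer = JSONScorer()
print(json_scorer.score('{"city": "San Francisco", "country": "USA"}'))
# -> {"json_valid": True}

xml_scorer = XMLScorer()
print(xml_scorer.score("<city>San Francisco</city>"))
# -> {"xml_valid": True}
```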
from mistralai import Mistral from openai import AsyncOpenAI, OpenAI @@ -28,7 +27,12 @@ _LLM_CLIENTS = object -def instructor_client(client: _LLM_CLIENTS) -> instructor.client: # type: ignore +def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ignore + try: + import instructor + except ImportError: + raise ImportError("We need instructor to use this the LLM-powered scorers") + client_type = type(client).__name__.lower() if "mistral" in client_type: return instructor.from_mistral(client) From 23dcd6a6fc9508abe208a15a8731d3d4f3c45178 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Sat, 12 Oct 2024 14:43:29 +0200 Subject: [PATCH 044/150] rename model_output -> output in tests --- tests/trace/test_client_trace.py | 2 +- tests/trace/test_evaluate.py | 22 +++++++++++----------- tests/trace/test_evaluation_performance.py | 4 ++-- tests/trace/test_weave_client.py | 14 +++++++------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/trace/test_client_trace.py b/tests/trace/test_client_trace.py index cde47f0f4f5d..716424554b8e 100644 --- a/tests/trace/test_client_trace.py +++ b/tests/trace/test_client_trace.py @@ -1443,7 +1443,7 @@ def test_named_reuse(client): dataset = weave.ref(d_ref.uri()).get() @weave.op() - async def dummy_score(model_output): + async def dummy_score(output): return 1 class SimpleModel(weave.Model): diff --git a/tests/trace/test_evaluate.py b/tests/trace/test_evaluate.py index f5ada25215f5..a07d9dd5d232 100644 --- a/tests/trace/test_evaluate.py +++ b/tests/trace/test_evaluate.py @@ -11,7 +11,7 @@ expected_eval_result = { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "score": {"true_count": 1, "true_fraction": 0.5}, "model_latency": {"mean": pytest.approx(0, abs=1)}, } @@ -24,8 +24,8 @@ async def predict(self, input) -> str: @weave.op() -def score(target, model_output): - return target == model_output +def score(target, output): + return target == output @weave.op() @@ -57,7 +57,7 @@ async def model_predict(input, target) -> str: ) result = asyncio.run(evaluation.evaluate(model_predict)) assert result == { - "model_output": {"mean": 18.5}, + "output": {"mean": 18.5}, "score": {"true_count": 0, "true_fraction": 0.0}, "model_latency": { "mean": pytest.approx(0, abs=1), @@ -111,8 +111,8 @@ async def infer(self, input) -> str: def test_score_as_class(client): class MyScorer(weave.Scorer): @weave.op() - def score(self, target, model_output): - return target == model_output + def score(self, target, output): + return target == output evaluation = Evaluation( dataset=dataset_rows, @@ -121,7 +121,7 @@ def score(self, target, model_output): model = EvalModel() result = asyncio.run(evaluation.evaluate(model)) assert result == { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "MyScorer": {"true_count": 1, "true_fraction": 0.5}, "model_latency": { "mean": pytest.approx(0, abs=1), @@ -137,8 +137,8 @@ def summarize(self, score_rows): return {"awesome": 3} @weave.op() - def score(self, target, model_output): - return target == model_output + def score(self, target, output): + return target == output evaluation = Evaluation( dataset=dataset_rows, @@ -147,7 +147,7 @@ def score(self, target, model_output): model = EvalModel() result = asyncio.run(evaluation.evaluate(model)) assert result == { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "MyScorer": {"awesome": 3}, "model_latency": { "mean": pytest.approx(0, abs=1), @@ -167,7 +167,7 @@ def return_pred(pred): result = 
asyncio.run(evaluation.evaluate(return_pred)) assert result == { - "model_output": { + "output": { "a": {"true_count": 1, "true_fraction": 1.0}, "b": {"true_count": 0, "true_fraction": 0.0}, }, diff --git a/tests/trace/test_evaluation_performance.py b/tests/trace/test_evaluation_performance.py index bcc36c2ebca7..17f01192f26b 100644 --- a/tests/trace/test_evaluation_performance.py +++ b/tests/trace/test_evaluation_performance.py @@ -91,8 +91,8 @@ def predict(question: str): return "I don't know" @weave.op() - def score(question: str, expected: str, model_output: str): - return model_output == expected + def score(question: str, expected: str, output: str): + return output == expected evaluation = weave.Evaluation( name="My Evaluation", diff --git a/tests/trace/test_weave_client.py b/tests/trace/test_weave_client.py index a5eb8a49bc33..a746a871a9b1 100644 --- a/tests/trace/test_weave_client.py +++ b/tests/trace/test_weave_client.py @@ -393,8 +393,8 @@ async def model_predict(input) -> str: dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] @weave.op() - async def score(target, model_output): - return target == model_output + async def score(target, output): + return target == output evaluation = Evaluation( name="my-eval", @@ -747,8 +747,8 @@ async def model_predict(input) -> str: dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] @weave.op() - async def score(target, model_output): - return target == model_output + async def score(target, output): + return target == output evaluation = Evaluation( name="my-eval", @@ -757,7 +757,7 @@ async def score(target, model_output): ) result = asyncio.run(evaluation.evaluate(model_predict)) expected_eval_result = { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "score": {"true_count": 1, "true_fraction": 0.5}, } assert result == expected_eval_result @@ -857,8 +857,8 @@ def test_nested_ref_is_inner(client): dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] @weave.op() - async def score(target, model_output): - return target == model_output + async def score(target, output): + return target == output evaluation = Evaluation( name="my-eval", From 343b86d6cad95829a8385837d3e94e1895dd9500 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Sat, 12 Oct 2024 14:51:51 +0200 Subject: [PATCH 045/150] wrong test path --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 2180ef9dbf98..32132813d035 100644 --- a/noxfile.py +++ b/noxfile.py @@ -65,7 +65,7 @@ def tests(session, shard): "trace_server": ["trace_server/"], "mistral0": ["integrations/mistral/v0/"], "mistral1": ["integrations/mistral/v1/"], - "scorers": ["flow/"], + "scorers": ["scorers/"], } test_dirs = test_dirs_dict.get(shard, default_test_dirs) From 6a0abafff42f8aedbe5ee87259eb940adbaa9cd1 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Sat, 12 Oct 2024 15:15:29 +0200 Subject: [PATCH 046/150] don't mock oai --- tests/scorers/test_hallucination_scorer.py | 7 +------ tests/scorers/test_ragas_scorer.py | 9 ++------- tests/scorers/test_summarization_scorer.py | 6 +----- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 89f26ac3b7ff..b486467f7171 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -6,11 +6,6 @@ HallucinationScorer, ) - -# Mock the OpenAI client -class 
MockOpenAI(OpenAI): - pass - # mock the create function @pytest.fixture def mock_create(monkeypatch): @@ -23,7 +18,7 @@ def _mock_create(*args, **kwargs): @pytest.fixture def hallucination_scorer(mock_create): - return HallucinationScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=4096) + return HallucinationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=4096) def test_hallucination_scorer_initialization(hallucination_scorer): assert isinstance(hallucination_scorer, HallucinationScorer) diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 77af51cf5193..108db4f69f08 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -8,11 +8,6 @@ RelevancyResponse, ) - -# Mock the OpenAI client -class MockOpenAI(OpenAI): - pass - # Mock the create function @pytest.fixture def mock_create(monkeypatch): @@ -32,11 +27,11 @@ def _mock_create(*args, **kwargs): @pytest.fixture def context_entity_recall_scorer(mock_create): - return ContextEntityRecallScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextEntityRecallScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) @pytest.fixture def context_relevancy_scorer(mock_create): - return ContextRelevancyScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextRelevancyScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer): assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer) diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index e99a32e15337..4534056ecf84 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -7,10 +7,6 @@ ) -# Mock the OpenAI client -class MockOpenAI(OpenAI): - pass - # mock the create function @pytest.fixture def mock_create(monkeypatch): @@ -22,7 +18,7 @@ def _mock_create(*args, **kwargs): @pytest.fixture def summarization_scorer(mock_create): - return SummarizationScorer(client=MockOpenAI(), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return SummarizationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) def test_summarization_scorer_initialization(summarization_scorer, mock_create): assert isinstance(summarization_scorer, SummarizationScorer) From a4920cf59319104214acd7c3ab6056c07a46e3dd Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 15:24:53 +0100 Subject: [PATCH 047/150] Update from scorer to scorers dir, small tidy ups --- examples/text-extract/evaluate.py | 2 +- examples/tutorial_scripts/05_eval_pipeline.py | 2 +- examples/tutorial_scripts/06_eval_pipeline_all.py | 2 +- tests/trace/test_evaluate.py | 2 +- tests/trace/test_evaluations.py | 2 +- weave/__init__.py | 14 ++++++++++++++ weave/flow/eval.py | 2 +- weave/flow/{scorer => scorers}/__init__.py | 6 +++++- weave/flow/{scorer => scorers}/base_scorer.py | 0 .../{scorer => scorers}/classification_scorer.py | 0 .../{scorer => scorers}/hallucination_scorer.py | 0 weave/flow/{scorer => scorers}/json_scorer.py | 0 weave/flow/{scorer => scorers}/llm_scorer.py | 0 weave/flow/{scorer => scorers}/llm_utils.py | 0 .../flow/{scorer => scorers}/moderation_scorer.py | 0 weave/flow/{scorer => 
scorers}/pydantic_scorer.py | 0 weave/flow/{scorer => scorers}/ragas_scorer.py | 0 weave/flow/{scorer => scorers}/similarity_score.py | 0 weave/flow/{scorer => scorers}/string_scorer.py | 0 .../{scorer => scorers}/summarization_scorer.py | 0 weave/flow/{scorer => scorers}/utils.py | 0 weave/flow/{scorer => scorers}/xml_scorer.py | 0 22 files changed, 25 insertions(+), 7 deletions(-) rename weave/flow/{scorer => scorers}/__init__.py (92%) rename weave/flow/{scorer => scorers}/base_scorer.py (100%) rename weave/flow/{scorer => scorers}/classification_scorer.py (100%) rename weave/flow/{scorer => scorers}/hallucination_scorer.py (100%) rename weave/flow/{scorer => scorers}/json_scorer.py (100%) rename weave/flow/{scorer => scorers}/llm_scorer.py (100%) rename weave/flow/{scorer => scorers}/llm_utils.py (100%) rename weave/flow/{scorer => scorers}/moderation_scorer.py (100%) rename weave/flow/{scorer => scorers}/pydantic_scorer.py (100%) rename weave/flow/{scorer => scorers}/ragas_scorer.py (100%) rename weave/flow/{scorer => scorers}/similarity_score.py (100%) rename weave/flow/{scorer => scorers}/string_scorer.py (100%) rename weave/flow/{scorer => scorers}/summarization_scorer.py (100%) rename weave/flow/{scorer => scorers}/utils.py (100%) rename weave/flow/{scorer => scorers}/xml_scorer.py (100%) diff --git a/examples/text-extract/evaluate.py b/examples/text-extract/evaluate.py index abb292b198ee..357f101e387c 100644 --- a/examples/text-extract/evaluate.py +++ b/examples/text-extract/evaluate.py @@ -6,7 +6,7 @@ import openai import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 class TextExtractModel(weave.Model): diff --git a/examples/tutorial_scripts/05_eval_pipeline.py b/examples/tutorial_scripts/05_eval_pipeline.py index ccb14126a03e..0a6a5baf9ab8 100644 --- a/examples/tutorial_scripts/05_eval_pipeline.py +++ b/examples/tutorial_scripts/05_eval_pipeline.py @@ -60,7 +60,7 @@ async def predict(self, sentence: str) -> dict: ] import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 @weave.op() diff --git a/examples/tutorial_scripts/06_eval_pipeline_all.py b/examples/tutorial_scripts/06_eval_pipeline_all.py index 6be10f08a440..0d5fe8fd3b2e 100644 --- a/examples/tutorial_scripts/06_eval_pipeline_all.py +++ b/examples/tutorial_scripts/06_eval_pipeline_all.py @@ -4,7 +4,7 @@ import openai import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 # We create a model class with one predict function. # All inputs, predictions and parameters are automatically captured for easy inspection. 
diff --git a/tests/trace/test_evaluate.py b/tests/trace/test_evaluate.py index f5ada25215f5..ccb64b2da61e 100644 --- a/tests/trace/test_evaluate.py +++ b/tests/trace/test_evaluate.py @@ -4,7 +4,7 @@ import weave from weave import Dataset, Evaluation, Model -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] dataset = Dataset(rows=dataset_rows) diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 92bd75f97002..49f8426cc655 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -9,7 +9,7 @@ import weave from tests.trace.util import AnyIntMatcher from weave import Evaluation, Model -from weave.flow.scorer import Scorer +from weave.scorers import Scorer from weave.trace_server import trace_server_interface as tsi diff --git a/weave/__init__.py b/weave/__init__.py index 3b54ba971762..7cf5b49de48d 100644 --- a/weave/__init__.py +++ b/weave/__init__.py @@ -15,6 +15,20 @@ from weave.trace.util import Thread as Thread from weave.trace.util import ThreadPoolExecutor as ThreadPoolExecutor +from typing import TYPE_CHECKING + +# Helper for IDEs +if TYPE_CHECKING: + from weave.flow import scorers + +# Lazy import for the scorers module +def __getattr__(name): + if name == "scorers": + from weave.flow import scorers + globals()["scorers"] = scorers + return scorers + raise AttributeError(f"module {__name__} has no attribute {name}") + # Special object informing doc generation tooling which symbols # to document & to associate with this module. __docspec__ = [ diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 8ed5b24d1e37..9a807d47f986 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -13,7 +13,7 @@ from weave.flow.dataset import Dataset from weave.flow.model import Model, get_infer_method from weave.flow.obj import Object -from weave.flow.scorer import ( +from weave.scorers import ( Scorer, auto_summarize, get_scorer_attributes, diff --git a/weave/flow/scorer/__init__.py b/weave/flow/scorers/__init__.py similarity index 92% rename from weave/flow/scorer/__init__.py rename to weave/flow/scorers/__init__.py index d5702549ef73..68c423eea3c5 100644 --- a/weave/flow/scorer/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -1,4 +1,8 @@ -from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes +from weave.flow.scorer.base_scorer import ( + Scorer, + auto_summarize, + get_scorer_attributes, +) from weave.flow.scorer.classification_scorer import ( MultiTaskBinaryClassificationF1, transpose, diff --git a/weave/flow/scorer/base_scorer.py b/weave/flow/scorers/base_scorer.py similarity index 100% rename from weave/flow/scorer/base_scorer.py rename to weave/flow/scorers/base_scorer.py diff --git a/weave/flow/scorer/classification_scorer.py b/weave/flow/scorers/classification_scorer.py similarity index 100% rename from weave/flow/scorer/classification_scorer.py rename to weave/flow/scorers/classification_scorer.py diff --git a/weave/flow/scorer/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py similarity index 100% rename from weave/flow/scorer/hallucination_scorer.py rename to weave/flow/scorers/hallucination_scorer.py diff --git a/weave/flow/scorer/json_scorer.py b/weave/flow/scorers/json_scorer.py similarity index 100% rename from weave/flow/scorer/json_scorer.py rename to weave/flow/scorers/json_scorer.py diff --git 
a/weave/flow/scorer/llm_scorer.py b/weave/flow/scorers/llm_scorer.py similarity index 100% rename from weave/flow/scorer/llm_scorer.py rename to weave/flow/scorers/llm_scorer.py diff --git a/weave/flow/scorer/llm_utils.py b/weave/flow/scorers/llm_utils.py similarity index 100% rename from weave/flow/scorer/llm_utils.py rename to weave/flow/scorers/llm_utils.py diff --git a/weave/flow/scorer/moderation_scorer.py b/weave/flow/scorers/moderation_scorer.py similarity index 100% rename from weave/flow/scorer/moderation_scorer.py rename to weave/flow/scorers/moderation_scorer.py diff --git a/weave/flow/scorer/pydantic_scorer.py b/weave/flow/scorers/pydantic_scorer.py similarity index 100% rename from weave/flow/scorer/pydantic_scorer.py rename to weave/flow/scorers/pydantic_scorer.py diff --git a/weave/flow/scorer/ragas_scorer.py b/weave/flow/scorers/ragas_scorer.py similarity index 100% rename from weave/flow/scorer/ragas_scorer.py rename to weave/flow/scorers/ragas_scorer.py diff --git a/weave/flow/scorer/similarity_score.py b/weave/flow/scorers/similarity_score.py similarity index 100% rename from weave/flow/scorer/similarity_score.py rename to weave/flow/scorers/similarity_score.py diff --git a/weave/flow/scorer/string_scorer.py b/weave/flow/scorers/string_scorer.py similarity index 100% rename from weave/flow/scorer/string_scorer.py rename to weave/flow/scorers/string_scorer.py diff --git a/weave/flow/scorer/summarization_scorer.py b/weave/flow/scorers/summarization_scorer.py similarity index 100% rename from weave/flow/scorer/summarization_scorer.py rename to weave/flow/scorers/summarization_scorer.py diff --git a/weave/flow/scorer/utils.py b/weave/flow/scorers/utils.py similarity index 100% rename from weave/flow/scorer/utils.py rename to weave/flow/scorers/utils.py diff --git a/weave/flow/scorer/xml_scorer.py b/weave/flow/scorers/xml_scorer.py similarity index 100% rename from weave/flow/scorer/xml_scorer.py rename to weave/flow/scorers/xml_scorer.py From 2114c4fb423ebe6dd2d7ba1308887f3a76d80cd5 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 15:29:45 +0100 Subject: [PATCH 048/150] re-order llms from most popular to least --- weave/flow/scorers/llm_utils.py | 35 +++++++++++++++++---------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index c2c1875b6d4c..e6bce53f8dbc 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -2,14 +2,14 @@ from weave.trace.autopatch import autopatch -autopatch() # fix instrucor tracing +autopatch() # ensure both weave patching and instructor patching are applied # TODO: Gemini OPENAI_DEFAULT_MODEL = "gpt-4o" OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" -ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20240620" +ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet" MISTRAL_DEFAULT_MODEL = "mistral-large-latest" MISTRAL_DEFAULT_EMBEDDING_MODEL = "mistral-embed" @@ -31,15 +31,18 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign try: import instructor except ImportError: - raise ImportError("We need instructor to use this the LLM-powered scorers") + raise ImportError( + "The `instructor` package is required to use LLM-powered scorers, please run `pip install instructor`" + ) client_type = type(client).__name__.lower() - if "mistral" in client_type: - return instructor.from_mistral(client) - elif "openai" in client_type: + + if "openai" in client_type: return instructor.from_openai(client) 
elif "anthropic" in client_type: return instructor.from_anthropic(client) + elif "mistral" in client_type: + return instructor.from_mistral(client) else: raise ValueError(f"Unsupported client type: {client_type}") @@ -52,12 +55,13 @@ def embed( client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs ) -> List[List[float]]: # type: ignore client_type = type(client).__name__.lower() - if "mistral" in client_type: - response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) # type: ignore - return [embedding.embedding for embedding in response.data] - elif "openai" in client_type: + if "openai" in client_type: response = client.embeddings.create(model=model_id, input=texts, **kwargs) # type: ignore return [embedding.embedding for embedding in response.data] + elif "mistral" in client_type: + response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) # type: ignore + return [embedding.embedding for embedding in response.data] + else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") @@ -65,17 +69,14 @@ def embed( # Helper function for dynamic imports def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore try: - if provider == "mistral": - from mistralai import Mistral - - return Mistral - elif provider == "openai": + if provider == "openai": from openai import OpenAI - return OpenAI elif provider == "anthropic": import anthropic - return anthropic.Anthropic + elif provider == "mistral": + from mistralai import Mistral + return Mistral except ImportError: return None From 0b2bbf2f05e053f653b9ff4dad73c3bd34eae34f Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 22:47:35 +0100 Subject: [PATCH 049/150] feat(weave): fixes tests, summarization scorer re-write, re-names flow/scorer dir, create weave/scorers dir --- docs/docs/guides/integrations/langchain.md | 2 +- docs/docs/tutorial-eval.md | 4 +- docs/docs/tutorial-rag.md | 2 +- tests/scorers/test_hallucination_scorer.py | 42 ++++- tests/scorers/test_json_scorer.py | 28 ++-- tests/scorers/test_pydantic_scorer.py | 13 +- tests/scorers/test_ragas_scorer.py | 36 +++- tests/scorers/test_string_scorer.py | 9 +- tests/scorers/test_summarization_scorer.py | 74 +++++--- tests/trace/test_evaluations.py | 35 ++-- weave/__init__.py | 14 -- weave/flow/eval.py | 35 ++-- weave/flow/scorers/__init__.py | 48 +++--- weave/flow/scorers/classification_scorer.py | 2 +- weave/flow/scorers/hallucination_scorer.py | 103 ++++++++++-- weave/flow/scorers/json_scorer.py | 4 +- weave/flow/scorers/llm_scorer.py | 14 +- weave/flow/scorers/llm_utils.py | 19 ++- weave/flow/scorers/moderation_scorer.py | 2 +- weave/flow/scorers/pydantic_scorer.py | 2 +- weave/flow/scorers/ragas_scorer.py | 4 +- weave/flow/scorers/similarity_score.py | 4 +- weave/flow/scorers/string_scorer.py | 5 +- weave/flow/scorers/summarization_scorer.py | 177 +++++++++++++++++--- weave/flow/scorers/xml_scorer.py | 4 +- weave/scorers/__init__.py | 1 + 26 files changed, 505 insertions(+), 178 deletions(-) create mode 100644 weave/scorers/__init__.py diff --git a/docs/docs/guides/integrations/langchain.md b/docs/docs/guides/integrations/langchain.md index b382e793e705..4487a85dfd4b 100644 --- a/docs/docs/guides/integrations/langchain.md +++ b/docs/docs/guides/integrations/langchain.md @@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. 
By using the [`weav ```python -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 sentences = [ "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md index 929d29ab56a9..0705443b1f0e 100644 --- a/docs/docs/tutorial-eval.md +++ b/docs/docs/tutorial-eval.md @@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used ```python import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 weave.init('intro-example') @@ -132,7 +132,7 @@ import asyncio # highlight-next-line import weave # highlight-next-line -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 import openai # We create a model class with one predict function. diff --git a/docs/docs/tutorial-rag.md b/docs/docs/tutorial-rag.md index 81cf3b2b9b2e..466d63265499 100644 --- a/docs/docs/tutorial-rag.md +++ b/docs/docs/tutorial-rag.md @@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple: ```python -from weave.flow.scorer import Scorer +from weave.scorers import Scorer from weave import WeaveList class CorrectnessLLMJudge(Scorer): diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index b486467f7171..6cae7d157d85 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -1,24 +1,43 @@ import pytest from openai import OpenAI -from weave.flow.scorer.hallucination_scorer import ( +from weave.flow.scorers.hallucination_scorer import ( + HallucinationReasoning, HallucinationResponse, +) +from weave.scorers import ( HallucinationScorer, ) + # mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): return HallucinationResponse( chain_of_thought="The output is consistent with the input data.", - is_hallucination=False + hallucination_reasonings=[ + HallucinationReasoning( + observation="My observation for this is that the output is consistent with the input data.", + hallucination_type="No Hallucination", + ) + ], + conclusion="The output is consistent with the input data.", + is_hallucination=False, ) - monkeypatch.setattr('weave.flow.scorer.hallucination_scorer.create', _mock_create) + + monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create) + @pytest.fixture def hallucination_scorer(mock_create): - return HallucinationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=4096) + return HallucinationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=4096, + ) + def test_hallucination_scorer_initialization(hallucination_scorer): assert isinstance(hallucination_scorer, HallucinationScorer) @@ -26,12 +45,19 @@ def test_hallucination_scorer_initialization(hallucination_scorer): assert hallucination_scorer.temperature == 0.7 assert hallucination_scorer.max_tokens == 4096 + def test_hallucination_scorer_score(hallucination_scorer, mock_create): output = "John's favorite cheese is cheddar." context = "John likes various types of cheese." 
- result = hallucination_scorer.score(output, context) + result = hallucination_scorer.score(output=output, context=context) assert isinstance(result, HallucinationResponse) assert not result.is_hallucination - assert "The output is consistent with the input data." == result.chain_of_thought - -# Add more tests as needed + assert isinstance(result.hallucination_reasonings, list) + assert isinstance(result.hallucination_reasonings[0], HallucinationReasoning) + assert result.chain_of_thought == "The output is consistent with the input data." + assert ( + result.hallucination_reasonings[0].observation + == "My observation for this is that the output is consistent with the input data." + ) + assert result.conclusion == "The output is consistent with the input data." + assert result.hallucination_reasonings[0].hallucination_type == "No Hallucination" diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py index 443e8d108728..6cd1cf480cff 100644 --- a/tests/scorers/test_json_scorer.py +++ b/tests/scorers/test_json_scorer.py @@ -1,44 +1,50 @@ -from weave.flow.scorer.json_scorer import JSONScorer +from weave.scorers import ValidJSONScorer def test_json_scorer_valid_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"city": "San Francisco", "country": "USA"}' result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_invalid_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"city": "San Francisco", "country": "USA"' result = scorer.score(output) assert result["json_valid"] is False + def test_json_scorer_non_json_string(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = "Just a plain string." result = scorer.score(output) assert result["json_valid"] is False + def test_json_scorer_valid_json_list(): - scorer = JSONScorer() - output = '[1, 2, 3, 4, 5]' + scorer = ValidJSONScorer() + output = "[1, 2, 3, 4, 5]" result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_nested_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"person": {"name": "John", "age": 30}, "city": "New York"}' result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_empty_object(): - scorer = JSONScorer() - output = '{}' + scorer = ValidJSONScorer() + output = "{}" result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_empty_list(): - scorer = JSONScorer() - output = '[]' + scorer = ValidJSONScorer() + output = "[]" result = scorer.score(output) assert result["json_valid"] is True diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py index 1a4112fc3f65..f9953ba6abd9 100644 --- a/tests/scorers/test_pydantic_scorer.py +++ b/tests/scorers/test_pydantic_scorer.py @@ -1,46 +1,55 @@ import pytest from pydantic import BaseModel -from weave.flow.scorer.pydantic_scorer import PydanticScorer +from weave.scorers import PydanticScorer class User(BaseModel): name: str age: int + @pytest.fixture def user_scorer(): return PydanticScorer(model=User) + def test_pydantic_scorer_initialization(): scorer = PydanticScorer(model=User) assert isinstance(scorer, PydanticScorer) assert scorer.model == User + def test_pydantic_scorer_valid_json_string(user_scorer): valid_json = '{"name": "John", "age": 30}' assert user_scorer.score(valid_json) == {"valid_pydantic": True} + def test_pydantic_scorer_valid_dict(user_scorer): valid_dict = {"name": "John", "age": 30} assert 
user_scorer.score(valid_dict) == {"valid_pydantic": True} + def test_pydantic_scorer_invalid_json_string(user_scorer): invalid_json = '{"name": "John", "age": "thirty"}' assert user_scorer.score(invalid_json) == {"valid_pydantic": False} + def test_pydantic_scorer_invalid_dict(user_scorer): invalid_dict = {"name": "John", "age": "thirty"} assert user_scorer.score(invalid_dict) == {"valid_pydantic": False} + def test_pydantic_scorer_missing_field(user_scorer): missing_field = '{"name": "John"}' assert user_scorer.score(missing_field) == {"valid_pydantic": False} + def test_pydantic_scorer_extra_field(user_scorer): extra_field = '{"name": "John", "age": 30, "city": "New York"}' assert user_scorer.score(extra_field) == {"valid_pydantic": True} + def test_pydantic_scorer_invalid_input_type(user_scorer): invalid_input = 123 # Neither a string nor a dict - assert user_scorer.score(invalid_input) == {"valid_pydantic": False} \ No newline at end of file + assert user_scorer.score(invalid_input) == {"valid_pydantic": False} diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 108db4f69f08..2144200d809f 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -1,42 +1,60 @@ import pytest from openai import OpenAI -from weave.flow.scorer.ragas_scorer import ( - ContextEntityRecallScorer, - ContextRelevancyScorer, +from weave.flow.scorers.ragas_scorer import ( EntityExtractionResponse, RelevancyResponse, ) +from weave.scorers import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, +) + # Mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): # Retrieve the response_model to return appropriate mock responses - response_model = kwargs.get('response_model') + response_model = kwargs.get("response_model") if response_model == EntityExtractionResponse: return EntityExtractionResponse(entities=["Paris"]) elif response_model == RelevancyResponse: return RelevancyResponse( reasoning="The context directly answers the question.", - relevancy_score=1 + relevancy_score=1, ) else: return None - monkeypatch.setattr('weave.flow.scorer.ragas_scorer.create', _mock_create) + + monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create) + @pytest.fixture def context_entity_recall_scorer(mock_create): - return ContextEntityRecallScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextEntityRecallScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + @pytest.fixture def context_relevancy_scorer(mock_create): - return ContextRelevancyScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextRelevancyScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer): assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer) assert context_entity_recall_scorer.model_id == "gpt-4o" + def test_context_entity_recall_scorer_score(context_entity_recall_scorer): output = "Paris is the capital of France." context = "The capital city of France is Paris." 
@@ -45,10 +63,12 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer): assert "recall" in result assert result["recall"] == 1.0 # Assuming full recall in mock response + def test_context_relevancy_scorer_initialization(context_relevancy_scorer): assert isinstance(context_relevancy_scorer, ContextRelevancyScorer) assert context_relevancy_scorer.model_id == "gpt-4o" + def test_context_relevancy_scorer_score(context_relevancy_scorer): output = "What is the capital of France?" context = "Paris is the capital city of France." diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index 3c460cb04dba..dfa05daf7e89 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -1,4 +1,4 @@ -from weave.flow.scorer.string_scorer import ( +from weave.scorers import ( LevenshteinScorer, StringMatchScorer, ) @@ -11,6 +11,7 @@ def test_string_match_scorer(): result = scorer.score(output, target) assert result["string_in_input"] is True + def test_string_match_scorer_false(): scorer = StringMatchScorer() output = "Alice" @@ -18,6 +19,7 @@ def test_string_match_scorer_false(): result = scorer.score(output, target) assert result["string_in_input"] is False + # def test_regex_scorer(): # scorer = RegexScorer(patterns="engineer") # output = "I am an engineer" @@ -36,6 +38,7 @@ def test_string_match_scorer_false(): # result = scorer.score(output) # assert result["string_match"] is False + def test_levenshtein_scorer(): scorer = LevenshteinScorer() output = "Hello" @@ -43,6 +46,7 @@ def test_levenshtein_scorer(): result = scorer.score(output, target) assert result["levenshtein_distance"] == 1 + def test_levenshtein_scorer_same_strings(): scorer = LevenshteinScorer() output = "Hello" @@ -50,9 +54,10 @@ def test_levenshtein_scorer_same_strings(): result = scorer.score(output, target) assert result["levenshtein_distance"] == 0 + def test_levenshtein_scorer_completely_different(): scorer = LevenshteinScorer() output = "Hello" target = "World" result = scorer.score(output, target) - assert result["levenshtein_distance"] == 4 \ No newline at end of file + assert result["levenshtein_distance"] == 4 diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 4534056ecf84..60b026b30806 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -1,45 +1,81 @@ import pytest from openai import OpenAI -from weave.flow.scorer.summarization_scorer import ( +from weave.flow.scorers.summarization_scorer import ( EntityExtractionResponse, + SummarizationEvaluationResponse, +) +from weave.scorers import ( SummarizationScorer, ) -# mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): - return EntityExtractionResponse( - entities=["entity1", "entity2"] - ) - monkeypatch.setattr('weave.flow.scorer.summarization_scorer.create', _mock_create) + response_model = kwargs.get("response_model") + if response_model == EntityExtractionResponse: + return EntityExtractionResponse(entities=["entity1", "entity2"]) + elif response_model == SummarizationEvaluationResponse: + return SummarizationEvaluationResponse( + think_step_by_step="This is some reasoning.", + summarization_evaluation="excellent", + ) + else: + return None + + # Patch the 'create' function wherever it is called + monkeypatch.setattr("weave.flow.scorers.summarization_scorer.create", _mock_create) + @pytest.fixture def 
summarization_scorer(mock_create): - return SummarizationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return SummarizationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + -def test_summarization_scorer_initialization(summarization_scorer, mock_create): +def test_summarization_scorer_evaluate_summary(summarization_scorer, mock_create): + input_text = "This is the original text." + summary_text = "This is the summary." + result = summarization_scorer.evaluate_summary( + input=input_text, summary=summary_text + ) + assert isinstance(result, SummarizationEvaluationResponse) + assert result.summarization_evaluation == "excellent" + assert result.think_step_by_step == "This is some reasoning." + + +@pytest.mark.asyncio +async def test_summarization_scorer_score(summarization_scorer): + input_text = "This is the original text." + output_text = "This is the summary." + result = await summarization_scorer.score(input=input_text, output=output_text) + assert isinstance(result, dict) + assert "summarization_eval_score" in result + assert result["summarization_eval_score"] == 1.0 # "excellent" maps to 1.0 + assert "llm_eval_reasoning" in result + assert result["llm_eval_reasoning"] == "This is some reasoning." + assert "is_entity_dense" in result + assert isinstance(result["is_entity_dense"], bool) + assert "entity_density" in result + assert isinstance(result["entity_density"], float) + + +def test_summarization_scorer_initialization(summarization_scorer): assert isinstance(summarization_scorer, SummarizationScorer) assert summarization_scorer.model_id == "gpt-4o" assert summarization_scorer.temperature == 0.7 assert summarization_scorer.max_tokens == 1024 -def test_summarization_scorer_extract_entities(summarization_scorer, mock_create): + +def test_summarization_scorer_extract_entities(summarization_scorer): text = "This is a sample text with entities." entities = summarization_scorer.extract_entities(text) assert isinstance(entities, list) assert len(entities) == 2 assert "entity1" in entities assert "entity2" in entities - -def test_summarization_scorer_score(summarization_scorer): - input_text = "This is the original text with entities." - output_text = "This is a summary with some entities." 
- result = summarization_scorer.score(input=input_text, output=output_text) - assert isinstance(result, dict) - assert "recall" in result - assert 0 <= result["recall"] <= 1 - -# Add more tests as needed diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 49f8426cc655..16c73aed5b33 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -749,17 +749,15 @@ def function_score(image, dc, model, obj, text, output) -> bool: assert "file_content_read" in access_log - @pytest.mark.asyncio async def test_evaluation_with_column_map(): - # Define a dummy scorer that uses column_map class DummyScorer(Scorer): @weave.op() def score(self, foo: str, bar: str, output: str, target: str) -> dict: # Return whether foo + bar equals output return {"match": (foo + bar) == output == target} - + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col2' dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) @@ -781,12 +779,13 @@ def model_function(col1, col2): eval_out = await evaluation.evaluate(model_function) # Check that 'DummyScorer' is in the results - assert 'DummyScorer' in eval_out + assert "DummyScorer" in eval_out # The expected summary should show that 3 out of 4 predictions matched expected_results = {"true_count": 3, "true_fraction": 0.75} - assert eval_out['DummyScorer']["match"] == expected_results, "The summary should reflect the correct number of matches" - + assert ( + eval_out["DummyScorer"]["match"] == expected_results + ), "The summary should reflect the correct number of matches" # Define another dummy scorer @@ -799,16 +798,20 @@ class DummyScorer(Scorer): def score(self, foo: str, bar: str, output: str, target: str) -> dict: # Return whether foo + bar equals output return {"match": (foo + bar) == output == target} + class AnotherDummyScorer(Scorer): @weave.op() def score(self, input1: str, input2: str, output: str) -> dict: # Return whether input1 == output reversed return {"match": input1 == output[::-1]} + # First scorer maps 'foo'->'col1', 'bar'->'col2' dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) # Second scorer maps 'input1'->'col2', 'input2'->'col1' - another_dummy_scorer = AnotherDummyScorer(column_map={"input1": "col2", "input2": "col1"}) + another_dummy_scorer = AnotherDummyScorer( + column_map={"input1": "col2", "input2": "col1"} + ) @weave.op() def model_function(col1, col2): @@ -821,18 +824,22 @@ def model_function(col1, col2): {"col1": "xyz", "col2": "zyx", "target": "zzzzzz"}, ] - evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer]) + evaluation = Evaluation( + dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer] + ) # Run the evaluation eval_out = await evaluation.evaluate(model_function) # Check that both scorers are in the results - assert 'DummyScorer' in eval_out - assert 'AnotherDummyScorer' in eval_out + assert "DummyScorer" in eval_out + assert "AnotherDummyScorer" in eval_out # Assertions for the first scorer - expected_results_dummy = {"true_count": 1, "true_fraction": 1.0/3} - assert eval_out['DummyScorer']["match"] == expected_results_dummy, "All concatenations should match the target" + expected_results_dummy = {"true_count": 1, "true_fraction": 1.0 / 3} + assert ( + eval_out["DummyScorer"]["match"] == expected_results_dummy + ), "All concatenations should match the target" # Assertions for the second scorer # Since input1 == col2, and output is col1 + col2, we check if col2 == (col1 + col2)[::-1] @@ 
-842,4 +849,6 @@ def model_function(col1, col2): # Third row: col2 = "zyx", output = "xyzzyx", output[::-1] = "xyzzyx" -> "zyx" == "xyzzyx" is False # So all matches are False expected_results_another_dummy = {"true_count": 0, "true_fraction": 0.0} - assert eval_out['AnotherDummyScorer']["match"] == expected_results_another_dummy, "No matches should be found for AnotherDummyScorer" + assert ( + eval_out["AnotherDummyScorer"]["match"] == expected_results_another_dummy + ), "No matches should be found for AnotherDummyScorer" diff --git a/weave/__init__.py b/weave/__init__.py index 7cf5b49de48d..3b54ba971762 100644 --- a/weave/__init__.py +++ b/weave/__init__.py @@ -15,20 +15,6 @@ from weave.trace.util import Thread as Thread from weave.trace.util import ThreadPoolExecutor as ThreadPoolExecutor -from typing import TYPE_CHECKING - -# Helper for IDEs -if TYPE_CHECKING: - from weave.flow import scorers - -# Lazy import for the scorers module -def __getattr__(name): - if name == "scorers": - from weave.flow import scorers - globals()["scorers"] = scorers - return scorers - raise AttributeError(f"module {__name__} has no attribute {name}") - # Special object informing doc generation tooling which symbols # to document & to associate with this module. __docspec__ = [ diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 9a807d47f986..4c211e5d5469 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -206,30 +206,45 @@ async def predict_and_score( # TODO: Check for input columns parameters in the signature of the scorer - if "model_output" not in score_arg_names and "output" not in score_arg_names: + if ( + "model_output" not in score_arg_names + and "output" not in score_arg_names + ): raise OpCallError( f"Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function." ) if isinstance(example, dict): - # If we get a column_map from the scorer, it means that the scorer expects the input to have different names than the dataset columns - # So we need to remap the input names to the expected names in the scorer - # For instance, if the scorer expects "input" and "target" and we have a dataset with columns "question" and "expected" - # we need to remap {"question": "input", "expected": "target"} - # and pass those to the scorer + # The keys of `score_args` must match the parameter names of the scorer's `score` method. + # If scorer.column_map is set, then user is indicating that the dataset column(s) + # being passed to the scorer have different names to the scorer's parameter names. 
+ # So we need to remap the dataset columns to the expected parameter names in the scorer, + # + # column_map k:v pairs must be structured as `scorer param name : dataset column name` + # + # For instance, if the scorer expects "input" and "ground_truth" and we have a dataset + # with columns "question" and "answer", column_map should be defined as follows: + # {"input": "question", "ground_truth": "answer"} + # # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output if isinstance(scorer, Scorer) and scorer.column_map is not None: - print(f"scorer.column_map: {scorer.column_map}") - print(f"score_arg_names: {score_arg_names}") - print(f"example: {example}") + print( + f"scorer.column_map: {scorer.column_map}" + ) # TODO: delete print statement + print( + f"score_arg_names: {score_arg_names}" + ) # TODO: delete print statement + print(f"example: {example}") # TODO: delete print statement score_args = { arg: example[scorer.column_map.get(arg, arg)] for arg in score_arg_names if scorer.column_map.get(arg, arg) in example } else: - score_args = {k: v for k, v in example.items() if k in score_arg_names} + score_args = { + k: v for k, v in example.items() if k in score_arg_names + } else: if len(score_arg_names) == 2: diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 68c423eea3c5..811880abc449 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -1,51 +1,51 @@ -from weave.flow.scorer.base_scorer import ( +from weave.flow.scorers.base_scorer import ( Scorer, auto_summarize, get_scorer_attributes, ) -from weave.flow.scorer.classification_scorer import ( +from weave.flow.scorers.classification_scorer import ( MultiTaskBinaryClassificationF1, transpose, ) -from weave.flow.scorer.hallucination_scorer import HallucinationScorer -from weave.flow.scorer.json_scorer import JSONScorer -from weave.flow.scorer.llm_scorer import ( +from weave.flow.scorers.hallucination_scorer import HallucinationScorer +from weave.flow.scorers.json_scorer import ValidJSONScorer +from weave.flow.scorers.llm_scorer import ( InstructorLLMScorer, LLMScorer, ) -from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer -from weave.flow.scorer.pydantic_scorer import PydanticScorer -from weave.flow.scorer.ragas_scorer import ( +from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer +from weave.flow.scorers.pydantic_scorer import PydanticScorer +from weave.flow.scorers.ragas_scorer import ( ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer -from weave.flow.scorer.string_scorer import ( +from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorers.string_scorer import ( LevenshteinScorer, RegexScorer, StringMatchScorer, ) -from weave.flow.scorer.summarization_scorer import SummarizationScorer -from weave.flow.scorer.xml_scorer import XMLScorer +from weave.flow.scorers.summarization_scorer import SummarizationScorer +from weave.flow.scorers.xml_scorer import ValidXMLScorer __all__ = [ - "Scorer", "auto_summarize", + "ContextEntityRecallScorer", + "ContextRelevancyScorer", + "EmbeddingSimilarityScorer", "get_scorer_attributes", - "MultiTaskBinaryClassificationF1", - "transpose", - "RegexScorer", - "StringMatchScorer", + "HallucinationScorer", + "InstructorLLMScorer", + "ValidJSONScorer", "LevenshteinScorer", - "JSONScorer", "LLMScorer", - "InstructorLLMScorer", 
- "EmbeddingSimilarityScorer", + "MultiTaskBinaryClassificationF1", "OpenAIModerationScorer", "PydanticScorer", - "HallucinationScorer", - "ContextEntityRecallScorer", - "ContextRelevancyScorer", + "RegexScorer", + "Scorer", + "StringMatchScorer", "SummarizationScorer", - "XMLScorer", + "transpose", + "ValidXMLScorer", ] diff --git a/weave/flow/scorers/classification_scorer.py b/weave/flow/scorers/classification_scorer.py index 622f576e6788..4082b291029e 100644 --- a/weave/flow/scorers/classification_scorer.py +++ b/weave/flow/scorers/classification_scorer.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple import weave -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index a2e4d38fa41d..9534043d2a9f 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -1,41 +1,114 @@ +from typing import List + from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create -from weave.flow.scorer.utils import stringify +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create +from weave.flow.scorers.utils import stringify + +DEFAULT_HALLUCINATION_SYSTEM_PROMPT = """ +Given some from a user and an generated by an AI system, \ +determine if the contains any hallucinations. + +A "hallucination" is defined as information in the that is not supported by \ +the or is not factually or logically consistent with the . + +# Steps +1. Carefully read and understand the input data. +2. Examine the model output. +3. Compare the output to the input data, identifying any inconsistencies or additions. +4. Evaluate the logical connection between input and output. +5. Determine if any information in the output is not supported by or conflicts with the input. + +# Guidelines +- Focus on factual accuracy and logical consistency +- Consider both explicit and implicit information in the input data +- Be aware of potential misinterpretations or over-generalizations in the output +- Identify any information in the output that goes beyond the scope of the input + +# Examples +## Data to analyze + + +The cat is black and white. + + + +The cat has orange stripes. + + +## Analysis: +{ + "think_step_by_step": "The cat is black and white. The cat has orange stripes. \ +The output contradicts the input data because the input specifies black and white, \ +while the output mentions orange. The output also introduces a pattern not present in \ +the input.", + "reasoning": [ + { + "hallucination_type": "Color comparison", + "observation": "Input specifies black and white, output mentions orange" + }, + { + "hallucination_type": "Pattern analysis", + "observation": "Input doesn't mention any pattern, output introduces stripes" + } + ], + "conclusion": "The output contains two hallucinations: it contradicts the color information \ +and introduces a pattern not present in the input." 
+ "is_hallucination": true, +} + +# Notes +- Ensure each step in the reasoning process is clearly articulated +- Be objective and avoid assumptions not supported by the input data +- If the output contains factual information not present in the input, it may be a \ +hallucination even if it doesn't directly contradict the input +""" + +DEFAULT_HALLUCINATION_USER_PROMPT = """ +Analyze the following and and determine if the contains any hallucinations. +# Data to analyze -DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" -DEFAULT_USER_PROMPT = """Given some input_data and a output, determine if the output is a hallucination of the input_data. -## Input data {input_data} - + -## Model output {output} - -## Instructions -Think step by step before answering. Is the output an factually and logically consistent with the input_data? """ +class HallucinationReasoning(BaseModel): + hallucination_type: str = Field( + description="A short name for the type of hallucination." + ) + observation: str = Field( + description="An observation from the and that supports the hallucination." + ) + + class HallucinationResponse(BaseModel): chain_of_thought: str = Field( - description="Think step by step about whether the output is a hallucination of the dataset_row" + description="Think step by step about whether the contains hallucinations \ +based on the ." + ) + hallucination_reasonings: List[HallucinationReasoning] = Field( + description="A list of reasoning steps that lead to the conclusion about whether or not\ +the contains hallucinations." ) + conclusion: str = Field(description="The conclusion of the analysis.") is_hallucination: bool = Field( - description="Whether the model output is a hallucination of the dataset row" + description="Whether the contains hallucinations based on the ." 
) class HallucinationScorer(InstructorLLMScorer): """Scorer that checks if the model output is a hallucination of the dataset row.""" - system_prompt: str = DEFAULT_SYSTEM_PROMPT - user_prompt: str = DEFAULT_USER_PROMPT + system_prompt: str = DEFAULT_HALLUCINATION_SYSTEM_PROMPT + user_prompt: str = DEFAULT_HALLUCINATION_USER_PROMPT model_id: str = OPENAI_DEFAULT_MODEL temperature: float = 0.7 max_tokens: int = 4096 diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 598b7a4f0023..f40f2d66fe67 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -1,10 +1,10 @@ import json from typing import Any -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer -class JSONScorer(Scorer): +class ValidJSONScorer(Scorer): """Score a JSON string.""" def score(self, output: Any, **kwargs: Any) -> dict: # type: ignore diff --git a/weave/flow/scorers/llm_scorer.py b/weave/flow/scorers/llm_scorer.py index 7bcf9cf9af67..d319670ae772 100644 --- a/weave/flow/scorers/llm_scorer.py +++ b/weave/flow/scorers/llm_scorer.py @@ -2,8 +2,8 @@ from pydantic import Field, field_validator -from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENTS +from weave.flow.scorers.base_scorer import Scorer +from weave.flow.scorers.llm_utils import _LLM_CLIENTS_NAMES, instructor_client class LLMScorer(Scorer): @@ -16,9 +16,10 @@ class LLMScorer(Scorer): @field_validator("client") def validate_client(cls, v): # type: ignore - if not isinstance(v, _LLM_CLIENTS): + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" + f"Invalid client type. Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" ) return v @@ -39,8 +40,9 @@ class InstructorLLMScorer(Scorer): @field_validator("client") def validate_client(cls, v): # type: ignore - if not isinstance(v, _LLM_CLIENTS): + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" + f"Invalid client type. 
Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" ) return instructor_client(v) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index e6bce53f8dbc..5d480f080b86 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -1,4 +1,6 @@ -from typing import TYPE_CHECKING, List, Optional, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, List, Optional, Union from weave.trace.autopatch import autopatch @@ -26,6 +28,14 @@ else: _LLM_CLIENTS = object +_LLM_CLIENTS_NAMES = ( + "OpenAI", + "AsyncOpenAI", + "Anthropic", + "AsyncAnthropic", + "Mistral", +) + def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ignore try: @@ -47,12 +57,12 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign raise ValueError(f"Unsupported client type: {client_type}") -def create(client: _LLM_CLIENTS, *args, **kwargs): # type: ignore +def create(client: instructor.client, *args, **kwargs) -> Any: # type: ignore return client.chat.completions.create(*args, **kwargs) def embed( - client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs + client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs: Any ) -> List[List[float]]: # type: ignore client_type = type(client).__name__.lower() if "openai" in client_type: @@ -71,12 +81,15 @@ def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore try: if provider == "openai": from openai import OpenAI + return OpenAI elif provider == "anthropic": import anthropic + return anthropic.Anthropic elif provider == "mistral": from mistralai import Mistral + return Mistral except ImportError: return None diff --git a/weave/flow/scorers/moderation_scorer.py b/weave/flow/scorers/moderation_scorer.py index 51b92b9f85b7..8a8e4eee9dae 100644 --- a/weave/flow/scorers/moderation_scorer.py +++ b/weave/flow/scorers/moderation_scorer.py @@ -3,7 +3,7 @@ from pydantic import field_validator import weave -from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorers.llm_scorer import LLMScorer class OpenAIModerationScorer(LLMScorer): diff --git a/weave/flow/scorers/pydantic_scorer.py b/weave/flow/scorers/pydantic_scorer.py index 90fdffd63788..5566326774d7 100644 --- a/weave/flow/scorers/pydantic_scorer.py +++ b/weave/flow/scorers/pydantic_scorer.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer class PydanticScorer(Scorer): diff --git a/weave/flow/scorers/ragas_scorer.py b/weave/flow/scorers/ragas_scorer.py index 6180697f59f4..8b3493c35422 100644 --- a/weave/flow/scorers/ragas_scorer.py +++ b/weave/flow/scorers/ragas_scorer.py @@ -6,8 +6,8 @@ from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import create +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import create class EntityExtractionResponse(BaseModel): diff --git a/weave/flow/scorers/similarity_score.py b/weave/flow/scorers/similarity_score.py index 722c16e98c2a..82d8760b7472 100644 --- a/weave/flow/scorers/similarity_score.py +++ b/weave/flow/scorers/similarity_score.py @@ -4,8 +4,8 @@ from pydantic import Field import weave -from weave.flow.scorer.llm_scorer import LLMScorer -from weave.flow.scorer.llm_utils import embed +from weave.flow.scorers.llm_scorer import 
LLMScorer +from weave.flow.scorers.llm_utils import embed class EmbeddingSimilarityScorer(LLMScorer): diff --git a/weave/flow/scorers/string_scorer.py b/weave/flow/scorers/string_scorer.py index a34fa5619603..4dc58922668e 100644 --- a/weave/flow/scorers/string_scorer.py +++ b/weave/flow/scorers/string_scorer.py @@ -4,7 +4,7 @@ from pydantic import Field, model_validator import weave -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer class StringMatchScorer(Scorer): @@ -53,7 +53,7 @@ def score( text_to_search = "".join(text_to_search.split()) match_found = any( - pattern.search(text_to_search) for pattern in compiled_patterns + pattern.search(str(text_to_search)) for pattern in compiled_patterns ) return {"string_match": match_found} @@ -70,6 +70,7 @@ def check_levenshtein(self): # type: ignore from Levenshtein import distance self.distance = distance + return self except ImportError: raise ValueError( "Levenshtein package not found. Please install it with `pip install Levenshtein`" diff --git a/weave/flow/scorers/summarization_scorer.py b/weave/flow/scorers/summarization_scorer.py index ee43a7f48b10..effc4a990a27 100644 --- a/weave/flow/scorers/summarization_scorer.py +++ b/weave/flow/scorers/summarization_scorer.py @@ -1,40 +1,121 @@ -from textwrap import dedent -from typing import Any, List +import asyncio +from typing import Any, List, Literal from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import create +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import create + +DEFAULT_EXTRACTION_SYSTEM_PROMPT = """ +Given a , extract all the unique entities from the text without repetition. +""" + +DEFAULT_EXTRACTION_USER_PROMPT = """ +Extract all the unique entities from the following without repetition: + +{text} + +""" + +DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT = """ +Given an and a , evaluate the quality of the . + +# Considerations +- Does the contain the key information in the ? +- Is the concise and informative? +- Is the grammatically correct? +- Does the contain information or assertions that are not present in the ? + +# Scoring Rubric +`excellent`: The contains all of the key information and entities in the , \ +is concise and information dense, is grammatically correct and doesn't contain any \ +information or assertions that are not present in the . + +`ok`: The contains most of the key information and entities in the , \ +is somewhat concise and informative, is mostly grammatically correct and doesn't contain any \ +information or assertions that are not present in the . + +`poor`: The misses most or all of the key information in the , \ +or is very verbose or vague, or is not concise or informative, or has many grammatical errors, \ +or contains information or assertions that are not present in the . +""" + +DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT = """ +Evaluate the quality of the following given the : + + +{input} + + + +{summary} + +""" class EntityExtractionResponse(BaseModel): entities: List[str] = Field( - description="A list of unique entities extracted from the text" + description="A list of unique entities extracted from the text." 
) -class SummarizationScorer(InstructorLLMScorer): - """Estimates summary quality by computing the recall of entities in the model output compared to the input.""" +summarization_quality_options = Literal["poor", "ok", "excellent"] +summarization_quality_mapping = {"poor": 0.0, "ok": 0.5, "excellent": 1.0} + + +class SummarizationEvaluationResponse(BaseModel): + think_step_by_step: str = Field( + description="Think step-by-step about the quality of the before deciding \ +on the summarization_score." + ) + summarization_evaluation: summarization_quality_options = Field( + description="The evaluation of the summary" + ) - extraction_prompt: str = dedent(""" - Extract unique entities from the following text without repetition. - Text: {text} - Entities: - """) +class SummarizationScorer(InstructorLLMScorer): + """ + Estimates summary quality by both: + - Calculating the entity density of the summary, similar to how entity density is + used in the Chain of Density paper, https://arxiv.org/abs/2309.04269. + - Using an LLM to evaluate the summary quality. + + column_map: A `scorer parameter name : dataset column name` mapping. + + This summarization scorer expects the input column in the dataset to be named "input" \ + and the output column in the dataset to be named "summary". + You can specify a different mapping in the `column_map` argument. For example, \ + if your dataset contains columns "news_article" and "news_summary" then you can \ + specify `column_map={"input": "news_article", "output": "news_summary"}`. + + Parameters to the `score` function + - input: The text that was to be summarized + - output: the summary of the text + """ + extraction_system_prompt: str = DEFAULT_EXTRACTION_SYSTEM_PROMPT + extraction_prompt: str = DEFAULT_EXTRACTION_USER_PROMPT + summarization_evaluation_system_prompt: str = ( + DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT + ) + summarization_evaluation_prompt: str = DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT + fast_model_id: str = "gpt-4o-mini" + entity_density_threshold: float = 0.08 temperature: float = 0.7 max_tokens: int = 1024 + @weave.op def extract_entities(self, text: str) -> List[str]: - # Use LLM to extract entities - prompt = self.extraction_prompt.format(text=text) + """Use an LLM to extract entities""" response = create( self.client, - messages=[{"role": "user", "content": prompt}], + messages=[ + {"role": "system", "content": self.extraction_system_prompt}, + {"role": "user", "content": self.extraction_prompt.format(text=text)}, + ], response_model=EntityExtractionResponse, - model=self.model_id, + model=self.fast_model_id, temperature=self.temperature, max_tokens=self.max_tokens, ) @@ -42,13 +123,57 @@ def extract_entities(self, text: str) -> List[str]: return entities @weave.op - def score(self, input: str, output: str, **kwargs: Any) -> dict: - # Extract entities - output_entities = self.extract_entities(output) - input_entities = self.extract_entities(input) - # Calculate recall - if not output_entities: - return {"recall": 0.0} - matches = set(output_entities) & set(input_entities) - recall = len(matches) / len(input_entities) - return {"recall": recall} + def evaluate_summary( + self, input: str, summary: str + ) -> SummarizationEvaluationResponse: + """Evaluate the quality of a summary using an LLM""" + return create( + self.client, + messages=[ + { + "role": "system", + "content": self.summarization_evaluation_system_prompt, + }, + { + "role": "user", + "content": self.summarization_evaluation_prompt.format( + input=input, 
summary=summary + ), + }, + ], + response_model=SummarizationEvaluationResponse, + model=self.model_id, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + + def simple_word_tokenize(self, text: str) -> List[str]: + """Simple word tokenization""" + return text.split() + + @weave.op + async def score(self, input: str, output: str, **kwargs: Any) -> dict: + """ + - input: the piece of text that was to be summarized + - output: the generated summary of the input + """ + extract_task = asyncio.to_thread(self.extract_entities, text=output) + evaluate_task = asyncio.to_thread( + self.evaluate_summary, input=input, summary=output + ) + summary_entities, llm_eval = await asyncio.gather(extract_task, evaluate_task) + + # LLM evaluation + result = {} + result["summarization_eval_score"] = summarization_quality_mapping.get( + llm_eval.summarization_evaluation.lower() + ) + result["llm_eval_reasoning"] = llm_eval.think_step_by_step + + # Entity density evaluation + summary_words = self.simple_word_tokenize(output) + entity_density = len(summary_entities) / len(summary_words) + result["is_entity_dense"] = entity_density >= self.entity_density_threshold + result["entity_density"] = entity_density + + return result diff --git a/weave/flow/scorers/xml_scorer.py b/weave/flow/scorers/xml_scorer.py index 7bd42516e699..2ea8384477f5 100644 --- a/weave/flow/scorers/xml_scorer.py +++ b/weave/flow/scorers/xml_scorer.py @@ -1,10 +1,10 @@ import xml.etree.ElementTree as ET from typing import Union -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer -class XMLScorer(Scorer): +class ValidXMLScorer(Scorer): """Score an XML string.""" def score(self, output: Union[str, dict]) -> dict: # type: ignore diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py new file mode 100644 index 000000000000..a1db6897f349 --- /dev/null +++ b/weave/scorers/__init__.py @@ -0,0 +1 @@ +from weave.flow.scorers import * \ No newline at end of file From 2f479f5e56e4be7ea9fd339c6e0c104639d2ab45 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 23:05:09 +0100 Subject: [PATCH 050/150] feat(weave): re-add weave/flow/scorer.py for backward compatibiliy --- weave/flow/scorer.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 weave/flow/scorer.py diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py new file mode 100644 index 000000000000..dfaf8132af4f --- /dev/null +++ b/weave/flow/scorer.py @@ -0,0 +1,3 @@ +# Keeping this file for now to avoid breaking changes. +# In future, users should import all scoring functionality from weave.scorers +from weave.scorers import * \ No newline at end of file From 40445978136799b8d9e1eac10e4a18c0c97cf323 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 23:13:50 +0100 Subject: [PATCH 051/150] lint --- weave/flow/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py index dfaf8132af4f..2f2e999ca374 100644 --- a/weave/flow/scorer.py +++ b/weave/flow/scorer.py @@ -1,3 +1,3 @@ # Keeping this file for now to avoid breaking changes. 
# In future, users should import all scoring functionality from weave.scorers -from weave.scorers import * \ No newline at end of file +from weave.scorers import * From fdf55ea56f304dc8cac00de1444fbc638e1d94dc Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 23:14:56 +0100 Subject: [PATCH 052/150] more lint --- weave/scorers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py index a1db6897f349..25756f08a082 100644 --- a/weave/scorers/__init__.py +++ b/weave/scorers/__init__.py @@ -1 +1 @@ -from weave.flow.scorers import * \ No newline at end of file +from weave.flow.scorers import * From 87a25c33059f756f31b14901aabfb07337751423 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 12:02:55 +0200 Subject: [PATCH 053/150] add gemini support --- weave/flow/scorers/llm_utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index 5d480f080b86..8e81c3a3537b 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -23,8 +23,9 @@ from anthropic import Anthropic, AsyncAnthropic from mistralai import Mistral from openai import AsyncOpenAI, OpenAI + from google.generativeai import GenerativeModel - _LLM_CLIENTS = Union[OpenAI, AsyncOpenAI, Anthropic, AsyncAnthropic, Mistral] + _LLM_CLIENTS = Union[OpenAI, AsyncOpenAI, Anthropic, AsyncAnthropic, Mistral, GenerativeModel] else: _LLM_CLIENTS = object @@ -34,6 +35,7 @@ "Anthropic", "AsyncAnthropic", "Mistral", + "GenerativeModel", ) @@ -53,6 +55,11 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign return instructor.from_anthropic(client) elif "mistral" in client_type: return instructor.from_mistral(client) + elif "generativemodel" in client_type: + return instructor.from_gemini( + client=client, + mode=instructor.Mode.GEMINI_JSON, + ) else: raise ValueError(f"Unsupported client type: {client_type}") @@ -71,7 +78,6 @@ def embed( elif "mistral" in client_type: response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) # type: ignore return [embedding.embedding for embedding in response.data] - else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") @@ -91,5 +97,9 @@ def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore from mistralai import Mistral return Mistral + elif provider == "gemini": + from google.generativeai import GenerativeModel + + return GenerativeModel except ImportError: return None From a3c6617059ffe0d02091ee417176a69f8702157e Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 12:05:17 +0200 Subject: [PATCH 054/150] remove regex --- tests/scorers/test_string_scorer.py | 20 ------------- weave/flow/scorers/__init__.py | 2 -- weave/flow/scorers/string_scorer.py | 44 ----------------------------- 3 files changed, 66 deletions(-) diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index dfa05daf7e89..c2b0846a7c5a 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -19,26 +19,6 @@ def test_string_match_scorer_false(): result = scorer.score(output, target) assert result["string_in_input"] is False - -# def test_regex_scorer(): -# scorer = RegexScorer(patterns="engineer") -# output = "I am an engineer" -# result = scorer.score(output) -# assert result["string_match"] is True - -# def 
test_regex_scorer_case_insensitive(): -# scorer = RegexScorer(patterns="Engineer", ignore_case=True) -# output = "I am an engineer" -# result = scorer.score(output) -# assert result["string_match"] is True - -# def test_regex_scorer_no_match(): -# scorer = RegexScorer(patterns="doctor") -# output = "I am an engineer" -# result = scorer.score(output) -# assert result["string_match"] is False - - def test_levenshtein_scorer(): scorer = LevenshteinScorer() output = "Hello" diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 811880abc449..a01a652f0af5 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -22,7 +22,6 @@ from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer from weave.flow.scorers.string_scorer import ( LevenshteinScorer, - RegexScorer, StringMatchScorer, ) from weave.flow.scorers.summarization_scorer import SummarizationScorer @@ -42,7 +41,6 @@ "MultiTaskBinaryClassificationF1", "OpenAIModerationScorer", "PydanticScorer", - "RegexScorer", "Scorer", "StringMatchScorer", "SummarizationScorer", diff --git a/weave/flow/scorers/string_scorer.py b/weave/flow/scorers/string_scorer.py index 4dc58922668e..7525dd92e178 100644 --- a/weave/flow/scorers/string_scorer.py +++ b/weave/flow/scorers/string_scorer.py @@ -15,50 +15,6 @@ def score(self, output: str, target: str) -> dict: # type: ignore return {"string_in_input": string_in_input} -class RegexScorer(Scorer): # type: ignore - patterns: Union[str, list[str]] = Field( - default_factory=list, description="The patterns or keywords to match" - ) - ignore_case: bool = True - ignore_whitespace: bool = False - match_full_string: bool = False # Match the entire string if True - target_column: str = Field(default="target", description="The class name to match") - - @weave.op - def score( - self, output: Union[dict, str], target: Union[str, list[str], None] = None - ) -> dict: - if isinstance(output, str): - output = {"output": output} - - # Use target patterns if provided - patterns = target if target else self.patterns - if isinstance(patterns, str): - patterns = [patterns] - - flags = re.IGNORECASE if self.ignore_case else 0 - compiled_patterns = [] - for pattern in patterns: - if not self.use_regex: - pattern = re.escape(pattern) - if self.ignore_whitespace: - pattern = "".join(pattern.split()) - if self.match_full_string: - pattern = f"^{pattern}$" - compiled_patterns.append(re.compile(pattern, flags=flags)) - - text_to_search = output.get("output") if output else "" - if self.ignore_whitespace: - if text_to_search: - text_to_search = "".join(text_to_search.split()) - - match_found = any( - pattern.search(str(text_to_search)) for pattern in compiled_patterns - ) - - return {"string_match": match_found} - - class LevenshteinScorer(Scorer): distance: Callable[[str, str], int] = Field( default=None, description="The Levenshtein distance function" From f99f190ecd1542a14f622f17f60fcf2ff0f14b75 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Mon, 14 Oct 2024 12:01:07 +0100 Subject: [PATCH 055/150] feat(docs): start adding Scorers docs, remove fast_model_id --- docs/docs/guides/evaluation/scorers.md | 44 +++++++++++++ docs/sidebars.ts | 11 +++- weave/flow/scorers/hallucination_scorer.py | 27 +++++++- weave/flow/scorers/json_scorer.py | 2 +- weave/flow/scorers/summarization_scorer.py | 74 ++++++++++++++-------- 5 files changed, 130 insertions(+), 28 deletions(-) create mode 100644 docs/docs/guides/evaluation/scorers.md diff --git 
a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md new file mode 100644 index 000000000000..98d06dbcc2a1 --- /dev/null +++ b/docs/docs/guides/evaluation/scorers.md @@ -0,0 +1,44 @@ +# Evaluators + +TODO: +- Why use Scorers (re-use?) +- Explain columm map + column_map: A `scorer parameter name : dataset column name` mapping. + + This summarization scorer expects the input column in the dataset to be named "input" \ + and the output column in the dataset to be named "summary". + You can specify a different mapping in the `column_map` argument. For example, \ + if your dataset contains columns "news_article" and "news_summary" then you can \ + specify `column_map={"input": "news_article", "output": "news_summary"}`. + +## LLM-powered Evaluators + +### Hallucination Scorer + +A Scorer that uses an LLM to determine if the model output contains any hallucinations +based on the input data. + +Note: + - The meaning of "hallucination" can vary from person to person, you will likely want to + customize the `system_prompt` and `user_prompt` to fit your specific needs. + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + +Attributes: + system_prompt (str): The prompt describing the task, defines what a "hallucination" is. + user_prompt (str): The string template to pass the input and output data. The template must + contain placeholders for both `input_data` and `output`. + +### Summarization Scorer + + +## Programmatic Evaluators + + +### ValidJSONScorer + +Validate whether a string is valid JSON. + +``` +from weave.scorers import ValidJSONScorer +``` \ No newline at end of file diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 8958265ada11..0f6510e066d5 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -54,9 +54,18 @@ const sidebars: SidebarsConfig = { "guides/tracking/objects", ], }, + { + type: "category", + collapsible: true, + collapsed: false, + label: "Evaluation", + link: { type: "doc", id: "guides/core-types/evaluations"}, + items: [ + "guides/evaluation/scorers", + ], + }, "guides/core-types/models", "guides/core-types/datasets", - "guides/core-types/evaluations", "guides/tracking/feedback", "guides/tracking/costs", { diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index 9534043d2a9f..c55f85d08b92 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -105,7 +105,32 @@ class HallucinationResponse(BaseModel): class HallucinationScorer(InstructorLLMScorer): - """Scorer that checks if the model output is a hallucination of the dataset row.""" + """ + A Scorer that uses an LLM to determine if the model output contains any hallucinations + based on the input data. + + Note: + - The meaning of "hallucination" can vary from person to person, you will likely want to + customize the `system_prompt` and `user_prompt` to fit your specific needs. + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects the input column from the dataset to be named "context". It will use + this data as the ground-truth to check hallucinations against. 
If your dataset column has a + different name, you can specify a different mapping using the `column_map` argument in the init + of HallucinationScorer by passing `column_map={"context": "context"}`. + + Attributes: + system_prompt (str): The prompt describing the task, defines what a "hallucination" is. + user_prompt (str): The string template to pass the input and output data. The template must + contain placeholders for both `{input_data}` and `{output}`. + model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + score(output: str, context: str) -> HallucinationResponse: + Analyzes the output to detect hallucinations based on the given context. + """ system_prompt: str = DEFAULT_HALLUCINATION_SYSTEM_PROMPT user_prompt: str = DEFAULT_HALLUCINATION_USER_PROMPT diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index f40f2d66fe67..5c4a6172324f 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -5,7 +5,7 @@ class ValidJSONScorer(Scorer): - """Score a JSON string.""" + """Validate whether a string is valid JSON.""" def score(self, output: Any, **kwargs: Any) -> dict: # type: ignore try: diff --git a/weave/flow/scorers/summarization_scorer.py b/weave/flow/scorers/summarization_scorer.py index effc4a990a27..eae5b3cfa6fe 100644 --- a/weave/flow/scorers/summarization_scorer.py +++ b/weave/flow/scorers/summarization_scorer.py @@ -5,7 +5,7 @@ import weave from weave.flow.scorers.llm_scorer import InstructorLLMScorer -from weave.flow.scorers.llm_utils import create +from weave.flow.scorers.llm_utils import create, OPENAI_DEFAULT_MODEL DEFAULT_EXTRACTION_SYSTEM_PROMPT = """ Given a , extract all the unique entities from the text without repetition. @@ -76,32 +76,58 @@ class SummarizationEvaluationResponse(BaseModel): class SummarizationScorer(InstructorLLMScorer): """ - Estimates summary quality by both: - - Calculating the entity density of the summary, similar to how entity density is - used in the Chain of Density paper, https://arxiv.org/abs/2309.04269. - - Using an LLM to evaluate the summary quality. - - column_map: A `scorer parameter name : dataset column name` mapping. - - This summarization scorer expects the input column in the dataset to be named "input" \ - and the output column in the dataset to be named "summary". - You can specify a different mapping in the `column_map` argument. For example, \ - if your dataset contains columns "news_article" and "news_summary" then you can \ - specify `column_map={"input": "news_article", "output": "news_summary"}`. - - Parameters to the `score` function - - input: The text that was to be summarized - - output: the summary of the text + A Scorer that evaluates the quality of summaries in two ways: + - using an LLM to calculate the entity density of the summary, similar to how entity density is + used in the Chain of Density paper, https://arxiv.org/abs/2309.04269. This is a rough measure for + how information-dense the summary is. + - using another LLM evaluator to grade the summary quality from `poor`, `ok`, to `excellent`. These + grades are then mapped to numerical scores, {`poor`: 0.0, `ok`: 0.5, `excellent`: 1.0}, in order to + be able to calculate an average score across a dataset of summaries if needed. 
+ + To customise the LLM evaluator you can customise the `summarization_evaluation_system_prompt`and + `summarization_evaluation_prompt` attributes to be tailored your specific definition of what a good summary + should look like. + + Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects the input column from the dataset to be named "input". If your dataset + column has a different name, you can specify a different mapping using the `column_map` argument in the + init of SummarizationScorer by passing `column_map={"input": "news_article"}`. + + Attributes: + extraction_system_prompt (str): System prompt to extract the distinct entities in the input. Customising + this can help ensure that the LLM identifies the `entities` that you care about. + extraction_prompt (str): Prompt template for entity extraction; must contain a `{text}` placeholder. + summarization_evaluation_system_prompt (str): System prompt defining how to evaluate the quality of a summary. + Asks an LLM to grade the summary from `poor`, `ok`, to `excellent` and provide a rationale for the grade. + summarization_evaluation_prompt (str): Prompt template for summarization evaluation instruction; must contain + `{input}` and `{summary}` placeholders. + entity_density_threshold (float): Threshold for determining if a summary is sufficiently entity-dense. + model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + extract_entities(text: str) -> List[str]: + Uses an LLM to extract unique entities from the text. + + evaluate_summary(input: str, summary: str) -> SummarizationEvaluationResponse: + Evaluates the quality of a summary using an LLM. + + score(input: str, output: str, **kwargs: Any) -> dict: + Calculates summarization score and entity density score for the given input and output. 
""" + extraction_system_prompt: str = DEFAULT_EXTRACTION_SYSTEM_PROMPT extraction_prompt: str = DEFAULT_EXTRACTION_USER_PROMPT summarization_evaluation_system_prompt: str = ( DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT ) summarization_evaluation_prompt: str = DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT - fast_model_id: str = "gpt-4o-mini" entity_density_threshold: float = 0.08 + model_id: str = OPENAI_DEFAULT_MODEL temperature: float = 0.7 max_tokens: int = 1024 @@ -115,7 +141,7 @@ def extract_entities(self, text: str) -> List[str]: {"role": "user", "content": self.extraction_prompt.format(text=text)}, ], response_model=EntityExtractionResponse, - model=self.fast_model_id, + model=self.model_id, temperature=self.temperature, max_tokens=self.max_tokens, ) @@ -153,13 +179,11 @@ def simple_word_tokenize(self, text: str) -> List[str]: @weave.op async def score(self, input: str, output: str, **kwargs: Any) -> dict: - """ - - input: the piece of text that was to be summarized - - output: the generated summary of the input - """ - extract_task = asyncio.to_thread(self.extract_entities, text=output) + extract_task = asyncio.to_thread( + self.extract_entities, text=str(output) + ) evaluate_task = asyncio.to_thread( - self.evaluate_summary, input=input, summary=output + self.evaluate_summary, input=str(input), summary=str(output) ) summary_entities, llm_eval = await asyncio.gather(extract_task, evaluate_task) From bd62f0d49806dee62fdb17ec8a29a95cb2f46aea Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Mon, 14 Oct 2024 12:36:57 +0100 Subject: [PATCH 056/150] update outputs of hallucination scorers, add docstrings, add docs --- docs/docs/guides/evaluation/scorers.md | 66 ++++++++++++++++++++-- tests/scorers/test_hallucination_scorer.py | 6 +- weave/flow/scorers/__init__.py | 4 +- weave/flow/scorers/hallucination_scorer.py | 15 ++++- 4 files changed, 77 insertions(+), 14 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 98d06dbcc2a1..99507fd7a525 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -1,8 +1,10 @@ -# Evaluators +# Evaluation Metrics + +We call evaluation metrics "Scorers", TODO: - Why use Scorers (re-use?) -- Explain columm map +- Explain columm_map column_map: A `scorer parameter name : dataset column name` mapping. This summarization scorer expects the input column in the dataset to be named "input" \ @@ -13,7 +15,7 @@ TODO: ## LLM-powered Evaluators -### Hallucination Scorer +### HallucinationFreeScorer A Scorer that uses an LLM to determine if the model output contains any hallucinations based on the input data. @@ -23,17 +25,69 @@ Note: customize the `system_prompt` and `user_prompt` to fit your specific needs. - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects the input column from the dataset to be named "context". It will use + this data as the ground-truth to check hallucinations against. If your dataset column has a + different name, you can specify a different mapping using the `column_map` argument in the init + of HallucinationFreeScorer by passing `column_map={"context": "context"}`. Attributes: system_prompt (str): The prompt describing the task, defines what a "hallucination" is. user_prompt (str): The string template to pass the input and output data. 
The template must - contain placeholders for both `input_data` and `output`. + contain placeholders for both `{input_data}` and `{output}`. + model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. -### Summarization Scorer +Methods: + score(output: str, context: str) -> HallucinationResponse: + Analyzes the output to detect hallucinations based on the given context. +### SummarizationScorer -## Programmatic Evaluators +A Scorer that evaluates the quality of summaries in two ways: + - using an LLM to calculate the entity density of the summary, similar to how entity density is + used in the Chain of Density paper, https://arxiv.org/abs/2309.04269. This is a rough measure for + how information-dense the summary is. + - using another LLM evaluator to grade the summary quality from `poor`, `ok`, to `excellent`. These + grades are then mapped to numerical scores, {`poor`: 0.0, `ok`: 0.5, `excellent`: 1.0}, in order to + be able to calculate an average score across a dataset of summaries if needed. + +To customise the LLM evaluator you can customise the `summarization_evaluation_system_prompt`and +`summarization_evaluation_prompt` attributes to be tailored your specific definition of what a good summary +should look like. + +Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects the input column from the dataset to be named "input". If your dataset + column has a different name, you can specify a different mapping using the `column_map` argument in the + init of SummarizationScorer by passing `column_map={"input": "news_article"}`. +Attributes: + extraction_system_prompt (str): System prompt to extract the distinct entities in the input. Customising + this can help ensure that the LLM identifies the `entities` that you care about. + extraction_prompt (str): Prompt template for entity extraction; must contain a `{text}` placeholder. + summarization_evaluation_system_prompt (str): System prompt defining how to evaluate the quality of a summary. + Asks an LLM to grade the summary from `poor`, `ok`, to `excellent` and provide a rationale for the grade. + summarization_evaluation_prompt (str): Prompt template for summarization evaluation instruction; must contain + `{input}` and `{summary}` placeholders. + entity_density_threshold (float): Threshold for determining if a summary is sufficiently entity-dense. + model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + +Methods: + extract_entities(text: str) -> List[str]: + Uses an LLM to extract unique entities from the text. + + evaluate_summary(input: str, summary: str) -> SummarizationEvaluationResponse: + Evaluates the quality of a summary using an LLM. + + score(input: str, output: str, **kwargs: Any) -> dict: + Calculates summarization score and entity density score for the given input and output. 
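+
+For example, a minimal sketch of using this scorer in an evaluation might look like
+the following (the client, model id, project name and dataset below are illustrative
+placeholders, not requirements):
+
+```python
+import asyncio
+
+import weave
+from openai import OpenAI
+from weave.scorers import SummarizationScorer
+
+weave.init("summarization-eval")  # hypothetical project name
+
+summarization_scorer = SummarizationScorer(
+    client=OpenAI(),  # any supported LLM client; assumes OPENAI_API_KEY is set
+    model_id="gpt-4o",
+    column_map={"input": "news_article"},  # map the dataset column to the scorer's `input`
+)
+
+dataset = [{"news_article": "Some text that should be summarized ..."}]
+
+@weave.op
+def model(news_article: str) -> str:
+    return "A placeholder summary of the article."
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])
+asyncio.run(evaluation.evaluate(model))
+```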
+ + +## Programmatic Evaluators ### ValidJSONScorer diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 6cae7d157d85..237d7f5ddb2b 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -6,7 +6,7 @@ HallucinationResponse, ) from weave.scorers import ( - HallucinationScorer, + HallucinationFreeScorer, ) @@ -31,7 +31,7 @@ def _mock_create(*args, **kwargs): @pytest.fixture def hallucination_scorer(mock_create): - return HallucinationScorer( + return HallucinationFreeScorer( client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, @@ -40,7 +40,7 @@ def hallucination_scorer(mock_create): def test_hallucination_scorer_initialization(hallucination_scorer): - assert isinstance(hallucination_scorer, HallucinationScorer) + assert isinstance(hallucination_scorer, HallucinationFreeScorer) assert hallucination_scorer.model_id == "gpt-4o" assert hallucination_scorer.temperature == 0.7 assert hallucination_scorer.max_tokens == 4096 diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index a01a652f0af5..86eb22f52dd3 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -7,7 +7,7 @@ MultiTaskBinaryClassificationF1, transpose, ) -from weave.flow.scorers.hallucination_scorer import HallucinationScorer +from weave.flow.scorers.hallucination_scorer import HallucinationFreeScorer from weave.flow.scorers.json_scorer import ValidJSONScorer from weave.flow.scorers.llm_scorer import ( InstructorLLMScorer, @@ -33,7 +33,7 @@ "ContextRelevancyScorer", "EmbeddingSimilarityScorer", "get_scorer_attributes", - "HallucinationScorer", + "HallucinationFreeScorer", "InstructorLLMScorer", "ValidJSONScorer", "LevenshteinScorer", diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index c55f85d08b92..e7de0b174748 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -1,3 +1,4 @@ +import json from typing import List from pydantic import BaseModel, Field @@ -104,7 +105,7 @@ class HallucinationResponse(BaseModel): ) -class HallucinationScorer(InstructorLLMScorer): +class HallucinationFreeScorer(InstructorLLMScorer): """ A Scorer that uses an LLM to determine if the model output contains any hallucinations based on the input data. @@ -117,7 +118,7 @@ class HallucinationScorer(InstructorLLMScorer): - The `score` method expects the input column from the dataset to be named "context". It will use this data as the ground-truth to check hallucinations against. If your dataset column has a different name, you can specify a different mapping using the `column_map` argument in the init - of HallucinationScorer by passing `column_map={"context": "context"}`. + of HallucinationFreeScorer by passing `column_map={"context": "context"}`. Attributes: system_prompt (str): The prompt describing the task, defines what a "hallucination" is. 
@@ -157,4 +158,12 @@ def score(self, output: str, context: str) -> HallucinationResponse: temperature=self.temperature, max_tokens=self.max_tokens, ) - return response + hallucination_reasonings = [ + r.model_dump_json() for r in response.hallucination_reasonings + ] + return { + "hallucination_free": not response.is_hallucination, + "conclusion": response.conclusion, + "reasonings": json.dumps(hallucination_reasonings), + "chain_of_thought": response.chain_of_thought, + } From 7c129f39185d225d2be837c326e430a0723017be Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 12:08:39 +0200 Subject: [PATCH 057/150] clear gemini message --- weave/flow/scorers/llm_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index 8e81c3a3537b..a3938dad482e 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -6,8 +6,6 @@ autopatch() # ensure both weave patching and instructor patching are applied -# TODO: Gemini - OPENAI_DEFAULT_MODEL = "gpt-4o" OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" From ef4e3ca8793f932f921be60a280de1b92122c778 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 13:54:36 +0200 Subject: [PATCH 058/150] add full eval test --- tests/scorers/test_summarization_scorer.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 60b026b30806..2dc6bb6e820e 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -1,4 +1,5 @@ import pytest +import weave from openai import OpenAI from weave.flow.scorers.summarization_scorer import ( @@ -79,3 +80,32 @@ def test_summarization_scorer_extract_entities(summarization_scorer): assert len(entities) == 2 assert "entity1" in entities assert "entity2" in entities + +@pytest.mark.asyncio +async def test_evaluate_summary_scorer(summarization_scorer): + dataset = [{ + "input": "This is the original text.", + "output": "This is the summary.", + }, + { + "input": "This is another original text.", + "output": "This is another summary.", + }] + evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) + + @weave.op + def model(input: str): + return "This is the summary." 
+ + result = await evaluation.evaluate(model) + assert isinstance(result, dict) + assert 'SummarizationScorer' in result + assert 'entity_density' in result['SummarizationScorer'] + assert 'is_entity_dense' in result['SummarizationScorer'] + assert 'summarization_eval_score' in result['SummarizationScorer'] + assert 'model_latency' in result + + assert result['SummarizationScorer']['entity_density']['mean'] == pytest.approx(0.5) + assert result['SummarizationScorer']['is_entity_dense']['true_count'] == 2 + assert result['SummarizationScorer']['is_entity_dense']['true_fraction'] == 1.0 + assert result['SummarizationScorer']['summarization_eval_score']['mean'] == 1.0 \ No newline at end of file From bd45db66eaa1fe7ad6686895951b206d57532f29 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 13:54:59 +0200 Subject: [PATCH 059/150] deal with gemini kwargs --- weave/flow/scorers/llm_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index a3938dad482e..13ee777dcd9e 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -62,7 +62,18 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign raise ValueError(f"Unsupported client type: {client_type}") +import json def create(client: instructor.client, *args, **kwargs) -> Any: # type: ignore + # gemini has slightly different argument namings... + # max_tokens -> max_output_tokens + if "generativemodel" in type(client.client).__name__.lower(): + max_output_tokens = kwargs.pop("max_tokens") + temperature = kwargs.pop("temperature", None) + _ = kwargs.pop("model") # model is baked in the client + kwargs['generation_config'] = dict( + max_output_tokens=max_output_tokens, + temperature=temperature, + ) return client.chat.completions.create(*args, **kwargs) From fae45d61acf8d3f86df417ddb78393b44c9c7eb1 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 21:57:29 +0200 Subject: [PATCH 060/150] better column map error --- weave/flow/eval.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 4c211e5d5469..454230a37c02 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -210,9 +210,21 @@ async def predict_and_score( "model_output" not in score_arg_names and "output" not in score_arg_names ): - raise OpCallError( - f"Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function." + message = textwrap.dedent( + f""" + Call error {e} + + Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function. + You can also set the `scorer.column_map` attribute to map dataset columns to the expected parameter names in the scorer. + For example, if the scorer expects "input" and "ground_truth" and we have a dataset + with columns "question" and "answer", column_map should be defined as follows: + {"input": "question", "ground_truth": "answer"} + scorer.column_map: {scorer.column_map} + score_arg_names: {score_arg_names} + example: {example} + """ ) + raise OpCallError(message) if isinstance(example, dict): # The keys of `score_args` must match the parameter names of the scorer's `score` method. 
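
A quick sketch of the mapping this error message describes (the `CorrectnessScorer`
below is a made-up scorer, used only to illustrate the shape of `column_map`):

```python
import weave
from weave import Scorer

class CorrectnessScorer(Scorer):
    """Hypothetical scorer whose `score` expects `input` and `ground_truth`."""

    @weave.op
    def score(self, output: str, input: str, ground_truth: str) -> dict:
        return {"correct": ground_truth.lower() in output.lower()}

# The dataset rows use "question" and "answer", so map them to the scorer's arguments.
scorer = CorrectnessScorer(column_map={"input": "question", "ground_truth": "answer"})
```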
From 748d3f494a119e676afa8b6f7e74b1fdc17b1b51 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 21:57:53 +0200 Subject: [PATCH 061/150] test LLM integrations --- tests/scorers/test_llm_integrations.py | 74 ++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tests/scorers/test_llm_integrations.py diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py new file mode 100644 index 000000000000..1cc19541450f --- /dev/null +++ b/tests/scorers/test_llm_integrations.py @@ -0,0 +1,74 @@ +import os +import pytest +from itertools import product + +from pydantic import BaseModel + +from weave.flow.scorers.summarization_scorer import SummarizationScorer, SummarizationEvaluationResponse + +# Define providers and their models +TEST_MODELS = { + "openai": ["gpt-4o-mini", "gpt-4o"], + "anthropic": ["claude-3-haiku-20240307", "claude-3-5-sonnet-20240620"], + "mistral": ["mistral-small-latest", "mistral-large-latest"], + "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"], +} + +def get_client_and_model(provider, model): + api_key_env_vars = { + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + "mistral": "MISTRAL_API_KEY", + "gemini": "GEMINI_API_KEY", + } + + if provider not in TEST_MODELS: + raise ValueError(f"Unknown provider: {provider}") + + if model not in TEST_MODELS[provider]: + raise ValueError(f"Model '{model}' not available for provider '{provider}'") + + api_key = os.getenv(api_key_env_vars[provider]) + if not api_key: + raise EnvironmentError(f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable.") + + if provider == "openai": + from openai import OpenAI + client = OpenAI(api_key=api_key) + elif provider == "anthropic": + from anthropic import Anthropic + client = Anthropic(api_key=api_key) + elif provider == "mistral": + from mistralai import Mistral + client = Mistral(api_key=api_key) + elif provider == "gemini": + import google.generativeai as genai + genai.configure(api_key=api_key) + client = genai.GenerativeModel(model_name=model) + model = "gemini" # Adjust if necessary + + return client, model + +# Generate test parameters +test_params = [(provider, model) for provider, models in TEST_MODELS.items() for model in models] + +@pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}") +def test_summarization_scorer_evaluate_summary(provider, model): + try: + client, model_id = get_client_and_model(provider, model) + except (ValueError, EnvironmentError) as e: + pytest.skip(str(e)) + + summarization_scorer = SummarizationScorer( + client=client, + model_id=model_id, + temperature=0.7, + max_tokens=1024, + ) + input_text = "This is the original text." + summary_text = "This is the summary." 
+ result = summarization_scorer.evaluate_summary( + input=input_text, + summary=summary_text + ) + assert isinstance(result, SummarizationEvaluationResponse) From 2445c97c43e1d18f14d4ab38f763ae86d397d4b6 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 22:01:33 +0200 Subject: [PATCH 062/150] update reqa --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d8fa201b7f96..952cb5e78255 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ litellm = ["litellm>=1.36.1"] llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] -scorers = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0"] +scorers = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] notdiamond = ["notdiamond>=0.3.21"] openai = ["openai>=1.0.0"] modal = ["modal", "python-dotenv"] From 16ec03fdb83c946d0050ae04aeeed5d600e8e585 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 14 Oct 2024 22:36:53 +0200 Subject: [PATCH 063/150] fix test --- tests/scorers/test_hallucination_scorer.py | 17 ++++------------- weave/flow/scorers/hallucination_scorer.py | 16 ++++------------ 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 237d7f5ddb2b..8f5241028c76 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -16,14 +16,14 @@ def mock_create(monkeypatch): def _mock_create(*args, **kwargs): return HallucinationResponse( chain_of_thought="The output is consistent with the input data.", - hallucination_reasonings=[ + reasonings=[ HallucinationReasoning( observation="My observation for this is that the output is consistent with the input data.", hallucination_type="No Hallucination", ) ], conclusion="The output is consistent with the input data.", - is_hallucination=False, + hallucination_free=True, ) monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create) @@ -50,14 +50,5 @@ def test_hallucination_scorer_score(hallucination_scorer, mock_create): output = "John's favorite cheese is cheddar." context = "John likes various types of cheese." result = hallucination_scorer.score(output=output, context=context) - assert isinstance(result, HallucinationResponse) - assert not result.is_hallucination - assert isinstance(result.hallucination_reasonings, list) - assert isinstance(result.hallucination_reasonings[0], HallucinationReasoning) - assert result.chain_of_thought == "The output is consistent with the input data." - assert ( - result.hallucination_reasonings[0].observation - == "My observation for this is that the output is consistent with the input data." - ) - assert result.conclusion == "The output is consistent with the input data." - assert result.hallucination_reasonings[0].hallucination_type == "No Hallucination" + # we should be able to do this validation + _ = HallucinationResponse.model_validate(result) diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index e7de0b174748..3a25c3aa9a98 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -95,13 +95,13 @@ class HallucinationResponse(BaseModel): description="Think step by step about whether the contains hallucinations \ based on the ." 
) - hallucination_reasonings: List[HallucinationReasoning] = Field( + reasonings: List[HallucinationReasoning] = Field( description="A list of reasoning steps that lead to the conclusion about whether or not\ the contains hallucinations." ) conclusion: str = Field(description="The conclusion of the analysis.") - is_hallucination: bool = Field( - description="Whether the contains hallucinations based on the ." + hallucination_free: bool = Field( + description="Whether the is free of hallucinations based on the . True means it is NOT a hallucination." ) @@ -158,12 +158,4 @@ def score(self, output: str, context: str) -> HallucinationResponse: temperature=self.temperature, max_tokens=self.max_tokens, ) - hallucination_reasonings = [ - r.model_dump_json() for r in response.hallucination_reasonings - ] - return { - "hallucination_free": not response.is_hallucination, - "conclusion": response.conclusion, - "reasonings": json.dumps(hallucination_reasonings), - "chain_of_thought": response.chain_of_thought, - } + return response.model_dump() # Morgan wants this to be a dict From 1eefa3103461a86dfec0c5625bb4969fd191f82b Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Tue, 15 Oct 2024 12:20:14 +0200 Subject: [PATCH 064/150] lint --- tests/scorers/test_llm_integrations.py | 29 ++++++++----- tests/scorers/test_string_scorer.py | 1 + tests/scorers/test_summarization_scorer.py | 47 ++++++++++++---------- weave/flow/eval.py | 6 +-- 4 files changed, 48 insertions(+), 35 deletions(-) diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py index 1cc19541450f..efdc7baab447 100644 --- a/tests/scorers/test_llm_integrations.py +++ b/tests/scorers/test_llm_integrations.py @@ -1,10 +1,11 @@ import os -import pytest -from itertools import product -from pydantic import BaseModel +import pytest -from weave.flow.scorers.summarization_scorer import SummarizationScorer, SummarizationEvaluationResponse +from weave.flow.scorers.summarization_scorer import ( + SummarizationEvaluationResponse, + SummarizationScorer, +) # Define providers and their models TEST_MODELS = { @@ -14,6 +15,7 @@ "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"], } + def get_client_and_model(provider, model): api_key_env_vars = { "openai": "OPENAI_API_KEY", @@ -30,27 +32,37 @@ def get_client_and_model(provider, model): api_key = os.getenv(api_key_env_vars[provider]) if not api_key: - raise EnvironmentError(f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable.") + raise EnvironmentError( + f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable." 
+ ) if provider == "openai": from openai import OpenAI + client = OpenAI(api_key=api_key) elif provider == "anthropic": from anthropic import Anthropic + client = Anthropic(api_key=api_key) elif provider == "mistral": from mistralai import Mistral + client = Mistral(api_key=api_key) elif provider == "gemini": import google.generativeai as genai + genai.configure(api_key=api_key) client = genai.GenerativeModel(model_name=model) model = "gemini" # Adjust if necessary return client, model + # Generate test parameters -test_params = [(provider, model) for provider, models in TEST_MODELS.items() for model in models] +test_params = [ + (provider, model) for provider, models in TEST_MODELS.items() for model in models +] + @pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}") def test_summarization_scorer_evaluate_summary(provider, model): @@ -58,7 +70,7 @@ def test_summarization_scorer_evaluate_summary(provider, model): client, model_id = get_client_and_model(provider, model) except (ValueError, EnvironmentError) as e: pytest.skip(str(e)) - + summarization_scorer = SummarizationScorer( client=client, model_id=model_id, @@ -68,7 +80,6 @@ def test_summarization_scorer_evaluate_summary(provider, model): input_text = "This is the original text." summary_text = "This is the summary." result = summarization_scorer.evaluate_summary( - input=input_text, - summary=summary_text + input=input_text, summary=summary_text ) assert isinstance(result, SummarizationEvaluationResponse) diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index c2b0846a7c5a..a599bdd17cf7 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -19,6 +19,7 @@ def test_string_match_scorer_false(): result = scorer.score(output, target) assert result["string_in_input"] is False + def test_levenshtein_scorer(): scorer = LevenshteinScorer() output = "Hello" diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 2dc6bb6e820e..97090994550a 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -1,7 +1,7 @@ import pytest -import weave from openai import OpenAI +import weave from weave.flow.scorers.summarization_scorer import ( EntityExtractionResponse, SummarizationEvaluationResponse, @@ -81,31 +81,34 @@ def test_summarization_scorer_extract_entities(summarization_scorer): assert "entity1" in entities assert "entity2" in entities + @pytest.mark.asyncio async def test_evaluate_summary_scorer(summarization_scorer): - dataset = [{ - "input": "This is the original text.", - "output": "This is the summary.", - }, - { - "input": "This is another original text.", - "output": "This is another summary.", - }] + dataset = [ + { + "input": "This is the original text.", + "output": "This is the summary.", + }, + { + "input": "This is another original text.", + "output": "This is another summary.", + }, + ] evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) - + @weave.op - def model(input: str): + def model(input: str): return "This is the summary." 
- + result = await evaluation.evaluate(model) assert isinstance(result, dict) - assert 'SummarizationScorer' in result - assert 'entity_density' in result['SummarizationScorer'] - assert 'is_entity_dense' in result['SummarizationScorer'] - assert 'summarization_eval_score' in result['SummarizationScorer'] - assert 'model_latency' in result - - assert result['SummarizationScorer']['entity_density']['mean'] == pytest.approx(0.5) - assert result['SummarizationScorer']['is_entity_dense']['true_count'] == 2 - assert result['SummarizationScorer']['is_entity_dense']['true_fraction'] == 1.0 - assert result['SummarizationScorer']['summarization_eval_score']['mean'] == 1.0 \ No newline at end of file + assert "SummarizationScorer" in result + assert "entity_density" in result["SummarizationScorer"] + assert "is_entity_dense" in result["SummarizationScorer"] + assert "summarization_eval_score" in result["SummarizationScorer"] + assert "model_latency" in result + + assert result["SummarizationScorer"]["entity_density"]["mean"] == pytest.approx(0.5) + assert result["SummarizationScorer"]["is_entity_dense"]["true_count"] == 2 + assert result["SummarizationScorer"]["is_entity_dense"]["true_fraction"] == 1.0 + assert result["SummarizationScorer"]["summarization_eval_score"]["mean"] == 1.0 diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 454230a37c02..4fbb04d76f60 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -212,14 +212,12 @@ async def predict_and_score( ): message = textwrap.dedent( f""" - Call error {e} - Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function. You can also set the `scorer.column_map` attribute to map dataset columns to the expected parameter names in the scorer. For example, if the scorer expects "input" and "ground_truth" and we have a dataset with columns "question" and "answer", column_map should be defined as follows: - {"input": "question", "ground_truth": "answer"} - scorer.column_map: {scorer.column_map} + {{"input": "question", "ground_truth": "answer"}} + scorer.column_map: {getattr(scorer, 'column_map', None)} score_arg_names: {score_arg_names} example: {example} """ From 30e9d0180b824d625d9719144694f1899a3e3d03 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 16 Oct 2024 17:19:49 +0200 Subject: [PATCH 065/150] pass API env vars --- .github/workflows/test.yaml | 3 +++ noxfile.py | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a63d1c75ec4f..fd3c17598e96 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -292,6 +292,9 @@ jobs: WF_CLICKHOUSE_HOST: weave_clickhouse WEAVE_SERVER_DISABLE_ECOSYSTEM: 1 GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | nox -e "tests-${{ matrix.python-version-major }}.${{ matrix.python-version-minor }}(shard='${{ matrix.nox-shard }}')" trace-tests-matrix-check: # This job does nothing and is only used for the branch protection diff --git a/noxfile.py b/noxfile.py index c1c4c149960d..9b0c54b02678 100644 --- a/noxfile.py +++ b/noxfile.py @@ -63,6 +63,14 @@ def tests(session, shard): # Add the GOOGLE_API_KEY environment variable for the "google" shard if shard == "google_ai_studio": env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") + + # we are doing some integration test in test_llm_integrations.py that 
requires + # setting some environment variables for the LLM providers + if shard == "scorers": + env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") + env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY") + env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY") + env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY") default_test_dirs = [f"integrations/{shard}/"] test_dirs_dict = { From a52fc8316b31483898d6af0ff00795193b96af55 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 16 Oct 2024 17:25:22 +0200 Subject: [PATCH 066/150] rename google key --- tests/scorers/test_llm_integrations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py index efdc7baab447..82ca973a22c8 100644 --- a/tests/scorers/test_llm_integrations.py +++ b/tests/scorers/test_llm_integrations.py @@ -21,7 +21,7 @@ def get_client_and_model(provider, model): "openai": "OPENAI_API_KEY", "anthropic": "ANTHROPIC_API_KEY", "mistral": "MISTRAL_API_KEY", - "gemini": "GEMINI_API_KEY", + "gemini": "GOOGLE_API_KEY", } if provider not in TEST_MODELS: From 633be159d1caa1674aa7ca81a8d96b487f18236d Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 16 Oct 2024 17:27:47 +0200 Subject: [PATCH 067/150] remove 3.13 --- noxfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/noxfile.py b/noxfile.py index 9b0c54b02678..e9e224275a91 100644 --- a/noxfile.py +++ b/noxfile.py @@ -11,6 +11,7 @@ "litellm", "notdiamond", "google_ai_studio", + "scorers", ] From 3ca802d9923ebcd01f3bc2487bb6cc627a5d0ab9 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 16 Oct 2024 17:45:58 +0200 Subject: [PATCH 068/150] lint --- weave/flow/scorers/hallucination_scorer.py | 11 +++++------ weave/flow/scorers/llm_utils.py | 11 ++++++----- weave/flow/scorers/string_scorer.py | 3 +-- weave/flow/scorers/summarization_scorer.py | 17 +++++++---------- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index 3a25c3aa9a98..1c4e58b1cf6f 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -1,4 +1,3 @@ -import json from typing import List from pydantic import BaseModel, Field @@ -111,18 +110,18 @@ class HallucinationFreeScorer(InstructorLLMScorer): based on the input data. Note: - - The meaning of "hallucination" can vary from person to person, you will likely want to + - The meaning of "hallucination" can vary from person to person, you will likely want to customize the `system_prompt` and `user_prompt` to fit your specific needs. - - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM provider's response; you will have to install the `instructor` python package to use it. - The `score` method expects the input column from the dataset to be named "context". It will use - this data as the ground-truth to check hallucinations against. If your dataset column has a - different name, you can specify a different mapping using the `column_map` argument in the init + this data as the ground-truth to check hallucinations against. If your dataset column has a + different name, you can specify a different mapping using the `column_map` argument in the init of HallucinationFreeScorer by passing `column_map={"context": "context"}`. 
Attributes: system_prompt (str): The prompt describing the task, defines what a "hallucination" is. - user_prompt (str): The string template to pass the input and output data. The template must + user_prompt (str): The string template to pass the input and output data. The template must contain placeholders for both `{input_data}` and `{output}`. model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. temperature (float): LLM temperature setting. diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index 13ee777dcd9e..c57172277b56 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -19,11 +19,13 @@ if TYPE_CHECKING: import instructor from anthropic import Anthropic, AsyncAnthropic + from google.generativeai import GenerativeModel from mistralai import Mistral from openai import AsyncOpenAI, OpenAI - from google.generativeai import GenerativeModel - _LLM_CLIENTS = Union[OpenAI, AsyncOpenAI, Anthropic, AsyncAnthropic, Mistral, GenerativeModel] + _LLM_CLIENTS = Union[ + OpenAI, AsyncOpenAI, Anthropic, AsyncAnthropic, Mistral, GenerativeModel + ] else: _LLM_CLIENTS = object @@ -62,15 +64,14 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign raise ValueError(f"Unsupported client type: {client_type}") -import json def create(client: instructor.client, *args, **kwargs) -> Any: # type: ignore # gemini has slightly different argument namings... # max_tokens -> max_output_tokens if "generativemodel" in type(client.client).__name__.lower(): max_output_tokens = kwargs.pop("max_tokens") temperature = kwargs.pop("temperature", None) - _ = kwargs.pop("model") # model is baked in the client - kwargs['generation_config'] = dict( + _ = kwargs.pop("model") # model is baked in the client + kwargs["generation_config"] = dict( max_output_tokens=max_output_tokens, temperature=temperature, ) diff --git a/weave/flow/scorers/string_scorer.py b/weave/flow/scorers/string_scorer.py index 7525dd92e178..62f4eff72e07 100644 --- a/weave/flow/scorers/string_scorer.py +++ b/weave/flow/scorers/string_scorer.py @@ -1,5 +1,4 @@ -import re -from typing import Callable, Union +from typing import Callable from pydantic import Field, model_validator diff --git a/weave/flow/scorers/summarization_scorer.py b/weave/flow/scorers/summarization_scorer.py index eae5b3cfa6fe..7e2049071a8c 100644 --- a/weave/flow/scorers/summarization_scorer.py +++ b/weave/flow/scorers/summarization_scorer.py @@ -5,7 +5,7 @@ import weave from weave.flow.scorers.llm_scorer import InstructorLLMScorer -from weave.flow.scorers.llm_utils import create, OPENAI_DEFAULT_MODEL +from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create DEFAULT_EXTRACTION_SYSTEM_PROMPT = """ Given a , extract all the unique entities from the text without repetition. @@ -89,20 +89,20 @@ class SummarizationScorer(InstructorLLMScorer): should look like. Note: - - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM provider's response; you will have to install the `instructor` python package to use it. - The `score` method expects the input column from the dataset to be named "input". 
If your dataset - column has a different name, you can specify a different mapping using the `column_map` argument in the + column has a different name, you can specify a different mapping using the `column_map` argument in the init of SummarizationScorer by passing `column_map={"input": "news_article"}`. Attributes: - extraction_system_prompt (str): System prompt to extract the distinct entities in the input. Customising + extraction_system_prompt (str): System prompt to extract the distinct entities in the input. Customising this can help ensure that the LLM identifies the `entities` that you care about. extraction_prompt (str): Prompt template for entity extraction; must contain a `{text}` placeholder. summarization_evaluation_system_prompt (str): System prompt defining how to evaluate the quality of a summary. Asks an LLM to grade the summary from `poor`, `ok`, to `excellent` and provide a rationale for the grade. - summarization_evaluation_prompt (str): Prompt template for summarization evaluation instruction; must contain - `{input}` and `{summary}` placeholders. + summarization_evaluation_prompt (str): Prompt template for summarization evaluation instruction; must contain + `{input}` and `{summary}` placeholders. entity_density_threshold (float): Threshold for determining if a summary is sufficiently entity-dense. model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. temperature (float): LLM temperature setting. @@ -119,7 +119,6 @@ class SummarizationScorer(InstructorLLMScorer): Calculates summarization score and entity density score for the given input and output. """ - extraction_system_prompt: str = DEFAULT_EXTRACTION_SYSTEM_PROMPT extraction_prompt: str = DEFAULT_EXTRACTION_USER_PROMPT summarization_evaluation_system_prompt: str = ( @@ -179,9 +178,7 @@ def simple_word_tokenize(self, text: str) -> List[str]: @weave.op async def score(self, input: str, output: str, **kwargs: Any) -> dict: - extract_task = asyncio.to_thread( - self.extract_entities, text=str(output) - ) + extract_task = asyncio.to_thread(self.extract_entities, text=str(output)) evaluate_task = asyncio.to_thread( self.evaluate_summary, input=str(input), summary=str(output) ) From 5dfed1f244a9f6120fd5a8a9ad39df63fe2e32f7 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 16 Oct 2024 17:52:02 +0200 Subject: [PATCH 069/150] whitespaces :/ --- noxfile.py | 2 +- tests/trace/test_evaluations.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index e9e224275a91..8188ead202fc 100644 --- a/noxfile.py +++ b/noxfile.py @@ -64,7 +64,7 @@ def tests(session, shard): # Add the GOOGLE_API_KEY environment variable for the "google" shard if shard == "google_ai_studio": env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") - + # we are doing some integration test in test_llm_integrations.py that requires # setting some environment variables for the LLM providers if shard == "scorers": diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 4356db9ffbfd..b8cd55101cd7 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -882,6 +882,8 @@ def model_function(col1, col2): assert ( eval_out["AnotherDummyScorer"]["match"] == expected_results_another_dummy ), "No matches should be found for AnotherDummyScorer" + + async def test_feedback_is_correctly_linked(client): @weave.op def predict(text: str) -> str: From adfad10dbfac54d2ae52bad797c6ec4116e0ab50 Mon Sep 17 00:00:00 2001 From: 
Morgan McGuire Date: Wed, 16 Oct 2024 19:14:43 +0100 Subject: [PATCH 070/150] re-write scorer docs for grade 7 --- docs/docs/guides/evaluation/scorers.md | 134 ++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 3 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 99507fd7a525..eebe009800b0 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -1,17 +1,145 @@ # Evaluation Metrics -We call evaluation metrics "Scorers", +## Evaluation Metrics in Weave +In Weave, Scorers are tools that help you evaluate the outputs from your AI system. They take the AI's output, analyze it, and return a dictionary with the results. Scorers can also look at parts of your input data and provide extra information, like explanations of their evaluations. + +You use Scorers when creating a `weave.Evaluation object`. There are two main types of Scorers: + +1. Function-based Scorers: Simple Python functions decorated with @weave.op. +2. Class-based Scorers: Python classes that inherit from weave.Scorer for more complex evaluations. + +## Function-based Scorers +These are straightforward functions that return a dictionary. They're great for simple evaluations. + +```python +@weave.op +def evaluate_uppercase(text: str): + return {"text_is_uppercase": text.isupper()} + +eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) +``` + +In this example, evaluate_uppercase checks if the text is all uppercase. + +## Class-based Scorers +For more advanced evaluations, especially when you need to keep track of additional information or make multiple function calls, you can create a Scorer class. + +**Requirements:** +- Inherit from weave.Scorer. +- Define a score method decorated with @weave.op. +- The score method must return a dictionary. + +Example: + + +```python +from weave import Scorer + +class SummarizationScorer(Scorer): + model_id : "the LLM model to use" + + @weave.op + def score(output, text) + ''' + output: The summary generated by an AI system + text: The original text being summarised + ''' + ... # evaluate the quality of the summary + +summarization_scorer = SummarizationScorer(model_id="o2") +eval = weave.Evaluations(..., scorers=[summarization_scorer]) +``` +This class evaluates how good a summary is by comparing it to the original text. + +## How Scorers Work +### Using Keyword Arguments +Scorers can access both the output from your AI system and the input data. + +- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output. + +- **Input:** Add parameters that match the names of the columns in your dataset to access input data. + +For example, if your dataset has a "news_article" column, you can access it in the scorer by adding a `news_article` parameter to your scorer's signature. + +### Mapping Column Names +Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`. + +If you're using a class-based scorer, pass a dictionary to the `column_map` attribute when you create the scorer. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer keyword argument : dataset column name}`. + +Example: + +```python +from weave import Scorer + +# A dataset with news articles to be summarised +dataset = [ + {"news_article": "The news today was great...", "date": "2030-04-20", "source": "Bright Sky Network"} + ... 
+] + +# Scorer class +class SummarizationScorer(Scorer) + + @weave.op + def score(output, text) + """ + output: output summary from a LLM summarization system + text: the text being summarised + """ + ... # evaluate the quality of the summary + +# create a scorer with a column map +scorer = SummarizationScorer(column_map = {"text" : "news_article"}) +``` +Here, the text parameter in the score method will receive data from the `news_article` column. + + +In this case, weave Evaluations will automatically pass the output of the LLM summarization system to the `output` parameter in the `score` function and will also extract the value for the `news_article` key in the input dataset row and pass it to the `text` parameter in the scorer. + + +##nExamples of Scorers + +LLM-Powered Evaluators + +### `HallucinationFreeScorer` +This scorer checks if your AI system's output includes any hallucinations based on the input data. + +**Notes:** +- Customize the `system_prompt` and `user_prompt` attributes to define what "hallucination" means for you. +- This scorer uses the `InstructorLLMScorer` class, so you'll need to install the `instructor` Python package. +- The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column. + +## `SummarizationScorer` +This scorer evaluates summaries in two ways: + +1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word summary count in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. + +2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages. + +**Customization:** +- Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to define what makes a good summary. + +**Notes:** +- This scorer uses the `InstructorLLMScorer` class. +- The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed. + TODO: -- Why use Scorers (re-use?) + + +- Why use Scorers (re-use?) - you want a class not a function - Explain columm_map + only during Evaluation call + + output - output of predict + column_map: A `scorer parameter name : dataset column name` mapping. This summarization scorer expects the input column in the dataset to be named "input" \ and the output column in the dataset to be named "summary". You can specify a different mapping in the `column_map` argument. For example, \ if your dataset contains columns "news_article" and "news_summary" then you can \ - specify `column_map={"input": "news_article", "output": "news_summary"}`. + specify `column_map={"input": "news_article", "output": "news_summary"}`. 
show explicit mapping in the `score` as an example ## LLM-powered Evaluators From 13cee248d9239d4281f98b343a7e74373be48a3a Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Wed, 16 Oct 2024 20:13:00 +0100 Subject: [PATCH 071/150] finish scorers docs edits --- docs/docs/guides/evaluation/scorers.md | 307 ++++++++++++++++--------- 1 file changed, 195 insertions(+), 112 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index eebe009800b0..c0bfd9952931 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -1,15 +1,16 @@ # Evaluation Metrics -## Evaluation Metrics in Weave -In Weave, Scorers are tools that help you evaluate the outputs from your AI system. They take the AI's output, analyze it, and return a dictionary with the results. Scorers can also look at parts of your input data and provide extra information, like explanations of their evaluations. +## Evaluations in Weave +In Weave, Scorers are tools that help you evaluate the outputs from your AI system. They take the AI's output, analyze it, and return a dictionary with the results. Scorers can also use other parts of your input data and can also provide extra information, such as explanations of their evaluations. -You use Scorers when creating a `weave.Evaluation object`. There are two main types of Scorers: +Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers: + +1. **Function-based Scorers:** Simple Python functions decorated with `@weave.op`. +2. **Class-based Scorers:** Python classes that inherit from `weave.Scorer` for more complex evaluations. -1. Function-based Scorers: Simple Python functions decorated with @weave.op. -2. Class-based Scorers: Python classes that inherit from weave.Scorer for more complex evaluations. ## Function-based Scorers -These are straightforward functions that return a dictionary. They're great for simple evaluations. +These are functions, decorated with @weave.op that return a dictionary. They're great for simple evaluations: ```python @weave.op @@ -22,12 +23,12 @@ eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) In this example, evaluate_uppercase checks if the text is all uppercase. ## Class-based Scorers -For more advanced evaluations, especially when you need to keep track of additional information or make multiple function calls, you can create a Scorer class. +For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different scorer prompt or make multiple function calls, you can use the `Scorer` class. **Requirements:** -- Inherit from weave.Scorer. -- Define a score method decorated with @weave.op. -- The score method must return a dictionary. +- Inherit from `weave.Scorer`. +- Define a `score` method decorated with `@weave.op`. +- The `score` method must return a dictionary. Example: @@ -52,7 +53,7 @@ eval = weave.Evaluations(..., scorers=[summarization_scorer]) This class evaluates how good a summary is by comparing it to the original text. ## How Scorers Work -### Using Keyword Arguments +### Keyword Arguments Scorers can access both the output from your AI system and the input data. - **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output. 
@@ -64,7 +65,7 @@ For example, if your dataset has a "news_article" column, you can access it in t ### Mapping Column Names Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`. -If you're using a class-based scorer, pass a dictionary to the `column_map` attribute when you create the scorer. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer keyword argument : dataset column name}`. +If you're using a class-based scorer, pass a dictionary to the `column_map` attribute when you create the scorer. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer keyword argument : dataset column name}`. `column_map` is an attribute of the `Scorer` class. Example: @@ -94,25 +95,58 @@ scorer = SummarizationScorer(column_map = {"text" : "news_article"}) Here, the text parameter in the score method will receive data from the `news_article` column. -In this case, weave Evaluations will automatically pass the output of the LLM summarization system to the `output` parameter in the `score` function and will also extract the value for the `news_article` key in the input dataset row and pass it to the `text` parameter in the scorer. +In this case, weave Evaluations will automatically pass the output of the LLM summarization system to the `output` parameter in the `score` function and will extract the value for the `news_article` key in the input dataset row and pass it to the `text` parameter in the scorer. + +## Predefined Scorers -##nExamples of Scorers +**LLM-evaluators** -LLM-Powered Evaluators +The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also uses weave's `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. ### `HallucinationFreeScorer` + This scorer checks if your AI system's output includes any hallucinations based on the input data. +```python +from weave.scorers import HallucinationFreeScorer + +llm_client = # initialize your LLM client here + +scorer = HallucinationFreeScorer( + client=llm_client, + model_id="gpt4o" +) +``` + +**Customization:** +- Customize the `system_prompt` and `user_prompt` attributes of the scorer to define what "hallucination" means for you. + **Notes:** -- Customize the `system_prompt` and `user_prompt` attributes to define what "hallucination" means for you. -- This scorer uses the `InstructorLLMScorer` class, so you'll need to install the `instructor` Python package. - The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column. +--- + ## `SummarizationScorer` + +Use an LLM to compare a summary to the original text and evaluate the quality of the summary. + +```python +from weave.scorers import SummarizationScorer + +llm_client = # initialize your LLM client here + +scorer = SummarizationScorer( + client=llm_client, + model_id="gpt4o" +) +``` + +**How It Works:** + This scorer evaluates summaries in two ways: -1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word summary count in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. +1. 
**Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. Similar to how entity density is used in the Chain of Density paper, https://arxiv.org/abs/2309.04269 2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages. @@ -124,103 +158,152 @@ This scorer evaluates summaries in two ways: - The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed. -TODO: +--- +### `OpenAIModerationScorer` -- Why use Scorers (re-use?) - you want a class not a function -- Explain columm_map - only during Evaluation call +The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI system's output contains disallowed content, such as hate speech or explicit material. - output - output of predict +```python +from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer +import openai - column_map: A `scorer parameter name : dataset column name` mapping. - - This summarization scorer expects the input column in the dataset to be named "input" \ - and the output column in the dataset to be named "summary". - You can specify a different mapping in the `column_map` argument. For example, \ - if your dataset contains columns "news_article" and "news_summary" then you can \ - specify `column_map={"input": "news_article", "output": "news_summary"}`. show explicit mapping in the `score` as an example - -## LLM-powered Evaluators - -### HallucinationFreeScorer - -A Scorer that uses an LLM to determine if the model output contains any hallucinations -based on the input data. - -Note: - - The meaning of "hallucination" can vary from person to person, you will likely want to - customize the `system_prompt` and `user_prompt` to fit your specific needs. - - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM - provider's response; you will have to install the `instructor` python package to use it. - - The `score` method expects the input column from the dataset to be named "context". It will use - this data as the ground-truth to check hallucinations against. If your dataset column has a - different name, you can specify a different mapping using the `column_map` argument in the init - of HallucinationFreeScorer by passing `column_map={"context": "context"}`. - -Attributes: - system_prompt (str): The prompt describing the task, defines what a "hallucination" is. - user_prompt (str): The string template to pass the input and output data. The template must - contain placeholders for both `{input_data}` and `{output}`. - model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. - temperature (float): LLM temperature setting. - max_tokens (int): Maximum number of tokens in the LLM's response. - -Methods: - score(output: str, context: str) -> HallucinationResponse: - Analyzes the output to detect hallucinations based on the given context. 
- -### SummarizationScorer - -A Scorer that evaluates the quality of summaries in two ways: - - using an LLM to calculate the entity density of the summary, similar to how entity density is - used in the Chain of Density paper, https://arxiv.org/abs/2309.04269. This is a rough measure for - how information-dense the summary is. - - using another LLM evaluator to grade the summary quality from `poor`, `ok`, to `excellent`. These - grades are then mapped to numerical scores, {`poor`: 0.0, `ok`: 0.5, `excellent`: 1.0}, in order to - be able to calculate an average score across a dataset of summaries if needed. - -To customise the LLM evaluator you can customise the `summarization_evaluation_system_prompt`and -`summarization_evaluation_prompt` attributes to be tailored your specific definition of what a good summary -should look like. - -Note: - - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM - provider's response; you will have to install the `instructor` python package to use it. - - The `score` method expects the input column from the dataset to be named "input". If your dataset - column has a different name, you can specify a different mapping using the `column_map` argument in the - init of SummarizationScorer by passing `column_map={"input": "news_article"}`. - -Attributes: - extraction_system_prompt (str): System prompt to extract the distinct entities in the input. Customising - this can help ensure that the LLM identifies the `entities` that you care about. - extraction_prompt (str): Prompt template for entity extraction; must contain a `{text}` placeholder. - summarization_evaluation_system_prompt (str): System prompt defining how to evaluate the quality of a summary. - Asks an LLM to grade the summary from `poor`, `ok`, to `excellent` and provide a rationale for the grade. - summarization_evaluation_prompt (str): Prompt template for summarization evaluation instruction; must contain - `{input}` and `{summary}` placeholders. - entity_density_threshold (float): Threshold for determining if a summary is sufficiently entity-dense. - model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. - temperature (float): LLM temperature setting. - max_tokens (int): Maximum number of tokens in the LLM's response. - -Methods: - extract_entities(text: str) -> List[str]: - Uses an LLM to extract unique entities from the text. - - evaluate_summary(input: str, summary: str) -> SummarizationEvaluationResponse: - Evaluates the quality of a summary using an LLM. - - score(input: str, output: str, **kwargs: Any) -> dict: - Calculates summarization score and entity density score for the given input and output. - - -## Programmatic Evaluators - -### ValidJSONScorer - -Validate whether a string is valid JSON. +oai_client = OpenAI(api_key=...) # initialize your LLM client here + +scorer = OpenAIModerationScorer( + client=oai_client, + model_id="text-embedding-3-small" +) +``` + +**How It Works:** + +- Sends the AI's output to the OpenAI Moderation endpoint and returns a dictionary indicating whether the content is flagged and details about the categories involved. + +**Notes:** +- Requires the `openai` Python package. +- The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client. + +--- + +### `EmbeddingSimilarityScorer` + +The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. 
It's useful for measuring how similar the AI's output is to a reference text. + +```python +from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer + +llm_client = ... # initialise your LlM client + +similarity_scorer = EmbeddingSimilarityScorer( + client=llm_client + target_column="reference_text", # the dataset column to compare the output against + threshold=0.4 # the cosine similarity threshold to use +) +``` + +**Parameters:** + +- `target_column`: Name of the dataset column containing the reference text (default is `"text"`). +- `threshold` (float): Minimum cosine similarity score considered as similar (default is `0.5`). + +--- + +### `ValidJSONScorer` + +The ValidJSONScorer checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity. + +```python +from weave.flow.scorers.json_scorer import ValidJSONScorer + +json_scorer = ValidJSONScorer() +``` + +**Notes:** +- If the output cannot be parsed as JSON, or if it parses to a data type other than dict or list, it is considered invalid. + +--- +Please add this section to scorers_2.md under the "Predefined Scorers" heading. + +--- + +### `ValidXMLScorer` + +The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs. + +```python +from weave.flow.scorers.xml_scorer import ValidXMLScorer + +xml_scorer = ValidXMLScorer() +``` +--- + +### `PydanticScorer` + +The `PydanticScorer` validates the AI system's output against a Pydantic model to ensure it adheres to a specified schema or data structure. + +```python +from weave.flow.scorers.pydantic_scorer import PydanticScorer +from pydantic import BaseModel + +class FinancialReport(BaseModel): + revenue: int + year: str + +pydantic_scorer = PydanticScorer(model=Person) ``` -from weave.scorers import ValidJSONScorer -``` \ No newline at end of file + +--- + +### RAGAS - `ContextEntityRecallScorer` + +The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library + +```python +from weave.flow.scorers.ragas_scorer import ContextEntityRecallScorer + +llm_client = ... # initialise your LlM client + +entity_recall_scorer = ContextEntityRecallScorer( + client=llm_client + model_id="your-model-id" + ) +``` + +**How It Works:** + +- Uses an LLM to extract unique entities from the output and context. +- Calculates recall as the proportion of entities in the output that are present in the context. +- Returns a dictionary with the recall score. + +**Notes:** + +- Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. + +--- + +### RAGAS - `ContextRelevancyScorer` + +The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library. + +```python +from weave.flow.scorers.ragas_scorer import ContextRelevancyScorer + +llm_client = ... # initialise your LlM client + +relevancy_scorer = ContextRelevancyScorer( + llm_client = ... 
# initialise your LlM client + model_id="your-model-id" + ) +``` + +**How It Works:** + +- Uses an LLM to rate the relevancy of the context to the output on a scale from 0 to 1. +- Returns a dictionary with the `relevancy_score`. + +**Notes:** + +- Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. +- Customize the `relevancy_prompt` to define how relevancy is assessed. \ No newline at end of file From 6dcbbbe0d4251e2c93a38e3fdc9c60dc1800dea6 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Wed, 16 Oct 2024 20:55:53 +0100 Subject: [PATCH 072/150] small docs fixes --- docs/docs/guides/evaluation/scorers.md | 37 +++++++++++++++++--------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index c0bfd9952931..8cb148e89ba6 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -1,13 +1,14 @@ # Evaluation Metrics ## Evaluations in Weave -In Weave, Scorers are tools that help you evaluate the outputs from your AI system. They take the AI's output, analyze it, and return a dictionary with the results. Scorers can also use other parts of your input data and can also provide extra information, such as explanations of their evaluations. +In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics They take the AI's output, analyze it, and return a dictionary with the results. Scorers can use your input data as a reference if needed and can also output extra information, such as explanations or reasonings from the evaluation. -Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers: +Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers in weave: 1. **Function-based Scorers:** Simple Python functions decorated with `@weave.op`. 2. **Class-based Scorers:** Python classes that inherit from `weave.Scorer` for more complex evaluations. +Scorers must return a dictionary and can return multiple metrics, nested metrics and non-numeric values such as texted returned from a LLM-evaluator about its reasoning. ## Function-based Scorers These are functions, decorated with @weave.op that return a dictionary. They're great for simple evaluations: @@ -20,10 +21,10 @@ def evaluate_uppercase(text: str): eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) ``` -In this example, evaluate_uppercase checks if the text is all uppercase. +When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. ## Class-based Scorers -For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different scorer prompt or make multiple function calls, you can use the `Scorer` class. +For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators or make multiple function calls, you can use the `Scorer` class. **Requirements:** - Inherit from `weave.Scorer`. @@ -37,7 +38,18 @@ Example: from weave import Scorer class SummarizationScorer(Scorer): - model_id : "the LLM model to use" + model_id: str = "the LLM model to use" + system_prompt: str = "Evaluate whether the summary is good." + + @weave.op + def some_complicated_preprocessing(text): + ... 
+ return text + + @weave.op + def llm_call(summary, text): + res = create(self.system_prompt, summary, text) + return {"summary_quality": res} @weave.op def score(output, text) @@ -45,7 +57,9 @@ class SummarizationScorer(Scorer): output: The summary generated by an AI system text: The original text being summarised ''' - ... # evaluate the quality of the summary + text = some_complicated_preprocessing(text) + eval_result = call_llm(summary, text, self.prompt) + return {"summary_quality": eval_result} summarization_scorer = SummarizationScorer(model_id="o2") eval = weave.Evaluations(..., scorers=[summarization_scorer]) @@ -65,7 +79,7 @@ For example, if your dataset has a "news_article" column, you can access it in t ### Mapping Column Names Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`. -If you're using a class-based scorer, pass a dictionary to the `column_map` attribute when you create the scorer. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer keyword argument : dataset column name}`. `column_map` is an attribute of the `Scorer` class. +If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialise your scorer class. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer keyword argument : dataset column name}`. Example: @@ -89,13 +103,10 @@ class SummarizationScorer(Scorer) """ ... # evaluate the quality of the summary -# create a scorer with a column map +# create a scorer with a column mapping the `text` parameter to the `news_article` data column scorer = SummarizationScorer(column_map = {"text" : "news_article"}) ``` -Here, the text parameter in the score method will receive data from the `news_article` column. - - -In this case, weave Evaluations will automatically pass the output of the LLM summarization system to the `output` parameter in the `score` function and will extract the value for the `news_article` key in the input dataset row and pass it to the `text` parameter in the scorer. +Here, the `text` parameter in the score method will receive data from the `news_article` column. ## Predefined Scorers @@ -127,7 +138,7 @@ scorer = HallucinationFreeScorer( --- -## `SummarizationScorer` +### `SummarizationScorer` Use an LLM to compare a summary to the original text and evaluate the quality of the summary. From 8a1d366e47973b79b6674ebb2e2a8b226c3e678c Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Wed, 16 Oct 2024 21:19:30 +0100 Subject: [PATCH 073/150] small docs fix --- docs/docs/guides/evaluation/scorers.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 8cb148e89ba6..ed7100da8d51 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -233,8 +233,6 @@ json_scorer = ValidJSONScorer() **Notes:** - If the output cannot be parsed as JSON, or if it parses to a data type other than dict or list, it is considered invalid. ---- -Please add this section to scorers_2.md under the "Predefined Scorers" heading. 
--- From b7058e142f4bfd25faaa8e4426e24138c94c8252 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Wed, 16 Oct 2024 21:23:01 +0100 Subject: [PATCH 074/150] delete prints from eval.py --- weave/flow/eval.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index d3f7245ce9ef..443ae687df72 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -272,13 +272,6 @@ async def predict_and_score( # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output if isinstance(scorer, Scorer) and scorer.column_map is not None: - print( - f"scorer.column_map: {scorer.column_map}" - ) # TODO: delete print statement - print( - f"score_arg_names: {score_arg_names}" - ) # TODO: delete print statement - print(f"example: {example}") # TODO: delete print statement score_args = { arg: example[scorer.column_map.get(arg, arg)] for arg in score_arg_names From c05630aff71c7ca51ceb685d030f01e00f215425 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Thu, 17 Oct 2024 12:39:37 +0100 Subject: [PATCH 075/150] Update docs/docs/guides/evaluation/scorers.md Co-authored-by: Andrew Truong --- docs/docs/guides/evaluation/scorers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index ed7100da8d51..e9dbd9855f72 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -1,7 +1,7 @@ # Evaluation Metrics ## Evaluations in Weave -In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics They take the AI's output, analyze it, and return a dictionary with the results. Scorers can use your input data as a reference if needed and can also output extra information, such as explanations or reasonings from the evaluation. +In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. They take the AI's output, analyze it, and return a dictionary of results. Scorers can use your input data as reference if needed and can also output extra information, such as explanations or reasonings from the evaluation. Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers in weave: From fd9e001a75f82550d6c79f80988e07efb246f944 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Thu, 17 Oct 2024 12:40:20 +0100 Subject: [PATCH 076/150] Update docs/docs/guides/evaluation/scorers.md Co-authored-by: Andrew Truong --- docs/docs/guides/evaluation/scorers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index e9dbd9855f72..d29aa2488555 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -11,7 +11,7 @@ Scorers are passed to a `weave.Evaluation` object during evaluation. There are t Scorers must return a dictionary and can return multiple metrics, nested metrics and non-numeric values such as texted returned from a LLM-evaluator about its reasoning. ## Function-based Scorers -These are functions, decorated with @weave.op that return a dictionary. They're great for simple evaluations: +These are functions decorated with `@weave.op` that return a dictionary. 
They're great for simple evaluations like: ```python @weave.op From 88fbdc6a4be285d8280c630a357806da605a2448 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 13:59:31 +0200 Subject: [PATCH 077/150] remove unused kwargs --- weave/flow/scorers/json_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 5c4a6172324f..026449507d50 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -7,7 +7,7 @@ class ValidJSONScorer(Scorer): """Validate whether a string is valid JSON.""" - def score(self, output: Any, **kwargs: Any) -> dict: # type: ignore + def score(self, output: Any) -> dict: # type: ignore try: result = json.loads(output) From 1476c19e64e1d7b4d0b4e5e62e2d772fb637f3d8 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 13:59:42 +0200 Subject: [PATCH 078/150] add similarity scorer test --- tests/scorers/test_similarity_scorer.py | 84 +++++++++++++++++++ weave/flow/scorers/__init__.py | 2 +- ...milarity_score.py => similarity_scorer.py} | 15 +--- 3 files changed, 86 insertions(+), 15 deletions(-) create mode 100644 tests/scorers/test_similarity_scorer.py rename weave/flow/scorers/{similarity_score.py => similarity_scorer.py} (73%) diff --git a/tests/scorers/test_similarity_scorer.py b/tests/scorers/test_similarity_scorer.py new file mode 100644 index 000000000000..6452bcdd8c2e --- /dev/null +++ b/tests/scorers/test_similarity_scorer.py @@ -0,0 +1,84 @@ +import pytest +from openai import OpenAI + +import weave +from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL +from weave.flow.scorers.similarity_scorer import EmbeddingSimilarityScorer + + +# mock the create function +@pytest.fixture +def mock_embed(monkeypatch): + def _mock_embed(*args, **kwargs): + import random + return [[random.random(), random.random()] for _ in range(2)] + + monkeypatch.setattr("weave.flow.scorers.similarity_scorer.embed", _mock_embed) + + +@pytest.fixture +def similarity_scorer(mock_embed): + return EmbeddingSimilarityScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id=OPENAI_DEFAULT_EMBEDDING_MODEL, + threshold=0.9, + ) + + +def test_similarity_scorer_score(similarity_scorer): + output = "John's favorite cheese is cheddar." + target = "John likes various types of cheese." + similarity_scorer.threshold = 0.0 + result = similarity_scorer.score(output=output, target=target) + assert result["similarity_score"] > 0.0 + assert result["is_similar"] is True + +def test_similarity_scorer_not_similar(similarity_scorer): + output = "John's favorite cheese is cheddar." + target = "John likes various types of cheese." 
+ similarity_scorer.threshold = 0.99 + result = similarity_scorer.score(output=output, target=target) + assert result["similarity_score"] < 0.99 + assert result["is_similar"] is False + +@pytest.mark.asyncio +async def test_similarity_scorer_eval(similarity_scorer): + dataset = [ + {"target": "John likes various types of cheese."}, + {"target": "Pepe likes various types of cheese."}, + ] + + @weave.op + def model(): + return "He's name is John" + + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[similarity_scorer], + ) + result = await evaluation.evaluate(model) + assert result['EmbeddingSimilarityScorer']["similarity_score"]["mean"] > 0.0 + assert 0 <= result['EmbeddingSimilarityScorer']["is_similar"]["true_count"] <= 2 + + +@pytest.mark.asyncio +async def test_similarity_scorer_eval2(similarity_scorer): + dataset = [ + {"input": "He's name is John", "other_col": "John likes various types of cheese."}, + {"input": "He's name is Pepe.", "other_col": "Pepe likes various types of cheese."}, + ] + + @weave.op + def model(input): + return "John likes various types of cheese." + + similarity_scorer.column_map = {"target": "other_col"} + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[similarity_scorer], + ) + result = await evaluation.evaluate(model) + assert result['EmbeddingSimilarityScorer']["similarity_score"]["mean"] > 0.0 + assert 0 <= result['EmbeddingSimilarityScorer']["is_similar"]["true_count"] <= 2 diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 86eb22f52dd3..2bf2f7b8d332 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -19,7 +19,7 @@ ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorers.similarity_scorer import EmbeddingSimilarityScorer from weave.flow.scorers.string_scorer import ( LevenshteinScorer, StringMatchScorer, diff --git a/weave/flow/scorers/similarity_score.py b/weave/flow/scorers/similarity_scorer.py similarity index 73% rename from weave/flow/scorers/similarity_score.py rename to weave/flow/scorers/similarity_scorer.py index 82d8760b7472..bf6c74a9613f 100644 --- a/weave/flow/scorers/similarity_score.py +++ b/weave/flow/scorers/similarity_scorer.py @@ -14,25 +14,12 @@ class EmbeddingSimilarityScorer(LLMScorer): The threshold is the minimum cosine similarity score that is considered similar. Args: - target_column: The column to compare the model output to. Defaults to "text". threshold: The minimum cosine similarity score that is considered similar. 
Defaults to 0.5 """ - - target_column: str = Field( - ..., description="The column to compare the model output to" - ) threshold: float = Field(0.5, description="The threshold for the similarity score") @weave.op - def score(self, output: Any, dataset_row: dict) -> Any: - if self.target_column not in dataset_row: - raise ValueError( - f"Target column {self.target_column} not found in dataset_row" - ) - - target = str( - dataset_row[self.target_column] - ) # TODO: handle if it is not a string + def score(self, output: str, target: str) -> Any: model_embedding, target_embedding = self._compute_embeddings(output, target) return self.cosine_similarity(model_embedding, target_embedding) From 4b63fe45fbe2d2971023a69ac1b3d9f84c7bb0f1 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 14:21:06 +0200 Subject: [PATCH 079/150] add test for stringify --- tests/scorers/test_utils.py | 8 ++++++++ weave/flow/scorers/utils.py | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 tests/scorers/test_utils.py diff --git a/tests/scorers/test_utils.py b/tests/scorers/test_utils.py new file mode 100644 index 000000000000..2fae3db1157f --- /dev/null +++ b/tests/scorers/test_utils.py @@ -0,0 +1,8 @@ +from weave.flow.scorers.utils import stringify + + +def test_stringify(): + assert stringify("Hello, world!") == "Hello, world!" + assert stringify(123) == "123" + assert stringify([1, 2, 3]) == "[\n 1,\n 2,\n 3\n]" + assert stringify({"a": 1, "b": 2}) == '{\n "a": 1,\n "b": 2\n}' \ No newline at end of file diff --git a/weave/flow/scorers/utils.py b/weave/flow/scorers/utils.py index 175d6ac12d67..4080f304fb5e 100644 --- a/weave/flow/scorers/utils.py +++ b/weave/flow/scorers/utils.py @@ -5,8 +5,16 @@ def stringify(output: Any) -> str: + """ + Convert any output to a string. If the output is a Pydantic BaseModel, + convert it to a JSON string using the model's dump_json method. + """ if isinstance(output, str): return output + elif isinstance(output, int): + return str(output) + elif isinstance(output, float): + return str(output) elif isinstance(output, (list, tuple)): return json.dumps(output, indent=2) elif isinstance(output, dict): From 873b90ce8934892f80d490e790d91181e9f29dd4 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 14:21:12 +0200 Subject: [PATCH 080/150] add eval test --- tests/scorers/test_hallucination_scorer.py | 55 +++++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 8f5241028c76..85b647eb2710 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -1,6 +1,7 @@ import pytest from openai import OpenAI +import weave from weave.flow.scorers.hallucination_scorer import ( HallucinationReasoning, HallucinationResponse, @@ -39,16 +40,56 @@ def hallucination_scorer(mock_create): ) -def test_hallucination_scorer_initialization(hallucination_scorer): - assert isinstance(hallucination_scorer, HallucinationFreeScorer) - assert hallucination_scorer.model_id == "gpt-4o" - assert hallucination_scorer.temperature == 0.7 - assert hallucination_scorer.max_tokens == 4096 - - def test_hallucination_scorer_score(hallucination_scorer, mock_create): output = "John's favorite cheese is cheddar." context = "John likes various types of cheese." 
result = hallucination_scorer.score(output=output, context=context) # we should be able to do this validation _ = HallucinationResponse.model_validate(result) + + assert result["hallucination_free"] == True + assert result["conclusion"] == "The output is consistent with the input data." + assert len(result["reasonings"]) == 1 + assert result["reasonings"][0]["hallucination_type"] == "No Hallucination" + + +@pytest.mark.asyncio +async def test_hallucination_scorer_eval(hallucination_scorer): + dataset = [ + {"context": "John likes various types of cheese."}, + {"context": "Pepe likes various types of cheese."}, + ] + + @weave.op + def model(): + return "John's favorite cheese is cheddar." + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[hallucination_scorer], + ) + result = await evaluation.evaluate(model) + assert result['HallucinationFreeScorer']["hallucination_free"]["true_count"] == 2 + assert result['HallucinationFreeScorer']["hallucination_free"]["true_fraction"] == 1.0 + + +@pytest.mark.asyncio +async def test_hallucination_scorer_eval2(hallucination_scorer): + dataset = [ + {"input": "John likes various types of cheese.", "other_col": "John's favorite cheese is cheddar."}, + {"input": "Pepe likes various types of cheese.", "other_col": "Pepe's favorite cheese is gouda."}, + ] + + @weave.op + def model(input): + return "The person's favorite cheese is cheddar." + + hallucination_scorer.column_map = {"context": "input", "output": "other_col"} + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[hallucination_scorer], + ) + result = await evaluation.evaluate(model) + assert result['HallucinationFreeScorer']["hallucination_free"]["true_count"] == 2 + assert result['HallucinationFreeScorer']["hallucination_free"]["true_fraction"] == 1.0 From a847395db43a0c526c7444e7a357745b3653da44 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Thu, 17 Oct 2024 14:06:51 +0100 Subject: [PATCH 081/150] add column_map warnings, fix docs, make create and embed available --- docs/docs/guides/evaluation/scorers.md | 48 +++++++++++++------ weave/flow/eval.py | 65 ++++++++++++++++++++------ weave/flow/scorers/__init__.py | 6 +++ 3 files changed, 91 insertions(+), 28 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index d29aa2488555..b026415f3b49 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -36,29 +36,32 @@ Example: ```python from weave import Scorer +from weave.scorers import create + +llm_client = ... class SummarizationScorer(Scorer): model_id: str = "the LLM model to use" system_prompt: str = "Evaluate whether the summary is good." @weave.op - def some_complicated_preprocessing(text): + def some_complicated_preprocessing(self, text: str) -> str: ... 
return text @weave.op - def llm_call(summary, text): - res = create(self.system_prompt, summary, text) + def call_llm(self, summary: str, text: str) -> Dict: + res = llm_client.create(self.system_prompt, summary, text) return {"summary_quality": res} @weave.op - def score(output, text) - ''' - output: The summary generated by an AI system - text: The original text being summarised - ''' + def score(self, output: str, text: str) -> Dict + """" + output: The summary generated by an AI system + text: The original text being summarised + """" text = some_complicated_preprocessing(text) - eval_result = call_llm(summary, text, self.prompt) + eval_result = call_llm(summary, text) return {"summary_quality": eval_result} summarization_scorer = SummarizationScorer(model_id="o2") @@ -67,14 +70,23 @@ eval = weave.Evaluations(..., scorers=[summarization_scorer]) This class evaluates how good a summary is by comparing it to the original text. ## How Scorers Work -### Keyword Arguments -Scorers can access both the output from your AI system and the input data. +### Scorer Keyword Arguments +Scorers can access both the output from your AI system and the input data from the dataset row. -- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output. +- **Input:** If you would like your scorer to use data from your dataset row, such as a "label" or "target" column then you can easily make this available to the scorer by adding a `label` or `target` keyword argument to your scorer definition. + +For example if you wanted to use a column called "label" from your dataset then your scorer function (or `score` class method) would have a parameter list like this: -- **Input:** Add parameters that match the names of the columns in your dataset to access input data. +```python +@weave.op +def my_custom_scorer(outout: str, label: int): + ... +``` + +When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer parameter names to your dataset columns. If customizing your scorer parameters or dataset columns is not feasible, you can use column mapping - see below for more. + +- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output. -For example, if your dataset has a "news_article" column, you can access it in the scorer by adding a `news_article` parameter to your scorer's signature. ### Mapping Column Names Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`. @@ -111,6 +123,14 @@ Here, the `text` parameter in the score method will receive data from the `news_ ## Predefined Scorers +**Installation** + +To use Weave's predefined scorers you need to install some additional dependencies: + +```bash +pip install weave[scorers] +``` + **LLM-evaluators** The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also uses weave's `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. 
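
The docs hunk above describes the `column_map` convention and the `eval.py` hunk below enforces it; the following is a minimal, self-contained sketch of how the two fit together at this point in the branch. It is illustrative only — the `ExactMatchScorer`, the toy `model`, and the `question`/`answer` column names are hypothetical and are not part of this patch series.

```python
# Illustrative sketch (not part of the patch): wiring a class-based scorer
# with column_map into a weave Evaluation. Names here are hypothetical.
import asyncio

import weave
from weave import Scorer


class ExactMatchScorer(Scorer):
    """Toy scorer that compares the model output to a reference column."""

    @weave.op
    def score(self, output: str, reference: str) -> dict:
        # `reference` is not a dataset column; column_map supplies it below.
        return {"exact_match": output.strip() == reference.strip()}


@weave.op
def model(question: str) -> str:
    # Stand-in for a real model; always answers "Paris".
    return "Paris"


dataset = [
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "What is the capital of Spain?", "answer": "Madrid"},
]

# column_map follows {scorer argument name: dataset column name}.
scorer = ExactMatchScorer(column_map={"reference": "answer"})

evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer])
# weave.init("my-project") would normally be called before evaluating.
results = asyncio.run(evaluation.evaluate(model))
print(results)
```

With this mapping, the scorer's `reference` argument is filled from the dataset's `answer` column, while `output` receives the model prediction — the exact cases the validation added to `predict_and_score` below is checking for.
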
diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 443ae687df72..3eff3cdba9a5 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -223,7 +223,7 @@ async def predict_and_score( model_output = None model_latency = time.time() - model_start_time - scores = {} + scores = {} # TODO: Consider moving scorer setup and checks out of `predict_and_score` scorers = cast(list[Union[Op, Scorer]], self.scorers or []) for scorer in scorers: scorer_self = None @@ -245,14 +245,8 @@ async def predict_and_score( ): message = textwrap.dedent( f""" - Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function. - You can also set the `scorer.column_map` attribute to map dataset columns to the expected parameter names in the scorer. - For example, if the scorer expects "input" and "ground_truth" and we have a dataset - with columns "question" and "answer", column_map should be defined as follows: - {{"input": "question", "ground_truth": "answer"}} - scorer.column_map: {getattr(scorer, 'column_map', None)} - score_arg_names: {score_arg_names} - example: {example} + Scorer {scorer_name} must have an `output` or `model_output` argument, to receive the + output of the model function. """ ) raise OpCallError(message) @@ -272,11 +266,43 @@ async def predict_and_score( # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output if isinstance(scorer, Scorer) and scorer.column_map is not None: - score_args = { - arg: example[scorer.column_map.get(arg, arg)] - for arg in score_arg_names - if scorer.column_map.get(arg, arg) in example - } + score_args = {} + for arg in score_arg_names: + # Testing scorer to column_map logic + # Check column_map validity, if column_map contains the scorer args + if arg not in scorer.column_map: + message = textwrap.dedent( + f""" + Scorer {scorer_name} expects argument {arg} to be in `scorer.column_map` keys. + Available scorer keyword argument names: {score_arg_names} + scorer.column_map keys: {scorer.column_map.keys()} + + Hint: + - column_map should follow the format: {{scorer arg name: dataset column name}} + - Check if your scorer.column_map keys and values are not reversed. + """ + ) + raise ValueError(message) + + # Try to map scorer arg to dataset columm, testing dataset to column_map logic + example_column_name = scorer.column_map.get(arg) + if example_column_name in example: + score_args[arg] = example[example_column_name] + else: + message = textwrap.dedent( + f""" + There is an issue with `scorer.column_map`: {scorer.column_map}. + + The value for column_map key: {arg} is {example_column_name} but + {example_column_name} is not found in the dataset columns. + + Available dataset columns: {example.keys()} + + Hint: + - column_map should follow the format: {{scorer arg name: dataset column name}} + """ + ) + raise ValueError(message) else: score_args = { k: v for k, v in example.items() if k in score_arg_names @@ -326,6 +352,17 @@ async def predict_and_score( f""" Call error: {e} + If using the `Scorer` weave class, you can set the `scorer.column_map` + attribute to map scorer parameter names to dataset columns. 
+ + For example, if the scorer expects "output", "input" and "ground_truth" and we have a dataset + with columns "question" and "answer", `column_map` can be used to map the non-output parameter to like so: + {{"input": "question", "ground_truth": "answer"}} + + scorer argument names: {score_arg_names} + dataset keys: {example.keys()} + scorer.column_map: {getattr(scorer, 'column_map', None)} + Options for resolving: a. change {scorer_name} argument names to match a subset of dataset column names ({dataset_column_names_str}) b. change dataset column names to match expected {scorer_name} argument names: {required_arg_names} diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 2bf2f7b8d332..0c9435e11f26 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -3,6 +3,10 @@ auto_summarize, get_scorer_attributes, ) +from weave.flow.scorers.llm_utils import ( + create, + embed, +) from weave.flow.scorers.classification_scorer import ( MultiTaskBinaryClassificationF1, transpose, @@ -29,6 +33,8 @@ __all__ = [ "auto_summarize", + "create", + "embed", "ContextEntityRecallScorer", "ContextRelevancyScorer", "EmbeddingSimilarityScorer", From 37028ead1e85ace4ad57465fd24fc8644abbc389 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:05:38 +0200 Subject: [PATCH 082/150] test for column_map cases --- tests/trace/test_evaluations.py | 53 +++++++++++++++++++- weave/flow/eval.py | 89 +++++++++++++++++++++------------ 2 files changed, 110 insertions(+), 32 deletions(-) diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index a85c6530c762..5470312c42ca 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -818,9 +818,60 @@ def model_function(col1, col2): ), "The summary should reflect the correct number of matches" -# Define another dummy scorer +@pytest.mark.asyncio +async def test_evaluation_with_wrong_column_map(): + # Define a dummy scorer that uses column_map + class DummyScorer(Scorer): + @weave.op() + def score(self, foo: str, bar: str, output: str, target: str) -> dict: + # Return whether foo + bar equals output + return {"match": (foo + bar) == output == target} + @weave.op() + def model_function(col1, col2): + # For testing, return the concatenation of col1 and col2 + return col1 + col2 + dataset = [ + {"col1": "Hello", "col2": "World", "target": "HelloWorld"}, # True + {"col1": "Hi", "col2": "There", "target": "HiThere"}, # True + {"col1": "Good", "col2": "Morning", "target": "GoodMorning"}, # True + {"col1": "Bad", "col2": "Evening", "target": "GoodEvening"}, # False + ] + + # Test that the column map is correctly used + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + eval_out = await evaluation.evaluate(model_function) + assert "DummyScorer" in eval_out + assert eval_out["DummyScorer"]["match"] == {"true_count": 3, "true_fraction": 0.75} + + with pytest.raises(ValueError) as excinfo: + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col3' + # this is wrong because col3 does not exist + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col3"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + await evaluation.predict_and_score(model_function, dataset[0]) + assert "which is not in the scorer's argument names" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + # Create the scorer with column_map missing a 
column + dummy_scorer = DummyScorer(column_map={"foo": "col1"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + await evaluation.predict_and_score(model_function, dataset[0]) + assert "is not found in the dataset columns" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + # Create the scorer with wrong argument name + dummy_scorer = DummyScorer(column_map={"jeez": "col1"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + await evaluation.predict_and_score(model_function, dataset[0]) + assert "is not found in the dataset columns and is not mapped" in str( + excinfo.value + ) + + +# Define another dummy scorer @pytest.mark.asyncio async def test_evaluation_with_multiple_column_maps(): class DummyScorer(Scorer): diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 3eff3cdba9a5..1223234ee120 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -265,43 +265,70 @@ async def predict_and_score( # # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output + score_arg_names = [ + param + for param in score_arg_names + if ( + param != "self" + and param != "output" + and param != "model_output" + ) + ] + score_args = {} + if isinstance(scorer, Scorer) and scorer.column_map is not None: - score_args = {} - for arg in score_arg_names: - # Testing scorer to column_map logic - # Check column_map validity, if column_map contains the scorer args - if arg not in scorer.column_map: + # Ensure that all keys in column_map are in score_arg_names + for key in scorer.column_map.keys(): + if key not in score_arg_names: message = textwrap.dedent( - f""" - Scorer {scorer_name} expects argument {arg} to be in `scorer.column_map` keys. - Available scorer keyword argument names: {score_arg_names} - scorer.column_map keys: {scorer.column_map.keys()} - - Hint: - - column_map should follow the format: {{scorer arg name: dataset column name}} - - Check if your scorer.column_map keys and values are not reversed. - """ - ) + f""" + You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. + + The `column_map` contains a key `{key}` which is not in the scorer's argument names. + Scorer argument names: {score_arg_names} + + Hint: + - Ensure that the keys in `column_map` match the scorer's parameter names. + """ + ) raise ValueError(message) - # Try to map scorer arg to dataset columm, testing dataset to column_map logic - example_column_name = scorer.column_map.get(arg) - if example_column_name in example: - score_args[arg] = example[example_column_name] + for arg in score_arg_names: + if arg in example: + score_args[arg] = example[arg] + elif arg in scorer.column_map: + dataset_column_name = scorer.column_map[arg] + if dataset_column_name in example: + score_args[arg] = example[dataset_column_name] + else: + message = textwrap.dedent( + f""" + You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. + + You are mapping `{dataset_column_name}`->`{arg}`, but `{dataset_column_name}` + is not found in the dataset columns. + + Available dataset columns: {list(example.keys())} + + Hint: + - Ensure that `column_map` maps scorer parameter names to existing dataset column names. + """ + ) + raise ValueError(message) else: message = textwrap.dedent( - f""" - There is an issue with `scorer.column_map`: {scorer.column_map}. - - The value for column_map key: {arg} is {example_column_name} but - {example_column_name} is not found in the dataset columns. 
- - Available dataset columns: {example.keys()} - - Hint: - - column_map should follow the format: {{scorer arg name: dataset column name}} - """ - ) + f""" + You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. + + Scorer argument `{arg}` is not found in the dataset columns and is not mapped in `column_map`. + + Available dataset columns: {list(example.keys())} + `column_map`: {scorer.column_map} + + Hint: + - Either provide `{arg}` directly in the dataset, or map it via `column_map`. + """ + ) raise ValueError(message) else: score_args = { From 00f78687bf41e3f6dbd7883720b48e5139173501 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:05:58 +0200 Subject: [PATCH 083/150] lint --- tests/scorers/test_hallucination_scorer.py | 24 +++++++++++++++------- tests/scorers/test_similarity_scorer.py | 24 ++++++++++++++-------- tests/scorers/test_utils.py | 2 +- weave/flow/eval.py | 6 +++--- weave/flow/scorers/__init__.py | 8 ++++---- weave/flow/scorers/similarity_scorer.py | 1 + 6 files changed, 42 insertions(+), 23 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 85b647eb2710..7eba86b6041a 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -69,21 +69,29 @@ def model(): scorers=[hallucination_scorer], ) result = await evaluation.evaluate(model) - assert result['HallucinationFreeScorer']["hallucination_free"]["true_count"] == 2 - assert result['HallucinationFreeScorer']["hallucination_free"]["true_fraction"] == 1.0 + assert result["HallucinationFreeScorer"]["hallucination_free"]["true_count"] == 2 + assert ( + result["HallucinationFreeScorer"]["hallucination_free"]["true_fraction"] == 1.0 + ) @pytest.mark.asyncio async def test_hallucination_scorer_eval2(hallucination_scorer): dataset = [ - {"input": "John likes various types of cheese.", "other_col": "John's favorite cheese is cheddar."}, - {"input": "Pepe likes various types of cheese.", "other_col": "Pepe's favorite cheese is gouda."}, + { + "input": "John likes various types of cheese.", + "other_col": "John's favorite cheese is cheddar.", + }, + { + "input": "Pepe likes various types of cheese.", + "other_col": "Pepe's favorite cheese is gouda.", + }, ] @weave.op def model(input): return "The person's favorite cheese is cheddar." 
- + hallucination_scorer.column_map = {"context": "input", "output": "other_col"} evaluation = weave.Evaluation( @@ -91,5 +99,7 @@ def model(input): scorers=[hallucination_scorer], ) result = await evaluation.evaluate(model) - assert result['HallucinationFreeScorer']["hallucination_free"]["true_count"] == 2 - assert result['HallucinationFreeScorer']["hallucination_free"]["true_fraction"] == 1.0 + assert result["HallucinationFreeScorer"]["hallucination_free"]["true_count"] == 2 + assert ( + result["HallucinationFreeScorer"]["hallucination_free"]["true_fraction"] == 1.0 + ) diff --git a/tests/scorers/test_similarity_scorer.py b/tests/scorers/test_similarity_scorer.py index 6452bcdd8c2e..75e03fa6f315 100644 --- a/tests/scorers/test_similarity_scorer.py +++ b/tests/scorers/test_similarity_scorer.py @@ -11,6 +11,7 @@ def mock_embed(monkeypatch): def _mock_embed(*args, **kwargs): import random + return [[random.random(), random.random()] for _ in range(2)] monkeypatch.setattr("weave.flow.scorers.similarity_scorer.embed", _mock_embed) @@ -33,6 +34,7 @@ def test_similarity_scorer_score(similarity_scorer): assert result["similarity_score"] > 0.0 assert result["is_similar"] is True + def test_similarity_scorer_not_similar(similarity_scorer): output = "John's favorite cheese is cheddar." target = "John likes various types of cheese." @@ -41,6 +43,7 @@ def test_similarity_scorer_not_similar(similarity_scorer): assert result["similarity_score"] < 0.99 assert result["is_similar"] is False + @pytest.mark.asyncio async def test_similarity_scorer_eval(similarity_scorer): dataset = [ @@ -51,28 +54,33 @@ async def test_similarity_scorer_eval(similarity_scorer): @weave.op def model(): return "He's name is John" - evaluation = weave.Evaluation( dataset=dataset, scorers=[similarity_scorer], ) result = await evaluation.evaluate(model) - assert result['EmbeddingSimilarityScorer']["similarity_score"]["mean"] > 0.0 - assert 0 <= result['EmbeddingSimilarityScorer']["is_similar"]["true_count"] <= 2 + assert result["EmbeddingSimilarityScorer"]["similarity_score"]["mean"] > 0.0 + assert 0 <= result["EmbeddingSimilarityScorer"]["is_similar"]["true_count"] <= 2 @pytest.mark.asyncio async def test_similarity_scorer_eval2(similarity_scorer): dataset = [ - {"input": "He's name is John", "other_col": "John likes various types of cheese."}, - {"input": "He's name is Pepe.", "other_col": "Pepe likes various types of cheese."}, + { + "input": "He's name is John", + "other_col": "John likes various types of cheese.", + }, + { + "input": "He's name is Pepe.", + "other_col": "Pepe likes various types of cheese.", + }, ] @weave.op def model(input): return "John likes various types of cheese." - + similarity_scorer.column_map = {"target": "other_col"} evaluation = weave.Evaluation( @@ -80,5 +88,5 @@ def model(input): scorers=[similarity_scorer], ) result = await evaluation.evaluate(model) - assert result['EmbeddingSimilarityScorer']["similarity_score"]["mean"] > 0.0 - assert 0 <= result['EmbeddingSimilarityScorer']["is_similar"]["true_count"] <= 2 + assert result["EmbeddingSimilarityScorer"]["similarity_score"]["mean"] > 0.0 + assert 0 <= result["EmbeddingSimilarityScorer"]["is_similar"]["true_count"] <= 2 diff --git a/tests/scorers/test_utils.py b/tests/scorers/test_utils.py index 2fae3db1157f..73bef952810f 100644 --- a/tests/scorers/test_utils.py +++ b/tests/scorers/test_utils.py @@ -5,4 +5,4 @@ def test_stringify(): assert stringify("Hello, world!") == "Hello, world!" 
assert stringify(123) == "123" assert stringify([1, 2, 3]) == "[\n 1,\n 2,\n 3\n]" - assert stringify({"a": 1, "b": 2}) == '{\n "a": 1,\n "b": 2\n}' \ No newline at end of file + assert stringify({"a": 1, "b": 2}) == '{\n "a": 1,\n "b": 2\n}' diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 1223234ee120..94eec2268503 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -245,7 +245,7 @@ async def predict_and_score( ): message = textwrap.dedent( f""" - Scorer {scorer_name} must have an `output` or `model_output` argument, to receive the + Scorer {scorer_name} must have an `output` or `model_output` argument, to receive the output of the model function. """ ) @@ -305,7 +305,7 @@ async def predict_and_score( f""" You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. - You are mapping `{dataset_column_name}`->`{arg}`, but `{dataset_column_name}` + You are mapping `{dataset_column_name}`->`{arg}`, but `{dataset_column_name}` is not found in the dataset columns. Available dataset columns: {list(example.keys())} @@ -379,7 +379,7 @@ async def predict_and_score( f""" Call error: {e} - If using the `Scorer` weave class, you can set the `scorer.column_map` + If using the `Scorer` weave class, you can set the `scorer.column_map` attribute to map scorer parameter names to dataset columns. For example, if the scorer expects "output", "input" and "ground_truth" and we have a dataset diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 0c9435e11f26..1f3b4013c955 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -3,10 +3,6 @@ auto_summarize, get_scorer_attributes, ) -from weave.flow.scorers.llm_utils import ( - create, - embed, -) from weave.flow.scorers.classification_scorer import ( MultiTaskBinaryClassificationF1, transpose, @@ -17,6 +13,10 @@ InstructorLLMScorer, LLMScorer, ) +from weave.flow.scorers.llm_utils import ( + create, + embed, +) from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer from weave.flow.scorers.pydantic_scorer import PydanticScorer from weave.flow.scorers.ragas_scorer import ( diff --git a/weave/flow/scorers/similarity_scorer.py b/weave/flow/scorers/similarity_scorer.py index bf6c74a9613f..9b194504fe06 100644 --- a/weave/flow/scorers/similarity_scorer.py +++ b/weave/flow/scorers/similarity_scorer.py @@ -16,6 +16,7 @@ class EmbeddingSimilarityScorer(LLMScorer): Args: threshold: The minimum cosine similarity score that is considered similar. 
Defaults to 0.5 """ + threshold: float = Field(0.5, description="The threshold for the similarity score") @weave.op From d508f76bd70fb0f63d100306ff00c65e1c1f4b8b Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:11:06 +0200 Subject: [PATCH 084/150] remove skip --- tests/scorers/test_llm_integrations.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py index 82ca973a22c8..43d3c33fe0c0 100644 --- a/tests/scorers/test_llm_integrations.py +++ b/tests/scorers/test_llm_integrations.py @@ -66,11 +66,8 @@ def get_client_and_model(provider, model): @pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}") def test_summarization_scorer_evaluate_summary(provider, model): - try: - client, model_id = get_client_and_model(provider, model) - except (ValueError, EnvironmentError) as e: - pytest.skip(str(e)) - + client, model_id = get_client_and_model(provider, model) + summarization_scorer = SummarizationScorer( client=client, model_id=model_id, From 9a22a2203f4fe957e5ad7cde5f79b94b1a1599f3 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:13:10 +0200 Subject: [PATCH 085/150] check with isinstance --- tests/scorers/test_ragas_scorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 2144200d809f..77e1aad14281 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -17,9 +17,9 @@ def mock_create(monkeypatch): def _mock_create(*args, **kwargs): # Retrieve the response_model to return appropriate mock responses response_model = kwargs.get("response_model") - if response_model == EntityExtractionResponse: + if isinstance(response_model, EntityExtractionResponse): return EntityExtractionResponse(entities=["Paris"]) - elif response_model == RelevancyResponse: + elif isinstance(response_model, RelevancyResponse): return RelevancyResponse( reasoning="The context directly answers the question.", relevancy_score=1, From 94a1229ac8f30d05573b2123241410dbbaa9b3f2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:14:21 +0200 Subject: [PATCH 086/150] lint again --- tests/scorers/test_llm_integrations.py | 2 +- tests/scorers/test_ragas_scorer.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py index 43d3c33fe0c0..c2419ec6da2a 100644 --- a/tests/scorers/test_llm_integrations.py +++ b/tests/scorers/test_llm_integrations.py @@ -67,7 +67,7 @@ def get_client_and_model(provider, model): @pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}") def test_summarization_scorer_evaluate_summary(provider, model): client, model_id = get_client_and_model(provider, model) - + summarization_scorer = SummarizationScorer( client=client, model_id=model_id, diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 77e1aad14281..385e1409cb0a 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -24,8 +24,6 @@ def _mock_create(*args, **kwargs): reasoning="The context directly answers the question.", relevancy_score=1, ) - else: - return None monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create) From a511130881de37e93cb630379349bf08d0bdf381 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 
17 Oct 2024 17:17:14 +0200 Subject: [PATCH 087/150] remove useless test --- tests/scorers/test_ragas_scorer.py | 12 +----------- tests/scorers/test_summarization_scorer.py | 2 -- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 385e1409cb0a..8fde51f52743 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -48,11 +48,6 @@ def context_relevancy_scorer(mock_create): ) -def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer): - assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer) - assert context_entity_recall_scorer.model_id == "gpt-4o" - - def test_context_entity_recall_scorer_score(context_entity_recall_scorer): output = "Paris is the capital of France." context = "The capital city of France is Paris." @@ -60,12 +55,7 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer): assert isinstance(result, dict) assert "recall" in result assert result["recall"] == 1.0 # Assuming full recall in mock response - - -def test_context_relevancy_scorer_initialization(context_relevancy_scorer): - assert isinstance(context_relevancy_scorer, ContextRelevancyScorer) - assert context_relevancy_scorer.model_id == "gpt-4o" - + def test_context_relevancy_scorer_score(context_relevancy_scorer): output = "What is the capital of France?" diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 97090994550a..c30135fe8052 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -22,8 +22,6 @@ def _mock_create(*args, **kwargs): think_step_by_step="This is some reasoning.", summarization_evaluation="excellent", ) - else: - return None # Patch the 'create' function wherever it is called monkeypatch.setattr("weave.flow.scorers.summarization_scorer.create", _mock_create) From 4a92a8909444f4d2be1d7278ed20cec97f510636 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:39:59 +0200 Subject: [PATCH 088/150] back to <- map --- weave/flow/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 94eec2268503..f6bc97309bba 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -305,7 +305,7 @@ async def predict_and_score( f""" You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. - You are mapping `{dataset_column_name}`->`{arg}`, but `{dataset_column_name}` + You are mapping `{arg}`<-`{dataset_column_name}`, but `{dataset_column_name}` is not found in the dataset columns. Available dataset columns: {list(example.keys())} From e690eb5a0dcf58be2867ea0ddc932a16a7dc0741 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:40:05 +0200 Subject: [PATCH 089/150] lint --- tests/scorers/test_ragas_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 8fde51f52743..a11a68ffa561 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -55,7 +55,7 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer): assert isinstance(result, dict) assert "recall" in result assert result["recall"] == 1.0 # Assuming full recall in mock response - + def test_context_relevancy_scorer_score(context_relevancy_scorer): output = "What is the capital of France?" 
From 5cc14e3c122d70b6e7adc0d6969a65580cc51f81 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:43:32 +0200 Subject: [PATCH 090/150] typing --- weave/flow/scorers/llm_scorer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weave/flow/scorers/llm_scorer.py b/weave/flow/scorers/llm_scorer.py index d319670ae772..8ce1e318f1ff 100644 --- a/weave/flow/scorers/llm_scorer.py +++ b/weave/flow/scorers/llm_scorer.py @@ -3,13 +3,13 @@ from pydantic import Field, field_validator from weave.flow.scorers.base_scorer import Scorer -from weave.flow.scorers.llm_utils import _LLM_CLIENTS_NAMES, instructor_client +from weave.flow.scorers.llm_utils import _LLM_CLIENTS_NAMES, instructor_client, _LLM_CLIENTS class LLMScorer(Scorer): """Score a model output using an LLM""" - client: Any = Field( + client: _LLM_CLIENTS = Field( description="The LLM client to use, has to be instantiated with an api_key" ) model_id: str = Field(description="The model to use") @@ -27,7 +27,7 @@ def validate_client(cls, v): # type: ignore class InstructorLLMScorer(Scorer): """Score a model output using an LLM""" - client: Any = Field( + client: _LLM_CLIENTS = Field( description="The LLM client to use, has to be instantiated with an api_key" ) model_id: str = Field(description="The model to use") From 644d327e4087116f5e951e278410a2561f86e7ea Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:43:37 +0200 Subject: [PATCH 091/150] renove unused --- weave/flow/scorers/llm_utils.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index c57172277b56..361748190490 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -90,26 +90,4 @@ def embed( return [embedding.embedding for embedding in response.data] else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") - - -# Helper function for dynamic imports -def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore - try: - if provider == "openai": - from openai import OpenAI - - return OpenAI - elif provider == "anthropic": - import anthropic - - return anthropic.Anthropic - elif provider == "mistral": - from mistralai import Mistral - - return Mistral - elif provider == "gemini": - from google.generativeai import GenerativeModel - - return GenerativeModel - except ImportError: - return None + \ No newline at end of file From 1555ee9e9f2084de1c94030120ff85a4c506a0b4 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:44:01 +0200 Subject: [PATCH 092/150] ruff --- weave/flow/scorers/llm_scorer.py | 8 +++++--- weave/flow/scorers/llm_utils.py | 3 +-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/weave/flow/scorers/llm_scorer.py b/weave/flow/scorers/llm_scorer.py index 8ce1e318f1ff..82b68667aa0f 100644 --- a/weave/flow/scorers/llm_scorer.py +++ b/weave/flow/scorers/llm_scorer.py @@ -1,9 +1,11 @@ -from typing import Any - from pydantic import Field, field_validator from weave.flow.scorers.base_scorer import Scorer -from weave.flow.scorers.llm_utils import _LLM_CLIENTS_NAMES, instructor_client, _LLM_CLIENTS +from weave.flow.scorers.llm_utils import ( + _LLM_CLIENTS, + _LLM_CLIENTS_NAMES, + instructor_client, +) class LLMScorer(Scorer): diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index 361748190490..3debc2e3293d 100644 --- a/weave/flow/scorers/llm_utils.py +++ 
b/weave/flow/scorers/llm_utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Optional, Union +from typing import TYPE_CHECKING, Any, List, Union from weave.trace.autopatch import autopatch @@ -90,4 +90,3 @@ def embed( return [embedding.embedding for embedding in response.data] else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") - \ No newline at end of file From 9eb2e045a42429ba1d3085b2dd696f42ea545506 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:44:41 +0200 Subject: [PATCH 093/150] just use list --- weave/flow/scorers/ragas_scorer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/weave/flow/scorers/ragas_scorer.py b/weave/flow/scorers/ragas_scorer.py index 8b3493c35422..75b27e816bf8 100644 --- a/weave/flow/scorers/ragas_scorer.py +++ b/weave/flow/scorers/ragas_scorer.py @@ -1,7 +1,6 @@ # implementing metrics from ragas: https://github.com/explodinggradients/ragas from textwrap import dedent -from typing import List from pydantic import BaseModel, Field @@ -11,7 +10,7 @@ class EntityExtractionResponse(BaseModel): - entities: List[str] = Field( + entities: list[str] = Field( description="A list of unique entities extracted from the text" ) @@ -29,7 +28,7 @@ class ContextEntityRecallScorer(InstructorLLMScorer): Entities: """) - def extract_entities(self, text: str) -> List[str]: + def extract_entities(self, text: str) -> list[str]: # Use LLM to extract entities prompt = self.extraction_prompt.format(text=text) response = create( From 8e247f70119cfbc81101af5e851d8322f20b4043 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:49:26 +0200 Subject: [PATCH 094/150] reverse arrow --- weave/flow/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index f6bc97309bba..25905b5a9482 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -305,8 +305,8 @@ async def predict_and_score( f""" You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. - You are mapping `{arg}`<-`{dataset_column_name}`, but `{dataset_column_name}` - is not found in the dataset columns. + You are mapping `{dataset_column_name}` to `{arg}`, but `{dataset_column_name}` + was not found in the dataset columns. Available dataset columns: {list(example.keys())} From a91fd670949e6ad0090c1c42b480ea44df310f4d Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:51:53 +0200 Subject: [PATCH 095/150] reverse again --- weave/flow/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 25905b5a9482..00ce963016a8 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -305,7 +305,7 @@ async def predict_and_score( f""" You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. - You are mapping `{dataset_column_name}` to `{arg}`, but `{dataset_column_name}` + You are mapping `{arg}` to `{dataset_column_name}`, but `{dataset_column_name}` was not found in the dataset columns. 
Available dataset columns: {list(example.keys())} From 782bd0e9aebeeefa8e1cd09a0e2feac43d7279a5 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:55:22 +0200 Subject: [PATCH 096/150] back compat should work --- tests/trace/test_weave_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trace/test_weave_client.py b/tests/trace/test_weave_client.py index ca77056425ff..c305b8527735 100644 --- a/tests/trace/test_weave_client.py +++ b/tests/trace/test_weave_client.py @@ -393,8 +393,8 @@ async def model_predict(input) -> str: dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] @weave.op() - async def score(target, output): - return target == output + async def score(target, model_output): + return target == model_output evaluation = Evaluation( name="my-eval", From b76e8ec45c739025e1906ba753dedd1a02b81799 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 17:58:35 +0200 Subject: [PATCH 097/150] move out of try --- weave/flow/scorers/json_scorer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 026449507d50..626c82014588 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -10,10 +10,11 @@ class ValidJSONScorer(Scorer): def score(self, output: Any) -> dict: # type: ignore try: result = json.loads(output) - - if isinstance(result, dict) or isinstance(result, list): - return {"json_valid": True} - except json.JSONDecodeError: - pass - return {"json_valid": False} + return {"json_valid": False} + + if isinstance(result, dict) or isinstance(result, list): + return {"json_valid": True} + else: + return {"json_valid": False} + \ No newline at end of file From 2f704c5e67fbb32987df7edb5ecf5aa5acfa01d6 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 18:18:16 +0200 Subject: [PATCH 098/150] another edge case... 
--- weave/flow/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 00ce963016a8..f9bab2301ba1 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -270,8 +270,6 @@ async def predict_and_score( for param in score_arg_names if ( param != "self" - and param != "output" - and param != "model_output" ) ] score_args = {} @@ -294,6 +292,8 @@ async def predict_and_score( raise ValueError(message) for arg in score_arg_names: + if arg == "output" or arg == "model_output": + continue if arg in example: score_args[arg] = example[arg] elif arg in scorer.column_map: From 9d27bc2b6dc6ea0988d827127b900eb9c466dd27 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 18:18:32 +0200 Subject: [PATCH 099/150] rename `has_hallucination` --- tests/scorers/test_hallucination_scorer.py | 12 ++++++------ weave/flow/scorers/hallucination_scorer.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 7eba86b6041a..8303ddf75f2e 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -24,7 +24,7 @@ def _mock_create(*args, **kwargs): ) ], conclusion="The output is consistent with the input data.", - hallucination_free=True, + has_hallucination=True, ) monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create) @@ -47,7 +47,7 @@ def test_hallucination_scorer_score(hallucination_scorer, mock_create): # we should be able to do this validation _ = HallucinationResponse.model_validate(result) - assert result["hallucination_free"] == True + assert result["has_hallucination"] == True assert result["conclusion"] == "The output is consistent with the input data." assert len(result["reasonings"]) == 1 assert result["reasonings"][0]["hallucination_type"] == "No Hallucination" @@ -69,9 +69,9 @@ def model(): scorers=[hallucination_scorer], ) result = await evaluation.evaluate(model) - assert result["HallucinationFreeScorer"]["hallucination_free"]["true_count"] == 2 + assert result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2 assert ( - result["HallucinationFreeScorer"]["hallucination_free"]["true_fraction"] == 1.0 + result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0 ) @@ -99,7 +99,7 @@ def model(input): scorers=[hallucination_scorer], ) result = await evaluation.evaluate(model) - assert result["HallucinationFreeScorer"]["hallucination_free"]["true_count"] == 2 + assert result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2 assert ( - result["HallucinationFreeScorer"]["hallucination_free"]["true_fraction"] == 1.0 + result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0 ) diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index 1c4e58b1cf6f..b78bb97dc801 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -99,7 +99,7 @@ class HallucinationResponse(BaseModel): the contains hallucinations." ) conclusion: str = Field(description="The conclusion of the analysis.") - hallucination_free: bool = Field( + has_hallucination: bool = Field( description="Whether the is free of hallucinations based on the . True means it is NOT a hallucination." 
) From 8b1ec7625a25e0020729ffedc5c66a85b7145f56 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 18:18:45 +0200 Subject: [PATCH 100/150] fix ragas --- tests/scorers/test_ragas_scorer.py | 4 ++-- weave/flow/scorers/ragas_scorer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index a11a68ffa561..e2eb164a675b 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -17,9 +17,9 @@ def mock_create(monkeypatch): def _mock_create(*args, **kwargs): # Retrieve the response_model to return appropriate mock responses response_model = kwargs.get("response_model") - if isinstance(response_model, EntityExtractionResponse): + if response_model is EntityExtractionResponse: return EntityExtractionResponse(entities=["Paris"]) - elif isinstance(response_model, RelevancyResponse): + elif response_model is RelevancyResponse: return RelevancyResponse( reasoning="The context directly answers the question.", relevancy_score=1, diff --git a/weave/flow/scorers/ragas_scorer.py b/weave/flow/scorers/ragas_scorer.py index 75b27e816bf8..53693cb16e13 100644 --- a/weave/flow/scorers/ragas_scorer.py +++ b/weave/flow/scorers/ragas_scorer.py @@ -84,4 +84,4 @@ def score(self, output: str, context: str) -> dict: response_model=RelevancyResponse, model=self.model_id, ) - return {"relevancy_score": response.relevancy_score} + return response.model_dump() From 6cfc6fa80d9adf7a28040022ca6d984864b168aa Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Thu, 17 Oct 2024 18:19:01 +0200 Subject: [PATCH 101/150] lint --- weave/flow/eval.py | 6 +----- weave/flow/scorers/json_scorer.py | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index f9bab2301ba1..5c7ad7ac0b0d 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -266,11 +266,7 @@ async def predict_and_score( # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output score_arg_names = [ - param - for param in score_arg_names - if ( - param != "self" - ) + param for param in score_arg_names if (param != "self") ] score_args = {} diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 626c82014588..4ca2fd75a53a 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -12,9 +12,8 @@ def score(self, output: Any) -> dict: # type: ignore result = json.loads(output) except json.JSONDecodeError: return {"json_valid": False} - + if isinstance(result, dict) or isinstance(result, list): return {"json_valid": True} else: return {"json_valid": False} - \ No newline at end of file From 448139b784f6c3eac99195f2a9b5579c14fc8b11 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Fri, 18 Oct 2024 13:55:23 +0100 Subject: [PATCH 102/150] Apply scorers docs suggestions from Andrew's review Co-authored-by: Andrew Truong --- docs/docs/guides/evaluation/scorers.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index b026415f3b49..7e33805d6a9d 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -8,7 +8,7 @@ Scorers are passed to a `weave.Evaluation` object during evaluation. There are t 1. **Function-based Scorers:** Simple Python functions decorated with `@weave.op`. 2. 
**Class-based Scorers:** Python classes that inherit from `weave.Scorer` for more complex evaluations. -Scorers must return a dictionary and can return multiple metrics, nested metrics and non-numeric values such as texted returned from a LLM-evaluator about its reasoning. +Scorers must return a dictionary and can return multiple metrics, nested metrics and non-numeric values such as text returned from a LLM-evaluator about its reasoning. ## Function-based Scorers These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like: @@ -16,7 +16,7 @@ These are functions decorated with `@weave.op` that return a dictionary. They're ```python @weave.op def evaluate_uppercase(text: str): - return {"text_is_uppercase": text.isupper()} + return {"text_is_uppercase": text.isupper()} eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) ``` @@ -24,12 +24,12 @@ eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. ## Class-based Scorers -For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators or make multiple function calls, you can use the `Scorer` class. +For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class. **Requirements:** -- Inherit from `weave.Scorer`. -- Define a `score` method decorated with `@weave.op`. -- The `score` method must return a dictionary. +1. Inherit from `weave.Scorer`. +2. Define a `score` method decorated with `@weave.op`. +3. The `score` method must return a dictionary. Example: @@ -116,7 +116,7 @@ class SummarizationScorer(Scorer) ... # evaluate the quality of the summary # create a scorer with a column mapping the `text` parameter to the `news_article` data column -scorer = SummarizationScorer(column_map = {"text" : "news_article"}) +scorer = SummarizationScorer(column_map={"text" : "news_article"}) ``` Here, the `text` parameter in the score method will receive data from the `news_article` column. @@ -133,7 +133,7 @@ pip install weave[scorers] **LLM-evaluators** -The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also uses weave's `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. +The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use `weave`'s `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. 
You can get all necessary dependencies with `pip install "weave[scorers]"` ### `HallucinationFreeScorer` From 1b7d26f9938fc4528c71df40a1b935fa6532920c Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Fri, 18 Oct 2024 14:14:54 +0100 Subject: [PATCH 103/150] update scorer docs, fix similarityscore threshold --- docs/docs/guides/evaluation/scorers.md | 23 ++++++++++++++--------- weave/flow/scorers/similarity_scorer.py | 8 +++----- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 7e33805d6a9d..e5c98fbb2dcf 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -196,7 +196,7 @@ This scorer evaluates summaries in two ways: The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI system's output contains disallowed content, such as hate speech or explicit material. ```python -from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer +from weave.scorers import OpenAIModerationScorer import openai oai_client = OpenAI(api_key=...) # initialize your LLM client here @@ -222,7 +222,7 @@ scorer = OpenAIModerationScorer( The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It's useful for measuring how similar the AI's output is to a reference text. ```python -from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer +from weave.scorers import EmbeddingSimilarityScorer llm_client = ... # initialise your LlM client @@ -235,8 +235,13 @@ similarity_scorer = EmbeddingSimilarityScorer( **Parameters:** -- `target_column`: Name of the dataset column containing the reference text (default is `"text"`). -- `threshold` (float): Minimum cosine similarity score considered as similar (default is `0.5`). +- `target`: This scorer expects a `target` column in your dataset, it will calculate the cosine similarity of the embeddings of the `target` column to the AI system output. If your dataset doesn't contain a column called `target` you can use the scorers `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more. +- `threshold` (float): Minimum cosine similarity score to be considered as "similar" (default is `0.5`). Cosine similarity can range from -1 to 1: + - 1 indicates identical direction. + - 0 indicates orthogonal vectors. + - -1 indicates opposite direction. + +`threshold` should in a range between -1 and 1. The cosine similarity between the embedding of the AI system output and the `target` correct threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds --- @@ -245,7 +250,7 @@ similarity_scorer = EmbeddingSimilarityScorer( The ValidJSONScorer checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity. ```python -from weave.flow.scorers.json_scorer import ValidJSONScorer +from weave.scorers import ValidJSONScorer json_scorer = ValidJSONScorer() ``` @@ -261,7 +266,7 @@ json_scorer = ValidJSONScorer() The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs. 
```python -from weave.flow.scorers.xml_scorer import ValidXMLScorer +from weave.scorers import ValidXMLScorer xml_scorer = ValidXMLScorer() ``` @@ -273,7 +278,7 @@ xml_scorer = ValidXMLScorer() The `PydanticScorer` validates the AI system's output against a Pydantic model to ensure it adheres to a specified schema or data structure. ```python -from weave.flow.scorers.pydantic_scorer import PydanticScorer +from weave.scorers import PydanticScorer from pydantic import BaseModel class FinancialReport(BaseModel): @@ -290,7 +295,7 @@ pydantic_scorer = PydanticScorer(model=Person) The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library ```python -from weave.flow.scorers.ragas_scorer import ContextEntityRecallScorer +from weave.scorers import ContextEntityRecallScorer llm_client = ... # initialise your LlM client @@ -317,7 +322,7 @@ entity_recall_scorer = ContextEntityRecallScorer( The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library. ```python -from weave.flow.scorers.ragas_scorer import ContextRelevancyScorer +from weave.scorers import ContextRelevancyScorer llm_client = ... # initialise your LlM client diff --git a/weave/flow/scorers/similarity_scorer.py b/weave/flow/scorers/similarity_scorer.py index 9b194504fe06..77a3a6befc39 100644 --- a/weave/flow/scorers/similarity_scorer.py +++ b/weave/flow/scorers/similarity_scorer.py @@ -21,6 +21,7 @@ class EmbeddingSimilarityScorer(LLMScorer): @weave.op def score(self, output: str, target: str) -> Any: + assert self.threshold >= -1 and self.threshold <= 1, "`threshold` should be between -1 and 1" model_embedding, target_embedding = self._compute_embeddings(output, target) return self.cosine_similarity(model_embedding, target_embedding) @@ -35,8 +36,5 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> dict: arr1 = np.array(vec1) arr2 = np.array(vec2) cosine_sim = np.dot(arr1, arr2) / (np.linalg.norm(arr1) * np.linalg.norm(arr2)) - # TODO: check if this can be negative - - # cast to float - score = float(cosine_sim) - return {"similarity_score": score, "is_similar": score >= self.threshold} + cosine_sim = float(cosine_sim) + return {"similarity_score": cosine_sim, "is_similar": cosine_sim >= self.threshold} From 4893a3c9588675d9f19fbd1d15c2296d94c3584a Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Fri, 18 Oct 2024 15:20:34 +0100 Subject: [PATCH 104/150] update similarity scorer and context entity scorer docs --- docs/docs/guides/evaluation/scorers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index e5c98fbb2dcf..5bd38a3c6670 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -236,12 +236,12 @@ similarity_scorer = EmbeddingSimilarityScorer( **Parameters:** - `target`: This scorer expects a `target` column in your dataset, it will calculate the cosine similarity of the embeddings of the `target` column to the AI system output. 
If your dataset doesn't contain a column called `target` you can use the scorers `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more. -- `threshold` (float): Minimum cosine similarity score to be considered as "similar" (default is `0.5`). Cosine similarity can range from -1 to 1: +- `threshold` (float): The minimum cosine similarity score between the embedding of the AI system output and the embdedding of the `target`, above which the 2 samples are considered "similar", (defaults to `0.5`). `threshold` can be in a range from -1 to 1: - 1 indicates identical direction. - 0 indicates orthogonal vectors. - -1 indicates opposite direction. -`threshold` should in a range between -1 and 1. The cosine similarity between the embedding of the AI system output and the `target` correct threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds +The correct cosine similarity threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds. --- @@ -307,8 +307,8 @@ entity_recall_scorer = ContextEntityRecallScorer( **How It Works:** -- Uses an LLM to extract unique entities from the output and context. -- Calculates recall as the proportion of entities in the output that are present in the context. +- Uses an LLM to extract unique entities from the output and context and calculates recall. +- **Recall** indicates the proportion of important entities from the context that are captured in the output, helping to assess the model's effectiveness in retrieving relevant information. - Returns a dictionary with the recall score. **Notes:** From eaa360cd4054953443816e761bb3615983713985 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Fri, 18 Oct 2024 15:53:44 +0100 Subject: [PATCH 105/150] Move all scorers from flow into weave.scorers --- docs/docs/guides/evaluation/scorers.md | 2 +- weave/flow/scorers/__init__.py | 57 +------------------ weave/scorers/__init__.py | 56 +++++++++++++++++- weave/{flow => }/scorers/base_scorer.py | 0 .../scorers/classification_scorer.py | 0 .../scorers/hallucination_scorer.py | 0 weave/{flow => }/scorers/json_scorer.py | 0 weave/{flow => }/scorers/llm_scorer.py | 0 weave/{flow => }/scorers/llm_utils.py | 0 weave/{flow => }/scorers/moderation_scorer.py | 0 weave/{flow => }/scorers/pydantic_scorer.py | 0 weave/{flow => }/scorers/ragas_scorer.py | 0 weave/{flow => }/scorers/similarity_scorer.py | 9 ++- weave/{flow => }/scorers/string_scorer.py | 0 .../scorers/summarization_scorer.py | 0 weave/{flow => }/scorers/utils.py | 0 weave/{flow => }/scorers/xml_scorer.py | 0 17 files changed, 65 insertions(+), 59 deletions(-) rename weave/{flow => }/scorers/base_scorer.py (100%) rename weave/{flow => }/scorers/classification_scorer.py (100%) rename weave/{flow => }/scorers/hallucination_scorer.py (100%) rename weave/{flow => }/scorers/json_scorer.py (100%) rename weave/{flow => }/scorers/llm_scorer.py (100%) rename weave/{flow => }/scorers/llm_utils.py (100%) rename weave/{flow => }/scorers/moderation_scorer.py (100%) rename weave/{flow => }/scorers/pydantic_scorer.py (100%) rename weave/{flow => }/scorers/ragas_scorer.py (100%) rename weave/{flow => }/scorers/similarity_scorer.py (84%) rename weave/{flow => }/scorers/string_scorer.py (100%) rename weave/{flow => }/scorers/summarization_scorer.py (100%) rename weave/{flow => }/scorers/utils.py (100%) rename weave/{flow => }/scorers/xml_scorer.py (100%) diff --git 
a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 5bd38a3c6670..e3e35da1a32c 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -18,7 +18,7 @@ These are functions decorated with `@weave.op` that return a dictionary. They're def evaluate_uppercase(text: str): return {"text_is_uppercase": text.isupper()} -eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) +my_eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) ``` When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 1f3b4013c955..180cb3f7d2f9 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -1,55 +1,2 @@ -from weave.flow.scorers.base_scorer import ( - Scorer, - auto_summarize, - get_scorer_attributes, -) -from weave.flow.scorers.classification_scorer import ( - MultiTaskBinaryClassificationF1, - transpose, -) -from weave.flow.scorers.hallucination_scorer import HallucinationFreeScorer -from weave.flow.scorers.json_scorer import ValidJSONScorer -from weave.flow.scorers.llm_scorer import ( - InstructorLLMScorer, - LLMScorer, -) -from weave.flow.scorers.llm_utils import ( - create, - embed, -) -from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer -from weave.flow.scorers.pydantic_scorer import PydanticScorer -from weave.flow.scorers.ragas_scorer import ( - ContextEntityRecallScorer, - ContextRelevancyScorer, -) -from weave.flow.scorers.similarity_scorer import EmbeddingSimilarityScorer -from weave.flow.scorers.string_scorer import ( - LevenshteinScorer, - StringMatchScorer, -) -from weave.flow.scorers.summarization_scorer import SummarizationScorer -from weave.flow.scorers.xml_scorer import ValidXMLScorer - -__all__ = [ - "auto_summarize", - "create", - "embed", - "ContextEntityRecallScorer", - "ContextRelevancyScorer", - "EmbeddingSimilarityScorer", - "get_scorer_attributes", - "HallucinationFreeScorer", - "InstructorLLMScorer", - "ValidJSONScorer", - "LevenshteinScorer", - "LLMScorer", - "MultiTaskBinaryClassificationF1", - "OpenAIModerationScorer", - "PydanticScorer", - "Scorer", - "StringMatchScorer", - "SummarizationScorer", - "transpose", - "ValidXMLScorer", -] +# Keeping this file for backwards compatibility +from weave.scorers import * diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py index 25756f08a082..941f48e7b131 100644 --- a/weave/scorers/__init__.py +++ b/weave/scorers/__init__.py @@ -1 +1,55 @@ -from weave.flow.scorers import * +from weave.scorers.base_scorer import ( + Scorer, + auto_summarize, + get_scorer_attributes, +) +from weave.scorers.classification_scorer import ( + MultiTaskBinaryClassificationF1, + transpose, +) +from weave.scorers.hallucination_scorer import HallucinationFreeScorer +from weave.scorers.json_scorer import ValidJSONScorer +from weave.scorers.llm_scorer import ( + InstructorLLMScorer, + LLMScorer, +) +from weave.scorers.llm_utils import ( + create, + embed, +) +from weave.scorers.moderation_scorer import OpenAIModerationScorer +from weave.scorers.pydantic_scorer import PydanticScorer +from weave.scorers.ragas_scorer import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, +) +from weave.scorers.similarity_scorer import EmbeddingSimilarityScorer +from weave.scorers.string_scorer import ( + LevenshteinScorer, + StringMatchScorer, +) +from weave.scorers.summarization_scorer import SummarizationScorer +from 
weave.scorers.xml_scorer import ValidXMLScorer + +__all__ = [ + "auto_summarize", + "create", + "embed", + "ContextEntityRecallScorer", + "ContextRelevancyScorer", + "EmbeddingSimilarityScorer", + "get_scorer_attributes", + "HallucinationFreeScorer", + "InstructorLLMScorer", + "ValidJSONScorer", + "LevenshteinScorer", + "LLMScorer", + "MultiTaskBinaryClassificationF1", + "OpenAIModerationScorer", + "PydanticScorer", + "Scorer", + "StringMatchScorer", + "SummarizationScorer", + "transpose", + "ValidXMLScorer", +] diff --git a/weave/flow/scorers/base_scorer.py b/weave/scorers/base_scorer.py similarity index 100% rename from weave/flow/scorers/base_scorer.py rename to weave/scorers/base_scorer.py diff --git a/weave/flow/scorers/classification_scorer.py b/weave/scorers/classification_scorer.py similarity index 100% rename from weave/flow/scorers/classification_scorer.py rename to weave/scorers/classification_scorer.py diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/scorers/hallucination_scorer.py similarity index 100% rename from weave/flow/scorers/hallucination_scorer.py rename to weave/scorers/hallucination_scorer.py diff --git a/weave/flow/scorers/json_scorer.py b/weave/scorers/json_scorer.py similarity index 100% rename from weave/flow/scorers/json_scorer.py rename to weave/scorers/json_scorer.py diff --git a/weave/flow/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py similarity index 100% rename from weave/flow/scorers/llm_scorer.py rename to weave/scorers/llm_scorer.py diff --git a/weave/flow/scorers/llm_utils.py b/weave/scorers/llm_utils.py similarity index 100% rename from weave/flow/scorers/llm_utils.py rename to weave/scorers/llm_utils.py diff --git a/weave/flow/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py similarity index 100% rename from weave/flow/scorers/moderation_scorer.py rename to weave/scorers/moderation_scorer.py diff --git a/weave/flow/scorers/pydantic_scorer.py b/weave/scorers/pydantic_scorer.py similarity index 100% rename from weave/flow/scorers/pydantic_scorer.py rename to weave/scorers/pydantic_scorer.py diff --git a/weave/flow/scorers/ragas_scorer.py b/weave/scorers/ragas_scorer.py similarity index 100% rename from weave/flow/scorers/ragas_scorer.py rename to weave/scorers/ragas_scorer.py diff --git a/weave/flow/scorers/similarity_scorer.py b/weave/scorers/similarity_scorer.py similarity index 84% rename from weave/flow/scorers/similarity_scorer.py rename to weave/scorers/similarity_scorer.py index 77a3a6befc39..b2676feb46c6 100644 --- a/weave/flow/scorers/similarity_scorer.py +++ b/weave/scorers/similarity_scorer.py @@ -21,7 +21,9 @@ class EmbeddingSimilarityScorer(LLMScorer): @weave.op def score(self, output: str, target: str) -> Any: - assert self.threshold >= -1 and self.threshold <= 1, "`threshold` should be between -1 and 1" + assert ( + self.threshold >= -1 and self.threshold <= 1 + ), "`threshold` should be between -1 and 1" model_embedding, target_embedding = self._compute_embeddings(output, target) return self.cosine_similarity(model_embedding, target_embedding) @@ -37,4 +39,7 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> dict: arr2 = np.array(vec2) cosine_sim = np.dot(arr1, arr2) / (np.linalg.norm(arr1) * np.linalg.norm(arr2)) cosine_sim = float(cosine_sim) - return {"similarity_score": cosine_sim, "is_similar": cosine_sim >= self.threshold} + return { + "similarity_score": cosine_sim, + "is_similar": cosine_sim >= self.threshold, + } diff --git a/weave/flow/scorers/string_scorer.py 
b/weave/scorers/string_scorer.py similarity index 100% rename from weave/flow/scorers/string_scorer.py rename to weave/scorers/string_scorer.py diff --git a/weave/flow/scorers/summarization_scorer.py b/weave/scorers/summarization_scorer.py similarity index 100% rename from weave/flow/scorers/summarization_scorer.py rename to weave/scorers/summarization_scorer.py diff --git a/weave/flow/scorers/utils.py b/weave/scorers/utils.py similarity index 100% rename from weave/flow/scorers/utils.py rename to weave/scorers/utils.py diff --git a/weave/flow/scorers/xml_scorer.py b/weave/scorers/xml_scorer.py similarity index 100% rename from weave/flow/scorers/xml_scorer.py rename to weave/scorers/xml_scorer.py From c5ba6dca9fd6b422e53d7c1d8987e4271e101432 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 10:52:36 +0200 Subject: [PATCH 106/150] simplify JSON scorer --- weave/flow/scorers/json_scorer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 4ca2fd75a53a..7f4e566c88ac 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -9,11 +9,7 @@ class ValidJSONScorer(Scorer): def score(self, output: Any) -> dict: # type: ignore try: - result = json.loads(output) - except json.JSONDecodeError: - return {"json_valid": False} - - if isinstance(result, dict) or isinstance(result, list): + _ = json.loads(output) return {"json_valid": True} - else: + except json.JSONDecodeError: return {"json_valid": False} From d1e748c24d6ae5b3514f4dd000d8147e093a73c4 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 10:54:01 +0200 Subject: [PATCH 107/150] warn new scorers path --- weave/flow/scorer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py index 2f2e999ca374..348013054b38 100644 --- a/weave/flow/scorer.py +++ b/weave/flow/scorer.py @@ -1,3 +1,5 @@ # Keeping this file for now to avoid breaking changes. # In future, users should import all scoring functionality from weave.scorers from weave.scorers import * + +raise Warning("This file is deprecated. 
Please import all scoring functionality from weave.scorers") From bdb95e66cc2e0854de72843bcf4964ddb84919ca Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 10:57:57 +0200 Subject: [PATCH 108/150] remove TODO --- weave/flow/eval.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 5c7ad7ac0b0d..09d358698759 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -237,8 +237,6 @@ async def predict_and_score( score_signature = inspect.signature(score_fn) score_arg_names = list(score_signature.parameters.keys()) - # TODO: Check for input columns parameters in the signature of the scorer - if ( "model_output" not in score_arg_names and "output" not in score_arg_names From 85384d1cd2702df5d684fb0b0ad53c4cfc045b71 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 11:15:19 +0200 Subject: [PATCH 109/150] split into scorers + scorers_test --- noxfile.py | 6 ++++-- pyproject.toml | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 8188ead202fc..fadbb11d86ea 100644 --- a/noxfile.py +++ b/noxfile.py @@ -12,6 +12,7 @@ "notdiamond", "google_ai_studio", "scorers", + "scorers_test", ] @@ -42,6 +43,7 @@ def lint(session): "notdiamond", "openai", "scorers", + "scorers_test", ], ) def tests(session, shard): @@ -67,7 +69,7 @@ def tests(session, shard): # we are doing some integration test in test_llm_integrations.py that requires # setting some environment variables for the LLM providers - if shard == "scorers": + if shard == "scorers_test": env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY") env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY") @@ -79,7 +81,7 @@ def tests(session, shard): "trace_server": ["trace_server/"], "mistral0": ["integrations/mistral/v0/"], "mistral1": ["integrations/mistral/v1/"], - "scorers": ["scorers/"], + "scorers_test": ["scorers/"], } test_dirs = test_dirs_dict.get(shard, default_test_dirs) diff --git a/pyproject.toml b/pyproject.toml index b66e6d2ef903..14d46d5eec6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,8 @@ litellm = ["litellm>=1.36.1"] llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] -scorers = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] +scorers = ["Levenshtein>=0.26.0"] +scorers_test = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"] openai = ["openai>=1.0.0"] modal = ["modal", "python-dotenv"] From 2d08123860e2f465f1d7a14ff53d1c3f341f0989 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 11:18:19 +0200 Subject: [PATCH 110/150] make more real --- tests/scorers/test_similarity_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scorers/test_similarity_scorer.py b/tests/scorers/test_similarity_scorer.py index 75e03fa6f315..1a2d11a1490a 100644 --- a/tests/scorers/test_similarity_scorer.py +++ b/tests/scorers/test_similarity_scorer.py @@ -12,7 +12,7 @@ def mock_embed(monkeypatch): def _mock_embed(*args, **kwargs): import random - return [[random.random(), random.random()] for _ in range(2)] + return [[random.random() for _ in range(1024)] for _ in range(2)] 
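    # Patch the scorer module's `embed` helper with the 1024-dimensional random
    # mock above, so EmbeddingSimilarityScorer tests never call a real embedding API.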
monkeypatch.setattr("weave.flow.scorers.similarity_scorer.embed", _mock_embed) From f29b305d52eb635e9853e7f845311e393cedff94 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 11:23:57 +0200 Subject: [PATCH 111/150] duh, don't tet the `scorers` shard --- noxfile.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index fadbb11d86ea..404fb7639040 100644 --- a/noxfile.py +++ b/noxfile.py @@ -11,7 +11,6 @@ "litellm", "notdiamond", "google_ai_studio", - "scorers", "scorers_test", ] @@ -42,7 +41,6 @@ def lint(session): "mistral1", "notdiamond", "openai", - "scorers", "scorers_test", ], ) From 49d29b68e193ebb6348cfbd91fa4b1e38f53022b Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 11:25:09 +0200 Subject: [PATCH 112/150] lint --- weave/flow/scorer.py | 4 +++- weave/flow/scorers/similarity_scorer.py | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py index 348013054b38..1ac3fcea5060 100644 --- a/weave/flow/scorer.py +++ b/weave/flow/scorer.py @@ -2,4 +2,6 @@ # In future, users should import all scoring functionality from weave.scorers from weave.scorers import * -raise Warning("This file is deprecated. Please import all scoring functionality from weave.scorers") +raise Warning( + "This file is deprecated. Please import all scoring functionality from weave.scorers" +) diff --git a/weave/flow/scorers/similarity_scorer.py b/weave/flow/scorers/similarity_scorer.py index 77a3a6befc39..b2676feb46c6 100644 --- a/weave/flow/scorers/similarity_scorer.py +++ b/weave/flow/scorers/similarity_scorer.py @@ -21,7 +21,9 @@ class EmbeddingSimilarityScorer(LLMScorer): @weave.op def score(self, output: str, target: str) -> Any: - assert self.threshold >= -1 and self.threshold <= 1, "`threshold` should be between -1 and 1" + assert ( + self.threshold >= -1 and self.threshold <= 1 + ), "`threshold` should be between -1 and 1" model_embedding, target_embedding = self._compute_embeddings(output, target) return self.cosine_similarity(model_embedding, target_embedding) @@ -37,4 +39,7 @@ def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> dict: arr2 = np.array(vec2) cosine_sim = np.dot(arr1, arr2) / (np.linalg.norm(arr1) * np.linalg.norm(arr2)) cosine_sim = float(cosine_sim) - return {"similarity_score": cosine_sim, "is_similar": cosine_sim >= self.threshold} + return { + "similarity_score": cosine_sim, + "is_similar": cosine_sim >= self.threshold, + } From 9e7ec30b4a339c7407881816b420409236b9998d Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 14:46:58 +0200 Subject: [PATCH 113/150] rename integrations --- noxfile.py | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/noxfile.py b/noxfile.py index 404fb7639040..e31daa98de56 100644 --- a/noxfile.py +++ b/noxfile.py @@ -11,7 +11,7 @@ "litellm", "notdiamond", "google_ai_studio", - "scorers_test", + "scorers_integrations", ] @@ -41,7 +41,7 @@ def lint(session): "mistral1", "notdiamond", "openai", - "scorers_test", + "scorers_integrations", ], ) def tests(session, shard): @@ -67,7 +67,7 @@ def tests(session, shard): # we are doing some integration test in test_llm_integrations.py that requires # setting some environment variables for the LLM providers - if shard == "scorers_test": + if shard == "scorers_integrations": env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY") env["MISTRAL_API_KEY"] = 
session.env.get("MISTRAL_API_KEY") @@ -79,7 +79,7 @@ def tests(session, shard): "trace_server": ["trace_server/"], "mistral0": ["integrations/mistral/v0/"], "mistral1": ["integrations/mistral/v1/"], - "scorers_test": ["scorers/"], + "scorers_integrations": ["scorers/"], } test_dirs = test_dirs_dict.get(shard, default_test_dirs) diff --git a/pyproject.toml b/pyproject.toml index 14d46d5eec6b..1b2419fc2981 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] scorers = ["Levenshtein>=0.26.0"] -scorers_test = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] +scorers_integrations = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"] openai = ["openai>=1.0.0"] modal = ["modal", "python-dotenv"] From 2a0fd55f38698fcc8d813963f024db236a1e6590 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 21 Oct 2024 14:48:51 +0200 Subject: [PATCH 114/150] missing update to test.yaml --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fd3c17598e96..40004df8ac5f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -240,7 +240,7 @@ jobs: 'mistral1', 'notdiamond', 'openai', - 'scorers', + 'scorers_integrations', ] fail-fast: false services: From e69757a548754dbdb6cf00149f0cdc6b366048db Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 09:20:19 +0100 Subject: [PATCH 115/150] add Instructor req to scorers deps --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9ead92fd08cb..0821923cab4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ litellm = ["litellm>=1.36.1"] llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] -scorers = ["Levenshtein>=0.26.0"] +scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"] scorers_integrations = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"] openai = ["openai>=1.0.0"] From e8a974ea15253657a466d16e2ed2582f03e343e7 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 09:47:15 +0100 Subject: [PATCH 116/150] Updating column error messages to be more consistent eval.py --- weave/flow/eval.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 09d358698759..7f3d30e7cb82 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -250,10 +250,10 @@ async def predict_and_score( raise OpCallError(message) if isinstance(example, dict): - # The keys of `score_args` must match the parameter names of the scorer's `score` method. + # The keys of `score_args` must match the argument names of the scorer's `score` method. # If scorer.column_map is set, then user is indicating that the dataset column(s) - # being passed to the scorer have different names to the scorer's parameter names. 
- # So we need to remap the dataset columns to the expected parameter names in the scorer, + # being passed to the scorer have different names to the `score` functions' argument names. + # So we need to remap the dataset columns to the expected argument names in the scorer, # # column_map k:v pairs must be structured as `scorer param name : dataset column name` # @@ -276,11 +276,11 @@ async def predict_and_score( f""" You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. - The `column_map` contains a key `{key}` which is not in the scorer's argument names. - Scorer argument names: {score_arg_names} + The `column_map` contains a key, `{key}`, which is not in the `score` methods' argument names. + `score` methods' argument names: {score_arg_names} Hint: - - Ensure that the keys in `column_map` match the scorer's parameter names. + - Ensure that the keys in `column_map` match the scorer's argument names. """ ) raise ValueError(message) @@ -305,7 +305,7 @@ async def predict_and_score( Available dataset columns: {list(example.keys())} Hint: - - Ensure that `column_map` maps scorer parameter names to existing dataset column names. + - Ensure that `column_map` maps the `score` methods' argument names to existing dataset column names. """ ) raise ValueError(message) @@ -314,13 +314,16 @@ async def predict_and_score( f""" You have created `{scorer_name}(column_map={scorer.column_map}, ...)`. - Scorer argument `{arg}` is not found in the dataset columns and is not mapped in `column_map`. + `score` method argument `{arg}` is not found in the dataset columns and is not mapped in `column_map`. Available dataset columns: {list(example.keys())} `column_map`: {scorer.column_map} Hint: - - Either provide `{arg}` directly in the dataset, or map it via `column_map`. + Either: + - map the argument name to the dataset column using the scorers `column_map` attribute, in the form {{score_arg_name : dataset_column_name}} or + - rename a column in the dataset to `{arg}` or + - re-name the `{arg}` argument in your `score` method to match a dataset column name """ ) raise ValueError(message) From 475eaf92ea4922ee00f525f3d85e079e93561f14 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 09:48:03 +0100 Subject: [PATCH 117/150] Apply docs suggestions from code review Co-authored-by: Andrew Truong --- docs/docs/guides/evaluation/scorers.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 5bd38a3c6670..7b3e49e85d2d 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -91,7 +91,7 @@ When a weave `Evaluation` is run, the output of the AI system is passed to the ` ### Mapping Column Names Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`. -If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialise your scorer class. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer keyword argument : dataset column name}`. +If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialise your scorer class. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer_keyword_argument: dataset_column_name}`. Example: @@ -302,7 +302,7 @@ llm_client = ... 
# initialise your LlM client entity_recall_scorer = ContextEntityRecallScorer( client=llm_client model_id="your-model-id" - ) +) ``` **How It Works:** From ec5826e69ec12c89ab338790be1be24a95c3bdab Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 09:56:06 +0100 Subject: [PATCH 118/150] Modify scorers error message to be more consistent and precise eval.py --- weave/flow/eval.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/weave/flow/eval.py b/weave/flow/eval.py index 7f3d30e7cb82..ae433129f467 100644 --- a/weave/flow/eval.py +++ b/weave/flow/eval.py @@ -363,7 +363,7 @@ async def predict_and_score( except OpCallError as e: dataset_column_names = list(example.keys()) dataset_column_names_str = ", ".join(dataset_column_names[:3]) - if len(dataset_column_names) > 3: + if len(dataset_column_names) > 10: dataset_column_names_str += ", ..." required_arg_names = [ param.name @@ -377,19 +377,20 @@ async def predict_and_score( Call error: {e} If using the `Scorer` weave class, you can set the `scorer.column_map` - attribute to map scorer parameter names to dataset columns. + attribute to map scorer argument names to dataset columns. - For example, if the scorer expects "output", "input" and "ground_truth" and we have a dataset - with columns "question" and "answer", `column_map` can be used to map the non-output parameter to like so: + For example, if the `score` expects "output", "input" and "ground_truth" and we have a dataset + with columns "question" and "answer", `column_map` can be used to map the non-output parameter like so: {{"input": "question", "ground_truth": "answer"}} scorer argument names: {score_arg_names} dataset keys: {example.keys()} - scorer.column_map: {getattr(scorer, 'column_map', None)} + scorer.column_map: {getattr(scorer, 'column_map', '{}')} Options for resolving: - a. change {scorer_name} argument names to match a subset of dataset column names ({dataset_column_names_str}) - b. change dataset column names to match expected {scorer_name} argument names: {required_arg_names} + a. if using the `Scorer` weave class, you can set the `scorer.column_map` attribute to map scorer argument names to dataset column names or + b. change the argument names the in the scoring function of {scorer_name} to match a subset of dataset column names: ({dataset_column_names_str}) or + c. change dataset column names to match expected {scorer_name} argument names: {required_arg_names} """ ) raise OpCallError(message) From 652ae756260f7ff20a785eb82fb089f85d917162 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 10:09:37 +0100 Subject: [PATCH 119/150] add deprecation warning to flow/scorer.py --- weave/flow/scorer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py index 1ac3fcea5060..86df3d6a0553 100644 --- a/weave/flow/scorer.py +++ b/weave/flow/scorer.py @@ -1,7 +1,12 @@ # Keeping this file for now to avoid breaking changes. # In future, users should import all scoring functionality from weave.scorers +import warnings + from weave.scorers import * -raise Warning( - "This file is deprecated. Please import all scoring functionality from weave.scorers" +warnings.warn( + "Importing from weave.flow.scorer is deprecated. 
" + "Please import from weave.scorers in the future.", + DeprecationWarning, + stacklevel=2, ) From dc4242ecf56651ba430706239f6a5d6fc28cd778 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 10:20:14 +0100 Subject: [PATCH 120/150] remove temp weave.flow.scorers dir --- weave/flow/scorers/__init__.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 weave/flow/scorers/__init__.py diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py deleted file mode 100644 index 180cb3f7d2f9..000000000000 --- a/weave/flow/scorers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Keeping this file for backwards compatibility -from weave.scorers import * From 0b61f7cc95213c44171dbecc50978bbd407caa23 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 10:34:23 +0100 Subject: [PATCH 121/150] Fix code formatting in scorers docs --- docs/docs/guides/evaluation/scorers.md | 42 ++++++++++++++------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index f04d8cecf5c2..10d9e6dcc00b 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -36,7 +36,7 @@ Example: ```python from weave import Scorer -from weave.scorers import create +from typing import Dict llm_client = ... @@ -50,18 +50,18 @@ class SummarizationScorer(Scorer): return text @weave.op - def call_llm(self, summary: str, text: str) -> Dict: + def ask_llm(self, summary: str, text: str) -> Dict: res = llm_client.create(self.system_prompt, summary, text) return {"summary_quality": res} @weave.op - def score(self, output: str, text: str) -> Dict - """" + def score(self, output: str, text: str) -> Dict: + """ output: The summary generated by an AI system text: The original text being summarised - """" - text = some_complicated_preprocessing(text) - eval_result = call_llm(summary, text) + """ + text = self.some_complicated_preprocessing(text) + eval_result = self.ask_llm(output, text) return {"summary_quality": eval_result} summarization_scorer = SummarizationScorer(model_id="o2") @@ -79,24 +79,25 @@ For example if you wanted to use a column called "label" from your dataset then ```python @weave.op -def my_custom_scorer(outout: str, label: int): +def my_custom_scorer(output: str, label: int): ... ``` -When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer parameter names to your dataset columns. If customizing your scorer parameters or dataset columns is not feasible, you can use column mapping - see below for more. +When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer argument names to your dataset columns. If customizing your scorer arguments or dataset columns is not feasible, you can use column mapping - see below for more. - **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output. -### Mapping Column Names -Sometimes, the scorer's parameter names don't match the column names in your dataset. You can fix this using a `column_map`. +### Mapping Column Names with column_map +Sometimes, the `score` methods' argument names don't match the column names in your dataset. You can fix this using a `column_map`. 
-If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialise your scorer class. This dictionary maps your scorer's parameter names to the dataset's column names, in the order: `{scorer_keyword_argument: dataset_column_name}`. +If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialise your scorer class. This dictionary maps your `score` method's argument names to the dataset's column names, in the order: `{scorer_keyword_argument: dataset_column_name}`. Example: ```python from weave import Scorer +from typing import Dict # A dataset with news articles to be summarised dataset = [ @@ -105,20 +106,21 @@ dataset = [ ] # Scorer class -class SummarizationScorer(Scorer) +class SummarizationScorer(Scorer): @weave.op - def score(output, text) + def score(output, text) -> Dict: """ output: output summary from a LLM summarization system text: the text being summarised """ ... # evaluate the quality of the summary -# create a scorer with a column mapping the `text` parameter to the `news_article` data column +# create a scorer with a column mapping the `text` argument to the `news_article` data column scorer = SummarizationScorer(column_map={"text" : "news_article"}) ``` -Here, the `text` parameter in the score method will receive data from the `news_article` column. + +Now, the `text` argument in the `score` method will receive data from the `news_article` dataset column. ## Predefined Scorers @@ -142,7 +144,7 @@ This scorer checks if your AI system's output includes any hallucinations based ```python from weave.scorers import HallucinationFreeScorer -llm_client = # initialize your LLM client here +llm_client = ... # initialize your LLM client here scorer = HallucinationFreeScorer( client=llm_client, @@ -165,7 +167,7 @@ Use an LLM to compare a summary to the original text and evaluate the quality of ```python from weave.scorers import SummarizationScorer -llm_client = # initialize your LLM client here +llm_client = ... # initialize your LLM client here scorer = SummarizationScorer( client=llm_client, @@ -197,7 +199,7 @@ The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI sys ```python from weave.scorers import OpenAIModerationScorer -import openai +from openai import OpenAI oai_client = OpenAI(api_key=...) # initialize your LLM client here @@ -285,7 +287,7 @@ class FinancialReport(BaseModel): revenue: int year: str -pydantic_scorer = PydanticScorer(model=Person) +pydantic_scorer = PydanticScorer(model=FinancialReport) ``` --- From afc1ab1690ee573a48374c9f00d6a442d0f002f5 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 10:37:25 +0100 Subject: [PATCH 122/150] scorers.md formatting --- docs/docs/guides/evaluation/scorers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 10d9e6dcc00b..382bb7a7dfaf 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -101,7 +101,7 @@ from typing import Dict # A dataset with news articles to be summarised dataset = [ - {"news_article": "The news today was great...", "date": "2030-04-20", "source": "Bright Sky Network"} + {"news_article": "The news today was great...", "date": "2030-04-20", "source": "Bright Sky Network"}, ... 
] From ff05051ce482035839f7a28d3946bce180c60601 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 10:39:53 +0100 Subject: [PATCH 123/150] scorers docs update --- docs/docs/guides/evaluation/scorers.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 382bb7a7dfaf..8e038c27b07c 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -10,7 +10,8 @@ Scorers are passed to a `weave.Evaluation` object during evaluation. There are t Scorers must return a dictionary and can return multiple metrics, nested metrics and non-numeric values such as text returned from a LLM-evaluator about its reasoning. -## Function-based Scorers +## Create your own Scorers +### Function-based Scorers These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like: ```python @@ -23,7 +24,7 @@ my_eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. -## Class-based Scorers +### Class-based Scorers For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class. **Requirements:** From c6aac641ecfab65e60dfbcd786c5e5e882acab14 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 11:10:33 +0100 Subject: [PATCH 124/150] rename scorers_integrations to scorers_tests, fix scorers imports in tests --- .github/workflows/test.yaml | 2 +- noxfile.py | 8 ++-- pyproject.toml | 2 +- tests/scorers/test_hallucination_scorer.py | 4 +- tests/scorers/test_llm_integrations.py | 2 +- tests/scorers/test_pydantic_scorer.py | 53 ++++++---------------- tests/scorers/test_ragas_scorer.py | 4 +- tests/scorers/test_similarity_scorer.py | 6 +-- tests/scorers/test_summarization_scorer.py | 4 +- tests/scorers/test_utils.py | 2 +- weave/scorers/classification_scorer.py | 2 +- weave/scorers/hallucination_scorer.py | 6 +-- weave/scorers/json_scorer.py | 2 +- weave/scorers/llm_scorer.py | 4 +- weave/scorers/moderation_scorer.py | 2 +- weave/scorers/pydantic_scorer.py | 2 +- weave/scorers/ragas_scorer.py | 4 +- weave/scorers/similarity_scorer.py | 4 +- weave/scorers/string_scorer.py | 2 +- weave/scorers/summarization_scorer.py | 4 +- weave/scorers/xml_scorer.py | 2 +- 21 files changed, 48 insertions(+), 73 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 40004df8ac5f..4a440b3bc335 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -240,7 +240,7 @@ jobs: 'mistral1', 'notdiamond', 'openai', - 'scorers_integrations', + 'scorers_tests', ] fail-fast: false services: diff --git a/noxfile.py b/noxfile.py index e31daa98de56..1249de554f83 100644 --- a/noxfile.py +++ b/noxfile.py @@ -11,7 +11,7 @@ "litellm", "notdiamond", "google_ai_studio", - "scorers_integrations", + "scorers_tests", ] @@ -41,7 +41,7 @@ def lint(session): "mistral1", "notdiamond", "openai", - "scorers_integrations", + "scorers_tests", ], ) def tests(session, shard): @@ -67,7 +67,7 @@ def tests(session, shard): # we are doing some integration test in test_llm_integrations.py that requires # setting some environment variables for the LLM providers - if shard == "scorers_integrations": + if shard == "scorers_tests": env["GOOGLE_API_KEY"] = 
session.env.get("GOOGLE_API_KEY") env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY") env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY") @@ -79,7 +79,7 @@ def tests(session, shard): "trace_server": ["trace_server/"], "mistral0": ["integrations/mistral/v0/"], "mistral1": ["integrations/mistral/v1/"], - "scorers_integrations": ["scorers/"], + "scorers_tests": ["scorers/"], } test_dirs = test_dirs_dict.get(shard, default_test_dirs) diff --git a/pyproject.toml b/pyproject.toml index 0821923cab4e..eee32e782c15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"] -scorers_integrations = ["openai>=1.0.0", "instructor>=1.5.2", "Levenshtein>=0.26.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] +scorers_tests = ["instructor>=1.5.2", "Levenshtein>=0.26.0", "openai>=1.0.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"] openai = ["openai>=1.0.0"] modal = ["modal", "python-dotenv"] diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 8303ddf75f2e..520cf773d497 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -2,7 +2,7 @@ from openai import OpenAI import weave -from weave.flow.scorers.hallucination_scorer import ( +from weave.scorers.hallucination_scorer import ( HallucinationReasoning, HallucinationResponse, ) @@ -27,7 +27,7 @@ def _mock_create(*args, **kwargs): has_hallucination=True, ) - monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create) + monkeypatch.setattr("weave.scorers.hallucination_scorer.create", _mock_create) @pytest.fixture diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py index c2419ec6da2a..0336955d740b 100644 --- a/tests/scorers/test_llm_integrations.py +++ b/tests/scorers/test_llm_integrations.py @@ -2,7 +2,7 @@ import pytest -from weave.flow.scorers.summarization_scorer import ( +from weave.scorers.summarization_scorer import ( SummarizationEvaluationResponse, SummarizationScorer, ) diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py index f9953ba6abd9..f06dc83bca74 100644 --- a/tests/scorers/test_pydantic_scorer.py +++ b/tests/scorers/test_pydantic_scorer.py @@ -14,42 +14,17 @@ def user_scorer(): return PydanticScorer(model=User) -def test_pydantic_scorer_initialization(): - scorer = PydanticScorer(model=User) - assert isinstance(scorer, PydanticScorer) - assert scorer.model == User - - -def test_pydantic_scorer_valid_json_string(user_scorer): - valid_json = '{"name": "John", "age": 30}' - assert user_scorer.score(valid_json) == {"valid_pydantic": True} - - -def test_pydantic_scorer_valid_dict(user_scorer): - valid_dict = {"name": "John", "age": 30} - assert user_scorer.score(valid_dict) == {"valid_pydantic": True} - - -def test_pydantic_scorer_invalid_json_string(user_scorer): - invalid_json = '{"name": "John", "age": "thirty"}' - assert user_scorer.score(invalid_json) == {"valid_pydantic": False} - - -def test_pydantic_scorer_invalid_dict(user_scorer): - invalid_dict = {"name": "John", "age": "thirty"} - assert user_scorer.score(invalid_dict) == {"valid_pydantic": False} - - -def test_pydantic_scorer_missing_field(user_scorer): - missing_field = '{"name": 
"John"}' - assert user_scorer.score(missing_field) == {"valid_pydantic": False} - - -def test_pydantic_scorer_extra_field(user_scorer): - extra_field = '{"name": "John", "age": 30, "city": "New York"}' - assert user_scorer.score(extra_field) == {"valid_pydantic": True} - - -def test_pydantic_scorer_invalid_input_type(user_scorer): - invalid_input = 123 # Neither a string nor a dict - assert user_scorer.score(invalid_input) == {"valid_pydantic": False} +@pytest.mark.parametrize( + "input_data, expected_result", + [ + ('{"name": "John", "age": 30}', {"valid_pydantic": True}), + ({"name": "John", "age": 30}, {"valid_pydantic": True}), + ('{"name": "John", "age": "thirty"}', {"valid_pydantic": False}), + ({"name": "John", "age": "thirty"}, {"valid_pydantic": False}), + ('{"name": "John"}', {"valid_pydantic": False}), + ('{"name": "John", "age": 30, "city": "New York"}', {"valid_pydantic": True}), + (123, {"valid_pydantic": False}), + ], +) +def test_pydantic_scorer(user_scorer, input_data, expected_result): + assert user_scorer.score(input_data) == expected_result diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index e2eb164a675b..e9c44338b973 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -1,7 +1,7 @@ import pytest from openai import OpenAI -from weave.flow.scorers.ragas_scorer import ( +from weave.scorers.ragas_scorer import ( EntityExtractionResponse, RelevancyResponse, ) @@ -25,7 +25,7 @@ def _mock_create(*args, **kwargs): relevancy_score=1, ) - monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create) + monkeypatch.setattr("weave.scorers.ragas_scorer.create", _mock_create) @pytest.fixture diff --git a/tests/scorers/test_similarity_scorer.py b/tests/scorers/test_similarity_scorer.py index 1a2d11a1490a..0a02296a55a0 100644 --- a/tests/scorers/test_similarity_scorer.py +++ b/tests/scorers/test_similarity_scorer.py @@ -2,8 +2,8 @@ from openai import OpenAI import weave -from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL -from weave.flow.scorers.similarity_scorer import EmbeddingSimilarityScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL +from weave.scorers.similarity_scorer import EmbeddingSimilarityScorer # mock the create function @@ -14,7 +14,7 @@ def _mock_embed(*args, **kwargs): return [[random.random() for _ in range(1024)] for _ in range(2)] - monkeypatch.setattr("weave.flow.scorers.similarity_scorer.embed", _mock_embed) + monkeypatch.setattr("weave.scorers.similarity_scorer.embed", _mock_embed) @pytest.fixture diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index c30135fe8052..6e89f7102d76 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -2,7 +2,7 @@ from openai import OpenAI import weave -from weave.flow.scorers.summarization_scorer import ( +from weave.scorers.summarization_scorer import ( EntityExtractionResponse, SummarizationEvaluationResponse, ) @@ -24,7 +24,7 @@ def _mock_create(*args, **kwargs): ) # Patch the 'create' function wherever it is called - monkeypatch.setattr("weave.flow.scorers.summarization_scorer.create", _mock_create) + monkeypatch.setattr("weave.scorers.summarization_scorer.create", _mock_create) @pytest.fixture diff --git a/tests/scorers/test_utils.py b/tests/scorers/test_utils.py index 73bef952810f..03d95aff6c9a 100644 --- a/tests/scorers/test_utils.py +++ b/tests/scorers/test_utils.py @@ -1,4 +1,4 @@ 
-from weave.flow.scorers.utils import stringify +from weave.scorers.utils import stringify def test_stringify(): diff --git a/weave/scorers/classification_scorer.py b/weave/scorers/classification_scorer.py index 4082b291029e..7c6cb1207c38 100644 --- a/weave/scorers/classification_scorer.py +++ b/weave/scorers/classification_scorer.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple import weave -from weave.flow.scorers.base_scorer import Scorer +from weave.scorers.base_scorer import Scorer def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: diff --git a/weave/scorers/hallucination_scorer.py b/weave/scorers/hallucination_scorer.py index b78bb97dc801..1aee20121340 100644 --- a/weave/scorers/hallucination_scorer.py +++ b/weave/scorers/hallucination_scorer.py @@ -3,9 +3,9 @@ from pydantic import BaseModel, Field import weave -from weave.flow.scorers.llm_scorer import InstructorLLMScorer -from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create -from weave.flow.scorers.utils import stringify +from weave.scorers.llm_scorer import InstructorLLMScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create +from weave.scorers.utils import stringify DEFAULT_HALLUCINATION_SYSTEM_PROMPT = """ Given some from a user and an generated by an AI system, \ diff --git a/weave/scorers/json_scorer.py b/weave/scorers/json_scorer.py index 7f4e566c88ac..5c6a69a6d51f 100644 --- a/weave/scorers/json_scorer.py +++ b/weave/scorers/json_scorer.py @@ -1,7 +1,7 @@ import json from typing import Any -from weave.flow.scorers.base_scorer import Scorer +from weave.scorers.base_scorer import Scorer class ValidJSONScorer(Scorer): diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py index 82b68667aa0f..b97189e93b61 100644 --- a/weave/scorers/llm_scorer.py +++ b/weave/scorers/llm_scorer.py @@ -1,7 +1,7 @@ from pydantic import Field, field_validator -from weave.flow.scorers.base_scorer import Scorer -from weave.flow.scorers.llm_utils import ( +from weave.scorers.base_scorer import Scorer +from weave.scorers.llm_utils import ( _LLM_CLIENTS, _LLM_CLIENTS_NAMES, instructor_client, diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py index 8a8e4eee9dae..4b284c908acf 100644 --- a/weave/scorers/moderation_scorer.py +++ b/weave/scorers/moderation_scorer.py @@ -3,7 +3,7 @@ from pydantic import field_validator import weave -from weave.flow.scorers.llm_scorer import LLMScorer +from weave.scorers.llm_scorer import LLMScorer class OpenAIModerationScorer(LLMScorer): diff --git a/weave/scorers/pydantic_scorer.py b/weave/scorers/pydantic_scorer.py index 5566326774d7..381834f270be 100644 --- a/weave/scorers/pydantic_scorer.py +++ b/weave/scorers/pydantic_scorer.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from weave.flow.scorers.base_scorer import Scorer +from weave.scorers.base_scorer import Scorer class PydanticScorer(Scorer): diff --git a/weave/scorers/ragas_scorer.py b/weave/scorers/ragas_scorer.py index 53693cb16e13..fe0b7be26a8e 100644 --- a/weave/scorers/ragas_scorer.py +++ b/weave/scorers/ragas_scorer.py @@ -5,8 +5,8 @@ from pydantic import BaseModel, Field import weave -from weave.flow.scorers.llm_scorer import InstructorLLMScorer -from weave.flow.scorers.llm_utils import create +from weave.scorers.llm_scorer import InstructorLLMScorer +from weave.scorers.llm_utils import create class EntityExtractionResponse(BaseModel): diff --git a/weave/scorers/similarity_scorer.py b/weave/scorers/similarity_scorer.py index 
b2676feb46c6..c084107d49b3 100644 --- a/weave/scorers/similarity_scorer.py +++ b/weave/scorers/similarity_scorer.py @@ -4,8 +4,8 @@ from pydantic import Field import weave -from weave.flow.scorers.llm_scorer import LLMScorer -from weave.flow.scorers.llm_utils import embed +from weave.scorers.llm_scorer import LLMScorer +from weave.scorers.llm_utils import embed class EmbeddingSimilarityScorer(LLMScorer): diff --git a/weave/scorers/string_scorer.py b/weave/scorers/string_scorer.py index 62f4eff72e07..1cca2bf6ec86 100644 --- a/weave/scorers/string_scorer.py +++ b/weave/scorers/string_scorer.py @@ -3,7 +3,7 @@ from pydantic import Field, model_validator import weave -from weave.flow.scorers.base_scorer import Scorer +from weave.scorers.base_scorer import Scorer class StringMatchScorer(Scorer): diff --git a/weave/scorers/summarization_scorer.py b/weave/scorers/summarization_scorer.py index 7e2049071a8c..c77df0acfc3a 100644 --- a/weave/scorers/summarization_scorer.py +++ b/weave/scorers/summarization_scorer.py @@ -4,8 +4,8 @@ from pydantic import BaseModel, Field import weave -from weave.flow.scorers.llm_scorer import InstructorLLMScorer -from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create +from weave.scorers.llm_scorer import InstructorLLMScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create DEFAULT_EXTRACTION_SYSTEM_PROMPT = """ Given a , extract all the unique entities from the text without repetition. diff --git a/weave/scorers/xml_scorer.py b/weave/scorers/xml_scorer.py index 2ea8384477f5..fc18188c2c3b 100644 --- a/weave/scorers/xml_scorer.py +++ b/weave/scorers/xml_scorer.py @@ -1,7 +1,7 @@ import xml.etree.ElementTree as ET from typing import Union -from weave.flow.scorers.base_scorer import Scorer +from weave.scorers.base_scorer import Scorer class ValidXMLScorer(Scorer): From ac127e81ddf930e4a923174e040fb0d400b8dadf Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 11:14:14 +0100 Subject: [PATCH 125/150] scorers tests linting --- tests/scorers/test_hallucination_scorer.py | 6 +++--- tests/scorers/test_ragas_scorer.py | 8 ++++---- tests/scorers/test_summarization_scorer.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index 520cf773d497..5f71fe724b92 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -2,13 +2,13 @@ from openai import OpenAI import weave +from weave.scorers import ( + HallucinationFreeScorer, +) from weave.scorers.hallucination_scorer import ( HallucinationReasoning, HallucinationResponse, ) -from weave.scorers import ( - HallucinationFreeScorer, -) # mock the create function diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index e9c44338b973..f663ac965c2a 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -1,14 +1,14 @@ import pytest from openai import OpenAI -from weave.scorers.ragas_scorer import ( - EntityExtractionResponse, - RelevancyResponse, -) from weave.scorers import ( ContextEntityRecallScorer, ContextRelevancyScorer, ) +from weave.scorers.ragas_scorer import ( + EntityExtractionResponse, + RelevancyResponse, +) # Mock the create function diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 6e89f7102d76..255a1055b7d2 100644 --- a/tests/scorers/test_summarization_scorer.py +++ 
b/tests/scorers/test_summarization_scorer.py @@ -2,13 +2,13 @@ from openai import OpenAI import weave +from weave.scorers import ( + SummarizationScorer, +) from weave.scorers.summarization_scorer import ( EntityExtractionResponse, SummarizationEvaluationResponse, ) -from weave.scorers import ( - SummarizationScorer, -) @pytest.fixture From 1730b5737b1692833420f320e094880395c65eb6 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 22 Oct 2024 11:21:22 +0100 Subject: [PATCH 126/150] parameterize scorer tests --- tests/scorers/test_json_scorer.py | 61 ++++++++--------------------- tests/scorers/test_string_scorer.py | 53 ++++++++++--------------- 2 files changed, 37 insertions(+), 77 deletions(-) diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py index 6cd1cf480cff..c80b7a54743e 100644 --- a/tests/scorers/test_json_scorer.py +++ b/tests/scorers/test_json_scorer.py @@ -1,50 +1,21 @@ -from weave.scorers import ValidJSONScorer - - -def test_json_scorer_valid_json(): - scorer = ValidJSONScorer() - output = '{"city": "San Francisco", "country": "USA"}' - result = scorer.score(output) - assert result["json_valid"] is True - - -def test_json_scorer_invalid_json(): - scorer = ValidJSONScorer() - output = '{"city": "San Francisco", "country": "USA"' - result = scorer.score(output) - assert result["json_valid"] is False - - -def test_json_scorer_non_json_string(): - scorer = ValidJSONScorer() - output = "Just a plain string." - result = scorer.score(output) - assert result["json_valid"] is False +import pytest - -def test_json_scorer_valid_json_list(): - scorer = ValidJSONScorer() - output = "[1, 2, 3, 4, 5]" - result = scorer.score(output) - assert result["json_valid"] is True - - -def test_json_scorer_nested_json(): - scorer = ValidJSONScorer() - output = '{"person": {"name": "John", "age": 30}, "city": "New York"}' - result = scorer.score(output) - assert result["json_valid"] is True - - -def test_json_scorer_empty_object(): - scorer = ValidJSONScorer() - output = "{}" - result = scorer.score(output) - assert result["json_valid"] is True +from weave.scorers import ValidJSONScorer -def test_json_scorer_empty_list(): +@pytest.mark.parametrize( + "output, expected_result", + [ + ('{"city": "San Francisco", "country": "USA"}', True), + ('{"city": "San Francisco", "country": "USA"', False), + ("Just a plain string.", False), + ("[1, 2, 3, 4, 5]", True), + ('{"person": {"name": "John", "age": 30}, "city": "New York"}', True), + ("{}", True), + ("[]", True), + ], +) +def test_json_scorer(output, expected_result): scorer = ValidJSONScorer() - output = "[]" result = scorer.score(output) - assert result["json_valid"] is True + assert result["json_valid"] is expected_result diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index a599bdd17cf7..2c635ea81db0 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -1,44 +1,33 @@ +import pytest + from weave.scorers import ( LevenshteinScorer, StringMatchScorer, ) -def test_string_match_scorer(): - scorer = StringMatchScorer() - output = "Morgan" - target = "Hello my name is Morgan" - result = scorer.score(output, target) - assert result["string_in_input"] is True - - -def test_string_match_scorer_false(): +@pytest.mark.parametrize( + "output, target, expected_result", + [ + ("Morgan", "Hello my name is Morgan", True), + ("Alice", "Hello my name is Bob", False), + ], +) +def test_string_match_scorer(output, target, expected_result): scorer = 
StringMatchScorer() - output = "Alice" - target = "Hello my name is Bob" result = scorer.score(output, target) - assert result["string_in_input"] is False + assert result["string_in_input"] is expected_result -def test_levenshtein_scorer(): - scorer = LevenshteinScorer() - output = "Hello" - target = "Hallo" - result = scorer.score(output, target) - assert result["levenshtein_distance"] == 1 - - -def test_levenshtein_scorer_same_strings(): - scorer = LevenshteinScorer() - output = "Hello" - target = "Hello" - result = scorer.score(output, target) - assert result["levenshtein_distance"] == 0 - - -def test_levenshtein_scorer_completely_different(): +@pytest.mark.parametrize( + "output, target, expected_distance", + [ + ("Hello", "Hallo", 1), + ("Hello", "Hello", 0), + ("Hello", "World", 4), + ], +) +def test_levenshtein_scorer(output, target, expected_distance): scorer = LevenshteinScorer() - output = "Hello" - target = "World" result = scorer.score(output, target) - assert result["levenshtein_distance"] == 4 + assert result["levenshtein_distance"] == expected_distance From 8187547b67a181e716b8c06fe4ef7cea13d49f52 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 23 Oct 2024 10:33:15 +0200 Subject: [PATCH 127/150] add full evaluation examples for each scorer --- docs/docs/guides/evaluation/scorers.md | 232 ++++++++++++++++++++++++- 1 file changed, 229 insertions(+), 3 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 8e038c27b07c..61218703634e 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -159,6 +159,35 @@ scorer = HallucinationFreeScorer( **Notes:** - The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column. +Here you have an example in the context of an evaluation: + +```python +from weave.scorers import HallucinationFreeScorer +from openai import OpenAI +import weave + +class SimpleModel(weave.Model): + @weave.op() + async def predict(self, question: str) -> str: + return "The Earth is the third planet from the Sun." + +llm_client = OpenAI() +model = SimpleModel() +hallucination_scorer = HallucinationFreeScorer( + client=llm_client, + model_id="gpt-4o", + column_map={"context": "background_info"} +) + +dataset = [ + {"background_info": "The Earth is the third planet from the Sun.", "question": "What is the position of Earth in the solar system?"}, + {"background_info": "Paris is the capital of France.", "question": "What is the capital of Germany?"} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[hallucination_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` --- ### `SummarizationScorer` @@ -192,6 +221,32 @@ This scorer evaluates summaries in two ways: - The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed. +Here you have an example usage of the `SummarizationScorer` in the context of an evaluation: + +```python +from weave.scorers import SummarizationScorer +from openai import OpenAI +import weave + +class SummarizationModel(weave.Model): + @weave.op() + async def predict(self, input: str) -> str: + return "This is a summary of the input text." 
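        # (Placeholder output: a real summarization model would generate this
        # summary from `input` with an LLM call.)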
+ +llm_client = OpenAI() +model = SummarizationModel() +summarization_scorer = SummarizationScorer(client=llm_client, model_id="gpt-4o") + +dataset = [ + {"input": "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "expected": "A sentence with all alphabet letters."}, + {"input": "Artificial Intelligence is revolutionizing various industries, from healthcare to finance.", "expected": "AI's impact on different sectors."} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` + --- ### `OpenAIModerationScorer` @@ -218,6 +273,25 @@ scorer = OpenAIModerationScorer( - Requires the `openai` Python package. - The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client. + +Here you have an example in the context of an evaluation: +```python +from weave.scorers import OpenAIModerationScorer +from openai import OpenAI + +client = OpenAI() +moderation_scorer = OpenAIModerationScorer(client=client) + +dataset = [ + {"input": "I love puppies and kittens!"}, + {"input": "I hate everyone and want to hurt them."} +] + +evaluation = Evaluation(dataset=dataset, scorers=[moderation_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` + --- ### `EmbeddingSimilarityScorer` @@ -246,6 +320,37 @@ similarity_scorer = EmbeddingSimilarityScorer( The correct cosine similarity threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds. + +Here you have an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation: + +```python +from weave.scorers import EmbeddingSimilarityScorer +from openai import OpenAI +import weave + +class AnswerModel(weave.Model): + @weave.op() + async def predict(self, question: str) -> str: + return "The capital of France is Paris." + +llm_client = OpenAI() +model = AnswerModel() +similarity_scorer = EmbeddingSimilarityScorer( + client=llm_client, + threshold=0.7, + column_map={"target": "reference_answer"} +) + +dataset = [ + {"question": "What is the capital of France?", "reference_answer": "The capital of France is Paris."}, + {"question": "Who wrote Romeo and Juliet?", "reference_answer": "Shakespeare wrote Romeo and Juliet."} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[similarity_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` + --- ### `ValidJSONScorer` @@ -258,8 +363,31 @@ from weave.scorers import ValidJSONScorer json_scorer = ValidJSONScorer() ``` -**Notes:** -- If the output cannot be parsed as JSON, or if it parses to a data type other than dict or list, it is considered invalid. +Here you have an example usage of the `ValidJSONScorer` in the context of an evaluation: + +```python +from weave.scorers import ValidJSONScorer +import weave + +class JSONModel(weave.Model): + @weave.op() + async def predict(self, input: str) -> str: + # This is a placeholder. + # In a real scenario, this would generate JSON. 
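        # (e.g. by prompting an LLM and returning its raw JSON string for validation)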
+ return '{"key": "value"}' + +model = JSONModel() +json_scorer = ValidJSONScorer() + +dataset = [ + {"input": "Generate a JSON object with a key and value"}, + {"input": "Create an invalid JSON"} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` --- @@ -274,6 +402,32 @@ from weave.scorers import ValidXMLScorer xml_scorer = ValidXMLScorer() ``` + +Here you have an example usage of the `ValidXMLScorer` in the context of an evaluation: + +```python +from weave.scorers import ValidXMLScorer +import weave + +class XMLModel(weave.Model): + @weave.op() + async def predict(self, input: str) -> str: + # This is a placeholder. In a real scenario, this would generate XML. + return 'value' + +model = XMLModel() +xml_scorer = ValidXMLScorer() + +dataset = [ + {"input": "Generate a valid XML with a root element"}, + {"input": "Create an invalid XML"} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` + --- ### `PydanticScorer` @@ -318,6 +472,36 @@ entity_recall_scorer = ContextEntityRecallScorer( - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. + +Here you have an example usage of the `ContextEntityRecallScorer` in the context of an evaluation: + +```python +from weave.scorers import ContextEntityRecallScorer +from openai import OpenAI +import weave + +class RAGModel(weave.Model): + @weave.op() + async def predict(self, question: str, context: str) -> str: + return "Paris is the capital of France." + +llm_client = OpenAI() +model = RAGModel() +entity_recall_scorer = ContextEntityRecallScorer( + client=llm_client, + model_id="gpt-4o", + column_map={"context": "answer"} +) + +dataset = [ + {"question": "What is the capital of France?", "answer": "The capital city of France is Paris."}, + {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare wrote many famous plays, including Romeo and Juliet."} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[entity_recall_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` --- ### RAGAS - `ContextRelevancyScorer` @@ -343,4 +527,46 @@ relevancy_scorer = ContextRelevancyScorer( **Notes:** - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. -- Customize the `relevancy_prompt` to define how relevancy is assessed. \ No newline at end of file +- Customize the `relevancy_prompt` to define how relevancy is assessed. + + +Here you have an example usage of the `ContextRelevancyScorer` in the context of an evaluation: + +```python +from weave.scorers import ContextRelevancyScorer +from openai import OpenAI +import weave +from textwrap import dedent + +class RAGModel(weave.Model): + @weave.op() + async def predict(self, question: str, context: str) -> str: + return "Paris is the capital of France." + +llm_client = OpenAI() +model = RAGModel() + +relevancy_prompt: str = dedent(""" + Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. 
+ + Question: {question} + Context: {context} + Relevancy Score (0-1): + """) + + +relevancy_scorer = ContextRelevancyScorer( + client=llm_client, + model_id="gpt-4o", + relevancy_prompt=relevancy_prompt +) + +dataset = [ + {"question": "What is the capital of France?", "context": "Paris is the capital city of France."}, + {"question": "Who wrote Romeo and Juliet?", "context": "The Eiffel Tower is located in Paris, France."} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[relevancy_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +``` \ No newline at end of file From df7a4a1998f73fd450858bdd067679b3fcde3775 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 25 Oct 2024 15:27:02 +0200 Subject: [PATCH 128/150] subclass to map columns --- docs/docs/guides/evaluation/scorers.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 61218703634e..b87e987dd44e 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -37,7 +37,6 @@ Example: ```python from weave import Scorer -from typing import Dict llm_client = ... @@ -51,12 +50,12 @@ class SummarizationScorer(Scorer): return text @weave.op - def ask_llm(self, summary: str, text: str) -> Dict: + def ask_llm(self, summary: str, text: str) -> dict: res = llm_client.create(self.system_prompt, summary, text) return {"summary_quality": res} @weave.op - def score(self, output: str, text: str) -> Dict: + def score(self, output: str, text: str) -> dict: """ output: The summary generated by an AI system text: The original text being summarised @@ -98,7 +97,6 @@ Example: ```python from weave import Scorer -from typing import Dict # A dataset with news articles to be summarised dataset = [ @@ -110,7 +108,7 @@ dataset = [ class SummarizationScorer(Scorer): @weave.op - def score(output, text) -> Dict: + def score(output, text) -> dict: """ output: output summary from a LLM summarization system text: the text being summarised @@ -123,6 +121,18 @@ scorer = SummarizationScorer(column_map={"text" : "news_article"}) Now, the `text` argument in the `score` method will receive data from the `news_article` dataset column. +**Notes:** +- Another equivalent option to map your columns is to subclass the `Scorer` and overload the `score` method mapping the columns explicitly. + +```python +class MySummarizationScorer(SummarizationScorer): + + @weave.op + def score(output, news_article) -> dict: + # overload the score method and map columns manually + super().score(output=output, text=news_article) +``` + ## Predefined Scorers From 8f8abec06b4d33c90865085cd76317b0a86d47e4 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 25 Oct 2024 17:24:47 +0200 Subject: [PATCH 129/150] scorer summarization --- docs/docs/guides/evaluation/scorers.md | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index b87e987dd44e..4951393f7fd8 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -133,6 +133,36 @@ class MySummarizationScorer(SummarizationScorer): super().score(output=output, text=news_article) ``` +### Final summarization of the scorer + +During evaluation, the scorer will be computed for each row of your dataset. 
To provide a final score for the evaluation we provide an `auto_summarize` depending on the returning type of the output. + - average will be computed for numerical columns + - count and fraction for boolean cols + - other col types are ignored + +You can override the `summarize` method on the `Scorer` class and provide your own way of computing the final scores. The `summarize` function expects: + +**Why this is useful?** + +When you need to score all rows before deciding on the final value of the score for the dataset. + +```python +class MyBinaryScorer(Scorer): + """ + Returns True if the full output matches the target, False if not + """ + + @weave.op + def score(output, target): + return {"match": if output == target} + + def summarize(self, score_rows: list) -> dict: + full_match = all(row["match"] for row in score_rows) + return {"full_match": full_match} +``` +> In this example, the default `auto_summarize` would have returned the count and proportion of True. + +If you want to learn more, check the implementation of [CorrectnessLLMJudge](/tutorial-rag#optional-defining-a-scorer-class). ## Predefined Scorers From d89ebcc45027d3efddb334a9cb878c0dd7df5a5c Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 25 Oct 2024 17:28:15 +0200 Subject: [PATCH 130/150] add missing summarize --- docs/docs/guides/evaluation/scorers.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 4951393f7fd8..40e6dfbc59f4 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -142,6 +142,9 @@ During evaluation, the scorer will be computed for each row of your dataset. To You can override the `summarize` method on the `Scorer` class and provide your own way of computing the final scores. The `summarize` function expects: +- A single parameter `score_rows`: This is a list of dictionaries, where each dictionary contains the scores returned by the `score` method for a single row of your dataset. +- It should return a dictionary containing the summarized scores. + **Why this is useful?** When you need to score all rows before deciding on the final value of the score for the dataset. From f9911c39aa2ce6d2a808289294131722be840173 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 11:43:19 +0100 Subject: [PATCH 131/150] remove unused kwargs --- weave/scorers/summarization_scorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weave/scorers/summarization_scorer.py b/weave/scorers/summarization_scorer.py index c77df0acfc3a..adcf4bea33bf 100644 --- a/weave/scorers/summarization_scorer.py +++ b/weave/scorers/summarization_scorer.py @@ -115,7 +115,7 @@ class SummarizationScorer(InstructorLLMScorer): evaluate_summary(input: str, summary: str) -> SummarizationEvaluationResponse: Evaluates the quality of a summary using an LLM. - score(input: str, output: str, **kwargs: Any) -> dict: + score(input: str, output: str) -> dict: Calculates summarization score and entity density score for the given input and output. 
""" @@ -177,7 +177,7 @@ def simple_word_tokenize(self, text: str) -> List[str]: return text.split() @weave.op - async def score(self, input: str, output: str, **kwargs: Any) -> dict: + async def score(self, input: str, output: str) -> dict: extract_task = asyncio.to_thread(self.extract_entities, text=str(output)) evaluate_task = asyncio.to_thread( self.evaluate_summary, input=str(input), summary=str(output) From 5f7388d35d7f229e78638834180bf0613840ebb6 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 11:48:35 +0100 Subject: [PATCH 132/150] set default parameters for temp and max_tokens --- weave/scorers/ragas_scorer.py | 56 ++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/weave/scorers/ragas_scorer.py b/weave/scorers/ragas_scorer.py index fe0b7be26a8e..98f5cbdf923e 100644 --- a/weave/scorers/ragas_scorer.py +++ b/weave/scorers/ragas_scorer.py @@ -6,7 +6,7 @@ import weave from weave.scorers.llm_scorer import InstructorLLMScorer -from weave.scorers.llm_utils import create +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create class EntityExtractionResponse(BaseModel): @@ -17,8 +17,28 @@ class EntityExtractionResponse(BaseModel): class ContextEntityRecallScorer(InstructorLLMScorer): """ - Estimates context recall by extracting entities from the model output - and the context, then computes the recall. + A Scorer that estimates context recall by extracting entities from both the model output + and the context, then computing the recall score between them. + + Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects two arguments: 'output' (the model's response) and 'context' + (the reference text). If your dataset columns have different names, use the `column_map` + argument when initializing the scorer. + - Entity extraction is performed using an LLM, so results may vary based on the model used. + + Attributes: + extraction_prompt (str): The prompt template used to extract entities from text. Must + contain a {text} placeholder. + model_id (str): The LLM model name, depends on the LLM provider being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + score(output: str, context: str) -> dict: + Computes the recall score by comparing entities in the output against those in the context. + Returns a dict with a 'recall' key containing the score (0.0 to 1.0). """ extraction_prompt: str = dedent(""" @@ -27,6 +47,9 @@ class ContextEntityRecallScorer(InstructorLLMScorer): Text: {text} Entities: """) + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 4096 def extract_entities(self, text: str) -> list[str]: # Use LLM to extract entities @@ -65,7 +88,29 @@ class RelevancyResponse(BaseModel): class ContextRelevancyScorer(InstructorLLMScorer): - """Evaluates the relevancy of the provided context to the model output.""" + """ + A Scorer that evaluates the relevancy of the provided context to the model output using an LLM. + + Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects two arguments: 'output' (treated as the question) and 'context' + (the reference text). 
If your dataset columns have different names, use the `column_map` + argument when initializing the scorer. + - The relevancy score is binary (0 or 1) where 1 indicates relevant context. + + Attributes: + relevancy_prompt (str): The prompt template used to evaluate context relevancy. Must + contain placeholders for both {question} and {context}. + model_id (str): The LLM model name, depends on the LLM provider being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + score(output: str, context: str) -> dict: + Evaluates the relevancy of the context to the output/question. + Returns a dict with 'relevancy_score' (0 or 1) and 'reasoning' keys. + """ relevancy_prompt: str = dedent(""" Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. @@ -74,6 +119,9 @@ class ContextRelevancyScorer(InstructorLLMScorer): Context: {context} Relevancy Score (0-1): """) + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 4096 @weave.op def score(self, output: str, context: str) -> dict: From cc12cf1044c9277bd5fed28ec8dde6f7cfc30a99 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 11:52:39 +0100 Subject: [PATCH 133/150] improve docstrings --- weave/scorers/llm_scorer.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py index b97189e93b61..79979282a734 100644 --- a/weave/scorers/llm_scorer.py +++ b/weave/scorers/llm_scorer.py @@ -9,7 +9,15 @@ class LLMScorer(Scorer): - """Score a model output using an LLM""" + """Score model outputs using a Language Learning Model (LLM). + + This scorer leverages LLMs to evaluate and score model outputs. It provides a flexible + way to use different LLM providers for scoring purposes. + + Attributes: + client: An instantiated LLM client with valid API credentials + model_id: The specific model identifier to use for scoring + """ client: _LLM_CLIENTS = Field( description="The LLM client to use, has to be instantiated with an api_key" @@ -27,7 +35,18 @@ def validate_client(cls, v): # type: ignore class InstructorLLMScorer(Scorer): - """Score a model output using an LLM""" + """Score model outputs using an LLM with instructor-guided evaluation. + + This scorer extends the base LLM scoring capability by adding temperature and + token control for more precise scoring behavior. It automatically wraps the + provided client with instructor functionality for structured outputs. 
+ + Attributes: + client: An instantiated LLM client with valid API credentials + model_id: The specific model identifier to use for scoring + temperature: Controls randomness in the LLM's responses (0.0 to 1.0) + max_tokens: Maximum number of tokens allowed in the LLM's response + """ client: _LLM_CLIENTS = Field( description="The LLM client to use, has to be instantiated with an api_key" From f65928e8fbaceed8bda244083e6278cbc01778eb Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 11:52:52 +0100 Subject: [PATCH 134/150] add default model_id --- weave/scorers/similarity_scorer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/weave/scorers/similarity_scorer.py b/weave/scorers/similarity_scorer.py index c084107d49b3..a20e4a13841e 100644 --- a/weave/scorers/similarity_scorer.py +++ b/weave/scorers/similarity_scorer.py @@ -5,7 +5,7 @@ import weave from weave.scorers.llm_scorer import LLMScorer -from weave.scorers.llm_utils import embed +from weave.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL, embed class EmbeddingSimilarityScorer(LLMScorer): @@ -18,6 +18,7 @@ class EmbeddingSimilarityScorer(LLMScorer): """ threshold: float = Field(0.5, description="The threshold for the similarity score") + model_id: str = OPENAI_DEFAULT_EMBEDDING_MODEL @weave.op def score(self, output: str, target: str) -> Any: From cde268160833291cf33228fdf2e4603c55329e57 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 11:54:51 +0100 Subject: [PATCH 135/150] typos and link to instructor --- weave/scorers/llm_scorer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py index 79979282a734..4cfc56734834 100644 --- a/weave/scorers/llm_scorer.py +++ b/weave/scorers/llm_scorer.py @@ -9,7 +9,7 @@ class LLMScorer(Scorer): - """Score model outputs using a Language Learning Model (LLM). + """Score model outputs using a Large Language Model (LLM). This scorer leverages LLMs to evaluate and score model outputs. It provides a flexible way to use different LLM providers for scoring purposes. @@ -35,11 +35,12 @@ def validate_client(cls, v): # type: ignore class InstructorLLMScorer(Scorer): - """Score model outputs using an LLM with instructor-guided evaluation. + """Score a model using an LLM with structured outputs. This scorer extends the base LLM scoring capability by adding temperature and token control for more precise scoring behavior. It automatically wraps the - provided client with instructor functionality for structured outputs. + provided client with [instructor](https://github.com/instructor-ai/instructor) + functionality for structured outputs. 
Attributes: client: An instantiated LLM client with valid API credentials From fd8c8d570fbc8f0f926b3fc07c2bf9adb0a68685 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 12:14:28 +0100 Subject: [PATCH 136/150] add default model --- weave/scorers/moderation_scorer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py index 4b284c908acf..3bc5b4139493 100644 --- a/weave/scorers/moderation_scorer.py +++ b/weave/scorers/moderation_scorer.py @@ -4,10 +4,15 @@ import weave from weave.scorers.llm_scorer import LLMScorer - +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODERATION_MODEL class OpenAIModerationScorer(LLMScorer): - """Use OpenAI moderation API to check if the model output is safe""" + """Use OpenAI moderation API to check if the model output is safe. + + Args: + model_id: The OpenAI model to use for moderation. Defaults to `text-moderation-latest`. + """ + model_id: str = OPENAI_DEFAULT_MODERATION_MODEL @field_validator("client") def validate_openai_client(cls, v): # type: ignore From c47be95420077557b44d33cbc9954557e7713b2f Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 12:27:53 +0100 Subject: [PATCH 137/150] fixed code snippets --- docs/docs/guides/evaluation/scorers.md | 117 ++++++++++++++++--------- 1 file changed, 76 insertions(+), 41 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 40e6dfbc59f4..afc6307b330f 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -205,31 +205,42 @@ scorer = HallucinationFreeScorer( Here you have an example in the context of an evaluation: ```python -from weave.scorers import HallucinationFreeScorer +import asyncio from openai import OpenAI import weave - -class SimpleModel(weave.Model): - @weave.op() - async def predict(self, question: str) -> str: - return "The Earth is the third planet from the Sun." +from weave.scorers import HallucinationFreeScorer llm_client = OpenAI() -model = SimpleModel() + hallucination_scorer = HallucinationFreeScorer( client=llm_client, model_id="gpt-4o", - column_map={"context": "background_info"} + temperature=0.7, + max_tokens=4096, + column_map={"context": "input", "output": "other_col"} ) - dataset = [ - {"background_info": "The Earth is the third planet from the Sun.", "question": "What is the position of Earth in the solar system?"}, - {"background_info": "Paris is the capital of France.", "question": "What is the capital of Germany?"} -] + { + "input": "John likes various types of cheese.", + "other_col": "John's favorite cheese is cheddar.", + }, + { + "input": "Pepe likes various types of cheese.", + "other_col": "Pepe's favorite cheese is gouda.", + }, + ] -evaluation = weave.Evaluation(dataset=dataset, scorers=[hallucination_scorer]) -results = asyncio.run(evaluation.evaluate(model)) -print(results) +@weave.op +def model(input): + return "The person's favorite cheese is cheddar." 
+ +evaluation = weave.Evaluation( + dataset=dataset, + scorers=[hallucination_scorer], +) +result = asyncio.run(evaluation.evaluate(model)) +# result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2 +# result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0 ``` --- @@ -267,22 +278,30 @@ This scorer evaluates summaries in two ways: Here you have an example usage of the `SummarizationScorer` in the context of an evaluation: ```python -from weave.scorers import SummarizationScorer +import asyncio from openai import OpenAI import weave +from weave.scorers import SummarizationScorer class SummarizationModel(weave.Model): + """ + A model that generates a summary of the input text. + """ @weave.op() async def predict(self, input: str) -> str: return "This is a summary of the input text." llm_client = OpenAI() model = SummarizationModel() -summarization_scorer = SummarizationScorer(client=llm_client, model_id="gpt-4o") - +summarization_scorer = SummarizationScorer( + client=llm_client, + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, +) dataset = [ - {"input": "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "expected": "A sentence with all alphabet letters."}, - {"input": "Artificial Intelligence is revolutionizing various industries, from healthcare to finance.", "expected": "AI's impact on different sectors."} + {"input": "The quick brown fox jumps over the lazy dog."}, + {"input": "Artificial Intelligence is revolutionizing various industries, from healthcare to finance."} ] evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) @@ -319,8 +338,10 @@ scorer = OpenAIModerationScorer( Here you have an example in the context of an evaluation: ```python -from weave.scorers import OpenAIModerationScorer +import asyncio from openai import OpenAI +import weave +from weave.scorers import OpenAIModerationScorer client = OpenAI() moderation_scorer = OpenAIModerationScorer(client=client) @@ -367,31 +388,39 @@ The correct cosine similarity threshold to set can fluctuate quite a lot dependi Here you have an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation: ```python -from weave.scorers import EmbeddingSimilarityScorer +import asyncio from openai import OpenAI import weave +from weave.scorers import EmbeddingSimilarityScorer -class AnswerModel(weave.Model): - @weave.op() - async def predict(self, question: str) -> str: - return "The capital of France is Paris." -llm_client = OpenAI() -model = AnswerModel() similarity_scorer = EmbeddingSimilarityScorer( client=llm_client, threshold=0.7, - column_map={"target": "reference_answer"} + column_map={"target": "other_col"} ) dataset = [ - {"question": "What is the capital of France?", "reference_answer": "The capital of France is Paris."}, - {"question": "Who wrote Romeo and Juliet?", "reference_answer": "Shakespeare wrote Romeo and Juliet."} + { + "input": "He's name is John", + "other_col": "John likes various types of cheese.", + }, + { + "input": "He's name is Pepe.", + "other_col": "Pepe likes various types of cheese.", + }, ] -evaluation = weave.Evaluation(dataset=dataset, scorers=[similarity_scorer]) -results = asyncio.run(evaluation.evaluate(model)) -print(results) +@weave.op +def model(input): + return "John likes various types of cheese." 
+ +evaluation = weave.Evaluation( + dataset=dataset, + scorers=[similarity_scorer], +) +result = asyncio.run(evaluation.evaluate(model)) +print(result) ``` --- @@ -409,8 +438,9 @@ json_scorer = ValidJSONScorer() Here you have an example usage of the `ValidJSONScorer` in the context of an evaluation: ```python -from weave.scorers import ValidJSONScorer +import asyncio import weave +from weave.scorers import ValidJSONScorer class JSONModel(weave.Model): @weave.op() @@ -449,8 +479,9 @@ xml_scorer = ValidXMLScorer() Here you have an example usage of the `ValidXMLScorer` in the context of an evaluation: ```python -from weave.scorers import ValidXMLScorer +import asyncio import weave +from weave.scorers import ValidXMLScorer class XMLModel(weave.Model): @weave.op() @@ -519,13 +550,15 @@ entity_recall_scorer = ContextEntityRecallScorer( Here you have an example usage of the `ContextEntityRecallScorer` in the context of an evaluation: ```python -from weave.scorers import ContextEntityRecallScorer +import asyncio from openai import OpenAI import weave +from weave.scorers import ContextEntityRecallScorer class RAGModel(weave.Model): @weave.op() - async def predict(self, question: str, context: str) -> str: + async def predict(self, question: str) -> str: + "Retrieve relevant context" return "Paris is the capital of France." llm_client = OpenAI() @@ -576,14 +609,16 @@ relevancy_scorer = ContextRelevancyScorer( Here you have an example usage of the `ContextRelevancyScorer` in the context of an evaluation: ```python -from weave.scorers import ContextRelevancyScorer +import asyncio +from textwrap import dedent from openai import OpenAI import weave -from textwrap import dedent +from weave.scorers import ContextRelevancyScorer class RAGModel(weave.Model): @weave.op() - async def predict(self, question: str, context: str) -> str: + async def predict(self, question: str) -> str: + "Retrieve relevant context" return "Paris is the capital of France." llm_client = OpenAI() From 4a80d5e452719c7cc533d601dadeb6aedb602e69 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 12:29:51 +0100 Subject: [PATCH 138/150] typos --- docs/docs/guides/evaluation/scorers.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index afc6307b330f..4062e38b9163 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -16,10 +16,10 @@ These are functions decorated with `@weave.op` that return a dictionary. They're ```python @weave.op -def evaluate_uppercase(text: str): +def evaluate_uppercase(text: str) -> dict: # Added return type hint return {"text_is_uppercase": text.isupper()} -my_eval = weave.Evaluations(..., scorers=[evaluate_uppercase]) +my_eval = weave.Evaluation(..., scorers=[evaluate_uppercase]) ``` When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. @@ -56,16 +56,18 @@ class SummarizationScorer(Scorer): @weave.op def score(self, output: str, text: str) -> dict: - """ - output: The summary generated by an AI system - text: The original text being summarised + """Score the summary quality. 
+ + Args: + output: The summary generated by an AI system + text: The original text being summarised """ text = self.some_complicated_preprocessing(text) eval_result = self.ask_llm(output, text) return {"summary_quality": eval_result} summarization_scorer = SummarizationScorer(model_id="o2") -eval = weave.Evaluations(..., scorers=[summarization_scorer]) +eval = weave.Evaluation(..., scorers=[summarization_scorer]) ``` This class evaluates how good a summary is by comparing it to the original text. @@ -79,7 +81,7 @@ For example if you wanted to use a column called "label" from your dataset then ```python @weave.op -def my_custom_scorer(output: str, label: int): +def my_custom_scorer(output: str, label: int) -> dict: # Added return type hint ... ``` @@ -128,9 +130,9 @@ Now, the `text` argument in the `score` method will receive data from the `news_ class MySummarizationScorer(SummarizationScorer): @weave.op - def score(output, news_article) -> dict: + def score(self, output: str, news_article: str) -> dict: # Added type hints # overload the score method and map columns manually - super().score(output=output, text=news_article) + return super().score(output=output, text=news_article) ``` ### Final summarization of the scorer @@ -647,4 +649,4 @@ dataset = [ evaluation = weave.Evaluation(dataset=dataset, scorers=[relevancy_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) -``` \ No newline at end of file +``` From 8f1acce2f07cf784fe58d0331f1be177c01828a2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 12:51:33 +0100 Subject: [PATCH 139/150] update docs --- docs/docs/guides/evaluation/scorers.md | 131 ++++++++++++------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 4062e38b9163..d7112f32829c 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -212,37 +212,32 @@ from openai import OpenAI import weave from weave.scorers import HallucinationFreeScorer +# Initialize clients and scorers llm_client = OpenAI() - hallucination_scorer = HallucinationFreeScorer( client=llm_client, model_id="gpt-4o", - temperature=0.7, - max_tokens=4096, column_map={"context": "input", "output": "other_col"} ) + +# Create dataset dataset = [ - { - "input": "John likes various types of cheese.", - "other_col": "John's favorite cheese is cheddar.", - }, - { - "input": "Pepe likes various types of cheese.", - "other_col": "Pepe's favorite cheese is gouda.", - }, - ] + {"input": "John likes various types of cheese."}, + {"input": "Pepe likes various types of cheese."}, +] @weave.op -def model(input): +def model(input: str) -> str: return "The person's favorite cheese is cheddar." +# Run evaluation evaluation = weave.Evaluation( dataset=dataset, scorers=[hallucination_scorer], ) result = asyncio.run(evaluation.evaluate(model)) -# result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2 -# result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0 +print(result) +# {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 1.4395725727081299}} ``` --- @@ -286,29 +281,28 @@ import weave from weave.scorers import SummarizationScorer class SummarizationModel(weave.Model): - """ - A model that generates a summary of the input text. - """ @weave.op() async def predict(self, input: str) -> str: return "This is a summary of the input text." 
+# Initialize clients and scorers llm_client = OpenAI() model = SummarizationModel() summarization_scorer = SummarizationScorer( client=llm_client, model_id="gpt-4o", - temperature=0.7, - max_tokens=1024, ) +# Create dataset dataset = [ {"input": "The quick brown fox jumps over the lazy dog."}, - {"input": "Artificial Intelligence is revolutionizing various industries, from healthcare to finance."} + {"input": "Artificial Intelligence is revolutionizing various industries."} ] +# Run evaluation evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) +# {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': 6.210803985595703e-05}} ``` --- @@ -345,17 +339,27 @@ from openai import OpenAI import weave from weave.scorers import OpenAIModerationScorer +class MyModel(weave.Model): + @weave.op + async def predict(self, input: str) -> str: + return input + +# Initialize clients and scorers client = OpenAI() +model = MyModel() moderation_scorer = OpenAIModerationScorer(client=client) +# Create dataset dataset = [ {"input": "I love puppies and kittens!"}, {"input": "I hate everyone and want to hurt them."} ] -evaluation = Evaluation(dataset=dataset, scorers=[moderation_scorer]) +# Run evaluation +evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) +# {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': 9.500980377197266e-05}} ``` --- @@ -395,34 +399,39 @@ from openai import OpenAI import weave from weave.scorers import EmbeddingSimilarityScorer - +# Initialize clients and scorers +client = OpenAI() similarity_scorer = EmbeddingSimilarityScorer( - client=llm_client, + client=client, threshold=0.7, - column_map={"target": "other_col"} + column_map={"target": "reference"} ) +# Create dataset dataset = [ { "input": "He's name is John", - "other_col": "John likes various types of cheese.", + "reference": "John likes various types of cheese.", }, { "input": "He's name is Pepe.", - "other_col": "Pepe likes various types of cheese.", + "reference": "Pepe likes various types of cheese.", }, ] +# Define model @weave.op -def model(input): +def model(input: str) -> str: return "John likes various types of cheese." 
+# Run evaluation evaluation = weave.Evaluation( dataset=dataset, scorers=[similarity_scorer], ) result = asyncio.run(evaluation.evaluate(model)) print(result) +# {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.8448514031462045}}, 'model_latency': {'mean': 0.45862746238708496}} ``` --- @@ -462,6 +471,7 @@ dataset = [ evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) +# {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.58306884765625e-05}} ``` @@ -502,6 +512,7 @@ dataset = [ evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) +# {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.20159912109375e-05}} ``` --- @@ -548,38 +559,6 @@ entity_recall_scorer = ContextEntityRecallScorer( - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. - -Here you have an example usage of the `ContextEntityRecallScorer` in the context of an evaluation: - -```python -import asyncio -from openai import OpenAI -import weave -from weave.scorers import ContextEntityRecallScorer - -class RAGModel(weave.Model): - @weave.op() - async def predict(self, question: str) -> str: - "Retrieve relevant context" - return "Paris is the capital of France." - -llm_client = OpenAI() -model = RAGModel() -entity_recall_scorer = ContextEntityRecallScorer( - client=llm_client, - model_id="gpt-4o", - column_map={"context": "answer"} -) - -dataset = [ - {"question": "What is the capital of France?", "answer": "The capital city of France is Paris."}, - {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare wrote many famous plays, including Romeo and Juliet."} -] - -evaluation = weave.Evaluation(dataset=dataset, scorers=[entity_recall_scorer]) -results = asyncio.run(evaluation.evaluate(model)) -print(results) -``` --- ### RAGAS - `ContextRelevancyScorer` @@ -608,14 +587,14 @@ relevancy_scorer = ContextRelevancyScorer( - Customize the `relevancy_prompt` to define how relevancy is assessed. -Here you have an example usage of the `ContextRelevancyScorer` in the context of an evaluation: +Here you have an example usage of `ContextEntityRecallScorer` and `ContextRelevancyScorer` in the context of an evaluation: ```python import asyncio from textwrap import dedent from openai import OpenAI import weave -from weave.scorers import ContextRelevancyScorer +from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer class RAGModel(weave.Model): @weave.op() @@ -623,9 +602,10 @@ class RAGModel(weave.Model): "Retrieve relevant context" return "Paris is the capital of France." -llm_client = OpenAI() + model = RAGModel() +# Define prompts relevancy_prompt: str = dedent(""" Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. 
@@ -634,6 +614,12 @@ relevancy_prompt: str = dedent(""" Relevancy Score (0-1): """) +# Initialize clients and scorers +llm_client = OpenAI() +entity_recall_scorer = ContextEntityRecallScorer( + client=client, + model_id="gpt-4o", +) relevancy_scorer = ContextRelevancyScorer( client=llm_client, @@ -641,12 +627,25 @@ relevancy_scorer = ContextRelevancyScorer( relevancy_prompt=relevancy_prompt ) +# Create dataset dataset = [ - {"question": "What is the capital of France?", "context": "Paris is the capital city of France."}, - {"question": "Who wrote Romeo and Juliet?", "context": "The Eiffel Tower is located in Paris, France."} + { + "question": "What is the capital of France?", + "context": "Paris is the capital city of France." + }, + { + "question": "Who wrote Romeo and Juliet?", + "context": "William Shakespeare wrote many famous plays." + } ] -evaluation = weave.Evaluation(dataset=dataset, scorers=[relevancy_scorer]) +# Run evaluation +evaluation = weave.Evaluation( + dataset=dataset, + scorers=[entity_recall_scorer, relevancy_scorer] +) results = asyncio.run(evaluation.evaluate(model)) print(results) +# {'ContextEntityRecallScorer': {'recall': {'mean': 0.3333333333333333}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 9.393692016601562e-05}} ``` + From d66e9a81dc8ce28dd05c519a7dab8443cab9572a Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 12:51:43 +0100 Subject: [PATCH 140/150] add default embedding model --- weave/scorers/llm_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weave/scorers/llm_utils.py b/weave/scorers/llm_utils.py index 3debc2e3293d..70dddfe8c079 100644 --- a/weave/scorers/llm_utils.py +++ b/weave/scorers/llm_utils.py @@ -8,6 +8,7 @@ OPENAI_DEFAULT_MODEL = "gpt-4o" OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" +OPENAI_DEFAULT_MODERATION_MODEL = "text-moderation-latest" ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet" From b1932bce6d90c38940494f4e09375201b1fbe115 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 12:58:59 +0100 Subject: [PATCH 141/150] remove unused col --- tests/scorers/test_summarization_scorer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 255a1055b7d2..ca6c3f7139b9 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -85,11 +85,9 @@ async def test_evaluate_summary_scorer(summarization_scorer): dataset = [ { "input": "This is the original text.", - "output": "This is the summary.", }, { "input": "This is another original text.", - "output": "This is another summary.", }, ] evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) From e212a039e6b09e0d4e69f1e962f56b51d98b7b9f Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 13:04:16 +0100 Subject: [PATCH 142/150] lint --- weave/scorers/llm_scorer.py | 2 +- weave/scorers/moderation_scorer.py | 2 ++ weave/scorers/ragas_scorer.py | 8 ++++---- weave/scorers/summarization_scorer.py | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py index 4cfc56734834..1d66fd412442 100644 --- a/weave/scorers/llm_scorer.py +++ b/weave/scorers/llm_scorer.py @@ -39,7 +39,7 @@ class InstructorLLMScorer(Scorer): This scorer extends the base LLM scoring capability by adding temperature and token control for more precise scoring behavior. 
It automatically wraps the - provided client with [instructor](https://github.com/instructor-ai/instructor) + provided client with [instructor](https://github.com/instructor-ai/instructor) functionality for structured outputs. Attributes: diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py index 3bc5b4139493..7880de9b0006 100644 --- a/weave/scorers/moderation_scorer.py +++ b/weave/scorers/moderation_scorer.py @@ -6,12 +6,14 @@ from weave.scorers.llm_scorer import LLMScorer from weave.scorers.llm_utils import OPENAI_DEFAULT_MODERATION_MODEL + class OpenAIModerationScorer(LLMScorer): """Use OpenAI moderation API to check if the model output is safe. Args: model_id: The OpenAI model to use for moderation. Defaults to `text-moderation-latest`. """ + model_id: str = OPENAI_DEFAULT_MODERATION_MODEL @field_validator("client") diff --git a/weave/scorers/ragas_scorer.py b/weave/scorers/ragas_scorer.py index 98f5cbdf923e..a8b754af541a 100644 --- a/weave/scorers/ragas_scorer.py +++ b/weave/scorers/ragas_scorer.py @@ -23,8 +23,8 @@ class ContextEntityRecallScorer(InstructorLLMScorer): Note: - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM provider's response; you will have to install the `instructor` python package to use it. - - The `score` method expects two arguments: 'output' (the model's response) and 'context' - (the reference text). If your dataset columns have different names, use the `column_map` + - The `score` method expects two arguments: 'output' (the model's response) and 'context' + (the reference text). If your dataset columns have different names, use the `column_map` argument when initializing the scorer. - Entity extraction is performed using an LLM, so results may vary based on the model used. @@ -94,8 +94,8 @@ class ContextRelevancyScorer(InstructorLLMScorer): Note: - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM provider's response; you will have to install the `instructor` python package to use it. - - The `score` method expects two arguments: 'output' (treated as the question) and 'context' - (the reference text). If your dataset columns have different names, use the `column_map` + - The `score` method expects two arguments: 'output' (treated as the question) and 'context' + (the reference text). If your dataset columns have different names, use the `column_map` argument when initializing the scorer. - The relevancy score is binary (0 or 1) where 1 indicates relevant context. 
diff --git a/weave/scorers/summarization_scorer.py b/weave/scorers/summarization_scorer.py index adcf4bea33bf..18e7c7cb64b7 100644 --- a/weave/scorers/summarization_scorer.py +++ b/weave/scorers/summarization_scorer.py @@ -1,5 +1,5 @@ import asyncio -from typing import Any, List, Literal +from typing import List, Literal from pydantic import BaseModel, Field From 390fd6c573eeef2b58174c39a50db3806b73f3cc Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 13:41:55 +0100 Subject: [PATCH 143/150] remove ignore type --- weave/scorers/llm_scorer.py | 4 ++-- weave/scorers/moderation_scorer.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py index 1d66fd412442..cd1d1a27bfc9 100644 --- a/weave/scorers/llm_scorer.py +++ b/weave/scorers/llm_scorer.py @@ -25,7 +25,7 @@ class LLMScorer(Scorer): model_id: str = Field(description="The model to use") @field_validator("client") - def validate_client(cls, v): # type: ignore + def validate_client(cls, v): client_type_name = type(v).__name__ if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( @@ -61,7 +61,7 @@ class InstructorLLMScorer(Scorer): ) @field_validator("client") - def validate_client(cls, v): # type: ignore + def validate_client(cls, v): client_type_name = type(v).__name__ if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py index 7880de9b0006..db128fe64c09 100644 --- a/weave/scorers/moderation_scorer.py +++ b/weave/scorers/moderation_scorer.py @@ -17,7 +17,8 @@ class OpenAIModerationScorer(LLMScorer): model_id: str = OPENAI_DEFAULT_MODERATION_MODEL @field_validator("client") - def validate_openai_client(cls, v): # type: ignore + def validate_openai_client(cls, v): + # Method implementation try: from openai import ( # Ensure these are the correct imports AsyncOpenAI, From 394e39ef03c81f3ca58a682ad549be98d732677f Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 14:48:19 +0100 Subject: [PATCH 144/150] remove ignores --- weave/scorers/llm_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/weave/scorers/llm_utils.py b/weave/scorers/llm_utils.py index 70dddfe8c079..da0be1b4d12d 100644 --- a/weave/scorers/llm_utils.py +++ b/weave/scorers/llm_utils.py @@ -21,6 +21,7 @@ import instructor from anthropic import Anthropic, AsyncAnthropic from google.generativeai import GenerativeModel + from instructor.patch import InstructorChatCompletionCreate from mistralai import Mistral from openai import AsyncOpenAI, OpenAI @@ -65,7 +66,9 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign raise ValueError(f"Unsupported client type: {client_type}") -def create(client: instructor.client, *args, **kwargs) -> Any: # type: ignore +def create( + client: instructor.client, *args: Any, **kwargs: Any +) -> InstructorChatCompletionCreate: # gemini has slightly different argument namings... 
# max_tokens -> max_output_tokens if "generativemodel" in type(client.client).__name__.lower(): @@ -84,10 +87,10 @@ def embed( ) -> List[List[float]]: # type: ignore client_type = type(client).__name__.lower() if "openai" in client_type: - response = client.embeddings.create(model=model_id, input=texts, **kwargs) # type: ignore + response = client.embeddings.create(model=model_id, input=texts, **kwargs) return [embedding.embedding for embedding in response.data] elif "mistral" in client_type: - response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) # type: ignore + response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) return [embedding.embedding for embedding in response.data] else: raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") From d67611c55c118d91e9f64d45e6c48f7835f345e3 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 14:48:34 +0100 Subject: [PATCH 145/150] type --- weave/scorers/llm_scorer.py | 4 ++-- weave/scorers/moderation_scorer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py index cd1d1a27bfc9..b3660a3b9cd3 100644 --- a/weave/scorers/llm_scorer.py +++ b/weave/scorers/llm_scorer.py @@ -25,7 +25,7 @@ class LLMScorer(Scorer): model_id: str = Field(description="The model to use") @field_validator("client") - def validate_client(cls, v): + def validate_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: client_type_name = type(v).__name__ if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( @@ -61,7 +61,7 @@ class InstructorLLMScorer(Scorer): ) @field_validator("client") - def validate_client(cls, v): + def validate_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: client_type_name = type(v).__name__ if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py index db128fe64c09..e2268bb21c06 100644 --- a/weave/scorers/moderation_scorer.py +++ b/weave/scorers/moderation_scorer.py @@ -4,7 +4,7 @@ import weave from weave.scorers.llm_scorer import LLMScorer -from weave.scorers.llm_utils import OPENAI_DEFAULT_MODERATION_MODEL +from weave.scorers.llm_utils import _LLM_CLIENTS, OPENAI_DEFAULT_MODERATION_MODEL class OpenAIModerationScorer(LLMScorer): @@ -17,7 +17,7 @@ class OpenAIModerationScorer(LLMScorer): model_id: str = OPENAI_DEFAULT_MODERATION_MODEL @field_validator("client") - def validate_openai_client(cls, v): + def validate_openai_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: # Method implementation try: from openai import ( # Ensure these are the correct imports From 3dd3dcebd742b19bb866e7ed3099fb0256f81be5 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 14:49:18 +0100 Subject: [PATCH 146/150] remove ignore type --- weave/scorers/llm_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/scorers/llm_utils.py b/weave/scorers/llm_utils.py index da0be1b4d12d..cf0a1584df56 100644 --- a/weave/scorers/llm_utils.py +++ b/weave/scorers/llm_utils.py @@ -84,7 +84,7 @@ def create( def embed( client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs: Any -) -> List[List[float]]: # type: ignore +) -> List[List[float]]: client_type = type(client).__name__.lower() if "openai" in client_type: response = client.embeddings.create(model=model_id, input=texts, **kwargs) From 30a11b3988e1ef0a18f5be4086f294a70bc11793 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 
28 Oct 2024 14:50:15 +0100 Subject: [PATCH 147/150] remove ignore type --- weave/scorers/llm_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weave/scorers/llm_utils.py b/weave/scorers/llm_utils.py index cf0a1584df56..4cf70af729a7 100644 --- a/weave/scorers/llm_utils.py +++ b/weave/scorers/llm_utils.py @@ -41,7 +41,7 @@ ) -def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ignore +def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": try: import instructor except ImportError: From 77f786300a3726856e76dfc252bf4cff5f9217ef Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 15:13:07 +0100 Subject: [PATCH 148/150] make copy-pastable --- docs/docs/guides/evaluation/scorers.md | 38 ++++++++++++++++++-------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index d7112f32829c..4502087f9750 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -15,11 +15,16 @@ Scorers must return a dictionary and can return multiple metrics, nested metrics These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like: ```python +import weave + @weave.op def evaluate_uppercase(text: str) -> dict: # Added return type hint return {"text_is_uppercase": text.isupper()} -my_eval = weave.Evaluation(..., scorers=[evaluate_uppercase]) +my_eval = weave.Evaluation( + dataset=[{"text": "HELLO WORLD"}], + scorers=[evaluate_uppercase] +) ``` When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. @@ -36,22 +41,30 @@ Example: ```python +from openai import OpenAI from weave import Scorer -llm_client = ... +llm_client = OpenAI() +#highlight-next-line class SummarizationScorer(Scorer): - model_id: str = "the LLM model to use" + model_id: str = "gpt-4o" system_prompt: str = "Evaluate whether the summary is good." @weave.op def some_complicated_preprocessing(self, text: str) -> str: - ... - return text + processed_text = "Original text: \n" + text + "\n" + return processed_text @weave.op - def ask_llm(self, summary: str, text: str) -> dict: - res = llm_client.create(self.system_prompt, summary, text) + def call_llm(self, summary: str, processed_text: str) -> dict: + res = llm_client.chat.completions.create( + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": ( + f"Analyse how good the summary is compared to the original text." + f"Summary: {summary}\n{processed_text}" + )}]) return {"summary_quality": res} @weave.op @@ -60,14 +73,15 @@ class SummarizationScorer(Scorer): Args: output: The summary generated by an AI system - text: The original text being summarised + text: The original text being summarized """ - text = self.some_complicated_preprocessing(text) - eval_result = self.ask_llm(output, text) + processed_text = self.some_complicated_preprocessing(text) + eval_result = self.call_llm(summary=output, processed_text=processed_text) return {"summary_quality": eval_result} -summarization_scorer = SummarizationScorer(model_id="o2") -eval = weave.Evaluation(..., scorers=[summarization_scorer]) +evaluation = weave.Evaluation( + dataset=[{"text": "The quick brown fox jumps over the lazy dog."}], + scorers=[summarization_scorer]) ``` This class evaluates how good a summary is by comparing it to the original text. 
From c23c95f5c24a9ab6895853f3f67c57a2ade20dec Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 15:21:37 +0100 Subject: [PATCH 149/150] remove missing ignores --- weave/scorers/json_scorer.py | 4 +++- weave/scorers/moderation_scorer.py | 4 ++-- weave/scorers/pydantic_scorer.py | 4 +++- weave/scorers/string_scorer.py | 5 +++-- weave/scorers/xml_scorer.py | 4 +++- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/weave/scorers/json_scorer.py b/weave/scorers/json_scorer.py index 5c6a69a6d51f..e7604a8f0aec 100644 --- a/weave/scorers/json_scorer.py +++ b/weave/scorers/json_scorer.py @@ -1,13 +1,15 @@ import json from typing import Any +import weave from weave.scorers.base_scorer import Scorer class ValidJSONScorer(Scorer): """Validate whether a string is valid JSON.""" - def score(self, output: Any) -> dict: # type: ignore + @weave.op + def score(self, output: Any) -> dict: try: _ = json.loads(output) return {"json_valid": True} diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py index e2268bb21c06..aaadeb7952c7 100644 --- a/weave/scorers/moderation_scorer.py +++ b/weave/scorers/moderation_scorer.py @@ -32,10 +32,10 @@ def validate_openai_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: return v @weave.op - def score(self, output: Any) -> Any: + def score(self, output: Any) -> dict: response = self.client.moderations.create( model=self.model_id, input=output, ).results[0] - categories = {k: v for k, v in response.categories.dict().items() if v} + categories = {k: v for k, v in response.categories.items() if v} return {"flagged": response.flagged, "categories": categories} diff --git a/weave/scorers/pydantic_scorer.py b/weave/scorers/pydantic_scorer.py index 381834f270be..0a5dcf1e768b 100644 --- a/weave/scorers/pydantic_scorer.py +++ b/weave/scorers/pydantic_scorer.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, ValidationError +import weave from weave.scorers.base_scorer import Scorer @@ -10,7 +11,8 @@ class PydanticScorer(Scorer): model: Type[BaseModel] - def score(self, output: Any) -> dict: # type: ignore + @weave.op + def score(self, output: Any) -> dict: if isinstance(output, str): try: self.model.model_validate_json(output) diff --git a/weave/scorers/string_scorer.py b/weave/scorers/string_scorer.py index 1cca2bf6ec86..83dec55c762a 100644 --- a/weave/scorers/string_scorer.py +++ b/weave/scorers/string_scorer.py @@ -9,7 +9,8 @@ class StringMatchScorer(Scorer): """Scorer that checks if the model output string is found in the search columns of the dataset row.""" - def score(self, output: str, target: str) -> dict: # type: ignore + @weave.op + def score(self, output: str, target: str) -> dict: string_in_input = output.lower() in target.lower() return {"string_in_input": string_in_input} @@ -20,7 +21,7 @@ class LevenshteinScorer(Scorer): ) @model_validator(mode="after") - def check_levenshtein(self): # type: ignore + def check_levenshtein(self) -> "LevenshteinScorer": try: from Levenshtein import distance diff --git a/weave/scorers/xml_scorer.py b/weave/scorers/xml_scorer.py index fc18188c2c3b..8545a96686ba 100644 --- a/weave/scorers/xml_scorer.py +++ b/weave/scorers/xml_scorer.py @@ -1,13 +1,15 @@ import xml.etree.ElementTree as ET from typing import Union +import weave from weave.scorers.base_scorer import Scorer class ValidXMLScorer(Scorer): """Score an XML string.""" - def score(self, output: Union[str, dict]) -> dict: # type: ignore + @weave.op + def score(self, output: Union[str, dict]) -> dict: if isinstance(output, 
dict): xml_string = output.get("output", "") else: From 2ac6f53439eac176e6820bcfb7fe1d6bbc232514 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Mon, 28 Oct 2024 16:17:34 +0100 Subject: [PATCH 150/150] missing imports --- docs/docs/guides/evaluation/scorers.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 4502087f9750..ce7ea3b86c15 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -41,6 +41,7 @@ Example: ```python +import weave from openai import OpenAI from weave import Scorer @@ -112,6 +113,7 @@ If you're using a class-based scorer, pass a dictionary to the `column_map` attr Example: ```python +import weave from weave import Scorer # A dataset with news articles to be summarised @@ -141,6 +143,9 @@ Now, the `text` argument in the `score` method will receive data from the `news_ - Another equivalent option to map your columns is to subclass the `Scorer` and overload the `score` method mapping the columns explicitly. ```python +import weave +from weave import Scorer + class MySummarizationScorer(SummarizationScorer): @weave.op