Skip to content

Commit

Permalink
improved events for scoring / human agent (#1297)
Browse files Browse the repository at this point in the history
* initial work on human agent events

* display for info event sources and intermediate scores

* ruff lint

* Update CHANGELOG.md

---------

Co-authored-by: J.J. Allaire <[email protected]>
  • Loading branch information
jjallaire-aisi and jjallaire authored Feb 12, 2025
1 parent 40f78ec commit 944b61b
Show file tree
Hide file tree
Showing 14 changed files with 87 additions and 38 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
- Use a sample adjustment for the `var()` metric.
- OpenAI: Native tool calling for o1-mini (upon initial release it required emulated tool calling like o1-preview).
- Python and Bash tools: Add `sandbox` argument for running in non-default sandboxes.
- Transcript: Log `ScoreEvent` (with `intermediate=True`) when the `score()` function is called.
- Transcript: Add `source` field to `InfoEvent` and use it for events logged by the human agent.
- Docker: Support Dockerfiles with `.Dockerfile` extension.
- Docker: Raise error when there is an explicitly configured `container_name` (incompatible with epochs > 1).
- Log: Validate that `log_dir` is writeable at startup.
Expand Down
4 changes: 2 additions & 2 deletions src/inspect_ai/_view/www/dist/assets/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -50618,7 +50618,7 @@ self.onmessage = function (e) {
EventPanel,
{
id,
title: "Info",
title: "Info" + (event.source ? ": " + event.source : ""),
className: className2,
subTitle: formatDateTime(new Date(event.timestamp)),
icon: ApplicationIcons.info,
Expand Down Expand Up @@ -51022,7 +51022,7 @@ self.onmessage = function (e) {
EventPanel,
{
id,
title: "Score",
title: (event.intermediate ? "Intermediate " : "") + "Score",
className: clsx(className2, "text-size-small"),
subTitle: formatDateTime(new Date(event.timestamp)),
icon: ApplicationIcons.scorer,
Expand Down
23 changes: 21 additions & 2 deletions src/inspect_ai/_view/www/log-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2345,6 +2345,18 @@
"title": "Event",
"type": "string"
},
"source": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Source"
},
"data": {
"$ref": "#/$defs/JsonValue"
}
Expand All @@ -2353,6 +2365,7 @@
"timestamp",
"pending",
"event",
"source",
"data"
],
"title": "InfoEvent",
Expand Down Expand Up @@ -3309,7 +3322,7 @@
"additionalProperties": false
},
"ScoreEvent": {
"description": "Event with sample score.",
"description": "Event with score.\n\nCan be the final score for a `Sample`, or can be an intermediate score\nresulting from a call to `score`.",
"properties": {
"timestamp": {
"format": "date-time",
Expand Down Expand Up @@ -3354,14 +3367,20 @@
],
"default": null,
"title": "Target"
},
"intermediate": {
"default": false,
"title": "Intermediate",
"type": "boolean"
}
},
"required": [
"timestamp",
"pending",
"event",
"score",
"target"
"target",
"intermediate"
],
"title": "ScoreEvent",
"type": "object",
Expand Down
2 changes: 1 addition & 1 deletion src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { ANSIColor, ANSIOutput, ANSIOutputRun, ANSIStyle } from "ansi-output";
import clsx from "clsx";
import "./ANSIDisplay.css";
import "./AnsiDisplay.css";

interface ANSIDisplayProps {
output: string;
Expand Down
2 changes: 1 addition & 1 deletion src/inspect_ai/_view/www/src/components/JsonPanel.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import clsx from "clsx";
import { highlightElement } from "prismjs";
import React, { useEffect, useMemo, useRef } from "react";
import "./JSONPanel.css";
import "./JsonPanel.css";

const kPrismRenderMaxSize = 250000;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ export const InfoEventView: React.FC<InfoEventViewProps> = ({
return (
<EventPanel
id={id}
title="Info"
title={"Info" + (event.source ? ": " + event.source : "")}
className={className}
subTitle={formatDateTime(new Date(event.timestamp))}
icon={ApplicationIcons.info}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export const ScoreEventView: React.FC<ScoreEventViewProps> = ({
return (
<EventPanel
id={id}
title="Score"
title={(event.intermediate ? "Intermediate " : "") + "Score"}
className={clsx(className, "text-size-small")}
subTitle={formatDateTime(new Date(event.timestamp))}
icon={ApplicationIcons.scorer}
Expand Down
9 changes: 8 additions & 1 deletion src/inspect_ai/_view/www/src/types/log.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ export type Timestamp8 = string;
export type Pending8 = boolean | null;
export type Event8 = "score";
export type Target2 = string | string[] | null;
export type Intermediate = boolean;
export type Timestamp9 = string;
export type Pending9 = boolean | null;
export type Event9 = "error";
Expand All @@ -339,6 +340,7 @@ export type Lineno = number;
export type Timestamp11 = string;
export type Pending11 = boolean | null;
export type Event11 = "info";
export type Source4 = string | null;
export type Timestamp12 = string;
export type Pending12 = boolean | null;
export type Event12 = "step";
Expand Down Expand Up @@ -1053,14 +1055,18 @@ export interface InputEvent {
input_ansi: InputAnsi;
}
/**
 * Event with score.
 *
 * Can be the final score for a `Sample`, or can be an intermediate score
 * resulting from a call to `score`.
 */
export interface ScoreEvent {
timestamp: Timestamp8;
pending: Pending8;
/** Event type (the literal "score"). */
event: Event8;
/** Score value. */
score: Score;
/** Sample target. */
target: Target2;
/** Whether this was an intermediate scoring. */
intermediate: Intermediate;
}
/**
* Event with sample error.
Expand Down Expand Up @@ -1099,6 +1105,7 @@ export interface InfoEvent {
timestamp: Timestamp11;
pending: Pending11;
event: Event11;
source: Source4;
data: JsonValue;
}
/**
Expand Down
21 changes: 16 additions & 5 deletions src/inspect_ai/log/_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,9 @@ class InfoEvent(BaseEvent):
event: Literal["info"] = Field(default="info")
"""Event type."""

source: str | None = Field(default=None)
"""Optional source for info event."""

data: JsonValue
"""Data provided with event."""

Expand All @@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent):


class ScoreEvent(BaseEvent):
"""Event with sample score."""
"""Event with score.
Can be the final score for a `Sample`, or can be an intermediate score
resulting from a call to `score`.
"""

event: Literal["score"] = Field(default="score")
"""Event type."""

score: Score
"""Sample score."""
"""Score value."""

target: str | list[str] | None = Field(default=None)
"""Sample target."""

intermediate: bool = Field(default=False)
"""Was this an intermediate scoring?"""


class StepEvent(BaseEvent):
"""Step within current sample or subtask."""
Expand Down Expand Up @@ -355,13 +365,14 @@ def __init__(self, name: str = "") -> None:
self.name = name
self._events: list[Event] = []

def info(self, data: JsonValue) -> None:
def info(self, data: JsonValue, *, source: str | None = None) -> None:
"""Add an `InfoEvent` to the transcript.
Args:
data (JsonValue): Data associated with the event.
data: Data associated with the event.
source: Optional event source.
"""
self._event(InfoEvent(data=data))
self._event(InfoEvent(source=source, data=data))

@contextlib.contextmanager
def step(self, name: str, type: str | None = None) -> Iterator[None]:
Expand Down
12 changes: 11 additions & 1 deletion src/inspect_ai/scorer/_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,24 @@ async def score(state: TaskState) -> list[Score]:
a task that does not have a scorer.
"""
from inspect_ai.log._transcript import ScoreEvent, transcript

scorers = _scorers.get(None)
target = _target.get(None)
if scorers is None or target is None:
raise RuntimeError(
"The score() function can only be called while executing a task with a scorer."
)

return [await scorer(state, target) for scorer in scorers]
scores: list[Score] = []
for scorer in scorers:
score = await scorer(state, target)
scores.append(score)
transcript()._event(
ScoreEvent(score=score, target=target.target, intermediate=True)
)

return scores


def init_scoring_context(scorers: list[Scorer], target: Target) -> None:
Expand Down
24 changes: 14 additions & 10 deletions src/inspect_ai/solver/_human_agent/commands/clock.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,10 @@ def cli(self, args: Namespace) -> None:
print(call_human_agent("start"))

def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
from inspect_ai.log._transcript import transcript

async def start() -> str:
if not state.running:
state.running = True
transcript().info(
f"Task started (total time: {format_progress_time(state.time)})"
)
clock_action_event("start", state)
return render_status(state)

return start
Expand All @@ -57,14 +53,22 @@ def cli(self, args: Namespace) -> None:
print(call_human_agent("stop"))

def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
from inspect_ai.log._transcript import transcript

async def stop() -> str:
if state.running:
state.running = False
transcript().info(
f"Task stopped (total time: {format_progress_time(state.time)})"
)
clock_action_event("stop", state)
return render_status(state)

return stop


def clock_action_event(action: str, state: HumanAgentState) -> None:
    """Record a human agent clock action in the transcript.

    Emits an info event with source "human_agent" whose data contains the
    action name and the formatted value of `state.time`.

    Args:
        action: Name of the clock action being recorded.
        state: Human agent state whose `time` is reported as `total_time`.
    """
    # imported locally, matching how this module accesses the transcript elsewhere
    from inspect_ai.log._transcript import transcript

    recorder = transcript()
    recorder.info(
        data={
            "action": action,
            "total_time": format_progress_time(state.time, False),
        },
        source="human_agent",
    )
2 changes: 1 addition & 1 deletion src/inspect_ai/solver/_human_agent/commands/note.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]
from inspect_ai.log._transcript import transcript

async def note(content: str) -> None:
transcript().info(content)
transcript().info(content, source="human_agent")

return note
11 changes: 0 additions & 11 deletions src/inspect_ai/solver/_human_agent/commands/score.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from argparse import Namespace
from copy import deepcopy
from textwrap import dedent
from typing import Awaitable, Callable, Literal

from pydantic import JsonValue
Expand Down Expand Up @@ -51,8 +50,6 @@ def cli(self, args: Namespace) -> None:

def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
async def score_task(answer: str | None) -> str:
from inspect_ai.log._transcript import transcript

# make a copy of TaskState, add the answer, then score
if answer:
task_state = deepcopy(self._state)
Expand All @@ -64,14 +61,6 @@ async def score_task(answer: str | None) -> str:
# record the scoring action in our state
state.scorings.append(IntermediateScoring(time=state.time, scores=result))

# record to transcript
transcript().info(
dedent(f"""
### Intermediate Score
**Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
""")
)

# notify user
return render_text(
f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
Expand Down
9 changes: 8 additions & 1 deletion tools/vscode/src/@types/log.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ export type Timestamp8 = string;
export type Pending8 = boolean | null;
export type Event8 = "score";
export type Target2 = string | string[] | null;
export type Intermediate = boolean;
export type Timestamp9 = string;
export type Pending9 = boolean | null;
export type Event9 = "error";
Expand All @@ -339,6 +340,7 @@ export type Lineno = number;
export type Timestamp11 = string;
export type Pending11 = boolean | null;
export type Event11 = "info";
export type Source4 = string | null;
export type Timestamp12 = string;
export type Pending12 = boolean | null;
export type Event12 = "step";
Expand Down Expand Up @@ -1053,14 +1055,18 @@ export interface InputEvent {
input_ansi: InputAnsi;
}
/**
 * Event with score.
 *
 * Can be the final score for a `Sample`, or can be an intermediate score
 * resulting from a call to `score`.
 */
export interface ScoreEvent {
timestamp: Timestamp8;
pending: Pending8;
/** Event type (the literal "score"). */
event: Event8;
/** Score value. */
score: Score;
/** Sample target. */
target: Target2;
/** Whether this was an intermediate scoring. */
intermediate: Intermediate;
}
/**
* Event with sample error.
Expand Down Expand Up @@ -1099,6 +1105,7 @@ export interface InfoEvent {
timestamp: Timestamp11;
pending: Pending11;
event: Event11;
source: Source4;
data: JsonValue;
}
/**
Expand Down

0 comments on commit 944b61b

Please sign in to comment.