diff --git a/CHANGELOG.md b/CHANGELOG.md index 160214e0d..e4187f4b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ - Use a sample adjustment for the `var()` metric. - OpenAI: Native tool calling for o1-mini (upon initial release it required emulated tool calling like o1-preview). - Python and Bash tools: Add `sandbox` argument for running in non-default sandboxes. +- Transcript: Log `ScoreEvent` (with `intermediate=True`) when the `score()` function is called. +- Transcript: Add `source` field to `InfoEvent` and use it for events logged by the human agent. - Docker: Support Dockerfiles with `.Dockerfile` extension. - Docker: Raise error when there is an explicitly configured `container_name` (incompatible with epochs > 1). - Log: Validate that `log_dir` is writeable at startup. diff --git a/src/inspect_ai/_view/www/dist/assets/index.js b/src/inspect_ai/_view/www/dist/assets/index.js index 522580a65..6c776d95c 100644 --- a/src/inspect_ai/_view/www/dist/assets/index.js +++ b/src/inspect_ai/_view/www/dist/assets/index.js @@ -50618,7 +50618,7 @@ self.onmessage = function (e) { EventPanel, { id, - title: "Info", + title: "Info" + (event.source ? ": " + event.source : ""), className: className2, subTitle: formatDateTime(new Date(event.timestamp)), icon: ApplicationIcons.info, @@ -51022,7 +51022,7 @@ self.onmessage = function (e) { EventPanel, { id, - title: "Score", + title: (event.intermediate ? "Intermediate " : "") + "Score", className: clsx(className2, "text-size-small"), subTitle: formatDateTime(new Date(event.timestamp)), icon: ApplicationIcons.scorer, diff --git a/src/inspect_ai/_view/www/log-schema.json b/src/inspect_ai/_view/www/log-schema.json index 699941786..90a5a2fc2 100644 --- a/src/inspect_ai/_view/www/log-schema.json +++ b/src/inspect_ai/_view/www/log-schema.json @@ -2345,6 +2345,18 @@ "title": "Event", "type": "string" }, + "source": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Source" + }, "data": { "$ref": "#/$defs/JsonValue" } @@ -2353,6 +2365,7 @@ "timestamp", "pending", "event", + "source", "data" ], "title": "InfoEvent", @@ -3309,7 +3322,7 @@ "additionalProperties": false }, "ScoreEvent": { - "description": "Event with sample score.", + "description": "Event with score.\n\nCan be the final score for a `Sample`, or can be an intermediate score\nresulting from a call to `score`.", "properties": { "timestamp": { "format": "date-time", @@ -3354,6 +3367,11 @@ ], "default": null, "title": "Target" + }, + "intermediate": { + "default": false, + "title": "Intermediate", + "type": "boolean" } }, "required": [ @@ -3361,7 +3379,8 @@ "pending", "event", "score", - "target" + "target", + "intermediate" ], "title": "ScoreEvent", "type": "object", diff --git a/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx b/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx index 5b189f7f4..39c2ff0b6 100644 --- a/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx +++ b/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx @@ -1,6 +1,6 @@ import { ANSIColor, ANSIOutput, ANSIOutputRun, ANSIStyle } from "ansi-output"; import clsx from "clsx"; -import "./ANSIDisplay.css"; +import "./AnsiDisplay.css"; interface ANSIDisplayProps { output: string; diff --git a/src/inspect_ai/_view/www/src/components/JsonPanel.tsx b/src/inspect_ai/_view/www/src/components/JsonPanel.tsx index 3b57c8699..09633f19b 100644 --- a/src/inspect_ai/_view/www/src/components/JsonPanel.tsx +++ b/src/inspect_ai/_view/www/src/components/JsonPanel.tsx @@ -1,7 +1,7 @@ import clsx from "clsx"; import { highlightElement } from "prismjs"; import React, { useEffect, useMemo, useRef } from "react"; -import "./JSONPanel.css"; +import "./JsonPanel.css"; const kPrismRenderMaxSize = 250000; diff --git a/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx b/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx index f64c31d00..2a806aea4 100644 --- a/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +++ b/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx @@ -35,7 +35,7 @@ export const InfoEventView: React.FC = ({ return ( = ({ return ( None: self.name = name self._events: list[Event] = [] - def info(self, data: JsonValue) -> None: + def info(self, data: JsonValue, *, source: str | None = None) -> None: """Add an `InfoEvent` to the transcript. Args: - data (JsonValue): Data associated with the event. + data: Data associated with the event. + source: Optional event source. """ - self._event(InfoEvent(data=data)) + self._event(InfoEvent(source=source, data=data)) @contextlib.contextmanager def step(self, name: str, type: str | None = None) -> Iterator[None]: diff --git a/src/inspect_ai/scorer/_score.py b/src/inspect_ai/scorer/_score.py index f02ec7f66..0c1913158 100644 --- a/src/inspect_ai/scorer/_score.py +++ b/src/inspect_ai/scorer/_score.py @@ -23,6 +23,8 @@ async def score(state: TaskState) -> list[Score]: a task that does not have a scorer. """ + from inspect_ai.log._transcript import ScoreEvent, transcript + scorers = _scorers.get(None) target = _target.get(None) if scorers is None or target is None: @@ -30,7 +32,15 @@ async def score(state: TaskState) -> list[Score]: "The score() function can only be called while executing a task with a scorer." ) - return [await scorer(state, target) for scorer in scorers] + scores: list[Score] = [] + for scorer in scorers: + score = await scorer(state, target) + scores.append(score) + transcript()._event( + ScoreEvent(score=score, target=target.target, intermediate=True) + ) + + return scores def init_scoring_context(scorers: list[Scorer], target: Target) -> None: diff --git a/src/inspect_ai/solver/_human_agent/commands/clock.py b/src/inspect_ai/solver/_human_agent/commands/clock.py index 309d05148..76a1d8480 100644 --- a/src/inspect_ai/solver/_human_agent/commands/clock.py +++ b/src/inspect_ai/solver/_human_agent/commands/clock.py @@ -27,14 +27,10 @@ def cli(self, args: Namespace) -> None: print(call_human_agent("start")) def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]: - from inspect_ai.log._transcript import transcript - async def start() -> str: if not state.running: state.running = True - transcript().info( - f"Task started (total time: {format_progress_time(state.time)})" - ) + clock_action_event("start", state) return render_status(state) return start @@ -57,14 +53,22 @@ def cli(self, args: Namespace) -> None: print(call_human_agent("stop")) def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]: - from inspect_ai.log._transcript import transcript - async def stop() -> str: if state.running: state.running = False - transcript().info( - f"Task stopped (total time: {format_progress_time(state.time)})" - ) + clock_action_event("stop", state) return render_status(state) return stop + + +def clock_action_event(action: str, state: HumanAgentState) -> None: + from inspect_ai.log._transcript import transcript + + transcript().info( + { + "action": action, + "total_time": format_progress_time(state.time, False), + }, + source="human_agent", + ) diff --git a/src/inspect_ai/solver/_human_agent/commands/note.py b/src/inspect_ai/solver/_human_agent/commands/note.py index 5049af6fd..ffe69beb6 100644 --- a/src/inspect_ai/solver/_human_agent/commands/note.py +++ b/src/inspect_ai/solver/_human_agent/commands/note.py @@ -37,6 +37,6 @@ def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]] from inspect_ai.log._transcript import transcript async def note(content: str) -> None: - transcript().info(content) + transcript().info(content, source="human_agent") return note diff --git a/src/inspect_ai/solver/_human_agent/commands/score.py b/src/inspect_ai/solver/_human_agent/commands/score.py index e61776b45..424144ede 100644 --- a/src/inspect_ai/solver/_human_agent/commands/score.py +++ b/src/inspect_ai/solver/_human_agent/commands/score.py @@ -1,6 +1,5 @@ from argparse import Namespace from copy import deepcopy -from textwrap import dedent from typing import Awaitable, Callable, Literal from pydantic import JsonValue @@ -51,8 +50,6 @@ def cli(self, args: Namespace) -> None: def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]: async def score_task(answer: str | None) -> str: - from inspect_ai.log._transcript import transcript - # make a copy of TaskState, add the answer, then score if answer: task_state = deepcopy(self._state) @@ -64,14 +61,6 @@ async def score_task(answer: str | None) -> str: # record the scoring action in our state state.scorings.append(IntermediateScoring(time=state.time, scores=result)) - # record to transcript - transcript().info( - dedent(f""" - ### Intermediate Score - **Answer:** {result[0].answer}, **Score:** {result[0].as_str()} - """) - ) - # notify user return render_text( f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}" diff --git a/tools/vscode/src/@types/log.d.ts b/tools/vscode/src/@types/log.d.ts index b8e2bdd1e..d9c9a4df7 100644 --- a/tools/vscode/src/@types/log.d.ts +++ b/tools/vscode/src/@types/log.d.ts @@ -315,6 +315,7 @@ export type Timestamp8 = string; export type Pending8 = boolean | null; export type Event8 = "score"; export type Target2 = string | string[] | null; +export type Intermediate = boolean; export type Timestamp9 = string; export type Pending9 = boolean | null; export type Event9 = "error"; @@ -339,6 +340,7 @@ export type Lineno = number; export type Timestamp11 = string; export type Pending11 = boolean | null; export type Event11 = "info"; +export type Source4 = string | null; export type Timestamp12 = string; export type Pending12 = boolean | null; export type Event12 = "step"; @@ -1053,7 +1055,10 @@ export interface InputEvent { input_ansi: InputAnsi; } /** - * Event with sample score. + * Event with score. + * + * Can be the final score for a `Sample`, or can be an intermediate score + * resulting from a call to `score`. */ export interface ScoreEvent { timestamp: Timestamp8; @@ -1061,6 +1066,7 @@ export interface ScoreEvent { event: Event8; score: Score; target: Target2; + intermediate: Intermediate; } /** * Event with sample error. @@ -1099,6 +1105,7 @@ export interface InfoEvent { timestamp: Timestamp11; pending: Pending11; event: Event11; + source: Source4; data: JsonValue; } /**