Skip to content

Commit

Permalink
Collect / Display Reasoning Tokens For Samples, Evals (#1417)
Browse files Browse the repository at this point in the history
* update schema

* Fix time formatting

* Correct modelusage types, output

* record reasoning in overall model usage

* Display reasoning in model usage panel

* Correct vscode dark mode api call display

* Show tool call function call if there is no explicit input and we’re not compact

---------

Co-authored-by: jjallaire <[email protected]>
  • Loading branch information
dragonstyle and jjallaire authored Feb 27, 2025
1 parent 4a57309 commit dd80f0b
Show file tree
Hide file tree
Showing 14 changed files with 127 additions and 52 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
- Bugfix: Catch standard `TimeoutError` for running shell commands in the computer tool container.
- Inspect View: Fix layout issues with human agent terminal session playback.
- Inspect View: Improve tool input / output appearance when rendered in VSCode.
- Inspect View: Display reasoning tokens in model usage for the samples and for the complete eval.
- Inspect View: Improve model api request / response output when rendered in VSCode.
- Inspect View: Improve rendering of some tool calls in the transcript.
- Bugfix: Fix audio and video inputs for new Google GenAI client.
- Bugfix: Ensure that token limits are not enforced during model graded scoring.

Expand Down
2 changes: 2 additions & 0 deletions src/inspect_ai/_view/www/App.css
Original file line number Diff line number Diff line change
Expand Up @@ -811,12 +811,14 @@ pre[class*="language-"].tool-output,
background-color: #f8f8f8;
}

.vscode-dark .model-call pre[class*="language-"],
.vscode-dark .markdown-content pre[class*="language-"],
.vscode-dark pre[class*="language-"].tool-output,
.vscode-dark .tool-output {
background-color: #333333;
}

.model-call pre[class*="language-"],
.markdown-content pre[class*="language-"],
pre[class*="language-"].tool-output {
border: none !important;
Expand Down
16 changes: 11 additions & 5 deletions src/inspect_ai/_view/www/dist/assets/index.css

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 45 additions & 27 deletions src/inspect_ai/_view/www/dist/assets/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14273,7 +14273,7 @@ var require_assets = __commonJS({
if (seconds < 60) {
return `${formatPrettyDecimal(seconds, 1)} sec`;
} else if (seconds < 60 * 60) {
return `${Math.floor(seconds / 60)} min ${seconds % 60} sec`;
return `${Math.floor(seconds / 60)} min ${Math.floor(seconds % 60)} sec`;
} else if (seconds < 60 * 60 * 24) {
const hours = Math.floor(seconds / (60 * 60));
const minutes = Math.floor(seconds % (60 * 60) / 60);
Expand Down Expand Up @@ -14428,12 +14428,12 @@ var require_assets = __commonJS({
}
};
const container$c = "_container_w37fs_1";
const padded$1 = "_padded_w37fs_8";
const padded$2 = "_padded_w37fs_8";
const key$1 = "_key_w37fs_12";
const value$1 = "_value_w37fs_16";
const styles$14 = {
container: container$c,
padded: padded$1,
padded: padded$2,
key: key$1,
value: value$1
};
Expand Down Expand Up @@ -21653,14 +21653,15 @@ var require_assets = __commonJS({
}
const collapse = Array.isArray(output2) ? output2.every((item2) => !isContentImage(item2)) : !isContentImage(output2);
const normalizedContent = reactExports.useMemo(() => normalizeContent$1(output2), [output2]);
const contents2 = mode !== "compact" ? input2 : input2 || functionCall;
return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
mode !== "compact" && (!view || view.title) ? /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTitle, { title: (view == null ? void 0 : view.title) || functionCall }) : "",
/* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
/* @__PURE__ */ jsxRuntimeExports.jsx(
ToolInput,
{
highlightLanguage,
contents: input2,
contents: contents2,
toolCallView: view
}
),
Expand Down Expand Up @@ -49722,29 +49723,44 @@ self.onmessage = function (e) {
}
);
};
const wrapper$2 = "_wrapper_b0it4_1";
const col2$1 = "_col2_b0it4_8";
const col1_3 = "_col1_3_b0it4_12";
const col3 = "_col3_b0it4_16";
const separator$2 = "_separator_b0it4_20";
const wrapper$2 = "_wrapper_sq96g_1";
const col2$1 = "_col2_sq96g_8";
const col1_3 = "_col1_3_sq96g_12";
const col3 = "_col3_sq96g_16";
const separator$2 = "_separator_sq96g_20";
const padded$1 = "_padded_sq96g_26";
const styles$G = {
wrapper: wrapper$2,
col2: col2$1,
col1_3,
col3,
separator: separator$2
separator: separator$2,
padded: padded$1
};
const ModelUsagePanel = ({ usage }) => {
if (!usage) {
return null;
}
const rows = [
{
label: "input",
value: usage.input_tokens,
secondary: false
}
];
const rows = [];
if (usage.reasoning_tokens) {
rows.push({
label: "Reasoning",
value: usage.reasoning_tokens,
secondary: false,
bordered: true
});
rows.push({
label: "---",
value: void 0,
secondary: false,
padded: true
});
}
rows.push({
label: "input",
value: usage.input_tokens,
secondary: false
});
if (usage.input_tokens_cache_read) {
rows.push({
label: "cache_read",
Expand Down Expand Up @@ -49777,7 +49793,16 @@ self.onmessage = function (e) {
});
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$G.wrapper), children: rows.map((row2, idx) => {
if (row2.label === "---") {
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$G.separator }, `$usage-sep-${idx}`);
return /* @__PURE__ */ jsxRuntimeExports.jsx(
"div",
{
className: clsx(
styles$G.separator,
row2.padded ? styles$G.padded : void 0
)
},
`$usage-sep-${idx}`
);
} else {
return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [
/* @__PURE__ */ jsxRuntimeExports.jsx(
Expand Down Expand Up @@ -49882,14 +49907,7 @@ self.onmessage = function (e) {
return /* @__PURE__ */ jsxRuntimeExports.jsxs(TokenTable, { className: className2, children: [
/* @__PURE__ */ jsxRuntimeExports.jsx(TokenHeader, {}),
/* @__PURE__ */ jsxRuntimeExports.jsx("tbody", { children: Object.keys(model_usage).map((key2) => {
return /* @__PURE__ */ jsxRuntimeExports.jsx(
TokenRow,
{
model: `${key2}-token-row`,
usage: model_usage[key2]
},
key2
);
return /* @__PURE__ */ jsxRuntimeExports.jsx(TokenRow, { model: key2, usage: model_usage[key2] }, key2);
}) })
] });
};
Expand Down Expand Up @@ -53760,7 +53778,7 @@ self.onmessage = function (e) {
if (!contents2) {
return null;
}
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: styles$t.codePre, children: /* @__PURE__ */ jsxRuntimeExports.jsx(
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("model-call"), children: /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: clsx(styles$t.codePre), children: /* @__PURE__ */ jsxRuntimeExports.jsx(
"code",
{
id,
Expand Down
15 changes: 14 additions & 1 deletion src/inspect_ai/_view/www/log-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -3200,6 +3200,18 @@
],
"default": null,
"title": "Input Tokens Cache Read"
},
"reasoning_tokens": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"title": "Reasoning Tokens"
}
},
"title": "ModelUsage",
Expand All @@ -3209,7 +3221,8 @@
"output_tokens",
"total_tokens",
"input_tokens_cache_write",
"input_tokens_cache_read"
"input_tokens_cache_read",
"reasoning_tokens"
],
"additionalProperties": false
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
: !isContentImage(output);
const normalizedContent = useMemo(() => normalizeContent(output), [output]);

const contents = mode !== "compact" ? input : input || functionCall;

return (
<div>
{mode !== "compact" && (!view || view.title) ? (
Expand All @@ -94,7 +96,7 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
<div>
<ToolInput
highlightLanguage={highlightLanguage}
contents={input}
contents={contents}
toolCallView={view}
/>
<ExpandablePanel collapse={collapse} border={true} lines={15}>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ export const APICodeCell: FC<APICodeCellProps> = ({ id, contents }) => {
}

return (
<div>
<pre className={styles.codePre}>
<div className={clsx("model-call")}>
<pre className={clsx(styles.codePre)}>
<code
id={id}
ref={codeRef}
Expand Down
3 changes: 3 additions & 0 deletions src/inspect_ai/_view/www/src/types/log.d.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/* eslint-disable */
/**
* This file was automatically generated by json-schema-to-typescript.
* DO NOT MODIFY IT BY HAND. Instead, modify the source JSONSchema file,
Expand Down Expand Up @@ -122,6 +123,7 @@ export type OutputTokens = number;
export type TotalTokens = number;
export type InputTokensCacheWrite = number | null;
export type InputTokensCacheRead = number | null;
export type ReasoningTokens1 = number | null;
export type Message = string;
export type Traceback = string;
export type TracebackAnsi = string;
Expand Down Expand Up @@ -735,6 +737,7 @@ export interface ModelUsage1 {
total_tokens: TotalTokens;
input_tokens_cache_write: InputTokensCacheWrite;
input_tokens_cache_read: InputTokensCacheRead;
reasoning_tokens: ReasoningTokens1;
}
/**
* Eval error details.
Expand Down
11 changes: 3 additions & 8 deletions src/inspect_ai/_view/www/src/usage/ModelTokenTable.tsx
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { FC } from "react";
import { ModelUsage, ModelUsage2 } from "../types/log";
import { TokenHeader, TokenRow, TokenTable } from "./TokenTable";

interface ModelTokenTableProps {
model_usage: any;
model_usage: ModelUsage | ModelUsage2;
className?: string | string[];
}

Expand All @@ -15,13 +16,7 @@ export const ModelTokenTable: FC<ModelTokenTableProps> = ({
<TokenHeader />
<tbody>
{Object.keys(model_usage).map((key) => {
return (
<TokenRow
key={key}
model={`${key}-token-row`}
usage={model_usage[key]}
/>
);
return <TokenRow key={key} model={key} usage={model_usage[key]} />;
})}
</tbody>
</TokenTable>
Expand Down
4 changes: 4 additions & 0 deletions src/inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@
height: 1px;
background-color: var(--bs-light-border-subtle);
}

/* Adds space below an element; ModelUsagePanel combines this with the
   separator class for separator rows flagged as padded. */
.padded {
margin-bottom: 1em;
}
37 changes: 30 additions & 7 deletions src/inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ interface ModelUsageRow {
value?: number;
secondary?: boolean;
bordered?: boolean;
padded?: boolean;
}

/**
Expand All @@ -23,13 +24,29 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
return null;
}

const rows: ModelUsageRow[] = [
{
label: "input",
value: usage.input_tokens,
const rows: ModelUsageRow[] = [];

if (usage.reasoning_tokens) {
rows.push({
label: "Reasoning",
value: usage.reasoning_tokens,
secondary: false,
bordered: true,
});

rows.push({
label: "---",
value: undefined,
secondary: false,
},
];
padded: true,
});
}

rows.push({
label: "input",
value: usage.input_tokens,
secondary: false,
});

if (usage.input_tokens_cache_read) {
rows.push({
Expand Down Expand Up @@ -71,7 +88,13 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
{rows.map((row, idx) => {
if (row.label === "---") {
return (
<div key={`$usage-sep-${idx}`} className={styles.separator}></div>
<div
key={`$usage-sep-${idx}`}
className={clsx(
styles.separator,
row.padded ? styles.padded : undefined,
)}
></div>
);
} else {
return (
Expand Down
Loading

0 comments on commit dd80f0b

Please sign in to comment.