Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect / Display Reasoning Tokens For Samples, Evals #1417

Merged
merged 8 commits into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
- Bugfix: Ensure that token limits are not enforced during model graded scoring.
- Inspect View: Fix layout issues with human agent terminal session playback.
- Inspect View: Improve tool input / output appearance when rendered in VSCode.
- Inspect View: Display reasoning tokens in model usage for the samples and for the complete eval.
- Inspect View: Improve model api request / response output when rendered in VSCode.
- Inspect View: Improve rendering of some tool calls in the transcript.

## v0.3.70 (25 February 2025)

Expand Down
2 changes: 2 additions & 0 deletions src/inspect_ai/_view/www/App.css
Original file line number Diff line number Diff line change
Expand Up @@ -811,12 +811,14 @@ pre[class*="language-"].tool-output,
background-color: #f8f8f8;
}

/* Under .vscode-dark, give highlighted code blocks (model calls, markdown
   content, tool output) a dark grey background. */
.vscode-dark .model-call pre[class*="language-"],
.vscode-dark .markdown-content pre[class*="language-"],
.vscode-dark pre[class*="language-"].tool-output,
.vscode-dark .tool-output {
background-color: #333333;
}

.model-call pre[class*="language-"],
.markdown-content pre[class*="language-"],
pre[class*="language-"].tool-output {
border: none !important;
Expand Down
16 changes: 11 additions & 5 deletions src/inspect_ai/_view/www/dist/assets/index.css

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 45 additions & 27 deletions src/inspect_ai/_view/www/dist/assets/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14273,7 +14273,7 @@ var require_assets = __commonJS({
if (seconds < 60) {
return `${formatPrettyDecimal(seconds, 1)} sec`;
} else if (seconds < 60 * 60) {
return `${Math.floor(seconds / 60)} min ${seconds % 60} sec`;
return `${Math.floor(seconds / 60)} min ${Math.floor(seconds % 60)} sec`;
} else if (seconds < 60 * 60 * 24) {
const hours = Math.floor(seconds / (60 * 60));
const minutes = Math.floor(seconds % (60 * 60) / 60);
Expand Down Expand Up @@ -14428,12 +14428,12 @@ var require_assets = __commonJS({
}
};
const container$c = "_container_w37fs_1";
const padded$1 = "_padded_w37fs_8";
const padded$2 = "_padded_w37fs_8";
const key$1 = "_key_w37fs_12";
const value$1 = "_value_w37fs_16";
const styles$14 = {
container: container$c,
padded: padded$1,
padded: padded$2,
key: key$1,
value: value$1
};
Expand Down Expand Up @@ -21653,14 +21653,15 @@ var require_assets = __commonJS({
}
const collapse = Array.isArray(output2) ? output2.every((item2) => !isContentImage(item2)) : !isContentImage(output2);
const normalizedContent = reactExports.useMemo(() => normalizeContent$1(output2), [output2]);
const contents2 = mode !== "compact" ? input2 : input2 || functionCall;
return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
mode !== "compact" && (!view || view.title) ? /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTitle, { title: (view == null ? void 0 : view.title) || functionCall }) : "",
/* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
/* @__PURE__ */ jsxRuntimeExports.jsx(
ToolInput,
{
highlightLanguage,
contents: input2,
contents: contents2,
toolCallView: view
}
),
Expand Down Expand Up @@ -49722,29 +49723,44 @@ self.onmessage = function (e) {
}
);
};
const wrapper$2 = "_wrapper_b0it4_1";
const col2$1 = "_col2_b0it4_8";
const col1_3 = "_col1_3_b0it4_12";
const col3 = "_col3_b0it4_16";
const separator$2 = "_separator_b0it4_20";
const wrapper$2 = "_wrapper_sq96g_1";
const col2$1 = "_col2_sq96g_8";
const col1_3 = "_col1_3_sq96g_12";
const col3 = "_col3_sq96g_16";
const separator$2 = "_separator_sq96g_20";
const padded$1 = "_padded_sq96g_26";
const styles$G = {
wrapper: wrapper$2,
col2: col2$1,
col1_3,
col3,
separator: separator$2
separator: separator$2,
padded: padded$1
};
const ModelUsagePanel = ({ usage }) => {
if (!usage) {
return null;
}
const rows = [
{
label: "input",
value: usage.input_tokens,
secondary: false
}
];
const rows = [];
if (usage.reasoning_tokens) {
rows.push({
label: "Reasoning",
value: usage.reasoning_tokens,
secondary: false,
bordered: true
});
rows.push({
label: "---",
value: void 0,
secondary: false,
padded: true
});
}
rows.push({
label: "input",
value: usage.input_tokens,
secondary: false
});
if (usage.input_tokens_cache_read) {
rows.push({
label: "cache_read",
Expand Down Expand Up @@ -49777,7 +49793,16 @@ self.onmessage = function (e) {
});
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$G.wrapper), children: rows.map((row2, idx) => {
if (row2.label === "---") {
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$G.separator }, `$usage-sep-${idx}`);
return /* @__PURE__ */ jsxRuntimeExports.jsx(
"div",
{
className: clsx(
styles$G.separator,
row2.padded ? styles$G.padded : void 0
)
},
`$usage-sep-${idx}`
);
} else {
return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [
/* @__PURE__ */ jsxRuntimeExports.jsx(
Expand Down Expand Up @@ -49882,14 +49907,7 @@ self.onmessage = function (e) {
return /* @__PURE__ */ jsxRuntimeExports.jsxs(TokenTable, { className: className2, children: [
/* @__PURE__ */ jsxRuntimeExports.jsx(TokenHeader, {}),
/* @__PURE__ */ jsxRuntimeExports.jsx("tbody", { children: Object.keys(model_usage).map((key2) => {
return /* @__PURE__ */ jsxRuntimeExports.jsx(
TokenRow,
{
model: `${key2}-token-row`,
usage: model_usage[key2]
},
key2
);
return /* @__PURE__ */ jsxRuntimeExports.jsx(TokenRow, { model: key2, usage: model_usage[key2] }, key2);
}) })
] });
};
Expand Down Expand Up @@ -53760,7 +53778,7 @@ self.onmessage = function (e) {
if (!contents2) {
return null;
}
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: styles$t.codePre, children: /* @__PURE__ */ jsxRuntimeExports.jsx(
return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("model-call"), children: /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: clsx(styles$t.codePre), children: /* @__PURE__ */ jsxRuntimeExports.jsx(
"code",
{
id,
Expand Down
15 changes: 14 additions & 1 deletion src/inspect_ai/_view/www/log-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -3200,6 +3200,18 @@
],
"default": null,
"title": "Input Tokens Cache Read"
},
"reasoning_tokens": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"title": "Reasoning Tokens"
}
},
"title": "ModelUsage",
Expand All @@ -3209,7 +3221,8 @@
"output_tokens",
"total_tokens",
"input_tokens_cache_write",
"input_tokens_cache_read"
"input_tokens_cache_read",
"reasoning_tokens"
],
"additionalProperties": false
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
: !isContentImage(output);
const normalizedContent = useMemo(() => normalizeContent(output), [output]);

const contents = mode !== "compact" ? input : input || functionCall;

return (
<div>
{mode !== "compact" && (!view || view.title) ? (
Expand All @@ -94,7 +96,7 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
<div>
<ToolInput
highlightLanguage={highlightLanguage}
contents={input}
contents={contents}
toolCallView={view}
/>
<ExpandablePanel collapse={collapse} border={true} lines={15}>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ export const APICodeCell: FC<APICodeCellProps> = ({ id, contents }) => {
}

return (
<div>
<pre className={styles.codePre}>
<div className={clsx("model-call")}>
<pre className={clsx(styles.codePre)}>
<code
id={id}
ref={codeRef}
Expand Down
3 changes: 3 additions & 0 deletions src/inspect_ai/_view/www/src/types/log.d.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/* eslint-disable */
/**
* This file was automatically generated by json-schema-to-typescript.
* DO NOT MODIFY IT BY HAND. Instead, modify the source JSONSchema file,
Expand Down Expand Up @@ -122,6 +123,7 @@ export type OutputTokens = number;
export type TotalTokens = number;
export type InputTokensCacheWrite = number | null;
export type InputTokensCacheRead = number | null;
export type ReasoningTokens1 = number | null;
export type Message = string;
export type Traceback = string;
export type TracebackAnsi = string;
Expand Down Expand Up @@ -735,6 +737,7 @@ export interface ModelUsage1 {
total_tokens: TotalTokens;
input_tokens_cache_write: InputTokensCacheWrite;
input_tokens_cache_read: InputTokensCacheRead;
reasoning_tokens: ReasoningTokens1;
}
/**
* Eval error details.
Expand Down
11 changes: 3 additions & 8 deletions src/inspect_ai/_view/www/src/usage/ModelTokenTable.tsx
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { FC } from "react";
import { ModelUsage, ModelUsage2 } from "../types/log";
import { TokenHeader, TokenRow, TokenTable } from "./TokenTable";

interface ModelTokenTableProps {
model_usage: any;
model_usage: ModelUsage | ModelUsage2;
className?: string | string[];
}

Expand All @@ -15,13 +16,7 @@ export const ModelTokenTable: FC<ModelTokenTableProps> = ({
<TokenHeader />
<tbody>
{Object.keys(model_usage).map((key) => {
return (
<TokenRow
key={key}
model={`${key}-token-row`}
usage={model_usage[key]}
/>
);
return <TokenRow key={key} model={key} usage={model_usage[key]} />;
})}
</tbody>
</TokenTable>
Expand Down
4 changes: 4 additions & 0 deletions src/inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@
height: 1px;
background-color: var(--bs-light-border-subtle);
}

/* Adds extra space below an element; combined with .separator for rows
   flagged as padded in ModelUsagePanel. */
.padded {
margin-bottom: 1em;
}
37 changes: 30 additions & 7 deletions src/inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ interface ModelUsageRow {
value?: number;
secondary?: boolean;
bordered?: boolean;
padded?: boolean;
}

/**
Expand All @@ -23,13 +24,29 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
return null;
}

const rows: ModelUsageRow[] = [
{
label: "input",
value: usage.input_tokens,
const rows: ModelUsageRow[] = [];

if (usage.reasoning_tokens) {
rows.push({
label: "Reasoning",
value: usage.reasoning_tokens,
secondary: false,
bordered: true,
});

rows.push({
label: "---",
value: undefined,
secondary: false,
},
];
padded: true,
});
}

rows.push({
label: "input",
value: usage.input_tokens,
secondary: false,
});

if (usage.input_tokens_cache_read) {
rows.push({
Expand Down Expand Up @@ -71,7 +88,13 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
{rows.map((row, idx) => {
if (row.label === "---") {
return (
<div key={`$usage-sep-${idx}`} className={styles.separator}></div>
<div
key={`$usage-sep-${idx}`}
className={clsx(
styles.separator,
row.padded ? styles.padded : undefined,
)}
></div>
);
} else {
return (
Expand Down
Loading