diff --git a/.changeset/curvy-pillows-attack.md b/.changeset/curvy-pillows-attack.md index 5b12acb56..f94308c4a 100644 --- a/.changeset/curvy-pillows-attack.md +++ b/.changeset/curvy-pillows-attack.md @@ -2,6 +2,6 @@ "@browserbasehq/stagehand": patch --- -Fix non-CUA `agent.execute()` reporting a successful run as failed. After the agent finished all of its work, the forced "done" finalization step (`handleDoneToolCall`) re-submitted the accumulated run history to the model; with some providers (e.g. reasoning models like `openai/gpt-5.x`) that history is rejected by the AI SDK (`Invalid prompt: messages must be a ModelMessage[]`), which surfaced as a red error and flipped the result to `{ success: false }` even though every action had already completed. +Fix non-CUA `agent.execute()` reporting a successful run as failed. After the agent finished all of its work, the forced "done" finalization step (`handleDoneToolCall`) re-submitted the accumulated run history to the model. When a custom tool returned an object with an optional field left `undefined` (e.g. `{ matchedExpected: undefined }`), that `undefined` ended up inside a tool-result `output.value`, which the AI SDK's prompt validation rejects (its JSON-value schema disallows `undefined`), throwing `Invalid prompt: messages must be a ModelMessage[]`. This surfaced as a red error and flipped the result to `{ success: false }` even though every action had already completed (STG-2335). -The finalization "done" call is now best-effort: if it throws, the agent logs a warning (with the underlying cause) and synthesizes a completion from the run instead of failing it. Also hardens `handleDoneToolCall` against a missing `toolCalls` array. +Root cause fix: deep-strip `undefined` values from the run history before re-submitting it to the forced "done" finalization call, keeping the messages valid without dropping any real content. Class instances (URL, typed arrays for binary data, etc.) are passed through untouched. diff --git a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts index 08cb542e4..3e9319c19 100644 --- a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts +++ b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts @@ -17,6 +17,46 @@ interface DoneResult { output?: Record; } +/** + * Deep-remove `undefined` values from the run history before it is re-submitted + * to the forced "done" call. + * + * The AI SDK validates re-submitted messages against its `ModelMessage` schema, + * whose JSON-value schema rejects `undefined` (only null/string/number/boolean/ + * object/array are allowed). A custom tool that returns an object with an + * optional field left `undefined` (e.g. `{ matchedExpected: undefined }`) lands + * that `undefined` inside a tool-result `output.value`, and reasoning models can + * leave `undefined` in part fields too. Either makes the forced "done" call + * throw `Invalid prompt: messages must be a ModelMessage[]` (STG-2335) — a red + * error that fires after the run has already completed. Stripping `undefined` + * keeps the history valid without dropping any real content. + * + * Only plain objects and arrays are traversed; class instances (URL, typed + * arrays for binary image data, Date, …) are passed through untouched. + */ +function stripUndefinedDeep(value: T): T { + if (Array.isArray(value)) { + return value.map((v) => stripUndefinedDeep(v)) as unknown as T; + } + if (value !== null && typeof value === "object") { + const proto = Object.getPrototypeOf(value); + if (proto === Object.prototype || proto === null) { + const out: Record = {}; + for (const [k, v] of Object.entries(value as Record)) { + if (v !== undefined) out[k] = stripUndefinedDeep(v); + } + return out as T; + } + } + return value; +} + +export function sanitizeMessagesForResubmission( + messages: ModelMessage[], +): ModelMessage[] { + return stripUndefinedDeep(messages); +} + function buildBaseDoneSchema(factory: typeof z) { return factory.object({ reasoning: factory @@ -114,7 +154,7 @@ Call the "done" tool with: const result = await generateText({ model, system: systemPrompt, - messages: [...inputMessages, userPrompt], + messages: [...sanitizeMessagesForResubmission(inputMessages), userPrompt], tools: { done: doneTool } as ToolSet, toolChoice: rejectsForcedToolUse(modelId) ? "auto" diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index e85acf78c..db39aced6 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -831,33 +831,13 @@ export class V3AgentHandler { ): Promise<{ messages: ModelMessage[]; output?: Record }> { if (state.completed) return { messages }; - let doneResult: Awaited>; - try { - doneResult = await handleDoneToolCall({ - model, - inputMessages: messages, - instruction, - outputSchema, - logger, - }); - } catch (error) { - // The forced "done" call only summarizes the run, so its failure must not - // fail a run whose work already completed (e.g. a provider rejecting the - // re-submitted history). Warn and synthesize a completion. We log only the - // message, not the cause — the cause embeds the full history (base64 - // images included) and would bloat the log. - logger?.({ - category: "agent", - level: 1, - message: `Agent "done" finalization call failed; using run summary instead: ${getErrorMessage(error)}`, - }); - state.completed = true; - state.finalMessage = - state.finalMessage || - state.collectedReasoning.join(" ").trim() || - "Task execution completed"; - return { messages }; - } + const doneResult = await handleDoneToolCall({ + model, + inputMessages: messages, + instruction, + outputSchema, + logger, + }); state.completed = doneResult.taskComplete; state.finalMessage = doneResult.reasoning; diff --git a/packages/core/tests/unit/agent-finalization-resilience.test.ts b/packages/core/tests/unit/agent-finalization-resilience.test.ts index 6e4eed8a9..2815a633c 100644 --- a/packages/core/tests/unit/agent-finalization-resilience.test.ts +++ b/packages/core/tests/unit/agent-finalization-resilience.test.ts @@ -1,198 +1,111 @@ -import { beforeEach, describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; +import { generateText, type ModelMessage } from "ai"; import type { LanguageModelV2 } from "@ai-sdk/provider"; -import type { LLMClient } from "../../lib/v3/llm/LLMClient.js"; -import type { LogLine } from "../../lib/v3/types/public/logs.js"; -import type { V3 } from "../../lib/v3/v3.js"; - -// Make the module-level "ai" generateText (used only by the forced "done" -// finalization call) throw, while the main loop uses the injected LLMClient. -// Mirrors the provider rejecting the re-submitted history (STG-2335). -const finalizationError = Object.assign( - new Error( - "Invalid prompt: The messages must be a ModelMessage[]. If you have " + - "passed a UIMessage[], you can use convertToModelMessages to convert them.", - ), - { cause: new Error("validation failed at messages[3]") }, -); - -vi.mock("ai", async () => { - const actual = await vi.importActual("ai"); - return { - ...actual, - wrapLanguageModel: vi.fn(({ model }) => model), - generateText: vi.fn(async () => { - throw finalizationError; - }), - }; -}); - -import { V3AgentHandler } from "../../lib/v3/handlers/v3AgentHandler.js"; - -type AgentLlmOptions = { - onStepFinish?: (step: unknown) => Promise | void; - onFinish?: (event: unknown) => void; -}; - -const usage = { - inputTokens: 1, - outputTokens: 1, - reasoningTokens: 0, - cachedInputTokens: 0, - totalTokens: 2, -}; - -const emptyList = () => [] as unknown[]; - -// End-of-run step with no `done` call, so state.completed stays false and -// ensureDone proceeds to the forced finalization call (which throws). -function createNonDoneStep() { - return { - content: emptyList(), - text: "", - reasoning: emptyList(), - reasoningText: undefined as string | undefined, - files: emptyList(), - sources: emptyList(), - toolCalls: emptyList(), - staticToolCalls: emptyList(), - dynamicToolCalls: emptyList(), - toolResults: emptyList(), - staticToolResults: emptyList(), - dynamicToolResults: emptyList(), - finishReason: "stop", - usage, - warnings: undefined as unknown, - request: {}, - response: { - id: "response-id", - modelId: "openai/gpt-5-mini", - timestamp: new Date(0), - messages: emptyList(), - }, - providerMetadata: undefined as unknown, - }; -} - -function createGenerateResult(step: ReturnType) { - return { - content: emptyList(), - text: "", - reasoning: emptyList(), - reasoningText: undefined as string | undefined, - files: emptyList(), - sources: emptyList(), - toolCalls: emptyList(), - staticToolCalls: emptyList(), - dynamicToolCalls: emptyList(), - toolResults: emptyList(), - staticToolResults: emptyList(), - dynamicToolResults: emptyList(), - finishReason: "stop", - usage, - totalUsage: usage, - warnings: undefined as unknown, - request: {}, - response: { - id: "response-id", - modelId: "openai/gpt-5-mini", - timestamp: new Date(0), - messages: emptyList(), - }, - providerMetadata: undefined as unknown, - steps: [step], - experimental_output: undefined as unknown, - }; -} - -function createV3() { - const page = { - url: () => "https://example.com", - enableCursorOverlay: vi.fn(async () => {}), - }; - - return { - context: { - awaitActivePage: vi.fn(async () => page), - }, - isCaptchaAutoSolveEnabled: false, - browserbaseApiKey: undefined, - logger: vi.fn(), - recordAgentReplayStep: vi.fn(), - updateMetrics: vi.fn(), - act: vi.fn(), - extract: vi.fn(), - observe: vi.fn(), - } as unknown as V3; -} - -function createLlmClient() { - const model = { - modelId: "openai/gpt-5-mini", - provider: "openai", - specificationVersion: "v2", - } as unknown as LanguageModelV2; - - const generateText = vi.fn(async (options: AgentLlmOptions) => { - const step = createNonDoneStep(); - await options.onStepFinish?.(step); - return createGenerateResult(step); +import { sanitizeMessagesForResubmission } from "../../lib/v3/agent/utils/handleDoneToolCall.js"; + +// A minimal mock model. generateText runs the AI SDK's prompt validation +// (standardizePrompt) before ever reaching the model, so this is enough to +// reproduce STG-2335: the forced "done" finalization re-submits the run +// history, and a tool-result whose output value contains an `undefined` field +// trips that validation with "Invalid prompt: messages must be a +// ModelMessage[]" — the AI SDK's JSON-value schema rejects `undefined`. +const mockModel = { + specificationVersion: "v2", + provider: "mock", + modelId: "mock", + supportedUrls: {}, + async doGenerate() { + return { + finishReason: "stop" as const, + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + content: [{ type: "text" as const, text: "ok" }], + warnings: [] as [], + }; + }, +} as unknown as LanguageModelV2; + +// Mirrors PermitFlow's custom tool: an optional field (`matchedExpected`) is +// left `undefined` in the tool result, so the re-submitted tool-result carries +// an `undefined` inside output.value. Also includes a valid reasoning part +// (text: "") to confirm sanitization leaves real content intact. +const malformedHistory = [ + { role: "user", content: "do the task" }, + { + role: "assistant", + content: [ + { + type: "reasoning", + text: "", + providerOptions: { openai: { itemId: "rs_1" } }, + }, + { type: "tool-call", toolCallId: "c1", toolName: "captureField", input: {} }, + ], + }, + { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "c1", + toolName: "captureField", + output: { + type: "json", + value: { success: true, value: "permit", matchedExpected: undefined }, + }, + }, + ], + }, +] as unknown as ModelMessage[]; + +describe("v3 agent finalization: tool-result re-submission (STG-2335)", () => { + it("reproduces the InvalidPromptError when an undefined tool-result field is re-submitted", async () => { + await expect( + generateText({ model: mockModel, messages: malformedHistory }), + ).rejects.toThrow(/must be a ModelMessage\[\]/); }); - return { - client: { - getLanguageModel: vi.fn(() => model), - generateText, - } as unknown as LLMClient, - generateText, - }; -} - -describe("v3 agent finalization resilience", () => { - let logger: (line: LogLine) => void; - - beforeEach(() => { - logger = vi.fn(); + it("re-submission succeeds once undefined values are stripped", async () => { + const result = await generateText({ + model: mockModel, + messages: sanitizeMessagesForResubmission(malformedHistory), + }); + expect(result.text).toBe("ok"); }); - it("returns a successful result when the forced done call fails", async () => { - const { client } = createLlmClient(); - const handler = new V3AgentHandler(createV3(), logger, client); + it("drops undefined fields but preserves all real content", () => { + const cleaned = sanitizeMessagesForResubmission(malformedHistory); - const result = await handler.execute({ - instruction: "finish", - maxSteps: 1, - excludeTools: ["search"], - }); + expect(cleaned).toHaveLength(3); + expect(cleaned[0]).toEqual({ role: "user", content: "do the task" }); - // A finalization failure must NOT flip a completed run to failure. - expect(result.success).toBe(true); - expect(result.completed).toBe(true); - expect(result.message).not.toMatch(/^Failed to execute task/); - expect(result.message.length).toBeGreaterThan(0); - }); - - it("logs the finalization failure as a concise warning, not a red error", async () => { - const lines: LogLine[] = []; - const captureLogger = (line: LogLine) => { - lines.push(line); - }; - const { client } = createLlmClient(); - const handler = new V3AgentHandler(createV3(), captureLogger, client); + // reasoning + tool-call survive on the assistant message. + const assistant = cleaned[1].content as Array<{ type: string }>; + expect(assistant.map((p) => p.type)).toEqual(["reasoning", "tool-call"]); - await handler.execute({ - instruction: "finish", - maxSteps: 1, - excludeTools: ["search"], - }); + // tool-result value keeps real fields, drops the undefined one. + const toolResult = ( + cleaned[2].content as Array<{ output: { value: Record } }> + )[0]; + expect(toolResult.output.value).toEqual({ success: true, value: "permit" }); + expect("matchedExpected" in toolResult.output.value).toBe(false); + }); - const warning = lines.find((l) => - l.message.includes('Agent "done" finalization call failed'), - ); - expect(warning).toBeDefined(); - // Warning (level 1), not error (level 0) — the run still succeeded. - expect(warning?.level).toBe(1); - // Cause is not logged (it would bloat the log); message stays short. - expect(warning?.auxiliary).toBeUndefined(); - expect(warning?.message.length).toBeLessThan(500); + it("leaves class instances (URL, typed arrays) untouched", () => { + const url = new URL("https://example.com"); + const bytes = new Uint8Array([1, 2, 3]); + const messages = [ + { + role: "user", + content: [ + { type: "file", data: url, mediaType: "text/plain" }, + { type: "file", data: bytes, mediaType: "application/octet-stream" }, + ], + }, + ] as unknown as ModelMessage[]; + + const cleaned = sanitizeMessagesForResubmission(messages); + const parts = cleaned[0].content as Array<{ data: unknown }>; + expect(parts[0].data).toBeInstanceOf(URL); + expect(parts[1].data).toBeInstanceOf(Uint8Array); }); });