From de7b101b88db8e0a73c7e5a9f99691431bdaba5d Mon Sep 17 00:00:00 2001 From: Alyssa Maruyama Date: Tue, 23 Jun 2026 16:43:26 -0700 Subject: [PATCH 1/4] fix(agent): don't fail a completed run when the forced "done" call errors Non-CUA agent.execute() ran a forced "done" finalization step that re-submits the accumulated run history to the model. With some providers (notably reasoning models like openai/gpt-5.x) that history is rejected by the AI SDK ("Invalid prompt: messages must be a ModelMessage[]"), which was logged as a red error and flipped the result to { success: false } even though every agent action had already completed successfully. Make finalization best-effort: if handleDoneToolCall throws, log a warning with the underlying cause and synthesize a completion from the run instead of failing it. Also null-guard result.toolCalls in handleDoneToolCall. Adds a regression unit test covering the finalization-failure path. Ref: Pylon #19999 / STG-2335 --- .changeset/curvy-pillows-attack.md | 7 + .../lib/v3/agent/utils/handleDoneToolCall.ts | 2 +- .../core/lib/v3/handlers/v3AgentHandler.ts | 34 ++- .../agent-finalization-resilience.test.ts | 198 ++++++++++++++++++ 4 files changed, 233 insertions(+), 8 deletions(-) create mode 100644 .changeset/curvy-pillows-attack.md create mode 100644 packages/core/tests/unit/agent-finalization-resilience.test.ts diff --git a/.changeset/curvy-pillows-attack.md b/.changeset/curvy-pillows-attack.md new file mode 100644 index 000000000..5b12acb56 --- /dev/null +++ b/.changeset/curvy-pillows-attack.md @@ -0,0 +1,7 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Fix non-CUA `agent.execute()` reporting a successful run as failed. After the agent finished all of its work, the forced "done" finalization step (`handleDoneToolCall`) re-submitted the accumulated run history to the model; with some providers (e.g. reasoning models like `openai/gpt-5.x`) that history is rejected by the AI SDK (`Invalid prompt: messages must be a ModelMessage[]`), which surfaced as a red error and flipped the result to `{ success: false }` even though every action had already completed. + +The finalization "done" call is now best-effort: if it throws, the agent logs a warning (with the underlying cause) and synthesizes a completion from the run instead of failing it. Also hardens `handleDoneToolCall` against a missing `toolCalls` array. diff --git a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts index 024a1e26e..08cb542e4 100644 --- a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts +++ b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts @@ -126,7 +126,7 @@ Call the "done" tool with: }, }); - const doneToolCall = result.toolCalls.find((tc) => tc.toolName === "done"); + const doneToolCall = result.toolCalls?.find((tc) => tc.toolName === "done"); const outputMessages: ModelMessage[] = [ userPrompt, ...(result.response?.messages || []), diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index db39aced6..e85acf78c 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -831,13 +831,33 @@ export class V3AgentHandler { ): Promise<{ messages: ModelMessage[]; output?: Record }> { if (state.completed) return { messages }; - const doneResult = await handleDoneToolCall({ - model, - inputMessages: messages, - instruction, - outputSchema, - logger, - }); + let doneResult: Awaited>; + try { + doneResult = await handleDoneToolCall({ + model, + inputMessages: messages, + instruction, + outputSchema, + logger, + }); + } catch (error) { + // The forced "done" call only summarizes the run, so its failure must not + // fail a run whose work already completed (e.g. a provider rejecting the + // re-submitted history). Warn and synthesize a completion. We log only the + // message, not the cause — the cause embeds the full history (base64 + // images included) and would bloat the log. + logger?.({ + category: "agent", + level: 1, + message: `Agent "done" finalization call failed; using run summary instead: ${getErrorMessage(error)}`, + }); + state.completed = true; + state.finalMessage = + state.finalMessage || + state.collectedReasoning.join(" ").trim() || + "Task execution completed"; + return { messages }; + } state.completed = doneResult.taskComplete; state.finalMessage = doneResult.reasoning; diff --git a/packages/core/tests/unit/agent-finalization-resilience.test.ts b/packages/core/tests/unit/agent-finalization-resilience.test.ts new file mode 100644 index 000000000..6e4eed8a9 --- /dev/null +++ b/packages/core/tests/unit/agent-finalization-resilience.test.ts @@ -0,0 +1,198 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { LanguageModelV2 } from "@ai-sdk/provider"; +import type { LLMClient } from "../../lib/v3/llm/LLMClient.js"; +import type { LogLine } from "../../lib/v3/types/public/logs.js"; +import type { V3 } from "../../lib/v3/v3.js"; + +// Make the module-level "ai" generateText (used only by the forced "done" +// finalization call) throw, while the main loop uses the injected LLMClient. +// Mirrors the provider rejecting the re-submitted history (STG-2335). +const finalizationError = Object.assign( + new Error( + "Invalid prompt: The messages must be a ModelMessage[]. If you have " + + "passed a UIMessage[], you can use convertToModelMessages to convert them.", + ), + { cause: new Error("validation failed at messages[3]") }, +); + +vi.mock("ai", async () => { + const actual = await vi.importActual("ai"); + return { + ...actual, + wrapLanguageModel: vi.fn(({ model }) => model), + generateText: vi.fn(async () => { + throw finalizationError; + }), + }; +}); + +import { V3AgentHandler } from "../../lib/v3/handlers/v3AgentHandler.js"; + +type AgentLlmOptions = { + onStepFinish?: (step: unknown) => Promise | void; + onFinish?: (event: unknown) => void; +}; + +const usage = { + inputTokens: 1, + outputTokens: 1, + reasoningTokens: 0, + cachedInputTokens: 0, + totalTokens: 2, +}; + +const emptyList = () => [] as unknown[]; + +// End-of-run step with no `done` call, so state.completed stays false and +// ensureDone proceeds to the forced finalization call (which throws). +function createNonDoneStep() { + return { + content: emptyList(), + text: "", + reasoning: emptyList(), + reasoningText: undefined as string | undefined, + files: emptyList(), + sources: emptyList(), + toolCalls: emptyList(), + staticToolCalls: emptyList(), + dynamicToolCalls: emptyList(), + toolResults: emptyList(), + staticToolResults: emptyList(), + dynamicToolResults: emptyList(), + finishReason: "stop", + usage, + warnings: undefined as unknown, + request: {}, + response: { + id: "response-id", + modelId: "openai/gpt-5-mini", + timestamp: new Date(0), + messages: emptyList(), + }, + providerMetadata: undefined as unknown, + }; +} + +function createGenerateResult(step: ReturnType) { + return { + content: emptyList(), + text: "", + reasoning: emptyList(), + reasoningText: undefined as string | undefined, + files: emptyList(), + sources: emptyList(), + toolCalls: emptyList(), + staticToolCalls: emptyList(), + dynamicToolCalls: emptyList(), + toolResults: emptyList(), + staticToolResults: emptyList(), + dynamicToolResults: emptyList(), + finishReason: "stop", + usage, + totalUsage: usage, + warnings: undefined as unknown, + request: {}, + response: { + id: "response-id", + modelId: "openai/gpt-5-mini", + timestamp: new Date(0), + messages: emptyList(), + }, + providerMetadata: undefined as unknown, + steps: [step], + experimental_output: undefined as unknown, + }; +} + +function createV3() { + const page = { + url: () => "https://example.com", + enableCursorOverlay: vi.fn(async () => {}), + }; + + return { + context: { + awaitActivePage: vi.fn(async () => page), + }, + isCaptchaAutoSolveEnabled: false, + browserbaseApiKey: undefined, + logger: vi.fn(), + recordAgentReplayStep: vi.fn(), + updateMetrics: vi.fn(), + act: vi.fn(), + extract: vi.fn(), + observe: vi.fn(), + } as unknown as V3; +} + +function createLlmClient() { + const model = { + modelId: "openai/gpt-5-mini", + provider: "openai", + specificationVersion: "v2", + } as unknown as LanguageModelV2; + + const generateText = vi.fn(async (options: AgentLlmOptions) => { + const step = createNonDoneStep(); + await options.onStepFinish?.(step); + return createGenerateResult(step); + }); + + return { + client: { + getLanguageModel: vi.fn(() => model), + generateText, + } as unknown as LLMClient, + generateText, + }; +} + +describe("v3 agent finalization resilience", () => { + let logger: (line: LogLine) => void; + + beforeEach(() => { + logger = vi.fn(); + }); + + it("returns a successful result when the forced done call fails", async () => { + const { client } = createLlmClient(); + const handler = new V3AgentHandler(createV3(), logger, client); + + const result = await handler.execute({ + instruction: "finish", + maxSteps: 1, + excludeTools: ["search"], + }); + + // A finalization failure must NOT flip a completed run to failure. + expect(result.success).toBe(true); + expect(result.completed).toBe(true); + expect(result.message).not.toMatch(/^Failed to execute task/); + expect(result.message.length).toBeGreaterThan(0); + }); + + it("logs the finalization failure as a concise warning, not a red error", async () => { + const lines: LogLine[] = []; + const captureLogger = (line: LogLine) => { + lines.push(line); + }; + const { client } = createLlmClient(); + const handler = new V3AgentHandler(createV3(), captureLogger, client); + + await handler.execute({ + instruction: "finish", + maxSteps: 1, + excludeTools: ["search"], + }); + + const warning = lines.find((l) => + l.message.includes('Agent "done" finalization call failed'), + ); + expect(warning).toBeDefined(); + // Warning (level 1), not error (level 0) — the run still succeeded. + expect(warning?.level).toBe(1); + // Cause is not logged (it would bloat the log); message stays short. + expect(warning?.auxiliary).toBeUndefined(); + expect(warning?.message.length).toBeLessThan(500); + }); +}); From 48a4bc5d6e2f05a8e9ff0cf6444e6df2558233ce Mon Sep 17 00:00:00 2001 From: Alyssa Maruyama Date: Tue, 23 Jun 2026 16:43:26 -0700 Subject: [PATCH 2/4] fix(agent): sanitize run history before forced "done" call; don't fail a completed run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-CUA agent.execute() runs a forced "done" finalization step that re-submits the accumulated run history to the model. With some providers (notably reasoning models like openai/gpt-5.x) that history carries nested `undefined` values inside `providerOptions`, which the AI SDK rejects ("Invalid prompt: messages must be a ModelMessage[]" — providerOptions leaves must be JSON values). This was logged as a red error and flipped the result to { success: false } even though every agent action had already completed. - Sanitize the run history (strip nested undefined, equivalent to a JSON round-trip) before the forced "done" call, so it succeeds and structured output is preserved. - Defense-in-depth: if finalization still throws, log a warning and synthesize a completion instead of failing the run. - Null-guard result.toolCalls in handleDoneToolCall. Adds unit tests for the sanitizer and the finalization-failure fallback. Ref: Pylon #19999 / STG-2335 --- .changeset/curvy-pillows-attack.md | 7 + .../lib/v3/agent/utils/handleDoneToolCall.ts | 45 +++- .../core/lib/v3/handlers/v3AgentHandler.ts | 34 ++- .../agent-finalization-resilience.test.ts | 198 ++++++++++++++++++ .../sanitize-resubmission-messages.test.ts | 56 +++++ 5 files changed, 331 insertions(+), 9 deletions(-) create mode 100644 .changeset/curvy-pillows-attack.md create mode 100644 packages/core/tests/unit/agent-finalization-resilience.test.ts create mode 100644 packages/core/tests/unit/sanitize-resubmission-messages.test.ts diff --git a/.changeset/curvy-pillows-attack.md b/.changeset/curvy-pillows-attack.md new file mode 100644 index 000000000..2c74ef390 --- /dev/null +++ b/.changeset/curvy-pillows-attack.md @@ -0,0 +1,7 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Fix non-CUA `agent.execute()` reporting a successful run as failed. After the agent finished all of its work, the forced "done" finalization step (`handleDoneToolCall`) re-submitted the accumulated run history to the model. With some providers (notably reasoning models like `openai/gpt-5.x`) that history carries nested `undefined` values inside `providerOptions`, which the AI SDK rejects (`Invalid prompt: messages must be a ModelMessage[]`, since `providerOptions` leaves must be JSON values). That surfaced as a red error and flipped the result to `{ success: false }` even though every action had already completed. + +The run history is now sanitized (nested `undefined` stripped, equivalent to a JSON round-trip) before the forced "done" call, so it succeeds and structured `output` is preserved. As defense-in-depth, the finalization step is also best-effort: if it still throws, the agent logs a warning and synthesizes a completion instead of failing the run. Also hardens `handleDoneToolCall` against a missing `toolCalls` array. diff --git a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts index 024a1e26e..5372305f8 100644 --- a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts +++ b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts @@ -17,6 +17,47 @@ interface DoneResult { output?: Record; } +/** + * Recursively drop keys whose value is `undefined` from plain objects (and + * recurse into arrays / plain objects), leaving class instances, Dates, typed + * arrays, primitives and `null` untouched. + * + * The AI SDK validates `providerOptions` as `providerMetadataSchema`, whose leaf + * values are `jsonValueSchema` (null | string | number | boolean | object | + * array) — `undefined` is not allowed. SDK-generated messages (e.g. OpenAI + * reasoning / tool parts) can carry nested `undefined` values, so re-submitting + * them verbatim makes standardizePrompt reject the whole prompt with + * "messages must be a ModelMessage[]". `JSON.stringify` hides these (it omits + * undefined), which is why the failure is invisible in logs. Stripping them is + * equivalent to a JSON round-trip and makes the prompt valid again. + */ +function stripUndefinedDeep(value: T): T { + if (Array.isArray(value)) { + return value.map((v) => stripUndefinedDeep(v)) as unknown as T; + } + if (value !== null && typeof value === "object") { + const proto = Object.getPrototypeOf(value); + if (proto === Object.prototype || proto === null) { + const out: Record = {}; + for (const [k, v] of Object.entries(value as Record)) { + if (v !== undefined) out[k] = stripUndefinedDeep(v); + } + return out as T; + } + } + return value; +} + +/** + * Sanitize the accumulated run history before it is re-submitted to a fresh + * generateText call. See {@link stripUndefinedDeep}. + */ +export function sanitizeMessagesForResubmission( + messages: ModelMessage[], +): ModelMessage[] { + return messages.map((message) => stripUndefinedDeep(message)); +} + function buildBaseDoneSchema(factory: typeof z) { return factory.object({ reasoning: factory @@ -114,7 +155,7 @@ Call the "done" tool with: const result = await generateText({ model, system: systemPrompt, - messages: [...inputMessages, userPrompt], + messages: [...sanitizeMessagesForResubmission(inputMessages), userPrompt], tools: { done: doneTool } as ToolSet, toolChoice: rejectsForcedToolUse(modelId) ? "auto" @@ -126,7 +167,7 @@ Call the "done" tool with: }, }); - const doneToolCall = result.toolCalls.find((tc) => tc.toolName === "done"); + const doneToolCall = result.toolCalls?.find((tc) => tc.toolName === "done"); const outputMessages: ModelMessage[] = [ userPrompt, ...(result.response?.messages || []), diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index db39aced6..e85acf78c 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -831,13 +831,33 @@ export class V3AgentHandler { ): Promise<{ messages: ModelMessage[]; output?: Record }> { if (state.completed) return { messages }; - const doneResult = await handleDoneToolCall({ - model, - inputMessages: messages, - instruction, - outputSchema, - logger, - }); + let doneResult: Awaited>; + try { + doneResult = await handleDoneToolCall({ + model, + inputMessages: messages, + instruction, + outputSchema, + logger, + }); + } catch (error) { + // The forced "done" call only summarizes the run, so its failure must not + // fail a run whose work already completed (e.g. a provider rejecting the + // re-submitted history). Warn and synthesize a completion. We log only the + // message, not the cause — the cause embeds the full history (base64 + // images included) and would bloat the log. + logger?.({ + category: "agent", + level: 1, + message: `Agent "done" finalization call failed; using run summary instead: ${getErrorMessage(error)}`, + }); + state.completed = true; + state.finalMessage = + state.finalMessage || + state.collectedReasoning.join(" ").trim() || + "Task execution completed"; + return { messages }; + } state.completed = doneResult.taskComplete; state.finalMessage = doneResult.reasoning; diff --git a/packages/core/tests/unit/agent-finalization-resilience.test.ts b/packages/core/tests/unit/agent-finalization-resilience.test.ts new file mode 100644 index 000000000..6e4eed8a9 --- /dev/null +++ b/packages/core/tests/unit/agent-finalization-resilience.test.ts @@ -0,0 +1,198 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { LanguageModelV2 } from "@ai-sdk/provider"; +import type { LLMClient } from "../../lib/v3/llm/LLMClient.js"; +import type { LogLine } from "../../lib/v3/types/public/logs.js"; +import type { V3 } from "../../lib/v3/v3.js"; + +// Make the module-level "ai" generateText (used only by the forced "done" +// finalization call) throw, while the main loop uses the injected LLMClient. +// Mirrors the provider rejecting the re-submitted history (STG-2335). +const finalizationError = Object.assign( + new Error( + "Invalid prompt: The messages must be a ModelMessage[]. If you have " + + "passed a UIMessage[], you can use convertToModelMessages to convert them.", + ), + { cause: new Error("validation failed at messages[3]") }, +); + +vi.mock("ai", async () => { + const actual = await vi.importActual("ai"); + return { + ...actual, + wrapLanguageModel: vi.fn(({ model }) => model), + generateText: vi.fn(async () => { + throw finalizationError; + }), + }; +}); + +import { V3AgentHandler } from "../../lib/v3/handlers/v3AgentHandler.js"; + +type AgentLlmOptions = { + onStepFinish?: (step: unknown) => Promise | void; + onFinish?: (event: unknown) => void; +}; + +const usage = { + inputTokens: 1, + outputTokens: 1, + reasoningTokens: 0, + cachedInputTokens: 0, + totalTokens: 2, +}; + +const emptyList = () => [] as unknown[]; + +// End-of-run step with no `done` call, so state.completed stays false and +// ensureDone proceeds to the forced finalization call (which throws). +function createNonDoneStep() { + return { + content: emptyList(), + text: "", + reasoning: emptyList(), + reasoningText: undefined as string | undefined, + files: emptyList(), + sources: emptyList(), + toolCalls: emptyList(), + staticToolCalls: emptyList(), + dynamicToolCalls: emptyList(), + toolResults: emptyList(), + staticToolResults: emptyList(), + dynamicToolResults: emptyList(), + finishReason: "stop", + usage, + warnings: undefined as unknown, + request: {}, + response: { + id: "response-id", + modelId: "openai/gpt-5-mini", + timestamp: new Date(0), + messages: emptyList(), + }, + providerMetadata: undefined as unknown, + }; +} + +function createGenerateResult(step: ReturnType) { + return { + content: emptyList(), + text: "", + reasoning: emptyList(), + reasoningText: undefined as string | undefined, + files: emptyList(), + sources: emptyList(), + toolCalls: emptyList(), + staticToolCalls: emptyList(), + dynamicToolCalls: emptyList(), + toolResults: emptyList(), + staticToolResults: emptyList(), + dynamicToolResults: emptyList(), + finishReason: "stop", + usage, + totalUsage: usage, + warnings: undefined as unknown, + request: {}, + response: { + id: "response-id", + modelId: "openai/gpt-5-mini", + timestamp: new Date(0), + messages: emptyList(), + }, + providerMetadata: undefined as unknown, + steps: [step], + experimental_output: undefined as unknown, + }; +} + +function createV3() { + const page = { + url: () => "https://example.com", + enableCursorOverlay: vi.fn(async () => {}), + }; + + return { + context: { + awaitActivePage: vi.fn(async () => page), + }, + isCaptchaAutoSolveEnabled: false, + browserbaseApiKey: undefined, + logger: vi.fn(), + recordAgentReplayStep: vi.fn(), + updateMetrics: vi.fn(), + act: vi.fn(), + extract: vi.fn(), + observe: vi.fn(), + } as unknown as V3; +} + +function createLlmClient() { + const model = { + modelId: "openai/gpt-5-mini", + provider: "openai", + specificationVersion: "v2", + } as unknown as LanguageModelV2; + + const generateText = vi.fn(async (options: AgentLlmOptions) => { + const step = createNonDoneStep(); + await options.onStepFinish?.(step); + return createGenerateResult(step); + }); + + return { + client: { + getLanguageModel: vi.fn(() => model), + generateText, + } as unknown as LLMClient, + generateText, + }; +} + +describe("v3 agent finalization resilience", () => { + let logger: (line: LogLine) => void; + + beforeEach(() => { + logger = vi.fn(); + }); + + it("returns a successful result when the forced done call fails", async () => { + const { client } = createLlmClient(); + const handler = new V3AgentHandler(createV3(), logger, client); + + const result = await handler.execute({ + instruction: "finish", + maxSteps: 1, + excludeTools: ["search"], + }); + + // A finalization failure must NOT flip a completed run to failure. + expect(result.success).toBe(true); + expect(result.completed).toBe(true); + expect(result.message).not.toMatch(/^Failed to execute task/); + expect(result.message.length).toBeGreaterThan(0); + }); + + it("logs the finalization failure as a concise warning, not a red error", async () => { + const lines: LogLine[] = []; + const captureLogger = (line: LogLine) => { + lines.push(line); + }; + const { client } = createLlmClient(); + const handler = new V3AgentHandler(createV3(), captureLogger, client); + + await handler.execute({ + instruction: "finish", + maxSteps: 1, + excludeTools: ["search"], + }); + + const warning = lines.find((l) => + l.message.includes('Agent "done" finalization call failed'), + ); + expect(warning).toBeDefined(); + // Warning (level 1), not error (level 0) — the run still succeeded. + expect(warning?.level).toBe(1); + // Cause is not logged (it would bloat the log); message stays short. + expect(warning?.auxiliary).toBeUndefined(); + expect(warning?.message.length).toBeLessThan(500); + }); +}); diff --git a/packages/core/tests/unit/sanitize-resubmission-messages.test.ts b/packages/core/tests/unit/sanitize-resubmission-messages.test.ts new file mode 100644 index 000000000..018904444 --- /dev/null +++ b/packages/core/tests/unit/sanitize-resubmission-messages.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from "vitest"; +import type { ModelMessage } from "ai"; +import { sanitizeMessagesForResubmission } from "../../lib/v3/agent/utils/handleDoneToolCall.js"; + +describe("sanitizeMessagesForResubmission", () => { + it("strips nested undefined from providerOptions (the gpt-5.x failure)", () => { + const messages = [ + { + role: "assistant", + content: [ + { + type: "reasoning", + text: "", + // jsonValueSchema rejects undefined, so this is what breaks + // standardizePrompt on re-submission. + providerOptions: { + openai: { itemId: "rs_1", reasoningEncryptedContent: undefined }, + }, + }, + ], + }, + ] as unknown as ModelMessage[]; + + const [msg] = sanitizeMessagesForResubmission(messages); + const part = ( + msg.content as unknown as { providerOptions: { openai: object } }[] + )[0]; + + expect(part.providerOptions.openai).toEqual({ itemId: "rs_1" }); + expect("reasoningEncryptedContent" in part.providerOptions.openai).toBe( + false, + ); + }); + + it("preserves null, primitives, and string content unchanged", () => { + const messages = [ + { role: "system", content: "you are an agent" }, + { role: "user", content: "hi" }, + ] as unknown as ModelMessage[]; + + expect(sanitizeMessagesForResubmission(messages)).toEqual(messages); + }); + + it("does not mutate the input messages", () => { + const messages = [ + { + role: "assistant", + content: [{ type: "text", text: "x", providerOptions: undefined }], + }, + ] as unknown as ModelMessage[]; + + sanitizeMessagesForResubmission(messages); + const original = (messages[0].content as { providerOptions?: unknown }[])[0]; + expect("providerOptions" in original).toBe(true); + }); +}); From 22bc2150d50a58a1d72fe3a85dbb58d344f1669c Mon Sep 17 00:00:00 2001 From: Filip Michalsky <31483888+filip-michalsky@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:29:47 +0200 Subject: [PATCH 3/4] fix(agent): strip reasoning parts before forced "done" re-submission (#2270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Title: fix(agent): strip undefined values from run history before "done" finalization (STG-2335) Base: alyssamaruyama/stg-2335-escalation-permitflow-agent-with-custom-tool-has-error-on --- Summary Root-cause fix for STG-2335: a non-CUA agent.execute() that successful run as { success: false }, with a red Invalidprompt: messages must be a ModelMessage[] error logged after all the work already completed. This replaces the symptom patch (wrap the finalization in try/catch and force state.completed = true) with a fix for the actual defect. Root cause After the main agent loop finishes, ensureDone() runs a forced "done" finalization (handleDoneToolCall) that re-submits the accumulated run history into a fresh generateText call to produce the structured su-validates accumulated tool results, but this re-submissiondoes. When a custom tool returns an object with an optional field left undefined — e.g. PermitFlow's captureField returning { matchedExpected: undefined }when no expectedText is passed — that undefined lands insid The AI SDK's ModelMessage validation (standardizePrompt)rejects it, because its JSON-value schema disallows undefined (only null/string/number/boolean/object/array). The finalization throws, flipping the result to { success: false } even though every action succe ▎ Note: the original "reasoning traces" hypothesis was rule parts come back with a valid text: "" and pass validation.The undefined tool-result field is the trigger. Fix sanitizeMessagesForResubmission() deep-strips undefined from the run history before the forced "done" call, keeping all real content. It only traverses plain objects/arrays, so class instances (URL, tyata, Date) pass through untouched. Also reverts PR #2269's force-completed=true fallback, so gfail loudly again instead of being masked. Testing - 4 unit tests in agent-finalization-resilience.test.ts agaces InvalidPromptError with an undefined tool-result field →fixed by sanitize → real content (reasoning/tool-call/text) preserved → class instances untouched. All pass. - End-to-end repro (openai/gpt-5.5 + custom tool, mirrors Pon main (success=false, red error), succeeds on this branch(success=true, completed=true, no error). --- Co-authored-by: Filip Michalsky Co-authored-by: Claude Opus 4.8 (1M context) --- .changeset/curvy-pillows-attack.md | 4 +- .../lib/v3/agent/utils/handleDoneToolCall.ts | 42 ++- .../core/lib/v3/handlers/v3AgentHandler.ts | 34 +-- .../agent-finalization-resilience.test.ts | 287 ++++++------------ 4 files changed, 150 insertions(+), 217 deletions(-) diff --git a/.changeset/curvy-pillows-attack.md b/.changeset/curvy-pillows-attack.md index 5b12acb56..f94308c4a 100644 --- a/.changeset/curvy-pillows-attack.md +++ b/.changeset/curvy-pillows-attack.md @@ -2,6 +2,6 @@ "@browserbasehq/stagehand": patch --- -Fix non-CUA `agent.execute()` reporting a successful run as failed. After the agent finished all of its work, the forced "done" finalization step (`handleDoneToolCall`) re-submitted the accumulated run history to the model; with some providers (e.g. reasoning models like `openai/gpt-5.x`) that history is rejected by the AI SDK (`Invalid prompt: messages must be a ModelMessage[]`), which surfaced as a red error and flipped the result to `{ success: false }` even though every action had already completed. +Fix non-CUA `agent.execute()` reporting a successful run as failed. After the agent finished all of its work, the forced "done" finalization step (`handleDoneToolCall`) re-submitted the accumulated run history to the model. When a custom tool returned an object with an optional field left `undefined` (e.g. `{ matchedExpected: undefined }`), that `undefined` ended up inside a tool-result `output.value`, which the AI SDK's prompt validation rejects (its JSON-value schema disallows `undefined`), throwing `Invalid prompt: messages must be a ModelMessage[]`. This surfaced as a red error and flipped the result to `{ success: false }` even though every action had already completed (STG-2335). -The finalization "done" call is now best-effort: if it throws, the agent logs a warning (with the underlying cause) and synthesizes a completion from the run instead of failing it. Also hardens `handleDoneToolCall` against a missing `toolCalls` array. +Root cause fix: deep-strip `undefined` values from the run history before re-submitting it to the forced "done" finalization call, keeping the messages valid without dropping any real content. Class instances (URL, typed arrays for binary data, etc.) are passed through untouched. diff --git a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts index 08cb542e4..3e9319c19 100644 --- a/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts +++ b/packages/core/lib/v3/agent/utils/handleDoneToolCall.ts @@ -17,6 +17,46 @@ interface DoneResult { output?: Record; } +/** + * Deep-remove `undefined` values from the run history before it is re-submitted + * to the forced "done" call. + * + * The AI SDK validates re-submitted messages against its `ModelMessage` schema, + * whose JSON-value schema rejects `undefined` (only null/string/number/boolean/ + * object/array are allowed). A custom tool that returns an object with an + * optional field left `undefined` (e.g. `{ matchedExpected: undefined }`) lands + * that `undefined` inside a tool-result `output.value`, and reasoning models can + * leave `undefined` in part fields too. Either makes the forced "done" call + * throw `Invalid prompt: messages must be a ModelMessage[]` (STG-2335) — a red + * error that fires after the run has already completed. Stripping `undefined` + * keeps the history valid without dropping any real content. + * + * Only plain objects and arrays are traversed; class instances (URL, typed + * arrays for binary image data, Date, …) are passed through untouched. + */ +function stripUndefinedDeep(value: T): T { + if (Array.isArray(value)) { + return value.map((v) => stripUndefinedDeep(v)) as unknown as T; + } + if (value !== null && typeof value === "object") { + const proto = Object.getPrototypeOf(value); + if (proto === Object.prototype || proto === null) { + const out: Record = {}; + for (const [k, v] of Object.entries(value as Record)) { + if (v !== undefined) out[k] = stripUndefinedDeep(v); + } + return out as T; + } + } + return value; +} + +export function sanitizeMessagesForResubmission( + messages: ModelMessage[], +): ModelMessage[] { + return stripUndefinedDeep(messages); +} + function buildBaseDoneSchema(factory: typeof z) { return factory.object({ reasoning: factory @@ -114,7 +154,7 @@ Call the "done" tool with: const result = await generateText({ model, system: systemPrompt, - messages: [...inputMessages, userPrompt], + messages: [...sanitizeMessagesForResubmission(inputMessages), userPrompt], tools: { done: doneTool } as ToolSet, toolChoice: rejectsForcedToolUse(modelId) ? "auto" diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index e85acf78c..db39aced6 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -831,33 +831,13 @@ export class V3AgentHandler { ): Promise<{ messages: ModelMessage[]; output?: Record }> { if (state.completed) return { messages }; - let doneResult: Awaited>; - try { - doneResult = await handleDoneToolCall({ - model, - inputMessages: messages, - instruction, - outputSchema, - logger, - }); - } catch (error) { - // The forced "done" call only summarizes the run, so its failure must not - // fail a run whose work already completed (e.g. a provider rejecting the - // re-submitted history). Warn and synthesize a completion. We log only the - // message, not the cause — the cause embeds the full history (base64 - // images included) and would bloat the log. - logger?.({ - category: "agent", - level: 1, - message: `Agent "done" finalization call failed; using run summary instead: ${getErrorMessage(error)}`, - }); - state.completed = true; - state.finalMessage = - state.finalMessage || - state.collectedReasoning.join(" ").trim() || - "Task execution completed"; - return { messages }; - } + const doneResult = await handleDoneToolCall({ + model, + inputMessages: messages, + instruction, + outputSchema, + logger, + }); state.completed = doneResult.taskComplete; state.finalMessage = doneResult.reasoning; diff --git a/packages/core/tests/unit/agent-finalization-resilience.test.ts b/packages/core/tests/unit/agent-finalization-resilience.test.ts index 6e4eed8a9..2815a633c 100644 --- a/packages/core/tests/unit/agent-finalization-resilience.test.ts +++ b/packages/core/tests/unit/agent-finalization-resilience.test.ts @@ -1,198 +1,111 @@ -import { beforeEach, describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; +import { generateText, type ModelMessage } from "ai"; import type { LanguageModelV2 } from "@ai-sdk/provider"; -import type { LLMClient } from "../../lib/v3/llm/LLMClient.js"; -import type { LogLine } from "../../lib/v3/types/public/logs.js"; -import type { V3 } from "../../lib/v3/v3.js"; - -// Make the module-level "ai" generateText (used only by the forced "done" -// finalization call) throw, while the main loop uses the injected LLMClient. -// Mirrors the provider rejecting the re-submitted history (STG-2335). -const finalizationError = Object.assign( - new Error( - "Invalid prompt: The messages must be a ModelMessage[]. If you have " + - "passed a UIMessage[], you can use convertToModelMessages to convert them.", - ), - { cause: new Error("validation failed at messages[3]") }, -); - -vi.mock("ai", async () => { - const actual = await vi.importActual("ai"); - return { - ...actual, - wrapLanguageModel: vi.fn(({ model }) => model), - generateText: vi.fn(async () => { - throw finalizationError; - }), - }; -}); - -import { V3AgentHandler } from "../../lib/v3/handlers/v3AgentHandler.js"; - -type AgentLlmOptions = { - onStepFinish?: (step: unknown) => Promise | void; - onFinish?: (event: unknown) => void; -}; - -const usage = { - inputTokens: 1, - outputTokens: 1, - reasoningTokens: 0, - cachedInputTokens: 0, - totalTokens: 2, -}; - -const emptyList = () => [] as unknown[]; - -// End-of-run step with no `done` call, so state.completed stays false and -// ensureDone proceeds to the forced finalization call (which throws). -function createNonDoneStep() { - return { - content: emptyList(), - text: "", - reasoning: emptyList(), - reasoningText: undefined as string | undefined, - files: emptyList(), - sources: emptyList(), - toolCalls: emptyList(), - staticToolCalls: emptyList(), - dynamicToolCalls: emptyList(), - toolResults: emptyList(), - staticToolResults: emptyList(), - dynamicToolResults: emptyList(), - finishReason: "stop", - usage, - warnings: undefined as unknown, - request: {}, - response: { - id: "response-id", - modelId: "openai/gpt-5-mini", - timestamp: new Date(0), - messages: emptyList(), - }, - providerMetadata: undefined as unknown, - }; -} - -function createGenerateResult(step: ReturnType) { - return { - content: emptyList(), - text: "", - reasoning: emptyList(), - reasoningText: undefined as string | undefined, - files: emptyList(), - sources: emptyList(), - toolCalls: emptyList(), - staticToolCalls: emptyList(), - dynamicToolCalls: emptyList(), - toolResults: emptyList(), - staticToolResults: emptyList(), - dynamicToolResults: emptyList(), - finishReason: "stop", - usage, - totalUsage: usage, - warnings: undefined as unknown, - request: {}, - response: { - id: "response-id", - modelId: "openai/gpt-5-mini", - timestamp: new Date(0), - messages: emptyList(), - }, - providerMetadata: undefined as unknown, - steps: [step], - experimental_output: undefined as unknown, - }; -} - -function createV3() { - const page = { - url: () => "https://example.com", - enableCursorOverlay: vi.fn(async () => {}), - }; - - return { - context: { - awaitActivePage: vi.fn(async () => page), - }, - isCaptchaAutoSolveEnabled: false, - browserbaseApiKey: undefined, - logger: vi.fn(), - recordAgentReplayStep: vi.fn(), - updateMetrics: vi.fn(), - act: vi.fn(), - extract: vi.fn(), - observe: vi.fn(), - } as unknown as V3; -} - -function createLlmClient() { - const model = { - modelId: "openai/gpt-5-mini", - provider: "openai", - specificationVersion: "v2", - } as unknown as LanguageModelV2; - - const generateText = vi.fn(async (options: AgentLlmOptions) => { - const step = createNonDoneStep(); - await options.onStepFinish?.(step); - return createGenerateResult(step); +import { sanitizeMessagesForResubmission } from "../../lib/v3/agent/utils/handleDoneToolCall.js"; + +// A minimal mock model. generateText runs the AI SDK's prompt validation +// (standardizePrompt) before ever reaching the model, so this is enough to +// reproduce STG-2335: the forced "done" finalization re-submits the run +// history, and a tool-result whose output value contains an `undefined` field +// trips that validation with "Invalid prompt: messages must be a +// ModelMessage[]" — the AI SDK's JSON-value schema rejects `undefined`. +const mockModel = { + specificationVersion: "v2", + provider: "mock", + modelId: "mock", + supportedUrls: {}, + async doGenerate() { + return { + finishReason: "stop" as const, + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + content: [{ type: "text" as const, text: "ok" }], + warnings: [] as [], + }; + }, +} as unknown as LanguageModelV2; + +// Mirrors PermitFlow's custom tool: an optional field (`matchedExpected`) is +// left `undefined` in the tool result, so the re-submitted tool-result carries +// an `undefined` inside output.value. Also includes a valid reasoning part +// (text: "") to confirm sanitization leaves real content intact. +const malformedHistory = [ + { role: "user", content: "do the task" }, + { + role: "assistant", + content: [ + { + type: "reasoning", + text: "", + providerOptions: { openai: { itemId: "rs_1" } }, + }, + { type: "tool-call", toolCallId: "c1", toolName: "captureField", input: {} }, + ], + }, + { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "c1", + toolName: "captureField", + output: { + type: "json", + value: { success: true, value: "permit", matchedExpected: undefined }, + }, + }, + ], + }, +] as unknown as ModelMessage[]; + +describe("v3 agent finalization: tool-result re-submission (STG-2335)", () => { + it("reproduces the InvalidPromptError when an undefined tool-result field is re-submitted", async () => { + await expect( + generateText({ model: mockModel, messages: malformedHistory }), + ).rejects.toThrow(/must be a ModelMessage\[\]/); }); - return { - client: { - getLanguageModel: vi.fn(() => model), - generateText, - } as unknown as LLMClient, - generateText, - }; -} - -describe("v3 agent finalization resilience", () => { - let logger: (line: LogLine) => void; - - beforeEach(() => { - logger = vi.fn(); + it("re-submission succeeds once undefined values are stripped", async () => { + const result = await generateText({ + model: mockModel, + messages: sanitizeMessagesForResubmission(malformedHistory), + }); + expect(result.text).toBe("ok"); }); - it("returns a successful result when the forced done call fails", async () => { - const { client } = createLlmClient(); - const handler = new V3AgentHandler(createV3(), logger, client); + it("drops undefined fields but preserves all real content", () => { + const cleaned = sanitizeMessagesForResubmission(malformedHistory); - const result = await handler.execute({ - instruction: "finish", - maxSteps: 1, - excludeTools: ["search"], - }); + expect(cleaned).toHaveLength(3); + expect(cleaned[0]).toEqual({ role: "user", content: "do the task" }); - // A finalization failure must NOT flip a completed run to failure. - expect(result.success).toBe(true); - expect(result.completed).toBe(true); - expect(result.message).not.toMatch(/^Failed to execute task/); - expect(result.message.length).toBeGreaterThan(0); - }); - - it("logs the finalization failure as a concise warning, not a red error", async () => { - const lines: LogLine[] = []; - const captureLogger = (line: LogLine) => { - lines.push(line); - }; - const { client } = createLlmClient(); - const handler = new V3AgentHandler(createV3(), captureLogger, client); + // reasoning + tool-call survive on the assistant message. + const assistant = cleaned[1].content as Array<{ type: string }>; + expect(assistant.map((p) => p.type)).toEqual(["reasoning", "tool-call"]); - await handler.execute({ - instruction: "finish", - maxSteps: 1, - excludeTools: ["search"], - }); + // tool-result value keeps real fields, drops the undefined one. + const toolResult = ( + cleaned[2].content as Array<{ output: { value: Record } }> + )[0]; + expect(toolResult.output.value).toEqual({ success: true, value: "permit" }); + expect("matchedExpected" in toolResult.output.value).toBe(false); + }); - const warning = lines.find((l) => - l.message.includes('Agent "done" finalization call failed'), - ); - expect(warning).toBeDefined(); - // Warning (level 1), not error (level 0) — the run still succeeded. - expect(warning?.level).toBe(1); - // Cause is not logged (it would bloat the log); message stays short. - expect(warning?.auxiliary).toBeUndefined(); - expect(warning?.message.length).toBeLessThan(500); + it("leaves class instances (URL, typed arrays) untouched", () => { + const url = new URL("https://example.com"); + const bytes = new Uint8Array([1, 2, 3]); + const messages = [ + { + role: "user", + content: [ + { type: "file", data: url, mediaType: "text/plain" }, + { type: "file", data: bytes, mediaType: "application/octet-stream" }, + ], + }, + ] as unknown as ModelMessage[]; + + const cleaned = sanitizeMessagesForResubmission(messages); + const parts = cleaned[0].content as Array<{ data: unknown }>; + expect(parts[0].data).toBeInstanceOf(URL); + expect(parts[1].data).toBeInstanceOf(Uint8Array); }); }); From 50d3a30426bd5d4300aaed139c49be98fdd02aae Mon Sep 17 00:00:00 2001 From: Alyssa Maruyama Date: Wed, 24 Jun 2026 17:18:16 -0700 Subject: [PATCH 4/4] fix lint --- .../tests/unit/agent-finalization-resilience.test.ts | 11 +++++++++-- .../tests/unit/sanitize-resubmission-messages.test.ts | 4 +++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/core/tests/unit/agent-finalization-resilience.test.ts b/packages/core/tests/unit/agent-finalization-resilience.test.ts index 2815a633c..4ccf0ecd8 100644 --- a/packages/core/tests/unit/agent-finalization-resilience.test.ts +++ b/packages/core/tests/unit/agent-finalization-resilience.test.ts @@ -38,7 +38,12 @@ const malformedHistory = [ text: "", providerOptions: { openai: { itemId: "rs_1" } }, }, - { type: "tool-call", toolCallId: "c1", toolName: "captureField", input: {} }, + { + type: "tool-call", + toolCallId: "c1", + toolName: "captureField", + input: {}, + }, ], }, { @@ -84,7 +89,9 @@ describe("v3 agent finalization: tool-result re-submission (STG-2335)", () => { // tool-result value keeps real fields, drops the undefined one. const toolResult = ( - cleaned[2].content as Array<{ output: { value: Record } }> + cleaned[2].content as Array<{ + output: { value: Record }; + }> )[0]; expect(toolResult.output.value).toEqual({ success: true, value: "permit" }); expect("matchedExpected" in toolResult.output.value).toBe(false); diff --git a/packages/core/tests/unit/sanitize-resubmission-messages.test.ts b/packages/core/tests/unit/sanitize-resubmission-messages.test.ts index 018904444..911327fd1 100644 --- a/packages/core/tests/unit/sanitize-resubmission-messages.test.ts +++ b/packages/core/tests/unit/sanitize-resubmission-messages.test.ts @@ -50,7 +50,9 @@ describe("sanitizeMessagesForResubmission", () => { ] as unknown as ModelMessage[]; sanitizeMessagesForResubmission(messages); - const original = (messages[0].content as { providerOptions?: unknown }[])[0]; + const original = ( + messages[0].content as { providerOptions?: unknown }[] + )[0]; expect("providerOptions" in original).toBe(true); }); });