From 4f921ee292d95a64233383c28fd199c04637b970 Mon Sep 17 00:00:00 2001 From: yawbtng <154343001+yawbtng@users.noreply.github.com> Date: Sun, 21 Jun 2026 20:53:04 -0700 Subject: [PATCH] fix(core): press CUA keypress combinations as a single chord CUA keypress actions describe a single key chord, but the executor pressed each key in the array separately, releasing modifiers before the main key. Combinations like ["Control", "A"] sent Ctrl alone and then typed a literal "a" instead of select-all. This broke the OpenAI, Google (key_combination), and Microsoft computer-use clients, which emit multi-element key arrays; Anthropic (single +-joined string) was fine. Join the mapped keys into one +-delimited combination so page.keyPress holds modifiers down for the main key, and record the chord as a single replay step. mapKeyToPlaywright is idempotent, so already-combined and single-key inputs are unchanged. Adds unit coverage for chord, alias normalization, single-key, already-combined, and empty-array cases. --- .changeset/fix-cua-keypress-chord.md | 5 ++ .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 42 +++++----- .../tests/unit/cua-keypress-chord.test.ts | 81 +++++++++++++++++++ 3 files changed, 109 insertions(+), 19 deletions(-) create mode 100644 .changeset/fix-cua-keypress-chord.md create mode 100644 packages/core/tests/unit/cua-keypress-chord.test.ts diff --git a/.changeset/fix-cua-keypress-chord.md b/.changeset/fix-cua-keypress-chord.md new file mode 100644 index 000000000..4302b22cc --- /dev/null +++ b/.changeset/fix-cua-keypress-chord.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Fix CUA `keypress` actions to press key combinations as a single chord. Previously each key in the array was pressed separately, releasing modifiers before the main key — so combinations like `["Control", "A"]` sent Ctrl on its own and then typed a literal `a` instead of select-all. 
This affected the OpenAI, Google (`key_combination`), and Microsoft computer-use clients, which emit multi-element key arrays; Anthropic (which sends a single `+`-joined string) was unaffected. diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 22513339c..9dc3901bf 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -452,29 +452,33 @@ export class V3CuaAgentHandler { case "keypress": { const { keys } = action; const keyList = Array.isArray(keys) ? keys : [keys]; - const stagehandActions: Action[] = []; - for (const rawKey of keyList) { - const mapped = mapKeyToPlaywright(String(rawKey ?? "")); + if (keyList.length > 0) { + // CUA "keypress" actions describe a single key *chord* (modifiers held + // down for the main key), not a sequence of independent presses. + // Pressing each key separately released modifiers before the main key, + // so combinations like ["Control", "A"] sent Ctrl on its own and then + // typed a literal "a" instead of select-all. Join into one + // "+"-delimited combination so page.keyPress holds the modifiers down. + // page.keyPress already handles the literal "+" key correctly. + const mapped = keyList + .map((rawKey) => mapKeyToPlaywright(String(rawKey ?? 
""))) + .join("+"); await page.keyPress(mapped); if (recording) { - stagehandActions.push({ - selector: "xpath=/html", - description: `press ${mapped}`, - method: "press", - arguments: [mapped], - }); + this.recordCuaActStep( + action, + [ + { + selector: "xpath=/html", + description: `press ${mapped}`, + method: "press", + arguments: [mapped], + }, + ], + `press ${mapped}`, + ); } } - if (recording && stagehandActions.length > 0) { - this.recordCuaActStep( - action, - stagehandActions, - stagehandActions - .map((a) => a.description) - .filter(Boolean) - .join(", ") || "keypress", - ); - } return { success: true }; } case "scroll": { diff --git a/packages/core/tests/unit/cua-keypress-chord.test.ts b/packages/core/tests/unit/cua-keypress-chord.test.ts new file mode 100644 index 000000000..37fd08fa5 --- /dev/null +++ b/packages/core/tests/unit/cua-keypress-chord.test.ts @@ -0,0 +1,81 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { V3CuaAgentHandler } from "../../lib/v3/handlers/v3CuaAgentHandler.js"; +import type { V3 } from "../../lib/v3/v3.js"; +import type { AgentAction } from "../../lib/v3/types/public/agent.js"; + +/** + * Regression coverage for CUA "keypress" chord handling. + * + * A keypress action describes a single key combination (modifiers held down for + * the main key). The handler used to press each key in the array separately, + * which released modifiers early — so ["Control", "A"] sent Ctrl alone and then + * typed a literal "a" instead of select-all. This broke Google's + * `key_combination`, OpenAI's `keypress`, and Microsoft's `keypress` (all of + * which emit a multi-element keys array), while Anthropic (single "+"-joined + * string) happened to work. + */ +describe("V3CuaAgentHandler keypress chord handling", () => { + let handler: V3CuaAgentHandler; + let keyPress: ReturnType<typeof vi.fn>; + + // executeAction is private; expose it through a typed accessor for the test. 
+ const execute = (action: AgentAction) => + ( + handler as unknown as { + executeAction: (a: AgentAction) => Promise<unknown>; + } + ).executeAction(action); + + beforeEach(() => { + keyPress = vi.fn().mockResolvedValue(undefined); + const mockPage = { + keyPress, + url: () => "https://example.com", + }; + const mockV3 = { + context: { + awaitActivePage: vi.fn().mockResolvedValue(mockPage), + }, + isAgentReplayActive: () => false, + } as unknown as V3; + + handler = new V3CuaAgentHandler(mockV3, vi.fn(), { + modelName: "claude-sonnet-4-5-20250929", + clientOptions: { apiKey: "test-key" }, + }); + }); + + it("presses a multi-key combination as a single chord", async () => { + await execute({ type: "keypress", keys: ["Control", "A"] } as AgentAction); + + expect(keyPress).toHaveBeenCalledTimes(1); + expect(keyPress).toHaveBeenCalledWith("Control+A"); + }); + + it("normalizes provider key aliases before chording (CTRL -> Control)", async () => { + await execute({ type: "keypress", keys: ["CTRL", "A"] } as AgentAction); + + expect(keyPress).toHaveBeenCalledTimes(1); + expect(keyPress).toHaveBeenCalledWith("Control+A"); + }); + + it("still presses a single key correctly", async () => { + await execute({ type: "keypress", keys: ["Enter"] } as AgentAction); + + expect(keyPress).toHaveBeenCalledTimes(1); + expect(keyPress).toHaveBeenCalledWith("Enter"); + }); + + it("preserves an already-combined key string (Anthropic shape)", async () => { + await execute({ type: "keypress", keys: ["ctrl+s"] } as AgentAction); + + expect(keyPress).toHaveBeenCalledTimes(1); + expect(keyPress).toHaveBeenCalledWith("ctrl+s"); + }); + + it("does not press anything for an empty keys array", async () => { + await execute({ type: "keypress", keys: [] } as AgentAction); + + expect(keyPress).not.toHaveBeenCalled(); + }); +});