Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/fix-cua-keypress-chord.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Fix CUA `keypress` actions to press key combinations as a single chord. Previously each key in the array was pressed separately, releasing modifiers before the main key — so combinations like `["Control", "A"]` sent Ctrl on its own and then typed a literal `a` instead of select-all. This affected the OpenAI, Google (`key_combination`), and Microsoft computer-use clients, which emit multi-element key arrays; Anthropic (which sends a single `+`-joined string) was unaffected.
42 changes: 23 additions & 19 deletions packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -452,29 +452,33 @@ export class V3CuaAgentHandler {
case "keypress": {
const { keys } = action;
const keyList = Array.isArray(keys) ? keys : [keys];
const stagehandActions: Action[] = [];
for (const rawKey of keyList) {
const mapped = mapKeyToPlaywright(String(rawKey ?? ""));
if (keyList.length > 0) {
// CUA "keypress" actions describe a single key *chord* (modifiers held
// down for the main key), not a sequence of independent presses.
// Pressing each key separately released modifiers before the main key,
// so combinations like ["Control", "A"] sent Ctrl on its own and then
// typed a literal "a" instead of select-all. Join into one
// "+"-delimited combination so page.keyPress holds the modifiers down.
// page.keyPress already handles the literal "+" key correctly.
const mapped = keyList
.map((rawKey) => mapKeyToPlaywright(String(rawKey ?? "")))
.join("+");
await page.keyPress(mapped);
if (recording) {
stagehandActions.push({
selector: "xpath=/html",
description: `press ${mapped}`,
method: "press",
arguments: [mapped],
});
this.recordCuaActStep(
action,
[
{
selector: "xpath=/html",
description: `press ${mapped}`,
method: "press",
arguments: [mapped],
},
],
`press ${mapped}`,
);
}
}
if (recording && stagehandActions.length > 0) {
this.recordCuaActStep(
action,
stagehandActions,
stagehandActions
.map((a) => a.description)
.filter(Boolean)
.join(", ") || "keypress",
);
}
return { success: true };
}
case "scroll": {
Expand Down
81 changes: 81 additions & 0 deletions packages/core/tests/unit/cua-keypress-chord.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import { describe, it, expect, vi, beforeEach } from "vitest";
import { V3CuaAgentHandler } from "../../lib/v3/handlers/v3CuaAgentHandler.js";
import type { V3 } from "../../lib/v3/v3.js";
import type { AgentAction } from "../../lib/v3/types/public/agent.js";

/**
 * Regression suite: a CUA "keypress" action is ONE key combination.
 *
 * The modifiers in the `keys` array must stay held while the main key is
 * pressed. An earlier implementation pressed every array entry on its own, so
 * the modifier was released before the main key went down and
 * ["Control", "A"] ended up typing a literal "a" rather than selecting all.
 * Providers that emit a multi-element keys array (Google `key_combination`,
 * OpenAI `keypress`, Microsoft `keypress`) were affected; Anthropic, which
 * sends one "+"-joined string, was not.
 */
describe("V3CuaAgentHandler keypress chord handling", () => {
  // Structural view of the handler that exposes its private executeAction.
  type ExecuteActionAccess = {
    executeAction: (a: AgentAction) => Promise<unknown>;
  };

  let handler: V3CuaAgentHandler;
  let keyPress: ReturnType<typeof vi.fn>;

  // Dispatches a keypress action carrying `keys` through the private
  // executeAction entry point of the handler under test.
  const pressKeys = (keys: string[]) => {
    const action = { type: "keypress", keys } as AgentAction;
    return (handler as unknown as ExecuteActionAccess).executeAction(action);
  };

  // Runs a keypress action and checks that the page received exactly one
  // keyPress call with the expected combination string.
  const expectSingleChord = async (keys: string[], expected: string) => {
    await pressKeys(keys);

    expect(keyPress).toHaveBeenCalledTimes(1);
    expect(keyPress).toHaveBeenCalledWith(expected);
  };

  beforeEach(() => {
    // Fresh spy per test so call counts never leak between cases.
    keyPress = vi.fn().mockResolvedValue(undefined);

    // Minimal stand-ins: a page exposing keyPress/url, and a V3 whose context
    // resolves to that page with agent replay reported as inactive.
    const mockPage = {
      keyPress,
      url: () => "https://example.com",
    };
    const mockV3 = {
      context: {
        awaitActivePage: vi.fn().mockResolvedValue(mockPage),
      },
      isAgentReplayActive: () => false,
    } as unknown as V3;

    handler = new V3CuaAgentHandler(mockV3, vi.fn(), {
      modelName: "claude-sonnet-4-5-20250929",
      clientOptions: { apiKey: "test-key" },
    });
  });

  it("presses a multi-key combination as a single chord", async () => {
    await expectSingleChord(["Control", "A"], "Control+A");
  });

  it("normalizes provider key aliases before chording (CTRL -> Control)", async () => {
    await expectSingleChord(["CTRL", "A"], "Control+A");
  });

  it("still presses a single key correctly", async () => {
    await expectSingleChord(["Enter"], "Enter");
  });

  it("preserves an already-combined key string (Anthropic shape)", async () => {
    await expectSingleChord(["ctrl+s"], "ctrl+s");
  });

  it("does not press anything for an empty keys array", async () => {
    await pressKeys([]);

    expect(keyPress).not.toHaveBeenCalled();
  });
});
Loading