diff --git a/skills/autobrowse/evals/.env.example b/skills/autobrowse/evals/.env.example new file mode 100644 index 0000000..8943022 --- /dev/null +++ b/skills/autobrowse/evals/.env.example @@ -0,0 +1,10 @@ +# Required for real (non-mock) runs — inner agent + outer agent both use it. +ANTHROPIC_API_KEY=sk-ant-... + +# Required for tasks with env=remote (Tier B/C bot-protected sites). +BROWSERBASE_API_KEY=bb_... + +# Path to the autobrowse skill (the directory containing scripts/evaluate.mjs). +# Defaults to the parent directory (evals/ ships inside the skill); set only +# to point the harness at a different autobrowse checkout. +# AUTOBROWSE_DIR=/path/to/skills/skills/autobrowse diff --git a/skills/autobrowse/evals/.gitignore b/skills/autobrowse/evals/.gitignore new file mode 100644 index 0000000..7d4d452 --- /dev/null +++ b/skills/autobrowse/evals/.gitignore @@ -0,0 +1,4 @@ +node_modules/ +runs/ +.env +vendor/ diff --git a/skills/autobrowse/evals/README.md b/skills/autobrowse/evals/README.md new file mode 100644 index 0000000..d43e9a7 --- /dev/null +++ b/skills/autobrowse/evals/README.md @@ -0,0 +1,96 @@ +# autobrowse-evals + +Eval harness for the [autobrowse](https://github.com/browserbase/skills/tree/main/skills/autobrowse) self-improving browser-automation loop. Measures the four things that matter — **convergence speed**, **accuracy**, **runtime speed**, and **token cost** — and makes them comparable across inner/outer models, prompts, and architectures. 
+ +## The four artifacts being evaluated + +| Artifact | What it is | Metrics | +|---|---|---| +| Single run | One `evaluate.mjs` attempt, empty strategy | accuracy baseline, speed, tokens | +| Learning loop | evaluate → verify → improve, repeated | convergence speed, cumulative cost | +| Graduated strategy | frozen best strategy.md run by a fresh agent | **holdout** accuracy/speed/tokens | +| Codegen script | deterministic playwright/stagehand output | (future: wire `codegen.mjs --verify` in) | + +The core design decision: **training and evaluation are separated.** Convergence is measured during the loop; the *result* is measured by freezing the best strategy and running it N fresh times (holdout). And **pass/fail is never self-reported** — every task has a programmatic verifier; the agent's own `success: true` is only used to compute the false-success (reward-hacking) rate. + +## Layout + +``` +eval/ + run-matrix.mjs orchestrator: condition × task × trial → train + holdout + outer-agent.mjs scripted outer loop (one structured-output call per iteration; + outer tokens metered — the interactive loop never records them) + report.mjs aggregates runs/results.jsonl into scorecards + conditions/*.json sweepable variables: inner_model, outer_model, outer_prompt, iters + prompts/outer-*.md outer-prompt variants (default = SKILL.md methodology, lean = ablation) + tasks// task.md (autobrowse format) + verify.mjs + meta.json + mock-output.json +fixtures/ self-hosted deterministic sites (Tier A ground truth) +runs/ workspaces, traces, results.jsonl (gitignored) +``` + +## Benchmark suite (9 tasks, 3 tiers) + +Tasks marked ◆ are drawn from the browse.sh prompt library (`prompts//.md`). 
+ +| Tier | Task | Env | Verification | +|---|---|---|---| +| **A — deterministic** | `fixture-checkout` | local | exact confirmation code (shared hash function) + total | +| | `fixture-flightdeck` | local | exact cheapest-nonstop answer; traps: cheaper 1-stop, cheaper wrong route | +| | `books-toscrape` | local | exact count/prices/titles (static demo site) | +| **B — live, stable** | `uspto-patent-lookup` ◆ | remote | patent facts are immutable (US 11,000,000) | +| | `google-flights` ◆ | local | invariants: nonstop, airline set, price band, internal consistency | +| | `opentable-availability` ◆ | local | invariants: date/party echoed, slot format, availability consistency | +| | `youtube-transcript` ◆ | local | immutable content ("Me at the zoo" transcript phrases) | +| **C — bot-protected** | `stockx-price` ◆ | remote | product identity + price band (PerimeterX) | +| | `yelp-reviews` ◆ | remote | rating/review-count bands + per-review structure (DataDome) | + +Tier A gives model comparisons statistical teeth; Tier B measures real-site competence with invariant checks; Tier C measures infrastructure robustness (report it separately — variance is the site's, not the model's). + +**Verifier protocol** (mirrors autobrowse's codegen runner protocol): `node eval/tasks/<task>/verify.mjs --run-dir <run-dir>` → one JSON line `{passed, checks: [{name, ok, detail}], reason}`. Each task's `mock-output.json` is its documented known-good output; `npm run test:verifiers` asserts every verifier passes it and rejects a garbage `{"success": true}` — i.e., verifiers are tested against reward-hacking. 
+ +## Setup + +```bash +npm install +cp .env.example .env # ANTHROPIC_API_KEY (+ BROWSERBASE_API_KEY for remote tasks) +npm install -g browse # the browse CLI used by the inner agent +# AUTOBROWSE_DIR defaults to the parent dir (this folder ships inside the skill) +``` + +## Usage + +```bash +npm run test:verifiers # verifier soundness (no keys needed) +node eval/run-matrix.mjs --conditions baseline --tasks fixture-checkout --mock # free pipeline check + +# Real runs +node eval/run-matrix.mjs --conditions pilot --tasks fixture-checkout # cheap pilot +node eval/run-matrix.mjs --conditions baseline --tasks all --trials 3 # full baseline +node eval/run-matrix.mjs --conditions baseline,inner-haiku,inner-opus,outer-sonnet,outer-prompt-lean \ + --tasks fixture-checkout,fixture-flightdeck,books-toscrape --trials 3 # model/prompt screen on Tier A + +npm run report # markdown scorecards +node eval/report.mjs --json # raw aggregates +``` + +The fixture server (`npm run fixtures`, port 4173) auto-starts when a selected task needs it. + +## Metrics (see report footer for definitions) + +- **Convergence:** converged-rate, iters-to-first-verified-pass, regressions, cumulative train cost (inner + outer) +- **Accuracy:** holdout pass rate (frozen strategy, fresh runs), **false-success rate** (claimed success, verifier failed) +- **Speed:** holdout wall clock split into browser ms (sum of browse-CLI `duration_ms` in trace.json) vs model ms +- **Tokens/cost:** per-run tokens, recomputed centrally in `eval/lib/pricing.mjs` (don't trust evaluate.mjs's stale table), and **skill value** = how much the learned strategy cheapens a run vs the blind iteration-1 attempt (tests the README's "80%+ reduction" claim) + +## Experiment design notes + +- **Screen, don't grid.** Vary one axis at a time against `baseline` (5 conditions ship: baseline, inner-haiku, inner-opus, outer-sonnet, outer-prompt-lean). Deep-dive only the interesting 2–3 combos. 
+- **Pair comparisons on the same tasks**; live-site variance makes unpaired suite means meaningless. Tier C reports separately. +- **Trials:** ≥3 per cell for anything you'll make a decision on. `results.jsonl` is append-only — rerun cells freely, the report aggregates. +- **Cost calibration:** run `pilot` on one Tier A task first and read `inner_cost_usd`/`outer_cost_usd` from `runs/results.jsonl` before launching a sweep. + +## Fidelity caveats / roadmap + +- The scripted outer agent sees a curated evidence pack (summary, verifier verdict, failed commands), not the full tool-using trace exploration Claude Code does. A Claude-Agent-SDK outer agent with Read/Grep tools is the natural next architecture variant — and would also let `--browser-trace` evidence (unified-events.jsonl) become a sweepable axis. +- `codegen.mjs --verify` (deterministic script artifact) isn't wired into the matrix yet; its runner protocol is identical to the verifier protocol here, so it slots in as a fourth phase. +- The local checkout's `judge.mjs` (A/B strategy judge) and `--supervise` watcher are complementary: the judge compares strategy *versions* by run evidence; this harness compares *conditions* by verified outcomes. `supervised` already lands in evaluate.mjs's meta.json and could become another condition axis. diff --git a/skills/autobrowse/evals/RESULTS.md b/skills/autobrowse/evals/RESULTS.md new file mode 100644 index 0000000..732ea74 --- /dev/null +++ b/skills/autobrowse/evals/RESULTS.md @@ -0,0 +1,34 @@ +# Eval results — 2026-06-09 (Fable 5 vs Opus 4.8) + +First findings from this harness, comparing `claude-fable-5` and `claude-opus-4-8` in both autobrowse roles. ~200 verified runs, ~$220 API spend. Small n (2–3 trials/cell) — directional, not definitive. 
+ +## Headline + +**Best configuration tested: Sonnet 4.6 as the inner (browsing) agent + Fable 5 as the outer (strategy-writing) agent.** On the OpenTable task it produced the most reliable *and* cheapest converged runs of any cell — beating even Opus-as-browser — because the expensive model's intelligence lands in `strategy.md` once instead of in every run. + +## OpenTable 2×2 (Tier B, Akamai-walled, verified+proxied Browserbase sessions) + +| Inner ↓ / Outer → | Opus 4.8 writes | Fable 5 writes | +|---|---|---| +| **Sonnet 4.6 browses** | 5/6 holdout, $1.40/run, 90s | **6/6 holdout, $0.96/run, 64s** | +| **Opus 4.8 browses** | 6/6, $1.20/run, 63s | — | +| **Fable 5 browses** | 5/6, $1.66/run, 93s | — | + +- **Inner axis:** Opus beat Fable as the browser — same convergence (iter 2–3), half the training cost (~$5.5 vs ~$11/trial), perfect holdout. Fable reasons more per turn; at 2× token pricing that compounds (blind iteration-1 attempts: ~$7 vs ~$3). +- **Outer axis (same Sonnet inner in both):** Fable-authored strategies were more reliable (6/6 vs 5/6) and made the same agent ~30% faster and cheaper. Qualitatively, Fable's skills encode *mechanisms* — React hydration timing ("`wait load` returns before the widget renders; snapshot shows ~2 refs"), Akamai cookie behavior ("`browse stop` wipes cookies → never stop the session"), broken-command landmines ("`wait selector text=...` ETIMEDOUTs") — where Opus's skills describe symptoms. Same pattern appeared on the Tier A fixtures: Fable was the only outer model to identify a deliberately planted 900ms delayed-render trap and prescribe the exact fix. +- Fable's outer calls cost $0.13 vs Opus's $0.05 per improvement — negligible in absolute terms. + +## Tier A fixtures (deterministic local sites) + +- All models 100% on the easy task; differentiation is pure cost (Sonnet $0.16/run, Opus $0.60, Fable $0.97). On tasks the cheap model already does, frontier inner agents are pure overhead. 
+- On the trap-laden checkout fixture, inner reliability ranked Fable (6/6) > Opus (5/6) > Sonnet (4/6) — monotonic with price. This did **not** generalize to OpenTable, where Opus matched/beat Fable as inner. + +## Other observations + +- **Zero false-successes in ~200 runs** — no model claimed `success:true` against a failing verifier. Failures were honest (turn-budget exhaustion, no final JSON). +- **Live-site drift is real:** Akamai blocked every iteration-1 attempt in a morning round and none in an evening round. Only within-round (concurrent, paired) comparisons are valid on live sites. +- One Fable-cell strategy explicitly reasoned about the grader ("the verifier requires success:true — persist"). Benign here (persistence, not fabrication), but a preview of strategies evolving against the verifier's letter on harder tasks. + +## Recommended default + +`inner_model: claude-sonnet-4-6`, `outer_model: claude-fable-5`, escalating the inner to Opus only when a task fails to converge because the inner agent can't execute good instructions. Training cost per new skill: ~$1–2; converged verified runs: ~$1. 
diff --git a/skills/autobrowse/evals/eval/conditions/baseline.json b/skills/autobrowse/evals/eval/conditions/baseline.json new file mode 100644 index 0000000..54ee28e --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/baseline.json @@ -0,0 +1,9 @@ +{ + "id": "baseline", + "notes": "Default autobrowse setup: Sonnet inner agent (evaluate.mjs default), Opus outer agent, full SKILL.md-style outer prompt.", + "inner_model": "claude-sonnet-4-6", + "outer_model": "claude-opus-4-8", + "outer_prompt": "outer-default", + "max_iters": 5, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/inner-fable-5.json b/skills/autobrowse/evals/eval/conditions/inner-fable-5.json new file mode 100644 index 0000000..f8e8e27 --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/inner-fable-5.json @@ -0,0 +1,9 @@ +{ + "id": "inner-fable-5", + "notes": "Fable 5 as the INNER browsing agent (vs inner-opus / baseline). Measures raw browsing competence: iteration-1 pass rate, turns, holdout reliability. 
2x Opus pricing — watch cost_to_converge.", + "inner_model": "claude-fable-5", + "outer_model": "claude-opus-4-8", + "outer_prompt": "outer-default", + "max_iters": 5, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/inner-haiku.json b/skills/autobrowse/evals/eval/conditions/inner-haiku.json new file mode 100644 index 0000000..1f2b701 --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/inner-haiku.json @@ -0,0 +1,9 @@ +{ + "id": "inner-haiku", + "notes": "Cheap inner agent hypothesis: a smart outer agent distills intelligence into strategy.md, so a Haiku inner agent should converge to the same place at a fraction of the cost.", + "inner_model": "claude-haiku-4-5", + "outer_model": "claude-opus-4-8", + "outer_prompt": "outer-default", + "max_iters": 7, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/inner-opus.json b/skills/autobrowse/evals/eval/conditions/inner-opus.json new file mode 100644 index 0000000..48657e9 --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/inner-opus.json @@ -0,0 +1,9 @@ +{ + "id": "inner-opus", + "notes": "Expensive inner agent: does a frontier inner model converge in fewer iterations, and does that offset its per-run cost?", + "inner_model": "claude-opus-4-8", + "outer_model": "claude-opus-4-8", + "outer_prompt": "outer-default", + "max_iters": 5, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/outer-fable-5.json b/skills/autobrowse/evals/eval/conditions/outer-fable-5.json new file mode 100644 index 0000000..9f742b7 --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/outer-fable-5.json @@ -0,0 +1,9 @@ +{ + "id": "outer-fable-5", + "notes": "Fable 5 as the OUTER strategy-improver (vs baseline's Opus 4.8). Measures hypothesis-formation quality: convergence speed, regressions, holdout pass rate of the strategies it writes. 
Outer calls are small, so the 2x pricing barely matters here.", + "inner_model": "claude-sonnet-4-6", + "outer_model": "claude-fable-5", + "outer_prompt": "outer-default", + "max_iters": 5, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/outer-prompt-lean.json b/skills/autobrowse/evals/eval/conditions/outer-prompt-lean.json new file mode 100644 index 0000000..b3736a3 --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/outer-prompt-lean.json @@ -0,0 +1,9 @@ +{ + "id": "outer-prompt-lean", + "notes": "Prompt ablation: strip the SKILL.md-style guidance (one-hypothesis rule, build-on-wins, evidence grounding) from the outer prompt. Measures how much of convergence quality comes from the methodology vs the model.", + "inner_model": "claude-sonnet-4-6", + "outer_model": "claude-opus-4-8", + "outer_prompt": "outer-lean", + "max_iters": 5, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/outer-sonnet.json b/skills/autobrowse/evals/eval/conditions/outer-sonnet.json new file mode 100644 index 0000000..98ae7d9 --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/outer-sonnet.json @@ -0,0 +1,9 @@ +{ + "id": "outer-sonnet", + "notes": "Cheaper outer agent: is Opus-level hypothesis formation actually load-bearing, or can Sonnet read traces and improve strategies just as well?", + "inner_model": "claude-sonnet-4-6", + "outer_model": "claude-sonnet-4-6", + "outer_prompt": "outer-default", + "max_iters": 5, + "holdout_runs": 3 +} diff --git a/skills/autobrowse/evals/eval/conditions/pilot.json b/skills/autobrowse/evals/eval/conditions/pilot.json new file mode 100644 index 0000000..fc5280b --- /dev/null +++ b/skills/autobrowse/evals/eval/conditions/pilot.json @@ -0,0 +1,9 @@ +{ + "id": "pilot", + "notes": "Cheap smoke-test condition for validating the real (non-mock) pipeline: 2 training iterations max, 1 holdout run.", + "inner_model": "claude-sonnet-4-6", + "outer_model": "claude-opus-4-8", + "outer_prompt": 
// Locate the autobrowse skill checkout: the first candidate directory that
// contains scripts/evaluate.mjs wins. Candidates are checked in the order they
// appear in AUTOBROWSE_CANDIDATES. Throws when none of them qualifies.
export function resolveAutobrowseDir() {
  const hasEvaluateScript = (candidate) =>
    fs.existsSync(path.join(candidate, "scripts", "evaluate.mjs"));

  const match = AUTOBROWSE_CANDIDATES.find(hasEvaluateScript);
  if (match === undefined) {
    throw new Error(
      "autobrowse skill not found. Set AUTOBROWSE_DIR to the directory containing scripts/evaluate.mjs " +
        "(e.g. a checkout of github.com/browserbase/skills at skills/autobrowse)."
    );
  }
  return match;
}
// Pull the final JSON value out of free-form agent text.
//
// Fenced code blocks are tried from the last one backwards and the first one
// that parses as JSON wins; if no fence parses, the span from the first "{" to
// the last "}" is tried as a bare object. Returns the parsed value, or null
// when the text is empty or nothing parses.
//
// Trying earlier fences matters because the fence pattern accepts any
// language tag: a trailing ```bash / ```text block would otherwise hide a
// valid ```json answer that precedes it.
export function extractJsonFromText(text) {
  if (!text) return null;

  // undefined = "did not parse"; JSON.parse itself can legitimately yield null.
  const tryParse = (candidate) => {
    if (!candidate) return undefined;
    try {
      return JSON.parse(candidate);
    } catch {
      return undefined;
    }
  };

  const fences = [...text.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi)];
  for (let i = fences.length - 1; i >= 0; i--) {
    const parsed = tryParse(fences[i][1].trim());
    if (parsed !== undefined) return parsed;
  }

  const first = text.indexOf("{");
  const last = text.lastIndexOf("}");
  if (first !== -1 && last > first) {
    const parsed = tryParse(text.slice(first, last + 1));
    if (parsed !== undefined) return parsed;
  }
  return null;
}
// Return the inner agent's final structured output for one run directory.
// Preference order:
//   1. result.json — its `parsed` field when present, otherwise the whole
//      document;
//   2. summary.md — JSON extracted from the "## Agent Final Output" section
//      (or from the whole file when that heading is absent).
// Returns null when neither source yields a truthy value.
export function loadRunOutput(runDir) {
  const resultFile = path.join(runDir, "result.json");
  const summaryFile = path.join(runDir, "summary.md");

  if (fs.existsSync(resultFile)) {
    try {
      const payload = JSON.parse(fs.readFileSync(resultFile, "utf-8"));
      return payload && "parsed" in payload ? payload.parsed : payload;
    } catch {
      // unreadable or malformed result.json — try summary.md instead
    }
  }

  if (!fs.existsSync(summaryFile)) return null;

  const markdown = fs.readFileSync(summaryFile, "utf-8");
  const headingAt = markdown.indexOf("## Agent Final Output");
  const section = headingAt === -1 ? markdown : markdown.slice(headingAt);
  return extractJsonFromText(section) || null;
}
// Read every record from a results.jsonl file.
//
// Returns [] when the file does not exist. Blank and whitespace-only lines
// (including the "\r" left behind by CRLF line endings) are skipped. A line
// that is not valid JSON throws a SyntaxError whose message carries the file
// path and 1-based line number, with the original parse error as `cause`.
export function readResults(file = RESULTS_FILE) {
  if (!fs.existsSync(file)) return [];

  const records = [];
  const lines = fs.readFileSync(file, "utf-8").split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (line === "") continue;
    try {
      records.push(JSON.parse(line));
    } catch (err) {
      throw new SyntaxError(`${file}:${i + 1}: malformed results record (${err.message})`, { cause: err });
    }
  }
  return records;
}
// Create a verified + proxied keep-alive Browserbase session through the
// browse CLI and return { id, connectUrl }.
//
// Throws an Error naming the failing CLI step when the command cannot be
// spawned or times out, prints something other than JSON, or omits the
// expected field. If the session was created but its connectUrl cannot be
// obtained, the session is released before rethrowing so the keep-alive
// session is not left running.
function createCloudSession() {
  // Run one `browse cloud sessions …` subcommand and parse its JSON stdout.
  const browseJson = (step, args, timeout) => {
    const res = spawnSync("browse", ["cloud", "sessions", ...args], { encoding: "utf-8", timeout });
    // spawnSync reports spawn failures and timeouts via res.error, not by throwing.
    if (res.error) {
      throw new Error(`browse cloud sessions ${step}: ${res.error.message}`, { cause: res.error });
    }
    try {
      return JSON.parse(res.stdout);
    } catch (err) {
      const stderr = (res.stderr || "").trim().slice(0, 300);
      throw new Error(
        `browse cloud sessions ${step}: non-JSON output (exit=${res.status}) ${stderr}`,
        { cause: err }
      );
    }
  };

  const id = browseJson("create", ["create", "--keep-alive", "--verified", "--proxies"], 120_000)?.id;
  if (!id) throw new Error("browse cloud sessions create: response has no session id");

  try {
    const connectUrl = browseJson("get", ["get", id], 60_000)?.connectUrl;
    if (!connectUrl) throw new Error(`browse cloud sessions get: session ${id} has no connectUrl`);
    return { id, connectUrl };
  } catch (err) {
    // The session exists at this point; hand it back before surfacing the error.
    releaseCloudSession(id);
    throw err;
  }
}
// Run one inner-agent attempt (evaluate.mjs) and return its structured
// result: {status, stop_reason, duration_sec, turns, tokens_in, tokens_out,
// trace_dir, ...}. stderr (the live decision log) is teed to a log file.
//
// Parameters (single options object):
//   task, workspace, env, model — forwarded to evaluate.mjs as --task,
//     --workspace, --env and --model.
//   maxTurns   — exported to the child as MAX_TURNS (30 when null/undefined).
//   timeoutMin — child process timeout in minutes (20 when null/undefined).
//   mock       — when truthy, delegates to runInnerMock and spawns nothing.
//   iter       — only passed through to runInnerMock.
//   logFile    — optional path; receives the child's stderr followed by its
//                stdout.
//
// Returns the last stdout line that parses as JSON and carries a truthy
// trace_dir. If there is no such line, or the cloud session could not be
// created, returns a {status: "harness_error", ...} record instead.
export function runInner({ task, workspace, env, model, maxTurns, timeoutMin, mock, iter, logFile }) {
  if (mock) return runInnerMock({ task, workspace, iter });

  const autobrowseDir = resolveAutobrowseDir();
  let cloud = null;
  let shimDir = null;

  if (env === "remote") {
    // Remote isolation + bot-protection: pre-create a verified+proxied
    // Browserbase session and hand its connectUrl to evaluate.mjs, which
    // rewrites every inner browse call to attach via --cdp with a
    // connectUrl-hashed daemon session name. Concurrent runs never collide,
    // and plain `browse open --remote` (which can't request --verified /
    // --proxies and gets Akamai-walled on e.g. OpenTable) is bypassed.
    try {
      cloud = createCloudSession();
    } catch (err) {
      return {
        status: "harness_error",
        stop_reason: `cloud session create failed: ${err.message}`,
        duration_sec: null, turns: null, tokens_in: 0, tokens_out: 0, trace_dir: null,
      };
    }
  } else {
    // Local runs: stop any existing daemon first, then install the PATH shim.
    browseStop();
    shimDir = makeBrowseShim(workspace, env);
    // Pre-warm the local daemon: Chrome cold-start can exceed evaluate.mjs's
    // 30s exec timeout, which kills the agent's first `browse open`
    // mid-handshake and strands the session (~20 wasted turns recovering).
    spawnSync("browse", ["open", "about:blank", "--local", "--timeout", "90000"], {
      encoding: "utf-8",
      timeout: 120_000,
    });
  }

  const args = [
    path.join(autobrowseDir, "scripts", "evaluate.mjs"),
    "--task", task,
    "--workspace", workspace,
    "--env", env,
    "--model", model,
    // Only remote runs have a pre-created session to attach to.
    ...(cloud ? ["--connect-url", cloud.connectUrl] : []),
  ];
  let res;
  try {
    res = spawnSync("node", args, {
      encoding: "utf-8",
      timeout: (timeoutMin ?? 20) * 60 * 1000,
      maxBuffer: 32 * 1024 * 1024,
      env: {
        ...process.env,
        MAX_TURNS: String(maxTurns ?? 30),
        // Put the shim directory first so the child's `browse` resolves to it.
        ...(shimDir ? { PATH: `${shimDir}:${process.env.PATH}` } : {}),
      },
    });
  } finally {
    // Release the cloud session whether or not the spawn call threw.
    if (cloud) releaseCloudSession(cloud.id);
  }

  if (logFile) {
    fs.mkdirSync(path.dirname(logFile), { recursive: true });
    fs.writeFileSync(logFile, (res.stderr || "") + "\n--- stdout ---\n" + (res.stdout || ""));
  }

  // evaluate.mjs prints exactly one JSON line on stdout (diagnostics → stderr).
  // Scan from the end so stray earlier output cannot shadow the result line.
  const lines = (res.stdout || "").trim().split("\n").filter(Boolean);
  for (let i = lines.length - 1; i >= 0; i--) {
    try {
      const parsed = JSON.parse(lines[i]);
      if (parsed && parsed.trace_dir) return parsed;
    } catch {
      /* keep scanning */
    }
  }
  // No usable result line: report the spawn error if there was one, else the exit code.
  return {
    status: "harness_error",
    stop_reason: res.error ? String(res.error) : `exit=${res.status}`,
    duration_sec: null, turns: null, tokens_in: 0, tokens_out: 0, trace_dir: null,
  };
}
// Mock replacement for an evaluate.mjs run: no browser, no API calls.
//
// Allocates the next run-NNN directory under <workspace>/traces/<task>, seeds
// <workspace>/tasks/<task>/strategy.md if it is missing, and "passes" only
// when that strategy contains the marker "MOCK-FIX". A passing run emits the
// task's mock-output.json; a failing run emits fabricated output that still
// claims success. Writes trace.json, result.json and summary.md into the run
// directory and returns a run record that includes `mock: true`.
function runInnerMock({ task, workspace, iter }) {
  const taskTraces = path.join(workspace, "traces", task);
  fs.mkdirSync(taskTraces, { recursive: true });
  const priorRuns = fs.readdirSync(taskTraces).filter((name) => name.startsWith("run-"));
  const runId = `run-${String(priorRuns.length + 1).padStart(3, "0")}`;
  const traceDir = path.join(taskTraces, runId);
  fs.mkdirSync(traceDir, { recursive: true });

  const strategyPath = path.join(workspace, "tasks", task, "strategy.md");
  fs.mkdirSync(path.dirname(strategyPath), { recursive: true });
  if (!fs.existsSync(strategyPath)) fs.writeFileSync(strategyPath, `# ${task} Navigation Skill\n`);
  const passes = fs.readFileSync(strategyPath, "utf-8").includes("MOCK-FIX");

  const output = passes
    ? JSON.parse(fs.readFileSync(path.join(TASKS_DIR, task, "mock-output.json"), "utf-8"))
    : { success: true, note: "fabricated-by-mock-failure", value: 42 };

  // Everything that differs between a passing and a failing mock run.
  const outcome = passes
    ? { status: "completed", stop_reason: "end_turn", duration_sec: 45.0, turns: 9, tokens_in: 40_000, tokens_out: 2_000 }
    : { status: "max_turns", stop_reason: "max_turns", duration_sec: 210.0, turns: 24, tokens_in: 140_000, tokens_out: 7_000 };

  // At most six turns are recorded; a failing run flags an error on turn 4.
  const recordedTurns = Math.min(outcome.turns, 6);
  const trace = Array.from({ length: recordedTurns }, (_, index) => index + 1).flatMap((turn) => [
    { turn, role: "assistant", tool_name: "execute", tool_input: { command: "browse snapshot" } },
    {
      turn,
      role: "tool_result",
      command: "browse snapshot",
      output: "[0-1] mock",
      error: !passes && turn === 4,
      duration_ms: 800 + turn * 120,
    },
  ]);

  const resultDoc = { parsed: output, raw: JSON.stringify(output), parse_error: null };
  const summaryMd = `# ${task} — ${runId} (MOCK)\n\n**Status:** ${outcome.status}\n\n## Agent Final Output\n\n\`\`\`json\n${JSON.stringify(output, null, 2)}\n\`\`\`\n`;

  fs.writeFileSync(path.join(traceDir, "trace.json"), JSON.stringify(trace, null, 2));
  fs.writeFileSync(path.join(traceDir, "result.json"), JSON.stringify(resultDoc, null, 2));
  fs.writeFileSync(path.join(traceDir, "summary.md"), summaryMd);

  return {
    task,
    run: runId,
    status: outcome.status,
    stop_reason: outcome.stop_reason,
    duration_sec: outcome.duration_sec,
    turns: outcome.turns,
    tokens_in: outcome.tokens_in,
    tokens_out: outcome.tokens_out,
    trace_dir: traceDir,
    mock: true,
  };
}
// Split a run's wall clock into browser time (sum of browse-CLI command
// durations recorded in trace.json) and model time (the remainder).

/**
 * Summarise tool activity recorded in `<runDir>/trace.json`.
 *
 * @param {string} runDir - Run directory expected to contain trace.json.
 * @param {number|null|undefined} durationSec - Wall-clock duration of the run;
 *   when nullish, `model_ms` is left null.
 * @returns {{browser_ms: number|null, model_ms: number|null, tool_errors: number, tool_calls: number}}
 *   All-null/zero stats when the trace is missing, unparseable, or not an array.
 */
export function traceStats(runDir, durationSec) {
  const out = { browser_ms: null, model_ms: null, tool_errors: 0, tool_calls: 0 };

  let trace;
  try {
    trace = JSON.parse(fs.readFileSync(path.join(runDir, "trace.json"), "utf-8"));
  } catch {
    return out; // missing or malformed trace: leave nulls
  }
  if (!Array.isArray(trace)) return out;

  let browserMs = 0;
  for (const entry of trace) {
    // Optional chaining: a null/primitive entry is skipped instead of
    // aborting the scan with the counters half-updated.
    if (entry?.role !== "tool_result") continue;
    out.tool_calls++;
    // Coerce before adding: `+=` on a string duration would concatenate.
    const duration = Number(entry.duration_ms);
    if (Number.isFinite(duration)) browserMs += duration;
    if (entry.error) out.tool_errors++;
  }

  out.browser_ms = browserMs;
  if (durationSec != null) {
    // Clamp at 0: recorded tool time can exceed the reported wall clock.
    out.model_ms = Math.max(0, Math.round(durationSec * 1000 - browserMs));
  }
  return out;
}
// JSON schema for the outer model's structured output.
const STRATEGY_SCHEMA = {
  type: "object",
  properties: {
    diagnosis: {
      type: "string",
      description: "What went wrong (or what is fragile), citing specific turns/errors from the evidence.",
    },
    hypothesis: {
      type: "string",
      description: "The ONE change being tested this iteration and why it should fix the diagnosis.",
    },
    new_strategy: {
      type: "string",
      description: "The complete new strategy.md file content.",
    },
  },
  required: ["diagnosis", "hypothesis", "new_strategy"],
  additionalProperties: false,
};

// Truncate `s` to `n` chars, appending a marker with the number of dropped
// chars; a nullish/empty `s` becomes "".
const clip = (s, n) => (s && s.length > n ? s.slice(0, n) + `\n...[clipped ${s.length - n} chars]` : s || "");

/**
 * Collect one-line descriptions of failed tool calls from `<traceDir>/trace.json`.
 *
 * @param {string} traceDir - Directory containing trace.json.
 * @param {number} [max=15] - Keep only the last `max` failures.
 * @returns {string[]} Failure lines; empty when the trace is missing or unreadable.
 */
function collectErrorLines(traceDir, max = 15) {
  try {
    const trace = JSON.parse(fs.readFileSync(path.join(traceDir, "trace.json"), "utf-8"));
    return trace
      .filter((e) => e.role === "tool_result" && e.error)
      .map((e) => `turn ${e.turn}: ${e.command} → ${String(e.output).slice(0, 200)}`)
      .slice(-max);
  } catch {
    return [];
  }
}

/**
 * Parse and validate the outer model's JSON reply.
 *
 * @param {string} text - Concatenated text blocks of the model response.
 * @returns {{diagnosis: string, hypothesis: string, new_strategy: string}}
 * @throws {Error} When the text is not JSON, or `new_strategy` is missing,
 *   not a string, or blank — so an invalid reply surfaces as an error instead
 *   of being returned as a strategy.
 */
function parseStrategyOutput(text) {
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch (err) {
    throw new Error(`outer agent returned unparseable output: ${err.message}: ${text.slice(0, 200)}`, { cause: err });
  }
  if (typeof parsed?.new_strategy !== "string" || parsed.new_strategy.trim() === "") {
    throw new Error("outer agent returned an empty or non-string new_strategy");
  }
  return parsed;
}

// Created lazily so mock runs never construct an API client.
let client = null;

/**
 * Ask the outer model for an improved strategy.md based on one run's evidence.
 *
 * @param {object} args
 * @param {string} args.model - Outer model id.
 * @param {string} args.promptName - Basename (without .md) of the system prompt in PROMPTS_DIR.
 * @param {string} args.taskMd - task.md content.
 * @param {string} args.strategyMd - Current strategy.md content.
 * @param {object} args.runResult - Inner run result (reads trace_dir, status, stop_reason, turns, duration_sec).
 * @param {object} args.verifierResult - Verifier verdict, embedded verbatim as JSON.
 * @param {boolean} args.mock - When truthy, return the scripted mock improvement without calling the API.
 * @param {number} args.iter - Training iteration (forwarded to the mock).
 * @returns {Promise<{diagnosis: string, hypothesis: string, newStrategy: string, tokens_in: number, tokens_out: number, cost_usd: number}>}
 * @throws {Error} When the model reply is unparseable or lacks a usable new_strategy.
 */
export async function improveStrategy({ model, promptName, taskMd, strategyMd, runResult, verifierResult, mock, iter }) {
  if (mock) return improveStrategyMock({ strategyMd, iter });

  client ??= new Anthropic();
  const systemPrompt = fs.readFileSync(path.join(PROMPTS_DIR, `${promptName}.md`), "utf-8");

  const traceDir = runResult.trace_dir;
  let summary = "";
  try {
    summary = fs.readFileSync(path.join(traceDir, "summary.md"), "utf-8");
  } catch { /* missing summary is survivable */ }
  const errors = collectErrorLines(traceDir);

  // Curated evidence pack; each section is clipped to bound the prompt size.
  const userMessage = [
    "# Task definition (task.md)\n", clip(taskMd, 4_000),
    "\n\n# Current strategy.md\n", clip(strategyMd, 8_000),
    "\n\n# Run evidence\n",
    `Status: ${runResult.status} (${runResult.stop_reason}) | Turns: ${runResult.turns} | Duration: ${runResult.duration_sec}s\n`,
    "\n## Verifier verdict (ground truth)\n```json\n", JSON.stringify(verifierResult, null, 2), "\n```\n",
    errors.length ? `\n## Failed commands\n${errors.join("\n")}\n` : "",
    "\n## Run summary (decision log + final output)\n", clip(summary, 14_000),
  ].join("");

  const response = await client.messages.create({
    model,
    max_tokens: 16000,
    thinking: { type: "adaptive" },
    system: systemPrompt,
    messages: [{ role: "user", content: userMessage }],
    output_config: { format: { type: "json_schema", schema: STRATEGY_SCHEMA } },
  });

  const text = response.content.filter((b) => b.type === "text").map((b) => b.text).join("");
  const parsed = parseStrategyOutput(text);

  const tokensIn = response.usage.input_tokens;
  const tokensOut = response.usage.output_tokens;
  return {
    diagnosis: parsed.diagnosis,
    hypothesis: parsed.hypothesis,
    newStrategy: parsed.new_strategy,
    tokens_in: tokensIn,
    tokens_out: tokensOut,
    cost_usd: costUsd(model, tokensIn, tokensOut),
  };
}

// Mock: 1st improvement adds a useless note (run still fails), 2nd adds the
// MOCK-FIX marker that flips the mock inner agent to passing.
function improveStrategyMock({ strategyMd, iter }) {
  const hasNote = strategyMd.includes("mock-note");
  const addition = hasNote
    ? "\n## MOCK-FIX\nApply the fix that makes mock runs pass.\n"
    : "\n## mock-note\nFirst hypothesis: wait longer. (mock — does not help)\n";
  return {
    diagnosis: "mock diagnosis",
    hypothesis: hasNote ? "add MOCK-FIX marker" : "add wait (will not help)",
    newStrategy: strategyMd + addition,
    tokens_in: 12_000,
    tokens_out: 1_500,
    cost_usd: costUsd("claude-opus-4-8", 12_000, 1_500),
  };
}
**The verifier verdict is ground truth.** If the inner agent claimed success but the verifier failed specific checks, the strategy must address WHY the agent extracted or did the wrong thing (wrong element, wrong filter, fabricated data) — and instruct it to verify before claiming success. +6. **If the run passed**, make only conservative refinements: tighten the fast path, remove dead exploration steps, shorten. Do not restructure a working strategy. +7. The strategy must work for a FRESH agent with no memory of previous runs. Write self-contained instructions, not commentary about past iterations. + +Return your full rewritten strategy.md — the complete file content, not a diff. diff --git a/skills/autobrowse/evals/eval/prompts/outer-lean.md b/skills/autobrowse/evals/eval/prompts/outer-lean.md new file mode 100644 index 0000000..2987596 --- /dev/null +++ b/skills/autobrowse/evals/eval/prompts/outer-lean.md @@ -0,0 +1 @@ +You improve instructions for a browser-automation agent. Below is the task, the current strategy.md, and what happened when an agent followed it (including an automated verifier's verdict). Figure out what went wrong and write a better strategy.md. Return the complete new file content. diff --git a/skills/autobrowse/evals/eval/report.mjs b/skills/autobrowse/evals/eval/report.mjs new file mode 100644 index 0000000..78d4bc9 Binary files /dev/null and b/skills/autobrowse/evals/eval/report.mjs differ diff --git a/skills/autobrowse/evals/eval/run-matrix.mjs b/skills/autobrowse/evals/eval/run-matrix.mjs new file mode 100644 index 0000000..175dc26 --- /dev/null +++ b/skills/autobrowse/evals/eval/run-matrix.mjs @@ -0,0 +1,251 @@ +#!/usr/bin/env node +// run-matrix.mjs — eval orchestrator: condition × task × trial. +// +// Per cell: TRAIN (evaluate → verify → improve strategy, up to max_iters, +// early-stop on convergence) then HOLDOUT (freeze best strategy, N fresh +// runs, verify each). Every run appends one row to runs/results.jsonl. 
// ── CLI args ────────────────────────────────────────────────────────

/**
 * Read the value following `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {*} fallback - Returned when the option is absent, is the last
 *   token, or is followed by another `--flag` rather than a value.
 * @returns {*} The option's value string, or `fallback`.
 */
function getArg(name, fallback) {
  const flagAt = process.argv.indexOf(`--${name}`);
  if (flagAt === -1) return fallback;

  const value = process.argv[flagAt + 1];
  if (!value) return fallback;
  if (value.startsWith("--")) return fallback;
  return value;
}

// True when the bare switch `--<name>` appears anywhere on the command line.
const hasFlag = (name) => process.argv.includes(`--${name}`);
// ── Fixture server (auto-start when a selected task needs it) ──────

const FIXTURE_PORT = 4173;
// Upper bound on waiting for a freshly spawned fixtures server to accept connections.
const FIXTURE_READY_TIMEOUT_MS = 5_000;
const FIXTURE_POLL_MS = 100;

/**
 * Probe whether something accepts TCP connections on 127.0.0.1:<port>.
 *
 * @param {number} port
 * @returns {Promise<boolean>} true on a successful connect, false on any socket error.
 */
function portInUse(port) {
  return new Promise((resolve) => {
    const sock = net.connect({ port, host: "127.0.0.1" }, () => { sock.destroy(); resolve(true); });
    sock.on("error", () => resolve(false));
  });
}

/**
 * Start the fixtures server when a selected task requires it and none is
 * already listening.
 *
 * @param {Array<{requires?: string[]}>} metas - Task metadata for the selected tasks.
 * @returns {Promise<import("node:child_process").ChildProcess|null>} The spawned
 *   child (for the caller to kill), or null in mock mode, when no task needs
 *   the server, or when the port is already in use.
 */
async function ensureFixtures(metas) {
  if (mock) return null;
  if (!metas.some((m) => (m.requires || []).includes("fixtures-server"))) return null;
  if (await portInUse(FIXTURE_PORT)) {
    console.error(`[matrix] fixtures server already running on :${FIXTURE_PORT}`);
    return null;
  }

  const child = spawn("node", [path.join(FIXTURES_DIR, "serve.mjs")], { stdio: "ignore", detached: false });
  // Without a listener, a failed spawn emits an unhandled 'error' event.
  let spawnError = null;
  child.on("error", (err) => { spawnError = err; });

  // Poll until the server actually accepts connections instead of sleeping a
  // fixed interval and hoping it is up.
  const deadline = Date.now() + FIXTURE_READY_TIMEOUT_MS;
  let ready = false;
  while (!spawnError && child.exitCode === null && child.signalCode === null && Date.now() < deadline) {
    if (await portInUse(FIXTURE_PORT)) {
      ready = true;
      break;
    }
    await new Promise((r) => setTimeout(r, FIXTURE_POLL_MS));
  }

  if (ready) {
    console.error(`[matrix] started fixtures server on :${FIXTURE_PORT} (pid ${child.pid})`);
  } else {
    const why = spawnError ? spawnError.message : `not listening after ${FIXTURE_READY_TIMEOUT_MS}ms or exited early`;
    console.error(`[matrix] WARNING: fixtures server on :${FIXTURE_PORT} is not ready (${why})`);
  }
  // Returned even when not ready so the caller's cleanup can still kill it.
  return child;
}
loadRunOutput(runResult.trace_dir) : null; + const claimed = output?.success === true; + const stats = runResult.trace_dir ? traceStats(runResult.trace_dir, runResult.duration_sec) : {}; + const row = { + ...base, + phase: phaseName, iter, + run_id: runResult.run ?? null, + verified_pass: verifierResult.passed, + claimed_success: claimed, + false_success: claimed && !verifierResult.passed, + verifier_reason: verifierResult.reason ?? null, + status: runResult.status, stop_reason: runResult.stop_reason, + turns: runResult.turns, duration_sec: runResult.duration_sec, + ...stats, + tokens_in: runResult.tokens_in, tokens_out: runResult.tokens_out, + inner_cost_usd: +costUsd(cond.inner_model, runResult.tokens_in || 0, runResult.tokens_out || 0).toFixed(4), + ...extra, + }; + appendResult(row, resultsFile); + return row; + }; + + // ── TRAIN ───────────────────────────────────────────────────────── + const trainPasses = []; + let lastPassingStrategyIter = null; + + if (phase !== "holdout") { + for (let iter = 1; iter <= cond.max_iters; iter++) { + // Snapshot the strategy this run will use (versioned for revert/holdout). + fs.copyFileSync(strategyFile, path.join(wsTaskDir, `strategy.iter-${iter}.md`)); + + console.error(`[matrix] ${cond.id}/${task}/trial-${trial} TRAIN iter ${iter}/${cond.max_iters}`); + const runResult = runInner({ + task, workspace, env: meta.env, model: cond.inner_model, + maxTurns: meta.max_turns, timeoutMin: meta.timeout_min, mock, iter, + logFile: path.join(workspace, "logs", `train-iter-${iter}.log`), + }); + const verifierResult = runResult.trace_dir + ? 
runVerifier(task, runResult.trace_dir) + : { passed: false, checks: [], reason: "no trace dir (harness error)" }; + + trainPasses.push(verifierResult.passed); + if (verifierResult.passed) lastPassingStrategyIter = iter; + + const window = trainPasses.slice(-cond.converge_window); + const converged = + verifierResult.passed && + window.filter(Boolean).length >= cond.converge_passes && + trainPasses.length >= 2; + + const regression = iter > 1 && trainPasses[iter - 2] === true && verifierResult.passed === false; + + let improvement = null; + if (!converged && iter < cond.max_iters) { + // Revert a regressing edit before improving again (SKILL.md policy). + if (regression) { + fs.copyFileSync(path.join(wsTaskDir, `strategy.iter-${iter - 1}.md`), strategyFile); + console.error(`[matrix] regression — reverted strategy to iter ${iter - 1}`); + } + const strategyMd = fs.readFileSync(strategyFile, "utf-8"); + try { + improvement = await improveStrategy({ + model: cond.outer_model, promptName: cond.outer_prompt, + taskMd, strategyMd, runResult, verifierResult, mock, iter, + }); + fs.writeFileSync(strategyFile, improvement.newStrategy); + } catch (err) { + console.error(`[matrix] outer agent error: ${err.message}`); + improvement = { hypothesis: `OUTER-AGENT-ERROR: ${err.message}`, tokens_in: 0, tokens_out: 0, cost_usd: 0 }; + } + } + + recordRun("train", iter, runResult, verifierResult, { + regression, + converged_at: converged ? iter : null, + hypothesis: improvement?.hypothesis ?? null, + outer_tokens_in: improvement?.tokens_in ?? 0, + outer_tokens_out: improvement?.tokens_out ?? 0, + outer_cost_usd: improvement ? +improvement.cost_usd.toFixed(4) : 0, + }); + + if (converged) { + console.error(`[matrix] converged at iter ${iter}`); + break; + } + } + } + + // ── HOLDOUT ─────────────────────────────────────────────────────── + if (phase !== "train") { + // Freeze the best strategy: the last version that produced a verified + // pass; else whatever training ended with. 
+ if (lastPassingStrategyIter !== null) { + const best = path.join(wsTaskDir, `strategy.iter-${lastPassingStrategyIter}.md`); + // The passing run used the strategy *as snapshotted before that run*, + // unless it was also improved after — the snapshot is the right artifact. + fs.copyFileSync(best, strategyFile); + } + fs.copyFileSync(strategyFile, path.join(wsTaskDir, "strategy.holdout.md")); + + for (let h = 1; h <= cond.holdout_runs; h++) { + console.error(`[matrix] ${cond.id}/${task}/trial-${trial} HOLDOUT ${h}/${cond.holdout_runs}`); + const runResult = runInner({ + task, workspace, env: meta.env, model: cond.inner_model, + maxTurns: meta.max_turns, timeoutMin: meta.timeout_min, mock, iter: 99, + logFile: path.join(workspace, "logs", `holdout-${h}.log`), + }); + const verifierResult = runResult.trace_dir + ? runVerifier(task, runResult.trace_dir) + : { passed: false, checks: [], reason: "no trace dir (harness error)" }; + recordRun("holdout", h, runResult, verifierResult); + } + } +} + +// ── Main ──────────────────────────────────────────────────────────── + +const conditions = conditionIds.map(loadCondition); +const metas = tasks.map(loadTaskMeta); +const fixturesChild = await ensureFixtures(metas); + +console.error(`[matrix] ${conditions.length} condition(s) × ${tasks.length} task(s) × ${trials} trial(s)${mock ? " [MOCK]" : ""}`); +console.error(`[matrix] results → ${resultsFile}`); + +try { + const cells = []; + for (const cond of conditions) { + for (const meta of metas) { + for (let trial = trialOffset + 1; trial <= trialOffset + trials; trial++) { + cells.push({ cond, meta, trial }); + } + } + } + + // Concurrency: remote cells each get their own pre-created Browserbase + // session (isolated CDP attach), so they can run in parallel. Local mode is + // a single Chrome daemon — force sequential when any local task is selected. 
// Shared verifier toolkit. Every task's verify.mjs follows the protocol:
//   node verify.mjs --run-dir <dir>
//   → prints one JSON line {passed, checks: [{name, ok, detail}], reason}
// Pass/fail lives in the JSON; a nonzero exit means the verifier itself broke.

/**
 * Resolve the `--run-dir` argument to an absolute path.
 * Prints usage and exits with status 1 when the argument is missing.
 *
 * @returns {string}
 */
export function getRunDir() {
  const idx = process.argv.indexOf("--run-dir");
  if (idx === -1 || !process.argv[idx + 1]) {
    console.error("usage: node verify.mjs --run-dir <dir>");
    process.exit(1);
  }
  return path.resolve(process.argv[idx + 1]);
}

// Thin alias over loadRunOutput so verifiers import everything from one module.
export function loadOutput(runDir) {
  return loadRunOutput(runDir);
}

// ── Check builders ──────────────────────────────────────────────────

/**
 * Normalise text for fuzzy comparison: lowercase, fold accents, turn
 * punctuation into spaces, collapse whitespace. Nullish input becomes "".
 *
 * @param {*} s
 * @returns {string}
 */
export const norm = (s) =>
  String(s ?? "")
    .toLowerCase()
    .normalize("NFKD")
    // Drop the combining marks NFKD split off, so "é" folds to "e" instead of
    // being turned into a space by the punctuation pass ("he llo").
    .replace(/\p{M}+/gu, "")
    .replace(/[^\w\s]/g, " ")
    .replace(/\s+/g, " ")
    .trim();

/**
 * Build one check record.
 *
 * @param {string} name
 * @param {*} ok - Coerced to boolean.
 * @param {*} [detail=""] - Stringified and capped at 300 chars.
 * @returns {{name: string, ok: boolean, detail: string}}
 */
export function check(name, ok, detail = "") {
  return { name, ok: !!ok, detail: String(detail).slice(0, 300) };
}

/**
 * Pass when the normalised actual contains the normalised expected, or when
 * actual is a substantial fragment of expected.
 *
 * @param {string} name
 * @param {*} actual
 * @param {*} expected
 * @param {{minCoverage?: number}} [opts] - Minimum fraction of expected's
 *   length that actual must reach when it is only a fragment of expected.
 *   Guards against a one-word or one-letter answer passing; 0 disables the guard.
 */
export function checkFuzzyMatch(name, actual, expected, { minCoverage = 0.5 } = {}) {
  const a = norm(actual);
  const e = norm(expected);
  let ok = false;
  if (a && e) {
    if (a.includes(e)) ok = true;
    else if (e.includes(a)) ok = a.length >= e.length * minCoverage;
  }
  return check(name, ok, `actual="${actual}" expected≈"${expected}"`);
}

// Pass when the normalised haystack contains the normalised needle.
export function checkContains(name, haystack, needle) {
  return check(name, norm(haystack).includes(norm(needle)), `looking for "${needle}"`);
}

/**
 * Numeric check. Strings are stripped of everything except digits, ".", and
 * "-" before parsing (so "£23.21" parses as 23.21).
 *
 * @param {string} name
 * @param {*} value
 * @param {{eq?: number, min?: number, max?: number}} [bounds] - `eq` compares
 *   within ±0.005 and takes precedence; otherwise an inclusive [min, max] range.
 */
export function checkNumber(name, value, { eq, min, max } = {}) {
  const n = typeof value === "string" ? parseFloat(value.replace(/[^0-9.\-]/g, "")) : value;
  if (typeof n !== "number" || !Number.isFinite(n)) return check(name, false, `not a number: ${JSON.stringify(value)}`);
  if (eq !== undefined) return check(name, Math.abs(n - eq) < 0.005, `got ${n}, expected ${eq}`);
  const ok = (min === undefined || n >= min) && (max === undefined || n <= max);
  return check(name, ok, `got ${n}, expected [${min ?? "-∞"}, ${max ?? "∞"}]`);
}

// Pass when the value starts with an H:MM / HH:MM 24-hour time. Trailing text
// such as " AM" or ":30" is allowed; a third minute digit ("07:055") is not.
export function checkTime(name, value) {
  return check(name, /^([01]?\d|2[0-3]):[0-5]\d(?!\d)/.test(String(value ?? "").trim()), `got ${JSON.stringify(value)}`);
}

// ── Emit ────────────────────────────────────────────────────────────

/**
 * Print the verdict JSON line and exit 0.
 *
 * @param {Array<{name: string, ok: boolean, detail: string}>} checks
 * @param {{requireAll?: boolean}} [opts] - When true (default) every check
 *   must pass; when false at least one must. An empty check list never passes.
 */
export function emit(checks, { requireAll = true } = {}) {
  const failed = checks.filter((c) => !c.ok);
  // Zero checks prove nothing; do not let an empty list pass vacuously.
  const passed = checks.length > 0 && (requireAll ? failed.length === 0 : failed.length < checks.length);
  let reason;
  if (passed) reason = "all checks passed";
  else if (checks.length === 0) reason = "no checks were run";
  else reason = failed.map((c) => `${c.name}: ${c.detail}`).join("; ");
  console.log(JSON.stringify({ passed, checks, reason }));
  process.exit(0);
}

// Emit the standard failing verdict for a run with no parseable output; exits 0.
export function emitNoOutput() {
  console.log(JSON.stringify({ passed: false, checks: [], reason: "no parseable final JSON output in run" }));
  process.exit(0);
}

// Deterministic checkout-fixture confirmation code — must match the
// implementation in fixtures/checkout/index.html exactly.
export function checkoutCode(name, email, zip, shipping) {
  const s = `${name}|${email}|${zip}|${shipping}`.toLowerCase();
  let sum = 0;
  for (const ch of s) sum = (sum * 31 + ch.codePointAt(0)) % 100000;
  return `BB-${String(sum).padStart(5, "0")}`;
}
// Verifier for the books-toscrape task: compares the run's final JSON output
// against fixed ground truth for the Travel category.
const out = loadOutput(getRunDir());
if (!out) emitNoOutput();

// Ground truth fetched 2026-06-09 — books.toscrape.com is a static demo site
// that has not changed in years. 11 Travel books; cheapest "The Road to
// Little Dribbling" £23.21; most expensive "A Year in Provence" £56.88.
const TRAVEL_BOOK_COUNT = 11;
const CHEAPEST = { title: "The Road to Little Dribbling", price: 23.21 };
const PRICIEST = { title: "A Year in Provence", price: 56.88 };

const checks = [];
checks.push(check("claimed success", out.success === true, JSON.stringify(out.success)));
checks.push(checkNumber("count", out.count, { eq: TRAVEL_BOOK_COUNT }));
checks.push(checkFuzzyMatch("cheapest title", out.cheapest?.title, CHEAPEST.title));
checks.push(checkNumber("cheapest price", out.cheapest?.price_gbp, { eq: CHEAPEST.price }));
checks.push(checkFuzzyMatch("most expensive title", out.most_expensive?.title, PRICIEST.title));
checks.push(checkNumber("most expensive price", out.most_expensive?.price_gbp, { eq: PRICIEST.price }));
emit(checks);
// Verifier for the fixture-checkout task: the confirmation code is recomputed
// from the task inputs, so it cannot be guessed without completing the flow.
const out = loadOutput(getRunDir());
if (!out) emitNoOutput();

const expectedCode = checkoutCode("Ada Lovelace", "ada@example.com", "94107", "express");

const reportedCode = String(out.confirmation_code).trim();
const reportedShipping = String(out.shipping).toLowerCase();

const checks = [
  check("claimed success", out.success === true, JSON.stringify(out.success)),
  check("confirmation code", reportedCode === expectedCode, `got ${out.confirmation_code}, expected ${expectedCode}`),
  checkNumber("total", out.total_usd, { eq: 47.48 }),
  check("shipping", reportedShipping === "express", JSON.stringify(out.shipping)),
];
emit(checks);
b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/meta.json new file mode 100644 index 0000000..6b2ffda --- /dev/null +++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/meta.json @@ -0,0 +1,9 @@ +{ + "tier": "A", + "category": "search-filter-extract", + "env": "local", + "max_turns": 25, + "timeout_min": 15, + "requires": ["fixtures-server"], + "gotchas": "results render 700ms after Search; default sort is by departure time, not price; cheaper one-stop and cheaper wrong-route flights are deliberate traps" +} diff --git a/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/mock-output.json b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/mock-output.json new file mode 100644 index 0000000..c7a2306 --- /dev/null +++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/mock-output.json @@ -0,0 +1,9 @@ +{ + "success": true, + "airline": "Meridian Air", + "flight_number": "MA 214", + "price_usd": 218, + "depart_time": "07:05", + "nonstop": true, + "error_reasoning": null +} diff --git a/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/task.md b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/task.md new file mode 100644 index 0000000..d25363c --- /dev/null +++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/task.md @@ -0,0 +1,40 @@ +# Task: Find the cheapest nonstop SFO → JFK flight on FlightDeck + +Search the local FlightDeck fixture for flights from SFO to JFK and return the cheapest NONSTOP option. + +## URL + +http://localhost:4173/flightdeck/ + +## Inputs + +- From: SFO +- To: JFK +- Constraint: nonstop flights only + +## Steps + +1. Navigate to the URL +2. Select SFO as origin and JFK as destination +3. Restrict results to nonstop flights (the "Nonstop only" checkbox, or filter the results yourself) +4. Search and wait for results to load +5. 
// Verifier for the fixture-flightdeck task: compares the run's final JSON
// output against the flight seeded in the fixture.
const out = loadOutput(getRunDir());
if (!out) emitNoOutput();

// Ground truth seeded in fixtures/flightdeck/index.html: cheapest nonstop
// SFO→JFK is Meridian Air MA 214, $218, departing 07:05. Traps: a $189
// one-stop on the same route and a $149 nonstop SFO→BOS.

// Matches "07:05" or "7:05" only as a standalone time. A plain substring test
// for "7:05" also accepts "17:05"; the lookbehind rejects a preceding digit
// (or ":" — i.e. a minutes:seconds pair), the lookahead a trailing digit.
const DEPARTS_0705 = /(?<![\d:])0?7:05(?!\d)/;

emit([
  check("claimed success", out.success === true, JSON.stringify(out.success)),
  checkFuzzyMatch("airline", out.airline, "Meridian Air"),
  check("flight number", String(out.flight_number).replace(/\s+/g, "") === "MA214", JSON.stringify(out.flight_number)),
  checkNumber("price", out.price_usd, { eq: 218 }),
  check("depart time", DEPARTS_0705.test(String(out.depart_time)), JSON.stringify(out.depart_time)),
  check("nonstop", out.nonstop === true, JSON.stringify(out.nonstop)),
]);
mode 100644
index 0000000..035f994
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/google-flights/task.md
@@ -0,0 +1,41 @@
+# Task: Find cheapest nonstop SFO → JFK on Google Flights
+
+Search Google Flights for one-way nonstop flights from SFO to JFK on 2026-08-12 and return the cheapest options. Based on the browse.sh `google.com/search-flights` skill definition.
+
+## URL
+
+https://www.google.com/travel/flights
+
+## Inputs
+
+- From: SFO (San Francisco)
+- To: JFK (New York)
+- Date: 2026-08-12 (one-way)
+- Passengers: 1 adult, economy
+- Stops filter: nonstop only
+
+## Steps
+
+1. Navigate to Google Flights
+2. Set up the one-way search SFO → JFK on 2026-08-12
+3. Apply the "Nonstop only" stops filter
+4. Wait for results, then extract the top nonstop options sorted by price
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "date": "2026-08-12",
+  "flights": [
+    { "airline": "...", "depart_time": "HH:MM", "arrive_time": "HH:MM", "price_usd": 0, "nonstop": true }
+  ],
+  "cheapest_price_usd": 0,
+  "error_reasoning": null
+}
+```
+
+- Include at least the 3 cheapest nonstop options (fewer only if fewer exist)
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/google-flights/verify.mjs b/skills/autobrowse/evals/eval/tasks/google-flights/verify.mjs
new file mode 100644
index 0000000..f429a13
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/google-flights/verify.mjs
@@ -0,0 +1,40 @@
+#!/usr/bin/env node
+// Verifier for the google-flights task: checks invariants of the agent's
+// reported flight list rather than exact (drifting) prices.
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkNumber, checkTime } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Live site — verify invariants, not exact prices. SFO→JFK nonstop one-way
+// economy reliably exists and prices in a sane band.
+const flights = Array.isArray(out.flights) ? out.flights : [];
+const KNOWN_AIRLINES = ["alaska", "american", "delta", "jetblue", "united", "frontier", "hawaiian", "southwest", "spirit"];
+
+// Agent output is untrusted: list entries may be null or primitives, so every
+// field read goes through `f?.`. A malformed entry then fails its checks
+// instead of throwing before any result is emitted.
+const checks = [
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("date echoed", String(out.date).startsWith("2026-08-12"), JSON.stringify(out.date)),
+  check("≥1 flight", flights.length >= 1, `got ${flights.length}`),
+  check("all nonstop", flights.length > 0 && flights.every((f) => f?.nonstop === true), JSON.stringify(flights.map((f) => f?.nonstop))),
+  check(
+    "airlines plausible",
+    flights.length > 0 && flights.every((f) => KNOWN_AIRLINES.some((a) => String(f?.airline).toLowerCase().includes(a))),
+    flights.map((f) => f?.airline).join(", ")
+  ),
+  // Per-entry checks look at the first five flights only.
+  ...flights.slice(0, 5).map((f, i) => checkNumber(`flight[${i}] price band`, f?.price_usd, { min: 80, max: 1500 })),
+  ...flights.slice(0, 5).map((f, i) => checkTime(`flight[${i}] depart time`, f?.depart_time)),
+  checkNumber("cheapest price band", out.cheapest_price_usd, { min: 80, max: 1500 }),
+];
+
+if (flights.length > 0) {
+  // `min` is Infinity when no entry carries a finite price. The check requires
+  // cheapest ≤ list minimum (with 0.01 slack); it does not require equality.
+  const min = Math.min(...flights.map((f) => Number(f?.price_usd)).filter((n) => isFinite(n)));
+  checks.push(check("cheapest consistent with list", Number(out.cheapest_price_usd) <= min + 0.01, `cheapest=${out.cheapest_price_usd}, list min=${min}`));
+}
+
+emit(checks);
diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/meta.json b/skills/autobrowse/evals/eval/tasks/opentable-availability/meta.json
new file mode 100644
index 0000000..b8565ce
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "availability-widget",
+  "env": "remote",
+  "max_turns": 30,
+  "timeout_min": 25,
+  "gotchas": "Akamai bot wall on plain sessions — harness pre-creates verified+proxied Browserbase sessions (connect-url path); detail page accepts dateTime/covers query params (skips widget interaction); widget times render in 12h format —
convert; do not click slots (that starts a booking)" +} diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/mock-output.json b/skills/autobrowse/evals/eval/tasks/opentable-availability/mock-output.json new file mode 100644 index 0000000..e0e07e3 --- /dev/null +++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/mock-output.json @@ -0,0 +1,9 @@ +{ + "success": true, + "restaurant": "Arquet", + "date": "2026-08-15", + "party_size": 2, + "has_availability": true, + "slots": ["17:45", "18:00", "20:15", "21:00"], + "error_reasoning": null +} diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/task.md b/skills/autobrowse/evals/eval/tasks/opentable-availability/task.md new file mode 100644 index 0000000..87a600a --- /dev/null +++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/task.md @@ -0,0 +1,41 @@ +# Task: Check OpenTable availability at Arquet (San Francisco) + +Check OpenTable for available reservation time slots at Arquet in San Francisco for a party of 2 on 2026-08-15 around dinner time. Read-only — do not book. Based on the browse.sh `opentable.com/check-availability` skill definition. + +## URL + +https://www.opentable.com/r/arquet-san-francisco + +## Inputs + +- Restaurant: Arquet, San Francisco +- Date: 2026-08-15 (a Saturday) +- Party size: 2 +- Time window: dinner (17:00–21:30) + +## Steps + +1. Navigate to the restaurant's OpenTable page (the URL accepts query params for date/party size; using them is fine) +2. Set the date to 2026-08-15 and party size to 2 +3. Read the reservation widget's available time slots in the dinner window +4. 
Do NOT click any slot or book anything
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "restaurant": "Arquet",
+  "date": "2026-08-15",
+  "party_size": 2,
+  "has_availability": true,
+  "slots": ["18:00", "18:15"],
+  "error_reasoning": null
+}
+```
+
+- `has_availability: false` with an empty `slots` array is a VALID successful result (the restaurant may simply be booked)
+- Times in 24h HH:MM format
+- If task fails (couldn't load the widget at all): `success: false` with `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/verify.mjs b/skills/autobrowse/evals/eval/tasks/opentable-availability/verify.mjs
new file mode 100644
index 0000000..0f45e2b
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/verify.mjs
@@ -0,0 +1,27 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, checkTime } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Verifier for the opentable-availability task. Live availability changes run to run — verify structural
+// invariants and internal consistency (has_availability must agree with whether slots is empty), not specific slots.
+const slots = Array.isArray(out.slots) ? out.slots : null; // null rather than [] so a non-array value fails "slots is array"
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkContains("restaurant", out.restaurant, "arquet"),
+  check("date echoed", String(out.date).startsWith("2026-08-15"), JSON.stringify(out.date)),
+  check("party size", Number(out.party_size) === 2, JSON.stringify(out.party_size)),
+  check("slots is array", slots !== null, JSON.stringify(out.slots)),
+  check(
+    "availability consistent",
+    (out.has_availability === true && slots?.length > 0) || (out.has_availability === false && slots?.length === 0),
+    `has_availability=${out.has_availability}, slots=${slots?.length}`
+  ),
+  ...(slots ?? []).slice(0, 8).map((s, i) => checkTime(`slot[${i}] format`, s)), // only the first 8 slots are inspected
+  ...(slots ?? []).slice(0, 8).map((s, i) => {
+    const [h] = String(s).split(":").map(Number); // hour component only; minutes are ignored
+    return check(`slot[${i}] in dinner window`, h >= 16 && h <= 22, String(s)); // NOTE(review): accepts hours 16–22, wider than the 17:00–21:30 window in task.md — confirm the slack is intended
+  }),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/meta.json b/skills/autobrowse/evals/eval/tasks/stockx-price/meta.json
new file mode 100644
index 0000000..16dc497
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "C",
+  "category": "bot-protected-extraction",
+  "env": "remote",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "PerimeterX bot protection — requires Browserbase stealth/proxies; search results include many variants (pick exact style DZ5485-612); price shown depends on selected size"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/mock-output.json b/skills/autobrowse/evals/eval/tasks/stockx-price/mock-output.json
new file mode 100644
index 0000000..fc0dc4b
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/mock-output.json
@@ -0,0 +1,8 @@
+{
+  "success": true,
+  "product": "Air Jordan 1 Retro High OG Chicago Lost and Found",
+  "style_code": "DZ5485-612",
+  "last_sale_usd": 285,
+  "lowest_ask_usd": 297,
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/task.md b/skills/autobrowse/evals/eval/tasks/stockx-price/task.md
new file mode 100644
index 0000000..43458b1
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/task.md
@@ -0,0 +1,35 @@
+# Task: Get the StockX resale price for the Jordan 1 "Chicago Lost and Found"
+
+Look up the Air Jordan 1 Retro High OG "Chicago Lost and Found" on StockX and return its current market data. Read-only — never place a bid or buy. Based on the browse.sh `stockx.com/get-resale-price` skill definition.
+
+## URL
+
+https://stockx.com
+
+## Inputs
+
+- Product: Air Jordan 1 Retro High OG "Chicago Lost and Found" (style DZ5485-612)
+
+## Steps
+
+1.
Navigate to StockX (bot-protected — use remote/stealth browsing)
+2. Search for the product and open its product page
+3. Extract: full product name, lowest ask or last sale price (USD), and the style code if shown
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "product": "...",
+  "style_code": "DZ5485-612",
+  "last_sale_usd": 0,
+  "lowest_ask_usd": 0,
+  "error_reasoning": null
+}
+```
+
+- At least one of `last_sale_usd` / `lowest_ask_usd` must be populated
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/verify.mjs b/skills/autobrowse/evals/eval/tasks/stockx-price/verify.mjs
new file mode 100644
index 0000000..962d849
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/verify.mjs
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, norm } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Verifier for the stockx-price task. Live marketplace — invariant checks. This shoe has traded ~$150–$400 for
+// years; a wide band still catches fabricated or wrong-product prices.
+const product = norm(out.product); // norm() is defined in _lib/checks.mjs — presumably lowercases, since the needles below are lowercase; confirm
+const price = [out.last_sale_usd, out.lowest_ask_usd]
+  .map(Number)
+  .find((n) => isFinite(n) && n > 0); // first finite, strictly positive value (last sale takes precedence); undefined if neither qualifies
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("product is Jordan 1", product.includes("jordan 1"), out.product),
+  check("product is Chicago L&F", product.includes("chicago") && (product.includes("lost") || product.includes("found")), out.product), // "chicago" plus either "lost" or "found" is enough
+  checkContains("style code", out.style_code, "DZ5485-612"),
+  check("a price populated", price !== undefined, JSON.stringify({ last_sale: out.last_sale_usd, lowest_ask: out.lowest_ask_usd })),
+  check("price plausible", price !== undefined && price >= 100 && price <= 1000, `got ${price}`), // accepted band is $100–$1000, wider than the trading range noted above
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/meta.json b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/meta.json
new file mode 100644
index 0000000..f05d254
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "search-and-extract",
+  "env": "remote",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "ppubs.uspto.gov is a heavy SPA with its own query syntax; record pages render in an embedded viewer"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/mock-output.json b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/mock-output.json
new file mode 100644
index 0000000..eaf14a2
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/mock-output.json
@@ -0,0 +1,9 @@
+{
+  "success": true,
+  "patent_number": "11000000",
+  "title": "Repositioning wires and methods for repositioning prosthetic heart valve devices within a heart chamber and related systems, devices and methods",
+  "inventors": ["Jason S. Diedering", "Saravana B.
Kumar"],
+  "assignee": "4C Medical Technologies, Inc.",
+  "grant_date": "2021-05-11",
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/task.md b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/task.md
new file mode 100644
index 0000000..9af2026
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/task.md
@@ -0,0 +1,35 @@
+# Task: Look up US Patent 11,000,000
+
+Search the USPTO patent database (or USPTO Patent Public Search at ppubs.uspto.gov) for US patent number 11,000,000 and extract its bibliographic details. Based on the browse.sh `uspto.gov/search-patents` skill definition.
+
+## URL
+
+https://ppubs.uspto.gov/pubwebapp/
+
+## Inputs
+
+- Patent number: 11000000
+
+## Steps
+
+1. Navigate to USPTO Patent Public Search (or another official USPTO search surface)
+2. Search for patent number 11000000
+3. Open the patent record and extract: title, inventors, assignee, grant date
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "patent_number": "11000000",
+  "title": "...",
+  "inventors": ["..."],
+  "assignee": "...",
+  "grant_date": "YYYY-MM-DD",
+  "error_reasoning": null
+}
+```
+
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/verify.mjs b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/verify.mjs
new file mode 100644
index 0000000..e2c57aa
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/verify.mjs
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, norm } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Verifier for the uspto-patent-lookup task. Patents are immutable — exact ground truth. US 11,000,000 B2:
+// "Repositioning wires and methods for repositioning prosthetic heart valve
+// devices within a heart chamber...", 4C Medical Technologies, granted
+// 2021-05-11, inventors incl. Jason S. Diedering, Saravana B. Kumar.
+const inventors = norm(JSON.stringify(out.inventors ?? "")); // serialize the whole value so an array or a plain string can both be substring-matched
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("patent number", String(out.patent_number).replace(/[^0-9]/g, "") === "11000000", JSON.stringify(out.patent_number)), // separators such as commas are ignored; a kind-code suffix like "B2" adds a digit and fails
+  checkContains("title", out.title, "repositioning"),
+  checkContains("title mentions heart valve", out.title, "heart valve"),
+  check("inventor Diedering", inventors.includes("diedering"), inventors.slice(0, 120)), // only this one inventor is required
+  checkContains("assignee", out.assignee, "4C Medical"),
+  check("grant date", String(out.grant_date).startsWith("2021-05-11"), JSON.stringify(out.grant_date)),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/meta.json b/skills/autobrowse/evals/eval/tasks/yelp-reviews/meta.json
new file mode 100644
index 0000000..432da6e
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "C",
+  "category": "bot-protected-extraction",
+  "env": "remote",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "DataDome CAPTCHA wall — needs Browserbase stealth + residential proxies; review dates render as relative strings sometimes; login-walled actions must be avoided"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/mock-output.json b/skills/autobrowse/evals/eval/tasks/yelp-reviews/mock-output.json
new file mode 100644
index 0000000..bb87c6f
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/mock-output.json
@@ -0,0 +1,12 @@
+{
+  "success": true,
+  "name": "Tartine Bakery",
+  "rating": 4.0,
+  "review_count": 8900,
+  "reviews": [
+    { "reviewer": "Maya L.", "rating": 5, "date": "2026-06-01", "text": "The morning bun is still the single best pastry in San Francisco.
Line moved fast on a Tuesday morning and staff were lovely." }, + { "reviewer": "Derek W.", "rating": 3, "date": "2026-05-28", "text": "Great bread, genuinely world class, but the line was 40 minutes and there is nowhere to sit. Get it to go and walk to Dolores Park." }, + { "reviewer": "Priya S.", "rating": 4, "date": "2026-05-25", "text": "Croissant was perfectly laminated and the coffee was solid. Docking a star because they were out of the country bread by 10am." } + ], + "error_reasoning": null +} diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/task.md b/skills/autobrowse/evals/eval/tasks/yelp-reviews/task.md new file mode 100644 index 0000000..24a2796 --- /dev/null +++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/task.md @@ -0,0 +1,38 @@ +# Task: Extract Yelp reviews for Tartine Bakery (San Francisco) + +Extract Tartine Bakery's rating, review count, and its 5 most recent reviews from Yelp. Read-only. Based on the browse.sh `yelp.com/extract-reviews` skill definition (simplified filter surface: sort = newest, limit = 5). + +## URL + +https://www.yelp.com/biz/tartine-bakery-san-francisco + +## Inputs + +- Business: Tartine Bakery, San Francisco +- Sort: newest +- Limit: 5 reviews + +## Steps + +1. Navigate to the business page (DataDome bot protection — use remote/stealth browsing) +2. Extract the overall rating and total review count +3. Sort reviews by newest and extract the top 5: reviewer name, rating, date, full text + +## Output + +Return a JSON object: + +```json +{ + "success": true, + "name": "Tartine Bakery", + "rating": 4.0, + "review_count": 0, + "reviews": [ + { "reviewer": "...", "rating": 5, "date": "YYYY-MM-DD", "text": "..." 
}
+  ],
+  "error_reasoning": null
+}
+```
+
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/verify.mjs b/skills/autobrowse/evals/eval/tasks/yelp-reviews/verify.mjs
new file mode 100644
index 0000000..e37b69e
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/verify.mjs
@@ -0,0 +1,20 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, checkNumber } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Verifier for the yelp-reviews task. Live site — invariants. Tartine has held ~4 stars with >8,000 reviews for
+// years; per-review structure is the real fabrication check.
+const reviews = Array.isArray(out.reviews) ? out.reviews : []; // a missing or non-array value is treated as zero reviews
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkContains("name", out.name, "tartine"),
+  checkNumber("rating band", out.rating, { min: 3.0, max: 5.0 }),
+  checkNumber("review count", out.review_count, { min: 5000, max: 50000 }),
+  check("≥3 reviews", reviews.length >= 3, `got ${reviews.length}`), // NOTE(review): task.md asks for the 5 most recent reviews but only 3 are required here — confirm intended
+  ...reviews.slice(0, 5).map((r, i) => checkNumber(`review[${i}] rating`, r?.rating, { min: 1, max: 5 })),
+  ...reviews.slice(0, 5).map((r, i) => check(`review[${i}] has text`, String(r?.text ?? "").length >= 40, `len=${String(r?.text ?? "").length}`)), // at least 40 characters of text
+  ...reviews.slice(0, 5).map((r, i) => check(`review[${i}] has date`, /\d{4}/.test(String(r?.date ?? "")), JSON.stringify(r?.date))), // any run of four digits counts; a relative string such as "2 days ago" fails
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/meta.json b/skills/autobrowse/evals/eval/tasks/youtube-transcript/meta.json
new file mode 100644
index 0000000..9fd51bb
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "media-extraction",
+  "env": "local",
+  "max_turns": 30,
+  "timeout_min": 20,
+  "gotchas": "transcript button hidden behind '...more' description expander; consent dialogs may appear; player keyboard shortcuts can pause/seek accidentally"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/mock-output.json b/skills/autobrowse/evals/eval/tasks/youtube-transcript/mock-output.json
new file mode 100644
index 0000000..085f1dd
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/mock-output.json
@@ -0,0 +1,12 @@
+{
+  "success": true,
+  "title": "Me at the zoo",
+  "channel": "jawed",
+  "has_transcript": true,
+  "segments": [
+    { "ts": "0:00", "text": "All right, so here we are in front of the elephants" },
+    { "ts": "0:05", "text": "the cool thing about these guys is that they have really really really long trunks" },
+    { "ts": "0:12", "text": "and that's cool" }
+  ],
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/task.md b/skills/autobrowse/evals/eval/tasks/youtube-transcript/task.md
new file mode 100644
index 0000000..d0f723a
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/task.md
@@ -0,0 +1,35 @@
+# Task: Extract the transcript of "Me at the zoo"
+
+Extract the transcript of the first YouTube video ever uploaded. Based on the browse.sh `youtube.com/extract-transcript` skill definition.
+
+## URL
+
+https://www.youtube.com/watch?v=jNQXAC9IVRw
+
+## Inputs
+
+- Video: "Me at the zoo" (video ID jNQXAC9IVRw)
+
+## Steps
+
+1. Navigate to the video page
+2. Find the video title and channel name
+3.
Open the transcript panel (usually under the "...more" description → "Show transcript")
+4. Extract the transcript segments with timestamps
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "title": "...",
+  "channel": "...",
+  "has_transcript": true,
+  "segments": [{ "ts": "0:00", "text": "..." }],
+  "error_reasoning": null
+}
+```
+
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/verify.mjs b/skills/autobrowse/evals/eval/tasks/youtube-transcript/verify.mjs
new file mode 100644
index 0000000..0721a94
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/verify.mjs
@@ -0,0 +1,20 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Verifier for the youtube-transcript task. 19-second 2005 video; content is immutable. Transcript famously mentions
+// the elephants' "really really long trunks".
+const segments = Array.isArray(out.segments) ? out.segments : []; // a missing or non-array value is treated as zero segments
+const fullText = segments.map((s) => s?.text ?? "").join(" "); // join all segment texts so the keyword checks do not depend on how the transcript was split
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkContains("title", out.title, "Me at the zoo"),
+  checkContains("channel", out.channel, "jawed"),
+  check("has transcript", out.has_transcript === true, JSON.stringify(out.has_transcript)),
+  check("≥2 segments", segments.length >= 2, `got ${segments.length}`),
+  checkContains("transcript mentions elephants", fullText, "elephants"),
+  checkContains("transcript mentions trunks", fullText, "trunks"),
+]);
diff --git a/skills/autobrowse/evals/fixtures/checkout/index.html b/skills/autobrowse/evals/fixtures/checkout/index.html
new file mode 100644
index 0000000..9a801c3
--- /dev/null
+++ b/skills/autobrowse/evals/fixtures/checkout/index.html
@@ -0,0 +1,117 @@
+
+
+
+
+Acme Checkout
+
+
+
+

Acme Store — Checkout

+

Cart: Widget Pro × 2 @ $19.99 each

+ +
+

Step 1 — Contact

+ + + + +
+
+ +
+

Step 2 — Shipping

+ + + + + + +
Loading shipping options…
+
+
+ +
+

Step 3 — Review

+
+

Total: $

+ +
+ +
+

Order confirmed 🎉

+

Your confirmation code is

+

Total charged: $

+
+ + + + diff --git a/skills/autobrowse/evals/fixtures/flightdeck/index.html b/skills/autobrowse/evals/fixtures/flightdeck/index.html new file mode 100644 index 0000000..2fc17d4 --- /dev/null +++ b/skills/autobrowse/evals/fixtures/flightdeck/index.html @@ -0,0 +1,82 @@ + + + + +FlightDeck — Search + + + +

FlightDeck

+ + + + + +

Search to see flights.

+
+
+
+
diff --git a/skills/autobrowse/evals/fixtures/serve.mjs b/skills/autobrowse/evals/fixtures/serve.mjs
new file mode 100644
index 0000000..b428300
--- /dev/null
+++ b/skills/autobrowse/evals/fixtures/serve.mjs
@@ -0,0 +1,48 @@
+#!/usr/bin/env node
+// Tiny static server for the Tier A deterministic fixture sites.
+// Usage: node fixtures/serve.mjs [port]   (default 4173)
+//
+// Serves files below this script's directory; a path ending in "/" maps to its
+// index.html. Malformed request URLs get 400; a missing path, a directory, or
+// anything outside ROOT gets 404.
+
+import * as http from "node:http";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const ROOT = path.dirname(fileURLToPath(import.meta.url));
+// ROOT with a trailing separator, so the containment test cannot match a sibling whose name merely starts with ROOT's.
+const ROOT_PREFIX = ROOT.endsWith(path.sep) ? ROOT : ROOT + path.sep;
+const PORT = parseInt(process.argv[2] || "4173", 10);
+
+// Extension → content-type; anything else is served as application/octet-stream.
+const TYPES = { ".html": "text/html", ".js": "text/javascript", ".css": "text/css", ".json": "application/json" };
+
+http
+  .createServer((req, res) => {
+    let urlPath;
+    try {
+      // URL parsing and percent-decoding both throw on malformed input (e.g. a
+      // lone "%"). Uncaught, that exception would take the whole server down.
+      urlPath = decodeURIComponent(new URL(req.url, "http://x").pathname);
+    } catch {
+      res.writeHead(400).end("bad request");
+      return;
+    }
+    if (urlPath.endsWith("/")) urlPath += "index.html";
+    const file = path.join(ROOT, path.normalize(urlPath).replace(/^(\.\.[/\\])+/, ""));
+    // Anything that resolves outside ROOT, does not exist, or is a directory
+    // is reported as not found.
+    if (!file.startsWith(ROOT_PREFIX) || !fs.existsSync(file) || fs.statSync(file).isDirectory()) {
+      res.writeHead(404).end("not found");
+      return;
+    }
+    res.writeHead(200, { "content-type": TYPES[path.extname(file)] || "application/octet-stream" });
+    fs.createReadStream(file)
+      // Headers are already sent, so on a read error just drop the connection
+      // instead of crashing on an unhandled "error" event.
+      .on("error", () => res.destroy())
+      .pipe(res);
+  })
+  .listen(PORT, () => console.error(`fixtures on http://localhost:${PORT}/ (checkout/, flightdeck/)`));
diff --git a/skills/autobrowse/evals/package.json b/skills/autobrowse/evals/package.json
new file mode 100644
index 0000000..667ad81
--- /dev/null
+++ b/skills/autobrowse/evals/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "autobrowse-evals",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "description": "Eval harness for the autobrowse self-improving browser-automation loop: convergence, accuracy, speed, and token cost across models/prompts/architectures.",
+  "scripts": {
+    "fixtures": "node fixtures/serve.mjs",
+    "test:verifiers": "node scripts/test-verifiers.mjs",
+    "matrix": "node eval/run-matrix.mjs",
+    "report": "node eval/report.mjs"
+  },
+  "dependencies": {
+    "@anthropic-ai/sdk": "^0.74.0",
+    "dotenv": "^17.2.3"
+  }
+}
diff --git a/skills/autobrowse/evals/scripts/test-verifiers.mjs b/skills/autobrowse/evals/scripts/test-verifiers.mjs
new file mode 100644
index 0000000..e4eef1d
--- /dev/null
+++ b/skills/autobrowse/evals/scripts/test-verifiers.mjs
@@ -0,0 +1,66 @@
+#!/usr/bin/env node
+// Verifier self-test. For every task: its mock-output.json (a documented
+// known-good output) MUST pass its verifier, and a garbage claimed-success
+// output MUST fail it. Catches both broken verifiers and verifiers an agent
+// could trivially reward-hack with {"success": true}.
+
+import * as fs from "node:fs";
+import * as os from "node:os";
+import * as path from "node:path";
+import { listTasks, TASKS_DIR } from "../eval/config.mjs";
+import { runVerifier } from "../eval/lib/run-verifier.mjs";
+
+const GARBAGE = { success: true, note: "fabricated", value: 42 };
+
+/**
+ * Creates a throwaway run directory holding a `result.json` that wraps
+ * `output` as `{ parsed, raw, parse_error }`.
+ * @param {unknown} output - agent output to expose to the verifier
+ * @returns {string} path of the new temp directory (the caller removes it)
+ */
+function makeRunDir(output) {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "verify-test-"));
+  fs.writeFileSync(path.join(dir, "result.json"), JSON.stringify({ parsed: output, raw: JSON.stringify(output), parse_error: null }));
+  return dir;
+}
+
+/**
+ * Runs `task`'s verifier against `output` in a fresh run directory and always
+ * removes that directory afterwards, so repeated runs do not pile up
+ * `verify-test-*` folders in the OS temp dir.
+ * @param {string} task - task directory name
+ * @param {unknown} output - output to place in result.json
+ * @returns {object} whatever runVerifier returns (`passed` and `reason` are read below)
+ */
+function verifyOutput(task, output) {
+  const dir = makeRunDir(output);
+  try {
+    // The result's `.passed` is read directly by the caller (no await), so the
+    // call is treated as synchronous and cleanup in `finally` cannot race it.
+    return runVerifier(task, dir);
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+}
+
+let failures = 0;
+for (const task of listTasks()) {
+  const mockOutput = JSON.parse(fs.readFileSync(path.join(TASKS_DIR, task, "mock-output.json"), "utf-8"));
+
+  const good = verifyOutput(task, mockOutput);
+  const bad = verifyOutput(task, GARBAGE);
+
+  const goodOk = good.passed === true;
+  const badOk = bad.passed === false;
+  if (!goodOk || !badOk) failures++;
+
+  console.log(
+    `${goodOk && badOk ? "✅" : "❌"} ${task.padEnd(24)} known-good ${goodOk ? "passes" : `FAILS (${good.reason})`}; garbage ${badOk ? "rejected" : "ACCEPTED (verifier is hackable!)"}`
+  );
+}
+
+if (failures) {
+  console.error(`\n${failures} verifier(s) broken`);
+  process.exit(1);
+}
+console.log("\nAll verifiers sound.");