diff --git a/skills/autobrowse/evals/.env.example b/skills/autobrowse/evals/.env.example
new file mode 100644
index 0000000..8943022
--- /dev/null
+++ b/skills/autobrowse/evals/.env.example
@@ -0,0 +1,10 @@
+# Required for real (non-mock) runs — inner agent + outer agent both use it.
+ANTHROPIC_API_KEY=sk-ant-...
+
+# Required for tasks with env=remote (Tier B/C bot-protected sites).
+BROWSERBASE_API_KEY=bb_...
+
+# Path to the autobrowse skill (the directory containing scripts/evaluate.mjs).
+# Defaults to the parent directory (evals/ ships inside the skill); set only
+# to point the harness at a different autobrowse checkout.
+# AUTOBROWSE_DIR=/path/to/skills/skills/autobrowse
diff --git a/skills/autobrowse/evals/.gitignore b/skills/autobrowse/evals/.gitignore
new file mode 100644
index 0000000..7d4d452
--- /dev/null
+++ b/skills/autobrowse/evals/.gitignore
@@ -0,0 +1,4 @@
+node_modules/
+runs/
+.env
+vendor/
diff --git a/skills/autobrowse/evals/README.md b/skills/autobrowse/evals/README.md
new file mode 100644
index 0000000..d43e9a7
--- /dev/null
+++ b/skills/autobrowse/evals/README.md
@@ -0,0 +1,96 @@
+# autobrowse-evals
+
+Eval harness for the [autobrowse](https://github.com/browserbase/skills/tree/main/skills/autobrowse) self-improving browser-automation loop. Measures the four things that matter — **convergence speed**, **accuracy**, **runtime speed**, and **token cost** — and makes them comparable across inner/outer models, prompts, and architectures.
+
+## The four artifacts being evaluated
+
+| Artifact | What it is | Metrics |
+|---|---|---|
+| Single run | One `evaluate.mjs` attempt, empty strategy | accuracy baseline, speed, tokens |
+| Learning loop | evaluate → verify → improve, repeated | convergence speed, cumulative cost |
+| Graduated strategy | frozen best strategy.md run by a fresh agent | **holdout** accuracy/speed/tokens |
+| Codegen script | deterministic playwright/stagehand output | (future: wire `codegen.mjs --verify` in) |
+
+The core design decision: **training and evaluation are separated.** Convergence is measured during the loop; the *result* is measured by freezing the best strategy and running it N fresh times (holdout). And **pass/fail is never self-reported** — every task has a programmatic verifier; the agent's own `success: true` is only used to compute the false-success (reward-hacking) rate.
+
+## Layout
+
+```
+eval/
+  run-matrix.mjs        orchestrator: condition × task × trial → train + holdout
+  outer-agent.mjs       scripted outer loop (one structured-output call per iteration;
+                        outer tokens metered — the interactive loop never records them)
+  report.mjs            aggregates runs/results.jsonl into scorecards
+  conditions/*.json     sweepable variables: inner_model, outer_model, outer_prompt, iters
+  prompts/outer-*.md    outer-prompt variants (default = SKILL.md methodology, lean = ablation)
+  tasks/<task>/         task.md (autobrowse format) + verify.mjs + meta.json + mock-output.json
+fixtures/               self-hosted deterministic sites (Tier A ground truth)
+runs/                   workspaces, traces, results.jsonl (gitignored)
+```
+
+## Benchmark suite (9 tasks, 3 tiers)
+
+Tasks marked ◆ are drawn from the browse.sh prompt library (`prompts/<domain>/<task>.md`).
+
+| Tier | Task | Env | Verification |
+|---|---|---|---|
+| **A — deterministic** | `fixture-checkout` | local | exact confirmation code (shared hash function) + total |
+| | `fixture-flightdeck` | local | exact cheapest-nonstop answer; traps: cheaper 1-stop, cheaper wrong route |
+| | `books-toscrape` | local | exact count/prices/titles (static demo site) |
+| **B — live, stable** | `uspto-patent-lookup` ◆ | remote | patent facts are immutable (US 11,000,000) |
+| | `google-flights` ◆ | local | invariants: nonstop, airline set, price band, internal consistency |
+| | `opentable-availability` ◆ | local | invariants: date/party echoed, slot format, availability consistency |
+| | `youtube-transcript` ◆ | local | immutable content ("Me at the zoo" transcript phrases) |
+| **C — bot-protected** | `stockx-price` ◆ | remote | product identity + price band (PerimeterX) |
+| | `yelp-reviews` ◆ | remote | rating/review-count bands + per-review structure (DataDome) |
+
+Tier A gives model comparisons statistical teeth; Tier B measures real-site competence with invariant checks; Tier C measures infrastructure robustness (report it separately — variance is the site's, not the model's).
+
+**Verifier protocol** (mirrors autobrowse's codegen runner protocol): `node eval/tasks/<task>/verify.mjs --run-dir <traceDir>` → one JSON line `{passed, checks: [{name, ok, detail}], reason}`. Each task's `mock-output.json` is its documented known-good output; `npm run test:verifiers` asserts every verifier passes it and rejects a garbage `{"success": true}` — i.e., verifiers are tested against reward-hacking.
+
+## Setup
+
+```bash
+npm install
+cp .env.example .env        # ANTHROPIC_API_KEY (+ BROWSERBASE_API_KEY for remote tasks)
+npm install -g browse       # the browse CLI used by the inner agent
+# AUTOBROWSE_DIR defaults to the parent dir (this folder ships inside the skill)
+```
+
+## Usage
+
+```bash
+npm run test:verifiers                                  # verifier soundness (no keys needed)
+node eval/run-matrix.mjs --conditions baseline --tasks fixture-checkout --mock   # free pipeline check
+
+# Real runs
+node eval/run-matrix.mjs --conditions pilot --tasks fixture-checkout            # cheap pilot
+node eval/run-matrix.mjs --conditions baseline --tasks all --trials 3           # full baseline
+node eval/run-matrix.mjs --conditions baseline,inner-haiku,inner-opus,outer-sonnet,outer-prompt-lean \
+    --tasks fixture-checkout,fixture-flightdeck,books-toscrape --trials 3       # model/prompt screen on Tier A
+
+npm run report                                          # markdown scorecards
+node eval/report.mjs --json                             # raw aggregates
+```
+
+The fixture server (`npm run fixtures`, port 4173) auto-starts when a selected task needs it.
+
+## Metrics (see report footer for definitions)
+
+- **Convergence:** converged-rate, iters-to-first-verified-pass, regressions, cumulative train cost (inner + outer)
+- **Accuracy:** holdout pass rate (frozen strategy, fresh runs), **false-success rate** (claimed success, verifier failed)
+- **Speed:** holdout wall clock split into browser ms (sum of browse-CLI `duration_ms` in trace.json) vs model ms
+- **Tokens/cost:** per-run tokens, with cost recomputed centrally in `eval/lib/pricing.mjs` from the raw token counts (don't trust evaluate.mjs's stale price table), and **skill value** = how much the learned strategy cheapens a run vs the blind iteration-1 attempt (tests the autobrowse README's "80%+ reduction" claim)
+
+## Experiment design notes
+
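+- **Screen, don't grid.** Vary one axis at a time against `baseline`. Eight conditions ship: `baseline`; the single-axis variants `inner-haiku`, `inner-opus`, `inner-fable-5`, `outer-sonnet`, `outer-fable-5`, and `outer-prompt-lean`; and the `pilot` smoke test. Deep-dive only the interesting 2–3 combos.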
+- **Pair comparisons on the same tasks**; live-site variance makes unpaired suite means meaningless. Tier C reports separately.
+- **Trials:** ≥3 per cell for anything you'll make a decision on. `results.jsonl` is append-only — rerun cells freely, the report aggregates.
+- **Cost calibration:** run `pilot` on one Tier A task first and read `inner_cost_usd`/`outer_cost_usd` from `runs/results.jsonl` before launching a sweep.
+
+## Fidelity caveats / roadmap
+
+- The scripted outer agent sees a curated evidence pack (summary, verifier verdict, failed commands), not the full tool-using trace exploration Claude Code does. A Claude-Agent-SDK outer agent with Read/Grep tools is the natural next architecture variant — and would also let `--browser-trace` evidence (unified-events.jsonl) become a sweepable axis.
+- `codegen.mjs --verify` (deterministic script artifact) isn't wired into the matrix yet; its runner protocol is identical to the verifier protocol here, so it slots in as a fourth phase.
+- The local checkout's `judge.mjs` (A/B strategy judge) and `--supervise` watcher are complementary: the judge compares strategy *versions* by run evidence; this harness compares *conditions* by verified outcomes. `supervised` already lands in evaluate.mjs's meta.json and could become another condition axis.
diff --git a/skills/autobrowse/evals/RESULTS.md b/skills/autobrowse/evals/RESULTS.md
new file mode 100644
index 0000000..732ea74
--- /dev/null
+++ b/skills/autobrowse/evals/RESULTS.md
@@ -0,0 +1,34 @@
+# Eval results — 2026-06-09 (Fable 5 vs Opus 4.8)
+
+First findings from this harness, comparing `claude-fable-5` and `claude-opus-4-8` in both autobrowse roles. ~200 verified runs, ~$220 API spend. Small n (2–3 trials/cell) — directional, not definitive.
+
+## Headline
+
+**Best configuration tested: Sonnet 4.6 as the inner (browsing) agent + Fable 5 as the outer (strategy-writing) agent.** On the OpenTable task it produced the most reliable *and* cheapest converged runs of any cell — beating even Opus-as-browser — because the expensive model's intelligence lands in `strategy.md` once instead of in every run.
+
+## OpenTable 2×2 (Tier B, Akamai-walled, verified+proxied Browserbase sessions)
+
+| Inner ↓ / Outer → | Opus 4.8 writes | Fable 5 writes |
+|---|---|---|
+| **Sonnet 4.6 browses** | 5/6 holdout, $1.40/run, 90s | **6/6 holdout, $0.96/run, 64s** |
+| **Opus 4.8 browses** | 6/6 holdout, $1.20/run, 63s | — |
+| **Fable 5 browses** | 5/6 holdout, $1.66/run, 93s | — |
+
+- **Inner axis:** Opus beat Fable as the browser — same convergence (iter 2–3), half the training cost (~$5.5 vs ~$11/trial), perfect holdout. Fable reasons more per turn; at 2× token pricing that compounds (blind iteration-1 attempts: ~$7 vs ~$3).
+- **Outer axis (same Sonnet inner in both):** Fable-authored strategies were more reliable (6/6 vs 5/6) and made the same agent ~30% faster and cheaper. Qualitatively, Fable's skills encode *mechanisms* — React hydration timing ("`wait load` returns before the widget renders; snapshot shows ~2 refs"), Akamai cookie behavior ("`browse stop` wipes cookies → never stop the session"), broken-command landmines ("`wait selector text=...` ETIMEDOUTs") — where Opus's skills describe symptoms. Same pattern appeared on the Tier A fixtures: Fable was the only outer model to identify a deliberately planted 900ms delayed-render trap and prescribe the exact fix.
+- Fable's outer calls cost $0.13 vs Opus's $0.05 per improvement — negligible in absolute terms.
+
+## Tier A fixtures (deterministic local sites)
+
+- All models 100% on the easy task; differentiation is pure cost (Sonnet $0.16/run, Opus $0.60, Fable $0.97). On tasks the cheap model already does, frontier inner agents are pure overhead.
+- On the trap-laden checkout fixture, inner reliability ranked Fable (6/6) > Opus (5/6) > Sonnet (4/6) — monotonic with price. This did **not** generalize to OpenTable, where Opus matched/beat Fable as inner.
+
+## Other observations
+
+- **Zero false-successes in ~200 runs** — no model claimed `success:true` against a failing verifier. Failures were honest (turn-budget exhaustion, no final JSON).
+- **Live-site drift is real:** Akamai blocked every iteration-1 attempt in a morning round and none in an evening round. Only within-round (concurrent, paired) comparisons are valid on live sites.
+- One Fable-cell strategy explicitly reasoned about the grader ("the verifier requires success:true — persist"). Benign here (persistence, not fabrication), but a preview of strategies evolving against the verifier's letter on harder tasks.
+
+## Recommended default
+
+`inner_model: claude-sonnet-4-6`, `outer_model: claude-fable-5`, escalating the inner to Opus only when a task fails to converge because the inner agent can't execute good instructions. Training cost per new skill: ~$1–2; converged verified runs: ~$1.
diff --git a/skills/autobrowse/evals/eval/conditions/baseline.json b/skills/autobrowse/evals/eval/conditions/baseline.json
new file mode 100644
index 0000000..54ee28e
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/baseline.json
@@ -0,0 +1,9 @@
+{
+  "id": "baseline",
+  "notes": "Default autobrowse setup: Sonnet inner agent (evaluate.mjs default), Opus outer agent, full SKILL.md-style outer prompt.",
+  "inner_model": "claude-sonnet-4-6",
+  "outer_model": "claude-opus-4-8",
+  "outer_prompt": "outer-default",
+  "max_iters": 5,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/inner-fable-5.json b/skills/autobrowse/evals/eval/conditions/inner-fable-5.json
new file mode 100644
index 0000000..f8e8e27
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/inner-fable-5.json
@@ -0,0 +1,9 @@
+{
+  "id": "inner-fable-5",
+  "notes": "Fable 5 as the INNER browsing agent (vs inner-opus / baseline). Measures raw browsing competence: iteration-1 pass rate, turns, holdout reliability. 2x Opus pricing — watch cost_to_converge.",
+  "inner_model": "claude-fable-5",
+  "outer_model": "claude-opus-4-8",
+  "outer_prompt": "outer-default",
+  "max_iters": 5,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/inner-haiku.json b/skills/autobrowse/evals/eval/conditions/inner-haiku.json
new file mode 100644
index 0000000..1f2b701
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/inner-haiku.json
@@ -0,0 +1,9 @@
+{
+  "id": "inner-haiku",
+  "notes": "Cheap inner agent hypothesis: a smart outer agent distills intelligence into strategy.md, so a Haiku inner agent should converge to the same place at a fraction of the cost.",
+  "inner_model": "claude-haiku-4-5",
+  "outer_model": "claude-opus-4-8",
+  "outer_prompt": "outer-default",
+  "max_iters": 7,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/inner-opus.json b/skills/autobrowse/evals/eval/conditions/inner-opus.json
new file mode 100644
index 0000000..48657e9
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/inner-opus.json
@@ -0,0 +1,9 @@
+{
+  "id": "inner-opus",
+  "notes": "Expensive inner agent: does a frontier inner model converge in fewer iterations, and does that offset its per-run cost?",
+  "inner_model": "claude-opus-4-8",
+  "outer_model": "claude-opus-4-8",
+  "outer_prompt": "outer-default",
+  "max_iters": 5,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/outer-fable-5.json b/skills/autobrowse/evals/eval/conditions/outer-fable-5.json
new file mode 100644
index 0000000..9f742b7
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/outer-fable-5.json
@@ -0,0 +1,9 @@
+{
+  "id": "outer-fable-5",
+  "notes": "Fable 5 as the OUTER strategy-improver (vs baseline's Opus 4.8). Measures hypothesis-formation quality: convergence speed, regressions, holdout pass rate of the strategies it writes. Outer calls are small, so the 2x pricing barely matters here.",
+  "inner_model": "claude-sonnet-4-6",
+  "outer_model": "claude-fable-5",
+  "outer_prompt": "outer-default",
+  "max_iters": 5,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/outer-prompt-lean.json b/skills/autobrowse/evals/eval/conditions/outer-prompt-lean.json
new file mode 100644
index 0000000..b3736a3
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/outer-prompt-lean.json
@@ -0,0 +1,9 @@
+{
+  "id": "outer-prompt-lean",
+  "notes": "Prompt ablation: strip the SKILL.md-style guidance (one-hypothesis rule, build-on-wins, evidence grounding) from the outer prompt. Measures how much of convergence quality comes from the methodology vs the model.",
+  "inner_model": "claude-sonnet-4-6",
+  "outer_model": "claude-opus-4-8",
+  "outer_prompt": "outer-lean",
+  "max_iters": 5,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/outer-sonnet.json b/skills/autobrowse/evals/eval/conditions/outer-sonnet.json
new file mode 100644
index 0000000..98ae7d9
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/outer-sonnet.json
@@ -0,0 +1,9 @@
+{
+  "id": "outer-sonnet",
+  "notes": "Cheaper outer agent: is Opus-level hypothesis formation actually load-bearing, or can Sonnet read traces and improve strategies just as well?",
+  "inner_model": "claude-sonnet-4-6",
+  "outer_model": "claude-sonnet-4-6",
+  "outer_prompt": "outer-default",
+  "max_iters": 5,
+  "holdout_runs": 3
+}
diff --git a/skills/autobrowse/evals/eval/conditions/pilot.json b/skills/autobrowse/evals/eval/conditions/pilot.json
new file mode 100644
index 0000000..fc5280b
--- /dev/null
+++ b/skills/autobrowse/evals/eval/conditions/pilot.json
@@ -0,0 +1,9 @@
+{
+  "id": "pilot",
+  "notes": "Cheap smoke-test condition for validating the real (non-mock) pipeline: 2 training iterations max, 1 holdout run.",
+  "inner_model": "claude-sonnet-4-6",
+  "outer_model": "claude-opus-4-8",
+  "outer_prompt": "outer-default",
+  "max_iters": 2,
+  "holdout_runs": 1
+}
diff --git a/skills/autobrowse/evals/eval/config.mjs b/skills/autobrowse/evals/eval/config.mjs
new file mode 100644
index 0000000..0dbd72c
--- /dev/null
+++ b/skills/autobrowse/evals/eval/config.mjs
@@ -0,0 +1,61 @@
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url)); // directory containing this module
+
+export const ROOT = path.resolve(__dirname, ".."); // parent of this module's directory
+export const EVAL_DIR = path.join(ROOT, "eval");
+export const TASKS_DIR = path.join(EVAL_DIR, "tasks");
+export const CONDITIONS_DIR = path.join(EVAL_DIR, "conditions");
+export const PROMPTS_DIR = path.join(EVAL_DIR, "prompts");
+export const FIXTURES_DIR = path.join(ROOT, "fixtures");
+export const RUNS_DIR = path.join(ROOT, "runs");
+export const RESULTS_FILE = path.join(RUNS_DIR, "results.jsonl");
+
+const AUTOBROWSE_CANDIDATES = [ // probed in this order by resolveAutobrowseDir(); first match wins
+  process.env.AUTOBROWSE_DIR,
+  path.resolve(ROOT, ".."), // evals/ ships inside the autobrowse skill
+  path.join(ROOT, "vendor", "skills", "skills", "autobrowse"),
+].filter(Boolean); // drops the AUTOBROWSE_DIR slot when the variable is unset or empty
+
+export function resolveAutobrowseDir() {
+  for (const dir of AUTOBROWSE_CANDIDATES) {
+    if (fs.existsSync(path.join(dir, "scripts", "evaluate.mjs"))) return dir;
+  }
+  throw new Error(
+    "autobrowse skill not found. Set AUTOBROWSE_DIR to the directory containing scripts/evaluate.mjs " +
+    "(e.g. a checkout of github.com/browserbase/skills at skills/autobrowse)."
+  );
+}
+
+export function loadCondition(idOrPath) {
+  const p = idOrPath.endsWith(".json")
+    ? path.resolve(idOrPath)
+    : path.join(CONDITIONS_DIR, `${idOrPath}.json`);
+  const cond = JSON.parse(fs.readFileSync(p, "utf-8"));
+  // Defaults
+  return {
+    max_iters: 5,
+    holdout_runs: 3,
+    converge_window: 3,
+    converge_passes: 2,
+    outer_prompt: "outer-default",
+    browser_trace: false,
+    ...cond,
+  };
+}
+
+export function loadTaskMeta(task) {
+  const p = path.join(TASKS_DIR, task, "meta.json");
+  const meta = JSON.parse(fs.readFileSync(p, "utf-8"));
+  return { env: "local", max_turns: 30, timeout_min: 20, ...meta, task };
+}
+
+export function listTasks() {
+  return fs
+    .readdirSync(TASKS_DIR, { withFileTypes: true })
+    .filter((d) => d.isDirectory() && !d.name.startsWith("_"))
+    .map((d) => d.name)
+    .sort();
+}
diff --git a/skills/autobrowse/evals/eval/lib/extract-output.mjs b/skills/autobrowse/evals/eval/lib/extract-output.mjs
new file mode 100644
index 0000000..fcf604c
--- /dev/null
+++ b/skills/autobrowse/evals/eval/lib/extract-output.mjs
@@ -0,0 +1,47 @@
+import * as fs from "node:fs";
+import * as path from "node:path";
+
+// Pull the last fenced ```json block (or last bare {...}) from text.
+// Mirrors extractFinalJson in the newer evaluate.mjs.
+export function extractJsonFromText(text) {
+  if (!text) return null;
+  const fences = [...text.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi)];
+  let candidate = fences.length ? fences[fences.length - 1][1].trim() : null;
+  if (!candidate) {
+    const first = text.indexOf("{");
+    const last = text.lastIndexOf("}");
+    if (first !== -1 && last > first) candidate = text.slice(first, last + 1);
+  }
+  if (!candidate) return null;
+  try {
+    return JSON.parse(candidate);
+  } catch {
+    return null;
+  }
+}
+
+// Load the inner agent's final structured output from a run's trace dir.
+// Newer evaluate.mjs writes result.json ({parsed, raw, parse_error});
+// fall back to parsing summary.md's "Agent Final Output" section for the
+// upstream version that doesn't.
+export function loadRunOutput(runDir) {
+  const resultPath = path.join(runDir, "result.json");
+  if (fs.existsSync(resultPath)) {
+    try {
+      const r = JSON.parse(fs.readFileSync(resultPath, "utf-8"));
+      if (r && "parsed" in r) return r.parsed;
+      return r;
+    } catch {
+      /* fall through */
+    }
+  }
+  const summaryPath = path.join(runDir, "summary.md");
+  if (fs.existsSync(summaryPath)) {
+    const summary = fs.readFileSync(summaryPath, "utf-8");
+    const idx = summary.indexOf("## Agent Final Output");
+    const tail = idx === -1 ? summary : summary.slice(idx);
+    const parsed = extractJsonFromText(tail);
+    if (parsed) return parsed;
+  }
+  return null;
+}
diff --git a/skills/autobrowse/evals/eval/lib/pricing.mjs b/skills/autobrowse/evals/eval/lib/pricing.mjs
new file mode 100644
index 0000000..f1bc8a9
--- /dev/null
+++ b/skills/autobrowse/evals/eval/lib/pricing.mjs
@@ -0,0 +1,19 @@
+// Central pricing table — single source of truth for the whole harness.
+// USD per 1M tokens [input, output]. Do not trust per-script tables elsewhere
+// (evaluate.mjs has its own stale copy; we recompute from raw token counts).
+const PRICING = [
+  ["claude-fable-5", [10, 50]],
+  ["claude-opus-4-8", [5, 25]],
+  ["claude-opus-4-7", [5, 25]],
+  ["claude-opus-4-6", [5, 25]],
+  ["claude-opus-4-5", [5, 25]],
+  ["claude-sonnet-4-6", [3, 15]],
+  ["claude-sonnet-4-5", [3, 15]],
+  ["claude-haiku-4-5", [1, 5]],
+];
+
+export function costUsd(model, tokensIn, tokensOut) {
+  const entry = PRICING.find(([prefix]) => model?.startsWith(prefix));
+  const [inRate, outRate] = entry ? entry[1] : [3, 15];
+  return (tokensIn * inRate + tokensOut * outRate) / 1_000_000;
+}
diff --git a/skills/autobrowse/evals/eval/lib/results.mjs b/skills/autobrowse/evals/eval/lib/results.mjs
new file mode 100644
index 0000000..1deb4b6
--- /dev/null
+++ b/skills/autobrowse/evals/eval/lib/results.mjs
@@ -0,0 +1,29 @@
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { RESULTS_FILE } from "../config.mjs";
+
+// One results.jsonl record per inner-agent run (training iteration, holdout
+// run) — append-only, everything downstream is a query over this file.
+//
+// Schema (all rows):
+//   ts, condition_id, task, tier, trial, phase: "train"|"holdout", iter,
+//   run_id, env, inner_model, outer_model,
+//   verified_pass, claimed_success, false_success, verifier_reason,
+//   status, stop_reason, turns, duration_sec, browser_ms, model_ms,
+//   tool_calls, tool_errors, tokens_in, tokens_out, inner_cost_usd,
+//   outer_tokens_in, outer_tokens_out, outer_cost_usd, hypothesis,
+//   converged_at (train rows on the converging iteration), mock
+
+export function appendResult(record, file = RESULTS_FILE) {
+  fs.mkdirSync(path.dirname(file), { recursive: true });
+  fs.appendFileSync(file, JSON.stringify({ ts: new Date().toISOString(), ...record }) + "\n");
+}
+
+export function readResults(file = RESULTS_FILE) {
+  if (!fs.existsSync(file)) return [];
+  return fs
+    .readFileSync(file, "utf-8")
+    .split("\n")
+    .filter(Boolean)
+    .map((line) => JSON.parse(line));
+}
diff --git a/skills/autobrowse/evals/eval/lib/run-inner.mjs b/skills/autobrowse/evals/eval/lib/run-inner.mjs
new file mode 100644
index 0000000..5f21a42
--- /dev/null
+++ b/skills/autobrowse/evals/eval/lib/run-inner.mjs
@@ -0,0 +1,203 @@
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { spawnSync } from "node:child_process";
+import { resolveAutobrowseDir, TASKS_DIR } from "../config.mjs";
+
+// Run isolation: kill any existing browse daemon so a stale session (e.g. a
+// leftover REMOTE Browserbase session that can't reach localhost fixtures)
+// doesn't poison the run. Active sessions don't switch local/remote on their
+// own — the pilot run failed exactly this way.
+function browseStop() {
+  try {
+    spawnSync("browse", ["stop"], { encoding: "utf-8", timeout: 30_000 });
+  } catch {
+    /* no daemon running is fine */
+  }
+}
+
+function createCloudSession() {
+  const created = spawnSync(
+    "browse",
+    ["cloud", "sessions", "create", "--keep-alive", "--verified", "--proxies"],
+    { encoding: "utf-8", timeout: 120_000 }
+  );
+  const id = JSON.parse(created.stdout).id;
+  const got = spawnSync("browse", ["cloud", "sessions", "get", id], { encoding: "utf-8", timeout: 60_000 });
+  return { id, connectUrl: JSON.parse(got.stdout).connectUrl };
+}
+
+function releaseCloudSession(id) {
+  try {
+    spawnSync("browse", ["cloud", "sessions", "update", id, "--status", "REQUEST_RELEASE"], {
+      encoding: "utf-8",
+      timeout: 60_000,
+    });
+  } catch {
+    /* keep-alive sessions also expire server-side eventually */
+  }
+}
+
+// Mode-default shim: on this machine `browse open` without a flag defaults to
+// REMOTE (Browserbase creds present), so an inner agent that drops --local
+// silently gets a cloud browser that can't reach localhost fixtures. The shim
+// appends the task's required mode flag to `browse open` ONLY when the agent
+// passed neither --local nor --remote — explicit agent choices still win.
+// Same spirit as evaluate.mjs's own --cdp/--session arg rewriting.
+function makeBrowseShim(workspace, env) {
+  const realBrowse = spawnSync("which", ["browse"], { encoding: "utf-8" }).stdout.trim();
+  if (!realBrowse) return null;
+  const binDir = path.join(workspace, ".bin");
+  fs.mkdirSync(binDir, { recursive: true });
+  const flag = env === "remote" ? "--remote" : "--local";
+  const shim = `#!/bin/sh
+REAL="${realBrowse}"
+if [ "$1" = "open" ]; then
+  for a in "$@"; do
+    if [ "$a" = "--local" ] || [ "$a" = "--remote" ]; then exec "$REAL" "$@"; fi
+  done
+  exec "$REAL" "$@" ${flag}
+fi
+exec "$REAL" "$@"
+`;
+  const shimPath = path.join(binDir, "browse");
+  fs.writeFileSync(shimPath, shim, { mode: 0o755 });
+  return binDir;
+}
+
+// Run one inner-agent attempt (evaluate.mjs) and return its structured
+// result: {status, stop_reason, duration_sec, turns, tokens_in, tokens_out,
+// trace_dir, ...}. When logFile is set, stderr + stdout are saved there afterwards.
+export function runInner({ task, workspace, env, model, maxTurns, timeoutMin, mock, iter, logFile }) {
+  if (mock) return runInnerMock({ task, workspace, iter }); // mock path: fabricated run, see runInnerMock
+
+  const autobrowseDir = resolveAutobrowseDir();
+  let cloud = null; // {id, connectUrl} once a remote session has been created
+  let shimDir = null; // directory prepended to PATH for non-remote runs (null = no shim)
+
+  if (env === "remote") {
+    // Remote isolation + bot-protection: pre-create a verified+proxied
+    // Browserbase session and hand its connectUrl to evaluate.mjs, which
+    // rewrites every inner browse call to attach via --cdp with a
+    // connectUrl-hashed daemon session name. Concurrent runs never collide,
+    // and plain `browse open --remote` (which can't request --verified /
+    // --proxies and gets Akamai-walled on e.g. OpenTable) is bypassed.
+    try {
+      cloud = createCloudSession();
+    } catch (err) {
+      return {
+        status: "harness_error",
+        stop_reason: `cloud session create failed: ${err.message}`,
+        duration_sec: null, turns: null, tokens_in: 0, tokens_out: 0, trace_dir: null,
+      };
+    }
+  } else {
+    browseStop();
+    shimDir = makeBrowseShim(workspace, env);
+    // Pre-warm the local daemon: Chrome cold-start can exceed evaluate.mjs's
+    // 30s exec timeout, which kills the agent's first `browse open`
+    // mid-handshake and strands the session (~20 wasted turns recovering).
+    spawnSync("browse", ["open", "about:blank", "--local", "--timeout", "90000"], {
+      encoding: "utf-8",
+      timeout: 120_000,
+    });
+  }
+
+  const args = [
+    path.join(autobrowseDir, "scripts", "evaluate.mjs"),
+    "--task", task,
+    "--workspace", workspace,
+    "--env", env,
+    "--model", model,
+    ...(cloud ? ["--connect-url", cloud.connectUrl] : []), // only passed for remote runs
+  ];
+  let res;
+  try {
+    res = spawnSync("node", args, {
+      encoding: "utf-8",
+      timeout: (timeoutMin ?? 20) * 60 * 1000, // minutes → ms; defaults to 20 minutes
+      maxBuffer: 32 * 1024 * 1024,
+      env: {
+        ...process.env,
+        MAX_TURNS: String(maxTurns ?? 30), // turn budget is handed over via the environment
+        ...(shimDir ? { PATH: `${shimDir}:${process.env.PATH}` } : {}), // shim dir first so it shadows the real browse
+      },
+    });
+  } finally {
+    if (cloud) releaseCloudSession(cloud.id); // runs even if spawnSync throws
+  }
+
+  if (logFile) {
+    fs.mkdirSync(path.dirname(logFile), { recursive: true });
+    fs.writeFileSync(logFile, (res.stderr || "") + "\n--- stdout ---\n" + (res.stdout || ""));
+  }
+
+  // evaluate.mjs prints exactly one JSON line on stdout (diagnostics → stderr).
+  const lines = (res.stdout || "").trim().split("\n").filter(Boolean);
+  for (let i = lines.length - 1; i >= 0; i--) { // scan from the last line backwards
+    try {
+      const parsed = JSON.parse(lines[i]);
+      if (parsed && parsed.trace_dir) return parsed; // only a line carrying trace_dir counts as the result
+    } catch {
+      /* keep scanning */
+    }
+  }
+  return { // no result line found: report a harness-level failure instead
+    status: "harness_error",
+    stop_reason: res.error ? String(res.error) : `exit=${res.status}`, // spawn-level error when present, else the exit code
+    duration_sec: null, turns: null, tokens_in: 0, tokens_out: 0, trace_dir: null,
+  };
+}
+
+// ── Mock mode ───────────────────────────────────────────────────────
+// Fabricates a plausible run without browse/Anthropic. Behavior: the run
+// passes iff strategy.md contains the marker "MOCK-FIX" (which the mock
+// outer agent adds on its second improvement). Failing runs CLAIM success
+// with garbage output, deliberately exercising the false-success metric.
+function runInnerMock({ task, workspace, iter }) { // NOTE(review): `iter` is accepted but not read in this body
+  const tracesDir = path.join(workspace, "traces", task);
+  fs.mkdirSync(tracesDir, { recursive: true });
+  const runNumber = fs.readdirSync(tracesDir).filter((d) => d.startsWith("run-")).length + 1; // next id = existing run-* entries + 1
+  const runId = `run-${String(runNumber).padStart(3, "0")}`;
+  const traceDir = path.join(tracesDir, runId);
+  fs.mkdirSync(traceDir, { recursive: true });
+
+  const strategyFile = path.join(workspace, "tasks", task, "strategy.md");
+  fs.mkdirSync(path.dirname(strategyFile), { recursive: true });
+  if (!fs.existsSync(strategyFile)) fs.writeFileSync(strategyFile, `# ${task} Navigation Skill\n`); // seed a stub strategy on first use
+  const strategy = fs.readFileSync(strategyFile, "utf-8");
+  const passes = strategy.includes("MOCK-FIX");
+
+  let output;
+  if (passes) {
+    const mockOutputPath = path.join(TASKS_DIR, task, "mock-output.json"); // the task's stored mock output
+    output = JSON.parse(fs.readFileSync(mockOutputPath, "utf-8"));
+  } else {
+    output = { success: true, note: "fabricated-by-mock-failure", value: 42 }; // claims success with garbage on purpose
+  }
+
+  const turns = passes ? 9 : 24;
+  const trace = [];
+  for (let t = 1; t <= Math.min(turns, 6); t++) { // trace.json holds at most 6 turns regardless of the reported count
+    trace.push({ turn: t, role: "assistant", tool_name: "execute", tool_input: { command: "browse snapshot" } });
+    trace.push({ turn: t, role: "tool_result", command: "browse snapshot", output: "[0-1] mock", error: !passes && t === 4, duration_ms: 800 + t * 120 }); // failing runs get one tool error, on turn 4
+  }
+  fs.writeFileSync(path.join(traceDir, "trace.json"), JSON.stringify(trace, null, 2));
+  fs.writeFileSync(path.join(traceDir, "result.json"), JSON.stringify({ parsed: output, raw: JSON.stringify(output), parse_error: null }, null, 2));
+  fs.writeFileSync(
+    path.join(traceDir, "summary.md"),
+    `# ${task} — ${runId} (MOCK)\n\n**Status:** ${passes ? "completed" : "max_turns"}\n\n## Agent Final Output\n\n\`\`\`json\n${JSON.stringify(output, null, 2)}\n\`\`\`\n`
+  );
+
+  const tokensIn = passes ? 40_000 : 140_000; // fixed figures: failing runs are reported as costlier
+  const tokensOut = passes ? 2_000 : 7_000;
+  return { // same field names runInner documents for a real run, plus mock: true
+    task, run: runId,
+    status: passes ? "completed" : "max_turns",
+    stop_reason: passes ? "end_turn" : "max_turns",
+    duration_sec: passes ? 45.0 : 210.0,
+    turns,
+    tokens_in: tokensIn, tokens_out: tokensOut,
+    trace_dir: traceDir,
+    mock: true,
+  };
+}
diff --git a/skills/autobrowse/evals/eval/lib/run-verifier.mjs b/skills/autobrowse/evals/eval/lib/run-verifier.mjs
new file mode 100644
index 0000000..ca9a9df
--- /dev/null
+++ b/skills/autobrowse/evals/eval/lib/run-verifier.mjs
@@ -0,0 +1,30 @@
+import * as path from "node:path";
+import { spawnSync } from "node:child_process";
+import { TASKS_DIR } from "../config.mjs";
+
+// Verifier protocol (mirrors the codegen runner protocol in autobrowse):
+//   node eval/tasks/<task>/verify.mjs --run-dir <traceDir>
+// prints exactly one JSON line: {passed: bool, checks: [{name, ok, detail}], reason}
+//
+// runVerifier(task, runDir) spawns that script synchronously and returns the
+// last stdout line that parses as JSON with a boolean `passed`. When no such
+// line exists it returns a failing verdict flagged `verifier_error: true`
+// whose reason carries the exit status, the kill signal and spawn error (if
+// any), and the first 300 chars of stderr.
+export function runVerifier(task, runDir) {
+  const verifier = path.join(TASKS_DIR, task, "verify.mjs");
+  const res = spawnSync("node", [verifier, "--run-dir", runDir], {
+    encoding: "utf-8",
+    timeout: 5 * 60 * 1000, // some verifiers re-check live state
+    maxBuffer: 8 * 1024 * 1024,
+  });
+  // Scan bottom-up so log lines printed before the verdict are ignored.
+  const lines = (res.stdout || "").trim().split("\n").filter(Boolean);
+  for (let i = lines.length - 1; i >= 0; i--) {
+    try {
+      const parsed = JSON.parse(lines[i]);
+      if (parsed && typeof parsed.passed === "boolean") return parsed;
+    } catch {
+      /* keep scanning */
+    }
+  }
+  // A timeout, launch failure or maxBuffer overflow leaves res.status null and
+  // is only visible through res.signal / res.error, so include both.
+  const details = [`exit=${res.status}`];
+  if (res.signal) details.push(`signal=${res.signal}`);
+  if (res.error) details.push(`error=${res.error.message}`);
+  details.push(`stderr=${(res.stderr || "").slice(0, 300)}`);
+  return {
+    passed: false,
+    checks: [],
+    reason: `verifier did not emit a {passed:boolean} JSON line; ${details.join(" ")}`,
+    verifier_error: true,
+  };
+}
diff --git a/skills/autobrowse/evals/eval/lib/trace-stats.mjs b/skills/autobrowse/evals/eval/lib/trace-stats.mjs
new file mode 100644
index 0000000..0ff6881
--- /dev/null
+++ b/skills/autobrowse/evals/eval/lib/trace-stats.mjs
@@ -0,0 +1,26 @@
+import * as fs from "node:fs";
+import * as path from "node:path";
+
+// traceStats(runDir, durationSec): partition a run's wall-clock time.
+// browser_ms is the total of duration_ms over trace.json's tool_result
+// entries; model_ms is what remains of durationSec (never below 0).
+// Also counts tool_result entries and how many of them carry an error flag.
+// A missing trace returns the defaults; an unreadable one keeps the nulls.
+export function traceStats(runDir, durationSec) {
+  const stats = { browser_ms: null, model_ms: null, tool_errors: 0, tool_calls: 0 };
+  const traceFile = path.join(runDir, "trace.json");
+  if (!fs.existsSync(traceFile)) return stats;
+  try {
+    const entries = JSON.parse(fs.readFileSync(traceFile, "utf-8"));
+    let elapsed = 0;
+    for (const item of entries) {
+      if (item.role !== "tool_result") continue;
+      stats.tool_calls += 1;
+      elapsed += item.duration_ms || 0;
+      if (item.error) stats.tool_errors += 1;
+    }
+    stats.browser_ms = elapsed;
+    if (durationSec != null) {
+      const remainder = Math.round(durationSec * 1000 - elapsed);
+      stats.model_ms = Math.max(0, remainder);
+    }
+  } catch {
+    /* leave nulls */
+  }
+  return stats;
+}
diff --git a/skills/autobrowse/evals/eval/outer-agent.mjs b/skills/autobrowse/evals/eval/outer-agent.mjs
new file mode 100644
index 0000000..987286d
--- /dev/null
+++ b/skills/autobrowse/evals/eval/outer-agent.mjs
@@ -0,0 +1,122 @@
+import "dotenv/config";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import Anthropic from "@anthropic-ai/sdk";
+import { PROMPTS_DIR } from "./config.mjs";
+import { costUsd } from "./lib/pricing.mjs";
+
+// Scripted stand-in for the interactive Claude Code outer loop. One
+// structured-output call per iteration: evidence in → {diagnosis, hypothesis,
+// new_strategy} out. This is what makes the outer model and outer prompt
+// sweepable eval variables, and what lets us meter outer-agent tokens (the
+// interactive loop never records them).
+//
+// Fidelity note: the real outer loop can drill into trace.json and
+// screenshots tool-by-tool. We approximate with a curated evidence pack
+// (summary, verifier verdict, error lines). A Claude-Agent-SDK-driven outer
+// agent with Read/Grep tools is the natural follow-up architecture variant.
+
+// JSON Schema for the outer agent's reply. improveStrategy passes it as the
+// structured-output format of its request and reads all three fields
+// (diagnosis, hypothesis, new_strategy) from the parsed response.
+const STRATEGY_SCHEMA = {
+  type: "object",
+  properties: {
+    diagnosis: {
+      type: "string",
+      description: "What went wrong (or what is fragile), citing specific turns/errors from the evidence.",
+    },
+    hypothesis: {
+      type: "string",
+      description: "The ONE change being tested this iteration and why it should fix the diagnosis.",
+    },
+    new_strategy: {
+      type: "string",
+      description: "The complete new strategy.md file content.",
+    },
+  },
+  required: ["diagnosis", "hypothesis", "new_strategy"],
+  additionalProperties: false,
+};
+
+// Cap `s` at `n` characters, appending a marker that says how many were cut.
+// Falsy input (null, undefined, "") comes back as "".
+const clip = (s, n) => {
+  if (!s) return "";
+  if (!(s.length > n)) return s;
+  const dropped = s.length - n;
+  return s.slice(0, n) + `\n...[clipped ${dropped} chars]`;
+};
+
+// Summarize a run's failed tool calls for the outer agent's evidence pack.
+// Reads <traceDir>/trace.json and returns at most `max` strings (the most
+// recent ones), each "turn N: <command> → <first 200 chars of output>".
+// Returns [] when the trace is missing, unparseable, not an array, or when
+// `max` is not a positive number.
+function collectErrorLines(traceDir, max = 15) {
+  // slice(-0) is slice(0) and would return everything, so handle the cap here.
+  if (!(max > 0)) return [];
+  try {
+    const trace = JSON.parse(fs.readFileSync(path.join(traceDir, "trace.json"), "utf-8"));
+    return trace
+      .filter((e) => e.role === "tool_result" && e.error)
+      .slice(-max) // cap first so only the kept entries get formatted
+      .map((e) => `turn ${e.turn}: ${e.command} → ${String(e.output).slice(0, 200)}`);
+  } catch {
+    return [];
+  }
+}
+
+// Anthropic client, created on the first non-mock improveStrategy call and reused afterwards.
+let client = null;
+
+// Produce the next strategy.md from one run's evidence.
+//   model, promptName  — outer model id, and the prompt file
+//                        <PROMPTS_DIR>/<promptName>.md used as the system prompt
+//   taskMd, strategyMd — task definition and current strategy text (clipped before sending)
+//   runResult          — inner-run record; trace_dir, status, stop_reason, turns
+//                        and duration_sec are read here
+//   verifierResult     — verifier verdict, embedded in the message as JSON
+//   mock               — when truthy, delegates to improveStrategyMock (no API call)
+//   iter               — only forwarded to the mock
+// Resolves to {diagnosis, hypothesis, newStrategy, tokens_in, tokens_out, cost_usd}.
+// Throws when the reply text is not valid JSON; prompt-file read errors and
+// API errors are not caught here and propagate to the caller.
+export async function improveStrategy({ model, promptName, taskMd, strategyMd, runResult, verifierResult, mock, iter }) {
+  if (mock) return improveStrategyMock({ strategyMd, iter });
+
+  client ??= new Anthropic();
+  const systemPrompt = fs.readFileSync(path.join(PROMPTS_DIR, `${promptName}.md`), "utf-8");
+
+  // Evidence pack: run summary (optional) plus the failed tool calls from the trace.
+  const traceDir = runResult.trace_dir;
+  let summary = "";
+  try {
+    summary = fs.readFileSync(path.join(traceDir, "summary.md"), "utf-8");
+  } catch { /* missing summary is survivable */ }
+  const errors = collectErrorLines(traceDir);
+
+  // Single user message; each free-text section is clipped to a fixed character budget.
+  const userMessage = [
+    "# Task definition (task.md)\n", clip(taskMd, 4_000),
+    "\n\n# Current strategy.md\n", clip(strategyMd, 8_000),
+    "\n\n# Run evidence\n",
+    `Status: ${runResult.status} (${runResult.stop_reason}) | Turns: ${runResult.turns} | Duration: ${runResult.duration_sec}s\n`,
+    "\n## Verifier verdict (ground truth)\n```json\n", JSON.stringify(verifierResult, null, 2), "\n```\n",
+    errors.length ? `\n## Failed commands\n${errors.join("\n")}\n` : "",
+    "\n## Run summary (decision log + final output)\n", clip(summary, 14_000),
+  ].join("");
+
+  const response = await client.messages.create({
+    model,
+    max_tokens: 16000,
+    thinking: { type: "adaptive" },
+    system: systemPrompt,
+    messages: [{ role: "user", content: userMessage }],
+    output_config: { format: { type: "json_schema", schema: STRATEGY_SCHEMA } },
+  });
+
+  // Only text blocks are joined; any other content block types are ignored.
+  const text = response.content.filter((b) => b.type === "text").map((b) => b.text).join("");
+  let parsed;
+  try {
+    parsed = JSON.parse(text);
+  } catch (err) {
+    throw new Error(`outer agent returned unparseable output: ${err.message}: ${text.slice(0, 200)}`);
+  }
+
+  // Usage is reported back so the caller can record outer-agent tokens and cost.
+  const tokensIn = response.usage.input_tokens;
+  const tokensOut = response.usage.output_tokens;
+  return {
+    diagnosis: parsed.diagnosis,
+    hypothesis: parsed.hypothesis,
+    newStrategy: parsed.new_strategy,
+    tokens_in: tokensIn,
+    tokens_out: tokensOut,
+    cost_usd: costUsd(model, tokensIn, tokensOut),
+  };
+}
+
+// Mock outer agent — makes no API call. The first improvement appends a
+// "mock-note" section that changes nothing; once that note is present, the
+// next improvement appends the MOCK-FIX marker that flips the mock inner
+// agent to passing. `iter` is accepted but not consulted.
+function improveStrategyMock({ strategyMd, iter }) {
+  const noteAlreadyAdded = strategyMd.includes("mock-note");
+  let hypothesis;
+  let appended;
+  if (noteAlreadyAdded) {
+    hypothesis = "add MOCK-FIX marker";
+    appended = "\n## MOCK-FIX\nApply the fix that makes mock runs pass.\n";
+  } else {
+    hypothesis = "add wait (will not help)";
+    appended = "\n## mock-note\nFirst hypothesis: wait longer. (mock — does not help)\n";
+  }
+  const tokensIn = 12_000;
+  const tokensOut = 1_500;
+  return {
+    diagnosis: "mock diagnosis",
+    hypothesis,
+    newStrategy: strategyMd + appended,
+    tokens_in: tokensIn,
+    tokens_out: tokensOut,
+    cost_usd: costUsd("claude-opus-4-8", tokensIn, tokensOut),
+  };
+}
diff --git a/skills/autobrowse/evals/eval/prompts/outer-default.md b/skills/autobrowse/evals/eval/prompts/outer-default.md
new file mode 100644
index 0000000..d4c5605
--- /dev/null
+++ b/skills/autobrowse/evals/eval/prompts/outer-default.md
@@ -0,0 +1,13 @@
+You are the OUTER agent in the autobrowse self-improving loop. An inner browser-automation agent just attempted a task following the current strategy.md. Your job: read the evidence, form ONE hypothesis about the most impactful fix, and rewrite strategy.md.
+
+Rules — these mirror the autobrowse SKILL.md and are non-negotiable:
+
+1. **One hypothesis per iteration.** Find the exact turn where things went wrong. Ask: what single heuristic would have prevented it? Test one change at a time.
+2. **Build on wins.** Keep everything in the current strategy that worked. Never throw away site-specific knowledge (selectors, timing notes, URL shortcuts) that the trace shows being used successfully.
+3. **Be concrete.** Good strategies have: a fast path (direct URLs, shortcuts that skip exploration), a step-by-step workflow with exact commands and timing notes, site-specific knowledge (selector IDs, form field names, success indicators), and failure recovery (what to do when X goes wrong).
+4. **Ground every claim in the trace.** Cite the turn number or error message that motivates your change. A hypothesis like "the click didn't work" is weak; "turn 12: `browse click [2-147]` returned 'element not found' because the snapshot was taken before the dropdown finished animating — add `browse wait timeout 1000` after opening the dropdown" is strong.
+5. **The verifier verdict is ground truth.** If the inner agent claimed success but the verifier failed specific checks, the strategy must address WHY the agent extracted or did the wrong thing (wrong element, wrong filter, fabricated data) — and instruct it to verify before claiming success.
+6. **If the run passed**, make only conservative refinements: tighten the fast path, remove dead exploration steps, shorten. Do not restructure a working strategy.
+7. The strategy must work for a FRESH agent with no memory of previous runs. Write self-contained instructions, not commentary about past iterations.
+
+Return your full rewritten strategy.md — the complete file content, not a diff.
diff --git a/skills/autobrowse/evals/eval/prompts/outer-lean.md b/skills/autobrowse/evals/eval/prompts/outer-lean.md
new file mode 100644
index 0000000..2987596
--- /dev/null
+++ b/skills/autobrowse/evals/eval/prompts/outer-lean.md
@@ -0,0 +1 @@
+You improve instructions for a browser-automation agent. Below is the task, the current strategy.md, and what happened when an agent followed it (including an automated verifier's verdict). Figure out what went wrong and write a better strategy.md. Return the complete new file content.
diff --git a/skills/autobrowse/evals/eval/report.mjs b/skills/autobrowse/evals/eval/report.mjs
new file mode 100644
index 0000000..78d4bc9
Binary files /dev/null and b/skills/autobrowse/evals/eval/report.mjs differ
diff --git a/skills/autobrowse/evals/eval/run-matrix.mjs b/skills/autobrowse/evals/eval/run-matrix.mjs
new file mode 100644
index 0000000..175dc26
--- /dev/null
+++ b/skills/autobrowse/evals/eval/run-matrix.mjs
@@ -0,0 +1,251 @@
+#!/usr/bin/env node
+// run-matrix.mjs — eval orchestrator: condition × task × trial.
+//
+// Per cell: TRAIN (evaluate → verify → improve strategy, up to max_iters,
+// early-stop on convergence) then HOLDOUT (freeze best strategy, N fresh
+// runs, verify each). Every run appends one row to runs/results.jsonl.
+//
+// Usage:
+//   node eval/run-matrix.mjs --conditions baseline --tasks fixture-checkout,books-toscrape
+//   node eval/run-matrix.mjs --conditions baseline,inner-haiku --tasks all --trials 3
+//   node eval/run-matrix.mjs --conditions baseline --tasks fixture-checkout --mock
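+//   Flags: --phase train|holdout|all (default all), --results <file>,
+//          --trials <n> (default 1), --trial-offset <n> (default 0),
+//          --concurrency <n> (default 1), --mock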
+
+import "dotenv/config";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import * as net from "node:net";
+import { spawn } from "node:child_process";
+import { loadCondition, loadTaskMeta, listTasks, TASKS_DIR, RUNS_DIR, RESULTS_FILE, FIXTURES_DIR } from "./config.mjs";
+import { runInner } from "./lib/run-inner.mjs";
+import { runVerifier } from "./lib/run-verifier.mjs";
+import { loadRunOutput } from "./lib/extract-output.mjs";
+import { traceStats } from "./lib/trace-stats.mjs";
+import { appendResult } from "./lib/results.mjs";
+import { costUsd } from "./lib/pricing.mjs";
+import { improveStrategy } from "./outer-agent.mjs";
+
+// ── CLI args ────────────────────────────────────────────────────────
+
+// Return the token that follows `--<name>` in process.argv, or `fallback`
+// when the flag is absent, is the last token, or is followed by another
+// `--option`.
+function getArg(name, fallback) {
+  const flagPos = process.argv.indexOf(`--${name}`);
+  if (flagPos === -1) return fallback;
+  const candidate = process.argv[flagPos + 1];
+  if (!candidate || candidate.startsWith("--")) return fallback;
+  return candidate;
+}
+// True when `--<name>` appears anywhere on the command line.
+const hasFlag = (name) => process.argv.indexOf(`--${name}`) !== -1;
+
+// Parsed CLI options (module-level; read by the functions below).
+const conditionIds = getArg("conditions", "baseline").split(",");
+const taskArg = getArg("tasks", "all");
+const trials = parseInt(getArg("trials", "1"), 10);
+const trialOffset = parseInt(getArg("trial-offset", "0"), 10); // fresh trial numbers when adding runs later
+const phase = getArg("phase", "all");
+const mock = hasFlag("mock");
+const resultsFile = getArg("results", RESULTS_FILE);
+
+// Fail fast on bad input. A non-numeric --trials / --trial-offset parses to
+// NaN, which makes the trial loop run zero times without any message, and an
+// unrecognised --phase would fall through both phase guards and run
+// everything.
+const argErrors = [];
+if (!Number.isInteger(trials) || trials < 0) {
+  argErrors.push(`--trials must be a non-negative integer (got "${getArg("trials", "1")}")`);
+}
+if (!Number.isInteger(trialOffset) || trialOffset < 0) {
+  argErrors.push(`--trial-offset must be a non-negative integer (got "${getArg("trial-offset", "0")}")`);
+}
+if (!["train", "holdout", "all"].includes(phase)) {
+  argErrors.push(`--phase must be one of train|holdout|all (got "${phase}")`);
+}
+if (argErrors.length > 0) {
+  for (const msg of argErrors) console.error(`[matrix] ${msg}`);
+  process.exit(1);
+}
+
+const tasks = taskArg === "all" ? listTasks() : taskArg.split(",");
+
+// ── Fixture server (auto-start when a selected task needs it) ──────
+
+const FIXTURE_PORT = 4173;
+
+// Resolve true when something accepts a TCP connection on 127.0.0.1:<port>,
+// false when the connection attempt errors. Never rejects.
+function portInUse(port) {
+  return new Promise((resolve) => {
+    const probe = net.createConnection({ host: "127.0.0.1", port });
+    probe.once("connect", () => {
+      probe.destroy();
+      resolve(true);
+    });
+    probe.on("error", () => resolve(false));
+  });
+}
+
+// Start fixtures/serve.mjs when a selected task declares
+// requires: ["fixtures-server"], unless running in mock mode or something is
+// already listening on FIXTURE_PORT. Resolves to the spawned child process
+// (so the caller can kill it when done) or null when nothing was started.
+async function ensureFixtures(metas) {
+  if (mock) return null;
+  if (!metas.some((m) => (m.requires || []).includes("fixtures-server"))) return null;
+  if (await portInUse(FIXTURE_PORT)) {
+    console.error(`[matrix] fixtures server already running on :${FIXTURE_PORT}`);
+    return null;
+  }
+  const child = spawn("node", [path.join(FIXTURES_DIR, "serve.mjs")], { stdio: "ignore", detached: false });
+  // Without a listener, a failed spawn surfaces as an unhandled 'error' event.
+  child.on("error", (err) => console.error(`[matrix] fixtures server failed to start: ${err.message}`));
+
+  // Poll until the port accepts connections instead of trusting a fixed
+  // sleep, which can be too short on a slow or loaded machine.
+  const deadline = Date.now() + 5_000;
+  let ready = false;
+  while (!ready && Date.now() < deadline) {
+    await new Promise((r) => setTimeout(r, 100));
+    ready = await portInUse(FIXTURE_PORT);
+  }
+  if (ready) {
+    console.error(`[matrix] started fixtures server on :${FIXTURE_PORT} (pid ${child.pid})`);
+  } else {
+    console.error(`[matrix] fixtures server (pid ${child.pid}) not accepting connections on :${FIXTURE_PORT} after 5s — continuing anyway`);
+  }
+  return child;
+}
+
+// ── One eval cell: condition × task × trial ─────────────────────────
+
+// Run one cell end to end inside its own workspace
+// (<RUNS_DIR>/<cond.id>/<task>/trial-<trial>): a TRAIN loop and/or HOLDOUT
+// runs, selected by the module-level `phase`. Every inner run is verified and
+// appended to the results file as one row. Returns nothing.
+//   cond  — condition config; reads id, inner_model, outer_model, outer_prompt,
+//           max_iters, converge_window, converge_passes, holdout_runs
+//   meta  — task metadata; reads task, tier, env, max_turns, timeout_min
+//   trial — trial number, used in the workspace path and in each result row
+async function runCell(cond, meta, trial) {
+  const task = meta.task;
+  const workspace = path.join(RUNS_DIR, cond.id, task, `trial-${trial}`);
+  const wsTaskDir = path.join(workspace, "tasks", task);
+  fs.mkdirSync(wsTaskDir, { recursive: true });
+  // task.md is re-copied every time; strategy.md is only seeded when absent,
+  // so an existing workspace keeps the strategy it already has.
+  fs.copyFileSync(path.join(TASKS_DIR, task, "task.md"), path.join(wsTaskDir, "task.md"));
+  const strategyFile = path.join(wsTaskDir, "strategy.md");
+  if (!fs.existsSync(strategyFile)) fs.writeFileSync(strategyFile, `# ${task} Navigation Skill\n\n(learned through iterations)\n`);
+
+  const taskMd = fs.readFileSync(path.join(wsTaskDir, "task.md"), "utf-8");
+  // Columns shared by every row this cell writes.
+  const base = {
+    condition_id: cond.id, task, tier: meta.tier, trial, env: meta.env,
+    inner_model: cond.inner_model, outer_model: cond.outer_model, mock,
+  };
+
+  // Build one results row for a run and append it to resultsFile.
+  // claimed_success is the agent's own `success: true`; false_success marks a
+  // claim the verifier rejected. `extra` is spread last, so its keys win.
+  const recordRun = (phaseName, iter, runResult, verifierResult, extra = {}) => {
+    const output = runResult.trace_dir ? loadRunOutput(runResult.trace_dir) : null;
+    const claimed = output?.success === true;
+    const stats = runResult.trace_dir ? traceStats(runResult.trace_dir, runResult.duration_sec) : {};
+    const row = {
+      ...base,
+      phase: phaseName, iter,
+      run_id: runResult.run ?? null,
+      verified_pass: verifierResult.passed,
+      claimed_success: claimed,
+      false_success: claimed && !verifierResult.passed,
+      verifier_reason: verifierResult.reason ?? null,
+      status: runResult.status, stop_reason: runResult.stop_reason,
+      turns: runResult.turns, duration_sec: runResult.duration_sec,
+      ...stats,
+      tokens_in: runResult.tokens_in, tokens_out: runResult.tokens_out,
+      inner_cost_usd: +costUsd(cond.inner_model, runResult.tokens_in || 0, runResult.tokens_out || 0).toFixed(4),
+      ...extra,
+    };
+    appendResult(row, resultsFile);
+    return row;
+  };
+
+  // ── TRAIN ─────────────────────────────────────────────────────────
+  // trainPasses[i] is the verified pass/fail of iteration i + 1.
+  const trainPasses = [];
+  let lastPassingStrategyIter = null;
+
+  if (phase !== "holdout") {
+    for (let iter = 1; iter <= cond.max_iters; iter++) {
+      // Snapshot the strategy this run will use (versioned for revert/holdout).
+      fs.copyFileSync(strategyFile, path.join(wsTaskDir, `strategy.iter-${iter}.md`));
+
+      console.error(`[matrix] ${cond.id}/${task}/trial-${trial} TRAIN iter ${iter}/${cond.max_iters}`);
+      const runResult = runInner({
+        task, workspace, env: meta.env, model: cond.inner_model,
+        maxTurns: meta.max_turns, timeoutMin: meta.timeout_min, mock, iter,
+        logFile: path.join(workspace, "logs", `train-iter-${iter}.log`),
+      });
+      // No trace directory means the run never produced artifacts: count it as a failure.
+      const verifierResult = runResult.trace_dir
+        ? runVerifier(task, runResult.trace_dir)
+        : { passed: false, checks: [], reason: "no trace dir (harness error)" };
+
+      trainPasses.push(verifierResult.passed);
+      if (verifierResult.passed) lastPassingStrategyIter = iter;
+
+      // Converged: this run passed, at least converge_passes of the last
+      // converge_window runs passed, and at least two runs have happened.
+      const window = trainPasses.slice(-cond.converge_window);
+      const converged =
+        verifierResult.passed &&
+        window.filter(Boolean).length >= cond.converge_passes &&
+        trainPasses.length >= 2;
+
+      // Regression: the previous iteration passed and this one did not.
+      const regression = iter > 1 && trainPasses[iter - 2] === true && verifierResult.passed === false;
+
+      // Ask the outer agent for a new strategy unless converged or this was
+      // the last allowed iteration.
+      let improvement = null;
+      if (!converged && iter < cond.max_iters) {
+        // Revert a regressing edit before improving again (SKILL.md policy).
+        if (regression) {
+          fs.copyFileSync(path.join(wsTaskDir, `strategy.iter-${iter - 1}.md`), strategyFile);
+          console.error(`[matrix]   regression — reverted strategy to iter ${iter - 1}`);
+        }
+        const strategyMd = fs.readFileSync(strategyFile, "utf-8");
+        try {
+          improvement = await improveStrategy({
+            model: cond.outer_model, promptName: cond.outer_prompt,
+            taskMd, strategyMd, runResult, verifierResult, mock, iter,
+          });
+          fs.writeFileSync(strategyFile, improvement.newStrategy);
+        } catch (err) {
+          // An outer-agent failure is logged and recorded as the row's
+          // hypothesis with zero cost; the loop carries on.
+          console.error(`[matrix]   outer agent error: ${err.message}`);
+          improvement = { hypothesis: `OUTER-AGENT-ERROR: ${err.message}`, tokens_in: 0, tokens_out: 0, cost_usd: 0 };
+        }
+      }
+
+      recordRun("train", iter, runResult, verifierResult, {
+        regression,
+        converged_at: converged ? iter : null,
+        hypothesis: improvement?.hypothesis ?? null,
+        outer_tokens_in: improvement?.tokens_in ?? 0,
+        outer_tokens_out: improvement?.tokens_out ?? 0,
+        outer_cost_usd: improvement ? +improvement.cost_usd.toFixed(4) : 0,
+      });
+
+      if (converged) {
+        console.error(`[matrix]   converged at iter ${iter}`);
+        break;
+      }
+    }
+  }
+
+  // ── HOLDOUT ───────────────────────────────────────────────────────
+  if (phase !== "train") {
+    // Freeze the best strategy: the last version that produced a verified
+    // pass; else whatever training ended with.
+    if (lastPassingStrategyIter !== null) {
+      const best = path.join(wsTaskDir, `strategy.iter-${lastPassingStrategyIter}.md`);
+      // The passing run used the strategy *as snapshotted before that run*,
+      // unless it was also improved after — the snapshot is the right artifact.
+      fs.copyFileSync(best, strategyFile);
+    }
+    // Keep a copy of exactly what the holdout runs are evaluated against.
+    fs.copyFileSync(strategyFile, path.join(wsTaskDir, "strategy.holdout.md"));
+
+    for (let h = 1; h <= cond.holdout_runs; h++) {
+      console.error(`[matrix] ${cond.id}/${task}/trial-${trial} HOLDOUT ${h}/${cond.holdout_runs}`);
+      // NOTE(review): every holdout run passes iter: 99 — presumably a sentinel
+      // that keeps holdout runs distinct from training iterations; confirm in runInner.
+      const runResult = runInner({
+        task, workspace, env: meta.env, model: cond.inner_model,
+        maxTurns: meta.max_turns, timeoutMin: meta.timeout_min, mock, iter: 99,
+        logFile: path.join(workspace, "logs", `holdout-${h}.log`),
+      });
+      const verifierResult = runResult.trace_dir
+        ? runVerifier(task, runResult.trace_dir)
+        : { passed: false, checks: [], reason: "no trace dir (harness error)" };
+      // For holdout rows the `iter` column holds the holdout run index h.
+      recordRun("holdout", h, runResult, verifierResult);
+    }
+  }
+}
+
+// ── Main ────────────────────────────────────────────────────────────
+
+// Entry point: load the selected conditions and task metadata, start the
+// fixtures server if a task needs it, expand condition × task × trial into
+// cells, and drain them with a fixed-size worker pool. A cell that throws is
+// logged and does not stop the remaining cells.
+const conditions = conditionIds.map(loadCondition);
+const metas = tasks.map(loadTaskMeta);
+const fixturesChild = await ensureFixtures(metas);
+
+console.error(`[matrix] ${conditions.length} condition(s) × ${tasks.length} task(s) × ${trials} trial(s)${mock ? " [MOCK]" : ""}`);
+console.error(`[matrix] results → ${resultsFile}`);
+
+try {
+  const cells = [];
+  for (const cond of conditions) {
+    for (const meta of metas) {
+      for (let trial = trialOffset + 1; trial <= trialOffset + trials; trial++) {
+        cells.push({ cond, meta, trial });
+      }
+    }
+  }
+
+  // Concurrency: remote cells each get their own pre-created Browserbase
+  // session (isolated CDP attach), so they can run in parallel. Local mode is
+  // a single Chrome daemon — force sequential when any local task is selected.
+  //
+  // A non-numeric --concurrency parses to NaN. Math.max(1, NaN) is NaN, and
+  // Array.from({ length: NaN }) is empty, so no worker would start and the
+  // script would report "done" without running a single cell. Fall back to 1.
+  const requestedConcurrency = parseInt(getArg("concurrency", "1"), 10);
+  if (Number.isNaN(requestedConcurrency)) {
+    console.error("[matrix] invalid --concurrency value — falling back to 1");
+  }
+  let concurrency = Number.isNaN(requestedConcurrency) ? 1 : Math.max(1, requestedConcurrency);
+  if (!mock && concurrency > 1 && metas.some((m) => m.env === "local")) {
+    console.error("[matrix] local task selected — forcing --concurrency 1 (single Chrome daemon)");
+    concurrency = 1;
+  }
+
+  // Workers share `next` as a cursor into `cells`; each claims the next
+  // unclaimed index until the list is exhausted.
+  let next = 0;
+  await Promise.all(
+    Array.from({ length: Math.min(concurrency, cells.length) }, async () => {
+      while (true) {
+        const i = next++;
+        if (i >= cells.length) break;
+        const { cond, meta, trial } = cells[i];
+        try {
+          await runCell(cond, meta, trial);
+        } catch (err) {
+          console.error(`[matrix] cell ${cond.id}/${meta.task}/trial-${trial} crashed: ${err.message}`);
+        }
+      }
+    })
+  );
+} finally {
+  // Only a server started by this process is killed; a pre-existing one is left alone.
+  if (fixturesChild) fixturesChild.kill();
+}
+
+console.error("[matrix] done. Run: node eval/report.mjs");
diff --git a/skills/autobrowse/evals/eval/tasks/_lib/checks.mjs b/skills/autobrowse/evals/eval/tasks/_lib/checks.mjs
new file mode 100644
index 0000000..c463add
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/_lib/checks.mjs
@@ -0,0 +1,87 @@
+// Shared verifier toolkit. Every task's verify.mjs follows the protocol:
+//   node verify.mjs --run-dir <traceDir>
+//   → prints one JSON line {passed, checks: [{name, ok, detail}], reason}
+// Pass/fail lives in the JSON; a nonzero exit means the verifier itself broke.
+
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+import { loadRunOutput } from "../../lib/extract-output.mjs";
+
+// Read the `--run-dir <traceDir>` argument and return it as an absolute path.
+// Prints a usage line to stderr and exits with status 1 when the flag or its
+// value is missing.
+export function getRunDir() {
+  const flagAt = process.argv.indexOf("--run-dir");
+  const rawDir = flagAt === -1 ? undefined : process.argv[flagAt + 1];
+  if (!rawDir) {
+    console.error("usage: node verify.mjs --run-dir <traceDir>");
+    process.exit(1);
+  }
+  return path.resolve(rawDir);
+}
+
+// Thin alias over loadRunOutput from lib/extract-output.mjs: returns whatever
+// it returns for `runDir`. The verify scripts treat a falsy result as
+// "no output" and call emitNoOutput().
+export function loadOutput(runDir) {
+  return loadRunOutput(runDir);
+}
+
+// ── Check builders ──────────────────────────────────────────────────
+
+// Canonical form for fuzzy text comparison: lower-case, strip diacritics,
+// turn punctuation into spaces, collapse runs of whitespace, trim.
+// null / undefined normalize to "".
+export const norm = (s) =>
+  String(s ?? "")
+    .toLowerCase()
+    .normalize("NFKD")
+    // NFKD splits "é" into "e" plus a combining mark. Drop the mark here, or
+    // the next step turns it into a space and "Provénce" becomes "prove nce".
+    .replace(/\p{M}+/gu, "")
+    // Keep letters and digits of any script (ASCII-only \w erased them all).
+    .replace(/[^\p{L}\p{N}_\s]/gu, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+
+// Build one check record: `ok` coerced to a boolean, `detail` stringified and
+// capped at 300 characters.
+export function check(name, ok, detail = "") {
+  const passed = Boolean(ok);
+  const note = String(detail).slice(0, 300);
+  return { name, ok: passed, detail: note };
+}
+
+// Pass when, after norm(), both strings are non-empty and either one contains
+// the other. Containment is accepted in both directions, so an `actual` that
+// is only a fragment of `expected` also passes.
+export function checkFuzzyMatch(name, actual, expected) {
+  const got = norm(actual);
+  const want = norm(expected);
+  let matched = false;
+  if (got && want) matched = got.includes(want) || want.includes(got);
+  return check(name, matched, `actual="${actual}" expected≈"${expected}"`);
+}
+
+// Pass when the normalized haystack contains the normalized needle.
+export function checkContains(name, haystack, needle) {
+  const found = norm(haystack).includes(norm(needle));
+  return check(name, found, `looking for "${needle}"`);
+}
+
+// Numeric check. A string value is reduced to its digits, dots and minus
+// signs and parsed with parseFloat; any other value is used as-is.
+//   eq       — pass when the value is within 0.005 of eq (min/max are ignored)
+//   min, max — otherwise pass when the value lies in the inclusive range;
+//              an omitted bound is unbounded
+// A value that is not a finite number fails.
+export function checkNumber(name, value, { eq, min, max } = {}) {
+  let n = value;
+  if (typeof value === "string") n = parseFloat(value.replace(/[^0-9.\-]/g, ""));
+  if (typeof n !== "number" || !isFinite(n)) {
+    return check(name, false, `not a number: ${JSON.stringify(value)}`);
+  }
+  if (eq !== undefined) {
+    const withinTolerance = Math.abs(n - eq) < 0.005;
+    return check(name, withinTolerance, `got ${n}, expected ${eq}`);
+  }
+  const aboveMin = min === undefined || n >= min;
+  const belowMax = max === undefined || n <= max;
+  return check(name, aboveMin && belowMax, `got ${n}, expected [${min ?? "-∞"}, ${max ?? "∞"}]`);
+}
+
+// Pass when the trimmed value starts with a 24-hour H:MM / HH:MM time.
+// Only the start is anchored, so trailing text after the minutes is accepted.
+export function checkTime(name, value) {
+  const text = String(value ?? "").trim();
+  const looksLikeTime = /^([01]?\d|2[0-3]):[0-5]\d/.test(text);
+  return check(name, looksLikeTime, `got ${JSON.stringify(value)}`);
+}
+
+// ── Emit ────────────────────────────────────────────────────────────
+
+// Print the verdict as one JSON line and exit 0.
+//   requireAll = true  — pass only when no check failed
+//   requireAll = false — pass when at least one check succeeded
+// On failure, `reason` joins "name: detail" for every failed check.
+export function emit(checks, { requireAll = true } = {}) {
+  const failures = checks.filter((c) => !c.ok);
+  let passed;
+  if (requireAll) passed = failures.length === 0;
+  else passed = failures.length < checks.length;
+  const reason = passed
+    ? "all checks passed"
+    : failures.map((c) => `${c.name}: ${c.detail}`).join("; ");
+  console.log(JSON.stringify({ passed, checks, reason }));
+  process.exit(0);
+}
+
+// Print a failing verdict for a run with no parseable final output, then exit 0.
+export function emitNoOutput() {
+  const verdict = { passed: false, checks: [], reason: "no parseable final JSON output in run" };
+  console.log(JSON.stringify(verdict));
+  process.exit(0);
+}
+
+// Deterministic checkout-fixture confirmation code — must match the
+// implementation in fixtures/checkout/index.html exactly.
+// Lower-cases "name|email|zip|shipping", folds its code points into a rolling
+// base-31 hash modulo 100000, and formats the result as "BB-" plus five
+// zero-padded digits.
+export function checkoutCode(name, email, zip, shipping) {
+  const key = [name, email, zip, shipping].join("|").toLowerCase();
+  const hash = [...key].reduce((acc, ch) => (acc * 31 + ch.codePointAt(0)) % 100000, 0);
+  return "BB-" + String(hash).padStart(5, "0");
+}
diff --git a/skills/autobrowse/evals/eval/tasks/books-toscrape/meta.json b/skills/autobrowse/evals/eval/tasks/books-toscrape/meta.json
new file mode 100644
index 0000000..1574f24
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/books-toscrape/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "A",
+  "category": "list-extraction",
+  "env": "local",
+  "max_turns": 25,
+  "timeout_min": 15,
+  "gotchas": "listing truncates long titles (full title in the anchor's title attribute); single page for Travel but the agent must confirm no pagination"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/books-toscrape/mock-output.json b/skills/autobrowse/evals/eval/tasks/books-toscrape/mock-output.json
new file mode 100644
index 0000000..a74e18b
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/books-toscrape/mock-output.json
@@ -0,0 +1,7 @@
+{
+  "success": true,
+  "count": 11,
+  "cheapest": { "title": "The Road to Little Dribbling: Adventures of an American in Britain (Notes From a Small Island #2)", "price_gbp": 23.21 },
+  "most_expensive": { "title": "A Year in Provence (Provence #1)", "price_gbp": 56.88 },
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/books-toscrape/task.md b/skills/autobrowse/evals/eval/tasks/books-toscrape/task.md
new file mode 100644
index 0000000..c2e40ce
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/books-toscrape/task.md
@@ -0,0 +1,35 @@
+# Task: Extract Travel-category book stats from Books to Scrape
+
+List every book in the "Travel" category of books.toscrape.com and report the count plus the cheapest and most expensive books.
+
+## URL
+
+https://books.toscrape.com/catalogue/category/books/travel_2/index.html
+
+## Inputs
+
+- Category: Travel
+
+## Steps
+
+1. Navigate to the URL
+2. Extract every book in the category with its full title and price (watch for pagination — include all pages if any)
+3. Compute the count, the cheapest book, and the most expensive book
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "count": 0,
+  "cheapest": { "title": "...", "price_gbp": 0.0 },
+  "most_expensive": { "title": "...", "price_gbp": 0.0 },
+  "error_reasoning": null
+}
+```
+
+- Prices are in GBP (the £ amounts shown on the site); report them as numbers
+- Use the book's full title (the listing truncates some titles — the full title is in the link's title attribute or on the detail page)
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/books-toscrape/verify.mjs b/skills/autobrowse/evals/eval/tasks/books-toscrape/verify.mjs
new file mode 100644
index 0000000..4f91952
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/books-toscrape/verify.mjs
@@ -0,0 +1,17 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkNumber, checkFuzzyMatch } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Ground truth fetched 2026-06-09 — books.toscrape.com is a static demo site
+// that has not changed in years. 11 Travel books; cheapest "The Road to
+// Little Dribbling" £23.21; most expensive "A Year in Provence" £56.88.
+const { success, count, cheapest, most_expensive: priciest } = out;
+
+// Titles are matched fuzzily (the site appends series suffixes to some of
+// them); count and prices must match the ground truth.
+const checks = [
+  check("claimed success", success === true, JSON.stringify(success)),
+  checkNumber("count", count, { eq: 11 }),
+  checkFuzzyMatch("cheapest title", cheapest?.title, "The Road to Little Dribbling"),
+  checkNumber("cheapest price", cheapest?.price_gbp, { eq: 23.21 }),
+  checkFuzzyMatch("most expensive title", priciest?.title, "A Year in Provence"),
+  checkNumber("most expensive price", priciest?.price_gbp, { eq: 56.88 }),
+];
+emit(checks);
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-checkout/meta.json b/skills/autobrowse/evals/eval/tasks/fixture-checkout/meta.json
new file mode 100644
index 0000000..15dc91f
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-checkout/meta.json
@@ -0,0 +1,9 @@
+{
+  "tier": "A",
+  "category": "multi-step-form",
+  "env": "local",
+  "max_turns": 30,
+  "timeout_min": 15,
+  "requires": ["fixtures-server"],
+  "gotchas": "shipping radio buttons render 900ms after step 2 appears; Next buttons stay disabled until fields validate"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-checkout/mock-output.json b/skills/autobrowse/evals/eval/tasks/fixture-checkout/mock-output.json
new file mode 100644
index 0000000..e5021b5
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-checkout/mock-output.json
@@ -0,0 +1,7 @@
+{
+  "success": true,
+  "confirmation_code": "BB-09791",
+  "total_usd": 47.48,
+  "shipping": "express",
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-checkout/task.md b/skills/autobrowse/evals/eval/tasks/fixture-checkout/task.md
new file mode 100644
index 0000000..17def70
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-checkout/task.md
@@ -0,0 +1,41 @@
+# Task: Complete the Acme Store checkout
+
+Complete the multi-step checkout flow on the local fixture store and return the confirmation code.
+
+## URL
+
+http://localhost:4173/checkout/
+
+## Inputs
+
+- Full name: Ada Lovelace
+- Email: ada@example.com
+- Street address: 123 Bridge St
+- City: San Francisco
+- ZIP code: 94107
+- Shipping speed: Express
+
+## Steps
+
+1. Navigate to the URL
+2. Fill in the contact step (name, email) and continue
+3. Fill in the shipping step (address, city, ZIP), select **Express** shipping, and continue
+4. On the review step, confirm the order details and place the order
+5. Extract the confirmation code and the total charged from the confirmation screen
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "confirmation_code": "BB-12345",
+  "total_usd": 47.48,
+  "shipping": "express",
+  "error_reasoning": null
+}
+```
+
+- If task succeeds: `success: true`, populate all fields exactly as displayed
+- If task fails: `success: false`, populate `error_reasoning` with what blocked you
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-checkout/verify.mjs b/skills/autobrowse/evals/eval/tasks/fixture-checkout/verify.mjs
new file mode 100644
index 0000000..fd2a581
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-checkout/verify.mjs
@@ -0,0 +1,14 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkNumber, checkoutCode } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// The expected code is recomputed from the inputs listed in task.md.
+const expectedCode = checkoutCode("Ada Lovelace", "ada@example.com", "94107", "express");
+const reportedCode = String(out.confirmation_code).trim();
+const reportedShipping = String(out.shipping).toLowerCase();
+
+const checks = [
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("confirmation code", reportedCode === expectedCode, `got ${out.confirmation_code}, expected ${expectedCode}`),
+  checkNumber("total", out.total_usd, { eq: 47.48 }),
+  check("shipping", reportedShipping === "express", JSON.stringify(out.shipping)),
+];
+emit(checks);
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/meta.json b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/meta.json
new file mode 100644
index 0000000..6b2ffda
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/meta.json
@@ -0,0 +1,9 @@
+{
+  "tier": "A",
+  "category": "search-filter-extract",
+  "env": "local",
+  "max_turns": 25,
+  "timeout_min": 15,
+  "requires": ["fixtures-server"],
+  "gotchas": "results render 700ms after Search; default sort is by departure time, not price; cheaper one-stop and cheaper wrong-route flights are deliberate traps"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/mock-output.json b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/mock-output.json
new file mode 100644
index 0000000..c7a2306
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/mock-output.json
@@ -0,0 +1,9 @@
+{
+  "success": true,
+  "airline": "Meridian Air",
+  "flight_number": "MA 214",
+  "price_usd": 218,
+  "depart_time": "07:05",
+  "nonstop": true,
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/task.md b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/task.md
new file mode 100644
index 0000000..d25363c
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/task.md
@@ -0,0 +1,40 @@
+# Task: Find the cheapest nonstop SFO → JFK flight on FlightDeck
+
+Search the local FlightDeck fixture for flights from SFO to JFK and return the cheapest NONSTOP option.
+
+## URL
+
+http://localhost:4173/flightdeck/
+
+## Inputs
+
+- From: SFO
+- To: JFK
+- Constraint: nonstop flights only
+
+## Steps
+
+1. Navigate to the URL
+2. Select SFO as origin and JFK as destination
+3. Restrict results to nonstop flights (the "Nonstop only" checkbox, or filter the results yourself)
+4. Search and wait for results to load
+5. Identify the cheapest nonstop flight (note: results are NOT sorted by price by default)
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "airline": "...",
+  "flight_number": "XX 123",
+  "price_usd": 0,
+  "depart_time": "HH:MM",
+  "nonstop": true,
+  "error_reasoning": null
+}
+```
+
+- If task succeeds: `success: true`, populate fields exactly as displayed
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/verify.mjs b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/verify.mjs
new file mode 100644
index 0000000..d007e61
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/fixture-flightdeck/verify.mjs
@@ -0,0 +1,17 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkNumber, checkFuzzyMatch } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Ground truth seeded in fixtures/flightdeck/index.html: cheapest nonstop
+// SFO→JFK is Meridian Air MA 214, $218, departing 07:05. Traps: a $189
+// one-stop on the same route and a $149 nonstop SFO→BOS.
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkFuzzyMatch("airline", out.airline, "Meridian Air"),
+  check("flight number", String(out.flight_number).replace(/\s+/g, "") === "MA214", JSON.stringify(out.flight_number)),
+  checkNumber("price", out.price_usd, { eq: 218 }),
+  check("depart time", String(out.depart_time).includes("07:05") || String(out.depart_time).includes("7:05"), JSON.stringify(out.depart_time)),
+  check("nonstop", out.nonstop === true, JSON.stringify(out.nonstop)),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/google-flights/meta.json b/skills/autobrowse/evals/eval/tasks/google-flights/meta.json
new file mode 100644
index 0000000..8d82b78
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/google-flights/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "search-filter-extract",
+  "env": "local",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "airport fields are comboboxes needing keystrokes + dropdown selection; stops filter is in a Stops chip; prices drift run to run (verifier checks invariants only)"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/google-flights/mock-output.json b/skills/autobrowse/evals/eval/tasks/google-flights/mock-output.json
new file mode 100644
index 0000000..48f6aea
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/google-flights/mock-output.json
@@ -0,0 +1,11 @@
+{
+  "success": true,
+  "date": "2026-08-12",
+  "flights": [
+    { "airline": "JetBlue", "depart_time": "07:15", "arrive_time": "15:48", "price_usd": 199, "nonstop": true },
+    { "airline": "Delta", "depart_time": "08:30", "arrive_time": "17:05", "price_usd": 228, "nonstop": true },
+    { "airline": "United", "depart_time": "11:00", "arrive_time": "19:32", "price_usd": 241, "nonstop": true }
+  ],
+  "cheapest_price_usd": 199,
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/google-flights/task.md b/skills/autobrowse/evals/eval/tasks/google-flights/task.md
new file mode 100644
index 0000000..035f994
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/google-flights/task.md
@@ -0,0 +1,41 @@
+# Task: Find cheapest nonstop SFO → JFK on Google Flights
+
+Search Google Flights for one-way nonstop flights from SFO to JFK on 2026-08-12 and return the cheapest options. Based on the browse.sh `google.com/search-flights` skill definition.
+
+## URL
+
+https://www.google.com/travel/flights
+
+## Inputs
+
+- From: SFO (San Francisco)
+- To: JFK (New York)
+- Date: 2026-08-12 (one-way)
+- Passengers: 1 adult, economy
+- Stops filter: nonstop only
+
+## Steps
+
+1. Navigate to Google Flights
+2. Set up the one-way search SFO → JFK on 2026-08-12
+3. Apply the "Nonstop only" stops filter
+4. Wait for results, then extract the top nonstop options sorted by price
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "date": "2026-08-12",
+  "flights": [
+    { "airline": "...", "depart_time": "HH:MM", "arrive_time": "HH:MM", "price_usd": 0, "nonstop": true }
+  ],
+  "cheapest_price_usd": 0,
+  "error_reasoning": null
+}
+```
+
+- Include at least the 3 cheapest nonstop options (fewer only if fewer exist)
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/google-flights/verify.mjs b/skills/autobrowse/evals/eval/tasks/google-flights/verify.mjs
new file mode 100644
index 0000000..f429a13
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/google-flights/verify.mjs
@@ -0,0 +1,32 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkNumber, checkTime } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Live site — verify invariants, not exact prices. SFO→JFK nonstop one-way
+// economy reliably exists and prices in a sane band.
+const flights = Array.isArray(out.flights) ? out.flights : [];
+const KNOWN_AIRLINES = ["alaska", "american", "delta", "jetblue", "united", "frontier", "hawaiian", "southwest", "spirit"];
+
+const checks = [
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("date echoed", String(out.date).startsWith("2026-08-12"), JSON.stringify(out.date)),
+  check("≥1 flight", flights.length >= 1, `got ${flights.length}`),
+  check("all nonstop", flights.length > 0 && flights.every((f) => f.nonstop === true), JSON.stringify(flights.map((f) => f.nonstop))),
+  check(
+    "airlines plausible",
+    flights.length > 0 && flights.every((f) => KNOWN_AIRLINES.some((a) => String(f.airline).toLowerCase().includes(a))),
+    flights.map((f) => f.airline).join(", ")
+  ),
+  ...flights.slice(0, 5).map((f, i) => checkNumber(`flight[${i}] price band`, f.price_usd, { min: 80, max: 1500 })),
+  ...flights.slice(0, 5).map((f, i) => checkTime(`flight[${i}] depart time`, f.depart_time)),
+  checkNumber("cheapest price band", out.cheapest_price_usd, { min: 80, max: 1500 }),
+];
+
+if (flights.length > 0) {
+  const min = Math.min(...flights.map((f) => Number(f.price_usd)).filter((n) => isFinite(n)));
+  checks.push(check("cheapest consistent with list", Number(out.cheapest_price_usd) <= min + 0.01, `cheapest=${out.cheapest_price_usd}, list min=${min}`));
+}
+
+emit(checks);
diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/meta.json b/skills/autobrowse/evals/eval/tasks/opentable-availability/meta.json
new file mode 100644
index 0000000..b8565ce
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "availability-widget",
+  "env": "remote",
+  "max_turns": 30,
+  "timeout_min": 25,
+  "gotchas": "Akamai bot wall on plain sessions — harness pre-creates verified+proxied Browserbase sessions (connect-url path); detail page accepts dateTime/covers query params (skips widget interaction); widget times render in 12h format — convert; do not click slots (that starts a booking)"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/mock-output.json b/skills/autobrowse/evals/eval/tasks/opentable-availability/mock-output.json
new file mode 100644
index 0000000..e0e07e3
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/mock-output.json
@@ -0,0 +1,9 @@
+{
+  "success": true,
+  "restaurant": "Arquet",
+  "date": "2026-08-15",
+  "party_size": 2,
+  "has_availability": true,
+  "slots": ["17:45", "18:00", "20:15", "21:00"],
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/task.md b/skills/autobrowse/evals/eval/tasks/opentable-availability/task.md
new file mode 100644
index 0000000..87a600a
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/task.md
@@ -0,0 +1,41 @@
+# Task: Check OpenTable availability at Arquet (San Francisco)
+
+Check OpenTable for available reservation time slots at Arquet in San Francisco for a party of 2 on 2026-08-15 around dinner time. Read-only — do not book. Based on the browse.sh `opentable.com/check-availability` skill definition.
+
+## URL
+
+https://www.opentable.com/r/arquet-san-francisco
+
+## Inputs
+
+- Restaurant: Arquet, San Francisco
+- Date: 2026-08-15 (a Saturday)
+- Party size: 2
+- Time window: dinner (17:00–21:30)
+
+## Steps
+
+1. Navigate to the restaurant's OpenTable page (the URL accepts query params for date/party size; using them is fine)
+2. Set the date to 2026-08-15 and party size to 2
+3. Read the reservation widget's available time slots in the dinner window
+4. Do NOT click any slot or book anything
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "restaurant": "Arquet",
+  "date": "2026-08-15",
+  "party_size": 2,
+  "has_availability": true,
+  "slots": ["18:00", "18:15"],
+  "error_reasoning": null
+}
+```
+
+- `has_availability: false` with an empty `slots` array is a VALID successful result (the restaurant may simply be booked)
+- Times in 24h HH:MM format
+- If task fails (couldn't load the widget at all): `success: false` with `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/opentable-availability/verify.mjs b/skills/autobrowse/evals/eval/tasks/opentable-availability/verify.mjs
new file mode 100644
index 0000000..0f45e2b
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/opentable-availability/verify.mjs
@@ -0,0 +1,27 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, checkTime } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Live availability changes run to run — verify structural invariants and
+// internal consistency, not specific slots.
+const slots = Array.isArray(out.slots) ? out.slots : null;
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkContains("restaurant", out.restaurant, "arquet"),
+  check("date echoed", String(out.date).startsWith("2026-08-15"), JSON.stringify(out.date)),
+  check("party size", Number(out.party_size) === 2, JSON.stringify(out.party_size)),
+  check("slots is array", slots !== null, JSON.stringify(out.slots)),
+  check(
+    "availability consistent",
+    (out.has_availability === true && slots?.length > 0) || (out.has_availability === false && slots?.length === 0),
+    `has_availability=${out.has_availability}, slots=${slots?.length}`
+  ),
+  ...(slots ?? []).slice(0, 8).map((s, i) => checkTime(`slot[${i}] format`, s)),
+  ...(slots ?? []).slice(0, 8).map((s, i) => {
+    const [h] = String(s).split(":").map(Number);
+    return check(`slot[${i}] in dinner window`, h >= 16 && h <= 22, String(s));
+  }),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/meta.json b/skills/autobrowse/evals/eval/tasks/stockx-price/meta.json
new file mode 100644
index 0000000..16dc497
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "C",
+  "category": "bot-protected-extraction",
+  "env": "remote",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "PerimeterX bot protection — requires Browserbase stealth/proxies; search results include many variants (pick exact style DZ5485-612); price shown depends on selected size"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/mock-output.json b/skills/autobrowse/evals/eval/tasks/stockx-price/mock-output.json
new file mode 100644
index 0000000..fc0dc4b
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/mock-output.json
@@ -0,0 +1,8 @@
+{
+  "success": true,
+  "product": "Air Jordan 1 Retro High OG Chicago Lost and Found",
+  "style_code": "DZ5485-612",
+  "last_sale_usd": 285,
+  "lowest_ask_usd": 297,
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/task.md b/skills/autobrowse/evals/eval/tasks/stockx-price/task.md
new file mode 100644
index 0000000..43458b1
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/task.md
@@ -0,0 +1,35 @@
+# Task: Get the StockX resale price for the Jordan 1 "Chicago Lost and Found"
+
+Look up the Air Jordan 1 Retro High OG "Chicago Lost and Found" on StockX and return its current market data. Read-only — never place a bid or buy. Based on the browse.sh `stockx.com/get-resale-price` skill definition.
+
+## URL
+
+https://stockx.com
+
+## Inputs
+
+- Product: Air Jordan 1 Retro High OG "Chicago Lost and Found" (style DZ5485-612)
+
+## Steps
+
+1. Navigate to StockX (bot-protected — use remote/stealth browsing)
+2. Search for the product and open its product page
+3. Extract: full product name, lowest ask or last sale price (USD), and the style code if shown
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "product": "...",
+  "style_code": "DZ5485-612",
+  "last_sale_usd": 0,
+  "lowest_ask_usd": 0,
+  "error_reasoning": null
+}
+```
+
+- At least one of `last_sale_usd` / `lowest_ask_usd` must be populated with a positive number; set the other to `null` if it is not shown
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/stockx-price/verify.mjs b/skills/autobrowse/evals/eval/tasks/stockx-price/verify.mjs
new file mode 100644
index 0000000..962d849
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/stockx-price/verify.mjs
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, norm } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Live marketplace — invariant checks. This shoe has traded ~$150–$400 for
+// years; a wide band still catches fabricated or wrong-product prices.
+const product = norm(out.product);
+const price = [out.last_sale_usd, out.lowest_ask_usd]
+  .map(Number)
+  .find((n) => isFinite(n) && n > 0);
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("product is Jordan 1", product.includes("jordan 1"), out.product),
+  check("product is Chicago L&F", product.includes("chicago") && (product.includes("lost") || product.includes("found")), out.product),
+  checkContains("style code", out.style_code, "DZ5485-612"),
+  check("a price populated", price !== undefined, JSON.stringify({ last_sale: out.last_sale_usd, lowest_ask: out.lowest_ask_usd })),
+  check("price plausible", price !== undefined && price >= 100 && price <= 1000, `got ${price}`),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/meta.json b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/meta.json
new file mode 100644
index 0000000..f05d254
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "search-and-extract",
+  "env": "remote",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "ppubs.uspto.gov is a heavy SPA with its own query syntax; record pages render in an embedded viewer"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/mock-output.json b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/mock-output.json
new file mode 100644
index 0000000..eaf14a2
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/mock-output.json
@@ -0,0 +1,9 @@
+{
+  "success": true,
+  "patent_number": "11000000",
+  "title": "Repositioning wires and methods for repositioning prosthetic heart valve devices within a heart chamber and related systems, devices and methods",
+  "inventors": ["Jason S. Diedering", "Saravana B. Kumar"],
+  "assignee": "4C Medical Technologies, Inc.",
+  "grant_date": "2021-05-11",
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/task.md b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/task.md
new file mode 100644
index 0000000..9af2026
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/task.md
@@ -0,0 +1,35 @@
+# Task: Look up US Patent 11,000,000
+
+Search the USPTO patent database (or USPTO Patent Public Search at ppubs.uspto.gov) for US patent number 11,000,000 and extract its bibliographic details. Based on the browse.sh `uspto.gov/search-patents` skill definition.
+
+## URL
+
+https://ppubs.uspto.gov/pubwebapp/
+
+## Inputs
+
+- Patent number: 11000000
+
+## Steps
+
+1. Navigate to USPTO Patent Public Search (or another official USPTO search surface)
+2. Search for patent number 11000000
+3. Open the patent record and extract: title, inventors, assignee, grant date
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "patent_number": "11000000",
+  "title": "...",
+  "inventors": ["..."],
+  "assignee": "...",
+  "grant_date": "YYYY-MM-DD",
+  "error_reasoning": null
+}
+```
+
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/verify.mjs b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/verify.mjs
new file mode 100644
index 0000000..e2c57aa
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/uspto-patent-lookup/verify.mjs
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, norm } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Patents are immutable — exact ground truth. US 11,000,000 B2:
+// "Repositioning wires and methods for repositioning prosthetic heart valve
+// devices within a heart chamber...", 4C Medical Technologies, granted
+// 2021-05-11, inventors incl. Jason S. Diedering, Saravana B. Kumar.
+const inventors = norm(JSON.stringify(out.inventors ?? ""));
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  check("patent number", String(out.patent_number).replace(/[^0-9]/g, "") === "11000000", JSON.stringify(out.patent_number)),
+  checkContains("title", out.title, "repositioning"),
+  checkContains("title mentions heart valve", out.title, "heart valve"),
+  check("inventor Diedering", inventors.includes("diedering"), inventors.slice(0, 120)),
+  checkContains("assignee", out.assignee, "4C Medical"),
+  check("grant date", String(out.grant_date).startsWith("2021-05-11"), JSON.stringify(out.grant_date)),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/meta.json b/skills/autobrowse/evals/eval/tasks/yelp-reviews/meta.json
new file mode 100644
index 0000000..432da6e
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "C",
+  "category": "bot-protected-extraction",
+  "env": "remote",
+  "max_turns": 35,
+  "timeout_min": 25,
+  "gotchas": "DataDome CAPTCHA wall — needs Browserbase stealth + residential proxies; review dates render as relative strings sometimes; login-walled actions must be avoided"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/mock-output.json b/skills/autobrowse/evals/eval/tasks/yelp-reviews/mock-output.json
new file mode 100644
index 0000000..bb87c6f
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/mock-output.json
@@ -0,0 +1,12 @@
+{
+  "success": true,
+  "name": "Tartine Bakery",
+  "rating": 4.0,
+  "review_count": 8900,
+  "reviews": [
+    { "reviewer": "Maya L.", "rating": 5, "date": "2026-06-01", "text": "The morning bun is still the single best pastry in San Francisco. Line moved fast on a Tuesday morning and staff were lovely." },
+    { "reviewer": "Derek W.", "rating": 3, "date": "2026-05-28", "text": "Great bread, genuinely world class, but the line was 40 minutes and there is nowhere to sit. Get it to go and walk to Dolores Park." },
+    { "reviewer": "Priya S.", "rating": 4, "date": "2026-05-25", "text": "Croissant was perfectly laminated and the coffee was solid. Docking a star because they were out of the country bread by 10am." }
+  ],
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/task.md b/skills/autobrowse/evals/eval/tasks/yelp-reviews/task.md
new file mode 100644
index 0000000..24a2796
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/task.md
@@ -0,0 +1,38 @@
+# Task: Extract Yelp reviews for Tartine Bakery (San Francisco)
+
+Extract Tartine Bakery's rating, review count, and its 5 most recent reviews from Yelp. Read-only. Based on the browse.sh `yelp.com/extract-reviews` skill definition (simplified filter surface: sort = newest, limit = 5).
+
+## URL
+
+https://www.yelp.com/biz/tartine-bakery-san-francisco
+
+## Inputs
+
+- Business: Tartine Bakery, San Francisco
+- Sort: newest
+- Limit: 5 reviews
+
+## Steps
+
+1. Navigate to the business page (DataDome bot protection — use remote/stealth browsing)
+2. Extract the overall rating and total review count
+3. Sort reviews by newest and extract the top 5: reviewer name, rating, date, full text
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "name": "Tartine Bakery",
+  "rating": 4.0,
+  "review_count": 0,
+  "reviews": [
+    { "reviewer": "...", "rating": 5, "date": "YYYY-MM-DD", "text": "..." }
+  ],
+  "error_reasoning": null
+}
+```
+
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/yelp-reviews/verify.mjs b/skills/autobrowse/evals/eval/tasks/yelp-reviews/verify.mjs
new file mode 100644
index 0000000..e37b69e
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/yelp-reviews/verify.mjs
@@ -0,0 +1,20 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains, checkNumber } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// Live site — invariants. Tartine has held ~4 stars with >8,000 reviews for
+// years; per-review structure is the real fabrication check.
+const reviews = Array.isArray(out.reviews) ? out.reviews : [];
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkContains("name", out.name, "tartine"),
+  checkNumber("rating band", out.rating, { min: 3.0, max: 5.0 }),
+  checkNumber("review count", out.review_count, { min: 5000, max: 50000 }),
+  check("≥3 reviews", reviews.length >= 3, `got ${reviews.length}`),
+  ...reviews.slice(0, 5).map((r, i) => checkNumber(`review[${i}] rating`, r?.rating, { min: 1, max: 5 })),
+  ...reviews.slice(0, 5).map((r, i) => check(`review[${i}] has text`, String(r?.text ?? "").length >= 40, `len=${String(r?.text ?? "").length}`)),
+  ...reviews.slice(0, 5).map((r, i) => check(`review[${i}] has date`, /\d{4}/.test(String(r?.date ?? "")), JSON.stringify(r?.date))),
+]);
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/meta.json b/skills/autobrowse/evals/eval/tasks/youtube-transcript/meta.json
new file mode 100644
index 0000000..9fd51bb
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/meta.json
@@ -0,0 +1,8 @@
+{
+  "tier": "B",
+  "category": "media-extraction",
+  "env": "local",
+  "max_turns": 30,
+  "timeout_min": 20,
+  "gotchas": "transcript button hidden behind '...more' description expander; consent dialogs may appear; player keyboard shortcuts can pause/seek accidentally"
+}
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/mock-output.json b/skills/autobrowse/evals/eval/tasks/youtube-transcript/mock-output.json
new file mode 100644
index 0000000..085f1dd
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/mock-output.json
@@ -0,0 +1,12 @@
+{
+  "success": true,
+  "title": "Me at the zoo",
+  "channel": "jawed",
+  "has_transcript": true,
+  "segments": [
+    { "ts": "0:00", "text": "All right, so here we are in front of the elephants" },
+    { "ts": "0:05", "text": "the cool thing about these guys is that they have really really really long trunks" },
+    { "ts": "0:12", "text": "and that's cool" }
+  ],
+  "error_reasoning": null
+}
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/task.md b/skills/autobrowse/evals/eval/tasks/youtube-transcript/task.md
new file mode 100644
index 0000000..d0f723a
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/task.md
@@ -0,0 +1,35 @@
+# Task: Extract the transcript of "Me at the zoo"
+
+Extract the transcript of the first YouTube video ever uploaded. Based on the browse.sh `youtube.com/extract-transcript` skill definition.
+
+## URL
+
+https://www.youtube.com/watch?v=jNQXAC9IVRw
+
+## Inputs
+
+- Video: "Me at the zoo" (video ID jNQXAC9IVRw)
+
+## Steps
+
+1. Navigate to the video page
+2. Find the video title and channel name
+3. Open the transcript panel (usually under the "...more" description → "Show transcript")
+4. Extract the transcript segments with timestamps
+
+## Output
+
+Return a JSON object:
+
+```json
+{
+  "success": true,
+  "title": "...",
+  "channel": "...",
+  "has_transcript": true,
+  "segments": [{ "ts": "0:00", "text": "..." }],
+  "error_reasoning": null
+}
+```
+
+- If task fails: `success: false`, populate `error_reasoning`
diff --git a/skills/autobrowse/evals/eval/tasks/youtube-transcript/verify.mjs b/skills/autobrowse/evals/eval/tasks/youtube-transcript/verify.mjs
new file mode 100644
index 0000000..0721a94
--- /dev/null
+++ b/skills/autobrowse/evals/eval/tasks/youtube-transcript/verify.mjs
@@ -0,0 +1,20 @@
+#!/usr/bin/env node
+import { getRunDir, loadOutput, emit, emitNoOutput, check, checkContains } from "../_lib/checks.mjs";
+
+const out = loadOutput(getRunDir());
+if (!out) emitNoOutput();
+
+// 19-second 2005 video; content is immutable. Transcript famously mentions
+// the elephants' "really really long trunks".
+const segments = Array.isArray(out.segments) ? out.segments : [];
+const fullText = segments.map((s) => s?.text ?? "").join(" ");
+
+emit([
+  check("claimed success", out.success === true, JSON.stringify(out.success)),
+  checkContains("title", out.title, "Me at the zoo"),
+  checkContains("channel", out.channel, "jawed"),
+  check("has transcript", out.has_transcript === true, JSON.stringify(out.has_transcript)),
+  check("≥2 segments", segments.length >= 2, `got ${segments.length}`),
+  checkContains("transcript mentions elephants", fullText, "elephants"),
+  checkContains("transcript mentions trunks", fullText, "trunks"),
+]);
diff --git a/skills/autobrowse/evals/fixtures/checkout/index.html b/skills/autobrowse/evals/fixtures/checkout/index.html
new file mode 100644
index 0000000..9a801c3
--- /dev/null
+++ b/skills/autobrowse/evals/fixtures/checkout/index.html
@@ -0,0 +1,117 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<title>Acme Checkout</title>
+<style>
+  body { font-family: system-ui, sans-serif; max-width: 560px; margin: 40px auto; }
+  .step { display: none; }
+  .step.active { display: block; }
+  label { display: block; margin: 10px 0 2px; }
+  input, select { padding: 6px; width: 280px; }
+  button { margin-top: 16px; padding: 8px 18px; }
+  button:disabled { opacity: 0.4; }
+  #confirmation { display: none; border: 2px solid #2a7; padding: 16px; }
+  .muted { color: #777; font-size: 13px; }
+</style>
+</head>
+<body>
+<h1>Acme Store — Checkout</h1>
+<p class="muted">Cart: Widget Pro × 2 @ $19.99 each</p>
+
+<div id="step1" class="step active">
+  <h2>Step 1 — Contact</h2>
+  <label for="name">Full name</label>
+  <input id="name" name="name" type="text" />
+  <label for="email">Email</label>
+  <input id="email" name="email" type="email" />
+  <br /><button id="next1" disabled>Next</button>
+</div>
+
+<div id="step2" class="step">
+  <h2>Step 2 — Shipping</h2>
+  <label for="address">Street address</label>
+  <input id="address" name="address" type="text" />
+  <label for="city">City</label>
+  <input id="city" name="city" type="text" />
+  <label for="zip">ZIP code</label>
+  <input id="zip" name="zip" type="text" />
+  <div id="shipping-options" class="muted">Loading shipping options…</div>
+  <br /><button id="next2" disabled>Next</button>
+</div>
+
+<div id="step3" class="step">
+  <h2>Step 3 — Review</h2>
+  <div id="review"></div>
+  <p><strong>Total: $<span id="total"></span></strong></p>
+  <button id="place-order">Place order</button>
+</div>
+
+<div id="confirmation">
+  <h2>Order confirmed 🎉</h2>
+  <p>Your confirmation code is <strong id="confirmation-code"></strong></p>
+  <p>Total charged: $<span id="confirmed-total"></span></p>
+</div>
+
+<script>
+  // Deterministic confirmation code — the eval verifier recomputes this
+  // exact function from the task inputs.
+  function checkoutCode(name, email, zip, shipping) {
+    const s = (name + "|" + email + "|" + zip + "|" + shipping).toLowerCase();
+    let sum = 0;
+    for (const ch of s) sum = (sum * 31 + ch.codePointAt(0)) % 100000;
+    return "BB-" + String(sum).padStart(5, "0");
+  }
+
+  const $ = (id) => document.getElementById(id);
+  const state = {};
+
+  // Step 1: Next enables only with a name and a plausible email.
+  function validate1() {
+    $("next1").disabled = !($("name").value.trim().length > 1 && /.+@.+\..+/.test($("email").value));
+  }
+  $("name").addEventListener("input", validate1);
+  $("email").addEventListener("input", validate1);
+  $("next1").addEventListener("click", () => {
+    state.name = $("name").value.trim();
+    state.email = $("email").value.trim();
+    $("step1").classList.remove("active");
+    $("step2").classList.add("active");
+    // Gotcha: shipping radios render after a delay — naive agents snapshot too early.
+    setTimeout(() => {
+      $("shipping-options").innerHTML =
+        '<label><input type="radio" name="shipping" value="standard" checked /> Standard (free, 5–7 days)</label>' +
+        '<label><input type="radio" name="shipping" value="express" /> Express ($7.50, 1–2 days)</label>';
+      validate2();
+    }, 900);
+  });
+
+  function validate2() {
+    const ship = document.querySelector('input[name="shipping"]');
+    $("next2").disabled = !($("address").value.trim() && $("city").value.trim() && /^\d{5}$/.test($("zip").value) && ship);
+  }
+  ["address", "city", "zip"].forEach((id) => $(id).addEventListener("input", validate2));
+
+  $("next2").addEventListener("click", () => {
+    state.address = $("address").value.trim();
+    state.city = $("city").value.trim();
+    state.zip = $("zip").value.trim();
+    state.shipping = document.querySelector('input[name="shipping"]:checked').value;
+    state.total = (2 * 19.99 + (state.shipping === "express" ? 7.5 : 0)).toFixed(2);
+    $("review").innerHTML =
+      `<p>${state.name} &lt;${state.email}&gt;</p><p>${state.address}, ${state.city} ${state.zip}</p>` +
+      `<p>Shipping: ${state.shipping}</p><p>Widget Pro × 2 — $39.98</p>`;
+    $("total").textContent = state.total;
+    $("step2").classList.remove("active");
+    $("step3").classList.add("active");
+  });
+
+  $("place-order").addEventListener("click", () => {
+    $("step3").classList.remove("active");
+    $("confirmation-code").textContent = checkoutCode(state.name, state.email, state.zip, state.shipping);
+    $("confirmed-total").textContent = state.total;
+    $("confirmation").style.display = "block";
+  });
+</script>
+</body>
+</html>
diff --git a/skills/autobrowse/evals/fixtures/flightdeck/index.html b/skills/autobrowse/evals/fixtures/flightdeck/index.html
new file mode 100644
index 0000000..2fc17d4
--- /dev/null
+++ b/skills/autobrowse/evals/fixtures/flightdeck/index.html
@@ -0,0 +1,82 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<title>FlightDeck — Search</title>
+<style>
+  body { font-family: system-ui, sans-serif; max-width: 640px; margin: 40px auto; }
+  .flight { border: 1px solid #ccc; padding: 8px 12px; margin: 6px 0; }
+  .price { font-weight: bold; float: right; }
+  button, select { padding: 6px 10px; }
+  .muted { color: #777; font-size: 13px; }
+</style>
+</head>
+<body>
+<h1>FlightDeck</h1>
+<label>From
+  <select id="origin">
+    <option>SFO</option><option>LAX</option><option>SEA</option>
+  </select>
+</label>
+<label>To
+  <select id="dest">
+    <option>JFK</option><option>BOS</option><option>ORD</option>
+  </select>
+</label>
+<label><input type="checkbox" id="nonstop" /> Nonstop only</label>
+<button id="search">Search</button>
+<button id="sort-price" style="display:none">Sort by price</button>
+<div id="results"><p class="muted">Search to see flights.</p></div>
+
+<script>
+  // Seeded, immutable flight data — ground truth for the eval verifier.
+  // Cheapest NONSTOP SFO→JFK is Meridian Air MA 214 at $218. Traps: HZ 17 is cheaper ($189)
+  // but has one stop; AB 117 ($149) and MA 77 ($198) are nonstop but fly other routes.
+  const FLIGHTS = [
+    { airline: "Horizon Connect", flight: "HZ 17",  from: "SFO", to: "JFK", depart: "06:10", arrive: "17:25", stops: 1, price: 189 },
+    { airline: "Meridian Air",    flight: "MA 214", from: "SFO", to: "JFK", depart: "07:05", arrive: "15:40", stops: 0, price: 218 },
+    { airline: "Atlantic Blue",   flight: "AB 901", from: "SFO", to: "JFK", depart: "09:30", arrive: "18:05", stops: 0, price: 232 },
+    { airline: "Pacific Western", flight: "PW 88",  from: "SFO", to: "JFK", depart: "11:45", arrive: "20:21", stops: 0, price: 245 },
+    { airline: "Meridian Air",    flight: "MA 442", from: "SFO", to: "JFK", depart: "13:20", arrive: "21:58", stops: 0, price: 261 },
+    { airline: "Horizon Connect", flight: "HZ 233", from: "SFO", to: "JFK", depart: "16:40", arrive: "03:55", stops: 1, price: 205 },
+    { airline: "Atlantic Blue",   flight: "AB 117", from: "SFO", to: "BOS", depart: "08:15", arrive: "16:50", stops: 0, price: 149 },
+    { airline: "Pacific Western", flight: "PW 301", from: "SFO", to: "BOS", depart: "10:05", arrive: "18:44", stops: 0, price: 171 },
+    { airline: "Meridian Air",    flight: "MA 77",  from: "LAX", to: "JFK", depart: "07:55", arrive: "16:12", stops: 0, price: 198 },
+    { airline: "Atlantic Blue",   flight: "AB 555", from: "LAX", to: "JFK", depart: "12:30", arrive: "20:48", stops: 0, price: 226 },
+    { airline: "Horizon Connect", flight: "HZ 64",  from: "SEA", to: "ORD", depart: "06:45", arrive: "12:31", stops: 0, price: 142 },
+    { airline: "Pacific Western", flight: "PW 410", from: "SEA", to: "JFK", depart: "09:10", arrive: "17:36", stops: 0, price: 234 },
+  ];
+
+  const $ = document.getElementById.bind(document); // shorthand: element id → element
+  let current = []; // flights from the latest search, in the order they are displayed
+
+  function render(list) {
+    // One card per flight (data-flight holds the flight number), or a notice when the list is empty.
+    const card = (f) =>
+      `<div class="flight" data-flight="${f.flight}"><span class="price">$${f.price}</span>` +
+      `<strong>${f.airline} ${f.flight}</strong><br/>${f.from} ${f.depart} → ${f.to} ${f.arrive} · ` +
+      `${f.stops === 0 ? "Nonstop" : f.stops + " stop"}</div>`;
+    const html = list.length ? list.map(card).join("") : "<p>No flights found.</p>";
+    $("results").innerHTML = html;
+  }
+
+  $("search").addEventListener("click", () => {
+    $("results").innerHTML = '<p class="muted">Searching…</p>';
+    // Gotcha (deliberate): results show up only after 700 ms and are ordered by
+    // DEPARTURE time, so the first card listed is not necessarily the cheapest.
+    setTimeout(() => {
+      const [origin, dest, nonstopOnly] = [$("origin").value, $("dest").value, $("nonstop").checked];
+      const matches = FLIGHTS.filter((f) => f.from === origin && f.to === dest && (f.stops === 0 || !nonstopOnly));
+      current = matches.sort((a, b) => a.depart.localeCompare(b.depart));
+      render(current);
+      $("sort-price").style.display = "inline-block";
+    }, 700);
+  });
+
+  $("sort-price").addEventListener("click", () => {
+    current = current.slice().sort((x, y) => x.price - y.price); // sort a copy, cheapest first
+    render(current);
+  });
+</script>
+</body>
+</html>
diff --git a/skills/autobrowse/evals/fixtures/serve.mjs b/skills/autobrowse/evals/fixtures/serve.mjs
new file mode 100644
index 0000000..b428300
--- /dev/null
+++ b/skills/autobrowse/evals/fixtures/serve.mjs
@@ -0,0 +1,48 @@
+#!/usr/bin/env node
+// Tiny static server for the Tier A deterministic fixture sites.
+// Usage: node fixtures/serve.mjs [port]   (default 4173)
+
+import * as http from "node:http";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+
+// Directory containing this script; only files whose resolved path starts with it are served.
+const ROOT = path.dirname(fileURLToPath(import.meta.url));
+const PORT = parseInt(process.argv[2] || "4173", 10);
+
+// Extension → Content-Type; any other extension is sent as application/octet-stream.
+const TYPES = { ".html": "text/html", ".js": "text/javascript", ".css": "text/css", ".json": "application/json" };
+
+http
+  .createServer((req, res) => {
+    let urlPath;
+    try {
+      urlPath = decodeURIComponent(new URL(req.url, "http://x").pathname);
+    } catch {
+      // new URL() rejects unparsable request targets and decodeURIComponent() throws
+      // on malformed escapes such as "%E0%A4%A"; answer 400 instead of letting the
+      // exception escape the handler and take the whole server down.
+      res.writeHead(400).end("bad request");
+      return;
+    }
+    if (urlPath.endsWith("/")) urlPath += "index.html";
+    const file = path.join(ROOT, path.normalize(urlPath).replace(/^(\.\.[/\\])+/, ""));
+    if (!file.startsWith(ROOT) || !fs.existsSync(file) || fs.statSync(file).isDirectory()) {
+      res.writeHead(404).end("not found");
+      return;
+    }
+    const stream = fs.createReadStream(file);
+    // The file can vanish or be unreadable after the checks above; without a
+    // listener the stream's "error" event would crash the process.
+    stream.on("error", () => {
+      if (!res.headersSent) res.writeHead(500);
+      res.end();
+    });
+    // Send the 200 only once the file is actually open, so a failed open can still become a 500.
+    stream.once("open", () => {
+      res.writeHead(200, { "content-type": TYPES[path.extname(file)] || "application/octet-stream" });
+      stream.pipe(res);
+    });
+  })
+  .listen(PORT, () => console.error(`fixtures on http://localhost:${PORT}/ (checkout/, flightdeck/)`));
diff --git a/skills/autobrowse/evals/package.json b/skills/autobrowse/evals/package.json
new file mode 100644
index 0000000..667ad81
--- /dev/null
+++ b/skills/autobrowse/evals/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "autobrowse-evals",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "description": "Eval harness for the autobrowse self-improving browser-automation loop: convergence, accuracy, speed, and token cost across models/prompts/architectures.",
+  "scripts": {
+    "fixtures": "node fixtures/serve.mjs",
+    "test:verifiers": "node scripts/test-verifiers.mjs",
+    "matrix": "node eval/run-matrix.mjs",
+    "report": "node eval/report.mjs"
+  },
+  "dependencies": {
+    "@anthropic-ai/sdk": "^0.74.0",
+    "dotenv": "^17.2.3"
+  }
+}
diff --git a/skills/autobrowse/evals/scripts/test-verifiers.mjs b/skills/autobrowse/evals/scripts/test-verifiers.mjs
new file mode 100644
index 0000000..e4eef1d
--- /dev/null
+++ b/skills/autobrowse/evals/scripts/test-verifiers.mjs
@@ -0,0 +1,71 @@
+#!/usr/bin/env node
+// Verifier self-test. For every task: its mock-output.json (a documented
+// known-good output) MUST pass its verifier, and a garbage claimed-success
+// output MUST fail it. Catches both broken verifiers and verifiers an agent
+// could trivially reward-hack with {"success": true}.
+
+import * as fs from "node:fs";
+import * as os from "node:os";
+import * as path from "node:path";
+import { listTasks, TASKS_DIR } from "../eval/config.mjs";
+import { runVerifier } from "../eval/lib/run-verifier.mjs";
+
+// Claims success but carries no task-specific data; every verifier must reject it.
+const GARBAGE = { success: true, note: "fabricated", value: 42 };
+
+/**
+ * Create a fresh temp directory holding a result.json that wraps `output`.
+ * The caller owns the directory and is responsible for deleting it.
+ * @param {unknown} output - stored as `parsed`, and stringified as `raw`.
+ * @returns {string} Path of the new directory.
+ */
+function makeRunDir(output) {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "verify-test-"));
+  fs.writeFileSync(path.join(dir, "result.json"), JSON.stringify({ parsed: output, raw: JSON.stringify(output), parse_error: null }));
+  return dir;
+}
+
+/**
+ * Run `task`'s verifier against `output`, then remove the temp run directory —
+ * even when the verifier throws — so repeated runs do not pile up in os.tmpdir().
+ * NOTE(review): assumes runVerifier is synchronous, since its result is read immediately below.
+ * @returns whatever runVerifier returns (this script reads `passed` and `reason`).
+ */
+function verifyOutput(task, output) {
+  const dir = makeRunDir(output);
+  try {
+    return runVerifier(task, dir);
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+}
+
+let failures = 0;
+for (const task of listTasks()) {
+  let mockOutput;
+  try {
+    mockOutput = JSON.parse(fs.readFileSync(path.join(TASKS_DIR, task, "mock-output.json"), "utf-8"));
+  } catch (err) {
+    // A missing or malformed fixture counts as this task's failure; keep checking the rest.
+    failures++;
+    console.log(`❌ ${task.padEnd(24)} mock-output.json unreadable (${err.message})`);
+    continue;
+  }
+
+  const good = verifyOutput(task, mockOutput);
+  const bad = verifyOutput(task, GARBAGE);
+
+  const goodOk = good.passed === true;
+  const badOk = bad.passed === false;
+  if (!goodOk || !badOk) failures++;
+
+  console.log(
+    `${goodOk && badOk ? "✅" : "❌"} ${task.padEnd(24)} known-good ${goodOk ? "passes" : `FAILS (${good.reason})`}; garbage ${badOk ? "rejected" : "ACCEPTED (verifier is hackable!)"}`
+  );
+}
+
+if (failures) {
+  console.error(`\n${failures} verifier(s) broken`);
+  process.exit(1);
+}
+console.log("\nAll verifiers sound.");