18 changes: 18 additions & 0 deletions .claude/launch.json
@@ -6,6 +6,24 @@
"runtimeExecutable": "bash",
"runtimeArgs": ["-c", "source ~/.nvm/nvm.sh && cd frontend && npm run dev"],
"port": 5173
},
{
"name": "backend",
"runtimeExecutable": "bash",
"runtimeArgs": [
"-c",
"CUDA_VISIBLE_DEVICES='' uv run daydream-scope --port 8033"
],
"port": 8033
},
{
"name": "scope-cloud",
"runtimeExecutable": "bash",
"runtimeArgs": [
"-c",
"CUDA_VISIBLE_DEVICES='' SCOPE_CLOUD_MODE=livepeer SCOPE_CLOUD_APP_ID='daydream/scope-livepeer-pr-971--preview/ws' uv run daydream-scope"
],
"port": 8000
}
]
}
65 changes: 65 additions & 0 deletions .github/workflows/eval.yml
@@ -0,0 +1,65 @@
name: Agent Evals

# Manual-dispatch only. These evals hit the live Anthropic API, so we do NOT
# run them on push/pull_request — they cost money and are inherently noisy.
on:
workflow_dispatch:
inputs:
case:
description: "Case name to run (blank = all cases)"
required: false
default: ""
runs:
description: "Samples per case"
required: false
default: "5"
model:
description: "Model id override (blank = default)"
required: false
default: ""
fail_threshold:
description: "Overall pass-rate threshold (0-100; blank = no gate)"
required: false
default: ""

jobs:
evals:
runs-on: ubuntu-latest
name: Run Scope agent evals
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
version: "0.9.11"

- name: Install dependencies
run: uv sync --group dev

- name: Run evals
shell: bash
run: |
args=(--runs "${{ inputs.runs }}")
if [ -n "${{ inputs.case }}" ]; then
args+=(--case "${{ inputs.case }}")
fi
if [ -n "${{ inputs.model }}" ]; then
args+=(--model "${{ inputs.model }}")
fi
if [ -n "${{ inputs.fail_threshold }}" ]; then
args+=(--fail-threshold "${{ inputs.fail_threshold }}")
fi
uv run python -m evals "${args[@]}"

- name: Upload artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: eval-artifacts
path: evals/outputs/
retention-days: 14
3 changes: 3 additions & 0 deletions .gitignore
@@ -24,3 +24,6 @@ notes/
.cursor/
.specstory/
*.local*

# Eval harness artifacts
evals/outputs/
138 changes: 138 additions & 0 deletions evals/README.md
@@ -0,0 +1,138 @@
# Scope Agent Eval Harness

Measures how often the agentic workflow builder produces a workflow that
matches the user's intent from a single natural-language prompt.

Each **case** = one prompt + structural checks. The runner drives the real
agent via in-process ASGI (no uvicorn, no port) N times and prints a
pass-rate table per case.

## Quickstart

```bash
# Install deps (one-time):
uv sync --group dev

# Ensure an Anthropic key is set:
export ANTHROPIC_API_KEY=sk-ant-...

# Run everything, 5 samples per case (default):
uv run python -m evals

# Run just one case, 1 sample (fast smoke):
uv run python -m evals --case starter-ltx-text-to-video --runs 1

# Cheaper iteration:
uv run python -m evals --model claude-haiku-4-5

# Enforce a bar in CI-like mode:
uv run python -m evals --runs 10 --fail-threshold 90
```

Artifacts land in `evals/outputs/<case>/r<NN>/`:

- `proposal.json` — the full graph the agent proposed.
- `meta.json` — pass/fail, failures, rationale, wall time.
- `trace.jsonl` — every SSE event the agent emitted (one per line).

## Authoring a case

Drop a file in `evals/cases/my-case.yaml`:

```yaml
name: my-case
description: one-line explanation of what good looks like
prompt: |
A natural-language prompt — as if a user typed it into the agent chat.
runs: 5
expect:
# Each entry is a single-key mapping: {check_name: argument}.
- pipelines_include: [longlive]
- wire_present: { kind: vace_to_pipeline }
- no_validator_errors: true
forbid:
- bad_handle_prefix: "parameter:"
```

### Available checks

Registered in [`grader.py`](grader.py):

| Check | Argument | Passes when… |
| ----- | -------- | ------------ |
| `pipelines_equal` | `[ids]` | Pipeline nodes' `pipeline_id`s exactly equal the set. |
| `pipelines_include` | `[ids]` | Pipeline nodes include every id in the list (extras ok). |
| `pipelines_count_at_least` | `int` | At least N pipeline nodes exist (any ids). Good for vague prompts. |
| `lora_count_at_least` | `int` | Total LoRA entries across `lora` UI nodes ≥ N. |
| `wire_present` | `{kind, …}` | An edge of the named kind exists. See below. |
| `node_present` | `{type, count?, min_items?}` | ≥ `count` UI nodes of `type`. For `prompt_list`, `min_items` asserts list length. |
| `no_validator_errors` | _(any)_ | `_validate_proposal()` returns zero errors on the graph. |
| `bad_handle_prefix` | `"parameter:"` | (Forbid) No edge handle starts with the prefix. |
| `orphan_sinks` | _(any)_ | (Forbid) Every top-level `sink` node has at least one incoming top-level `stream` edge. Catches cases where the agent tacks on an extra sink that isn't wired to anything. |

`wire_present` kinds:

| Kind | Extra args | Matches |
| ---- | ---------- | ------- |
| `slider_to_pipeline_param` | `target_handle: "param:noise_scale"` | UI-value node → pipeline's `targetHandle`. |
| `vace_to_pipeline` | — | VACE UI node → pipeline's `param:__vace`. |
| `image_to_vace` | — | Image (or value) node → VACE node's `param:ref_image`/`first_frame`/`last_frame`. |
| `prompt_to_pipeline` | — | Any source → pipeline's `param:__prompt`. |
| `lora_to_pipeline` | — | LoRA node → pipeline's `param:__loras`. |
| `prompt_list_to_pipeline` | — | `prompt_list` UI node → pipeline's `param:__prompt`. |
| `trigger_to_prompt_list` | — | Value source → `prompt_list`'s `param:trigger`/`param:cycle`. |
| `pipeline_to_record` | — | A pipeline's stream output → a `record` UI node. |
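
Kinds that take extra arguments put them alongside `kind` in the same
mapping, matching the `{kind, …}` form above:

```yaml
expect:
  - wire_present: { kind: slider_to_pipeline_param, target_handle: "param:noise_scale" }
```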

Adding a new check type = adding a function to `grader.py` and registering
it in `CHECKS`. The YAML format picks it up automatically.
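
Roughly (a sketch only; the actual check signature, return convention, and
proposal layout are defined in `grader.py` and may differ):

```python
# Hypothetical check, not the grader's real API: assumes a check receives the
# proposal graph plus the YAML argument and returns an error string (None
# means the check passed), and that nodes carry a "type" field.
def pipelines_count_at_most(proposal: dict, arg: int) -> str | None:
    pipelines = [n for n in proposal.get("nodes", []) if n.get("type") == "pipeline"]
    if len(pipelines) > arg:
        return f"expected at most {arg} pipeline nodes, found {len(pipelines)}"
    return None

CHECKS["pipelines_count_at_most"] = pipelines_count_at_most
```

A case would then reference it by name, e.g. `- pipelines_count_at_most: 2`.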

### Case tone: precise vs. vague

Real users send prompts across a wide range of specificity. Cases should
cover that range:

- **Precise** (`complex-krea-prompt-switch-record`) — the prompt names the
pipeline, exact counts, specific behaviors. Graders assert the precise
structure: `pipelines_include: [krea-realtime-video]`,
`node_present: { type: prompt_list, min_items: 5 }`, specific wires.
- **Vague** (`vague-capture-moments`) — the prompt says what the user
wants to *do*, not how. Graders assert only what the intent clearly
implies (`pipelines_count_at_least: 1`, `node_present: { type: record }`).
The agent gets latitude on everything else; the eval measures whether
it makes reasonable choices.
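
Concretely, the vague case's expectation block stays small:

```yaml
expect:
  - pipelines_count_at_least: 1
  - node_present: { type: record }
```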

Prefer adding vague cases as the pass rate on precise ones improves: vague
prompts surface gap-filling failures that don't show up when every detail
is spelled out.

## Pytest integration

A single smoke test at `tests/test_evals_smoke.py` runs one case under
`@pytest.mark.eval`. A plain `pytest` run skips it (the pytest addopts
include `-m 'not eval'`). To include it:

```bash
uv run pytest -m eval
```

This only verifies the harness wires up end-to-end — it doesn't enforce
pass-rates. For pass-rate enforcement, use `python -m evals`.
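
The test file itself isn't reproduced here; a minimal sketch of its shape,
assuming the runner's `main(argv)` can be called directly the way
`__main__.py` does (the real test may differ):

```python
# tests/test_evals_smoke.py -- illustrative sketch, not the actual file.
import pytest

from evals.runner import main


@pytest.mark.eval  # excluded by default via addopts `-m 'not eval'`
def test_harness_smoke():
    # One case, one sample: just proves the harness runs end-to-end.
    # No pass-rate assertion; that is what `python -m evals --fail-threshold` is for.
    main(["--case", "starter-ltx-text-to-video", "--runs", "1"])
```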

## CI

The workflow at `.github/workflows/eval.yml` runs on manual dispatch only
(`workflow_dispatch`). It is **not** hooked into `pull_request` or `push`:
LLM evals cost money and are inherently noisy at the edges. Gate launch
decisions on the measured pass rate, not on a green PR check.

## Design notes

- The driver uses `httpx.ASGITransport` + `asgi-lifespan` so we hit the
real `/api/v1/agent/chat` endpoint without spawning a server (see the
sketch after this list). This is the same endpoint the frontend uses,
so behavior is identical to production.
- Each case spins up an isolated `AgentSession`; no cross-case
contamination. Conversation history does not leak between runs.
- Grading is deterministic and structural. No LLM-as-judge in v1.
- Model/provider overrides flow through the on-disk agent config file so
runs respect the same resolution order the server uses.
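
A minimal sketch of that driver pattern (the request payload and the way the
app object is obtained are assumptions, not the harness's actual code):

```python
# Sketch only: `app` is the ASGI app object, however it is obtained; the JSON
# payload shape is hypothetical. The point is the pattern: LifespanManager
# runs startup/shutdown hooks, ASGITransport routes requests in-process, and
# the SSE lines come back on the same endpoint the UI uses.
import httpx
from asgi_lifespan import LifespanManager


async def ask_agent(app, prompt: str) -> list[str]:
    async with LifespanManager(app):
        transport = httpx.ASGITransport(app=app)
        async with httpx.AsyncClient(transport=transport, base_url="http://eval") as client:
            async with client.stream(
                "POST", "/api/v1/agent/chat", json={"message": prompt}, timeout=None
            ) as resp:
                return [line async for line in resp.aiter_lines()]
```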
10 changes: 10 additions & 0 deletions evals/__init__.py
@@ -0,0 +1,10 @@
"""Eval harness for the Scope agentic workflow builder.

Each "case" is a YAML file in ``evals/cases/`` describing a natural-language
prompt, how many times to sample the model, and structural checks to run on
the resulting workflow proposal. The runner drives the real agent via an
in-process ASGI transport and grades proposals deterministically.

This package is NOT imported by the running server; it is only exercised by
``python -m evals`` (CLI) and the opt-in ``pytest -m eval`` smoke test.
"""
10 changes: 10 additions & 0 deletions evals/__main__.py
@@ -0,0 +1,10 @@
"""``python -m evals`` entry point."""

from __future__ import annotations

import sys

from .runner import main

if __name__ == "__main__":
raise SystemExit(main(sys.argv[1:]))
104 changes: 104 additions & 0 deletions evals/case.py
@@ -0,0 +1,104 @@
"""YAML → Case dataclass loader for the eval harness.

A case file looks like::

name: starter-mythical-creature
description: |
Reproduces the Mythical Creature teaching starter.
prompt: |
I want a slime creature ...
runs: 5
expect:
- pipelines_equal: [longlive]
- wire_present: { kind: vace_to_pipeline }
forbid:
- bad_handle_prefix: "parameter:"

Each entry under ``expect`` / ``forbid`` is a single-key mapping whose key is
the name of a check in :mod:`evals.grader` and whose value is the check
argument. We deliberately keep the format flat and declarative so adding a
case is just dropping a new YAML file.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml


@dataclass
class CheckSpec:
"""One graded check: ``(name, arg)`` where ``name`` resolves to a function
in :mod:`evals.grader`."""

name: str
arg: Any


@dataclass
class Case:
name: str
prompt: str
description: str = ""
runs: int = 5
expect: list[CheckSpec] = field(default_factory=list)
forbid: list[CheckSpec] = field(default_factory=list)
source_path: Path | None = None
# When true, the case passes iff the agent did NOT emit a
# ``workflow_proposal`` SSE event. Used for runtime-tweak cases where
# the right tool is ``update_parameters`` and re-proposing the graph
# is the regression we want to catch.
forbid_proposal: bool = False


def _parse_check_list(raw: list[Any], context: str) -> list[CheckSpec]:
"""Convert a list of single-key mappings to ``CheckSpec``s."""
out: list[CheckSpec] = []
for idx, entry in enumerate(raw or []):
if not isinstance(entry, dict) or len(entry) != 1:
raise ValueError(
f"{context}[{idx}] must be a single-key mapping, got: {entry!r}"
)
((name, arg),) = entry.items()
if not isinstance(name, str):
raise ValueError(f"{context}[{idx}] check name must be a string")
out.append(CheckSpec(name=name, arg=arg))
return out


def load_case(path: Path) -> Case:
"""Load a single case YAML file into a :class:`Case`."""
data = yaml.safe_load(path.read_text()) or {}
if not isinstance(data, dict):
raise ValueError(f"{path}: expected a mapping at top level")

name = data.get("name") or path.stem
prompt = data.get("prompt")
if not isinstance(prompt, str) or not prompt.strip():
raise ValueError(f"{path}: 'prompt' is required and must be a non-empty string")

runs = data.get("runs", 5)
if not isinstance(runs, int) or runs < 1:
raise ValueError(f"{path}: 'runs' must be a positive integer")

return Case(
name=str(name),
prompt=prompt,
description=str(data.get("description") or ""),
runs=runs,
expect=_parse_check_list(data.get("expect") or [], f"{path}:expect"),
forbid=_parse_check_list(data.get("forbid") or [], f"{path}:forbid"),
source_path=path,
forbid_proposal=bool(data.get("forbid_proposal", False)),
)


def discover_cases(cases_dir: Path) -> list[Case]:
"""Load every ``*.yaml`` / ``*.yml`` case in ``cases_dir``, alpha-sorted."""
paths = sorted(
p for p in cases_dir.iterdir() if p.suffix in (".yaml", ".yml") and p.is_file()
)
return [load_case(p) for p in paths]
29 changes: 29 additions & 0 deletions evals/cases/complex-krea-prompt-switch-record.yaml
@@ -0,0 +1,29 @@
name: complex-krea-prompt-switch-record
description: |
Multi-concept request that tests: picking a specific pipeline (krea), wiring
a reference image via VACE, using a prompt_list with ≥5 items driven by a
button/trigger, and wiring the output into a record node. Verbatim phrasing
of the kind a user would type.
prompt: |
Make a krea workflow that allows me to supply a reference image, switch
between 5 prompts with a button press, and record the output.
runs: 5
expect:
- pipelines_include: [krea-realtime-video]
# Reference image path: krea supports VACE, so the reference image flows
# image -> vace -> pipeline's param:__vace aggregate.
- wire_present: { kind: image_to_vace }
- wire_present: { kind: vace_to_pipeline }
# Prompt switching: prompt_list node with at least 5 entries, its output
# feeding the pipeline's aggregate prompt input.
- node_present: { type: prompt_list, min_items: 5 }
- wire_present: { kind: prompt_list_to_pipeline }
- wire_present: { kind: trigger_to_prompt_list }
# Recording: at least one record node wired to the pipeline's stream output.
- node_present: { type: record }
- wire_present: { kind: pipeline_to_record }
- no_validator_errors: true
forbid:
- bad_handle_prefix: "parameter:"
- orphan_sinks: true
- overlapping_nodes: true