diff --git a/.gitignore b/.gitignore
index 852ea75..e750d18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,12 @@ build/
 venv/
 env/
 
+# IDE configurations
+.vscode/
+.idea/
+*.sublime-project
+*.sublime-workspace
+
 # Test / tooling caches
 .pytest_cache/
 .mypy_cache/
@@ -31,4 +37,6 @@ cuda_installer.pyz
 interface/smoke_tests/results/
 # Generated evaluation outputs
 mazes/results/*_results.json
-mazes/results/*.progress.json
\ No newline at end of file
+mazes/results/*.progress.json
+# Bare-bones run pipeline artifact tree (regenerated locally)
+artifacts/
\ No newline at end of file
diff --git a/docs/system_design.md b/docs/system_design.md
index 16010d4..6f1dad3 100644
--- a/docs/system_design.md
+++ b/docs/system_design.md
@@ -18,7 +18,7 @@ This document is the single canonical source of truth for how the MultiNet v2.0
 1. [Overview & north stars](#1-overview--north-stars)
 2. [Pipeline DAG: stages, artifacts, invalidation](#2-pipeline-dag-stages-artifacts-invalidation)
 3. [Task spec contract](#3-task-spec-contract)
-4. [Static scoring (13 dimensions)](#4-static-scoring-13-dimensions)
+4. [Static scoring (12 dimensions plus canonical-agent features)](#4-static-scoring-12-dimensions-plus-canonical-agent-features)
 5. [Runtime scoring](#5-runtime-scoring)
 6. [Backend & inference adapter contracts](#6-backend--inference-adapter-contracts)
 7. [Reporting & aggregate](#7-reporting--aggregate)
@@ -74,18 +74,18 @@ The pipeline is a five-stage DAG. Each stage has declared inputs and outputs and
 2. **Solve & Score-static**
    - Inputs: `task.json`.
    - Outputs:
-     - `canonical_paths.json` `{ bfs: { path, steps, states_explored }, greedy: { success, path, steps }, … }`
-     - `scored.json` `{ is_beatable, dimensions[13], fragility, mechanism_necessity_violations, distractor_safety_violations, message }`
+     - `canonical_paths.json` `{ bfs: { actions, positions, optimal_steps, states_explored }, greedy: { success, actions, positions, steps }, … }`
+     - `scored_static.json` `{ is_beatable, dimensions_12, canonical_agent_features, validation, message }`
    - Hash key: `hash(solver_v, scorer_v, task.json, agent_set_v)`.
-   - If `scored.json.is_beatable == false`, downstream stages skip the task; it is logged as ineligible and surfaced in reports.
+   - If `scored_static.json.is_beatable == false`, downstream stages skip the task; it is logged as ineligible and surfaced in reports.
 
 3. **Render-and-Run**
-   - Inputs: `task.json`, `scored.json` (gate on `is_beatable`), backend choice, adapter choice, `model_id`, `seed`.
+   - Inputs: `task.json`, `scored_static.json` (gate on `is_beatable`), backend choice, adapter choice, `model_id`, `seed`.
    - Outputs: `run.json` `{ trajectory, actions, tokens, terminated, success }`.
    - Hash key: `hash(backend_v, adapter_v, model_id, task.json, seed)`.
 
 4. **Score-runtime**
-   - Inputs: `run.json`, `scored.json`, `canonical_paths.json`.
+   - Inputs: `run.json`, `scored_static.json`, `canonical_paths.json`.
    - Outputs: `run_score.json` `{ success, step_ratio, cell_overlap_*, distractor_interactions, irreversible_failures, tokens, composite }`.
    - Hash key: `hash(runtime_scorer_v, inputs)`.
 
@@ -106,7 +106,7 @@ artifacts/
 ├── tasks/<task_id>/
 │   ├── task.json                # Stage 1
 │   ├── canonical_paths.json     # Stage 2 (a)
-│   └── scored.json              # Stage 2 (b) — includes is_beatable
+│   └── scored_static.json       # Stage 2 (b) — includes is_beatable
 ├── runs/<task_id>/<backend>/<adapter>/<model_id>/<seed>/
 │   ├── run.json                 # Stage 3
 │   └── run_score.json           # Stage 4
@@ -218,9 +218,9 @@ Enforced by `TaskSpecification.validate()`:
 
 ---
 
-## 4. Static scoring (13 dimensions)
+## 4. Static scoring (12 dimensions plus canonical-agent features)
 
-Static scoring runs once per task at pipeline stage 2 (Solve & Score-static). It produces `scored.json`, which carries `is_beatable` plus a 13-dimension vector and supporting validation reports. The static scorer consumes `task.json` and `canonical_paths.json`.
+Static scoring runs once per task at pipeline stage 2 (Solve & Score-static). It produces `scored_static.json`, which carries `is_beatable`, a 12-dimension vector, canonical-agent features, and supporting validation reports. The scorer consumes `task.json` and emits this artifact alongside `canonical_paths.json`.
 
 ### 4.1 Dimensions
 
@@ -238,7 +238,7 @@ All raw values are floats (or counts cast to float). Higher = harder *unless* ex
 10. **`wall_density`** — Source: spec. Computation: `len(walls) / grid_size`. Crude (does not separate interior vs functional walls); **calibration target**.
 11. **`partial_observability`** — Source: spec rules. Computation: ordinal `{full: 0, view_cone: 1, fog_of_war: 2}` from `rules.observability`.
 12. **`irreversibility`** — Source: spec rules + mechanisms. Computation: `key_consumption × #doors + #one_shot_switches + #non_bidirectional_teleporters`.
-13. **`greedy_solvability`** — Source: Greedy canonical agent. Computation: `1.0 if greedy succeeds else 0.0`. **Penalty** (greedy-solvable tasks lower the runtime composite, on the rationale that they are less a test of spatial reasoning).
+`greedy_solvability` is recorded separately under `canonical_agent_features`, rather than appended to the calibrated 12-dimension vector. Source: Greedy canonical agent. Computation: `1.0 if greedy succeeds else 0.0`. **Penalty** (greedy-solvable tasks lower the runtime composite, on the rationale that they are less a test of spatial reasoning).
 
 ### 4.2 Static composite (difficulty score)
 
@@ -246,13 +246,13 @@ All raw values are floats (or counts cast to float). Higher = harder *unless* ex
 static_composite = Σ_i (raw_dim_i × calibration.weights[dim_name_i])
 ```
 
-- `calibration.weights` lives in `calibration.yaml`; defaults to `1.0` for all dimensions until empirical tuning.
+- Calibration weights live in `scorer/scorer_config.json` by default; optional JSON or YAML overrides may be passed explicitly. Weights default to `1.0` for all dimensions until empirical tuning.
 - `static_composite` is used for task ranking and live-benchmark filtering (e.g., reject tasks whose composite falls outside a tier's target range).
 - It is *not* used directly in runtime scoring; runtime uses individual dimensions plus a derived "difficulty weight" (Section 5).
 
-### 4.3 Validation reports (also in `scored.json`)
+### 4.3 Validation reports (also in `scored_static.json`)
 
-Beyond the dimension vector, `scored.json` carries the validator's structural reports:
+Beyond the dimension vector, `scored_static.json` carries the validator's structural reports:
 
 - `is_beatable` (bool) and `message` (str) — gate for downstream stages.
 - `mechanism_necessity_violations` (list of strings) — mechanisms whose removal still leaves the task solvable; flags accidental decoration.
@@ -260,6 +260,7 @@ Beyond the dimension vector, `scored.json` carries the validator's structural re
 - `chain_ordering_valid` (bool) — each dependency step actually gates the next.
 
 These do not enter the composite but are surfaced in reports for task-quality auditing.
+Schema-invalid tasks are rejected before canonical planners execute and do not emit score artifacts.
 
 ### 4.4 Calibration notes
 
@@ -272,16 +273,16 @@ These do not enter the composite but are surfaced in reports for task-quality au
 
 ## 5. Runtime scoring
 
-Runtime scoring runs at pipeline stage 4 (Score-runtime), once per `run.json`. It produces `run_score.json`. It consumes the run trajectory plus the static scoring artifacts (`scored.json`, `canonical_paths.json`).
+Runtime scoring runs at pipeline stage 4 (Score-runtime), once per `run.json`. It produces `run_score.json`. It consumes the run trajectory plus the static scoring artifacts (`scored_static.json`, `canonical_paths.json`).
 
 ### 5.1 Per-run signal vector
 
 Recorded for every `(task, backend, adapter, model_id, seed)`:
 
 - `success` (bool) — goal reached within `max_steps`, no terminal hazard.
-- `steps` (int) — agent's actual step count.
+- `steps` (int) — agent's actual step count. Required; runtime scoring rejects missing telemetry.
 - `terminated_reason` (str) — one of `{goal_reached, hazard, max_steps, deadlock, invalid_action_excess}`.
-- `token_count` (int) — total prompt + response tokens summed over all model turns.
+- `token_count` (positive int) — total prompt + response tokens summed over all model turns. Required; runtime scoring rejects missing or non-positive telemetry.
 - `distractor_interactions` (int) — count of distractor-element interactions (any `pickup` / `toggle` / `push` on an element registered as a distractor).
 - `irreversible_failures` (int) — count of irreversible actions that broke solvability, detected by re-running the validator from the post-action state.
 
@@ -298,11 +299,11 @@ composite = success_factor × efficiency_factor × difficulty_weight − greedy_
 ```
 
 - `success_factor = 1.0 if success else 0.0` — hard gate; failed runs score 0 regardless of efficiency.
-- `efficiency_factor = α × step_ratio + β × cell_overlap_bfs + γ × token_efficiency` — weighted blend; default `α = β = γ = 1/3`. `token_efficiency = min(1, baseline_tokens / max(model_tokens, 1))` where `baseline_tokens` lives in `calibration.yaml`.
-- `difficulty_weight = normalize(static_composite)` — harder tasks contribute more. Default normalization: `f(x) = x / max_observed_static_composite_in_suite`.
+- `efficiency_factor = α × step_ratio + β × cell_overlap_bfs + γ × token_efficiency` — weighted blend; default `α = β = γ = 1/3`. `token_efficiency = min(1, baseline_tokens / model_tokens)` where `baseline_tokens` lives in scorer config. Missing or non-positive token telemetry is an artifact error, not a neutral score.
+- `difficulty_weight = normalize(static_composite)` — harder tasks contribute more. Default normalization: `f(x) = x / max_observed_static_composite_in_suite`. Runtime scoring requires that suite maximum to be supplied, either in scorer config or as an explicit runtime argument (`--difficulty-max-static-score`).
 - `greedy_penalty = δ × greedy_solvability × success_factor` — applied only to successful runs; `δ` is a calibration coefficient with default 0.5.
 
-All Greek-letter coefficients (`α, β, γ, δ`) and the normalization function live in `calibration.yaml`. The design commits to the *shape*, not the values.
+All Greek-letter coefficients (`α, β, γ, δ`) live in scorer config, as does the normalization value unless it is supplied as an explicit runtime argument. The design commits to the *shape*, not the values.
 
 ### 5.4 Single-point benchmark score (ARC-AGI style)
 
@@ -340,7 +341,8 @@ Defaults to a uniform mean. Calibration may switch to a tier-weighted or difficu
 ### 5.6 Calibration notes
 
 - All composite coefficients ship as `1.0` or sensible defaults; the design does not claim correctness.
-- `calibration.yaml` is versioned in git; changes bump `calibration_version` and trigger stage-4 / stage-5 invalidation.
+- `scorer/scorer_config.json` is versioned in git; changes bump `calibration_version` and trigger stage-4 / stage-5 invalidation.
+- The shipped config intentionally leaves `difficulty_max_static_score` unset. Runtime scoring requires a calibrated suite maximum through config or `--difficulty-max-static-score`.
 - After a calibration update, the pipeline regenerates `run_score.json` and `reports/` from cached `run.json`. Run records do **not** re-execute model calls. This is a deliberate consequence of the DAG split.
 
 ---
@@ -533,16 +535,16 @@ Status legend:
 
 **2. Validator** — folded into Stage 2
 - ✅ `gridworld/task_validator.py::TaskValidator` does exhaustive BFS over the full mechanism state space, plus `compute_fragility`, `validate_mechanism_necessity`, `validate_chain_ordering`, `validate_distractor_safety`.
-- Delta: surface validation reports into `scored.json` instead of emitting a separate `validity.json`.
+- Delta: surface validation reports into `scored_static.json` instead of emitting a separate `validity.json`.
 
 **3. Solver suite (canonical agents)** — Stage 2
-- ⚠️ BFS exists inside `TaskValidator._find_solution`. Greedy does not yet exist as a separate canonical agent.
-- 🚧 Multi-tier solver suite pending; Greedy is the next addition, then heuristic, then random.
-- Delta: extract BFS path emission as one canonical agent, add Greedy as a peer, write combined output to `canonical_paths.json`.
+- ✅ `gridworld/baselines.py` exposes BFS and Greedy planners; `scorer/solvers.py` writes their combined output to `canonical_paths.json`.
+- 🚧 Heuristic and random canonical-agent peers remain optional future additions.
+- Delta: add calibration runs before extending the canonical-agent feature vector.
 
 **4. Static scorer** — Stage 2
-- ⚠️ `gridworld/scoring.py::compute_12d_score` exists with 12 dimensions matching dimensions 1–12 of §4 (modulo formula calibration).
-- Delta: add dimension 13 (`greedy_solvability`), restructure output to `scored.json` sidecar, move composite weights to `calibration.yaml`, include validation reports.
+- ✅ `scorer/scoring.py::compute_12d_score` exposes the public interface for the 12 calibrated dimensions and writes `scored_static.json` with validation reports plus `canonical_agent_features.greedy_solvability`.
+- Delta: empirically calibrate the shipped placeholder weights.
 
 **5. `MiniGridBackend`** — backend axis
 - ✅ `gridworld/backends/minigrid_backend.py` implements `AbstractGridBackend` for square grids with discrete actions + RGB rendering.
@@ -566,8 +568,8 @@ Status legend:
 - Delta: emit canonical `run.json`; remove inline scoring (move to Stage 4); add per-step trajectory recording.
 
 **10. Runtime scorer** — Stage 4
-- 🚧 Does not exist as a component. Some scoring logic lives inside `evaluation_harness.py`.
-- Delta: new module that consumes `run.json` + `scored.json` + `canonical_paths.json` and produces `run_score.json`.
+- ✅ `scorer/runtime.py` consumes `run.json` + `scored_static.json` + `canonical_paths.json` and produces `run_score.json`.
+- Delta: populate optional interaction diagnostics in runtime producers and calibrate the suite-level difficulty maximum.
 
 **11. Aggregator / reporter** — Stage 5
 - ⚠️ Partial. `evaluation_harness.py` produces some summary dicts; nothing matches the per-run-set artifact layout.
@@ -597,7 +599,7 @@ Items the design intentionally defers. None block initial implementation.
 - DAG runner technology — Snakemake leading candidate; final pick deferred to implementation.
 - Token-efficiency baseline (`baseline_tokens`) — per-task vs global constant; needs a sensible default once a few model runs exist.
 
-### 9.2 Calibration coefficients (live in `calibration.yaml`, default to placeholders)
+### 9.2 Calibration coefficients (live in scorer config, default to placeholders)
 - Runtime composite blend weights `α, β, γ` (step ratio / cell overlap / token efficiency).
 - Greedy penalty coefficient `δ`.
 - `difficulty_weight` normalization function (currently `x / max_observed`; may switch to a percentile or log normalization).
@@ -645,7 +647,7 @@ Mapping to the canonical pipeline:
 | JSON generator | Stage 1 (Generate) | §2.1 |
 | Task spec / Validator | folded into Stage 2 (Solve & Score-static) | §2.1 |
 | BFS-greedy agents | Multi-tier canonical agent suite (Stage 2) | §2.1, §4 |
-| Score calculation (static) | Static scoring (13 dimensions) (Stage 2) | §4 |
+| Score calculation (static) | Static scoring (12 dimensions plus canonical-agent features) (Stage 2) | §4 |
 | Backend Generator | Backend axis: `MiniGridBackend` / `MultiGridBackend` / `TextBackend` | §6 |
 | Inference scripts | Adapter axis: `ModelInterface` implementations | §6 |
 | Scoring code (final score, comparison) | Runtime scoring (Stage 4) + Aggregate (Stage 5) | §5, §7 |
diff --git a/evaluation_harness.py b/evaluation_harness.py
index 57fa3ee..55ea840 100644
--- a/evaluation_harness.py
+++ b/evaluation_harness.py
@@ -22,7 +22,8 @@
     from .gridworld.task_spec import TaskSpecification
     from .gridworld.actions import ACTION_NAMES, ACTION_DESCRIPTIONS
     from .gridworld.task_validator import compute_difficulty
-    from .gridworld.scoring import compute_12d_score
+    from .scorer.io import json_default as _json_default
+    from .scorer.scoring import compute_12d_score
 except ImportError:
     from model_interface import ModelInterface, ModelInput, ModelOutput
     from gridworld.runner.grid_runner import GridRunner, EpisodeResult
@@ -31,14 +32,8 @@
     from gridworld.task_spec import TaskSpecification
     from gridworld.actions import ACTION_NAMES, ACTION_DESCRIPTIONS
     from gridworld.task_validator import compute_difficulty
-    from gridworld.scoring import compute_12d_score
-
-
-def _json_default(value):
-    """Convert NumPy scalars to native Python types for JSON serialization."""
-    if isinstance(value, np.generic):
-        return value.item()
-    raise TypeError(f"Object of type {value.__class__.__name__} is not JSON serializable")
+    from scorer.io import json_default as _json_default
+    from scorer.scoring import compute_12d_score
 
 
 @dataclass
diff --git a/gridworld/__init__.py b/gridworld/__init__.py
index 27425ed..fd567a0 100644
--- a/gridworld/__init__.py
+++ b/gridworld/__init__.py
@@ -1,7 +1,7 @@
 """Gridworld domain for MultiNet-v2.0.
 
-This module provides task schema, validation, and scoring utilities for
-gridworld puzzle specifications.
+This module provides task schema and validation utilities for gridworld
+puzzle specifications.
 """
 
 from .bootstrap import disable_gymnasium_env_plugins
@@ -32,9 +32,6 @@
     TaskValidator,
     compute_difficulty,
 )
-from .scoring import ScoredDifficulty, compute_12d_score
-
-
 __all__ = [
     # Task specification
     "Position",
@@ -58,6 +55,4 @@
     "DifficultyReport",
     "FragilityReport",
     "compute_difficulty",
-    "ScoredDifficulty",
-    "compute_12d_score",
 ]
diff --git a/gridworld/baselines.py b/gridworld/baselines.py
index ee5c8b0..d81efcd 100644
--- a/gridworld/baselines.py
+++ b/gridworld/baselines.py
@@ -49,6 +49,17 @@ class Transition:
     next_state: PlannerState
 
 
+@dataclass(frozen=True)
+class PlannedPath:
+    """Planner output with replayed positions for scorer/reporting artifacts."""
+
+    success: bool
+    actions: list[int]
+    action_labels: list[str]
+    positions: list[tuple[int, int]]
+    states_explored: int = 0
+
+
 class TaskPlanningContext:
     """Fast lookup tables derived from a ``TaskSpecification``."""
 
@@ -239,25 +250,6 @@ def _successors(ctx: TaskPlanningContext, state: PlannerState) -> Iterable[Trans
             ),
         )
 
-    door = ctx.doors_by_pos.get(front)
-    if door and door["id"] not in state.open_doors and state.carrying_key is not None:
-        held_color = ctx.keys_by_id[state.carrying_key]["color"]
-        if held_color == door["color"]:
-            yield Transition(
-                action=int(MiniGridActions.TOGGLE),
-                label=f"open_door:{door['id']}",
-                next_state=PlannerState(
-                    agent_pos=state.agent_pos,
-                    agent_dir=state.agent_dir,
-                    carrying_key=None if ctx.key_consumption else state.carrying_key,
-                    collected_keys=state.collected_keys,
-                    active_switches=state.active_switches,
-                    used_switches=state.used_switches,
-                    open_gates=state.open_gates,
-                    open_doors=state.open_doors | {door["id"]},
-                ),
-            )
-
     switch = ctx.switches_by_pos.get(state.agent_pos)
     if switch and switch["switch_type"] != "hold":
         toggled = _apply_switch(ctx, state, switch)
@@ -268,6 +260,27 @@ def _successors(ctx: TaskPlanningContext, state: PlannerState) -> Iterable[Trans
                 next_state=toggled,
             )
 
+    # Runtime consumes TOGGLE on the current-cell switch before checking front doors.
+    if switch is None:
+        door = ctx.doors_by_pos.get(front)
+        if door and door["id"] not in state.open_doors and state.carrying_key is not None:
+            held_color = ctx.keys_by_id[state.carrying_key]["color"]
+            if held_color == door["color"]:
+                yield Transition(
+                    action=int(MiniGridActions.TOGGLE),
+                    label=f"open_door:{door['id']}",
+                    next_state=PlannerState(
+                        agent_pos=state.agent_pos,
+                        agent_dir=state.agent_dir,
+                        carrying_key=None if ctx.key_consumption else state.carrying_key,
+                        collected_keys=state.collected_keys,
+                        active_switches=state.active_switches,
+                        used_switches=state.used_switches,
+                        open_gates=state.open_gates,
+                        open_doors=state.open_doors | {door["id"]},
+                    ),
+                )
+
     yield from _forward_successor(ctx, state, front)
 
 
@@ -353,10 +366,10 @@ def _shortest_plan(
     ctx: TaskPlanningContext,
     start: PlannerState,
     is_goal: Callable[[PlannerState], bool],
-) -> tuple[list[int], PlannerState | None]:
+) -> tuple[list[int], PlannerState | None, int]:
     """Run BFS over executable actions and return the first shortest plan."""
     if is_goal(start):
-        return [], start
+        return [], start, 1
 
     queue = deque([start])
     parent: dict[PlannerState, tuple[PlannerState, int]] = {}
@@ -370,10 +383,14 @@ def _shortest_plan(
             visited.add(transition.next_state)
             parent[transition.next_state] = (state, transition.action)
             if is_goal(transition.next_state):
-                return _reconstruct_actions(parent, transition.next_state), transition.next_state
+                return (
+                    _reconstruct_actions(parent, transition.next_state),
+                    transition.next_state,
+                    len(visited),
+                )
             queue.append(transition.next_state)
 
-    return [], None
+    return [], None, len(visited)
 
 
 def _shortest_plan_to_interaction(
@@ -437,9 +454,18 @@ def _reconstruct_actions(
 
 
 def _bfs_actions(spec: TaskSpecification) -> list[int]:
+    actions, _ = _bfs_actions_with_stats(spec)
+    return actions
+
+
+def _bfs_actions_with_stats(spec: TaskSpecification) -> tuple[list[int], int]:
     ctx = TaskPlanningContext(spec)
-    actions, _ = _shortest_plan(ctx, ctx.initial_state(), lambda st: st.agent_pos == ctx.goal)
-    return actions or [int(MiniGridActions.DONE)]
+    actions, _, states_explored = _shortest_plan(
+        ctx,
+        ctx.initial_state(),
+        lambda st: st.agent_pos == ctx.goal,
+    )
+    return actions, states_explored
 
 
 def _greedy_actions(spec: TaskSpecification) -> list[int]:
@@ -452,13 +478,81 @@ def _greedy_actions(spec: TaskSpecification) -> list[int]:
             break
         chunk, next_state = _shortest_plan_to_interaction(ctx, state)
         if next_state is None:
-            chunk, next_state = _shortest_plan(ctx, state, lambda st: st.agent_pos == ctx.goal)
+            chunk, next_state, _ = _shortest_plan(
+                ctx,
+                state,
+                lambda st: st.agent_pos == ctx.goal,
+            )
         if next_state is None or not chunk:
             break
         actions.extend(chunk)
         state = next_state
 
-    return actions or [int(MiniGridActions.DONE)]
+    return actions
+
+
+def trace_planned_actions(spec: TaskSpecification, actions: list[int]) -> PlannedPath:
+    """Replay planner actions through the planner graph without running a backend."""
+    ctx = TaskPlanningContext(spec)
+    state = ctx.initial_state()
+    positions = [state.agent_pos]
+    executed_actions: list[int] = []
+    labels: list[str] = []
+
+    for action in actions:
+        if action == int(MiniGridActions.DONE):
+            break
+        executed_actions.append(action)
+        transition = next(
+            (candidate for candidate in _successors(ctx, state) if candidate.action == action),
+            None,
+        )
+        if transition is None:
+            labels.append(f"invalid:{action}")
+            return PlannedPath(
+                success=False,
+                actions=executed_actions,
+                action_labels=labels,
+                positions=positions,
+            )
+        labels.append(transition.label)
+        state = transition.next_state
+        positions.append(state.agent_pos)
+
+    return PlannedPath(
+        success=state.agent_pos == ctx.goal,
+        actions=executed_actions,
+        action_labels=labels,
+        positions=positions,
+    )
+
+
+def plan_bfs_actions(spec: TaskSpecification) -> list[int]:
+    """Return the deterministic BFS baseline action plan."""
+    return _bfs_actions(spec)
+
+
+def plan_greedy_actions(spec: TaskSpecification) -> list[int]:
+    """Return the deterministic greedy baseline action plan."""
+    return _greedy_actions(spec)
+
+
+def plan_bfs_path(spec: TaskSpecification) -> PlannedPath:
+    """Return the BFS baseline plan plus replayed positions."""
+    actions, states_explored = _bfs_actions_with_stats(spec)
+    path = trace_planned_actions(spec, actions)
+    return PlannedPath(
+        success=path.success,
+        actions=path.actions,
+        action_labels=path.action_labels,
+        positions=path.positions,
+        states_explored=states_explored,
+    )
+
+
+def plan_greedy_path(spec: TaskSpecification) -> PlannedPath:
+    """Return the greedy baseline plan plus replayed positions."""
+    return trace_planned_actions(spec, plan_greedy_actions(spec))
 
 
 class PlannedBaselineModel(ModelInterface):
diff --git a/gridworld/fixtures/manifest.json b/gridworld/fixtures/manifest.json
new file mode 100644
index 0000000..92cbfee
--- /dev/null
+++ b/gridworld/fixtures/manifest.json
@@ -0,0 +1,365 @@
+{
+  "description": "Fixture manifest for the bare-bones run pipeline (tests 1-3). route_short_cells/route_long_cells on test2 rows are populated by scripts/validate_fixtures.py.",
+  "tasks": [
+    {
+      "task_id": "validation_10_v01_empty_room",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "empty_room",
+      "source": "mazes/validation_10/V01_empty_room.json",
+      "expected_mechanisms": [],
+      "notes": "Baseline empty room."
+    },
+    {
+      "task_id": "validation_10_v02_winding_corridor",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "winding_corridor",
+      "source": "mazes/validation_10/V02_winding_corridor.json",
+      "expected_mechanisms": [],
+      "notes": "Navigation only."
+    },
+    {
+      "task_id": "validation_10_v03_multi_path",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "multi_path",
+      "source": "mazes/validation_10/V03_multi_path.json",
+      "expected_mechanisms": [],
+      "notes": "Also used for test2."
+    },
+    {
+      "task_id": "validation_10_v04_single_key",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "single_key",
+      "source": "mazes/validation_10/V04_single_key.json",
+      "expected_mechanisms": [
+        "kR"
+      ],
+      "notes": "Single key-door."
+    },
+    {
+      "task_id": "validation_10_v05_single_switch",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "single_switch",
+      "source": "mazes/validation_10/V05_single_switch.json",
+      "expected_mechanisms": [
+        "s1"
+      ],
+      "notes": "Single switch-gate."
+    },
+    {
+      "task_id": "validation_10_v06_chain_ks",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "chain_ks",
+      "source": "mazes/validation_10/V06_chain_ks.json",
+      "expected_mechanisms": [
+        "kR",
+        "s1"
+      ],
+      "notes": "Also used for test3 (key-first)."
+    },
+    {
+      "task_id": "validation_10_v07_chain_sk",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "chain_sk",
+      "source": "mazes/validation_10/V07_chain_sk.json",
+      "expected_mechanisms": [
+        "s1",
+        "kR"
+      ],
+      "notes": "Also used for test3 (switch-first)."
+    },
+    {
+      "task_id": "validation_10_v08_chain_kk",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "chain_kk",
+      "source": "mazes/validation_10/V08_chain_kk.json",
+      "expected_mechanisms": [],
+      "notes": "Two-key chain."
+    },
+    {
+      "task_id": "validation_10_v09_distractor_simple",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "distractor_simple",
+      "source": "mazes/validation_10/V09_distractor_simple.json",
+      "expected_mechanisms": [],
+      "notes": "Distractor present."
+    },
+    {
+      "task_id": "validation_10_v10_distractor_chain",
+      "experiment": "test1",
+      "condition": "default",
+      "variant": "distractor_chain",
+      "source": "mazes/validation_10/V10_distractor_chain.json",
+      "expected_mechanisms": [],
+      "notes": "Distractor chain."
+    },
+    {
+      "task_id": "validation_10_v03_multi_path__t2",
+      "experiment": "test2",
+      "condition": "multi_path",
+      "variant": "open_routes",
+      "source": "mazes/validation_10/V03_multi_path.json",
+      "expected_mechanisms": [],
+      "route_block": [
+        5,
+        6
+      ],
+      "notes": "Three open routes; route_block forces a longer route to discriminate path_choice.",
+      "route_short_cells": [
+        [
+          2,
+          6
+        ],
+        [
+          3,
+          6
+        ],
+        [
+          4,
+          6
+        ],
+        [
+          5,
+          5
+        ],
+        [
+          5,
+          6
+        ],
+        [
+          6,
+          5
+        ],
+        [
+          7,
+          5
+        ],
+        [
+          8,
+          5
+        ],
+        [
+          9,
+          5
+        ],
+        [
+          10,
+          5
+        ]
+      ],
+      "route_long_cells": [
+        [
+          1,
+          7
+        ],
+        [
+          1,
+          8
+        ],
+        [
+          2,
+          8
+        ],
+        [
+          3,
+          8
+        ],
+        [
+          4,
+          8
+        ],
+        [
+          4,
+          9
+        ],
+        [
+          4,
+          10
+        ],
+        [
+          5,
+          10
+        ],
+        [
+          6,
+          10
+        ],
+        [
+          7,
+          10
+        ],
+        [
+          8,
+          6
+        ],
+        [
+          8,
+          7
+        ],
+        [
+          8,
+          8
+        ],
+        [
+          8,
+          9
+        ],
+        [
+          8,
+          10
+        ],
+        [
+          9,
+          6
+        ]
+      ]
+    },
+    {
+      "task_id": "T2_corridor_shortcut",
+      "experiment": "test2",
+      "condition": "shortcut",
+      "variant": "door_shortcut",
+      "source": "gridworld/fixtures/test2/T2_corridor_shortcut.json",
+      "expected_mechanisms": [
+        "kB"
+      ],
+      "route_block": [
+        4,
+        1
+      ],
+      "notes": "Short mechanistic (door) route vs long open detour; route_block is the door cell.",
+      "route_short_cells": [
+        [
+          1,
+          2
+        ],
+        [
+          3,
+          1
+        ],
+        [
+          4,
+          1
+        ],
+        [
+          5,
+          1
+        ],
+        [
+          6,
+          1
+        ]
+      ],
+      "route_long_cells": [
+        [
+          2,
+          2
+        ],
+        [
+          2,
+          3
+        ],
+        [
+          2,
+          4
+        ],
+        [
+          2,
+          5
+        ],
+        [
+          3,
+          5
+        ],
+        [
+          4,
+          5
+        ],
+        [
+          5,
+          5
+        ],
+        [
+          6,
+          5
+        ],
+        [
+          7,
+          2
+        ],
+        [
+          7,
+          3
+        ],
+        [
+          7,
+          4
+        ],
+        [
+          7,
+          5
+        ]
+      ]
+    },
+    {
+      "task_id": "T3_corr_key_first",
+      "experiment": "test3",
+      "condition": "key_first",
+      "variant": "ks",
+      "pair_id": "corridor",
+      "source": "gridworld/fixtures/test3/T3_corr_key_first.json",
+      "expected_mechanisms": [
+        "kB",
+        "s1"
+      ],
+      "notes": "Single-row corridor, key-first; matched with switch-first."
+    },
+    {
+      "task_id": "T3_corr_switch_first",
+      "experiment": "test3",
+      "condition": "switch_first",
+      "variant": "sk",
+      "pair_id": "corridor",
+      "source": "gridworld/fixtures/test3/T3_corr_switch_first.json",
+      "expected_mechanisms": [
+        "s1",
+        "kB"
+      ],
+      "notes": "Single-row corridor, switch-first; matched with key-first."
+    },
+    {
+      "task_id": "T3_corr2_key_first",
+      "experiment": "test3",
+      "condition": "key_first",
+      "variant": "ks",
+      "pair_id": "corridor2",
+      "source": "gridworld/fixtures/test3/T3_corr2_key_first.json",
+      "expected_mechanisms": [
+        "kB",
+        "s1"
+      ],
+      "notes": "Longer single-row corridor, key-first; matched with switch-first."
+    },
+    {
+      "task_id": "T3_corr2_switch_first",
+      "experiment": "test3",
+      "condition": "switch_first",
+      "variant": "sk",
+      "pair_id": "corridor2",
+      "source": "gridworld/fixtures/test3/T3_corr2_switch_first.json",
+      "expected_mechanisms": [
+        "s1",
+        "kB"
+      ],
+      "notes": "Longer single-row corridor, switch-first; matched with key-first."
+    }
+  ]
+}
diff --git a/gridworld/fixtures/run_config.example.json b/gridworld/fixtures/run_config.example.json
new file mode 100644
index 0000000..2dfa050
--- /dev/null
+++ b/gridworld/fixtures/run_config.example.json
@@ -0,0 +1,27 @@
+{
+  "description": "Example run-config: maps each model to the task files (or experiment keywords / task_ids) it should run. Per-task scoring metadata is looked up from the manifest catalog by path. Run with: multinet-run-pipeline --run-config gridworld/fixtures/run_config.example.json",
+  "models": {
+    "sonnet": {
+      "provider": "claude",
+      "model": "claude-sonnet-4-6",
+      "temperature": 0.0,
+      "max_tokens": 1024,
+      "tasks": [
+        "mazes/validation_10/V01_empty_room.json",
+        "mazes/validation_10/V04_single_key.json",
+        "gridworld/fixtures/test2/T2_corridor_shortcut.json",
+        "gridworld/fixtures/test3/T3_corr_key_first.json",
+        "gridworld/fixtures/test3/T3_corr_switch_first.json"
+      ]
+    },
+    "qwen35vl": {
+      "provider": "qwen",
+      "model": "Qwen/Qwen3.5-4B",
+      "temperature": 0.0,
+      "max_tokens": 1024,
+      "tasks": [
+        "test3"
+      ]
+    }
+  }
+}
diff --git a/gridworld/fixtures/test2/T2_corridor_shortcut.json b/gridworld/fixtures/test2/T2_corridor_shortcut.json
new file mode 100644
index 0000000..fbea884
--- /dev/null
+++ b/gridworld/fixtures/test2/T2_corridor_shortcut.json
@@ -0,0 +1,41 @@
+{
+  "task_id": "T2_corridor_shortcut",
+  "version": "1.0",
+  "seed": 201,
+  "difficulty_tier": 2,
+  "description": "A short top route crosses a locked door (key near start); a long open route detours through the bottom passage. Both routes reach the goal.",
+  "maze": {
+    "dimensions": [9, 7],
+    "walls": [
+      [4, 2], [4, 3], [4, 4]
+    ],
+    "start": [1, 1],
+    "goal": [7, 1]
+  },
+  "mechanisms": {
+    "keys": [
+      {"id": "kB", "position": [1, 3], "color": "blue"}
+    ],
+    "doors": [
+      {"id": "DR", "position": [4, 1], "requires_key": "blue", "initial_state": "locked"}
+    ],
+    "switches": [],
+    "gates": [],
+    "blocks": [],
+    "teleporters": [],
+    "hazards": []
+  },
+  "rules": {
+    "key_consumption": true,
+    "switch_type": "toggle",
+    "hidden_mechanisms": [],
+    "observability": "full",
+    "view_size": 7
+  },
+  "goal": {
+    "type": "reach_position",
+    "target": [7, 1],
+    "auxiliary_conditions": []
+  },
+  "max_steps": 90
+}
diff --git a/gridworld/fixtures/test3/T3_corr2_key_first.json b/gridworld/fixtures/test3/T3_corr2_key_first.json
new file mode 100644
index 0000000..b8aadc3
--- /dev/null
+++ b/gridworld/fixtures/test3/T3_corr2_key_first.json
@@ -0,0 +1,51 @@
+{
+  "task_id": "T3_corr2_key_first",
+  "version": "1.0",
+  "seed": 303,
+  "difficulty_tier": 3,
+  "description": "Longer single-row corridor. Required order: collect the blue key, open the door, then toggle the switch to open the gate to the goal.",
+  "maze": {
+    "dimensions": [13, 3],
+    "walls": [],
+    "start": [1, 1],
+    "goal": [11, 1]
+  },
+  "mechanisms": {
+    "keys": [
+      {"id": "kB", "position": [2, 1], "color": "blue"}
+    ],
+    "doors": [
+      {"id": "DR", "position": [4, 1], "requires_key": "blue", "initial_state": "locked"}
+    ],
+    "switches": [
+      {"id": "s1", "position": [6, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"}
+    ],
+    "gates": [
+      {"id": "g1", "position": [8, 1], "initial_state": "closed"}
+    ],
+    "blocks": [],
+    "teleporters": [],
+    "hazards": []
+  },
+  "rules": {
+    "key_consumption": true,
+    "switch_type": "toggle",
+    "hidden_mechanisms": [],
+    "observability": "full",
+    "view_size": 7
+  },
+  "goal": {
+    "type": "reach_position",
+    "target": [11, 1],
+    "auxiliary_conditions": []
+  },
+  "dependency_chain": {
+    "depth": 2,
+    "sequence": [
+      {"step": 1, "type": "key-door", "element": "kB", "unlocks": "DR"},
+      {"step": 2, "type": "switch-gate", "element": "s1", "unlocks": "g1"}
+    ],
+    "notation": "kB -> DR -> s1 -> g1 -> G"
+  },
+  "max_steps": 90
+}
diff --git a/gridworld/fixtures/test3/T3_corr2_switch_first.json b/gridworld/fixtures/test3/T3_corr2_switch_first.json
new file mode 100644
index 0000000..0c517f3
--- /dev/null
+++ b/gridworld/fixtures/test3/T3_corr2_switch_first.json
@@ -0,0 +1,51 @@
+{
+  "task_id": "T3_corr2_switch_first",
+  "version": "1.0",
+  "seed": 304,
+  "difficulty_tier": 3,
+  "description": "Longer single-row corridor with identical topology to the key-first variant. Required order: toggle the switch to open the gate, then collect the blue key and open the door to the goal.",
+  "maze": {
+    "dimensions": [13, 3],
+    "walls": [],
+    "start": [1, 1],
+    "goal": [11, 1]
+  },
+  "mechanisms": {
+    "keys": [
+      {"id": "kB", "position": [6, 1], "color": "blue"}
+    ],
+    "doors": [
+      {"id": "DR", "position": [8, 1], "requires_key": "blue", "initial_state": "locked"}
+    ],
+    "switches": [
+      {"id": "s1", "position": [2, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"}
+    ],
+    "gates": [
+      {"id": "g1", "position": [4, 1], "initial_state": "closed"}
+    ],
+    "blocks": [],
+    "teleporters": [],
+    "hazards": []
+  },
+  "rules": {
+    "key_consumption": true,
+    "switch_type": "toggle",
+    "hidden_mechanisms": [],
+    "observability": "full",
+    "view_size": 7
+  },
+  "goal": {
+    "type": "reach_position",
+    "target": [11, 1],
+    "auxiliary_conditions": []
+  },
+  "dependency_chain": {
+    "depth": 2,
+    "sequence": [
+      {"step": 1, "type": "switch-gate", "element": "s1", "unlocks": "g1"},
+      {"step": 2, "type": "key-door", "element": "kB", "unlocks": "DR"}
+    ],
+    "notation": "s1 -> g1 -> kB -> DR -> G"
+  },
+  "max_steps": 90
+}
diff --git a/gridworld/fixtures/test3/T3_corr_key_first.json b/gridworld/fixtures/test3/T3_corr_key_first.json
new file mode 100644
index 0000000..6e5d66b
--- /dev/null
+++ b/gridworld/fixtures/test3/T3_corr_key_first.json
@@ -0,0 +1,51 @@
+{
+  "task_id": "T3_corr_key_first",
+  "version": "1.0",
+  "seed": 301,
+  "difficulty_tier": 3,
+  "description": "Single-row corridor. Required order: collect the blue key, open the door, then toggle the switch to open the gate to the goal.",
+  "maze": {
+    "dimensions": [11, 3],
+    "walls": [],
+    "start": [1, 1],
+    "goal": [9, 1]
+  },
+  "mechanisms": {
+    "keys": [
+      {"id": "kB", "position": [2, 1], "color": "blue"}
+    ],
+    "doors": [
+      {"id": "DR", "position": [4, 1], "requires_key": "blue", "initial_state": "locked"}
+    ],
+    "switches": [
+      {"id": "s1", "position": [6, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"}
+    ],
+    "gates": [
+      {"id": "g1", "position": [8, 1], "initial_state": "closed"}
+    ],
+    "blocks": [],
+    "teleporters": [],
+    "hazards": []
+  },
+  "rules": {
+    "key_consumption": true,
+    "switch_type": "toggle",
+    "hidden_mechanisms": [],
+    "observability": "full",
+    "view_size": 7
+  },
+  "goal": {
+    "type": "reach_position",
+    "target": [9, 1],
+    "auxiliary_conditions": []
+  },
+  "dependency_chain": {
+    "depth": 2,
+    "sequence": [
+      {"step": 1, "type": "key-door", "element": "kB", "unlocks": "DR"},
+      {"step": 2, "type": "switch-gate", "element": "s1", "unlocks": "g1"}
+    ],
+    "notation": "kB -> DR -> s1 -> g1 -> G"
+  },
+  "max_steps": 80
+}
diff --git a/gridworld/fixtures/test3/T3_corr_switch_first.json b/gridworld/fixtures/test3/T3_corr_switch_first.json
new file mode 100644
index 0000000..4a214fa
--- /dev/null
+++ b/gridworld/fixtures/test3/T3_corr_switch_first.json
@@ -0,0 +1,51 @@
+{
+  "task_id": "T3_corr_switch_first",
+  "version": "1.0",
+  "seed": 302,
+  "difficulty_tier": 3,
+  "description": "Single-row corridor with identical topology to the key-first variant. Required order: toggle the switch to open the gate, then collect the blue key and open the door to the goal.",
+  "maze": {
+    "dimensions": [11, 3],
+    "walls": [],
+    "start": [1, 1],
+    "goal": [9, 1]
+  },
+  "mechanisms": {
+    "keys": [
+      {"id": "kB", "position": [6, 1], "color": "blue"}
+    ],
+    "doors": [
+      {"id": "DR", "position": [8, 1], "requires_key": "blue", "initial_state": "locked"}
+    ],
+    "switches": [
+      {"id": "s1", "position": [2, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"}
+    ],
+    "gates": [
+      {"id": "g1", "position": [4, 1], "initial_state": "closed"}
+    ],
+    "blocks": [],
+    "teleporters": [],
+    "hazards": []
+  },
+  "rules": {
+    "key_consumption": true,
+    "switch_type": "toggle",
+    "hidden_mechanisms": [],
+    "observability": "full",
+    "view_size": 7
+  },
+  "goal": {
+    "type": "reach_position",
+    "target": [9, 1],
+    "auxiliary_conditions": []
+  },
+  "dependency_chain": {
+    "depth": 2,
+    "sequence": [
+      {"step": 1, "type": "switch-gate", "element": "s1", "unlocks": "g1"},
+      {"step": 2, "type": "key-door", "element": "kB", "unlocks": "DR"}
+    ],
+    "notation": "s1 -> g1 -> kB -> DR -> G"
+  },
+  "max_steps": 80
+}
diff --git a/gridworld/scoring.py b/gridworld/scoring.py
deleted file mode 100644
index 9dd3670..0000000
--- a/gridworld/scoring.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""12-dimension scoring for gridworld tasks."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-
-from .task_spec import TaskSpecification
-from .task_validator import DifficultyReport, TaskValidator
-
-
-DIMENSION_NAMES = [
-    "optimal_path_length",
-    "search_space_size",
-    "backtracking_required",
-    "fragility",
-    "dependency_depth",
-    "dependency_variety",
-    "distractor_count",
-    "distractor_quality",
-    "grid_size",
-    "wall_density",
-    "partial_observability",
-    "irreversibility",
-]
-
-
-@dataclass
-class ScoredDifficulty:
-    """Full 12-dimension score report."""
-    dimensions: list[float]
-    dimension_names: list[str] = field(default_factory=lambda: DIMENSION_NAMES.copy())
-    composite: float = 0.0
-    weights: list[float] = field(default_factory=lambda: [1.0] * len(DIMENSION_NAMES))
-
-    def to_dict(self) -> dict:
-        return {
-            "dimensions": self.dimensions,
-            "dimension_names": self.dimension_names,
-            "composite": self.composite,
-            "weights": self.weights,
-        }
-
-
-def _count_backtracking(solution: list[tuple[int, int]] | None) -> float:
-    if not solution:
-        return 0.0
-    seen = set()
-    revisits = 0
-    previous_pos = None
-    for pos in solution:
-        if pos == previous_pos:
-            continue
-        if pos in seen:
-            revisits += 1
-        seen.add(pos)
-        previous_pos = pos
-    return float(revisits)
-
-
-def _dependency_variety(spec: TaskSpecification) -> float:
-    if spec.dependency_chain is not None:
-        return float(len({step.type for step in spec.dependency_chain.sequence}))
-
-    variety = 0
-    if spec.mechanisms.keys and spec.mechanisms.doors:
-        variety += 1
-    if spec.mechanisms.switches and spec.mechanisms.gates:
-        variety += 1
-    if spec.mechanisms.blocks:
-        variety += 1
-    if spec.mechanisms.teleporters:
-        variety += 1
-    if spec.mechanisms.hazards:
-        variety += 1
-    return float(variety)
-
-
-def _distractor_quality(spec: TaskSpecification) -> float:
-    if not spec.distractors:
-        return 0.0
-    weights = {
-        "wrong_color_key": 1.0,
-        "inactive_switch": 2.0,
-        "decoy_door": 2.0,
-        "distractor_chain": 3.0,
-    }
-    return float(sum(weights.get(d.type, 1.0) for d in spec.distractors))
-
-
-def _partial_observability(spec: TaskSpecification) -> float:
-    mapping = {"full": 0.0, "view_cone": 1.0, "fog_of_war": 2.0}
-    return mapping.get(spec.rules.observability, 0.0)
-
-
-def _irreversibility(spec: TaskSpecification) -> float:
-    score = 0.0
-    if spec.rules.key_consumption:
-        score += float(len(spec.mechanisms.doors))
-    score += float(sum(1 for switch in spec.mechanisms.switches if switch.switch_type == "one_shot"))
-    score += float(sum(1 for tp in spec.mechanisms.teleporters if not tp.bidirectional))
-    return score
-
-
-def compute_12d_score(
-    spec: TaskSpecification,
-    solver_output: DifficultyReport | None = None,
-    weights: list[float] | None = None,
-) -> ScoredDifficulty:
-    """
-    Compute the full 12-dimension benchmark score.
-
-    This wraps solver-derived metrics with rubric dimensions such as
-    fragility, dependency variety, distractor quality, partial observability,
-    wall density, and irreversibility. The compact solver report remains in
-    compute_difficulty for callers that only need path/search metrics.
-    """
-    validator = TaskValidator(spec)
-    is_beatable, solution, message = validator.validate()
-    if solver_output is None:
-        from .task_validator import compute_difficulty
-
-        solver_output = compute_difficulty(spec)
-
-    fragility = validator.compute_fragility()
-    fragility_value = 0.0 if fragility.min_steps_to_break == -1 else 1.0 / fragility.min_steps_to_break
-
-    width, height = spec.maze.dimensions
-    grid_size = float(width * height)
-    wall_density = float(len(spec.maze.walls) / grid_size) if grid_size else 0.0
-
-    dimensions = [
-        float(solver_output.optimal_steps),
-        float(solver_output.states_explored),
-        float(solver_output.backtrack_count if hasattr(solver_output, "backtrack_count") else _count_backtracking(solution)),
-        fragility_value,
-        float(spec.dependency_chain.depth if spec.dependency_chain is not None else solver_output.dependency_depth),
-        _dependency_variety(spec),
-        float(len(spec.distractors or [])),
-        _distractor_quality(spec),
-        grid_size,
-        wall_density,
-        _partial_observability(spec),
-        _irreversibility(spec),
-    ]
-
-    weight_vector = weights or [1.0] * len(DIMENSION_NAMES)
-    composite = float(sum(d * w for d, w in zip(dimensions, weight_vector)))
-    return ScoredDifficulty(
-        dimensions=dimensions,
-        composite=composite,
-        weights=weight_vector,
-    )
diff --git a/gridworld/task_validator.py b/gridworld/task_validator.py
index aee948f..4befedf 100644
--- a/gridworld/task_validator.py
+++ b/gridworld/task_validator.py
@@ -493,12 +493,13 @@ def validate_chain_ordering(self) -> bool:
                 return False
         return True
 
-    def validate_distractor_safety(self) -> list[str]:
+    def validate_distractor_safety(self, base_beatable: bool | None = None) -> list[str]:
         """Check whether a single distractor interaction can make the task unsolvable."""
         if not self.spec.distractors:
             return []
 
-        base_beatable, _, _ = self.validate()
+        if base_beatable is None:
+            base_beatable, _, _ = self.validate()
         if not base_beatable:
             return ["Base task is not solvable"]
 
@@ -767,17 +768,23 @@ def to_dict(self) -> dict:
         }
 
 
-def compute_difficulty(spec: TaskSpecification) -> DifficultyReport:
+def compute_difficulty(
+    spec: TaskSpecification,
+    validator: TaskValidator | None = None,
+    validation_result: tuple[bool, Optional[list[tuple[int, int]]], str] | None = None,
+) -> DifficultyReport:
     """
     Compute solver-derived difficulty metrics for a task.
 
     This is a compact report centered on BFS output: beatability, shortest
     action count, states explored, coarse mechanism complexity, and a legacy
-    composite score. Use compute_12d_score when the full rubric vector is
+    composite score. Use scorer.scoring.compute_12d_score when the full rubric vector is
     needed for benchmark comparison.
     """
-    validator = TaskValidator(spec)
-    is_beatable, solution, message = validator.validate()
+    task_validator = validator or TaskValidator(spec)
+    if validation_result is None:
+        validation_result = task_validator.validate()
+    is_beatable, solution, message = validation_result
 
     optimal_steps = len(solution) - 1 if solution else 0  # -1 because path includes start
     # Extract states_explored from message
diff --git a/interface/agents/claude.py b/interface/agents/claude.py
index 9a6fc8e..1a2466b 100644
--- a/interface/agents/claude.py
+++ b/interface/agents/claude.py
@@ -17,6 +17,7 @@
     parse_runner_content,
     split_system_prompt,
 )
+from interface.telemetry import normalize_token_usage
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ def _post_messages(
     system: Optional[str],
     messages: List[Dict[str, object]],
     timeout: Optional[float],
-) -> str:
+) -> Tuple[str, Optional[Dict[str, int]]]:
     body: Dict[str, object] = {
         "model": model,
         "max_tokens": max_tokens,
@@ -136,7 +137,7 @@ def _post_messages(
     for block in payload.get("content", []) or []:
         if isinstance(block, dict) and block.get("type") == "text":
             parts.append(str(block.get("text", "")))
-    return "".join(parts).strip()
+    return "".join(parts).strip(), normalize_token_usage(payload.get("usage"))
 
 
 @dataclass
@@ -153,6 +154,7 @@ class ClaudeAnthropicAgent:
 
     config: ClaudeAnthropicConfig = field(default_factory=ClaudeAnthropicConfig)
     api_key: Optional[str] = None
+    last_usage: Optional[Dict[str, int]] = field(default=None, init=False)
 
     def __post_init__(self) -> None:
         key = (self.api_key or os.environ.get("ANTHROPIC_API_KEY") or "").strip()
@@ -165,7 +167,7 @@ def __post_init__(self) -> None:
 
     def __call__(self, messages: List[dict]) -> str:
         system, turns = _to_anthropic_turns(messages)
-        return _post_messages(
+        text, self.last_usage = _post_messages(
             self.api_key,
             model=self.config.model,
             max_tokens=self.config.max_tokens,
@@ -174,3 +176,4 @@ def __call__(self, messages: List[dict]) -> str:
             messages=turns,
             timeout=self.config.timeout,
         )
+        return text
diff --git a/interface/agents/qwen35_vl.py b/interface/agents/qwen35_vl.py
index 2ad4e90..6963800 100644
--- a/interface/agents/qwen35_vl.py
+++ b/interface/agents/qwen35_vl.py
@@ -69,6 +69,13 @@ class Qwen35VLConfig:
     temperature: float = 0.0
     max_new_tokens: int = 1024
     device_map: str = "auto"
+    local_files_only: bool = True
+    trust_remote_code: bool = False
+    torch_dtype: str | None = "auto"
+    load_in_4bit: bool = False
+    attn_implementation: str | None = None
+    max_memory: dict[str, str] | None = None
+    enable_thinking: bool = False
 
 
 @dataclass
@@ -78,29 +85,132 @@ class Qwen35VLAgent:
     config: Qwen35VLConfig = field(default_factory=Qwen35VLConfig)
     processor: Any = None
     model: Any = None
+    last_usage: dict[str, int] | None = field(default=None, init=False)
 
     def __post_init__(self) -> None:
-        from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
+        from transformers import AutoProcessor
 
         if self.processor is None:
-            self.processor = AutoProcessor.from_pretrained(self.config.model)
+            self.processor = AutoProcessor.from_pretrained(
+                self.config.model,
+                local_files_only=self.config.local_files_only,
+                trust_remote_code=self.config.trust_remote_code,
+            )
         if self.model is None:
-            self.model = Qwen3_5ForConditionalGeneration.from_pretrained(
+            model_cls = self._model_class()
+            self.model = model_cls.from_pretrained(
                 self.config.model,
-                device_map=self.config.device_map,
+                **self._model_kwargs(),
             )
 
+    def reset_usage(self) -> None:
+        """Forget the token usage recorded by the most recent call."""
+        self.last_usage = None
+
+    def _model_class(self):
+        """Return the first candidate model class that transformers provides.
+
+        The Qwen 3.5 specific class is tried first, followed by progressively
+        more generic ``AutoModel*`` classes.
+
+        Raises:
+            ImportError: If none of the candidate classes is available.
+        """
+        import transformers
+
+        for name in (
+            "Qwen3_5ForConditionalGeneration",
+            "AutoModelForImageTextToText",
+            "AutoModelForVision2Seq",
+            "AutoModelForCausalLM",
+        ):
+            model_cls = getattr(transformers, name, None)
+            if model_cls is not None:
+                return model_cls
+        raise ImportError("Transformers does not provide a usable Qwen 3.5 model class.")
+
+    def _torch_dtype(self):
+        """Resolve ``config.torch_dtype`` into a value for ``from_pretrained``.
+
+        ``None`` and ``"auto"`` are returned unchanged, and a ``torch.dtype``
+        instance is accepted as-is. Any other value is treated as the name of
+        a dtype attribute on ``torch`` (for example ``"bfloat16"``).
+
+        Raises:
+            ValueError: If the name does not refer to a ``torch.dtype``.
+        """
+        dtype = self.config.torch_dtype
+        if dtype is None or dtype == "auto":
+            return dtype
+        import torch
+
+        if isinstance(dtype, torch.dtype):
+            return dtype
+        # getattr alone would also hand back modules or functions (e.g. "nn"),
+        # so check that the attribute really is a dtype before using it.
+        resolved = getattr(torch, str(dtype), None)
+        if not isinstance(resolved, torch.dtype):
+            raise ValueError(f"Unsupported torch_dtype: {dtype!r}")
+        return resolved
+
+    def _model_kwargs(self) -> dict[str, Any]:
+        """Build the keyword arguments passed to ``from_pretrained`` for the model.
+
+        Device map, local-files and remote-code settings are always included;
+        dtype, attention implementation, memory limits and 4-bit quantization
+        are only added when the config sets them.
+        """
+        kwargs: dict[str, Any] = {
+            "device_map": self.config.device_map,
+            "local_files_only": self.config.local_files_only,
+            "trust_remote_code": self.config.trust_remote_code,
+        }
+        torch_dtype = self._torch_dtype()
+        if torch_dtype is not None:
+            kwargs["torch_dtype"] = torch_dtype
+        if self.config.attn_implementation:
+            kwargs["attn_implementation"] = self.config.attn_implementation
+        if self.config.max_memory:
+            kwargs["max_memory"] = dict(self.config.max_memory)
+        if self.config.load_in_4bit:
+            import torch
+            from transformers import BitsAndBytesConfig
+
+            kwargs["quantization_config"] = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
+        return kwargs
+
+    def _input_device(self):
+        """Return the device prompt tensors should be moved to, if known.
+
+        Prefers ``model.device``; otherwise falls back to the device of the
+        model's first parameter, or ``None`` when it has no parameters.
+        """
+        device = getattr(self.model, "device", None)
+        if device is not None:
+            return device
+        try:
+            return next(self.model.parameters()).device
+        except StopIteration:
+            return None
+
     def __call__(self, messages: List[dict]) -> str:
         qwen_messages = _to_qwen_messages(messages)
         inputs = self.processor.apply_chat_template(
             qwen_messages,
             tokenize=True,
             add_generation_prompt=True,
+            enable_thinking=self.config.enable_thinking,
             return_dict=True,
             return_tensors="pt",
         )
+        input_device = self._input_device()
         inputs = {
-            key: value.to(self.model.device) if hasattr(value, "to") else value
+            key: value.to(input_device) if input_device is not None and hasattr(value, "to") else value
             for key, value in inputs.items()
         }
 
@@ -121,4 +231,9 @@ def __call__(self, messages: List[dict]) -> str:
             )
 
         new_tokens = generated[0][prompt_len:]
+        self.last_usage = {
+            "input_tokens": int(prompt_len),
+            "output_tokens": int(len(new_tokens)),
+            "total_tokens": int(prompt_len + len(new_tokens)),
+        }
         return self.processor.decode(new_tokens, skip_special_tokens=True).strip()
diff --git a/interface/config.py b/interface/config.py
index d99c2e0..23a6d60 100644
--- a/interface/config.py
+++ b/interface/config.py
@@ -10,7 +10,7 @@ class ExperimentConfig:
 
     prompting: Literal["minimal", "standard", "verbose"] = "standard"
     observation: Literal["text_only", "image_text", "image_only"] = "image_text"
-    context_window: Literal["current", "last3"] = "last3"
+    context_window: Literal["current", "last3"] = "current"
     querying: Literal["step_by_step", "subgoal", "full_trajectory"] = "step_by_step"
     chat_history: Literal["stateless", "rolling", "full"] = "stateless"
     chat_turns_max: int = 3
diff --git a/interface/coords.py b/interface/coords.py
index 6511c2c..e3b3153 100644
--- a/interface/coords.py
+++ b/interface/coords.py
@@ -4,6 +4,7 @@
 
 from gridworld.backends.base import GridState
 from gridworld.task_spec import Position, TaskSpecification
+from prompting_experiments.prompt_templates import observation as observation_templates
 
 FACING_ORDER = ["NORTH", "EAST", "SOUTH", "WEST"]
 
@@ -126,29 +127,42 @@ def describe_cell(
     cols: int,
 ) -> str:
     if row < 1 or row > rows or col < 1 or col > cols:
-        return "out of bounds"
+        return observation_templates.CELL_OUT_OF_BOUNDS
     if (row, col) in walls:
-        return "wall"
+        return observation_templates.CELL_WALL
     if (row, col) == goal:
-        return f"GOAL ({row},{col})"
+        return observation_templates.CELL_GOAL.format(row=row, col=col)
 
     key_color = key_at_cell(task_spec, state, row, col)
     if key_color:
-        return f"{key_color} key ({row},{col})"
+        return observation_templates.CELL_KEY.format(
+            key_color=key_color,
+            row=row,
+            col=col,
+        )
 
     for door in task_spec.mechanisms.doors:
         if to_row_col(door.position) == (row, col):
             status = "open" if door.id in state.open_doors else door.initial_state
-            return f"{status} {door.requires_key} door ({row},{col})"
+            return observation_templates.CELL_DOOR.format(
+                status=status,
+                requires_key=door.requires_key,
+                row=row,
+                col=col,
+            )
 
     for gate in task_spec.mechanisms.gates:
         if to_row_col(gate.position) == (row, col):
             cur = "open" if gate.id in state.open_gates else gate.initial_state
-            return f"{cur} gate ({row},{col})"
+            return observation_templates.CELL_GATE.format(state=cur, row=row, col=col)
 
     for switch in task_spec.mechanisms.switches:
         if to_row_col(switch.position) == (row, col):
             on_off = "on" if switch.id in state.active_switches else switch.initial_state
-            return f"switch ({on_off}) ({row},{col})"
+            return observation_templates.CELL_SWITCH.format(
+                state=on_off,
+                row=row,
+                col=col,
+            )
 
-    return f"open ({row},{col})"
+    return observation_templates.CELL_OPEN.format(row=row, col=col)
diff --git a/interface/feedback.py b/interface/feedback.py
index 18cc3aa..95416df 100644
--- a/interface/feedback.py
+++ b/interface/feedback.py
@@ -15,6 +15,7 @@
     switch_at_cell,
     switches_controlling_gate,
 )
+from prompting_experiments.prompt_templates import feedback as feedback_templates
 
 
 def infer_step_outcome(
@@ -35,13 +36,17 @@ def infer_step_outcome(
         door = next((d for d in task_spec.mechanisms.doors if d.id == door_id), None)
         color = door.requires_key if door else "matching"
         if action == "MOVE_FORWARD" and prev_pos != curr_pos:
-            return "OPENED", f"Opened {color} door {door_id} and moved to {curr_pos}."
-        return "OPENED", f"Opened {color} door {door_id}."
+            return "OPENED", feedback_templates.OPENED_AND_MOVED.format(
+                color=color,
+                door_id=door_id,
+                position=curr_pos,
+            )
+        return "OPENED", feedback_templates.OPENED_DOOR.format(color=color, door_id=door_id)
 
     if action in ("TURN_LEFT", "TURN_RIGHT"):
         if prev.agent_direction != curr.agent_direction:
-            return "TURNED", f"Now facing {agent_facing(curr)}."
-        return "NOTHING", f"{action} had no effect."
+            return "TURNED", feedback_templates.NOW_FACING.format(facing=agent_facing(curr))
+        return "NOTHING", feedback_templates.ACTION_NO_EFFECT.format(action=action)
 
     if action == "MOVE_FORWARD":
         if prev_pos == curr_pos:
@@ -50,9 +55,10 @@ def infer_step_outcome(
             if key_color:
                 return (
                     "BLOCKED",
-                    f"MOVE_FORWARD blocked by a {key_color} key at {fwd}. "
-                    "Keys occupy their cell; you cannot walk onto them. "
-                    "Face the key and use PICKUP from your current cell.",
+                    feedback_templates.MOVE_BLOCKED_BY_KEY.format(
+                        key_color=key_color,
+                        position=fwd,
+                    ),
                 )
             gate = gate_at_cell(task_spec, prev, fwd[0], fwd[1])
             if gate and not gate["open"]:
@@ -61,17 +67,23 @@ def infer_step_outcome(
                     switch_list = ", ".join(controllers)
                     return (
                         "BLOCKED",
-                        f"MOVE_FORWARD blocked by closed gate {gate['id']} at {fwd}. "
-                        f"Activate switch(es) {switch_list} to open it.",
+                        feedback_templates.MOVE_BLOCKED_BY_GATE_WITH_SWITCHES.format(
+                            gate_id=gate["id"],
+                            position=fwd,
+                            switches=switch_list,
+                        ),
                     )
                 return (
                     "BLOCKED",
-                    f"MOVE_FORWARD blocked by closed gate {gate['id']} at {fwd}.",
+                    feedback_templates.MOVE_BLOCKED_BY_GATE.format(
+                        gate_id=gate["id"],
+                        position=fwd,
+                    ),
                 )
-            return "BLOCKED", "MOVE_FORWARD blocked by wall or closed door/gate."
+            return "BLOCKED", feedback_templates.MOVE_BLOCKED_GENERIC
         if terminated and reward > 0 and curr_pos == goal:
-            return "DONE", f"Reached goal at {goal}."
-        return "MOVED", f"Moved to {curr_pos}."
+            return "DONE", feedback_templates.REACHED_GOAL.format(goal=goal)
+        return "MOVED", feedback_templates.MOVED_TO.format(position=curr_pos)
 
     if action == "PICKUP":
         if (
@@ -79,15 +91,15 @@ def infer_step_outcome(
             or len(curr.collected_keys) > len(prev.collected_keys)
         ):
             carried = curr.agent_carrying or "a"
-            return "PICKUP", f"Picked up {carried} key."
-        return "NOTHING", "Nothing to pick up here."
+            return "PICKUP", feedback_templates.PICKED_UP_KEY.format(key_color=carried)
+        return "NOTHING", feedback_templates.NOTHING_TO_PICK_UP
 
     if action == "TOGGLE":
         if (
             prev.active_switches != curr.active_switches
             or prev.open_gates != curr.open_gates
         ):
-            return "TOGGLED", "Toggled switch or gate state changed."
+            return "TOGGLED", feedback_templates.TOGGLED_STATE_CHANGED
         fwd = forward_cell(prev)
         switch_ahead = switch_at_cell(task_spec, fwd[0], fwd[1])
         switch_here = switch_at_cell(task_spec, prev_pos[0], prev_pos[1])
@@ -96,12 +108,11 @@ def infer_step_outcome(
             if switch_ahead["switch_type"] == "hold":
                 return (
                     "NOTHING",
-                    f"TOGGLE had no effect. MOVE_FORWARD onto the switch at {fwd} "
-                    "(hold switches activate while you stand on them).",
+                    feedback_templates.TOGGLE_HOLD_SWITCH_HINT.format(position=fwd),
                 )
             return (
                 "NOTHING",
-                f"TOGGLE had no effect. MOVE_FORWARD onto the switch at {fwd}, then TOGGLE.",
+                feedback_templates.TOGGLE_SWITCH_HINT.format(position=fwd),
             )
         if gate_ahead and not gate_ahead["open"]:
             controllers = switches_controlling_gate(task_spec, str(gate_ahead["id"]))
@@ -109,21 +120,22 @@ def infer_step_outcome(
                 switch_list = ", ".join(controllers)
                 return (
                     "NOTHING",
-                    "Gates cannot be toggled directly. "
-                    f"Activate switch(es) {switch_list} instead.",
+                    feedback_templates.GATE_TOGGLE_WITH_SWITCHES.format(
+                        switches=switch_list,
+                    ),
                 )
-            return "NOTHING", "Gates cannot be toggled directly. Activate a linked switch instead."
+            return "NOTHING", feedback_templates.GATE_TOGGLE_GENERIC
         return (
             "NOTHING",
-            "TOGGLE had no effect. Stand on a switch and TOGGLE, or use PICKUP/keys for doors.",
+            feedback_templates.TOGGLE_NO_EFFECT,
         )
 
     if action == "DONE":
         if terminated and reward > 0 and curr_pos == goal:
-            return "DONE", f"Task complete at {goal}."
-        return "WRONG_DONE", f"DONE called but not at goal {goal}."
+            return "DONE", feedback_templates.TASK_COMPLETE.format(goal=goal)
+        return "WRONG_DONE", feedback_templates.WRONG_DONE.format(goal=goal)
 
-    return "INVALID", f"Unknown or unsupported action {action}."
+    return "INVALID", feedback_templates.UNKNOWN_ACTION.format(action=action)
 
 
 def format_step_feedback(
@@ -139,23 +151,27 @@ def format_step_feedback(
     )
     prev_pos = agent_row_col(prev)
     if event_type == "BLOCKED":
-        return f"BLOCKED — {action}: {event_message} You remain at {prev_pos}.", event_type
+        return feedback_templates.BLOCKED_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type
     if event_type == "TURNED":
-        return f"TURNED — {action}: {event_message}", event_type
+        return feedback_templates.TURNED_FEEDBACK.format(action=action, message=event_message), event_type
     if event_type == "MOVED":
-        return f"MOVED — {action}: {event_message}", event_type
+        return feedback_templates.MOVED_FEEDBACK.format(action=action, message=event_message), event_type
     if event_type == "DONE":
-        return f"SUCCESS — {action}: {event_message}", event_type
+        return feedback_templates.SUCCESS_FEEDBACK.format(action=action, message=event_message), event_type
     if event_type == "PICKUP":
-        return f"PICKUP — {action}: {event_message}", event_type
+        return feedback_templates.PICKUP_FEEDBACK.format(action=action, message=event_message), event_type
     if event_type == "NOTHING":
-        return f"NOTHING — {action}: {event_message} You remain at {prev_pos}.", event_type
+        return feedback_templates.NOTHING_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type
     if event_type == "OPENED":
-        return f"OPENED — {action}: {event_message}", event_type
+        return feedback_templates.OPENED_FEEDBACK.format(action=action, message=event_message), event_type
     if event_type == "TOGGLED":
-        return f"TOGGLED — {action}: {event_message}", event_type
+        return feedback_templates.TOGGLED_FEEDBACK.format(action=action, message=event_message), event_type
     if event_type == "WRONG_DONE":
-        return f"WRONG DONE — {action}: {event_message} You remain at {prev_pos}.", event_type
+        return feedback_templates.WRONG_DONE_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type
     if event_type == "INVALID":
-        return f"INVALID — {action}: {event_message} You remain at {prev_pos}.", event_type
-    return f"{event_type} — {action}: {event_message}", event_type
+        return feedback_templates.INVALID_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type
+    return feedback_templates.DEFAULT_FEEDBACK.format(
+        event_type=event_type,
+        action=action,
+        message=event_message,
+    ), event_type
diff --git a/interface/observation.py b/interface/observation.py
index d898d26..ff97abb 100644
--- a/interface/observation.py
+++ b/interface/observation.py
@@ -19,6 +19,7 @@
 from gridworld.task_spec import TaskSpecification
 
 from interface.renderer import render_user_observation_text, rgb_to_image_block
+from prompting_experiments.prompt_templates import observation as observation_templates
 
 ObservationMode = Literal["text_only", "image_text", "image_only"]
 ContextWindow = Literal["current", "last3"]
@@ -51,11 +52,17 @@ def history_text(
     if not recs:
         return ""
 
-    lines = ["Recent history (last 3 steps, oldest first):"]
+    lines = [observation_templates.RECENT_HISTORY_HEADER]
     for rec in recs:
         row, col = rec["position_after"]
         lines.append(
-            f"  ({int(row)}, {int(col)}) facing {rec['facing_after']} -> {rec['action']} -> {rec['prompt_feedback']}"
+            observation_templates.RECENT_HISTORY_STEP.format(
+                row=int(row),
+                col=int(col),
+                facing=rec["facing_after"],
+                action=rec["action"],
+                feedback=rec["prompt_feedback"],
+            )
         )
     return "\n".join(lines)
 
@@ -78,16 +85,22 @@ def history_content_blocks(
             continue
         blocks.append(rgb_to_image_block(rgb))
         if observation == "image_only":
-            blocks.append({"type": "text", "text": f"Action: {rec['action']}\n\n"})
+            blocks.append(
+                {
+                    "type": "text",
+                    "text": observation_templates.IMAGE_HISTORY_ACTION.format(
+                        action=rec["action"]
+                    ),
+                }
+            )
 
     if not blocks:
         return []
 
     intro = (
-        "Recent steps (oldest first). Each image is the maze view from which the "
-        "following action was chosen; infer pose and environment state from the image.\n\n"
+        observation_templates.IMAGE_ONLY_HISTORY_INTRO
         if observation == "image_only"
-        else "Recent step views (oldest first):\n\n"
+        else observation_templates.IMAGE_TEXT_HISTORY_INTRO
     )
     return [{"type": "text", "text": intro}] + blocks
 
diff --git a/interface/prompt_strategies.py b/interface/prompt_strategies.py
index 64580cc..657cb33 100644
--- a/interface/prompt_strategies.py
+++ b/interface/prompt_strategies.py
@@ -16,34 +16,12 @@
     maze_rows_cols,
     wall_cells,
 )
+from prompting_experiments.prompt_templates import system as system_templates
+from prompting_experiments.prompt_templates import user as user_templates
 
-MECHANISM_LIST = (
-    "The environment may contain:\n"
-    "- Keys: pick them up to open doors of the matching color\n"
-    "- Doors: blocked passages that require a matching key\n"
-    "- Switches: step onto them to activate (hold) or TOGGLE while standing on them\n"
-    "- Gates: blocked passages controlled by switches\n"
-)
-
-MECHANISM_RULES = (
-    "RULES (domain logic):\n"
-    "  - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you\n"
-    "    cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.\n"
-    "  - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then\n"
-    "    MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.\n"
-    "  - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type\n"
-    "    switches activate automatically while you stand on them. Only switches are toggled. Linked\n"
-    "    gates are open if at least one linked switch is on, and closed if all are off.\n"
-    "  - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not.\n"
-    "  - Closed gates and doors you lack a key for block movement like walls until resolved.\n"
-    "  - Use DONE only when you are standing on the goal cell."
-)
-
-FINAL_OUTPUT_INSTRUCTION = (
-    "On the last line, output exactly:\n"
-    "FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  "
-    "(comma-separated; one or more valid actions)"
-)
+MECHANISM_LIST = system_templates.MECHANISM_LIST
+MECHANISM_RULES = system_templates.MECHANISM_RULES
+FINAL_OUTPUT_INSTRUCTION = system_templates.FINAL_OUTPUT_INSTRUCTION
 
 
 class MinimalPromptStrategy:
@@ -51,12 +29,14 @@ def __init__(self, actions_hint: str) -> None:
         self._actions_hint = actions_hint
 
     def build_system_prompt(self, querying_suffix: str = "") -> str:
-        return (
-            "Task: move to the goal cell in the grid.\n"
-            f"Valid actions: {self._actions_hint}.\n"
-            f"{FINAL_OUTPUT_INSTRUCTION}"
-            + (f"\n\n{querying_suffix}" if querying_suffix else "")
-        )
+        chunks = [
+            system_templates.TASK_PREFIX,
+            system_templates.VALID_ACTIONS_TEMPLATE.format(actions_hint=self._actions_hint),
+            FINAL_OUTPUT_INSTRUCTION,
+        ]
+        if querying_suffix:
+            chunks.append(querying_suffix)
+        return "\n".join(chunks[:2]) + "\n" + "\n\n".join(chunks[2:])
 
     def build_user_prompt(
         self,
@@ -66,29 +46,34 @@ def build_user_prompt(
         state: GridState,
         last_feedback: str,
     ) -> str:
-        history_block = f"{history_text}\n\n" if history_text else ""
-        obs_block = f"Observation:\n{obs_text}\n\n" if obs_text else ""
+        obs_block = (
+            user_templates.OBSERVATION_SECTION.format(obs_text=obs_text)
+            if obs_text
+            else ""
+        )
         pos = agent_row_col(state)
         goal = goal_row_col(task_spec)
-        return (
-            f"{history_block}"
-            f"{obs_block}"
-            f"Position: {pos}  |  Facing: {agent_facing(state)}  |  Goal: {goal}  |  "
-            f"Step {state.step_count + 1}/{state.max_steps}\n"
-            f"Last result: {last_feedback}\n"
-            "What is your next action?"
+        prompt = user_templates.MINIMAL_USER_PROMPT.format(
+            obs_block=obs_block,
+            position=pos,
+            facing=agent_facing(state),
+            goal=goal,
+            last_feedback=last_feedback,
         )
+        return _with_history(prompt, history_text)
 
 
 class StandardPromptStrategy(MinimalPromptStrategy):
     def build_system_prompt(self, querying_suffix: str = "") -> str:
-        return (
-            "Task: move to the goal cell in the grid.\n"
-            f"{MECHANISM_LIST}\n"
-            f"Valid actions: {self._actions_hint}.\n"
-            f"{FINAL_OUTPUT_INSTRUCTION}"
-            + (f"\n\n{querying_suffix}" if querying_suffix else "")
-        )
+        chunks = [
+            system_templates.TASK_PREFIX,
+            MECHANISM_LIST,
+            system_templates.VALID_ACTIONS_TEMPLATE.format(actions_hint=self._actions_hint),
+            FINAL_OUTPUT_INSTRUCTION,
+        ]
+        if querying_suffix:
+            chunks.append(querying_suffix)
+        return "\n".join(chunks[:3]) + "\n" + "\n\n".join(chunks[3:])
 
 
 class VerbosePromptStrategy(StandardPromptStrategy):
@@ -107,12 +92,6 @@ def build_user_prompt(
         state: GridState,
         last_feedback: str,
     ) -> str:
-        steps_left = state.max_steps - state.step_count
-        budget_warn = (
-            f"  WARNING: Only {steps_left} steps remaining!\n"
-            if steps_left <= max(5, state.max_steps // 5)
-            else ""
-        )
         row, col = agent_row_col(state)
         grow, gcol = goal_row_col(task_spec)
         manhattan = abs(row - grow) + abs(col - gcol)
@@ -140,42 +119,52 @@ def build_user_prompt(
                 rows=rows,
                 cols=cols,
             )
-            neighbour_lines.append(f"  {rel}: {desc}")
-        neighbour_block = "From your perspective:\n" + "\n".join(neighbour_lines) + "\n"
+            neighbour_lines.append(
+                user_templates.NEIGHBOUR_LINE.format(
+                    relative_direction=rel,
+                    description=desc,
+                )
+            )
+        neighbour_block = (
+            user_templates.NEIGHBOUR_BLOCK_HEADER + "\n".join(neighbour_lines) + "\n"
+        )
         mechanism_block = _mechanism_hints_text(task_spec)
-        history_block = f"{history_text}\n\n" if history_text else ""
-        obs_block = f"Observation:\n{obs_text}\n\n" if obs_text else ""
+        obs_block = (
+            user_templates.OBSERVATION_SECTION.format(obs_text=obs_text)
+            if obs_text
+            else ""
+        )
         inventory_str = ", ".join(inventory_list(state)) or "none"
 
-        return (
-            f"{history_block}"
-            f"{obs_block}"
-            f"Position: {row, col}  |  Facing: {agent_facing(state)}  |  Goal: {(grow, gcol)}  |  "
-            f"Manhattan: {manhattan}  |  Step {state.step_count + 1}/{state.max_steps} ({steps_left} left)\n"
-            f"Inventory: {inventory_str}\n"
-            f"{budget_warn}"
-            f"{neighbour_block}"
-            f"{mechanism_block}"
-            f"Last result: {last_feedback}\n"
-            "What is your next action?"
+        prompt = user_templates.VERBOSE_USER_PROMPT.format(
+            obs_block=obs_block,
+            position=(row, col),
+            facing=agent_facing(state),
+            goal=(grow, gcol),
+            manhattan=manhattan,
+            inventory=inventory_str,
+            neighbour_block=neighbour_block,
+            mechanism_block=mechanism_block,
+            last_feedback=last_feedback,
         )
+        return _with_history(prompt, history_text)
 
 
 PromptStrategy = MinimalPromptStrategy
 
 
+def _with_history(prompt: str, history_text: str) -> str:
+    if not history_text:
+        return prompt
+    return f"{history_text}\n\n{prompt}"
+
+
 def _mechanism_hints_text(task_spec: TaskSpecification) -> str:
     lines = []
     if task_spec.mechanisms.keys or task_spec.mechanisms.doors:
-        lines.append(
-            "  - Face an adjacent key and PICKUP (do not walk onto the key). "
-            "Face a locked door with the matching key and TOGGLE to open it, then MOVE_FORWARD through."
-        )
+        lines.append(user_templates.KEY_DOOR_HINT)
     if task_spec.mechanisms.switches or task_spec.mechanisms.gates:
-        lines.append(
-            "  - MOVE_FORWARD onto a switch, then TOGGLE (hold switches activate on step). "
-            "Gates cannot be toggled — activate their linked switch(es)."
-        )
+        lines.append(user_templates.SWITCH_GATE_HINT)
     if not lines:
         return ""
-    return "Hints:\n" + "\n".join(lines) + "\n"
+    return user_templates.MECHANISM_HINTS_HEADER + "\n".join(lines) + "\n"
diff --git a/interface/querying.py b/interface/querying.py
index daa4117..9413a98 100644
--- a/interface/querying.py
+++ b/interface/querying.py
@@ -4,6 +4,7 @@
 from typing import List, Literal
 
 from interface.parser import normalize_action, parse_final_output
+from prompting_experiments.prompt_templates import querying as querying_templates
 
 QueryingKind = Literal["step_by_step", "subgoal", "full_trajectory"]
 
@@ -51,18 +52,8 @@ def system_prompt_suffix(self) -> str:
         if self.kind == "step_by_step":
             return ""
         if self.kind == "subgoal":
-            return (
-                "For each turn output:\n"
-                "  SUB_GOAL: <short description of your next waypoint>\n"
-                "  ACTIONS: <comma-separated action list to reach it>"
-            )
-        return (
-            "Output your complete trajectory once as:\n"
-            "  SUB_GOAL: <short description of the full plan>\n"
-            "  ACTIONS: <comma-separated action list from start to finish>\n"
-            "The last action in ACTIONS should be DONE (when you expect to be at the goal).\n"
-            "You will not be queried again — this is your only planning turn."
-        )
+            return querying_templates.SUBGOAL_SUFFIX
+        return querying_templates.FULL_TRAJECTORY_SUFFIX
 
     def step_metadata(self) -> dict:
         if self.kind == "step_by_step":
diff --git a/interface/renderer.py b/interface/renderer.py
index 34881d3..d9638fb 100644
--- a/interface/renderer.py
+++ b/interface/renderer.py
@@ -18,11 +18,13 @@
     to_row_col,
     wall_cells,
 )
+from prompting_experiments.prompt_templates import observation as observation_templates
 
 if TYPE_CHECKING:
     from gridworld.backends.base import GridState
     from gridworld.task_spec import TaskSpecification
 
+
 #TODO: Move to utils.py
 def rgb_to_png_bytes(rgb: np.ndarray) -> bytes:
     img = Image.fromarray(np.asarray(rgb, dtype=np.uint8))
@@ -43,13 +45,11 @@ def _static_layout_lines(task_spec: TaskSpecification) -> list[str]:
     start = to_row_col(task_spec.maze.start)
     goal = goal_row_col(task_spec)
     return [
-        f"The world is a {rows} by {cols} grid.",
-        "Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``;"
-        " tuples in this text use ``(row, column)`` matching env state (row southward, column east)."
-        " So ``x`` = column index, ``y`` = row index.",
-        f"The start is at {start}.",
-        f"The goal is at {goal}.",
-        f"The following cells are walls: {wall_str}.",
+        observation_templates.WORLD_SIZE_LINE.format(rows=rows, cols=cols),
+        observation_templates.COORDINATE_EXPLANATION,
+        observation_templates.START_LINE.format(start=start),
+        observation_templates.GOAL_LINE.format(goal=goal),
+        observation_templates.WALLS_LINE.format(walls=wall_str),
     ]
 
 
@@ -64,14 +64,20 @@ def _mechanism_lines(task_spec: TaskSpecification, state: GridState | None = Non
         if key.id in collected:
             continue
         row, col = to_row_col(key.position)
-        parts.append(f"There is a {key.color} key at ({row},{col}).")
+        parts.append(
+            observation_templates.KEY_LINE.format(color=key.color, row=row, col=col)
+        )
 
     for door in task_spec.mechanisms.doors:
         row, col = to_row_col(door.position)
         status = "open" if door.id in open_doors else door.initial_state
         parts.append(
-            f"There is a {status} {door.requires_key} door at ({row},{col})."
-            f" It requires the {door.requires_key} key to open."
+            observation_templates.DOOR_LINE.format(
+                status=status,
+                requires_key=door.requires_key,
+                row=row,
+                col=col,
+            )
         )
 
     for switch in task_spec.mechanisms.switches:
@@ -79,16 +85,26 @@ def _mechanism_lines(task_spec: TaskSpecification, state: GridState | None = Non
         on_off = "on" if switch.id in active else switch.initial_state
         controls = ", ".join(switch.controls)
         parts.append(
-            f"There is a {switch.switch_type} switch at ({row},{col}) (currently {on_off})."
-            f" It controls: {controls}."
+            observation_templates.SWITCH_LINE.format(
+                switch_type=switch.switch_type,
+                row=row,
+                col=col,
+                state=on_off,
+                controls=controls,
+            )
         )
 
     for gate in task_spec.mechanisms.gates:
         row, col = to_row_col(gate.position)
         cur = "open" if gate.id in open_gates else gate.initial_state
         parts.append(
-            f"There is a gate ({gate.id}) at ({row},{col})."
-            f" It is currently {cur} (initially {gate.initial_state})."
+            observation_templates.GATE_LINE.format(
+                gate_id=gate.id,
+                row=row,
+                col=col,
+                state=cur,
+                initial_state=gate.initial_state,
+            )
         )
     return parts
 
@@ -102,18 +118,19 @@ def render_user_observation_text(task_spec: TaskSpecification, state: GridState)
     pos = agent_row_col(state)
     inv = ", ".join(inventory_list(state)) or "empty"
     head = [
-        "Current situation (this step):",
-        f"The goal is at {goal}.",
-        f"You are at {pos} facing {agent_facing(state)}.",
-        f"Environment steps used so far: {state.step_count} (max {state.max_steps} before timeout).",
-        f"Your inventory: {inv}.",
+        observation_templates.CURRENT_SITUATION_HEADER,
+        observation_templates.CURRENT_GOAL_LINE.format(goal=goal),
+        observation_templates.CURRENT_AGENT_LINE.format(
+            position=pos,
+            facing=agent_facing(state),
+        ),
+        observation_templates.CURRENT_INVENTORY_LINE.format(inventory=inv),
         "",
-        "Map contents as of this step (keys on the ground, doors, switches, gates):",
+        observation_templates.CURRENT_MAP_CONTENTS_HEADER,
     ]
     mech = _mechanism_lines(task_spec, state)
     if mech:
         head.extend(mech)
     else:
-        head.append("(No keys on the ground, doors, switches, or gates in the current state description.)")
+        head.append(observation_templates.NO_MECHANISMS_LINE)
     return "\n".join(head)
-
diff --git a/interface/runner.py b/interface/runner.py
index 91dc448..c48b86c 100644
--- a/interface/runner.py
+++ b/interface/runner.py
@@ -33,6 +33,8 @@
 )
 from interface.querying import QueryingMode
 from interface.renderer import render_initial_maze_text
+from prompting_experiments.prompt_templates import feedback as feedback_templates
+from prompting_experiments.prompt_templates import system as system_templates
 
 logger = logging.getLogger(__name__)
 
@@ -57,6 +59,18 @@ def _trim_rolling_chat(messages: List[dict], max_pairs: int) -> None:
         del messages[1 : 1 + (tail_len - cap)]
 
 
+def _reset_agent_usage(agent: Callable[[List[dict]], str]) -> None:
+    """Clear per-call telemetry so stale usage cannot leak into a later query."""
+    reset_usage = getattr(agent, "reset_usage", None)
+    if callable(reset_usage):
+        reset_usage()
+        return
+    try:
+        setattr(agent, "last_usage", None)
+    except (AttributeError, TypeError):
+        pass
+
+
 def build_runner(
     config: ExperimentConfig,
     backend: MiniGridBackend,
@@ -100,15 +114,15 @@ def run(
         system_prompt = self.prompt.build_system_prompt(self.querying.system_prompt_suffix())
         if self.config.observation in ("text_only", "image_text"):
             system_prompt = (
-                f"{system_prompt}\n\nInitial maze (fixed for this episode):\n"
-                f"{render_initial_maze_text(self.task_spec)}"
+                f"{system_prompt}\n\n"
+                f"{system_templates.INITIAL_MAZE_SECTION.format(maze_text=render_initial_maze_text(self.task_spec))}"
             )
         system_message = {"role": "system", "content": system_prompt}
         chat_history = self.config.chat_history
         messages: List[dict] = [system_message] if chat_history in ("rolling", "full") else []
 
         action_queue: List[str] = []
-        last_feedback = "Episode start."
+        last_feedback = feedback_templates.INITIAL_FEEDBACK
         consecutive_failures = 0
         transcript: List[dict] = []
         max_steps = self.task_spec.max_steps
@@ -122,7 +136,9 @@ def run(
 
         if logger.isEnabledFor(logging.INFO):
             logger.info(
-                "Episode start: max_steps=%s querying=%s observation=%s context_window=%s chat_history=%s",
+                "Episode start: task_id=%s seed=%s max_steps=%s querying=%s observation=%s context_window=%s chat_history=%s",
+                self.task_spec.task_id,
+                self.task_spec.seed,
                 max_steps,
                 self.config.querying,
                 self.config.observation,
@@ -154,11 +170,14 @@ def run(
                     agent_messages = messages
                 if logger.isEnabledFor(logging.INFO):
                     logger.info(
-                        "LLM query #%d: messages_in_context=%d current_turn_has_image=%s",
+                        "LLM query #%d: task_id=%s observation=%s messages_in_context=%d current_turn_has_image=%s",
                         query_count,
+                        self.task_spec.task_id,
+                        self.config.observation,
                         len(agent_messages),
                         has_image,
                     )
+                _reset_agent_usage(agent)
                 t_llm = time.perf_counter()
                 model_text = agent(agent_messages)
                 llm_s = time.perf_counter() - t_llm
@@ -169,44 +188,57 @@ def run(
                 action_queue = self.querying.parse_actions(model_text)
                 if logger.isEnabledFor(logging.INFO):
                     logger.info(
-                        "LLM query #%d finished in %.2fs: reply_chars=%d actions_parsed=%d",
+                        "LLM query #%d finished: task_id=%s observation=%s elapsed=%.2fs reply_chars=%d actions_parsed=%d",
                         query_count,
+                        self.task_spec.task_id,
+                        self.config.observation,
                         llm_s,
                         len(model_text),
                         len(action_queue),
                     )
                 if logger.isEnabledFor(logging.DEBUG):
-                    logger.debug("LLM query #%d reply:\n%s", query_count, model_text)
-                transcript.append(
-                    {
-                        "kind": "query",
-                        "query_index": query_count,
-                        "env_step_count": state.step_count,
-                        "agent_messages": copy.deepcopy(agent_messages),
-                        "assistant_reply": model_text,
-                        "parsed_actions": list(action_queue),
-                        "parse_ok": bool(action_queue),
-                        "has_image": has_image,
-                        "llm_latency_s": llm_s,
-                        "chat_history_mode": chat_history,
-                        "agent_message_count": len(agent_messages),
-                        "actions_remaining_before_step": len(action_queue),
-                    }
-                )
+                    logger.debug(
+                        "LLM query #%d reply: task_id=%s observation=%s\n%s",
+                        query_count,
+                        self.task_spec.task_id,
+                        self.config.observation,
+                        model_text,
+                    )
+                query_record = {
+                    "kind": "query",
+                    "query_index": query_count,
+                    "env_step_count": state.step_count,
+                    "agent_messages": copy.deepcopy(agent_messages),
+                    "assistant_reply": model_text,
+                    "parsed_actions": list(action_queue),
+                    "parse_ok": bool(action_queue),
+                    "has_image": has_image,
+                    "llm_latency_s": llm_s,
+                    "chat_history_mode": chat_history,
+                    "agent_message_count": len(agent_messages),
+                    "actions_remaining_before_step": len(action_queue),
+                }
+                usage = getattr(agent, "last_usage", None)
+                if isinstance(usage, dict):
+                    query_record["usage"] = dict(usage)
+                transcript.append(query_record)
                 # check if we got any valid actions; 
                 # if not, we'll count it as a parse failure and give feedback, 
                 # but still allow retries until max_parse_retries is reached
                 if not action_queue:
                     parse_failures += 1
                     logger.warning(
-                        "LLM query #%d: no valid actions parsed; parse failure %d/%d",
+                        "LLM query #%d: task_id=%s observation=%s no valid actions parsed; parse failure %d/%d",
                         query_count,
+                        self.task_spec.task_id,
+                        self.config.observation,
                         parse_failures,
                         self.config.max_parse_retries,
                     )
                     last_feedback = (
-                        f"Could not parse FINAL_OUTPUT (one or more valid actions). "
-                        f"Use only: {ACTIONS_HINT}."
+                        feedback_templates.PARSE_FAILURE_FEEDBACK.format(
+                            actions_hint=ACTIONS_HINT
+                        )
                     )
                     if parse_failures >= self.config.max_parse_retries:
                         end_reason = "parse_failed"
diff --git a/interface/smoke_tests/smoke_llm.py b/interface/smoke_tests/smoke_llm.py
index fd7d5e0..8d9058c 100644
--- a/interface/smoke_tests/smoke_llm.py
+++ b/interface/smoke_tests/smoke_llm.py
@@ -80,19 +80,35 @@ def __init__(
     def __call__(self, messages: list[dict]) -> str:
         self._query_seq += 1
         text = self._inner(messages)
-        self._records.append(
-            {
-                "query": self._query_seq,
-                "messages_in_context": len(messages),
-                "reply": text,
-            }
-        )
+        record = {
+            "query": self._query_seq,
+            "messages_in_context": len(messages),
+            "reply": text,
+        }
+        if self.last_usage is not None:
+            record["usage"] = dict(self.last_usage)
+        self._records.append(record)
         if self._log_replies:
             print(f"\n{'=' * 72}\nLLM query {self._query_seq} (messages={len(messages)})\n{'=' * 72}")
             print(text)
             print(f"{'=' * 72}\n")
         return text
 
+    @property
+    def last_usage(self) -> dict[str, int] | None:
+        usage = getattr(self._inner, "last_usage", None)
+        return usage if isinstance(usage, dict) else None
+
+    def reset_usage(self) -> None:
+        reset_usage = getattr(self._inner, "reset_usage", None)
+        if callable(reset_usage):
+            reset_usage()
+            return
+        try:
+            setattr(self._inner, "last_usage", None)
+        except (AttributeError, TypeError):
+            pass
+
 
 def main() -> None:
     parser = argparse.ArgumentParser(
diff --git a/interface/telemetry.py b/interface/telemetry.py
new file mode 100644
index 0000000..dd3a3c4
--- /dev/null
+++ b/interface/telemetry.py
@@ -0,0 +1,42 @@
+"""Shared telemetry normalization for interface producers and scorer consumers."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+TOKEN_COUNT_KEYS = ("total_tokens", "token_count", "tokens", "model_tokens")
+
+
+def normalize_token_usage(usage: Any) -> dict[str, int] | None:
+    """Normalize provider token usage into input, output, and total counts."""
+    if not isinstance(usage, dict):
+        return None
+    input_tokens = usage.get("input_tokens", usage.get("prompt_tokens"))
+    output_tokens = usage.get("output_tokens", usage.get("completion_tokens"))
+    total_tokens = usage.get("total_tokens")
+    if total_tokens is None and (input_tokens is not None or output_tokens is not None):
+        total_tokens = int(input_tokens or 0) + int(output_tokens or 0)
+
+    normalized = {}
+    if input_tokens is not None:
+        normalized["input_tokens"] = int(input_tokens)
+    if output_tokens is not None:
+        normalized["output_tokens"] = int(output_tokens)
+    if total_tokens is not None:
+        normalized["total_tokens"] = int(total_tokens)
+    return normalized or None
+
+
+def token_count_from_record(record: dict[str, Any]) -> int | None:
+    """Extract one token total without counting nested aliases twice."""
+    for container in (record, record.get("info"), record.get("metadata")):
+        if not isinstance(container, dict):
+            continue
+        for key in TOKEN_COUNT_KEYS:
+            if container.get(key) is not None:
+                return int(container[key])
+        usage = normalize_token_usage(container.get("usage"))
+        if usage is not None and usage.get("total_tokens") is not None:
+            return usage["total_tokens"]
+    return None
diff --git a/pipeline/__init__.py b/pipeline/__init__.py
new file mode 100644
index 0000000..b603032
--- /dev/null
+++ b/pipeline/__init__.py
@@ -0,0 +1,14 @@
+"""Bare-bones run pipeline for MultiNet v2.0 (tests 1-3).
+
+Sequential, inspectable orchestration that wires the canonical pipeline stages
+over the ``interface/`` runner (Stack A) and the ``scorer/`` package:
+
+- Stage 1: fixtures + manifest (``gridworld/fixtures/manifest.json``)
+- Stage 2: static solve & score  -> ``scorer.score_task_file``
+- Stage 3: runtime runs (live models) -> ``pipeline.run_stage3``
+- Stage 3 instrumentation -> ``pipeline.episode_metrics``
+- Stage 4: runtime score -> ``scorer.compute_runtime_score``
+- Stage 5: reports -> ``pipeline.reports``
+
+See ``scripts/run_pipeline.py`` for the orchestrator CLI.
+"""
diff --git a/pipeline/episode_metrics.py b/pipeline/episode_metrics.py
new file mode 100644
index 0000000..de97bee
--- /dev/null
+++ b/pipeline/episode_metrics.py
@@ -0,0 +1,286 @@
+"""Stage-3 instrumentation: derive test-2/test-3 signals from an episode log.
+
+Pure post-processing over the ``interface/`` runner's ``episode.json`` (the dict
+returned by ``ExperimentRunner.run`` and flushed by ``flush_episode_log``), the
+task spec, the canonical paths, and the manifest row. No runner edits required:
+each ``kind == "step"`` transcript record already carries ``event_type`` and a
+``state_after`` snapshot with the mechanism id sets and agent ``(x, y)`` position.
+
+Coordinate convention: positions here are ``(x, y)`` taken from
+``state_after.agent_position`` (NOT the ``(row, col)`` ``position_after`` field),
+matching the planner positions in ``canonical_paths.json``.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+# Mechanism id sets carried on every state snapshot, in direct-actuation priority
+# order (keys/switches the agent acts on, then doors/gates that open as effects).
+_MECHANISM_FIELDS = ("collected_keys", "active_switches", "open_doors", "open_gates")
+
+
+def _position(state: Any) -> Optional[tuple[int, int]]:
+    """Return the agent cell from a state snapshot, or ``None`` if unreadable."""
+    if isinstance(state, dict):
+        raw = state.get("agent_position") or state.get("position")
+        if isinstance(raw, (list, tuple)) and len(raw) >= 2:
+            return int(raw[0]), int(raw[1])
+    return None
+
+
+def _mechanism_sets(state: Any) -> dict[str, set[str]]:
+    source = state if isinstance(state, dict) else {}  # tolerate null/non-dict snapshots
+    return {name: set(source.get(name) or []) for name in _MECHANISM_FIELDS}
+
+
+def _step_records(episode: dict[str, Any]) -> list[dict[str, Any]]:
+    """Collect the ``kind == "step"`` dict records from the transcript, in order."""
+    transcript = episode.get("transcript", [])
+    return [
+        entry for entry in transcript if isinstance(entry, dict) and entry.get("kind") == "step"
+    ]
+
+
+def visited_cells(episode: dict[str, Any]) -> list[tuple[int, int]]:
+    """Ordered agent cells (x, y) with consecutive repeats collapsed.
+
+    Cells come from ``initial_state``, then each reset/step transcript record,
+    then ``final_state``; snapshots without a readable position are skipped.
+    """
+    # Gather the raw state snapshots first; positions are parsed in one pass below.
+    snapshots: list[Any] = [episode.get("initial_state")]
+    for rec in episode.get("transcript", []):
+        kind = rec.get("kind") if isinstance(rec, dict) else None
+        if kind == "reset":
+            snapshots.append(rec.get("state"))
+        elif kind == "step":
+            snapshots.append(rec.get("state_after"))
+    snapshots.append(episode.get("final_state"))
+
+    # Only *adjacent* duplicates are dropped, so later revisits of a cell remain
+    # visible in the returned path.
+    path: list[tuple[int, int]] = []
+    for snapshot in snapshots:
+        cell = _position(snapshot)
+        if cell is None:
+            continue
+        if not path or path[-1] != cell:
+            path.append(cell)
+    return path
+
+
+def mechanism_interaction_order(episode: dict[str, Any]) -> list[str]:
+    """Mechanism ids in the order the agent first engaged each one.
+
+    Each step's ``state_after`` id sets are diffed against the preceding
+    snapshot, starting from ``initial_state``. Newly-added ids are taken in
+    ``_MECHANISM_FIELDS`` order (sorted within a field), so a switch toggle
+    that also opens a gate lists the switch before its downstream gate.
+    """
+    # Dict keys act as an insertion-ordered set: only the first sighting of an
+    # id fixes its place in the result.
+    first_seen: dict[str, None] = {}
+    before = _mechanism_sets(episode.get("initial_state"))
+    for step in _step_records(episode):
+        after = _mechanism_sets(step.get("state_after"))
+        for name in _MECHANISM_FIELDS:
+            for mech_id in sorted(after[name] - before[name]):
+                first_seen.setdefault(mech_id, None)
+        # The next step is diffed against this snapshot, not the initial one.
+        before = after
+    return list(first_seen)
+
+
+def failure_point(
+    episode: dict[str, Any],
+    expected_mechanisms: list[str],
+    mech_order: list[str],
+) -> Optional[dict[str, Any]]:
+    """Describe where a failed run stalled; ``None`` for successful runs.
+
+    The report's ``mechanism`` is the first id of ``expected_mechanisms`` that
+    is absent from ``mech_order`` (``None`` when every expected id was engaged
+    yet the run still failed). It also carries the runner ``end_reason``, the
+    last visited cell, the engaged order, and every missing id.
+    """
+    if episode.get("success"):
+        return None
+    already_engaged = set(mech_order)
+    missing = [mech for mech in expected_mechanisms if mech not in already_engaged]
+    trail = visited_cells(episode)
+    # Assemble the diagnostic report one field at a time.
+    report: dict[str, Any] = {"mechanism": missing[0] if missing else None}
+    report["end_reason"] = episode.get("end_reason")
+    report["final_cell"] = list(trail[-1]) if trail else None
+    report["engaged"] = list(mech_order)
+    report["missing"] = missing
+    return report
+
+
+def path_choice(
+    episode: dict[str, Any],
+    route_short_cells: Optional[list[Any]],
+    route_long_cells: Optional[list[Any]],
+) -> Optional[str]:
+    """Label the test-2 route the agent's visited cells point to.
+
+    ``route_*_cells`` are discriminator cells unique to each route (cached in
+    the manifest by ``validate_fixtures``). The label is ``"short_mech"`` or
+    ``"long_open"`` when only that route's cells were visited, ``"mixed"`` for
+    both, ``"none"`` for neither, and ``None`` when no route cells are defined
+    (non-test-2).
+    """
+    if not (route_short_cells or route_long_cells):
+        return None
+    visited = set(visited_cells(episode))
+    short_route = {tuple(cell) for cell in (route_short_cells or [])}
+    long_route = {tuple(cell) for cell in (route_long_cells or [])}
+    touched = (not short_route.isdisjoint(visited), not long_route.isdisjoint(visited))
+    labels = {
+        (True, False): "short_mech",
+        (False, True): "long_open",
+        (True, True): "mixed",
+    }
+    return labels.get(touched, "none")
+
+
+def episode_token_count(episode: dict[str, Any]) -> Optional[int]:
+    """Total token usage across ``kind == "query"`` transcript records.
+
+    Returns ``None`` when no query record yields a token count.
+    """
+    from interface.telemetry import token_count_from_record
+
+    counts = [
+        token_count_from_record(rec)
+        for rec in episode.get("transcript", [])
+        if isinstance(rec, dict) and rec.get("kind") == "query"
+    ]
+    known = [count for count in counts if count is not None]
+    return sum(known) if known else None
+
+
+def _canonical_optimal_steps(canonical_paths: dict[str, Any]) -> Optional[int]:
+    """Optimal step count from the ``bfs`` entry, else from the top level."""
+    nested = canonical_paths.get("bfs", canonical_paths)
+    steps = nested.get("optimal_steps") if isinstance(nested, dict) else None
+    if steps is None:
+        steps = canonical_paths.get("optimal_steps")
+    return None if steps is None else int(steps)
+
+
+def _episode_reward(episode: dict[str, Any]) -> Any:
+    """Reward stored on ``final_state``; ``None`` when that state is not a dict."""
+    final_state = episode.get("final_state")
+    return None if not isinstance(final_state, dict) else final_state.get("reward")
+
+
+def build_metrics(
+    episode: dict[str, Any],
+    canonical_paths: dict[str, Any],
+    manifest_row: dict[str, Any],
+) -> dict[str, Any]:
+    """Derive the test-specific signals shared by the run row and the scorer.
+
+    ``canonical_paths`` is part of the signature but is not read here.
+    """
+    engaged_order = mechanism_interaction_order(episode)
+    expected = list(manifest_row.get("expected_mechanisms") or [])
+    stalled_at = failure_point(episode, expected, engaged_order)
+    short_cells = manifest_row.get("route_short_cells")
+    long_cells = manifest_row.get("route_long_cells")
+    route = path_choice(episode, short_cells, long_cells)
+    signals = {"mechanism_interaction_order": engaged_order, "failure_point": stalled_at}
+    return {**signals, "path_choice": route}
+
+
+def build_run_row(
+    episode: dict[str, Any],
+    canonical_paths: dict[str, Any],
+    manifest_row: dict[str, Any],
+    *,
+    agent_or_model: str,
+    seed: int,
+    backend: str = "minigrid",
+    raw_output_ref: Optional[str] = None,
+    metrics: Optional[dict[str, Any]] = None,
+    prompt_variant: str = "default",
+) -> dict[str, Any]:
+    """Build one ``episode_runs.jsonl`` row (Appendix A.3 fields).
+
+    ``condition`` is the task-intrinsic axis (e.g. the test-3 mechanism order);
+    ``prompt_variant`` is the orthogonal prompt axis selected by ``--conditions``.
+    The two are kept distinct so prompt variants do not collapse onto the
+    manifest condition. Null ``task_spec``/``steps_used`` values are tolerated.
+    """
+    metrics = metrics if metrics is not None else build_metrics(episode, canonical_paths, manifest_row)
+    success = bool(episode.get("success"))
+    end_reason = episode.get("end_reason")
+    steps = int(episode.get("steps_used") or 0)  # ``or`` also covers an explicit null
+    optimal_steps = _canonical_optimal_steps(canonical_paths)
+    # Mirror scorer.runtime's step_ratio: optimal_steps == 0 is a perfect 0-step
+    # solve, not a zero ratio, so the jsonl and run_score.json agree.
+    if not success or optimal_steps is None:
+        optimality_ratio = 0.0
+    elif optimal_steps == 0:
+        optimality_ratio = 1.0 if steps == 0 else 0.0
+    else:
+        optimality_ratio = optimal_steps / max(steps, optimal_steps)
+    return {
+        "task_id": manifest_row.get("task_id") or (episode.get("task_spec") or {}).get("task_id"),
+        "experiment": manifest_row.get("experiment"),
+        "condition": manifest_row.get("condition"),
+        "prompt_variant": prompt_variant,
+        "backend": backend,
+        "agent_or_model": agent_or_model,
+        "seed": seed,
+        "success": success,
+        "terminated": end_reason == "success",
+        "truncated": end_reason == "truncated",
+        "reward": _episode_reward(episode),
+        "steps": steps,
+        "optimal_steps": optimal_steps,
+        "optimality_ratio": optimality_ratio,
+        "path_choice": metrics["path_choice"],
+        "mechanism_interaction_order": metrics["mechanism_interaction_order"],
+        "failure_point": metrics["failure_point"],
+        "tokens": episode_token_count(episode),
+        "raw_output_ref": raw_output_ref,
+    }
+
+
+def enrich_run_for_scoring(
+    episode: dict[str, Any],
+    manifest_row: dict[str, Any],
+    *,
+    agent_or_model: str,
+    seed: int,
+    backend: str = "minigrid",
+    metrics: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    """Episode dict + the fields ``scorer.compute_runtime_score`` reads/passes through.
+
+    Returns a shallow copy of ``episode`` with the run identity and the derived
+    test-2/test-3 signals layered on, so they flow into ``run_score.json``.
+    ``metrics`` defaults to ``build_metrics(...)``; ``None``-valued signals are
+    not copied. A null or missing ``task_spec`` yields a ``None`` task id.
+    """
+    metrics = metrics if metrics is not None else build_metrics(episode, {}, manifest_row)
+    run = dict(episode)
+    run["task_id"] = manifest_row.get("task_id") or (episode.get("task_spec") or {}).get("task_id")
+    run["backend"] = backend
+    run["adapter"] = agent_or_model
+    run["agent_or_model"] = agent_or_model
+    run["model_id"] = agent_or_model
+    run["seed"] = seed
+    run["terminated"] = episode.get("end_reason") == "success"
+    run["truncated"] = episode.get("end_reason") == "truncated"
+    # episode_log nests reward under final_state, but the scorer only reads a
+    # top-level ``reward``; lift it so run_score.json matches episode_runs.jsonl.
+    if run.get("reward") is None:
+        run["reward"] = _episode_reward(episode)
+    for key in ("path_choice", "mechanism_interaction_order", "failure_point"):
+        if metrics.get(key) is not None:
+            run[key] = metrics[key]
+    return run
diff --git a/pipeline/reports.py b/pipeline/reports.py
new file mode 100644
index 0000000..a8c43d5
--- /dev/null
+++ b/pipeline/reports.py
@@ -0,0 +1,286 @@
+"""Stage 5 — thin aggregation reports for tests 1-3.
+
+Pure functions over in-memory run rows (Appendix A.3 dicts), per-run composites,
+static-score artifacts, and the manifest. These produce calibration *evidence*,
+not a final MultiNet score.
+"""
+
+from __future__ import annotations
+
+import statistics
+from collections import defaultdict
+from typing import Any, Iterable, Optional
+
+import numpy as np
+
+from scorer.config import DIMENSION_NAMES
+
+
+def _run_key(row: dict[str, Any]) -> tuple:
+    """Identity tuple of a run row, used as the ``composites`` lookup key.
+
+    Missing fields contribute ``None``.
+    """
+    # Field order is significant: ``composites`` is looked up by this exact tuple.
+    fields = ("task_id", "agent_or_model", "seed", "condition", "prompt_variant")
+    return tuple(row.get(field) for field in fields)
+
+
+def _mean(values: list[float]) -> Optional[float]:
+    return None if not values else float(statistics.fmean(values))  # None for an empty sample
+
+
+def _median(values: list[float]) -> Optional[float]:
+    return None if not values else float(statistics.median(values))  # None for an empty sample
+
+
+def _group_success(rows: Iterable[dict[str, Any]], key: str) -> dict[str, dict[str, float]]:
+    """Run count and success rate per group, grouping rows by ``str(row.get(key))``."""
+    # Outcomes are stored as 0.0/1.0 up front so each group can be averaged directly.
+    outcomes: dict[str, list[float]] = defaultdict(list)
+    for row in rows:
+        group = str(row.get(key))
+        outcomes[group].append(float(bool(row.get("success"))))
+    return {name: {"n": len(flags), "success_rate": _mean(flags)} for name, flags in outcomes.items()}
+
+
+def scoring_calibration_summary(
+    rows: list[dict[str, Any]],
+    composites: dict[tuple, float],
+    static_by_task: dict[str, dict[str, Any]],
+) -> dict[str, Any]:
+    """Test 1: success rates, optimality, and 12-dimension correlation evidence.
+
+    ``composites`` is looked up by ``_run_key(row)``; ``static_by_task`` maps a
+    task id to its static-score artifact. Correlations are computed only when
+    at least two statically-scored tasks have a composite, and a dimension (or
+    the target) with zero spread is reported as ``None``.
+    """
+    successful_opt = [
+        float(r["optimality_ratio"])
+        for r in rows
+        if r.get("success") and r.get("optimality_ratio") is not None
+    ]
+
+    # Per-task composites, for correlating static dimensions against difficulty.
+    comp_by_task: dict[str, list[float]] = defaultdict(list)
+    for r in rows:
+        comp = composites.get(_run_key(r))
+        if comp is not None:
+            comp_by_task[r.get("task_id")].append(float(comp))
+
+    tasks = [t for t in static_by_task if t in comp_by_task]
+    correlation: dict[str, Optional[float]] = {}
+    point_weight_candidates: dict[str, Optional[float]] = {}
+    if len(tasks) >= 2:
+        dim_matrix = np.array(
+            [
+                [float((static_by_task[t].get("dimensions_12") or {}).get(name, 0.0)) for name in DIMENSION_NAMES]
+                for t in tasks
+            ],
+            dtype=float,
+        )
+        target = np.array([_mean(comp_by_task[t]) or 0.0 for t in tasks], dtype=float)
+        target_is_flat = np.std(target) == 0  # loop-invariant, so computed once
+        for idx, name in enumerate(DIMENSION_NAMES):
+            col = dim_matrix[:, idx]
+            if target_is_flat or np.std(col) == 0:
+                correlation[name] = None
+            else:
+                correlation[name] = float(np.corrcoef(col, target)[0, 1])
+        # Candidate weights: each dimension's share of the summed |correlation|.
+        abs_corr = {n: abs(c) for n, c in correlation.items() if c is not None}
+        total = sum(abs_corr.values())
+        for name in DIMENSION_NAMES:
+            point_weight_candidates[name] = (
+                abs_corr[name] / total if total > 0 and name in abs_corr else None
+            )
+
+    static_scores = [
+        float(static_by_task[t]["static_score"])
+        for t in static_by_task
+        if static_by_task[t].get("static_score") is not None
+    ]
+    tier_boundary_candidates: dict[str, float] = {}
+    if static_scores:
+        p33, p66 = (float(np.percentile(static_scores, q)) for q in (33, 66))
+        tier_boundary_candidates = {"p33": p33, "p66": p66}
+
+    return {
+        "experiment": "test1",
+        "run_count": len(rows),
+        "task_count": len(static_by_task),
+        "ineligible_tasks": sorted(t for t, s in static_by_task.items() if not s.get("is_beatable", True)),
+        "success_rate_by_task": _group_success(rows, "task_id"),
+        "success_rate_by_condition": _group_success(rows, "condition"),
+        "success_rate_by_prompt_variant": _group_success(rows, "prompt_variant"),
+        "success_rate_by_model": _group_success(rows, "agent_or_model"),
+        "optimality_ratio_mean": _mean(successful_opt),
+        "optimality_ratio_median": _median(successful_opt),
+        "dimension_correlation": correlation,
+        "point_weight_candidates": point_weight_candidates,
+        "tier_boundary_candidates": tier_boundary_candidates,
+    }
+
+
+def complexity_distance_summary(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    """Test 2: path-choice counts (short mechanistic vs long open route).
+
+    Mappings are built in first-seen row order so report output is stable.
+    """
+    test2 = [r for r in rows if r.get("experiment") == "test2"]
+    by_group: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
+    overall: dict[str, int] = defaultdict(int)
+    outcomes: dict[str, list[float]] = defaultdict(list)
+    for r in test2:
+        choice = r.get("path_choice") or "none"
+        group = (
+            f"{r.get('task_id')}|{r.get('condition')}|"
+            f"{r.get('prompt_variant')}|{r.get('agent_or_model')}"
+        )
+        by_group[group][choice] += 1
+        overall[choice] += 1
+        outcomes[choice].append(float(bool(r.get("success"))))
+    return {
+        "experiment": "test2",
+        "run_count": len(test2),
+        "path_choice_overall": dict(overall),
+        "path_choice_by_group": {g: dict(c) for g, c in by_group.items()},
+        "success_rate_by_path_choice": {c: _mean(flags) for c, flags in outcomes.items()},
+    }
+
+
+def mechanism_ordering_pairs(
+    rows: list[dict[str, Any]],
+    manifest_rows: list[dict[str, Any]],
+) -> dict[str, Any]:
+    """Test 3: paired success deltas across matched mechanism-ordering pairs.
+
+    Test-3 rows are grouped by the manifest ``pair_id`` of their task and then
+    by condition; rows whose task has no ``pair_id`` are left out of ``pairs``.
+    A pair's ``paired_success_delta`` is filled in only when the pair has
+    exactly two conditions, as first-minus-second in sorted condition order.
+    """
+    pair_of: dict[Any, Any] = {}
+    expected_of: dict[Any, list[Any]] = {}
+    for entry in manifest_rows:
+        pair_of[entry.get("task_id")] = entry.get("pair_id")
+        expected_of[entry.get("task_id")] = list(entry.get("expected_mechanisms") or [])
+
+    def _condition_stats(cond_rows: list[dict[str, Any]]) -> dict[str, Any]:
+        """Summarize the runs recorded for one condition of a pair."""
+        failures: dict[str, int] = defaultdict(int)
+        order_match: list[float] = []
+        for run in cond_rows:
+            if not run.get("success"):
+                failures[str((run.get("failure_point") or {}).get("mechanism"))] += 1
+            expected = expected_of.get(run.get("task_id"), [])
+            # The interaction order also carries downstream effects (opened
+            # doors/gates) that are not in expected_mechanisms; compare only
+            # the actuated mechanisms' relative order so a correct solve matches.
+            wanted = set(expected)
+            engaged = [m for m in (run.get("mechanism_interaction_order") or []) if m in wanted]
+            order_match.append(float(engaged == expected) if expected else 0.0)
+        return {
+            "n": len(cond_rows),
+            "success_rate": _mean([float(bool(run.get("success"))) for run in cond_rows]),
+            "failure_point_counts": dict(failures),
+            "expected_order_match_rate": _mean(order_match),
+        }
+
+    test3 = [r for r in rows if r.get("experiment") == "test3"]
+    pairs: dict[str, dict[str, list[dict[str, Any]]]] = defaultdict(lambda: defaultdict(list))
+    for r in test3:
+        pid = pair_of.get(r.get("task_id"))
+        if pid is not None:
+            pairs[pid][str(r.get("condition"))].append(r)
+
+    pair_reports: dict[str, Any] = {}
+    for pid, conditions in pairs.items():
+        cond_stats = {cond: _condition_stats(runs) for cond, runs in conditions.items()}
+        delta = None
+        if len(cond_stats) == 2:
+            first, second = sorted(cond_stats)
+            rate_a, rate_b = cond_stats[first]["success_rate"], cond_stats[second]["success_rate"]
+            if rate_a is not None and rate_b is not None:
+                delta = {"conditions": [first, second], "success_delta": rate_a - rate_b}
+        pair_reports[pid] = {"conditions": cond_stats, "paired_success_delta": delta}
+
+    return {"experiment": "test3", "run_count": len(test3), "pairs": pair_reports}
+
+
+def _summary(
+    rows: list[dict[str, Any]], composites: dict[tuple, Optional[float]]
+) -> dict[str, Any]:
+    """Aggregate model-performance metrics over a set of run rows.
+
+    Optimality is averaged over successful runs only. Apart from ``n``, each
+    metric is ``None`` when no row contributes a value to it.
+    """
+    successful = [r for r in rows if r.get("success")]
+    opt = [float(r["optimality_ratio"]) for r in successful if r.get("optimality_ratio") is not None]
+    tokens = [int(r["tokens"]) for r in rows if r.get("tokens") is not None]
+    steps = [float(r["steps"]) for r in rows if r.get("steps") is not None]
+    # Runs with no composite (missing key or null value) are left out of the mean.
+    scores = (composites.get(_run_key(r)) for r in rows)
+    report: dict[str, Any] = {"n": len(rows)}
+    report["success_rate"] = _mean([float(bool(r.get("success"))) for r in rows])
+    report["optimality_ratio_mean"] = _mean(opt)
+    report["optimality_ratio_median"] = _median(opt)
+    report["steps_mean"] = _mean(steps)
+    report["tokens_mean"] = _mean([float(t) for t in tokens])
+    report["tokens_total"] = float(sum(tokens)) if tokens else None
+    report["composite_mean"] = _mean([float(s) for s in scores if s is not None])
+    return report
+
+
+def model_report(
+    run_rows: list[dict[str, Any]],
+    composites: dict[tuple, Optional[float]],
+    model_id: str,
+    run_set_id: str,
+) -> dict[str, Any]:
+    """Machine-readable per-model performance report.
+
+    Provisional: the raw metrics (success/steps/optimality/tokens) are
+    meaningful now, but composite fields are placeholders until the scorer is
+    tuned. Shares one schema across models so an external tool can compare them.
+    Only rows whose ``agent_or_model`` equals ``model_id`` are included.
+    """
+    rows = [r for r in run_rows if r.get("agent_or_model") == model_id]
+
+    def _summaries_by(key: str) -> dict[str, Any]:
+        """Summarize this model's rows per distinct ``str(row.get(key))``."""
+        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
+        for r in rows:
+            grouped[str(r.get(key))].append(r)
+        return {name: _summary(members, composites) for name, members in grouped.items()}
+
+    # Per-run entries are assembled in one fixed column order.
+    def _task_entry(r: dict[str, Any]) -> dict[str, Any]:
+        """Project one run row onto the per-task columns of the report."""
+        entry = {name: r.get(name) for name in ("task_id", "experiment", "condition", "prompt_variant", "seed")}
+        entry["success"] = bool(r.get("success"))
+        for name in ("steps", "optimal_steps", "optimality_ratio", "path_choice", "tokens"):
+            entry[name] = r.get(name)
+        entry["composite"] = composites.get(_run_key(r))
+        return entry
+
+    # The report-level backend is taken from the first row; it falls back to
+    # "minigrid" when that row has no backend or there are no rows at all.
+    backend = rows[0].get("backend", "minigrid") if rows else "minigrid"
+    seeds = {r.get("seed") for r in rows} - {None}
+    return {
+        "schema_version": "0.1.0",
+        "model_id": model_id,
+        "run_set_id": run_set_id,
+        "backend": backend,
+        "seeds": sorted(seeds),
+        "task_count": len({r.get("task_id") for r in rows}),
+        "run_count": len(rows),
+        "provisional": True,
+        "overall": _summary(rows, composites),
+        "by_experiment": _summaries_by("experiment"),
+        "by_prompt_variant": _summaries_by("prompt_variant"),
+        "tasks": [_task_entry(r) for r in rows],
+    }
diff --git a/pipeline/run_stage3.py b/pipeline/run_stage3.py
new file mode 100644
index 0000000..f78b570
--- /dev/null
+++ b/pipeline/run_stage3.py
@@ -0,0 +1,56 @@
+"""Stage 3 — runtime runs on the ``interface/`` stack (Stack A, live models).
+
+Builds a MiniGrid backend + ``ExperimentRunner`` for one task, runs a single
+episode with a live-model agent, and flushes the canonical ``episode.json``
+artifact (plus PNG frames). Baselines are NOT run here — they feed Stage-2
+difficulty/canonical paths via the scorer.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import json
+from pathlib import Path
+from typing import Any, Callable
+
+from interface.config import ExperimentConfig
+from interface.episode_log import flush_episode_log
+from interface.loader import load_task
+from interface.runner import build_runner
+from gridworld.task_spec import TaskSpecification
+
+
+# An agent is any callable mapping chat messages -> model text (optionally
+# exposing a ``last_usage`` attribute for token telemetry).
+Agent = Callable[[list[dict]], str]
+
+
+def _spec_with_seed(spec: TaskSpecification, seed: int) -> TaskSpecification:
+    """Return ``spec`` itself when it already carries ``seed``, else a reseeded copy."""
+    if spec.seed != seed:
+        return dataclasses.replace(spec, seed=seed)
+    return spec
+
+
+def run_episode(
+    task_source: str | Path,
+    config: ExperimentConfig,
+    agent: Agent,
+    seed: int,
+    out_dir: str | Path,
+) -> dict[str, Any]:
+    """Run a single episode for ``task_source`` and persist its log.
+
+    The task is loaded, reseeded with ``seed``, and run through a runner built
+    from ``config``. The result is flushed into ``out_dir`` and the written
+    file is parsed back, so the returned dict is the JSON-safe payload on disk.
+    """
+    backend, loaded_spec = load_task(task_source)
+    seeded_spec = _spec_with_seed(loaded_spec, seed)
+    backend.configure(seeded_spec)
+
+    episode_runner = build_runner(config, backend, seeded_spec)
+    outcome = episode_runner.run(agent, verbose=False, maze_path=str(task_source))
+
+    written = flush_episode_log(outcome, Path(out_dir))
+    return json.loads(written.read_text(encoding="utf-8"))
diff --git a/prompting_experiments/__init__.py b/prompting_experiments/__init__.py
new file mode 100644
index 0000000..d7ccc0f
--- /dev/null
+++ b/prompting_experiments/__init__.py
@@ -0,0 +1,5 @@
+"""Prompt condition-set configs for interface experiments."""
+
+from .exp_design import CONDITION_SETS, ConditionSet, Variant, iter_condition_configs
+
+__all__ = ["CONDITION_SETS", "ConditionSet", "Variant", "iter_condition_configs"]
diff --git a/prompting_experiments/condition_set_1_prompt.py b/prompting_experiments/condition_set_1_prompt.py
new file mode 100644
index 0000000..8d0d312
--- /dev/null
+++ b/prompting_experiments/condition_set_1_prompt.py
@@ -0,0 +1,27 @@
+"""Condition set 1: prompt verbosity."""
+
+from __future__ import annotations
+
+from .core import ConditionSet, Variant
+
+
+CONDITION_SET = ConditionSet(
+    name="Prompt",
+    comparisons=(
+        "Standard: goal + mechanism descriptions + action list",
+        "Verbose: standard + explicit rules",
+    ),
+    decision="If delta < 5%, use standard. If > 5%, use verbose.",
+    variants={
+        "standard": Variant(
+            name="standard",
+            description="Standard task prompt with mechanism descriptions.",
+            config_overrides={"prompting": "standard"},
+        ),
+        "verbose": Variant(
+            name="verbose",
+            description="Standard prompt plus explicit domain rules and local hints.",
+            config_overrides={"prompting": "verbose"},
+        ),
+    },
+)
diff --git a/prompting_experiments/condition_set_2_observation_format.py b/prompting_experiments/condition_set_2_observation_format.py
new file mode 100644
index 0000000..ccb3152
--- /dev/null
+++ b/prompting_experiments/condition_set_2_observation_format.py
@@ -0,0 +1,33 @@
+"""Condition set 2: observation format."""
+
+from __future__ import annotations
+
+from .core import ConditionSet, Variant
+
+
+CONDITION_SET = ConditionSet(
+    name="Observation format",
+    comparisons=(
+        "Text only",
+        "Image + text",
+        "Image only",
+    ),
+    decision="Measure whether text adds meaningful signal beyond image input.",
+    variants={
+        "text_only": Variant(
+            name="text_only",
+            description="Natural-language current observation, no image blocks.",
+            config_overrides={"observation": "text_only"},
+        ),
+        "image_text": Variant(
+            name="image_text",
+            description="Image block plus natural-language observation.",
+            config_overrides={"observation": "image_text"},
+        ),
+        "image_only": Variant(
+            name="image_only",
+            description="Image block with no initial natural-language maze map.",
+            config_overrides={"observation": "image_only"},
+        ),
+    },
+)
diff --git a/prompting_experiments/condition_set_3_context_window.py b/prompting_experiments/condition_set_3_context_window.py
new file mode 100644
index 0000000..9129c35
--- /dev/null
+++ b/prompting_experiments/condition_set_3_context_window.py
@@ -0,0 +1,33 @@
+"""Condition set 3: context window."""
+
+from __future__ import annotations
+
+from .core import ConditionSet, Variant
+
+
+CONDITION_SET = ConditionSet(
+    name="Context window",
+    comparisons=(
+        "0 history: current observation only",
+        "Last 3 executed steps",
+        "Current observation + text summary of prior actions",
+    ),
+    decision="Compare current-state-only prompting against recent history.",
+    variants={
+        "current": Variant(
+            name="current",
+            description="Prompt only with the current observation.",
+            config_overrides={"context_window": "current"},
+        ),
+        "last3": Variant(
+            name="last3",
+            description="Include up to the last three executed steps.",
+            config_overrides={"context_window": "last3"},
+        ),
+        "text_summary": Variant(
+            name="text_summary",
+            description="PR #12 design axis; no ExperimentConfig summary mode exists yet.",
+            implemented=False,
+        ),
+    },
+)
diff --git a/prompting_experiments/condition_set_4_querying_strategy.py b/prompting_experiments/condition_set_4_querying_strategy.py
new file mode 100644
index 0000000..9f166e0
--- /dev/null
+++ b/prompting_experiments/condition_set_4_querying_strategy.py
@@ -0,0 +1,33 @@
+"""Condition set 4: querying strategy."""
+
+from __future__ import annotations
+
+from .core import ConditionSet, Variant
+
+
+CONDITION_SET = ConditionSet(
+    name="Querying strategy",
+    comparisons=(
+        "Step-by-step: one action per query",
+        "Subgoal planning: model outputs a subgoal and action chunk",
+        "Full trajectory: model outputs a complete plan once",
+    ),
+    decision="Determine whether chunked or one-shot planning improves performance.",
+    variants={
+        "step_by_step": Variant(
+            name="step_by_step",
+            description="Ask for one action each query.",
+            config_overrides={"querying": "step_by_step"},
+        ),
+        "subgoal": Variant(
+            name="subgoal",
+            description="Ask for a short subgoal and action chunk.",
+            config_overrides={"querying": "subgoal"},
+        ),
+        "full_trajectory": Variant(
+            name="full_trajectory",
+            description="Ask once for a complete action trajectory.",
+            config_overrides={"querying": "full_trajectory"},
+        ),
+    },
+)
diff --git a/prompting_experiments/condition_set_5_in_context_learning.py b/prompting_experiments/condition_set_5_in_context_learning.py
new file mode 100644
index 0000000..7b1d8e3
--- /dev/null
+++ b/prompting_experiments/condition_set_5_in_context_learning.py
@@ -0,0 +1,32 @@
+"""Condition set 5: in-context learning."""
+
+from __future__ import annotations
+
+from .core import ConditionSet, Variant
+
+
+CONDITION_SET = ConditionSet(
+    name="In-context learning",
+    comparisons=(
+        "Zero-shot: no examples",
+        "1-shot: one example trajectory from a different maze",
+    ),
+    decision=(
+        "If 1-shot dramatically improves performance, the bottleneck is likely "
+        "task understanding rather than navigation capability."
+    ),
+    variants={
+        "zero_shot": Variant(
+            name="zero_shot",
+            description="Current interface behavior.",
+            config_overrides={},
+        ),
+        "one_shot": Variant(
+            name="one_shot",
+            description="PR #12 design axis; example selection/injection is not implemented yet.",
+            implemented=False,
+        ),
+    },
+    implemented=False,
+    notes="ICL examples must not use evaluation mazes.",
+)
diff --git a/prompting_experiments/core.py b/prompting_experiments/core.py
new file mode 100644
index 0000000..a5c1a0a
--- /dev/null
+++ b/prompting_experiments/core.py
@@ -0,0 +1,50 @@
+"""Shared types for prompt experiment condition registries."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, replace
+from typing import TYPE_CHECKING, Iterator, Mapping
+
+if TYPE_CHECKING:
+    from interface.config import ExperimentConfig
+
+
+@dataclass(frozen=True)
+class Variant:
+    """One experiment variant expressed as overrides to ``ExperimentConfig``."""
+
+    name: str  # variant identifier; echoed in the "not implemented" error
+    description: str  # human-readable summary
+    config_overrides: Mapping[str, object] | None = None  # kwargs for dataclasses.replace; None = none
+    implemented: bool = True  # False makes build_config raise ValueError
+
+    def build_config(self, base: ExperimentConfig | None = None) -> ExperimentConfig:
+        """Return ``base`` (a default ``ExperimentConfig`` if None) with this variant's overrides applied."""
+        if not self.implemented:
+            raise ValueError(f"Variant is not implemented in ExperimentConfig: {self.name}")
+        from interface.config import ExperimentConfig  # runtime import; the top-level one is TYPE_CHECKING-only
+        cfg = base if base is not None else ExperimentConfig()  # only None means "use defaults"
+        return replace(cfg, **dict(self.config_overrides or {}))
+
+
+@dataclass(frozen=True)
+class ConditionSet:
+    """A named experimental axis and its comparable variants."""
+
+    name: str  # display name of the axis
+    comparisons: tuple[str, ...]  # free-text descriptions of the conditions being compared
+    decision: str  # free-text statement of what the comparison is meant to decide
+    variants: Mapping[str, Variant]  # variant key -> Variant
+    implemented: bool = True  # set-level flag; iter_condition_configs below checks only per-variant flags
+    notes: str = ""  # optional free-text caveats
+
+
+def iter_condition_configs(
+    condition: ConditionSet,
+    base: ExperimentConfig | None = None,
+) -> Iterator[tuple[str, ExperimentConfig]]:
+    """Lazily yield ``(variant_name, config)`` for each implemented variant, in mapping order."""
+    for key, candidate in condition.variants.items():
+        if not candidate.implemented:
+            continue  # skipped here; build_config raises for unimplemented variants
+        yield key, candidate.build_config(base)
diff --git a/prompting_experiments/exp_design.py b/prompting_experiments/exp_design.py
new file mode 100644
index 0000000..d9237c7
--- /dev/null
+++ b/prompting_experiments/exp_design.py
@@ -0,0 +1,40 @@
+"""Experiment prompt condition-set registry.
+
+Each condition set is split into its own module to mirror the PR #12 experiment
+design while keeping runnable prompt behavior centralized in ``interface``.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Iterator, Mapping
+
+if TYPE_CHECKING:
+    from interface.config import ExperimentConfig
+
+from .condition_set_1_prompt import CONDITION_SET as CONDITION_SET_1
+from .condition_set_2_observation_format import CONDITION_SET as CONDITION_SET_2
+from .condition_set_3_context_window import CONDITION_SET as CONDITION_SET_3
+from .condition_set_4_querying_strategy import CONDITION_SET as CONDITION_SET_5
+from .condition_set_5_in_context_learning import CONDITION_SET as CONDITION_SET_6
+from .core import ConditionSet, Variant, iter_condition_configs as _iter_condition_configs
+
+
+CONDITION_SETS: Mapping[str, ConditionSet] = {
+    # Keyed by each set's display name; insertion order is the order listed below.
+    condition_set.name: condition_set
+    for condition_set in (
+        CONDITION_SET_1, CONDITION_SET_2, CONDITION_SET_3, CONDITION_SET_5, CONDITION_SET_6
+    )
+}
+
+
+def iter_condition_configs(
+    condition_name: str,
+    base: ExperimentConfig | None = None,
+) -> Iterator[tuple[str, ExperimentConfig]]:
+    """Look up ``condition_name`` in ``CONDITION_SETS`` and yield its runnable ``(variant_name, config)`` pairs."""
+    selected = CONDITION_SETS[condition_name]  # generator body: a KeyError surfaces on first iteration
+    yield from _iter_condition_configs(selected, base)
+
+
+__all__ = ["CONDITION_SETS", "ConditionSet", "Variant", "iter_condition_configs"]
diff --git a/prompting_experiments/preview_prompts.py b/prompting_experiments/preview_prompts.py
new file mode 100644
index 0000000..b1e82a8
--- /dev/null
+++ b/prompting_experiments/preview_prompts.py
@@ -0,0 +1,141 @@
+"""Generate a text preview of prompt experiment condition variants."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from typing import Any
+
+from prompting_experiments import CONDITION_SETS
+from prompting_experiments.prompt_templates import feedback as feedback_templates
+from prompting_experiments.prompt_templates import system as system_templates
+
+
+def _content_to_text(content: Any) -> str:
+    """Flatten message content (a string or a list of block dicts) into plain preview text."""
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return str(content)
+    rendered: list[str] = []
+    images_seen = 0
+    for item in content:
+        kind = item.get("type") if isinstance(item, dict) else None
+        if kind == "image_url":
+            images_seen += 1  # placeholders are numbered from 1 in encounter order
+            rendered.append(f"[image block {images_seen}]")
+        elif kind == "text":
+            rendered.append(item.get("text", ""))
+    # Falsy entries (e.g. empty text blocks) are dropped before joining.
+    return "\n".join(filter(None, rendered))
+
+
+def _missing_dependency_message(exc: ModuleNotFoundError) -> str:
+    """Build the install hint shown when the module named by ``exc.name`` cannot be imported."""
+    hint = "for example: python3 -m pip install -e '.[dev]'"
+    prefix = f"Missing dependency: {exc.name}. Install the project dependencies in this environment, "
+    return prefix + hint
+
+
+def _prompt_preview(config, maze_path: Path, max_steps: int) -> tuple[str, str]:
+    """Return ``(system_prompt, user_text)`` for the episode-start query built from ``config``."""
+    try:
+        from interface.loader import load_task
+        from interface.runner import build_runner
+        from interface.renderer import render_initial_maze_text
+    except ModuleNotFoundError as exc:
+        raise SystemExit(_missing_dependency_message(exc)) from exc
+    backend, spec = load_task(maze_path)
+    spec.max_steps = max_steps  # overwrites the loaded spec's step limit for this preview
+    runner = build_runner(config, backend, spec)
+    runner.last_rgb, state, _reset_info = backend.reset(seed=spec.seed)
+
+    system_prompt = runner.prompt.build_system_prompt(runner.querying.system_prompt_suffix())
+    if config.observation in ("text_only", "image_text"):  # only these modes append the initial maze text
+        system_prompt = (
+            f"{system_prompt}\n\n"
+            f"{system_templates.INITIAL_MAZE_SECTION.format(maze_text=render_initial_maze_text(spec))}"
+        )
+
+    user_message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, [])
+    return system_prompt, _content_to_text(user_message.get("content"))
+
+
+def build_preview(maze_path: Path, max_steps: int) -> str:
+    """Render every registered condition set and variant for one maze as a single text report."""
+    chunks = [
+        "Prompt Experiment Preview",
+        f"Maze: {maze_path}",
+        f"Max steps: {max_steps}",
+        "",
+    ]
+    for idx, condition in enumerate(CONDITION_SETS.values(), start=1):  # numbered by registry order
+        chunks.extend(
+            [
+                "=" * 88,
+                f"condition set {idx}: {condition.name}",
+                "=" * 88,
+            ]
+        )
+        for variant_name, variant in condition.variants.items():
+            chunks.extend(
+                [
+                    f"variant name: {variant_name}",
+                    f"description: {variant.description}",
+                    "prompts:",
+                ]
+            )
+            if not variant.implemented:
+                chunks.extend(
+                    [
+                        "Status: not implemented in ExperimentConfig",
+                        "-" * 88,
+                    ]
+                )
+                continue
+            # Implemented variant: build its config and render the prompts it produces.
+            try:
+                config = variant.build_config()
+            except ModuleNotFoundError as exc:
+                raise SystemExit(_missing_dependency_message(exc)) from exc
+            system_prompt, user_prompt = _prompt_preview(config, maze_path, max_steps)
+            chunks.extend(
+                [
+                    "[system prompt]",
+                    system_prompt,
+                    "",
+                    "[user prompt]",
+                    user_prompt,
+                    "-" * 88,
+                ]
+            )
+
+    return "\n".join(chunks).rstrip() + "\n"  # trailing whitespace stripped, then exactly one newline
+
+
+def _default_maze_path(name: str) -> Path:
+    return Path(__file__).resolve().parent.parent.joinpath("mazes", "validation_10", name)  # two levels up
+
+
+def main() -> None:
+    """CLI entry point: render every condition-set variant for one maze and write the preview file."""
+    parser = argparse.ArgumentParser(description="Write prompt experiment previews to prompts.txt.")
+    parser.add_argument("--maze", default="V01_empty_room.json")
+    parser.add_argument("--max-steps", type=int, default=5)
+    parser.add_argument("--output", type=Path, default=Path(__file__).resolve().parent / "prompts.txt")
+    args = parser.parse_args()
+
+    # --maze is either a path to an existing file or a file name under mazes/validation_10.
+    maze_path = Path(args.maze)
+    if not maze_path.is_file():
+        maze_path = _default_maze_path(args.maze)
+    if not maze_path.is_file():
+        parser.error(f"maze file not found: {args.maze!r} (also tried {maze_path})")
+
+    preview = build_preview(maze_path, args.max_steps)
+    args.output.write_text(preview, encoding="utf-8")
+    print(f"wrote {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/prompting_experiments/prompt_templates/__init__.py b/prompting_experiments/prompt_templates/__init__.py
new file mode 100644
index 0000000..344f4fc
--- /dev/null
+++ b/prompting_experiments/prompt_templates/__init__.py
@@ -0,0 +1 @@
+"""Agent-facing prompt templates grouped by prompt surface."""
diff --git a/prompting_experiments/prompt_templates/feedback.py b/prompting_experiments/prompt_templates/feedback.py
new file mode 100644
index 0000000..9fb29af
--- /dev/null
+++ b/prompting_experiments/prompt_templates/feedback.py
@@ -0,0 +1,50 @@
+"""Step feedback templates."""
+
+INITIAL_FEEDBACK = "Episode start."
+OPENED_AND_MOVED = "Opened {color} door {door_id} and moved to {position}."
+OPENED_DOOR = "Opened {color} door {door_id}."
+NOW_FACING = "Now facing {facing}."
+ACTION_NO_EFFECT = "{action} had no effect."
+MOVE_BLOCKED_BY_KEY = (
+    "MOVE_FORWARD blocked by a {key_color} key at {position}. "
+    "Keys occupy their cell; you cannot walk onto them. "
+    "Face the key and use PICKUP from your current cell."
+)
+MOVE_BLOCKED_BY_GATE_WITH_SWITCHES = (
+    "MOVE_FORWARD blocked by closed gate {gate_id} at {position}. "
+    "Activate switch(es) {switches} to open it."
+)
+MOVE_BLOCKED_BY_GATE = "MOVE_FORWARD blocked by closed gate {gate_id} at {position}."
+MOVE_BLOCKED_GENERIC = "MOVE_FORWARD blocked by wall or closed door/gate."
+REACHED_GOAL = "Reached goal at {goal}."
+MOVED_TO = "Moved to {position}."
+PICKED_UP_KEY = "Picked up {key_color} key."
+NOTHING_TO_PICK_UP = "Nothing to pick up here."
+TOGGLED_STATE_CHANGED = "Toggled switch or gate state changed."
+TOGGLE_HOLD_SWITCH_HINT = (
+    "TOGGLE had no effect. MOVE_FORWARD onto the switch at {position} "
+    "(hold switches activate while you stand on them)."
+)
+TOGGLE_SWITCH_HINT = "TOGGLE had no effect. MOVE_FORWARD onto the switch at {position}, then TOGGLE."
+GATE_TOGGLE_WITH_SWITCHES = "Gates cannot be toggled directly. Activate switch(es) {switches} instead."
+GATE_TOGGLE_GENERIC = "Gates cannot be toggled directly. Activate a linked switch instead."
+TOGGLE_NO_EFFECT = "TOGGLE had no effect. Stand on a switch and TOGGLE, or use PICKUP/keys for doors."
+TASK_COMPLETE = "Task complete at {goal}."
+WRONG_DONE = "DONE called but not at goal {goal}."
+UNKNOWN_ACTION = "Unknown or unsupported action {action}."
+
+BLOCKED_FEEDBACK = "BLOCKED — {action}: {message} You remain at {position}."
+TURNED_FEEDBACK = "TURNED — {action}: {message}"
+MOVED_FEEDBACK = "MOVED — {action}: {message}"
+SUCCESS_FEEDBACK = "SUCCESS — {action}: {message}"
+PICKUP_FEEDBACK = "PICKUP — {action}: {message}"
+NOTHING_FEEDBACK = "NOTHING — {action}: {message} You remain at {position}."
+OPENED_FEEDBACK = "OPENED — {action}: {message}"
+TOGGLED_FEEDBACK = "TOGGLED — {action}: {message}"
+WRONG_DONE_FEEDBACK = "WRONG DONE — {action}: {message} You remain at {position}."
+INVALID_FEEDBACK = "INVALID — {action}: {message} You remain at {position}."
+DEFAULT_FEEDBACK = "{event_type} — {action}: {message}"
+PARSE_FAILURE_FEEDBACK = (
+    "Could not parse FINAL_OUTPUT. Do not explain. Reply exactly as one line: "
+    "FINAL_OUTPUT: <one of {actions_hint}>."
+)
diff --git a/prompting_experiments/prompt_templates/observation.py b/prompting_experiments/prompt_templates/observation.py
new file mode 100644
index 0000000..c86a74d
--- /dev/null
+++ b/prompting_experiments/prompt_templates/observation.py
@@ -0,0 +1,51 @@
+"""Observation and history prompt templates."""
+
+RECENT_HISTORY_HEADER = "Recent history (last 3 steps, oldest first):"
+RECENT_HISTORY_STEP = "  ({row}, {col}) facing {facing} -> {action} -> {feedback}"
+
+IMAGE_HISTORY_ACTION = "Action: {action}\n\n"
+IMAGE_ONLY_HISTORY_INTRO = (
+    "Recent steps (oldest first). Each image is the maze view from which the "
+    "following action was chosen; infer pose and environment state from the image.\n\n"
+)
+IMAGE_TEXT_HISTORY_INTRO = "Recent step views (oldest first):\n\n"
+
+WORLD_SIZE_LINE = "The world is a {rows} by {cols} grid."
+COORDINATE_EXPLANATION = (
+    "Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``;"
+    " tuples in this text use ``(row, column)`` matching env state (row southward, column east)."
+    " So ``x`` = column index, ``y`` = row index."
+)
+START_LINE = "The start is at {start}."
+GOAL_LINE = "The goal is at {goal}."
+WALLS_LINE = "The following cells are walls: {walls}."
+
+KEY_LINE = "There is a {color} key at ({row},{col})."
+DOOR_LINE = (
+    "There is a {status} {requires_key} door at ({row},{col})."
+    " It requires the {requires_key} key to open."
+)
+SWITCH_LINE = (
+    "There is a {switch_type} switch at ({row},{col}) (currently {state})."
+    " It controls: {controls}."
+)
+GATE_LINE = (
+    "There is a gate ({gate_id}) at ({row},{col})."
+    " It is currently {state} (initially {initial_state})."
+)
+
+CURRENT_SITUATION_HEADER = "Current situation (this step):"
+CURRENT_GOAL_LINE = "The goal is at {goal}."
+CURRENT_AGENT_LINE = "You are at {position} facing {facing}."
+CURRENT_INVENTORY_LINE = "Your inventory: {inventory}."
+CURRENT_MAP_CONTENTS_HEADER = "Map contents as of this step (keys on the ground, doors, switches, gates):"
+NO_MECHANISMS_LINE = "(No keys on the ground, doors, switches, or gates in the current state description.)"
+
+CELL_OUT_OF_BOUNDS = "out of bounds"
+CELL_WALL = "wall"
+CELL_GOAL = "GOAL ({row},{col})"
+CELL_KEY = "{key_color} key ({row},{col})"
+CELL_DOOR = "{status} {requires_key} door ({row},{col})"
+CELL_GATE = "{state} gate ({row},{col})"
+CELL_SWITCH = "switch ({state}) ({row},{col})"
+CELL_OPEN = "open ({row},{col})"
diff --git a/prompting_experiments/prompt_templates/querying.py b/prompting_experiments/prompt_templates/querying.py
new file mode 100644
index 0000000..c878349
--- /dev/null
+++ b/prompting_experiments/prompt_templates/querying.py
@@ -0,0 +1,15 @@
+"""Querying strategy prompt templates."""
+
+SUBGOAL_SUFFIX = (
+    "For each turn output:\n"
+    "  SUB_GOAL: <short description of your next waypoint>\n"
+    "  ACTIONS: <comma-separated action list to reach it>"
+)
+
+FULL_TRAJECTORY_SUFFIX = (
+    "Output your complete trajectory once as:\n"
+    "  SUB_GOAL: <short description of the full plan>\n"
+    "  ACTIONS: <comma-separated action list from start to finish>\n"
+    "The last action in ACTIONS should be DONE (when you expect to be at the goal).\n"
+    "You will not be queried again — this is your only planning turn."
+)
diff --git a/prompting_experiments/prompt_templates/system.py b/prompting_experiments/prompt_templates/system.py
new file mode 100644
index 0000000..f96d774
--- /dev/null
+++ b/prompting_experiments/prompt_templates/system.py
@@ -0,0 +1,36 @@
+"""System prompt templates."""
+
+TASK_PREFIX = "Task: move to the goal cell in the grid."
+
+MECHANISM_LIST = (
+    "The environment may contain:\n"
+    "- Keys: pick them up to open doors of the matching color\n"
+    "- Doors: blocked passages that require a matching key\n"
+    "- Switches: step onto them to activate (hold) or TOGGLE while standing on them\n"
+    "- Gates: blocked passages controlled by switches\n"
+)
+
+MECHANISM_RULES = (
+    "RULES (domain logic):\n"
+    "  - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you\n"
+    "    cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.\n"
+    "  - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then\n"
+    "    MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.\n"
+    "  - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type\n"
+    "    switches activate automatically while you stand on them. Only switches are toggled. Linked\n"
+    "    gates are open if at least one linked switch is on, and closed if all are off.\n"
+    "  - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not.\n"
+    "  - Closed gates and doors you lack a key for block movement like walls until resolved.\n"
+    "  - Use DONE only when you are standing on the goal cell."
+)
+
+VALID_ACTIONS_TEMPLATE = "Valid actions: {actions_hint}."
+
+FINAL_OUTPUT_INSTRUCTION = (
+    "Do not explain, reason, summarize the map, or include any text before the answer.\n"
+    "On the last line, output exactly:\n"
+    "FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  "
+    "(comma-separated; one or more valid actions)"
+)
+
+INITIAL_MAZE_SECTION = "Initial maze (fixed for this episode):\n{maze_text}"
diff --git a/prompting_experiments/prompt_templates/user.py b/prompting_experiments/prompt_templates/user.py
new file mode 100644
index 0000000..100429e
--- /dev/null
+++ b/prompting_experiments/prompt_templates/user.py
@@ -0,0 +1,36 @@
+"""User prompt templates."""
+
+OBSERVATION_SECTION = "Observation:\n{obs_text}\n\n"
+
+MINIMAL_USER_PROMPT = (
+    "{obs_block}"
+    "Position: {position}  |  Facing: {facing}  |  Goal: {goal}\n"
+    "Last result: {last_feedback}\n"
+    "What is your next action?\n"
+    "Reply exactly as one line: FINAL_OUTPUT: <one valid action>"
+)
+
+VERBOSE_USER_PROMPT = (
+    "{obs_block}"
+    "Position: {position}  |  Facing: {facing}  |  Goal: {goal}  |  "
+    "Manhattan: {manhattan}\n"
+    "Inventory: {inventory}\n"
+    "{neighbour_block}"
+    "{mechanism_block}"
+    "Last result: {last_feedback}\n"
+    "What is your next action?\n"
+    "Reply exactly as one line: FINAL_OUTPUT: <one valid action>"
+)
+
+NEIGHBOUR_BLOCK_HEADER = "From your perspective:\n"
+NEIGHBOUR_LINE = "  {relative_direction}: {description}"
+
+MECHANISM_HINTS_HEADER = "Hints:\n"
+KEY_DOOR_HINT = (
+    "  - Face an adjacent key and PICKUP (do not walk onto the key). "
+    "Face a locked door with the matching key and TOGGLE to open it, then MOVE_FORWARD through."
+)
+SWITCH_GATE_HINT = (
+    "  - MOVE_FORWARD onto a switch, then TOGGLE (hold switches activate on step). "
+    "Gates cannot be toggled — activate their linked switch(es)."
+)
diff --git a/prompting_experiments/prompts.txt b/prompting_experiments/prompts.txt
new file mode 100644
index 0000000..9511504
--- /dev/null
+++ b/prompting_experiments/prompts.txt
@@ -0,0 +1,447 @@
+Prompt Experiment Preview
+Maze: /Users/helenlu/HRI/MultiNet-v2.0/mazes/validation_10/V01_empty_room.json
+Max steps: 5
+
+========================================================================================
+condition set 1: Prompt
+========================================================================================
+variant name: standard
+description: Standard task prompt with mechanism descriptions.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: verbose
+description: Standard prompt plus explicit domain rules and local hints.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+RULES (domain logic):
+  - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you
+    cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.
+  - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then
+    MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.
+  - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type
+    switches activate automatically while you stand on them. Only switches are toggled. Linked
+    gates are open if at least one linked switch is on, and closed if all are off.
+  - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not.
+  - Closed gates and doors you lack a key for block movement like walls until resolved.
+  - Use DONE only when you are standing on the goal cell.
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)  |  Manhattan: 10
+Inventory: none
+From your perspective:
+  AHEAD: open (1,2)
+  RIGHT: open (2,1)
+  BEHIND: out of bounds
+  LEFT: out of bounds
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+========================================================================================
+condition set 2: Observation format
+========================================================================================
+variant name: text_only
+description: Natural-language current observation, no image blocks.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: image_text
+description: Image block plus natural-language observation.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: image_only
+description: Image block with no initial natural-language maze map.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+[user prompt]
+[image block 1]
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+========================================================================================
+condition set 3: Context window
+========================================================================================
+variant name: current
+description: Prompt only with the current observation.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: last3
+description: Include up to the last three executed steps.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: text_summary
+description: PR #12 design axis; no ExperimentConfig summary mode exists yet.
+prompts:
+Status: not implemented in ExperimentConfig
+----------------------------------------------------------------------------------------
+========================================================================================
+condition set 4: Querying strategy
+========================================================================================
+variant name: step_by_step
+description: Ask for one action each query.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: subgoal
+description: Ask for a short subgoal and action chunk.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+For each turn output:
+  SUB_GOAL: <short description of your next waypoint>
+  ACTIONS: <comma-separated action list to reach it>
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: full_trajectory
+description: Ask once for a complete action trajectory.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Output your complete trajectory once as:
+  SUB_GOAL: <short description of the full plan>
+  ACTIONS: <comma-separated action list from start to finish>
+The last action in ACTIONS should be DONE (when you expect to be at the goal).
+You will not be queried again — this is your only planning turn.
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+========================================================================================
+condition set 5: In-context learning
+========================================================================================
+variant name: zero_shot
+description: Current interface behavior.
+prompts:
+[system prompt]
+Task: move to the goal cell in the grid.
+The environment may contain:
+- Keys: pick them up to open doors of the matching color
+- Doors: blocked passages that require a matching key
+- Switches: step onto them to activate (hold) or TOGGLE while standing on them
+- Gates: blocked passages controlled by switches
+
+Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE.
+On the last line, output exactly:
+FINAL_OUTPUT: <action>  or  FINAL_OUTPUT: <a>, <b>, ...  (comma-separated; one or more valid actions)
+
+Initial maze (fixed for this episode):
+The world is a 8 by 8 grid.
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index.
+The start is at (1, 1).
+The goal is at (6, 6).
+The following cells are walls: none.
+
+[user prompt]
+[image block 1]
+Observation:
+Current situation (this step):
+The goal is at (6, 6).
+You are at (1, 1) facing EAST.
+Your inventory: empty.
+
+Map contents as of this step (keys on the ground, doors, switches, gates):
+(No keys on the ground, doors, switches, or gates in the current state description.)
+
+Position: (1, 1)  |  Facing: EAST  |  Goal: (6, 6)
+Last result: Episode start.
+What is your next action?
+----------------------------------------------------------------------------------------
+variant name: one_shot
+description: PR #12 design axis; example selection/injection is not implemented yet.
+prompts:
+Status: not implemented in ExperimentConfig
+----------------------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
index 7ce045e..61a82de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,9 @@ multinet-probe-vlm = "scripts.probe_vlm:main"
 multinet-ollama-vision-check = "scripts.ollama_vision_check:main"
 multinet-ollama-maze-shape-check = "scripts.ollama_maze_shape_check:main"
 multinet-vlm-sanity = "scripts.vlm_sanity_check:main"
+multinet-preview-prompts = "prompting_experiments.preview_prompts:main"
+multinet-score-json = "scripts.score_json:main"
+multinet-run-pipeline = "scripts.run_pipeline:main"
 
 [tool.setuptools]
 include-package-data = true
@@ -63,9 +66,13 @@ include = [
     "interface*",
     "mazes*",
     "multigrid*",
+    "pipeline*",
+    "prompting_experiments*",
+    "scorer*",
     "scripts*",
 ]
 
 [tool.setuptools.package-data]
-gridworld = ["tasks/**/*.json", "tasks/*.json"]
+gridworld = ["tasks/**/*.json", "tasks/*.json", "fixtures/**/*.json", "fixtures/*.json"]
 mazes = ["validation_10/**/*.json", "validation_10/*.json"]
+scorer = ["scorer_config.json"]
diff --git a/scorer/__init__.py b/scorer/__init__.py
new file mode 100644
index 0000000..df4a4db
--- /dev/null
+++ b/scorer/__init__.py
@@ -0,0 +1,33 @@
+"""Standalone scoring package for MultiNet task and run artifacts."""
+
+from .scoring import (
+    CanonicalPathReport,
+    RuntimeScoreArtifact,
+    ScoredDifficulty,
+    ScorerConfig,
+    StaticScoreArtifact,
+    compute_12d_score,
+    compute_canonical_paths,
+    compute_greedy_solvability,
+    compute_runtime_score,
+    compute_static_score_artifact,
+    load_scorer_config,
+    score_runtime_file,
+    score_task_file,
+)
+
+__all__ = [
+    "CanonicalPathReport",
+    "RuntimeScoreArtifact",
+    "ScoredDifficulty",
+    "ScorerConfig",
+    "StaticScoreArtifact",
+    "compute_12d_score",
+    "compute_canonical_paths",
+    "compute_greedy_solvability",
+    "compute_runtime_score",
+    "compute_static_score_artifact",
+    "load_scorer_config",
+    "score_runtime_file",
+    "score_task_file",
+]
diff --git a/scorer/artifacts.py b/scorer/artifacts.py
new file mode 100644
index 0000000..165d147
--- /dev/null
+++ b/scorer/artifacts.py
@@ -0,0 +1,173 @@
+"""Dataclasses for scorer artifact payloads."""
+
+from __future__ import annotations
+
+import copy
+from dataclasses import dataclass, field
+from typing import Any
+
+from .config import DIMENSION_NAMES, SCORER_VERSION
+
+
+@dataclass
+class ScoredDifficulty:
+    """Backward-compatible 12-dimension score report."""
+
+    dimensions: list[float]
+    dimension_names: list[str] = field(default_factory=lambda: DIMENSION_NAMES.copy())
+    composite: float = 0.0
+    weights: list[float] = field(default_factory=lambda: [1.0] * len(DIMENSION_NAMES))
+
+    @property
+    def dimensions_by_name(self) -> dict[str, float]:
+        return dict(zip(self.dimension_names, self.dimensions))
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "dimensions": list(self.dimensions),
+            "dimension_names": list(self.dimension_names),
+            "composite": self.composite,
+            "weights": list(self.weights),
+        }
+
+
+@dataclass
+class CanonicalPathReport:
+    """Canonical solver trace artifact for a task: BFS fields stored flat, plus an optional greedy record."""
+
+    task_id: str
+    success: bool
+    actions: list[str]
+    positions: list[tuple[int, int]]
+    optimal_steps: int
+    states_explored: int
+    message: str
+    greedy: dict[str, Any] | None = None
+    inputs_hash: str = ""
+    producer_version: str = SCORER_VERSION
+
+    @property
+    def bfs(self) -> dict[str, Any]:
+        return {  # the flat BFS fields regrouped into the nested record that to_dict() emits under "bfs"
+            "success": self.success,
+            "actions": list(self.actions),
+            "positions": [list(pos) for pos in self.positions],  # tuples become lists for JSON output
+            "optimal_steps": self.optimal_steps,
+            "states_explored": self.states_explored,
+            "message": self.message,
+        }
+
+    def to_dict(self) -> dict[str, Any]:
+        payload = {
+            "task_id": self.task_id,
+            "bfs": self.bfs,
+            "inputs_hash": self.inputs_hash,
+            "producer_version": self.producer_version,
+        }
+        if self.greedy is not None:  # the "greedy" key is omitted entirely when no greedy record exists
+            payload["greedy"] = copy.deepcopy(self.greedy)
+        return payload
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "CanonicalPathReport":
+        bfs = data.get("bfs", data)  # no "bfs" key: read the BFS fields from the payload's top level
+        return cls(
+            task_id=str(data.get("task_id", "")),
+            success=bool(bfs.get("success", False)),
+            actions=[str(action) for action in bfs.get("actions", [])],
+            positions=[
+                (int(pos[0]), int(pos[1]))
+                for pos in bfs.get("positions", [])
+                if isinstance(pos, (list, tuple)) and len(pos) >= 2  # malformed entries are dropped, not reported
+            ],
+            optimal_steps=int(bfs.get("optimal_steps", 0)),
+            states_explored=int(bfs.get("states_explored", 0)),
+            message=str(bfs.get("message", "")),
+            greedy=copy.deepcopy(data.get("greedy")),
+            inputs_hash=str(data.get("inputs_hash", "")),
+            producer_version=str(data.get("producer_version", SCORER_VERSION)),
+        )
+
+
+@dataclass
+class StaticScoreArtifact:
+    """Stage 2 static score artifact."""
+
+    task_id: str
+    is_beatable: bool
+    message: str
+    dimensions: dict[str, float]
+    static_score_unweighted: float
+    static_score: float
+    weights: dict[str, float]
+    validation: dict[str, Any]
+    canonical_agent_features: dict[str, float | None]
+    calibration_version: str
+    inputs_hash: str
+    producer_version: str = SCORER_VERSION
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "task_id": self.task_id,
+            "is_beatable": self.is_beatable,
+            "message": self.message,
+            "dimensions_12": dict(self.dimensions),  # serialized key is "dimensions_12", not the attribute name
+            "static_score_unweighted": self.static_score_unweighted,
+            "static_score": self.static_score,
+            "weights": dict(self.weights),
+            "validation": copy.deepcopy(self.validation),
+            "canonical_agent_features": dict(self.canonical_agent_features),
+            "calibration_version": self.calibration_version,
+            "inputs_hash": self.inputs_hash,
+            "producer_version": self.producer_version,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "StaticScoreArtifact":
+        dimensions = data.get("dimensions_12", data.get("dimensions", {}))  # "dimensions" is the fallback key
+        if isinstance(dimensions, list):  # positional list: pair values with DIMENSION_NAMES in order
+            dimensions = dict(zip(DIMENSION_NAMES, dimensions))
+        return cls(
+            task_id=str(data.get("task_id", "")),
+            is_beatable=bool(data.get("is_beatable", False)),
+            message=str(data.get("message", "")),
+            dimensions={str(k): float(v) for k, v in dimensions.items()},
+            static_score_unweighted=float(data.get("static_score_unweighted", 0.0)),
+            static_score=float(data.get("static_score", data.get("composite", 0.0))),  # "composite" is the fallback key
+            weights={str(k): float(v) for k, v in data.get("weights", {}).items()},
+            validation=dict(data.get("validation", {})),
+            canonical_agent_features=dict(data.get("canonical_agent_features", {})),
+            calibration_version=str(data.get("calibration_version", "unknown")),
+            inputs_hash=str(data.get("inputs_hash", "")),
+            producer_version=str(data.get("producer_version", SCORER_VERSION)),
+        )
+
+
+@dataclass
+class RuntimeScoreArtifact:
+    """Stage 4 runtime score artifact for one run."""
+
+    task_id: str
+    backend: str
+    adapter: str
+    model_id: str
+    seed: int | None
+    signals: dict[str, Any]
+    composite: float
+    calibration_version: str
+    inputs_hash: str
+    producer_version: str = SCORER_VERSION
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "task_id": self.task_id,
+            "backend": self.backend,
+            "adapter": self.adapter,
+            "model_id": self.model_id,
+            "seed": self.seed,
+            "signals": copy.deepcopy(self.signals),
+            "composite": self.composite,
+            "calibration_version": self.calibration_version,
+            "inputs_hash": self.inputs_hash,
+            "producer_version": self.producer_version,
+        }
diff --git a/scorer/config.py b/scorer/config.py
new file mode 100644
index 0000000..cce1c45
--- /dev/null
+++ b/scorer/config.py
@@ -0,0 +1,146 @@
+"""Scorer configuration and calibration defaults."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from .io import load_json
+
+
+SCORER_VERSION = "0.3.0"
+DEFAULT_CONFIG_PATH = Path(__file__).with_name("scorer_config.json")
+
+DIMENSION_NAMES = [
+    "optimal_path_length",
+    "search_space_size",
+    "backtracking_required",
+    "fragility",
+    "dependency_depth",
+    "dependency_variety",
+    "distractor_count",
+    "distractor_quality",
+    "grid_size",
+    "wall_density",
+    "partial_observability",
+    "irreversibility",
+]
+
+GREEDY_SOLVABILITY_FEATURE = "greedy_solvability"
+
+CANONICAL_AGENT_FEATURE_NAMES = [
+    GREEDY_SOLVABILITY_FEATURE,
+]
+
+DEFAULT_DISTRACTOR_TYPE_WEIGHTS = {
+    "wrong_color_key": 1.0,
+    "inactive_switch": 2.0,
+    "decoy_door": 2.0,
+    "distractor_chain": 3.0,
+}
+
+DEFAULT_RUNTIME_WEIGHTS = {
+    "step_ratio": 1.0,
+    "cell_overlap_bfs": 1.0,
+    "token_efficiency": 1.0,
+    "greedy_penalty": 0.5,
+}
+
+
+def _coerce_float_mapping(
+    values: dict[str, Any] | list[Any] | None,
+    names: list[str],
+    default: float = 1.0,
+) -> dict[str, float]:
+    """Normalize weights (mapping, positional list, or None) to ``{name: float}``.
+
+    Lists must match ``names`` in length; absent mapping keys fall back to ``default``."""
+    if values is None:
+        return dict.fromkeys(names, default)
+    if isinstance(values, list):
+        if len(values) != len(names):
+            raise ValueError(f"Expected {len(names)} weights, got {len(values)}")
+        return {name: float(weight) for name, weight in zip(names, values)}
+    return {name: float(values.get(name, default)) for name in names}
+
+
+@dataclass
+class ScorerConfig:
+    """Weights and runtime coefficients used by the standalone scorer."""
+
+    version: str = "default"
+    static_dimension_weights: dict[str, float] = field(
+        default_factory=lambda: {name: 1.0 for name in DIMENSION_NAMES}
+    )
+    distractor_type_weights: dict[str, float] = field(
+        default_factory=lambda: DEFAULT_DISTRACTOR_TYPE_WEIGHTS.copy()
+    )
+    runtime_weights: dict[str, float] = field(
+        default_factory=lambda: DEFAULT_RUNTIME_WEIGHTS.copy()
+    )
+    baseline_tokens: float = 1000.0
+    difficulty_max_static_score: float | None = None
+
+    @classmethod
+    def default(cls) -> "ScorerConfig":
+        return cls()
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "ScorerConfig":
+        static_weights = data.get("static_dimension_weights", data.get("static_weights"))
+        runtime_weights = data.get("runtime_weights")
+        distractor_weights = data.get("distractor_type_weights", data.get("distractor_weights"))
+
+        difficulty_max = data.get("difficulty_max_static_score")
+        return cls(
+            version=str(data.get("version", "default")),
+            static_dimension_weights=_coerce_float_mapping(static_weights, DIMENSION_NAMES),
+            distractor_type_weights={
+                **DEFAULT_DISTRACTOR_TYPE_WEIGHTS,
+                **{k: float(v) for k, v in (distractor_weights or {}).items()},
+            },
+            runtime_weights={
+                **DEFAULT_RUNTIME_WEIGHTS,
+                **{k: float(v) for k, v in (runtime_weights or {}).items()},
+            },
+            baseline_tokens=float(data.get("baseline_tokens", 1000.0)),
+            difficulty_max_static_score=(
+                None if difficulty_max is None else float(difficulty_max)
+            ),
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "version": self.version,
+            "static_dimension_weights": dict(self.static_dimension_weights),
+            "distractor_type_weights": dict(self.distractor_type_weights),
+            "runtime_weights": dict(self.runtime_weights),
+            "baseline_tokens": self.baseline_tokens,
+            "difficulty_max_static_score": self.difficulty_max_static_score,
+        }
+
+    def static_weight_list(self) -> list[float]:
+        return [self.static_dimension_weights.get(name, 1.0) for name in DIMENSION_NAMES]
+
+
+def load_scorer_config(path: str | Path | None = None) -> ScorerConfig:
+    """Load a scorer config from JSON or YAML; defaults are used only when no path is given and DEFAULT_CONFIG_PATH is absent."""
+    config_path = Path(path) if path is not None else DEFAULT_CONFIG_PATH
+    if not config_path.exists():
+        if path is not None:  # an explicitly requested file that is missing is an error, never a silent default
+            raise FileNotFoundError(f"Scorer config not found: {config_path}")
+        return ScorerConfig.default()
+    if config_path.suffix.lower() in {".yaml", ".yml"}:
+        try:
+            import yaml  # type: ignore
+        except ImportError as exc:
+            raise ImportError(
+                "YAML scorer configs require PyYAML. Use JSON or install PyYAML."
+            ) from exc
+        with open(config_path, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f) or {}  # an empty YAML document loads as None; treat it as an empty config
+        if not isinstance(data, dict):
+            raise ValueError(f"Expected a YAML object in {config_path}")
+        return ScorerConfig.from_dict(data)
+    return ScorerConfig.from_dict(load_json(config_path))  # every other suffix is parsed as JSON
diff --git a/scorer/io.py b/scorer/io.py
new file mode 100644
index 0000000..6d929f2
--- /dev/null
+++ b/scorer/io.py
@@ -0,0 +1,62 @@
+"""JSON and hash helpers for scorer artifacts."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from pathlib import Path
+from typing import Any
+
+from gridworld.task_spec import TaskSpecification
+
+
+def json_default(value: Any) -> Any:
+    if not hasattr(value, "item"):  # only objects exposing .item() are converted; everything else is rejected
+        raise TypeError(f"Object of type {value.__class__.__name__} is not JSON serializable")
+    return value.item()
+
+
+def load_json(path: str | Path) -> dict[str, Any]:
+    """Parse the UTF-8 JSON file at ``path``; raise ValueError unless the top level is an object."""
+    payload = json.loads(Path(path).read_text(encoding="utf-8"))
+    if isinstance(payload, dict):
+        return payload
+    raise ValueError(f"Expected a JSON object in {path}")
+
+
+def dump_json(path: str | Path, payload: dict[str, Any]) -> None:
+    output_path = Path(path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, indent=2, default=json_default)
+        f.write("\n")
+
+
+def json_files(paths: list[str]) -> list[Path]:
+    """Expand JSON files and directories into a stable file list.
+
+    Directories contribute their ``*.json`` files recursively in sorted order; any
+    other path is passed through unchanged, without an existence or suffix check.
+    """
+    expanded: list[Path] = []
+    for candidate in map(Path, paths):
+        expanded.extend(sorted(candidate.rglob("*.json")) if candidate.is_dir() else [candidate])
+    return expanded
+
+
+def stable_hash(payload: Any) -> str:
+    encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=json_default)
+    return hashlib.sha256(encoded.encode("utf-8")).hexdigest()
+
+
+def task_spec_from_payload(data: dict[str, Any]) -> TaskSpecification:
+    if "task_spec" in data and isinstance(data["task_spec"], dict):  # envelope form: build from the nested task dict
+        return TaskSpecification.from_dict(data["task_spec"])
+    if "TaskSpecification" in data and isinstance(data["TaskSpecification"], dict):
+        return TaskSpecification.from_dict(data)  # NOTE(review): passes the whole payload, not data["TaskSpecification"]; presumably from_dict unwraps this key - confirm
+    required_fields = {"task_id", "maze", "goal", "max_steps"}  # minimum keys for a bare (unwrapped) task payload
+    if not required_fields.issubset(data):
+        raise ValueError(
+            "Input JSON is not a task artifact. Expected task fields or a nested task_spec."
+        )
+    return TaskSpecification.from_dict(data)
diff --git a/scorer/runtime.py b/scorer/runtime.py
new file mode 100644
index 0000000..a1567ab
--- /dev/null
+++ b/scorer/runtime.py
@@ -0,0 +1,363 @@
+"""Runtime scoring for run and episode JSON artifacts."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from .artifacts import CanonicalPathReport, RuntimeScoreArtifact, StaticScoreArtifact
+from .config import SCORER_VERSION, ScorerConfig
+from .io import dump_json, load_json, stable_hash
+
+
+def _artifact_dict(value: dict[str, Any] | StaticScoreArtifact | CanonicalPathReport) -> dict[str, Any]:
+    if hasattr(value, "to_dict"):
+        return value.to_dict()  # type: ignore[no-any-return]
+    return value
+
+
+def _lookup_path(data: dict[str, Any], *keys: str) -> Any:
+    node: Any = data  # walk nested dicts key by key; None as soon as a level is missing or not a dict
+    for key in keys:
+        if not (isinstance(node, dict) and key in node):
+            return None
+        node = node[key]
+    return node
+
+
+def _extract_task_id(run: dict[str, Any], fallback: str = "") -> str:
+    return str(
+        run.get("task_id")
+        or _lookup_path(run, "task_spec", "task_id")
+        or _lookup_path(run, "episode", "task_id")
+        or fallback
+    )
+
+
+def _extract_bool(run: dict[str, Any], *keys: str, default: bool = False) -> bool:
+    """Return ``bool()`` of the first non-None ``run[key]`` among ``keys``, else ``default``."""
+    # Only None counts as "absent": an explicit False/0 under an earlier key wins over later keys.
+    present = (run.get(key) for key in keys)
+    found = next((value for value in present if value is not None), None)
+    return default if found is None else bool(found)
+
+
+def _extract_steps(run: dict[str, Any]) -> int | None:
+    """Return the run's step count from the first source that has one, else None.
+
+    Order: steps / steps_taken / steps_used, then signals.steps, then final_state.step_count."""
+    candidates = [run.get(key) for key in ("steps", "steps_taken", "steps_used")]
+    candidates.append(_lookup_path(run, "signals", "steps"))
+    candidates.append(_lookup_path(run, "final_state", "step_count"))
+    for value in candidates:
+        if value is not None:
+            return int(value)
+    return None
+
+
+def _extract_token_count(run: dict[str, Any]) -> int | None:
+    for key in ("total_tokens", "token_count", "tokens"):  # explicit top-level totals are tried first
+        if run.get(key) is not None:
+            return int(run[key])
+    signal_tokens = _lookup_path(run, "signals", "token_count")
+    if signal_tokens is not None:
+        return int(signal_tokens)
+
+    trajectory_total = _sum_record_tokens(run.get("trajectory", []))  # otherwise sum per-record counts
+    if trajectory_total is not None:
+        return trajectory_total
+    return _sum_record_tokens(run.get("transcript", []), kind="query")  # last resort: only kind == "query" records; may be None
+
+
+def _sum_record_tokens(records: Any, kind: str | None = None) -> int | None:
+    """Total the token counts of dict records, optionally only those whose ``"kind"`` equals ``kind``.
+
+    Returns None when ``records`` is not a list or no record yields a token count.
+    """
+    from interface.telemetry import token_count_from_record
+
+    if not isinstance(records, list):
+        return None
+    counts = [
+        token_count_from_record(item)
+        for item in records
+        if isinstance(item, dict) and (kind is None or item.get("kind") == kind)
+    ]
+    # Records without a usable count contribute nothing to the total.
+    found = [count for count in counts if count is not None]
+    return sum(found) if found else None
+
+
+def _state_position(state: Any) -> tuple[int, int] | None:
+    """Return ``(int, int)`` from a state dict's ``agent_position`` (or ``position``), else None."""
+    # "or" means an empty/falsy agent_position also falls through to "position".
+    raw = (state.get("agent_position") or state.get("position")) if isinstance(state, dict) else None
+    if not isinstance(raw, (list, tuple)) or len(raw) < 2:
+        return None
+    return int(raw[0]), int(raw[1])
+
+
+def _extract_run_positions(run: dict[str, Any]) -> list[tuple[int, int]]:
+    """Collect the agent's visited positions from a run payload, in order.
+
+    Reads ``initial_state``, each ``trajectory`` item's ``state``, each ``transcript``
+    item (``state`` for ``kind == "reset"``, otherwise ``state_after`` with a
+    ``position_after`` fallback), then ``final_state``. Consecutive duplicates are
+    collapsed so overlapping sources do not repeat a cell back to back.
+    """
+    candidates: list[tuple[int, int] | None] = [_state_position(run.get("initial_state"))]
+
+    for item in run.get("trajectory", []):
+        if isinstance(item, dict):
+            candidates.append(_state_position(item.get("state")))
+
+    for item in run.get("transcript", []):
+        if not isinstance(item, dict):
+            continue
+        if item.get("kind") == "reset":
+            candidates.append(_state_position(item.get("state")))
+            continue
+        pos = _state_position(item.get("state_after"))
+        if pos is None:
+            raw = item.get("position_after")
+            # Accept tuples as well as lists, matching _state_position: run dicts
+            # built in memory (not round-tripped through JSON) may carry tuples.
+            if isinstance(raw, (list, tuple)) and len(raw) >= 2:
+                pos = (int(raw[0]), int(raw[1]))
+        candidates.append(pos)
+
+    candidates.append(_state_position(run.get("final_state")))
+
+    deduped: list[tuple[int, int]] = []
+    for pos in candidates:
+        if pos is not None and (not deduped or deduped[-1] != pos):
+            deduped.append(pos)
+    return deduped
+
+
+def _extract_canonical_positions(
+    canonical_paths: dict[str, Any],
+    agent: str = "bfs",
+) -> list[tuple[int, int]]:
+    """Return ``agent``'s canonical positions as ``(int, int)`` tuples.
+
+    For ``"bfs"``, a payload lacking a ``bfs`` key is itself read as the BFS record."""
+    fallback = canonical_paths if agent == "bfs" else {}
+    record = canonical_paths.get(agent, fallback)
+    if not isinstance(record, dict):
+        return []
+    return [(int(p[0]), int(p[1])) for p in record.get("positions", []) if isinstance(p, (list, tuple)) and len(p) >= 2]
+
+
+def _cell_overlap(run_positions: list[tuple[int, int]], canonical_positions: list[tuple[int, int]]) -> float:
+    """Fraction of distinct canonical cells the run visited; 0.0 when there are no canonical cells."""
+    targets = set(canonical_positions)
+    visited = targets.intersection(run_positions)
+    return len(visited) / len(targets) if targets else 0.0
+
+
+def _extract_static_score(static_score: dict[str, Any]) -> float:
+    return float(static_score.get("static_score", static_score.get("composite", 0.0)))
+
+
+def _extract_greedy_solvability(static_score: dict[str, Any]) -> float:
+    value = _lookup_path(static_score, "canonical_agent_features", "greedy_solvability")
+    if value is None:
+        raise ValueError("Runtime scoring requires evaluated canonical_agent_features.greedy_solvability")
+    solvability = float(value)
+    if not 0.0 <= solvability <= 1.0:
+        raise ValueError("greedy_solvability must be between 0.0 and 1.0")
+    return solvability
+
+
+def _runtime_weighted_average(signals: dict[str, float], weights: dict[str, float]) -> float:
+    """Weighted mean of the three efficiency signals; a missing weight counts as 0.0."""
+    weighted_sum = total_weight = 0.0
+    for name in ("step_ratio", "cell_overlap_bfs", "token_efficiency"):
+        factor = float(weights.get(name, 0.0))
+        weighted_sum += signals[name] * factor
+        total_weight += factor
+    return weighted_sum / total_weight if total_weight else 0.0  # zero total weight yields 0.0
+
+
+def _first_present(*values: Any) -> Any:
+    """Return the first argument that is not None, or None if there is none.
+
+    Falsy values such as ``0`` or ``""`` are returned; only None is skipped."""
+    return next((value for value in values if value is not None), None)
+
+
+def compute_runtime_score(
+    run: dict[str, Any],
+    static_score: dict[str, Any] | StaticScoreArtifact,
+    canonical_paths: dict[str, Any] | CanonicalPathReport,
+    config: ScorerConfig | None = None,
+    difficulty_max_static_score: float | None = None,
+) -> RuntimeScoreArtifact:
+    """Compute the Stage 4 runtime score for one run JSON payload."""
+    scorer_config = config or ScorerConfig.default()
+    static_data = _artifact_dict(static_score)
+    canonical_data = _artifact_dict(canonical_paths)
+    if _lookup_path(static_data, "validation", "schema_valid") is False:
+        raise ValueError("Runtime scoring requires a schema-valid scored_static.json artifact")
+
+    task_id = _extract_task_id(run, fallback=str(static_data.get("task_id", "")))
+    success = _extract_bool(run, "success", default=bool(_lookup_path(run, "signals", "success") or False))
+    steps = _extract_steps(run)
+    token_count = _extract_token_count(run)
+    canonical_positions = _extract_canonical_positions(canonical_data)
+    greedy_positions = _extract_canonical_positions(canonical_data, agent="greedy")
+    run_positions = _extract_run_positions(run)
+
+    optimal_steps_value = _first_present(
+        _lookup_path(canonical_data, "bfs", "optimal_steps"),
+        canonical_data.get("optimal_steps"),
+        static_data.get("optimal_steps"),
+    )
+    if optimal_steps_value is None:
+        raise ValueError("Runtime scoring requires bfs.optimal_steps in canonical_paths.json")
+    optimal_steps = int(optimal_steps_value)
+    if steps is None:
+        raise ValueError("Runtime scoring requires step telemetry")
+    if steps < 0:
+        raise ValueError("steps must not be negative")
+    step_ratio = 0.0
+    if success and optimal_steps == 0:
+        step_ratio = 1.0 if steps == 0 else 0.0
+    elif success:
+        step_ratio = optimal_steps / max(float(steps), float(optimal_steps), 1.0)
+
+    cell_overlap_bfs = _cell_overlap(run_positions, canonical_positions)
+    cell_overlap_greedy = (
+        _cell_overlap(run_positions, greedy_positions)
+        if isinstance(canonical_data.get("greedy"), dict)
+        else None
+    )
+    if token_count is None:
+        raise ValueError("Runtime scoring requires positive token telemetry")
+    if token_count <= 0:
+        raise ValueError("token_count must be greater than zero")
+    token_efficiency = min(1.0, scorer_config.baseline_tokens / float(token_count))
+
+    static_composite = _extract_static_score(static_data)
+    normalizer = (
+        difficulty_max_static_score
+        if difficulty_max_static_score is not None
+        else scorer_config.difficulty_max_static_score
+    )
+    if normalizer is None:
+        raise ValueError(
+            "Runtime scoring requires difficulty_max_static_score from the task suite "
+            "or scorer config"
+        )
+    if normalizer <= 0:
+        raise ValueError("difficulty_max_static_score must be greater than zero")
+    if static_composite > normalizer:
+        raise ValueError("difficulty_max_static_score must be at least the task static score")
+    difficulty_weight = static_composite / normalizer
+    success_factor = 1.0 if success else 0.0
+    efficiency_signals = {
+        "step_ratio": step_ratio,
+        "cell_overlap_bfs": cell_overlap_bfs,
+        "token_efficiency": token_efficiency,
+    }
+    efficiency_factor = _runtime_weighted_average(
+        efficiency_signals,
+        scorer_config.runtime_weights,
+    )
+    greedy_solvability = _extract_greedy_solvability(static_data)
+    greedy_penalty = (
+        scorer_config.runtime_weights.get("greedy_penalty", 0.0)
+        * greedy_solvability
+        * success_factor
+    )
+    composite = max(0.0, success_factor * efficiency_factor * difficulty_weight - greedy_penalty)
+
+    signals: dict[str, Any] = {
+        "success": success,
+        "steps": steps,
+        "terminated": _extract_bool(run, "terminated", default=False),
+        "truncated": _extract_bool(run, "truncated", default=False),
+        "terminated_reason": run.get("terminated_reason") or run.get("end_reason") or ("success" if success else "unknown"),
+        "reward": run.get("reward", run.get("total_reward")),
+        "token_count": token_count,
+        "optimal_steps": optimal_steps,
+        "step_ratio": step_ratio,
+        "cell_overlap_bfs": cell_overlap_bfs,
+        "cell_overlap_greedy": cell_overlap_greedy,
+        "token_efficiency": token_efficiency,
+        "difficulty_weight": difficulty_weight,
+        "efficiency_factor": efficiency_factor,
+        "greedy_penalty": greedy_penalty,
+    }
+    for key in (
+        "distractor_interactions",
+        "irreversible_failures",
+        "path_choice",
+        "mechanism_interaction_order",
+        "failure_point",
+    ):
+        if run.get(key) is not None:
+            signals[key] = run[key]
+
+    inputs_hash = stable_hash(
+        {
+            "run": {
+                "task_id": task_id,
+                "backend": run.get("backend"),
+                "adapter": run.get("adapter", run.get("agent_or_model")),
+                "model_id": run.get("model_id", run.get("model_name", run.get("agent_or_model"))),
+                "seed": run.get("seed"),
+                "positions": run_positions,
+                "signals": signals,
+            },
+            "static_score": {
+                "task_id": static_data.get("task_id"),
+                "static_score": static_composite,
+                "greedy_solvability": greedy_solvability,
+            },
+            "canonical_paths": {
+                "bfs_positions": canonical_positions,
+                "greedy_positions": greedy_positions,
+                "optimal_steps": optimal_steps,
+            },
+            "config": scorer_config.to_dict(),
+            "scorer_version": SCORER_VERSION,
+        }
+    )
+
+    return RuntimeScoreArtifact(
+        task_id=task_id,
+        backend=str(run.get("backend", "")),
+        adapter=str(run.get("adapter", run.get("agent_or_model", ""))),
+        model_id=str(run.get("model_id", run.get("model_name", run.get("agent_or_model", "")))),
+        seed=int(run["seed"]) if run.get("seed") is not None else None,
+        signals=signals,
+        composite=composite,
+        calibration_version=scorer_config.version,
+        inputs_hash=inputs_hash,
+    )
+
+
+def score_runtime_file(
+    run_path: str | Path,
+    static_score_path: str | Path,
+    canonical_paths_path: str | Path,
+    output_path: str | Path | None = None,
+    config: ScorerConfig | None = None,
+    difficulty_max_static_score: float | None = None,
+) -> RuntimeScoreArtifact:
+    """Score one run from its JSON inputs and return the runtime score artifact.
+
+    When ``output_path`` is given, the score's dict form is also dumped there.
+    """
+    artifact = compute_runtime_score(
+        load_json(run_path),
+        static_score=load_json(static_score_path),
+        canonical_paths=load_json(canonical_paths_path),
+        config=config,
+        difficulty_max_static_score=difficulty_max_static_score,
+    )
+    if output_path is not None:
+        dump_json(output_path, artifact.to_dict())
+    return artifact
diff --git a/scorer/scorer_config.json b/scorer/scorer_config.json
new file mode 100644
index 0000000..fb7ed8f
--- /dev/null
+++ b/scorer/scorer_config.json
@@ -0,0 +1,31 @@
+{
+  "version": "default-v2",
+  "static_dimension_weights": {
+    "optimal_path_length": 1.0,
+    "search_space_size": 1.0,
+    "backtracking_required": 1.0,
+    "fragility": 1.0,
+    "dependency_depth": 1.0,
+    "dependency_variety": 1.0,
+    "distractor_count": 1.0,
+    "distractor_quality": 1.0,
+    "grid_size": 1.0,
+    "wall_density": 1.0,
+    "partial_observability": 1.0,
+    "irreversibility": 1.0
+  },
+  "distractor_type_weights": {
+    "wrong_color_key": 1.0,
+    "inactive_switch": 2.0,
+    "decoy_door": 2.0,
+    "distractor_chain": 3.0
+  },
+  "runtime_weights": {
+    "step_ratio": 1.0,
+    "cell_overlap_bfs": 1.0,
+    "token_efficiency": 1.0,
+    "greedy_penalty": 0.5
+  },
+  "baseline_tokens": 1000.0,
+  "difficulty_max_static_score": null
+}
diff --git a/scorer/scoring.py b/scorer/scoring.py
new file mode 100644
index 0000000..6d12100
--- /dev/null
+++ b/scorer/scoring.py
@@ -0,0 +1,45 @@
+"""Public scorer interface for static and runtime analysis."""
+
+from __future__ import annotations
+
+from .artifacts import (
+    CanonicalPathReport,
+    RuntimeScoreArtifact,
+    ScoredDifficulty,
+    StaticScoreArtifact,
+)
+from .config import (
+    CANONICAL_AGENT_FEATURE_NAMES,
+    DEFAULT_CONFIG_PATH,
+    DEFAULT_DISTRACTOR_TYPE_WEIGHTS,
+    DEFAULT_RUNTIME_WEIGHTS,
+    DIMENSION_NAMES,
+    SCORER_VERSION,
+    ScorerConfig,
+    load_scorer_config,
+)
+from .runtime import compute_runtime_score, score_runtime_file
+from .solvers import compute_canonical_paths, compute_greedy_solvability
+from .static import compute_12d_score, compute_static_score_artifact, score_task_file
+
+__all__ = [
+    "CANONICAL_AGENT_FEATURE_NAMES",
+    "DEFAULT_CONFIG_PATH",
+    "DEFAULT_DISTRACTOR_TYPE_WEIGHTS",
+    "DEFAULT_RUNTIME_WEIGHTS",
+    "DIMENSION_NAMES",
+    "SCORER_VERSION",
+    "CanonicalPathReport",
+    "RuntimeScoreArtifact",
+    "ScoredDifficulty",
+    "ScorerConfig",
+    "StaticScoreArtifact",
+    "compute_12d_score",
+    "compute_canonical_paths",
+    "compute_greedy_solvability",
+    "compute_runtime_score",
+    "compute_static_score_artifact",
+    "load_scorer_config",
+    "score_runtime_file",
+    "score_task_file",
+]
diff --git a/scorer/solvers.py b/scorer/solvers.py
new file mode 100644
index 0000000..5803fef
--- /dev/null
+++ b/scorer/solvers.py
@@ -0,0 +1,79 @@
+"""Canonical solver integration for scorer artifacts."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from gridworld.baselines import PlannedPath, plan_bfs_path, plan_greedy_path
+from gridworld.task_spec import TaskSpecification
+
+from .artifacts import CanonicalPathReport
+from .config import SCORER_VERSION
+from .io import stable_hash
+
+
+def _path_payload(path: PlannedPath) -> dict[str, Any]:
+    return {
+        "success": path.success,
+        "actions": list(path.action_labels),
+        "positions": [list(pos) for pos in path.positions],  # each position is copied into a plain list
+        "steps": len(path.action_labels),  # step count is the number of action labels
+    }
+
+
+def require_scorable_spec(spec: TaskSpecification) -> None:
+    """Raise ``ValueError`` listing every schema error when ``spec`` fails validation."""
+    is_valid, errors = spec.validate()
+    if is_valid:
+        return
+    raise ValueError(f"Task {spec.task_id!r} failed schema validation: {'; '.join(errors)}")
+
+
+def compute_canonical_paths(
+    spec: TaskSpecification,
+    bfs_path: PlannedPath | None = None,
+    greedy_path: PlannedPath | None = None,
+) -> CanonicalPathReport:
+    """Build the canonical-path report for ``spec``.
+
+    BFS results fill the top-level fields and the greedy trace is nested under
+    ``greedy``; precomputed plans are used as-is, missing ones are planned here.
+    """
+    require_scorable_spec(spec)
+    # Plan only what the caller did not supply (BFS first, then greedy).
+    bfs = plan_bfs_path(spec) if bfs_path is None else bfs_path
+    greedy = plan_greedy_path(spec) if greedy_path is None else greedy_path
+    bfs_actions = list(bfs.action_labels)
+    explored = bfs.states_explored
+
+    # Human-readable outcome; the explored count is omitted when it is falsy.
+    if bfs.success:
+        message = f"Solution found in {len(bfs_actions)} steps ({explored} states explored)"
+    elif explored:
+        message = f"No solution found ({explored} states explored, all reachable states checked)"
+    else:
+        message = "No solution found"
+
+    # The hash covers the task spec and the scorer version only.
+    digest = stable_hash({"task": spec.to_dict(), "scorer_version": SCORER_VERSION})
+    return CanonicalPathReport(
+        task_id=spec.task_id,
+        success=bfs.success,
+        actions=bfs_actions,
+        positions=list(bfs.positions),
+        optimal_steps=len(bfs_actions) if bfs.success else 0,
+        states_explored=explored,
+        message=message,
+        greedy=_path_payload(greedy),
+        inputs_hash=digest,
+    )
+
+
+def compute_greedy_solvability(
+    spec: TaskSpecification,
+    greedy_path: PlannedPath | None = None,
+) -> float:
+    """Score greedy solvability: 1.0 if the greedy plan succeeds, otherwise 0.0."""
+    # Plan only when the caller did not hand over a precomputed greedy path.
+    plan = plan_greedy_path(spec) if greedy_path is None else greedy_path
+    return 1.0 if plan.success else 0.0
diff --git a/scorer/static.py b/scorer/static.py
new file mode 100644
index 0000000..adac502
--- /dev/null
+++ b/scorer/static.py
@@ -0,0 +1,264 @@
+"""Static task scoring and Stage 2 artifact generation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from gridworld.baselines import PlannedPath, plan_bfs_path, plan_greedy_path
+from gridworld.task_spec import TaskSpecification
+from gridworld.task_validator import DifficultyReport, TaskValidator, compute_difficulty
+
+from .artifacts import ScoredDifficulty, StaticScoreArtifact
+from .config import (
+    DEFAULT_DISTRACTOR_TYPE_WEIGHTS,
+    DIMENSION_NAMES,
+    GREEDY_SOLVABILITY_FEATURE,
+    SCORER_VERSION,
+    ScorerConfig,
+)
+from .io import dump_json, load_json, stable_hash, task_spec_from_payload
+from .solvers import compute_canonical_paths, compute_greedy_solvability, require_scorable_spec
+
+
+def _count_backtracking(solution: list[tuple[int, int]] | None) -> float:
+    """Count how often the path re-enters a cell it has already visited.
+
+    Consecutive repeats of the same cell are skipped and never counted.
+    """
+    visited = set()
+    backtracks = 0
+    last = None
+    for cell in solution or ():
+        if cell != last:
+            backtracks += cell in visited  # bool adds as 0/1
+            visited.add(cell)
+            last = cell
+    return float(backtracks)
+
+
+def _dependency_variety(spec: TaskSpecification) -> float:
+    """Count distinct dependency kinds (chain step types, else mechanism families present)."""
+    chain = spec.dependency_chain
+    if chain is not None:
+        return float(len({step.type for step in chain.sequence}))
+
+    mech = spec.mechanisms
+    # Keys/doors and switches/gates only count when both halves of the pair exist.
+    families = (
+        mech.keys and mech.doors,
+        mech.switches and mech.gates,
+        mech.blocks,
+        mech.teleporters,
+        mech.hazards,
+    )
+    return float(sum(1 for present in families if present))
+
+
+def _distractor_quality(
+    spec: TaskSpecification,
+    distractor_type_weights: dict[str, float] | None = None,
+) -> float:
+    """Sum per-type weights over the task's distractors (unlisted types weigh 1.0; none gives 0.0)."""
+    table = distractor_type_weights or DEFAULT_DISTRACTOR_TYPE_WEIGHTS
+    distractors = spec.distractors or ()
+    return float(sum(table.get(item.type, 1.0) for item in distractors))
+
+
+def _partial_observability(spec: TaskSpecification) -> float:
+    """Map the observability rule to 0.0 (full), 1.0 (view_cone) or 2.0 (fog_of_war); other values give 0.0."""
+    return {"full": 0.0, "view_cone": 1.0, "fog_of_war": 2.0}.get(spec.rules.observability, 0.0)
+
+
+def _irreversibility(spec: TaskSpecification) -> float:
+    """Count irreversible elements: doors (when keys are consumed), one-shot switches, one-way teleporters."""
+    mech = spec.mechanisms
+    consumed_doors = len(mech.doors) if spec.rules.key_consumption else 0
+    one_shot = sum(1 for switch in mech.switches if switch.switch_type == "one_shot")
+    one_way = sum(1 for tp in mech.teleporters if not tp.bidirectional)
+    return float(consumed_doors + one_shot + one_way)
+
+
+def compute_12d_score(
+    spec: TaskSpecification,
+    solver_output: DifficultyReport | None = None,
+    weights: list[float] | None = None,
+    config: ScorerConfig | None = None,
+    validator: TaskValidator | None = None,
+    bfs_path: PlannedPath | None = None,
+) -> ScoredDifficulty:
+    """
+    Compute the 12-dimension static benchmark score as a weighted sum.
+
+    Keeps the old call shape while calibration and artifact generation live in
+    the standalone scorer package. ``weights``, when given, replaces the config's.
+    """
+    require_scorable_spec(spec)  # raises ValueError on a schema-invalid spec
+    scorer_config = config or ScorerConfig.default()
+    task_validator = validator or TaskValidator(spec)
+    if solver_output is None:
+        solver_output = compute_difficulty(spec, validator=task_validator)
+    if bfs_path is None:
+        bfs_path = plan_bfs_path(spec)
+
+    fragility = task_validator.compute_fragility()
+    # -1 maps to zero fragility. NOTE(review): a value of 0 would divide by zero - confirm it cannot occur.
+    fragility_value = 0.0 if fragility.min_steps_to_break == -1 else 1.0 / fragility.min_steps_to_break
+
+    width, height = spec.maze.dimensions
+    grid_size = float(width * height)
+    wall_density = float(len(spec.maze.walls) / grid_size) if grid_size else 0.0  # 0.0 for an empty grid
+
+    dimensions = [  # order must line up with DIMENSION_NAMES and the weight vector
+        float(len(bfs_path.action_labels) if bfs_path.success else 0),
+        float(bfs_path.states_explored),
+        _count_backtracking(bfs_path.positions),
+        fragility_value,
+        float(spec.dependency_chain.depth if spec.dependency_chain is not None else solver_output.dependency_depth),
+        _dependency_variety(spec),
+        float(len(spec.distractors or [])),
+        _distractor_quality(spec, scorer_config.distractor_type_weights),
+        grid_size,
+        wall_density,
+        _partial_observability(spec),
+        _irreversibility(spec),
+    ]
+
+    weight_vector = (
+        scorer_config.static_weight_list()
+        if weights is None
+        else [float(weight) for weight in weights]
+    )
+    if len(weight_vector) != len(dimensions):
+        raise ValueError(f"Expected {len(dimensions)} static weights, got {len(weight_vector)}")
+    composite = float(sum(d * w for d, w in zip(dimensions, weight_vector)))  # weighted sum of all dimensions
+    return ScoredDifficulty(
+        dimensions=dimensions,
+        dimension_names=DIMENSION_NAMES.copy(),
+        composite=composite,
+        weights=weight_vector,
+    )
+
+
+def compute_static_score_artifact(
+    spec: TaskSpecification,
+    config: ScorerConfig | None = None,
+    solver_output: DifficultyReport | None = None,
+    validator: TaskValidator | None = None,
+    validation_result: tuple[bool, list[tuple[int, int]] | None, str] | None = None,
+    bfs_path: PlannedPath | None = None,
+    greedy_path: PlannedPath | None = None,
+) -> StaticScoreArtifact:
+    """Compute the Stage 2 static score artifact for one task; raise ValueError if validator and BFS disagree."""
+    require_scorable_spec(spec)  # raises ValueError on a schema-invalid spec
+    scorer_config = config or ScorerConfig.default()
+    schema_valid, schema_errors = spec.validate()  # NOTE(review): second validate(); the guard above already rejects invalid specs
+    task_validator = validator or TaskValidator(spec)
+    if validation_result is None:
+        validation_result = task_validator.validate()
+    is_beatable, _, message = validation_result  # the middle element is not used here
+    if solver_output is None:
+        solver_output = compute_difficulty(
+            spec,
+            validator=task_validator,
+            validation_result=validation_result,
+        )
+    if bfs_path is None:
+        bfs_path = plan_bfs_path(spec)
+    if is_beatable != bfs_path.success:  # the two beatability verdicts must agree
+        raise ValueError(
+            "Task validator and canonical BFS disagree on beatability for "
+            f"{spec.task_id!r}"
+        )
+    score = compute_12d_score(
+        spec,
+        solver_output=solver_output,
+        config=scorer_config,
+        validator=task_validator,
+        bfs_path=bfs_path,
+    )
+
+    mechanism_necessity_violations: list[str] = []
+    distractor_safety_violations: list[str] = []
+    chain_ordering_valid = True
+    if schema_valid:  # the validator checks below run only for schema-valid specs
+        mechanism_necessity_violations = task_validator.validate_mechanism_necessity()
+        distractor_safety_violations = task_validator.validate_distractor_safety(
+            base_beatable=is_beatable
+        )
+        chain_ordering_valid = task_validator.validate_chain_ordering()
+
+    dimensions = score.dimensions_by_name
+    static_score_unweighted = float(sum(dimensions.values()))  # plain sum, no weights applied
+    inputs_hash = stable_hash(  # hash of task spec + scorer config + scorer version
+        {
+            "task": spec.to_dict(),
+            "config": scorer_config.to_dict(),
+            "scorer_version": SCORER_VERSION,
+        }
+    )
+
+    return StaticScoreArtifact(
+        task_id=spec.task_id,
+        is_beatable=is_beatable,
+        message=message,
+        dimensions=dimensions,
+        static_score_unweighted=static_score_unweighted,
+        static_score=score.composite,
+        weights=dict(scorer_config.static_dimension_weights),
+        validation={
+            "schema_valid": schema_valid,
+            "schema_errors": schema_errors,
+            "mechanism_necessity_violations": mechanism_necessity_violations,
+            "distractor_safety_violations": distractor_safety_violations,
+            "chain_ordering_valid": chain_ordering_valid,
+        },
+        canonical_agent_features={
+            GREEDY_SOLVABILITY_FEATURE: (
+                compute_greedy_solvability(spec, greedy_path=greedy_path)
+                if schema_valid
+                else None  # the feature is left unset when the schema is invalid
+            ),
+        },
+        calibration_version=scorer_config.version,
+        inputs_hash=inputs_hash,
+    )
+
+
+def score_task_file(
+    task_path: str | Path,
+    output_dir: str | Path | None = None,
+    config: ScorerConfig | None = None,
+):
+    """Score a task JSON file; write canonical_paths.json and scored_static.json when ``output_dir`` is set."""
+    spec = task_spec_from_payload(load_json(task_path))
+    require_scorable_spec(spec)  # raises ValueError on a schema-invalid spec
+    validator = TaskValidator(spec)
+    validation_result = validator.validate()
+    difficulty = compute_difficulty(
+        spec,
+        validator=validator,
+        validation_result=validation_result,
+    )
+    bfs_path = plan_bfs_path(spec)  # planned once here and passed to both artifact builders
+    greedy_path = plan_greedy_path(spec)
+    canonical_paths = compute_canonical_paths(
+        spec,
+        bfs_path=bfs_path,
+        greedy_path=greedy_path,
+    )
+    static_score = compute_static_score_artifact(
+        spec,
+        config=config,
+        solver_output=difficulty,
+        validator=validator,
+        validation_result=validation_result,
+        bfs_path=bfs_path,
+        greedy_path=greedy_path,
+    )
+
+    if output_dir is not None:
+        out = Path(output_dir)
+        out.mkdir(parents=True, exist_ok=True)  # create the output directory if it is missing
+        dump_json(out / "canonical_paths.json", canonical_paths.to_dict())
+        dump_json(out / "scored_static.json", static_score.to_dict())
+
+    return canonical_paths, static_score  # (CanonicalPathReport, StaticScoreArtifact)
diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py
new file mode 100644
index 0000000..9e55628
--- /dev/null
+++ b/scripts/run_pipeline.py
@@ -0,0 +1,603 @@
+"""Bare-bones run-pipeline orchestrator for MultiNet v2.0 (tests 1-3).
+
+Sequential, inspectable Stage 1->5 driver. No DAG runner. Writes the
+``artifacts/`` tree:
+
+    artifacts/
+      tasks/<task_id>/{canonical_paths.json, scored_static.json}
+      tasks/_suite.json
+      runs/<task_id>/<backend>/<model>/seed_<seed>/<condition>/{episode.json, run_inputs.json, run_score.json}
+      episode_runs.jsonl
+      reports/<run_set_id>/{scoring_calibration_summary,complexity_distance_summary,mechanism_ordering_pairs}.json
+
+Selection is data-driven via a **run-config** that maps each model to the task
+files it should run (plus its provider/params); the **manifest** is a separate
+task *catalog* that supplies per-task scoring metadata (experiment, condition,
+expected_mechanisms, test-2 route cells). Stage 3 uses the ``interface/`` runner
+(Stack A) with a live-model agent. Programmatic callers can inject any agent
+callable, e.g. a stub for testing.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Any, Callable, Iterable, Optional
+
+from prompting_experiments import CONDITION_SETS, iter_condition_configs
+from scorer import compute_runtime_score, load_scorer_config, score_task_file
+from scorer.config import SCORER_VERSION, ScorerConfig
+from scorer.io import stable_hash, task_spec_from_payload
+
+from pipeline import episode_metrics, reports
+
+# Bump when Stage-3 run production changes in a way that invalidates cached episodes.
+PIPELINE_VERSION = "0.1.0"
+
+Agent = Callable[[list[dict]], str]
+# A factory used by tests to supply stub agents: (model_name, model_cfg) -> (agent, label).
+AgentFactory = Callable[[str, dict[str, Any]], "tuple[Agent, str]"]
+
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+_DEFAULT_MANIFEST = _REPO_ROOT / "gridworld" / "fixtures" / "manifest.json"
+_EXPERIMENT_KEYWORDS = {"test1", "test2", "test3", "all"}
+
+
+def _sanitize(name: str) -> str:  # collapse runs of characters outside [A-Za-z0-9._-] into "_"
+    return re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_") or "model"  # an empty result falls back to "model"
+
+
+# --------------------------------------------------------------------------- #
+# Manifest catalog + task resolution
+# --------------------------------------------------------------------------- #
+def load_manifest(manifest_path: str | Path) -> list[dict[str, Any]]:  # rows from a JSON list or {'tasks': [...]}
+    data = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
+    rows = data.get("tasks") if isinstance(data, dict) else data  # absent "tasks" -> None -> ValueError, not KeyError
+    if not isinstance(rows, list):
+        raise ValueError("Manifest must be a list of task rows or {'tasks': [...]}.")
+    return rows
+
+
+def _resolve_path(source: str, manifest_path: Path) -> Optional[Path]:
+    """Return the resolved path of an existing ``source`` (tried vs cwd, manifest dir, repo root), else None."""
+    # ``base / source`` collapses to ``source`` when it is absolute, so absolute entries are
+    # resolved too and compare equal to the resolved keys built in ``resolve_task_rows``.
+    for base in (Path.cwd(), manifest_path.parent, _REPO_ROOT):
+        resolved = (base / source).resolve()
+        if resolved.exists():
+            return resolved
+    return None
+
+
+def _resolve_source(row: dict[str, Any], manifest_path: Path) -> Path:
+    resolved = _resolve_path(row["source"], manifest_path)
+    if resolved is None:  # no existing file was found for the row's source
+        raise FileNotFoundError(f"Task source not found for {row.get('task_id')}: {row['source']}")
+    return resolved
+
+
+def _synth_row(path: Path) -> dict[str, Any]:
+    """Build the row for a plain task file with no catalog entry; it runs as a test-1 nav task."""
+    return {
+        "task_id": path.stem,  # the file stem doubles as task_id and variant
+        "experiment": "test1",
+        "condition": "default",
+        "variant": path.stem,
+        "source": str(path),
+        "expected_mechanisms": [],
+        "notes": "Synthesized (not in manifest catalog).",
+    }
+
+
+def resolve_task_rows(
+    entries: Iterable[str],
+    catalog: list[dict[str, Any]],
+    manifest_path: Path,
+) -> list[dict[str, Any]]:
+    """Expand run-config task entries into manifest-style rows.
+
+    An entry is tried, in order, as an experiment keyword (``test1``/``test2``/
+    ``test3``/``all``), a catalog ``task_id``, then a path to a task ``.json``.
+    A path equal to a catalog source reuses the first matching catalog row (so
+    its metadata is kept); any other existing path becomes a synthesized test-1
+    row. Rows are de-duplicated by ``task_id``, first occurrence winning.
+    """
+    rows_by_id = {r["task_id"]: r for r in catalog}
+    # Catalog rows grouped by the resolved location of their source file.
+    rows_by_source: dict[Path, list[dict[str, Any]]] = {}
+    for r in catalog:
+        location = _resolve_path(r["source"], manifest_path)
+        if location is not None:
+            rows_by_source.setdefault(location, []).append(r)
+
+    unique: dict[str, dict[str, Any]] = {}
+    for entry in entries:
+        if entry in _EXPERIMENT_KEYWORDS:
+            # "all" selects the whole catalog; other keywords filter by experiment.
+            selected = catalog if entry == "all" else [r for r in catalog if r.get("experiment") == entry]
+            if not selected:
+                raise ValueError(f"No catalog tasks for experiment {entry!r}.")
+        elif entry in rows_by_id:
+            selected = [rows_by_id[entry]]
+        else:
+            location = _resolve_path(entry, manifest_path)
+            if location is None:
+                raise ValueError(
+                    f"Cannot resolve task entry {entry!r} (not an experiment keyword, catalog task_id, or file path)."
+                )
+            known = rows_by_source.get(location)
+            selected = [known[0] if known else _synth_row(location)]
+
+        # Insertion order is kept and the first row seen for a task_id wins.
+        for row in selected:
+            unique.setdefault(row["task_id"], row)
+
+    return list(unique.values())
+
+
+def _condition_configs(conditions: Optional[str]) -> list[tuple[str, ExperimentConfig]]:
+    """Return ``(name, config)`` pairs: one default pair when unset, else the named condition set."""
+    from interface.config import ExperimentConfig  # imported at call time
+
+    if conditions and conditions not in CONDITION_SETS:
+        raise ValueError(
+            f"Unknown --conditions {conditions!r}; available: {sorted(CONDITION_SETS)}."
+        )
+    base = ExperimentConfig()
+    return list(iter_condition_configs(conditions, base)) if conditions else [("default", base)]
+
+
+# --------------------------------------------------------------------------- #
+# Content-hash invalidation
+# --------------------------------------------------------------------------- #
+def _expected_static_hash(spec, config: ScorerConfig) -> str:
+    """Recompute the inputs_hash that scorer.static stamps on scored_static (task + config + scorer version)."""
+    payload = {"task": spec.to_dict(), "config": config.to_dict(), "scorer_version": SCORER_VERSION}
+    # The key set must stay identical to the scorer's recipe, otherwise the Stage 2 cache never hits.
+    return stable_hash(payload)
+
+
+def _expected_run_hash(spec, model_name: str, seed: int, backend: str) -> str:
+    """Hash the inputs that determine a Stage-3 episode.
+
+    Scorer config is left out (it invalidates run_score, not the model call). Pre-v1
+    the prompt/ExperimentConfig is left out too: prompts are not yet versioned while
+    we iterate, and the prompt variant still separates runs via the <condition> dir.
+    TODO(release): fold in backend_version + adapter/model code version so code
+    changes invalidate cached episodes at v1.
+    """
+    episode_inputs = {
+        "task": spec.to_dict(),
+        "model_id": model_name,
+        "seed": seed,
+        "backend": backend,
+        # Bumping PIPELINE_VERSION changes the hash, so cached episodes stop matching.
+        "pipeline_version": PIPELINE_VERSION,
+    }
+    return stable_hash(episode_inputs)
+
+
+# --------------------------------------------------------------------------- #
+# Stage 2 — static solve & score
+# --------------------------------------------------------------------------- #
+def score_tasks(
+    rows: list[dict[str, Any]],
+    manifest_path: Path,
+    artifacts_root: Path,
+    config: ScorerConfig,
+    force: bool = False,
+) -> dict[str, dict[str, Any]]:
+    """Run Stage 2 over every task; return ``task_id -> scored_static dict``.
+
+    A cached ``scored_static.json`` is reused only when ``canonical_paths.json``
+    also exists and its stored ``inputs_hash`` equals the hash recomputed from the
+    current task spec and scorer config. Otherwise the whole task bundle is
+    regenerated; ``force`` regenerates unconditionally.
+    """
+    results: dict[str, dict[str, Any]] = {}
+    for row in rows:
+        task_id = row["task_id"]
+        source = _resolve_source(row, manifest_path)
+        bundle_dir = artifacts_root / "tasks" / task_id
+        scored_file = bundle_dir / "scored_static.json"
+        # Stage 3 always reads canonical_paths.json, so the cache only counts
+        # when both files of the task bundle are on disk.
+        bundle_on_disk = scored_file.exists() and (bundle_dir / "canonical_paths.json").exists()
+        if bundle_on_disk and not force:
+            cached = json.loads(scored_file.read_text(encoding="utf-8"))
+            spec = task_spec_from_payload(json.loads(Path(source).read_text(encoding="utf-8")))
+            if cached.get("inputs_hash") == _expected_static_hash(spec, config):
+                results[task_id] = cached
+                continue
+        _, fresh = score_task_file(source, output_dir=bundle_dir, config=config)
+        results[task_id] = fresh.to_dict()
+    return results
+
+
+def _score_suite(
+    rows: list[dict[str, Any]],
+    manifest_path: Path,
+    artifacts_root: Path,
+    config: ScorerConfig,
+    force: bool,
+) -> tuple[dict[str, dict[str, Any]], float]:
+    """Run Stage 2 for the suite and record its summary in ``tasks/_suite.json``.
+
+    Returns the per-task scored_static dicts together with the largest static
+    score across them.
+    """
+    static_by_task = score_tasks(rows, manifest_path, artifacts_root, config, force=force)
+    # An empty suite falls back to a maximum of 1.0; a missing score counts as 0.0.
+    difficulty_max = max((float(s.get("static_score", 0.0)) for s in static_by_task.values()), default=1.0)
+    summary = {
+        "difficulty_max_static_score": difficulty_max,
+        "tasks": {task: scored.get("static_score") for task, scored in static_by_task.items()},
+    }
+    suite_path = artifacts_root / "tasks" / "_suite.json"
+    suite_path.parent.mkdir(parents=True, exist_ok=True)
+    suite_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    return static_by_task, difficulty_max
+
+
+# --------------------------------------------------------------------------- #
+# Stages 3-4 — runs + runtime score (per model)
+# --------------------------------------------------------------------------- #
+def _run_dir(artifacts_root: Path, task_id: str, model: str, seed: int, condition: str) -> Path:
+    return artifacts_root / "runs" / task_id / "minigrid" / model / f"seed_{seed}" / condition  # backend segment is fixed to "minigrid"
+
+
+def _run_one_model(
+    rows: list[dict[str, Any]],
+    agent: Agent,
+    model_name: str,
+    *,
+    manifest_path: Path,
+    artifacts_root: Path,
+    static_by_task: dict[str, dict[str, Any]],
+    difficulty_max: float,
+    config: ScorerConfig,
+    seeds: Iterable[int],
+    conditions: Optional[str],
+    force: bool,
+) -> tuple[list[dict[str, Any]], dict[tuple, Optional[float]]]:
+    """Run Stage 3 (episodes) and Stage 4 (runtime scoring) for one model.
+
+    Returns ``(run_rows, composites)``; ``composites`` is keyed by
+    ``(task_id, model_name, seed, manifest condition, prompt variant)``.
+    """
+    from pipeline.run_stage3 import run_episode
+
+    # Re-iterated for every task: freeze it so a one-shot iterator is not spent on the first.
+    seeds = tuple(seeds)
+    condition_configs = _condition_configs(conditions)
+    run_rows: list[dict[str, Any]] = []
+    composites: dict[tuple, Optional[float]] = {}
+
+    for row in rows:
+        task_id = row["task_id"]
+        scored_static = static_by_task[task_id]
+        # Tasks Stage 2 marks unbeatable are ineligible: skip the expensive
+        # Stage 3/4 work (model/API calls + scoring) entirely. The reports
+        # surface them via scoring_calibration_summary's ineligible_tasks.
+        if not scored_static.get("is_beatable", True):
+            continue
+        source = _resolve_source(row, manifest_path)
+        spec = task_spec_from_payload(json.loads(Path(source).read_text(encoding="utf-8")))
+        canonical = json.loads(
+            (artifacts_root / "tasks" / task_id / "canonical_paths.json").read_text(encoding="utf-8")
+        )
+
+        for seed in seeds:
+            for variant, cfg in condition_configs:
+                run_dir = _run_dir(artifacts_root, task_id, model_name, seed, variant)
+                episode_path = run_dir / "episode.json"
+                sidecar_path = run_dir / "run_inputs.json"
+                run_score_path = run_dir / "run_score.json"
+
+                # ``condition`` is the task-intrinsic axis (test-3 mechanism
+                # order, carried by the manifest); ``variant`` is the orthogonal
+                # prompt axis from --conditions. Keep them separate so prompt
+                # variants do not collapse onto the manifest condition.
+                manifest_row = dict(row)
+
+                # Stage 3 (expensive: model calls) is hash-cached. Reuse a cached
+                # episode only when its stamped run-inputs hash still matches.
+                expected_hash = _expected_run_hash(spec, model_name, seed, "minigrid")
+                reuse = (
+                    not force
+                    and episode_path.exists()
+                    and sidecar_path.exists()
+                    and json.loads(sidecar_path.read_text(encoding="utf-8")).get("inputs_hash")
+                    == expected_hash
+                )
+                if reuse:
+                    episode = json.loads(episode_path.read_text(encoding="utf-8"))
+                else:
+                    episode = run_episode(source, cfg, agent, seed, run_dir)
+                    sidecar = {
+                        "inputs_hash": expected_hash,
+                        "producer_version": PIPELINE_VERSION,
+                        "task_id": task_id,
+                        "model_id": model_name,
+                        "seed": seed,
+                        "backend": "minigrid",
+                        "condition": variant,
+                    }
+                    sidecar_path.write_text(json.dumps(sidecar, indent=2), encoding="utf-8")
+
+                # Derive the test-2/test-3 signals once and share them between the scorer-facing
+                # dict and the jsonl row (each call would otherwise re-walk the whole transcript).
+                metrics = episode_metrics.build_metrics(episode, canonical, manifest_row)
+
+                # Stage 4 is cheap + deterministic: always (re)score from the
+                # episode so scorer-config / static / canonical changes propagate.
+                enriched = episode_metrics.enrich_run_for_scoring(
+                    episode, manifest_row, agent_or_model=model_name, seed=seed, metrics=metrics
+                )
+                run_score = compute_runtime_score(
+                    enriched,
+                    static_score=scored_static,
+                    canonical_paths=canonical,
+                    config=config,
+                    difficulty_max_static_score=difficulty_max,
+                ).to_dict()
+                run_score_path.write_text(json.dumps(run_score, indent=2), encoding="utf-8")
+
+                run_rows.append(
+                    episode_metrics.build_run_row(
+                        episode,
+                        canonical,
+                        manifest_row,
+                        agent_or_model=model_name,
+                        seed=seed,
+                        raw_output_ref=str(episode_path.relative_to(artifacts_root)),
+                        metrics=metrics,
+                        prompt_variant=variant,
+                    )
+                )
+                composite_key = (task_id, model_name, seed, manifest_row.get("condition"), variant)
+                composites[composite_key] = run_score.get("composite")
+
+    return run_rows, composites
+
+
+def _write_aggregate(
+    run_rows: list[dict[str, Any]],
+    composites: dict[tuple, Optional[float]],
+    static_by_task: dict[str, dict[str, Any]],
+    metadata_rows: list[dict[str, Any]],
+    artifacts_root: Path,
+    run_set_id: str,
+) -> dict[str, Any]:
+    """Write episode_runs.jsonl plus suite-level and per-model report JSON; return the payloads."""
+    with (artifacts_root / "episode_runs.jsonl").open("w", encoding="utf-8") as sink:
+        sink.writelines(json.dumps(entry) + "\n" for entry in run_rows)
+
+    out_dir = artifacts_root / "reports" / run_set_id
+    out_dir.mkdir(parents=True, exist_ok=True)
+    payloads = {
+        "scoring_calibration_summary": reports.scoring_calibration_summary(
+            run_rows, composites, static_by_task
+        ),
+        "complexity_distance_summary": reports.complexity_distance_summary(run_rows),
+        "mechanism_ordering_pairs": reports.mechanism_ordering_pairs(run_rows, metadata_rows),
+    }
+    for report_name, body in payloads.items():
+        target = out_dir / f"{report_name}.json"
+        target.write_text(json.dumps(body, indent=2), encoding="utf-8")
+
+    # One machine-readable file per model, stored apart from the
+    # scorer-calibration ("tuning") artifacts written above.
+    per_model_dir = out_dir / "models"
+    per_model_dir.mkdir(parents=True, exist_ok=True)
+    model_ids = sorted({str(r.get("agent_or_model")) for r in run_rows})
+    model_reports: dict[str, Any] = {}
+    for model_id in model_ids:
+        report = reports.model_report(run_rows, composites, model_id, run_set_id)
+        report_path = per_model_dir / f"{_sanitize(model_id)}.json"
+        report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
+        model_reports[model_id] = report
+    payloads["model_reports"] = model_reports
+    return payloads
+
+
+# --------------------------------------------------------------------------- #
+# Entry points
+# --------------------------------------------------------------------------- #
+def run_pipeline(
+    *,
+    manifest_path: str | Path,
+    experiment: str,
+    agent: Agent,
+    agent_name: str,
+    seeds: Iterable[int] = (0,),
+    conditions: Optional[str] = None,
+    artifacts_root: str | Path = "artifacts",
+    run_set_id: str = "default",
+    scorer_config: Optional[ScorerConfig] = None,
+    force: bool = False,
+) -> dict[str, Any]:
+    """Single-model convenience entry: score, run and report one experiment with one agent."""
+    manifest_file = Path(manifest_path)
+    artifacts_dir = Path(artifacts_root)
+    scorer_cfg = scorer_config or load_scorer_config()
+
+    task_rows = resolve_task_rows([experiment], load_manifest(manifest_file), manifest_file)
+    # Static scoring comes first: the per-model stage consumes both of its results.
+    static_by_task, difficulty_max = _score_suite(task_rows, manifest_file, artifacts_dir, scorer_cfg, force)
+    run_rows, composites = _run_one_model(
+        task_rows,
+        agent,
+        _sanitize(agent_name),
+        manifest_path=manifest_file,
+        artifacts_root=artifacts_dir,
+        static_by_task=static_by_task,
+        difficulty_max=difficulty_max,
+        config=scorer_cfg,
+        seeds=seeds,
+        conditions=conditions,
+        force=force,
+    )
+    return _write_aggregate(run_rows, composites, static_by_task, task_rows, artifacts_dir, run_set_id)
+
+
+def run_from_config(
+    *,
+    run_config_path: str | Path,
+    manifest_path: str | Path = _DEFAULT_MANIFEST,
+    seeds: Iterable[int] = (0,),
+    conditions: Optional[str] = None,
+    artifacts_root: str | Path = "artifacts",
+    run_set_id: str = "default",
+    scorer_config: Optional[ScorerConfig] = None,
+    force: bool = False,
+    agent_factory: Optional[AgentFactory] = None,
+) -> dict[str, Any]:
+    """Run-config entry: each model runs its own task selection (model -> task files)."""
+    manifest_path = Path(manifest_path)
+    artifacts_root = Path(artifacts_root)
+    config = scorer_config or load_scorer_config()
+    factory = agent_factory or _build_agent_from_spec
+    seeds = tuple(seeds)  # re-iterated per model: a one-shot iterator would be spent on the first
+    run_config = load_run_config(run_config_path)
+    catalog = load_manifest(manifest_path)
+
+    # Resolve each model's task rows + build its agent.
+    plans: list[tuple[str, Agent, list[dict[str, Any]]]] = []
+    union: dict[str, dict[str, Any]] = {}
+    for name, model_cfg in run_config["models"].items():
+        entries = model_cfg.get("tasks") or model_cfg.get("runs") or []
+        if not entries:
+            raise ValueError(f"Model {name!r} lists no tasks/runs.")
+        rows = resolve_task_rows(entries, catalog, manifest_path)
+        agent, label = factory(name, model_cfg)
+        plans.append((_sanitize(label), agent, rows))
+        for r in rows:
+            union.setdefault(r["task_id"], r)
+
+    union_rows = list(union.values())
+    static_by_task, difficulty_max = _score_suite(union_rows, manifest_path, artifacts_root, config, force)
+
+    all_run_rows: list[dict[str, Any]] = []
+    composites: dict[tuple, Optional[float]] = {}
+    for model_name, agent, rows in plans:
+        rr, comp = _run_one_model(
+            rows,
+            agent,
+            model_name,
+            manifest_path=manifest_path,
+            artifacts_root=artifacts_root,
+            static_by_task=static_by_task,
+            difficulty_max=difficulty_max,
+            config=config,
+            seeds=seeds,
+            conditions=conditions,
+            force=force,
+        )
+        all_run_rows.extend(rr)
+        composites.update(comp)
+
+    return _write_aggregate(all_run_rows, composites, static_by_task, union_rows, artifacts_root, run_set_id)
+
+
+# --------------------------------------------------------------------------- #
+# Run-config + agent construction
+# --------------------------------------------------------------------------- #
+def load_run_config(path: str | Path) -> dict[str, Any]:
+    payload = json.loads(Path(path).read_text(encoding="utf-8"))  # the file is read as UTF-8 JSON
+    if not (isinstance(payload, dict) and isinstance(payload.get("models"), dict)):
+        raise ValueError("Run-config must be an object with a 'models' mapping.")
+    return payload
+
+
+def _build_agent_from_spec(name: str, model_cfg: dict[str, Any]) -> tuple[Agent, str]:
+    """Build the live agent described by one run-config model entry.
+
+    Returns ``(agent, label)``; the label is the configured model id, falling
+    back to the provider config's own ``model`` value.
+    """
+    provider = (model_cfg.get("provider") or "").lower()
+    model_id = model_cfg.get("model")
+    temperature = float(model_cfg.get("temperature", 0.0))
+    token_limit = model_cfg.get("max_tokens")
+    if provider not in ("claude", "qwen"):
+        raise ValueError(f"Model {name!r}: unknown provider {provider!r} (expected 'claude' or 'qwen').")
+
+    # Agent classes are imported only inside the branch that needs them.
+    if provider == "claude":
+        from interface.agents import ClaudeAnthropicAgent, ClaudeAnthropicConfig
+
+        claude_cfg = ClaudeAnthropicConfig(temperature=temperature)
+        if model_id:
+            claude_cfg.model = model_id
+        if token_limit:
+            claude_cfg.max_tokens = int(token_limit)
+        return ClaudeAnthropicAgent(config=claude_cfg), model_id or claude_cfg.model
+
+    from interface.agents import Qwen35VLAgent, Qwen35VLConfig
+
+    qwen_cfg = Qwen35VLConfig(temperature=temperature)
+    if model_id:
+        qwen_cfg.model = model_id
+    if token_limit:
+        qwen_cfg.max_new_tokens = int(token_limit)
+    # Optional loader settings are copied verbatim, but only when the entry sets them.
+    passthrough = ("device_map", "local_files_only", "trust_remote_code", "torch_dtype",
+                   "load_in_4bit", "attn_implementation", "max_memory", "enable_thinking")
+    for option in passthrough:
+        if option in model_cfg:
+            setattr(qwen_cfg, option, model_cfg[option])
+    return Qwen35VLAgent(config=qwen_cfg), model_id or qwen_cfg.model
+
+
+def main(argv: Optional[list[str]] = None) -> None:
+    """Command-line entry point for the run pipeline.
+
+    With ``--run-config`` every model in the config runs its own task
+    selection; otherwise ``--agent`` selects a single provider that runs
+    ``--experiment``. Prints a one-line summary when the run completes.
+    """
+    parser = argparse.ArgumentParser(description="MultiNet v2.0 bare-bones run pipeline (tests 1-3).")
+    parser.add_argument("--run-config", help="JSON run-config mapping models to task files (preferred).")
+    parser.add_argument("--manifest", default=str(_DEFAULT_MANIFEST), help="Task catalog (metadata).")
+    parser.add_argument("--seeds", type=int, nargs="+", default=[0])
+    parser.add_argument("--conditions", default=None, help="Prompt condition-set name (optional).")
+    parser.add_argument("--artifacts-root", default=str(_REPO_ROOT / "artifacts"))
+    parser.add_argument("--run-set-id", default="default")
+    parser.add_argument("--force", action="store_true", help="Recompute existing artifacts.")
+    # Single-model fallback (when --run-config is not supplied):
+    parser.add_argument("--experiment", choices=["test1", "test2", "test3", "all"], default="all")
+    parser.add_argument("--agent", choices=["claude", "qwen"], help="Single-model provider.")
+    opts = parser.parse_args(argv)
+
+    # Options common to both entry points, collected once so the two calls
+    # below cannot drift apart.
+    shared = dict(
+        manifest_path=opts.manifest,
+        seeds=opts.seeds,
+        conditions=opts.conditions,
+        artifacts_root=opts.artifacts_root,
+        run_set_id=opts.run_set_id,
+        force=opts.force,
+    )
+    if opts.run_config:
+        payloads = run_from_config(run_config_path=opts.run_config, **shared)
+    else:
+        if not opts.agent:
+            parser.error("provide --run-config, or --agent for a single-model run.")
+        # The provider name is the whole model spec here; nothing else is configured.
+        agent, label = _build_agent_from_spec(opts.agent, {"provider": opts.agent})
+        payloads = run_pipeline(experiment=opts.experiment, agent=agent, agent_name=label, **shared)
+
+    # Both entry points return the report payloads keyed by report name.
+    summary = payloads["scoring_calibration_summary"]
+    print(
+        f"Pipeline complete: {summary['run_count']} runs over {summary['task_count']} tasks "
+        f"-> {opts.artifacts_root}/reports/{opts.run_set_id}/"
+    )
+
+
+if __name__ == "__main__":  # script entry point; skipped when the module is imported
+    main()
diff --git a/scripts/score_json.py b/scripts/score_json.py
new file mode 100644
index 0000000..a39707c
--- /dev/null
+++ b/scripts/score_json.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""CLI for scoring task and run JSON artifacts."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+from scorer.io import dump_json, json_files, load_json
+from scorer.scoring import (
+    ScorerConfig,
+    compute_runtime_score,
+    load_scorer_config,
+    score_runtime_file,
+    score_task_file,
+)
+
+
+def _load_config(args: argparse.Namespace) -> ScorerConfig:  # scorer config shared by both sub-commands
+    return load_scorer_config(args.config)  # args.config stays None unless --config was passed
+
+
+def _static_target_dirs(files: list[Path], output_root: Path | None) -> list[Path]:
+    """Choose one output directory per task file, in the same order as *files*.
+
+    No root: ``<stem>_score`` beside each input. One file: the root itself.
+    Several files: ``<root>/<stem>``, raising ``ValueError`` when two stems collide.
+    """
+    from collections import Counter  # function-local so the file's import block is untouched
+    if output_root is None:
+        return [path.with_name(f"{path.stem}_score") for path in files]
+    if len(files) == 1:
+        return [output_root]
+    target_dirs = [output_root / path.stem for path in files]
+    # One counting pass replaces a list.count() scan per entry (quadratic in the input count).
+    duplicates = sorted(str(d) for d, hits in Counter(target_dirs).items() if hits > 1)
+    if duplicates:
+        raise ValueError(
+            "Static output directories collide for same-stem inputs: "
+            f"{', '.join(duplicates)}. Score those inputs separately or use distinct filenames."
+        )
+    return target_dirs
+
+
+def _default_runtime_output(run_path: str | Path) -> Path:
+    """Return ``<stem>_score.json`` in the same directory as *run_path*."""
+    return Path(run_path).with_name(f"{Path(run_path).stem}_score.json")
+
+
+def _static(args: argparse.Namespace) -> int:
+    """Handle the ``static`` sub-command: score each matched task JSON file.
+
+    A failing input is reported and skipped; the exit status is 1 if any failed.
+    """
+    config = _load_config(args)
+    files = json_files(args.inputs)
+    if not files:
+        raise FileNotFoundError("No JSON files matched the static scoring inputs")
+
+    output_root = Path(args.output_dir) if args.output_dir else None
+    ok_count = error_count = 0
+    for task_path, target_dir in zip(files, _static_target_dirs(files, output_root)):
+        try:
+            canonical, static_score = score_task_file(task_path, output_dir=target_dir, config=config)
+        except Exception as exc:
+            # Best effort: report the failing input on stderr and keep scoring the rest.
+            error_count += 1
+            print(
+                f"static: error input={task_path} output_dir={target_dir} "
+                f"error={exc.__class__.__name__}: {exc}",
+                file=sys.stderr,
+                flush=True,
+            )
+        else:
+            ok_count += 1
+            print(
+                f"static: ok input={task_path} task_id={static_score.task_id} "
+                f"static_score={static_score.static_score:.3f} "
+                f"beatable={static_score.is_beatable} "
+                f"optimal_steps={canonical.optimal_steps} output_dir={target_dir}",
+                flush=True,
+            )
+
+    total = len(files)
+    summary = f"static: summary scored={ok_count} failed={error_count} total={total}"
+    # The summary goes to stderr only when something failed.
+    stream = sys.stderr if error_count else sys.stdout
+    print(summary, file=stream, flush=True)
+    return 1 if error_count else 0
+
+
+def _runtime(args: argparse.Namespace) -> int:  # handler for the "runtime" sub-command; returns exit status 0
+    config = _load_config(args)
+    output_path = Path(args.output) if args.output else _default_runtime_output(args.run)  # default sits beside the run file
+    if (args.static_score is None) != (args.canonical_paths is None):  # exactly one of the pair was supplied
+        raise ValueError("--static-score and --canonical-paths must be provided together")
+    if (
+        args.difficulty_max_static_score is None
+        and config.difficulty_max_static_score is None
+    ):  # neither the CLI flag nor the scorer config supplies the suite maximum
+        raise ValueError(
+            "Runtime scoring needs a suite maximum. Pass --difficulty-max-static-score "
+            "or set difficulty_max_static_score in scorer config."
+        )
+
+    if args.static_score and args.canonical_paths:  # precomputed static artifacts were supplied
+        score = score_runtime_file(
+            args.run,
+            static_score_path=args.static_score,
+            canonical_paths_path=args.canonical_paths,
+            output_path=output_path,  # NOTE(review): presumably score_runtime_file writes this file itself - confirm
+            config=config,
+            difficulty_max_static_score=args.difficulty_max_static_score,
+        )
+    else:  # no precomputed artifacts: compute them from --task
+        if not args.task:
+            raise ValueError(
+                "Runtime scoring needs --static-score and --canonical-paths, "
+                "or --task so those artifacts can be computed."
+            )
+        canonical, static_score = score_task_file(
+            args.task,
+            output_dir=args.artifact_dir,  # None unless --artifact-dir was passed
+            config=config,
+        )
+        run = load_json(args.run)
+        score = compute_runtime_score(
+            run,
+            static_score=static_score,
+            canonical_paths=canonical,
+            config=config,
+            difficulty_max_static_score=args.difficulty_max_static_score,
+        )
+        dump_json(output_path, score.to_dict())  # this branch writes the score file explicitly
+
+    print(f"{score.task_id}: runtime_score={score.composite:.3f} -> {output_path}")
+    return 0
+
+
+def build_parser() -> argparse.ArgumentParser:  # builds the CLI: a global --config plus two sub-commands
+    parser = argparse.ArgumentParser(description="Score MultiNet task and run JSON artifacts.")
+    parser.add_argument(
+        "--config",
+        default=None,
+        help="Optional scorer config JSON/YAML path. Defaults to scorer/scorer_config.json.",
+    )
+
+    subparsers = parser.add_subparsers(dest="command", required=True)  # a sub-command is mandatory
+
+    static_parser = subparsers.add_parser(
+        "static",
+        help="Write canonical_paths.json and scored_static.json for task JSON files.",
+    )
+    static_parser.add_argument("inputs", nargs="+", help="Task JSON files or directories.")
+    static_parser.add_argument(
+        "--output-dir",
+        default=None,
+        help="Directory for score artifacts. Multiple inputs are written under per-file subdirectories.",
+    )
+    static_parser.set_defaults(func=_static)  # main() dispatches through args.func
+
+    runtime_parser = subparsers.add_parser(
+        "runtime",
+        help="Write run_score.json for one run/episode JSON artifact.",
+    )
+    runtime_parser.add_argument("run", help="Run or episode JSON file.")
+    runtime_parser.add_argument("--task", default=None, help="Task JSON file, used when static artifacts are omitted.")
+    runtime_parser.add_argument("--static-score", default=None, help="Existing scored_static.json path.")
+    runtime_parser.add_argument("--canonical-paths", default=None, help="Existing canonical_paths.json path.")
+    runtime_parser.add_argument(
+        "--artifact-dir",
+        default=None,
+        help="Optional directory to write computed static artifacts when --task is used.",
+    )
+    runtime_parser.add_argument("--output", default=None, help="Output run_score.json path.")
+    runtime_parser.add_argument(
+        "--difficulty-max-static-score",
+        type=float,
+        default=None,
+        help="Suite max static score for difficulty normalization. Required unless configured.",
+    )
+    runtime_parser.set_defaults(func=_runtime)  # main() dispatches through args.func
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse *argv*, run the selected sub-command handler, and return its exit status."""
+    args = build_parser().parse_args(argv)
+    return int(args.func(args))
+
+
+if __name__ == "__main__":  # script entry point; skipped when the module is imported
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/scripts/validate_fixtures.py b/scripts/validate_fixtures.py
new file mode 100644
index 0000000..8ef6280
--- /dev/null
+++ b/scripts/validate_fixtures.py
@@ -0,0 +1,175 @@
+"""Validate pipeline fixtures and derive test-2 route discriminators.
+
+Read-only with respect to task files; with ``--write`` it caches the computed
+``route_short_cells`` / ``route_long_cells`` back into the manifest so the
+runtime ``path_choice`` metric has unambiguous per-route cell sets.
+
+Checks:
+  * every fixture passes ``TaskSpecification.validate()`` and BFS-solves;
+  * test-2 rows: both the short route and the route forced by walling off
+    ``route_block`` are solvable and visit distinct cells;
+  * test-3 rows: members sharing a ``pair_id`` have identical maze topology
+    (dimensions + walls) and equal BFS optimal step counts.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from gridworld.baselines import plan_bfs_path
+from gridworld.task_spec import TaskSpecification
+
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+def _resolve(source: str, manifest_path: Path) -> Path:
+    """Locate *source*: an existing absolute path, else under the manifest dir, else the repo root."""
+    if Path(source).is_absolute() and Path(source).exists():
+        return Path(source)
+    roots = (manifest_path.parent, _REPO_ROOT)
+    found = next((p for p in ((root / source).resolve() for root in roots) if p.exists()), None)
+    if found is None:
+        raise FileNotFoundError(f"Task source not found: {source}")
+    return found
+
+
+def _load_spec(path: Path) -> TaskSpecification:  # thin wrapper: from_json is handed the path as a str
+    return TaskSpecification.from_json(str(path))
+
+
+def _spec_with_extra_wall(spec: TaskSpecification, cell: list[int]) -> TaskSpecification:
+    """Rebuild *spec* from its dict form with *cell* included among the maze walls."""
+    payload = spec.to_dict()
+    wall_cells = [list(wall) for wall in payload["maze"].get("walls", [])]
+    # A cell that is already a wall is not listed twice.
+    payload["maze"]["walls"] = wall_cells if list(cell) in wall_cells else [*wall_cells, list(cell)]
+    return TaskSpecification.from_dict(payload)
+
+
+def _validate_one(row: dict[str, Any], manifest_path: Path) -> list[str]:
+    """Return the error strings for one manifest row (empty when the fixture is sound)."""
+    source = _resolve(row["source"], manifest_path)
+    spec = _load_spec(source)
+    valid, messages = spec.validate()
+    if not valid:
+        # An invalid spec is reported as-is; the solvability check is not attempted.
+        return [f"{row['task_id']}: validate() failed: {messages}"]
+    # A valid spec must also be solvable by the BFS planner.
+    if plan_bfs_path(spec).success:
+        return []
+    return [f"{row['task_id']}: BFS could not solve the task"]
+
+
+def _derive_test2_routes(row: dict[str, Any], manifest_path: Path) -> list[str]:
+    """Derive, and store on *row*, the cells that tell the short route from the long one."""
+    spec = _load_spec(_resolve(row["source"], manifest_path))
+    direct = plan_bfs_path(spec)
+    if not direct.success:
+        return [f"{row['task_id']}: short route unsolvable"]
+
+    block = row.get("route_block")
+    if block is None:
+        # No explicit block: wall off the middle interior cell of the short route.
+        interior = list(direct.positions[1:-1])
+        if not interior:
+            return [f"{row['task_id']}: no interior cell to block; set route_block explicitly"]
+        block = list(interior[len(interior) // 2])
+
+    detour = plan_bfs_path(_spec_with_extra_wall(spec, block))
+    if not detour.success:
+        return [
+            f"{row['task_id']}: blocking {block} leaves no alternate route (pick a different route_block)"
+        ]
+
+    direct_cells = {tuple(p) for p in direct.positions}
+    detour_cells = {tuple(p) for p in detour.positions}
+    short_only = sorted(direct_cells - detour_cells)
+    long_only = sorted(detour_cells - direct_cells)
+    errors: list[str] = []
+    if not (short_only and long_only):
+        errors.append(f"{row['task_id']}: routes do not diverge enough to discriminate path_choice")
+
+    # Route-exclusive cells are stored on the row even when the divergence error was recorded.
+    row["route_block"] = list(block)
+    row["route_short_cells"] = [list(c) for c in short_only]
+    row["route_long_cells"] = [list(c) for c in long_only]
+    return errors
+
+
+def _check_test3_pairs(rows: list[dict[str, Any]], manifest_path: Path) -> list[str]:
+    """Check that test-3 rows sharing a ``pair_id`` agree on maze layout and BFS step count."""
+    problems: list[str] = []
+    by_pair: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    for row in rows:
+        if row.get("experiment") == "test3" and row.get("pair_id"):
+            by_pair[row["pair_id"]].append(row)
+    for pair_id, members in by_pair.items():
+        if len(members) < 2:
+            problems.append(f"pair {pair_id}: needs >= 2 members, found {len(members)}")
+            continue
+        specs = [_load_spec(_resolve(m["source"], manifest_path)) for m in members]
+        dims = {tuple(s.maze.dimensions) for s in specs}  # one distinct value <=> all members agree
+        walls = {frozenset((w.x, w.y) for w in s.maze.walls) for s in specs}
+        if len(dims) != 1:
+            problems.append(f"pair {pair_id}: maze dimensions differ: {dims}")
+        if len(walls) != 1:
+            problems.append(f"pair {pair_id}: wall layouts differ across members")
+        step_counts = []
+        for member, spec in zip(members, specs):
+            solution = plan_bfs_path(spec)
+            if solution.success:
+                step_counts.append(len(solution.action_labels))
+            else:
+                problems.append(f"{member['task_id']}: BFS could not solve the task")
+        if len(set(step_counts)) > 1:
+            problems.append(
+                f"pair {pair_id}: BFS optimal step counts differ {step_counts} "
+                "(test3 requires equal path length within a pair)"
+            )
+    return problems
+
+
+def validate_manifest(manifest_path: Path) -> tuple[dict[str, Any], list[str]]:
+    """Validate all manifest rows; return ``(manifest, errors)`` (test-2 rows gain route fields)."""
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    # A dict manifest keeps its rows under "tasks"; anything else is used as the rows directly.
+    task_rows = manifest["tasks"] if isinstance(manifest, dict) else manifest
+    problems: list[str] = []
+    for row in task_rows:
+        problems += _validate_one(row, manifest_path)
+        if row.get("experiment") == "test2":
+            problems += _derive_test2_routes(row, manifest_path)
+    return manifest, problems + _check_test3_pairs(task_rows, manifest_path)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Validate the manifest; return 1 if any check failed, else 0 (writing back on --write)."""
+    parser = argparse.ArgumentParser(description="Validate pipeline fixtures and derive test-2 routes.")
+    default_manifest = _REPO_ROOT / "gridworld" / "fixtures" / "manifest.json"
+    parser.add_argument("--manifest", default=str(default_manifest))
+    parser.add_argument("--write", action="store_true", help="Persist derived route cells to the manifest.")
+    options = parser.parse_args(argv)
+
+    manifest_path = Path(options.manifest)
+    data, errors = validate_manifest(manifest_path)
+    if errors:
+        print("Fixture validation FAILED:")
+        for message in errors:
+            print(f"  - {message}")
+        return 1
+
+    if not options.write:
+        print("Validated OK (use --write to cache test-2 route discriminators).")
+        return 0
+    # Reached only after a clean validation, so a failing manifest is never rewritten.
+    manifest_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
+    print(f"Validated OK; route discriminators written to {manifest_path}")
+    return 0
+
+
+if __name__ == "__main__":  # script entry point; skipped when the module is imported
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/test_episode_metrics.py b/tests/test_episode_metrics.py
new file mode 100644
index 0000000..b00fa52
--- /dev/null
+++ b/tests/test_episode_metrics.py
@@ -0,0 +1,163 @@
+"""Unit tests for Stage-3 instrumentation (pipeline.episode_metrics)."""
+
+from __future__ import annotations
+
+from pipeline import episode_metrics as em
+
+
+def _state(pos, *, keys=(), switches=(), doors=(), gates=(), reward=0.0):
+    return dict(  # one state snapshot; every sequence is copied into a fresh list
+        agent_position=list(pos),
+        collected_keys=list(keys),
+        active_switches=list(switches),
+        open_doors=list(doors),
+        open_gates=list(gates),
+        reward=reward,
+    )
+
+
+def _step(pos, event_type="MOVED", **state_kwargs):
+    return dict(kind="step", event_type=event_type, state_after=_state(pos, **state_kwargs))  # one transcript step entry
+
+
+def _episode(steps, *, success, end_reason, initial_pos=(1, 1)):
+    # Episode fixture: the final state is the last step's, or the initial one when there are no steps.
+    start = _state(initial_pos)
+    return dict(
+        success=success,
+        end_reason=end_reason,
+        steps_used=len(steps),
+        initial_state=start,
+        final_state=steps[-1]["state_after"] if steps else start,
+        transcript=[dict(kind="reset", state=start), *steps],
+    )
+
+
+# --------------------------------------------------------------------------- #
+# visited_cells: uses state_after.agent_position (x, y), collapses duplicates
+# --------------------------------------------------------------------------- #
+def test_visited_cells_uses_agent_position_and_dedupes():
+    # (2, 1) is reported by two consecutive steps but is expected only once.
+    positions = [(1, 1), (2, 1), (2, 1), (3, 1)]
+    steps = [_step(cell) for cell in positions]
+    episode = _episode(steps, success=True, end_reason="success")
+    visited = em.visited_cells(episode)
+    assert visited == [(1, 1), (2, 1), (3, 1)]
+
+
+# --------------------------------------------------------------------------- #
+# mechanism_interaction_order
+# --------------------------------------------------------------------------- #
+def test_mechanism_order_key_then_switch():
+    pickup = _step((2, 1), "PICKUP", keys=("kB",))
+    toggle = _step((2, 1), "TOGGLED", keys=("kB",), switches=("s1",), gates=("g1",))
+    episode = _episode([pickup, toggle], success=True, end_reason="success")
+
+    observed = em.mechanism_interaction_order(episode)
+
+    # The key comes first; within the toggle step the switch
+    # (active_switches) ranks before its downstream gate (open_gates).
+    expected = ["kB", "s1", "g1"]
+    assert observed == expected
+
+
+def test_mechanism_order_switch_then_key():
+    # Here the toggle happens before the pickup, so the key is expected last.
+    toggle = _step((2, 1), "TOGGLED", switches=("s1",), gates=("g1",))
+    pickup = _step((6, 1), "PICKUP", switches=("s1",), gates=("g1",), keys=("kB",))
+    episode = _episode([toggle, pickup], success=True, end_reason="success")
+
+    observed = em.mechanism_interaction_order(episode)
+
+    expected = ["s1", "g1", "kB"]
+    assert observed == expected
+
+
+def test_mechanism_order_navigation_only_is_empty():
+    episode = _episode([_step(cell) for cell in ((2, 1), (3, 1))], success=True, end_reason="success")
+    assert em.mechanism_interaction_order(episode) == []  # plain moves change no mechanism state
+
+
+# --------------------------------------------------------------------------- #
+# failure_point
+# --------------------------------------------------------------------------- #
+def test_failure_point_reports_first_missing_expected_mechanism():
+    # Only the key is picked up before the run ends, so "s1" is the first expected mechanism missing.
+    pickup = _step((2, 1), "PICKUP", keys=("kB",))
+    episode = _episode([pickup], success=False, end_reason="max_steps")
+    observed_order = em.mechanism_interaction_order(episode)
+
+    failure = em.failure_point(episode, ["kB", "s1"], observed_order)
+
+    assert failure["mechanism"] == "s1"
+    assert failure["end_reason"] == "max_steps"
+    assert failure["final_cell"] == [2, 1]
+
+
+def test_failure_point_none_on_success():
+    successful = _episode([_step((2, 1))], success=True, end_reason="success")
+    assert em.failure_point(successful, ["kB"], []) is None  # no failure point on a successful run
+
+
+# --------------------------------------------------------------------------- #
+# path_choice
+# --------------------------------------------------------------------------- #
+def test_path_choice_short_long_mixed_none():
+    short_cells = [[5, 1], [6, 1]]
+    long_cells = [[2, 5], [3, 5]]
+    cases = {
+        "short_mech": _episode([_step((5, 1)), _step((6, 1))], success=True, end_reason="success"),
+        "long_open": _episode([_step((2, 5)), _step((3, 5))], success=False, end_reason="max_steps"),
+        "mixed": _episode([_step((5, 1)), _step((2, 5))], success=False, end_reason="max_steps"),
+        "none": _episode([_step((9, 9))], success=False, end_reason="max_steps"),
+    }
+    for expected, episode in cases.items():
+        assert em.path_choice(episode, short_cells, long_cells) == expected
+    # With no route cell sets supplied, the result is None rather than a label.
+    assert em.path_choice(cases["short_mech"], None, None) is None
+
+
+# --------------------------------------------------------------------------- #
+# token accounting + run row
+# --------------------------------------------------------------------------- #
+def test_episode_token_count_sums_query_usage():
+    # The second query carries only input/output counts: 10 + (5 + 3) == 18.
+    transcript = [
+        {"kind": "query", "usage": {"total_tokens": 10}},
+        {"kind": "step"},
+        {"kind": "query", "usage": {"input_tokens": 5, "output_tokens": 3}},
+    ]
+    total = em.episode_token_count({"transcript": transcript})
+    assert total == 18
+
+
+def test_build_run_row_fields_and_optimality():
+    # Two steps against a BFS optimum of two; one query entry supplies the token count.
+    steps = [_step((2, 1), "PICKUP", keys=("kB",)), _step((3, 1))]
+    episode = _episode(steps, success=True, end_reason="success")
+    episode["transcript"].append({"kind": "query", "usage": {"total_tokens": 12}})
+    canonical = {"bfs": {"optimal_steps": 2}}
+    manifest_row = dict(
+        task_id="T_demo",
+        experiment="test3",
+        condition="key_first",
+        expected_mechanisms=["kB", "s1"],
+    )
+
+    row = em.build_run_row(
+        episode, canonical, manifest_row, agent_or_model="stub", seed=0, raw_output_ref="x/episode.json"
+    )
+
+    assert row["task_id"] == "T_demo"
+    assert row["experiment"] == "test3"
+    assert row["backend"] == "minigrid"
+    assert row["agent_or_model"] == "stub"
+    assert row["success"] is True
+    assert row["terminated"] is True
+    assert row["truncated"] is False
+    assert row["optimal_steps"] == 2
+    assert row["optimality_ratio"] == 1.0  # steps_used (2) == optimal (2)
+    assert row["mechanism_interaction_order"] == ["kB"]
+    assert row["failure_point"] is None
+    assert row["tokens"] == 12
+    assert row["raw_output_ref"] == "x/episode.json"
diff --git a/tests/test_import_isolation.py b/tests/test_import_isolation.py
new file mode 100644
index 0000000..c0bc0c4
--- /dev/null
+++ b/tests/test_import_isolation.py
@@ -0,0 +1,45 @@
+"""The solver/scorer path must not import the heavy interface stack.
+
+Each check runs in a fresh interpreter (subprocess) because the rest of the
+suite imports `interface`, which would pollute sys.modules within one process.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+def _pulls_interface(module: str) -> bool:
+    """Return True if importing ``module`` in a fresh interpreter loads ``interface``."""
+    code = (
+        f"import {module}, sys; "
+        "hit = [m for m in sys.modules if m == 'interface' or m.startswith('interface.')]; "
+        "print('IFACE' if hit else 'CLEAN')"
+    )
+    # Bounded so a module that hangs at import time fails this test instead of stalling the suite.
+    result = subprocess.run(
+        [sys.executable, "-c", code], capture_output=True, text=True,
+        cwd=str(_REPO_ROOT), timeout=120,
+    )
+    assert result.returncode == 0, result.stderr
+    return "IFACE" in result.stdout
+
+
+def test_scorer_import_is_interface_free():
+    assert not _pulls_interface("scorer"), "import scorer pulled in interface"
+
+
+def test_episode_metrics_import_is_interface_free():
+    assert not _pulls_interface("pipeline.episode_metrics"), (
+        "import pipeline.episode_metrics pulled in interface"
+    )
+
+
+def test_run_pipeline_import_is_interface_free():
+    assert not _pulls_interface("scripts.run_pipeline"), (
+        "import scripts.run_pipeline pulled in interface"
+    )
diff --git a/tests/test_interface_token_usage.py b/tests/test_interface_token_usage.py
new file mode 100644
index 0000000..a5919f1
--- /dev/null
+++ b/tests/test_interface_token_usage.py
@@ -0,0 +1,101 @@
+from interface.config import ExperimentConfig
+from interface.loader import default_maze_path, load_task
+from interface.runner import build_runner
+from interface.smoke_tests.plans import v01_empty_room_trajectory
+from interface.smoke_tests.smoke_llm import _AgentRecorder
+from interface.telemetry import normalize_token_usage
+
+
+class UsageReplayAgent:
+    def __init__(self):
+        self._actions = iter(v01_empty_room_trajectory())
+        self.last_usage = None
+
+    def __call__(self, messages):
+        self.last_usage = {
+            "input_tokens": 8,
+            "output_tokens": 2,
+            "total_tokens": 10,
+        }
+        return f"FINAL_OUTPUT: {next(self._actions)}"
+
+
+class FirstQueryUsageReplayAgent(UsageReplayAgent):  # sets last_usage on its first call only
+    def __init__(self):
+        super().__init__()
+        self._calls = 0  # number of __call__ invocations so far
+
+    def __call__(self, messages):
+        self._calls += 1
+        if self._calls == 1:  # later calls never assign last_usage, so this value lingers on the agent
+            self.last_usage = {
+                "input_tokens": 8,
+                "output_tokens": 2,
+                "total_tokens": 10,
+            }
+        return f"FINAL_OUTPUT: {next(self._actions)}"
+
+
+def test_normalized_usage_accepts_provider_token_keys():
+    assert normalize_token_usage({"input_tokens": 8, "output_tokens": 2}) == {
+        "input_tokens": 8,
+        "output_tokens": 2,
+        "total_tokens": 10,
+    }
+
+
+def test_agent_recorder_forwards_usage_metadata():
+    records = []
+    recorder = _AgentRecorder(UsageReplayAgent(), records)
+
+    recorder([])
+
+    assert recorder.last_usage == {
+        "input_tokens": 8,
+        "output_tokens": 2,
+        "total_tokens": 10,
+    }
+    assert records[0]["usage"]["total_tokens"] == 10
+
+
+def test_runner_persists_agent_usage_in_query_transcript():
+    maze_path = default_maze_path("V01_empty_room.json")
+    backend, spec = load_task(maze_path)
+    runner = build_runner(
+        ExperimentConfig(
+            observation="text_only",
+            context_window="current",
+            querying="step_by_step",
+            chat_history="stateless",
+        ),
+        backend,
+        spec,
+    )
+
+    result = runner.run(UsageReplayAgent(), verbose=False, maze_path=maze_path)
+    query_records = [item for item in result["transcript"] if item.get("kind") == "query"]
+
+    assert result["success"] is True
+    assert query_records
+    assert query_records[0]["usage"]["total_tokens"] == 10
+
+
+def test_runner_clears_stale_usage_between_queries():
+    maze_path = default_maze_path("V01_empty_room.json")
+    backend, spec = load_task(maze_path)
+    runner = build_runner(
+        ExperimentConfig(
+            observation="text_only",
+            context_window="current",
+            querying="step_by_step",
+            chat_history="stateless",
+        ),
+        backend,
+        spec,
+    )
+
+    result = runner.run(FirstQueryUsageReplayAgent(), verbose=False, maze_path=maze_path)
+    query_records = [item for item in result["transcript"] if item.get("kind") == "query"]
+
+    assert query_records[0]["usage"]["total_tokens"] == 10
+    assert "usage" not in query_records[1]
diff --git a/tests/test_reports.py b/tests/test_reports.py
new file mode 100644
index 0000000..f146880
--- /dev/null
+++ b/tests/test_reports.py
@@ -0,0 +1,153 @@
+"""Unit tests for Stage-5 report aggregators (pipeline.reports)."""
+
+from __future__ import annotations
+
+from pipeline import reports
+
+
+def _row(**kw):
+    """Return a baseline run row (successful "test1" run of task "t"); ``kw`` overrides/adds fields."""
+    defaults = {
+        "task_id": "t",
+        "experiment": "test1",
+        "condition": "default",
+        "prompt_variant": "default",
+        "agent_or_model": "m",
+        "seed": 0,
+        "success": True,
+        "optimality_ratio": 1.0,
+        "path_choice": None,
+        "mechanism_interaction_order": [],
+        "failure_point": None,
+    }
+    return {**defaults, **kw}
+
+
+def test_scoring_calibration_summary_groups_and_correlates():
+    rows = [
+        _row(task_id="a", success=True, optimality_ratio=1.0),
+        _row(task_id="b", success=False, optimality_ratio=0.0),
+    ]
+    composites = {
+        ("a", "m", 0, "default", "default"): 0.2,
+        ("b", "m", 0, "default", "default"): 0.8,
+    }
+    static_by_task = {
+        "a": {"static_score": 1.0, "dimensions_12": {"grid_size": 9.0, "optimal_path_length": 3.0}},
+        "b": {"static_score": 5.0, "dimensions_12": {"grid_size": 25.0, "optimal_path_length": 9.0}},
+    }
+    summary = reports.scoring_calibration_summary(rows, composites, static_by_task)
+
+    assert summary["run_count"] == 2
+    assert summary["task_count"] == 2
+    assert summary["success_rate_by_task"]["a"]["success_rate"] == 1.0
+    assert summary["success_rate_by_task"]["b"]["success_rate"] == 0.0
+    # Only successful runs feed optimality.
+    assert summary["optimality_ratio_mean"] == 1.0
+    # Two tasks with variance -> correlation defined for the populated dims.
+    assert summary["dimension_correlation"]["grid_size"] is not None
+    assert "p33" in summary["tier_boundary_candidates"]
+
+
+def test_prompt_variants_do_not_collide():
+    # Same task + manifest condition, two prompt variants: their composites and
+    # success must stay distinct (regression for the setdefault collapse bug).
+    rows = [
+        _row(task_id="a", prompt_variant="step_by_step", success=True),
+        _row(task_id="a", prompt_variant="bulk", success=False),
+    ]
+    composites = {  # keys mirror the row fields; they differ only in the last (prompt variant) slot
+        ("a", "m", 0, "default", "step_by_step"): 0.9,
+        ("a", "m", 0, "default", "bulk"): 0.1,
+    }
+    static_by_task = {"a": {"static_score": 1.0, "dimensions_12": {}}}
+    summary = reports.scoring_calibration_summary(rows, composites, static_by_task)
+
+    by_variant = summary["success_rate_by_prompt_variant"]
+    assert by_variant["step_by_step"]["success_rate"] == 1.0
+    assert by_variant["bulk"]["success_rate"] == 0.0
+    # Both rows are counted (neither variant overwrote the other); composite values are not asserted here.
+    assert summary["run_count"] == 2
+
+
+def test_complexity_distance_summary_counts_path_choice():
+    rows = [
+        _row(experiment="test2", task_id="T2", condition="shortcut", path_choice="short_mech", success=True),
+        _row(experiment="test2", task_id="T2", condition="shortcut", path_choice="long_open", success=False),
+        _row(experiment="test2", task_id="T2", condition="shortcut", path_choice="short_mech", success=True),
+    ]
+    summary = reports.complexity_distance_summary(rows)
+    assert summary["run_count"] == 3
+    assert summary["path_choice_overall"]["short_mech"] == 2
+    assert summary["path_choice_overall"]["long_open"] == 1
+    assert summary["success_rate_by_path_choice"]["short_mech"] == 1.0
+    assert summary["success_rate_by_path_choice"]["long_open"] == 0.0
+
+
+def test_mechanism_ordering_pairs_paired_delta():
+    rows = [
+        _row(experiment="test3", task_id="k", condition="key_first", success=True,
+             mechanism_interaction_order=["kB", "s1"]),
+        _row(experiment="test3", task_id="s", condition="switch_first", success=False,
+             mechanism_interaction_order=["s1"], failure_point={"mechanism": "kB"}),
+    ]
+    manifest = [
+        {"task_id": "k", "pair_id": "corridor", "expected_mechanisms": ["kB", "s1"]},
+        {"task_id": "s", "pair_id": "corridor", "expected_mechanisms": ["s1", "kB"]},
+    ]
+    summary = reports.mechanism_ordering_pairs(rows, manifest)
+    pair = summary["pairs"]["corridor"]
+    assert pair["conditions"]["key_first"]["success_rate"] == 1.0
+    assert pair["conditions"]["switch_first"]["success_rate"] == 0.0
+    assert pair["conditions"]["key_first"]["expected_order_match_rate"] == 1.0
+    assert pair["conditions"]["switch_first"]["failure_point_counts"]["kB"] == 1
+    # sorted conditions: ["key_first", "switch_first"] -> delta = 1.0 - 0.0
+    assert pair["paired_success_delta"]["success_delta"] == 1.0
+
+
+def test_model_report_aggregates_per_model():
+    rows = [
+        _row(task_id="a", agent_or_model="m1", experiment="test1",
+             success=True, optimality_ratio=1.0, steps=3, optimal_steps=3, tokens=10),
+        _row(task_id="b", agent_or_model="m1", experiment="test1",
+             success=False, optimality_ratio=0.0, steps=9, optimal_steps=3, tokens=20),
+        _row(task_id="a", agent_or_model="m2", experiment="test1",
+             success=True, optimality_ratio=0.5, steps=6, optimal_steps=3, tokens=5),
+    ]
+    composites = {
+        ("a", "m1", 0, "default", "default"): 0.4,
+        ("b", "m1", 0, "default", "default"): 0.0,
+        ("a", "m2", 0, "default", "default"): 0.2,
+    }
+
+    rep = reports.model_report(rows, composites, "m1", "rs")
+    assert rep["schema_version"] == "0.1.0"
+    assert rep["model_id"] == "m1"
+    assert rep["run_set_id"] == "rs"
+    assert rep["provisional"] is True
+    assert rep["run_count"] == 2
+    assert rep["task_count"] == 2
+    assert rep["overall"]["success_rate"] == 0.5
+    assert rep["overall"]["optimality_ratio_mean"] == 1.0  # successful runs only
+    assert rep["overall"]["tokens_total"] == 30.0
+    assert rep["overall"]["composite_mean"] == 0.2  # mean(0.4, 0.0)
+    assert "test1" in rep["by_experiment"]
+    assert "default" in rep["by_prompt_variant"]
+    assert len(rep["tasks"]) == 2
+    assert {t["task_id"] for t in rep["tasks"]} == {"a", "b"}
+
+    # A second model is fully independent (no collision).
+    rep2 = reports.model_report(rows, composites, "m2", "rs")
+    assert rep2["run_count"] == 1
+    assert rep2["overall"]["success_rate"] == 1.0
+
+
+def test_scoring_calibration_summary_lists_ineligible_tasks():
+    rows = [_row(task_id="a", success=True)]
+    composites = {("a", "m", 0, "default", "default"): 0.5}
+    static_by_task = {
+        "a": {"static_score": 1.0, "dimensions_12": {}, "is_beatable": True},
+        "dead": {"static_score": 0.0, "dimensions_12": {}, "is_beatable": False},
+    }
+    summary = reports.scoring_calibration_summary(rows, composites, static_by_task)
+    assert summary["ineligible_tasks"] == ["dead"]
diff --git a/tests/test_run_pipeline.py b/tests/test_run_pipeline.py
new file mode 100644
index 0000000..6af8e9e
--- /dev/null
+++ b/tests/test_run_pipeline.py
@@ -0,0 +1,379 @@
+"""End-to-end test for the bare-bones run pipeline using a replay stub agent.
+
+Runs are live-model-only in production, but the runner accepts any callable
+``messages -> str`` agent, so a deterministic replay stub exercises the full
+Stage 1->5 chain (real MiniGrid backend, episode log, and scorer) with no API.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from interface.loader import default_maze_path
+from interface.smoke_tests.plans import v01_empty_room_trajectory
+
+from scripts.run_pipeline import resolve_task_rows, run_from_config, run_pipeline
+
+_MANIFEST = Path(__file__).resolve().parents[1] / "gridworld" / "fixtures" / "manifest.json"
+
+
+class ReplayAgent:
+    """Replays a fixed action plan and reports token usage (scorer needs >0)."""
+
+    def __init__(self, actions):
+        self._actions = iter(actions)
+        self.last_usage = None
+
+    def __call__(self, messages):
+        self.last_usage = {"input_tokens": 8, "output_tokens": 2, "total_tokens": 10}
+        try:
+            action = next(self._actions)
+        except StopIteration:
+            action = "DONE"
+        return f"FINAL_OUTPUT: {action}"
+
+
+def _write_manifest(tmp_path: Path) -> Path:
+    manifest = {
+        "tasks": [
+            {
+                "task_id": "validation_10_v01_empty_room",
+                "experiment": "test1",
+                "condition": "default",
+                "variant": "empty_room",
+                "source": str(default_maze_path("V01_empty_room.json")),
+                "expected_mechanisms": [],
+                "notes": "E2E smoke task.",
+            }
+        ]
+    }
+    path = tmp_path / "manifest.json"
+    path.write_text(json.dumps(manifest), encoding="utf-8")
+    return path
+
+
+def test_pipeline_writes_full_artifact_tree(tmp_path):  # one stub run must leave every expected artifact
+    manifest_path = _write_manifest(tmp_path)
+    artifacts = tmp_path / "artifacts"
+
+    payloads = run_pipeline(
+        manifest_path=manifest_path,
+        experiment="test1",
+        agent=ReplayAgent(v01_empty_room_trajectory()),
+        agent_name="replay-stub",
+        seeds=[0],
+        conditions=None,
+        artifacts_root=artifacts,
+        run_set_id="smoke",
+    )
+
+    task_id = "validation_10_v01_empty_room"
+    task_dir = artifacts / "tasks" / task_id
+    assert (task_dir / "canonical_paths.json").exists()
+    assert (task_dir / "scored_static.json").exists()
+    assert (artifacts / "tasks" / "_suite.json").exists()
+
+    run_dir = artifacts / "runs" / task_id / "minigrid" / "replay-stub" / "seed_0" / "default"
+    assert (run_dir / "episode.json").exists()
+    # Read artifacts as UTF-8 explicitly (as the manifest is written), not with the locale default.
+    run_score = json.loads((run_dir / "run_score.json").read_text(encoding="utf-8"))
+    assert "composite" in run_score
+    assert run_score["signals"]["success"] is True
+    jsonl = (artifacts / "episode_runs.jsonl").read_text(encoding="utf-8").strip().splitlines()
+    assert len(jsonl) == 1
+    row = json.loads(jsonl[0])
+    for field in (
+        "task_id", "experiment", "condition", "prompt_variant", "backend",
+        "agent_or_model", "seed", "success", "terminated", "truncated", "reward",
+        "steps", "optimal_steps", "optimality_ratio", "path_choice",
+        "mechanism_interaction_order", "failure_point", "tokens", "raw_output_ref",
+    ):
+        assert field in row, f"missing episode_runs field: {field}"
+    assert row["prompt_variant"] == "default"
+    assert row["tokens"] and row["tokens"] > 0
+
+    report_dir = artifacts / "reports" / "smoke"
+    for name in (
+        "scoring_calibration_summary",
+        "complexity_distance_summary",
+        "mechanism_ordering_pairs",
+    ):
+        assert (report_dir / f"{name}.json").exists()
+    assert payloads["scoring_calibration_summary"]["run_count"] == 1
+
+
+# --------------------------------------------------------------------------- #
+# Task resolution (run-config entries -> catalog rows with metadata)
+# --------------------------------------------------------------------------- #
+def _catalog():  # "tasks" rows of the fixture manifest at _MANIFEST, decoded as UTF-8 explicitly
+    return json.loads(_MANIFEST.read_text(encoding="utf-8"))["tasks"]
+
+
+def test_resolve_experiment_keyword_expands_from_catalog():
+    rows = resolve_task_rows(["test3"], _catalog(), _MANIFEST)
+    assert rows and all(r["experiment"] == "test3" for r in rows)
+    assert {"T3_corr_key_first", "T3_corr_switch_first"} <= {r["task_id"] for r in rows}
+
+
+def test_resolve_task_file_attaches_catalog_metadata():
+    rows = resolve_task_rows(
+        ["gridworld/fixtures/test3/T3_corr_key_first.json"], _catalog(), _MANIFEST
+    )
+    assert len(rows) == 1
+    assert rows[0]["task_id"] == "T3_corr_key_first"
+    assert rows[0]["expected_mechanisms"] == ["kB", "s1"]
+    assert rows[0]["pair_id"] == "corridor"
+
+
+def test_resolve_unknown_file_synthesizes_test1_row(tmp_path):
+    task_file = str(default_maze_path("V01_empty_room.json"))
+    rows = resolve_task_rows([task_file], _catalog(), _MANIFEST)
+    # NOTE(review): despite the test name, V01 is in the catalog by path, so it keeps its catalog task_id; no row is synthesized here.
+    assert rows[0]["task_id"] == "validation_10_v01_empty_room"
+
+
+# --------------------------------------------------------------------------- #
+# Config-driven multi-model run (stub agent factory, no API)
+# --------------------------------------------------------------------------- #
+def test_run_from_config_drives_per_model_tasks(tmp_path):
+    run_config = {
+        "models": {
+            "stub": {
+                "provider": "claude",
+                "model": "stub-model",
+                "tasks": [str(default_maze_path("V01_empty_room.json"))],
+            }
+        }
+    }
+    cfg_path = tmp_path / "run_config.json"
+    cfg_path.write_text(json.dumps(run_config), encoding="utf-8")
+    artifacts = tmp_path / "artifacts"
+
+    def factory(name, model_cfg):
+        return ReplayAgent(v01_empty_room_trajectory()), model_cfg["model"]
+
+    payloads = run_from_config(
+        run_config_path=cfg_path,
+        manifest_path=_MANIFEST,
+        seeds=[0],
+        artifacts_root=artifacts,
+        run_set_id="cfg",
+        agent_factory=factory,
+    )
+
+    run_dir = (
+        artifacts / "runs" / "validation_10_v01_empty_room" / "minigrid" / "stub-model" / "seed_0" / "default"
+    )
+    assert (run_dir / "episode.json").exists()
+    assert (run_dir / "run_score.json").exists()
+    assert payloads["scoring_calibration_summary"]["run_count"] == 1
+
+
+# --------------------------------------------------------------------------- #
+# Content-hash invalidation
+# --------------------------------------------------------------------------- #
+import itertools
+import shutil
+
+from scorer import load_scorer_config, score_task_file
+from scorer.io import load_json, task_spec_from_payload
+from scripts.run_pipeline import _expected_static_hash
+
+
+class CountingReplayAgent:
+    """Cycles a fixed plan (one full pass per episode) and counts model calls."""
+
+    def __init__(self, actions):
+        self._actions = itertools.cycle(actions)
+        self.last_usage = None
+        self.calls = 0
+
+    def __call__(self, messages):
+        self.calls += 1
+        self.last_usage = {"input_tokens": 8, "output_tokens": 2, "total_tokens": 10}
+        return f"FINAL_OUTPUT: {next(self._actions)}"
+
+
+def _single_task_manifest(tmp_path, source):
+    manifest = {"tasks": [{
+        "task_id": "copy_v01", "experiment": "test1", "condition": "default",
+        "variant": "copy", "source": str(source), "expected_mechanisms": [],
+    }]}
+    path = tmp_path / "manifest.json"
+    path.write_text(json.dumps(manifest), encoding="utf-8")
+    return path
+
+
+def test_expected_static_hash_matches_scorer(tmp_path):
+    source = default_maze_path("V06_chain_ks.json")
+    cfg = load_scorer_config()
+    _, static = score_task_file(source, output_dir=tmp_path / "t", config=cfg)
+    spec = task_spec_from_payload(load_json(source))
+    assert _expected_static_hash(spec, cfg) == static.to_dict()["inputs_hash"]
+
+
+def test_canonical_paths_carry_inputs_hash(tmp_path):
+    source = default_maze_path("V06_chain_ks.json")
+    score_task_file(source, output_dir=tmp_path / "t")
+    canonical = load_json(tmp_path / "t" / "canonical_paths.json")
+    assert canonical.get("inputs_hash")
+
+
+def test_unchanged_rerun_reuses_episode_and_static(tmp_path):
+    task_file = tmp_path / "task.json"
+    shutil.copy(default_maze_path("V01_empty_room.json"), task_file)
+    manifest = _single_task_manifest(tmp_path, task_file)
+    artifacts = tmp_path / "artifacts"
+    agent = CountingReplayAgent(v01_empty_room_trajectory())
+
+    run_pipeline(manifest_path=manifest, experiment="test1", agent=agent,
+                 agent_name="stub", artifacts_root=artifacts, run_set_id="r")
+    calls_after_first = agent.calls
+    assert calls_after_first > 0
+
+    # Second identical run: episode cache hit -> agent not called again.
+    run_pipeline(manifest_path=manifest, experiment="test1", agent=agent,
+                 agent_name="stub", artifacts_root=artifacts, run_set_id="r")
+    assert agent.calls == calls_after_first
+
+
+def test_task_edit_invalidates_static_and_episode(tmp_path):  # editing the task file must defeat both caches
+    task_file = tmp_path / "task.json"
+    shutil.copy(default_maze_path("V01_empty_room.json"), task_file)
+    manifest = _single_task_manifest(tmp_path, task_file)
+    artifacts = tmp_path / "artifacts"
+    agent = CountingReplayAgent(v01_empty_room_trajectory())
+
+    run_pipeline(manifest_path=manifest, experiment="test1", agent=agent,
+                 agent_name="stub", artifacts_root=artifacts, run_set_id="r")
+    first_calls = agent.calls
+    first_static_hash = load_json(artifacts / "tasks" / "copy_v01" / "scored_static.json")["inputs_hash"]
+
+    # Mutate the task spec -> both static and run hashes must change.
+    data = json.loads(task_file.read_text(encoding="utf-8"))  # same encoding as the write below
+    data["max_steps"] = data["max_steps"] + 5
+    task_file.write_text(json.dumps(data), encoding="utf-8")
+
+    run_pipeline(manifest_path=manifest, experiment="test1", agent=agent,
+                 agent_name="stub", artifacts_root=artifacts, run_set_id="r")
+    new_static_hash = load_json(artifacts / "tasks" / "copy_v01" / "scored_static.json")["inputs_hash"]
+    assert new_static_hash != first_static_hash  # Stage 2 recomputed
+    assert agent.calls > first_calls             # Stage 3 episode re-run
+
+
+def test_scorer_config_change_rescore_without_rerunning_model(tmp_path):
+    task_file = tmp_path / "task.json"
+    shutil.copy(default_maze_path("V01_empty_room.json"), task_file)
+    manifest = _single_task_manifest(tmp_path, task_file)
+    artifacts = tmp_path / "artifacts"
+    agent = CountingReplayAgent(v01_empty_room_trajectory())
+
+    # Small baselines (below the run's token count) so token_efficiency stays < 1
+    # and actually moves with the config.
+    cfg_a = load_scorer_config()
+    cfg_a.baseline_tokens = 1.0
+    run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, agent_name="stub",
+                 artifacts_root=artifacts, run_set_id="r", scorer_config=cfg_a)
+    calls_after_first = agent.calls
+    run_dir = artifacts / "runs" / "copy_v01" / "minigrid" / "stub" / "seed_0" / "default"
+    eff_a = load_json(run_dir / "run_score.json")["signals"]["token_efficiency"]
+
+    cfg_b = load_scorer_config()
+    cfg_b.baseline_tokens = 5.0
+    run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, agent_name="stub",
+                 artifacts_root=artifacts, run_set_id="r", scorer_config=cfg_b)
+
+    # Episode reused (model not re-called) but run_score reflects the new config.
+    assert agent.calls == calls_after_first
+    eff_b = load_json(run_dir / "run_score.json")["signals"]["token_efficiency"]
+    assert eff_b != eff_a
+
+
+# --------------------------------------------------------------------------- #
+# Prompt variants are an axis distinct from the manifest condition
+# --------------------------------------------------------------------------- #
+def test_pipeline_keeps_prompt_variants_distinct(tmp_path):
+    # Two prompt variants over one task must produce two distinct runs that do
+    # not collapse onto the manifest condition (regression for the setdefault bug).
+    manifest_path = _write_manifest(tmp_path)
+    artifacts = tmp_path / "artifacts"
+
+    payloads = run_pipeline(
+        manifest_path=manifest_path,
+        experiment="test1",
+        agent=CountingReplayAgent(v01_empty_room_trajectory()),
+        agent_name="replay-stub",
+        seeds=[0],
+        conditions="Prompt",  # implemented variants: standard, verbose
+        artifacts_root=artifacts,
+        run_set_id="variants",
+    )
+
+    task_id = "validation_10_v01_empty_room"
+    base = artifacts / "runs" / task_id / "minigrid" / "replay-stub" / "seed_0"
+    assert (base / "standard" / "episode.json").exists()
+    assert (base / "verbose" / "episode.json").exists()
+
+    rows = [  # one JSON object per line; decoded as UTF-8 explicitly rather than the locale default
+        json.loads(line)
+        for line in (artifacts / "episode_runs.jsonl").read_text(encoding="utf-8").strip().splitlines()
+    ]
+    assert {r["prompt_variant"] for r in rows} == {"standard", "verbose"}
+    # Same task-intrinsic condition, distinct prompt variants -> distinct rows.
+    assert all(r["condition"] == "default" for r in rows)
+    summary = payloads["scoring_calibration_summary"]
+    assert summary["run_count"] == 2
+    assert set(summary["success_rate_by_prompt_variant"]) == {"standard", "verbose"}
+
+
+def test_pipeline_writes_per_model_report(tmp_path):  # a run must emit reports/<run_set>/models/<agent>.json
+    manifest_path = _write_manifest(tmp_path)
+    artifacts = tmp_path / "artifacts"
+
+    payloads = run_pipeline(
+        manifest_path=manifest_path,
+        experiment="test1",
+        agent=ReplayAgent(v01_empty_room_trajectory()),
+        agent_name="replay-stub",
+        seeds=[0],
+        artifacts_root=artifacts,
+        run_set_id="smoke",
+    )
+
+    report_path = artifacts / "reports" / "smoke" / "models" / "replay-stub.json"
+    assert report_path.exists()
+    rep = json.loads(report_path.read_text(encoding="utf-8"))  # explicit UTF-8, not the locale default
+    assert rep["schema_version"] == "0.1.0"
+    assert rep["model_id"] == "replay-stub"
+    assert rep["provisional"] is True
+    assert rep["run_count"] == 1
+    assert "overall" in rep and "by_experiment" in rep and "tasks" in rep
+    assert payloads["model_reports"]["replay-stub"]["run_count"] == 1
+
+
+def test_run_one_model_skips_unbeatable_tasks(tmp_path):
+    # A task Stage 2 marks unbeatable must not enter Stage 3/4: no model call,
+    # no run rows, no composites — without even resolving its (missing) source.
+    from scripts.run_pipeline import _run_one_model
+
+    calls = []
+
+    def agent(messages):
+        calls.append(messages)
+        return "FINAL_OUTPUT: DONE"
+
+    rows = [{"task_id": "dead", "source": "missing.json",
+             "experiment": "test1", "condition": "default"}]
+    run_rows, composites = _run_one_model(
+        rows, agent, "m",
+        manifest_path=tmp_path / "manifest.json",
+        artifacts_root=tmp_path / "artifacts",
+        static_by_task={"dead": {"is_beatable": False}},
+        difficulty_max=1.0,
+        config=load_scorer_config(),
+        seeds=[0], conditions=None, force=False,
+    )
+    assert run_rows == []
+    assert composites == {}
+    assert calls == []  # ineligible task -> model never invoked
diff --git a/tests/test_scoring_system.py b/tests/test_scoring_system.py
new file mode 100644
index 0000000..b58b8ac
--- /dev/null
+++ b/tests/test_scoring_system.py
@@ -0,0 +1,648 @@
+import argparse
+import json
+
+import pytest
+
+from gridworld.actions import MiniGridActions
+from gridworld.baselines import plan_bfs_path, trace_planned_actions
+from gridworld.task_spec import TaskSpecification
+from gridworld.task_validator import TaskValidator
+from scorer.artifacts import CanonicalPathReport, ScoredDifficulty
+from scorer.config import (
+    DEFAULT_CONFIG_PATH,
+    DEFAULT_DISTRACTOR_TYPE_WEIGHTS,
+    DEFAULT_RUNTIME_WEIGHTS,
+    DIMENSION_NAMES,
+    load_scorer_config,
+)
+from scorer.io import dump_json, load_json
+from scorer.scoring import (
+    ScorerConfig,
+    compute_12d_score,
+    compute_canonical_paths,
+    compute_runtime_score,
+    compute_static_score_artifact,
+    score_task_file,
+)
+from scripts.score_json import _default_runtime_output, _runtime, _static, _static_target_dirs
+
+
+def make_spec(**overrides):  # Build the baseline TaskSpecification shared by these tests.
+    data = {
+        "task_id": "scorer_case",
+        "seed": 7,
+        "difficulty_tier": 1,
+        "maze": {
+            "dimensions": [5, 5],
+            "walls": [],
+            "start": [1, 1],
+            "goal": [3, 1],
+        },
+        "mechanisms": {},
+        "rules": {"observability": "full", "view_size": 7},
+        "goal": {"type": "reach_position", "target": [3, 1]},
+        "max_steps": 20,
+    }
+    data.update(overrides)  # shallow merge: an override replaces the whole top-level key
+    return TaskSpecification.from_dict(data)
+
+
+def test_canonical_paths_include_bfs_actions_and_positions():
+    spec = make_spec()
+    # Baseline spec: start [1, 1], goal [3, 1], no walls.
+    report = compute_canonical_paths(spec)
+    # Expect a two-action route through three cells, plus a successful greedy entry.
+    assert report.success is True
+    assert report.actions == ["move_forward", "move_forward"]
+    assert report.positions == [(1, 1), (2, 1), (3, 1)]
+    assert report.optimal_steps == 2
+    assert report.states_explored > 0
+    assert report.greedy is not None
+    assert report.greedy["success"] is True
+
+
+def test_planner_toggle_trace_matches_current_cell_switch_precedence():
+    spec = make_spec(
+        maze={
+            "dimensions": [7, 5],
+            "walls": [[1, 2], [2, 2], [3, 2], [4, 2], [5, 2]],
+            "start": [1, 1],
+            "goal": [5, 1],
+        },
+        mechanisms={
+            "keys": [{"id": "k1", "position": [2, 1], "color": "red"}],
+            "doors": [
+                {
+                    "id": "d1",
+                    "position": [4, 1],
+                    "requires_key": "red",
+                    "initial_state": "locked",
+                }
+            ],
+            "switches": [
+                {
+                    "id": "s1",
+                    "position": [3, 1],  # the cell directly before door d1 at [4, 1]
+                    "controls": [],
+                    "switch_type": "toggle",
+                    "initial_state": "off",
+                }
+            ],
+        },
+        goal={"type": "reach_position", "target": [5, 1]},
+        max_steps=30,
+    )
+    # Replay a fixed plan: pickup, two forward moves, then a toggle.
+    traced = trace_planned_actions(
+        spec,
+        [
+            int(MiniGridActions.PICKUP),
+            int(MiniGridActions.MOVE_FORWARD),
+            int(MiniGridActions.MOVE_FORWARD),
+            int(MiniGridActions.TOGGLE),
+        ],
+    )
+    bfs_path = plan_bfs_path(spec)
+    # The toggle must be labelled as switch s1, not as opening door d1; BFS must report failure.
+    assert traced.action_labels[-1] == "toggle:s1"
+    assert "open_door:d1" not in traced.action_labels
+    assert bfs_path.success is False
+
+
+def test_static_score_uses_configurable_weights():
+    spec = make_spec()
+    default_score = compute_12d_score(spec)
+    config = ScorerConfig.from_dict(
+        {
+            "version": "unit",
+            "static_dimension_weights": {
+                "optimal_path_length": 2.0,
+                "grid_size": 0.0,
+            },
+        }
+    )
+    # Only two weights are overridden; the config dict names no others.
+    weighted = compute_12d_score(spec, config=config)
+    # NOTE(review): indices 0 and 8 presumably map to optimal_path_length and grid_size; confirm.
+    assert weighted.weights[0] == 2.0
+    assert weighted.weights[8] == 0.0
+    assert weighted.composite != default_score.composite
+
+
+def test_static_score_rejects_partial_explicit_weight_vectors():
+    spec = make_spec()
+    # A two-element vector and an empty vector must both raise the same ValueError.
+    with pytest.raises(ValueError, match="Expected 12 static weights"):
+        compute_12d_score(spec, weights=[1.0, 2.0])
+    with pytest.raises(ValueError, match="Expected 12 static weights"):
+        compute_12d_score(spec, weights=[])
+
+
+def test_shipped_config_matches_code_defaults():
+    config = load_scorer_config(DEFAULT_CONFIG_PATH)
+    # NOTE(review): list() of a mapping yields keys only, so weight values look unchecked here.
+    assert list(config.static_dimension_weights) == DIMENSION_NAMES
+    assert config.distractor_type_weights == DEFAULT_DISTRACTOR_TYPE_WEIGHTS
+    assert config.runtime_weights == DEFAULT_RUNTIME_WEIGHTS
+
+
+def test_explicit_missing_config_path_fails(tmp_path):
+    with pytest.raises(FileNotFoundError, match="Scorer config not found"):
+        load_scorer_config(tmp_path / "missing_config.json")  # this file is never created
+
+
+def test_score_task_file_writes_stage_two_artifacts(tmp_path):
+    spec = make_spec()
+    task_path = tmp_path / "task.json"
+    spec.to_json(str(task_path))
+    # Score the serialized task, directing output to tmp_path / "artifacts".
+    canonical, static_score = score_task_file(task_path, output_dir=tmp_path / "artifacts")
+    # Check the returned objects, both artifact files, and the scored_static.json keys.
+    assert canonical.success is True
+    assert static_score.is_beatable is True
+    assert (tmp_path / "artifacts" / "canonical_paths.json").exists()
+    scored_path = tmp_path / "artifacts" / "scored_static.json"
+    assert scored_path.exists()
+    with open(scored_path, encoding="utf-8") as f:
+        payload = json.load(f)
+    assert payload["task_id"] == spec.task_id
+    assert "dimensions_12" in payload
+    assert "dimensions" not in payload  # this key must be absent from the payload
+    assert "composite" not in payload  # likewise absent
+    assert payload["validation"]["schema_valid"] is True
+    assert payload["canonical_agent_features"]["greedy_solvability"] == 1.0
+
+
+def test_scorer_json_io_uses_utf8_encoding(tmp_path, monkeypatch):
+    real_open = open
+    observed: list[tuple[str, str, str | None]] = []
+    # Wrap the real open() so each call's (path, mode, encoding keyword) is recorded.
+    def tracking_open(path, mode="r", *args, **kwargs):
+        observed.append((str(path), mode, kwargs.get("encoding")))
+        return real_open(path, mode, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.open", tracking_open)
+    # The payload carries non-ASCII characters (U+2192 arrow, U+00E9 accented e).
+    payload = {"message": "reach \u2192 caf\u00e9", "label": "caf\u00e9"}
+    path = tmp_path / "unicode.json"
+    dump_json(path, payload)
+    # The round trip must be lossless, and both write and read must pass encoding="utf-8".
+    assert load_json(path) == payload
+    assert (str(path), "w", "utf-8") in observed
+    assert (str(path), "r", "utf-8") in observed
+
+
+def test_score_task_file_reuses_primary_validator_result(tmp_path, monkeypatch):
+    spec = make_spec()
+    task_path = tmp_path / "task.json"
+    spec.to_json(str(task_path))
+    calls = 0
+    original_validate = TaskValidator.validate
+    # Counting wrapper that delegates to the original TaskValidator.validate.
+    def count_validate(self, *args, **kwargs):
+        nonlocal calls
+        calls += 1
+        return original_validate(self, *args, **kwargs)
+
+    monkeypatch.setattr(TaskValidator, "validate", count_validate)
+
+    score_task_file(task_path)
+    # Exactly one validate() call is allowed for the single score_task_file invocation.
+    assert calls == 1
+
+
+def test_score_task_file_rejects_invalid_schema_before_planning(tmp_path, monkeypatch):
+    spec = make_spec(
+        maze={
+            "dimensions": [5, 5],
+            "walls": [],
+            "start": [1, 1],
+            "goal": [9, 9],  # lies outside the declared [5, 5] dimensions
+        },
+        goal={"type": "reach_position", "target": [9, 9]},
+    )
+    task_path = tmp_path / "task.json"
+    spec.to_json(str(task_path))
+    # Stub that fails the test if either patched planner is reached.
+    def fail_if_called(*args, **kwargs):
+        raise AssertionError("planner must not execute for schema-invalid tasks")
+
+    monkeypatch.setattr("scorer.static.plan_bfs_path", fail_if_called)
+    monkeypatch.setattr("scorer.static.plan_greedy_path", fail_if_called)
+    # Scoring must raise the schema ValueError instead of hitting the stubs above.
+    with pytest.raises(ValueError, match="failed schema validation"):
+        score_task_file(task_path)
+
+
+def test_static_score_uses_canonical_bfs_metrics():
+    spec = make_spec()
+    bfs_path = plan_bfs_path(spec)
+    score = compute_12d_score(spec, bfs_path=bfs_path)
+    # Dimensions 0 and 1 must equal the supplied BFS path's action count and states_explored.
+    assert score.dimensions[0] == len(bfs_path.action_labels)
+    assert score.dimensions[1] == bfs_path.states_explored
+
+
+def test_runtime_score_from_episode_json_payload():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    run = {
+        "task_id": spec.task_id,
+        "backend": "minigrid",
+        "adapter": "unit",
+        "model_id": "unit-model",
+        "seed": 7,
+        "success": True,
+        "steps_taken": 2,
+        "terminated": True,
+        "truncated": False,
+        "total_tokens": 500,
+        "trajectory": [
+            {"state": {"agent_position": [1, 1]}},
+            {"state": {"agent_position": [2, 1]}},
+        ],
+        "final_state": {"agent_position": [3, 1], "step_count": 2},
+    }
+    # Zero out the greedy_penalty runtime weight for this score.
+    config = ScorerConfig.from_dict({"runtime_weights": {"greedy_penalty": 0.0}})
+    score = compute_runtime_score(
+        run,
+        static_score=static_score,
+        canonical_paths=canonical,
+        config=config,
+        difficulty_max_static_score=static_score.static_score,
+    )
+    # A two-step successful run over cells [1,1] -> [2,1] -> [3,1] is expected to score 1.0.
+    assert score.task_id == spec.task_id
+    assert score.composite == 1.0
+    assert score.signals["step_ratio"] == 1.0
+    assert score.signals["cell_overlap_bfs"] == 1.0
+    assert score.signals["cell_overlap_greedy"] == 1.0
+    assert score.signals["token_efficiency"] == 1.0
+    assert "path_choice" not in score.signals
+    assert "distractor_interactions" not in score.signals
+
+
+def test_runtime_score_prefers_interface_state_after_over_row_col_position_after():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    run = {
+        "success": True,
+        "steps_used": 2,
+        "total_tokens": 100,
+        "end_reason": "success",
+        "task_spec": spec.to_dict(),
+        "initial_state": {"agent_position": [1, 1]},
+        "final_state": {"agent_position": [3, 1], "step_count": 2},
+        "transcript": [
+            {
+                "kind": "reset",
+                "state": {"agent_position": [1, 1]},
+            },
+            {
+                "kind": "step",
+                "position_after": [1, 2],  # coordinates reversed relative to state_after below
+                "state_after": {"agent_position": [2, 1]},
+            },
+            {
+                "kind": "step",
+                "position_after": [1, 3],  # coordinates reversed relative to state_after below
+                "state_after": {"agent_position": [3, 1]},
+            },
+        ],
+    }
+    # Zero out the greedy_penalty runtime weight for this score.
+    config = ScorerConfig.from_dict({"runtime_weights": {"greedy_penalty": 0.0}})
+    score = compute_runtime_score(
+        run,
+        static_score=static_score,
+        canonical_paths=canonical,
+        config=config,
+        difficulty_max_static_score=static_score.static_score,
+    )
+    # Expect full BFS overlap, i.e. positions taken from state_after rather than position_after.
+    assert score.signals["cell_overlap_bfs"] == 1.0
+
+
+def test_runtime_score_requires_suite_difficulty_normalizer():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    # difficulty_max_static_score is deliberately omitted from the call below.
+    with pytest.raises(ValueError, match="difficulty_max_static_score"):
+        compute_runtime_score(
+            {"success": True, "steps": 2, "total_tokens": 100},
+            static_score=static_score,
+            canonical_paths=canonical,
+        )
+
+
+def test_runtime_score_rejects_suite_max_smaller_than_task_score():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    # The suite maximum passed below is one less than this task's own static score.
+    with pytest.raises(ValueError, match="at least the task static score"):
+        compute_runtime_score(
+            {"success": True, "steps": 2, "total_tokens": 100},
+            static_score=static_score,
+            canonical_paths=canonical,
+            difficulty_max_static_score=static_score.static_score - 1,
+        )
+
+
+def test_runtime_score_rejects_unevaluated_greedy_solvability():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec).to_dict()
+    static_score["canonical_agent_features"]["greedy_solvability"] = None
+    # The static score is passed as a dict whose greedy_solvability was set to None above.
+    with pytest.raises(ValueError, match="greedy_solvability"):
+        compute_runtime_score(
+            {"success": True, "steps": 2, "total_tokens": 100},
+            static_score=static_score,
+            canonical_paths=canonical,
+            difficulty_max_static_score=static_score["static_score"],
+        )
+
+
+def test_runtime_score_rejects_schema_invalid_static_artifact_clearly():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec).to_dict()
+    static_score["validation"]["schema_valid"] = False
+    # The static score is passed as a dict whose validation.schema_valid was set to False above.
+    with pytest.raises(ValueError, match="schema-valid"):
+        compute_runtime_score(
+            {"success": True, "steps": 2, "total_tokens": 100},
+            static_score=static_score,
+            canonical_paths=canonical,
+            difficulty_max_static_score=static_score["static_score"],
+        )
+
+
+def test_runtime_token_count_does_not_double_count_nested_step_tokens():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    score = compute_runtime_score(
+        {
+            "success": True,
+            "steps": 2,
+            "trajectory": [{"tokens": 100, "info": {"tokens": 100}}],
+        },
+        static_score=static_score,
+        canonical_paths=canonical,
+        difficulty_max_static_score=static_score.static_score,
+    )
+    # The step carries 100 tokens at its top level and again under "info"; expect 100, not 200.
+    assert score.signals["token_count"] == 100
+
+
+def test_runtime_token_count_reads_query_transcript_usage():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    score = compute_runtime_score(
+        {
+            "success": True,
+            "steps": 2,
+            "transcript": [
+                {
+                    "kind": "query",
+                    "usage": {"input_tokens": 80, "output_tokens": 20},
+                }
+            ],
+        },
+        static_score=static_score,
+        canonical_paths=canonical,
+        difficulty_max_static_score=static_score.static_score,
+    )
+    # Expect input_tokens + output_tokens (80 + 20) from the single query entry.
+    assert score.signals["token_count"] == 100
+
+
+def test_runtime_hash_ignores_non_scoring_transcript_context():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    base_run = {
+        "success": True,
+        "steps": 2,
+        "total_tokens": 100,
+        "transcript": [
+            {
+                "kind": "query",
+                "agent_messages": [{"role": "user", "content": "first"}],
+            }
+        ],
+    }
+    changed_context = {
+        **base_run,
+        "transcript": [
+            {
+                "kind": "query",
+                "agent_messages": [{"role": "user", "content": "second"}],
+            }
+        ],
+    }
+    # Score both runs with the same static score, canonical paths, and suite maximum.
+    first = compute_runtime_score(
+        base_run,
+        static_score=static_score,
+        canonical_paths=canonical,
+        difficulty_max_static_score=static_score.static_score,
+    )
+    second = compute_runtime_score(
+        changed_context,
+        static_score=static_score,
+        canonical_paths=canonical,
+        difficulty_max_static_score=static_score.static_score,
+    )
+    # The runs differ only in agent_messages content, so their input hashes must match.
+    assert first.inputs_hash == second.inputs_hash
+
+
+@pytest.mark.parametrize("token_count", [None, 0])
+def test_runtime_score_rejects_missing_or_zero_token_telemetry(token_count):
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    run = {"success": True, "steps": 2}
+    if token_count is not None:
+        run["total_tokens"] = token_count
+    # Both cases (total_tokens absent for None, or an explicit 0) must raise a "token" ValueError.
+    with pytest.raises(ValueError, match="token"):
+        compute_runtime_score(
+            run,
+            static_score=static_score,
+            canonical_paths=canonical,
+            difficulty_max_static_score=static_score.static_score,
+        )
+
+
+def test_runtime_score_rejects_missing_step_telemetry():
+    spec = make_spec()
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    # The run below contains only "success" and "total_tokens".
+    with pytest.raises(ValueError, match="step telemetry"):
+        compute_runtime_score(
+            {"success": True, "total_tokens": 100},
+            static_score=static_score,
+            canonical_paths=canonical,
+            difficulty_max_static_score=static_score.static_score,
+        )
+
+
+def test_zero_step_plans_do_not_inflate_optimal_steps_with_done():
+    spec = make_spec(
+        maze={
+            "dimensions": [5, 5],
+            "walls": [],
+            "start": [1, 1],
+            "goal": [1, 1],
+        },
+        goal={"type": "reach_position", "target": [1, 1]},
+    )
+    # Start and goal are the same cell in this spec.
+    path = plan_bfs_path(spec)
+    traced_done = trace_planned_actions(spec, [int(MiniGridActions.DONE)])
+    # Neither the BFS plan nor a lone DONE action may contribute an action label.
+    assert path.success is True
+    assert path.action_labels == []
+    assert traced_done.success is True
+    assert traced_done.action_labels == []
+
+
+def test_runtime_zero_step_success_gets_full_step_credit():
+    spec = make_spec(
+        maze={
+            "dimensions": [5, 5],
+            "walls": [],
+            "start": [1, 1],
+            "goal": [1, 1],
+        },
+        goal={"type": "reach_position", "target": [1, 1]},
+    )
+    canonical = compute_canonical_paths(spec)
+    static_score = compute_static_score_artifact(spec)
+    score = compute_runtime_score(
+        {
+            "success": True,
+            "steps": 0,
+            "total_tokens": 100,
+            "initial_state": {"agent_position": [1, 1]},
+            "final_state": {"agent_position": [1, 1], "step_count": 0},
+        },
+        static_score=static_score,
+        canonical_paths=canonical,
+        config=ScorerConfig.from_dict({"runtime_weights": {"greedy_penalty": 0.0}}),
+        difficulty_max_static_score=static_score.static_score,
+    )
+    # A zero-step success on a start-equals-goal task must still score 1.0.
+    assert score.signals["step_ratio"] == 1.0
+    assert score.composite == 1.0
+
+
+def test_static_cli_target_dirs_reject_same_stem_collisions(tmp_path):
+    files = [tmp_path / "a" / "task.json", tmp_path / "b" / "task.json"]
+    # Both inputs share the stem "task" while living in different directories.
+    with pytest.raises(ValueError, match="collide"):
+        _static_target_dirs(files, tmp_path / "scores")
+
+
+def test_static_cli_continues_after_file_failure_and_summarizes(tmp_path, capsys):
+    task_a = tmp_path / "task_a.json"
+    task_b = tmp_path / "task_b.json"
+    bad_task = tmp_path / "bad.json"
+    dump_json(task_a, make_spec(task_id="ok_a").to_dict())
+    dump_json(task_b, make_spec(task_id="ok_b").to_dict())
+    bad_task.write_text("{", encoding="utf-8")
+    # bad.json holds a lone "{" and is listed between the two valid inputs.
+    exit_code = _static(
+        argparse.Namespace(
+            config=None,
+            inputs=[str(task_a), str(bad_task), str(task_b)],
+            output_dir=str(tmp_path / "scores"),
+        )
+    )
+    captured = capsys.readouterr()
+    # Expect exit code 1, ok lines on stdout, error and summary on stderr, and no output for bad.json.
+    assert exit_code == 1
+    assert "static: ok input=" in captured.out
+    assert "task_id=ok_a" in captured.out
+    assert "task_id=ok_b" in captured.out
+    assert "static: error input=" in captured.err
+    assert "bad.json" in captured.err
+    assert "JSONDecodeError" in captured.err
+    assert "Traceback" not in captured.err
+    assert "static: summary scored=2 failed=1 total=3" in captured.err
+    assert (tmp_path / "scores" / "task_a" / "scored_static.json").exists()
+    assert (tmp_path / "scores" / "task_b" / "scored_static.json").exists()
+    assert not (tmp_path / "scores" / "bad" / "scored_static.json").exists()
+
+
+def test_runtime_cli_default_output_uses_source_stem(tmp_path):  # expects <stem>_score.json
+    assert _default_runtime_output(tmp_path / "run.json") == tmp_path / "run_score.json"
+    assert _default_runtime_output(tmp_path / "episode.json") == tmp_path / "episode_score.json"
+
+
+def test_runtime_cli_rejects_half_specified_artifacts(tmp_path):
+    args = argparse.Namespace(
+        config=None,
+        run=str(tmp_path / "episode.json"),
+        output=None,
+        static_score=str(tmp_path / "scored_static.json"),
+        canonical_paths=None,
+        task=str(tmp_path / "task.json"),
+        artifact_dir=None,
+        difficulty_max_static_score=100.0,
+    )
+    # static_score is set while canonical_paths is None; that pairing must be rejected.
+    with pytest.raises(ValueError, match="provided together"):
+        _runtime(args)
+
+
+def test_runtime_cli_explains_missing_suite_maximum(tmp_path):
+    args = argparse.Namespace(
+        config=None,
+        run=str(tmp_path / "episode.json"),
+        output=None,
+        static_score=str(tmp_path / "scored_static.json"),
+        canonical_paths=str(tmp_path / "canonical_paths.json"),
+        task=None,
+        artifact_dir=None,
+        difficulty_max_static_score=None,
+    )
+    # Both artifact paths are set but difficulty_max_static_score is None; the error must name the flag.
+    with pytest.raises(ValueError, match="--difficulty-max-static-score"):
+        _runtime(args)
+
+
+def test_artifact_serialization_returns_detached_data():
+    scored = ScoredDifficulty(dimensions=[1.0], dimension_names=["only"], weights=[2.0])
+    scored_payload = scored.to_dict()
+    scored_payload["dimensions"][0] = 9.0
+    scored_payload["weights"][0] = 9.0
+
+    canonical = CanonicalPathReport(
+        task_id="task",
+        success=True,
+        actions=["move_forward"],
+        positions=[(1, 1), (2, 1)],
+        optimal_steps=1,
+        states_explored=2,
+        message="ok",
+        greedy={"actions": ["move_forward"]},
+    )
+    canonical_payload = canonical.to_dict()
+    canonical_payload["bfs"]["actions"][0] = "mutated"
+    canonical_payload["greedy"]["actions"][0] = "mutated"
+    # Mutating the to_dict() payloads above must leave the source objects unchanged.
+    assert scored.dimensions == [1.0]
+    assert scored.weights == [2.0]
+    assert canonical.actions == ["move_forward"]
+    assert canonical.greedy == {"actions": ["move_forward"]}