diff --git a/.gitignore b/.gitignore index 852ea75..e750d18 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,12 @@ build/ venv/ env/ +# IDE configurations +.vscode/ +.idea/ +*.sublime-project +*.sublime-workspace + # Test / tooling caches .pytest_cache/ .mypy_cache/ @@ -31,4 +37,6 @@ cuda_installer.pyz interface/smoke_tests/results/ # Generated evaluation outputs mazes/results/*_results.json -mazes/results/*.progress.json \ No newline at end of file +mazes/results/*.progress.json +# Bare-bones run pipeline artifact tree (regenerated locally) +artifacts/ \ No newline at end of file diff --git a/docs/system_design.md b/docs/system_design.md index 16010d4..6f1dad3 100644 --- a/docs/system_design.md +++ b/docs/system_design.md @@ -18,7 +18,7 @@ This document is the single canonical source of truth for how the MultiNet v2.0 1. [Overview & north stars](#1-overview--north-stars) 2. [Pipeline DAG: stages, artifacts, invalidation](#2-pipeline-dag-stages-artifacts-invalidation) 3. [Task spec contract](#3-task-spec-contract) -4. [Static scoring (13 dimensions)](#4-static-scoring-13-dimensions) +4. [Static scoring (12 dimensions plus canonical-agent features)](#4-static-scoring-12-dimensions-plus-canonical-agent-features) 5. [Runtime scoring](#5-runtime-scoring) 6. [Backend & inference adapter contracts](#6-backend--inference-adapter-contracts) 7. [Reporting & aggregate](#7-reporting--aggregate) @@ -74,18 +74,18 @@ The pipeline is a five-stage DAG. Each stage has declared inputs and outputs and 2. **Solve & Score-static** - Inputs: `task.json`. 
- Outputs: - - `canonical_paths.json` `{ bfs: { path, steps, states_explored }, greedy: { success, path, steps }, … }` - - `scored.json` `{ is_beatable, dimensions[13], fragility, mechanism_necessity_violations, distractor_safety_violations, message }` + - `canonical_paths.json` `{ bfs: { actions, positions, optimal_steps, states_explored }, greedy: { success, actions, positions, steps }, … }` + - `scored_static.json` `{ is_beatable, dimensions_12, canonical_agent_features, validation, message }` - Hash key: `hash(solver_v, scorer_v, task.json, agent_set_v)`. - - If `scored.json.is_beatable == false`, downstream stages skip the task; it is logged as ineligible and surfaced in reports. + - If `scored_static.json.is_beatable == false`, downstream stages skip the task; it is logged as ineligible and surfaced in reports. 3. **Render-and-Run** - - Inputs: `task.json`, `scored.json` (gate on `is_beatable`), backend choice, adapter choice, `model_id`, `seed`. + - Inputs: `task.json`, `scored_static.json` (gate on `is_beatable`), backend choice, adapter choice, `model_id`, `seed`. - Outputs: `run.json` `{ trajectory, actions, tokens, terminated, success }`. - Hash key: `hash(backend_v, adapter_v, model_id, task.json, seed)`. 4. **Score-runtime** - - Inputs: `run.json`, `scored.json`, `canonical_paths.json`. + - Inputs: `run.json`, `scored_static.json`, `canonical_paths.json`. - Outputs: `run_score.json` `{ success, step_ratio, cell_overlap_*, distractor_interactions, irreversible_failures, tokens, composite }`. - Hash key: `hash(runtime_scorer_v, inputs)`. @@ -106,7 +106,7 @@ artifacts/ ├── tasks// │ ├── task.json # Stage 1 │ ├── canonical_paths.json # Stage 2 (a) -│ └── scored.json # Stage 2 (b) — includes is_beatable +│ └── scored_static.json # Stage 2 (b) — includes is_beatable ├── runs////// │ ├── run.json # Stage 3 │ └── run_score.json # Stage 4 @@ -218,9 +218,9 @@ Enforced by `TaskSpecification.validate()`: --- -## 4. Static scoring (13 dimensions) +## 4. 
Static scoring (12 dimensions plus canonical-agent features) -Static scoring runs once per task at pipeline stage 2 (Solve & Score-static). It produces `scored.json`, which carries `is_beatable` plus a 13-dimension vector and supporting validation reports. The static scorer consumes `task.json` and `canonical_paths.json`. +Static scoring runs once per task at pipeline stage 2 (Solve & Score-static). It produces `scored_static.json`, which carries `is_beatable`, a 12-dimension vector, canonical-agent features, and supporting validation reports. The scorer consumes `task.json` and emits this artifact alongside `canonical_paths.json`. ### 4.1 Dimensions @@ -238,7 +238,7 @@ All raw values are floats (or counts cast to float). Higher = harder *unless* ex 10. **`wall_density`** — Source: spec. Computation: `len(walls) / grid_size`. Crude (does not separate interior vs functional walls); **calibration target**. 11. **`partial_observability`** — Source: spec rules. Computation: ordinal `{full: 0, view_cone: 1, fog_of_war: 2}` from `rules.observability`. 12. **`irreversibility`** — Source: spec rules + mechanisms. Computation: `key_consumption × #doors + #one_shot_switches + #non_bidirectional_teleporters`. -13. **`greedy_solvability`** — Source: Greedy canonical agent. Computation: `1.0 if greedy succeeds else 0.0`. **Penalty** (greedy-solvable tasks lower the runtime composite, on the rationale that they are less a test of spatial reasoning). +`greedy_solvability` is recorded separately under `canonical_agent_features`, rather than appended to the calibrated 12-dimension vector. Source: Greedy canonical agent. Computation: `1.0 if greedy succeeds else 0.0`. **Penalty** (greedy-solvable tasks lower the runtime composite, on the rationale that they are less a test of spatial reasoning). ### 4.2 Static composite (difficulty score) @@ -246,13 +246,13 @@ All raw values are floats (or counts cast to float). 
Higher = harder *unless* ex static_composite = Σ_i (raw_dim_i × calibration.weights[dim_name_i]) ``` -- `calibration.weights` lives in `calibration.yaml`; defaults to `1.0` for all dimensions until empirical tuning. +- Calibration weights live in `scorer/scorer_config.json` by default; optional JSON or YAML overrides may be passed explicitly. Weights default to `1.0` for all dimensions until empirical tuning. - `static_composite` is used for task ranking and live-benchmark filtering (e.g., reject tasks whose composite falls outside a tier's target range). - It is *not* used directly in runtime scoring; runtime uses individual dimensions plus a derived "difficulty weight" (Section 5). -### 4.3 Validation reports (also in `scored.json`) +### 4.3 Validation reports (also in `scored_static.json`) -Beyond the dimension vector, `scored.json` carries the validator's structural reports: +Beyond the dimension vector, `scored_static.json` carries the validator's structural reports: - `is_beatable` (bool) and `message` (str) — gate for downstream stages. - `mechanism_necessity_violations` (list of strings) — mechanisms whose removal still leaves the task solvable; flags accidental decoration. @@ -260,6 +260,7 @@ Beyond the dimension vector, `scored.json` carries the validator's structural re - `chain_ordering_valid` (bool) — each dependency step actually gates the next. These do not enter the composite but are surfaced in reports for task-quality auditing. +Schema-invalid tasks are rejected before canonical planners execute and do not emit score artifacts. ### 4.4 Calibration notes @@ -272,16 +273,16 @@ These do not enter the composite but are surfaced in reports for task-quality au ## 5. Runtime scoring -Runtime scoring runs at pipeline stage 4 (Score-runtime), once per `run.json`. It produces `run_score.json`. It consumes the run trajectory plus the static scoring artifacts (`scored.json`, `canonical_paths.json`). 
+Runtime scoring runs at pipeline stage 4 (Score-runtime), once per `run.json`. It produces `run_score.json`. It consumes the run trajectory plus the static scoring artifacts (`scored_static.json`, `canonical_paths.json`). ### 5.1 Per-run signal vector Recorded for every `(task, backend, adapter, model_id, seed)`: - `success` (bool) — goal reached within `max_steps`, no terminal hazard. -- `steps` (int) — agent's actual step count. +- `steps` (int) — agent's actual step count. Required; runtime scoring rejects missing telemetry. - `terminated_reason` (str) — one of `{goal_reached, hazard, max_steps, deadlock, invalid_action_excess}`. -- `token_count` (int) — total prompt + response tokens summed over all model turns. +- `token_count` (positive int) — total prompt + response tokens summed over all model turns. Required; runtime scoring rejects missing or non-positive telemetry. - `distractor_interactions` (int) — count of distractor-element interactions (any `pickup` / `toggle` / `push` on an element registered as a distractor). - `irreversible_failures` (int) — count of irreversible actions that broke solvability, detected by re-running the validator from the post-action state. @@ -298,11 +299,11 @@ composite = success_factor × efficiency_factor × difficulty_weight − greedy_ ``` - `success_factor = 1.0 if success else 0.0` — hard gate; failed runs score 0 regardless of efficiency. -- `efficiency_factor = α × step_ratio + β × cell_overlap_bfs + γ × token_efficiency` — weighted blend; default `α = β = γ = 1/3`. `token_efficiency = min(1, baseline_tokens / max(model_tokens, 1))` where `baseline_tokens` lives in `calibration.yaml`. -- `difficulty_weight = normalize(static_composite)` — harder tasks contribute more. Default normalization: `f(x) = x / max_observed_static_composite_in_suite`. +- `efficiency_factor = α × step_ratio + β × cell_overlap_bfs + γ × token_efficiency` — weighted blend; default `α = β = γ = 1/3`. 
`token_efficiency = min(1, baseline_tokens / model_tokens)` where `baseline_tokens` lives in scorer config. Missing or non-positive token telemetry is an artifact error, not a neutral score. +- `difficulty_weight = normalize(static_composite)` — harder tasks contribute more. Default normalization: `f(x) = x / max_observed_static_composite_in_suite`. Runtime scoring requires that suite maximum either in scorer config or as an explicit runtime argument. - `greedy_penalty = δ × greedy_solvability × success_factor` — applied only to successful runs; `δ` is a calibration coefficient with default 0.5. -All Greek-letter coefficients (`α, β, γ, δ`) and the normalization function live in `calibration.yaml`. The design commits to the *shape*, not the values. +All Greek-letter coefficients (`α, β, γ, δ`) and the normalization value live in scorer config. The design commits to the *shape*, not the values. ### 5.4 Single-point benchmark score (ARC-AGI style) @@ -340,7 +341,8 @@ Defaults to a uniform mean. Calibration may switch to a tier-weighted or difficu ### 5.6 Calibration notes - All composite coefficients ship as `1.0` or sensible defaults; the design does not claim correctness. -- `calibration.yaml` is versioned in git; changes bump `calibration_version` and trigger stage-4 / stage-5 invalidation. +- `scorer/scorer_config.json` is versioned in git; changes bump `calibration_version` and trigger stage-4 / stage-5 invalidation. +- The shipped config intentionally leaves `difficulty_max_static_score` unset. Runtime scoring requires a calibrated suite maximum through config or `--difficulty-max-static-score`. - After a calibration update, the pipeline regenerates `run_score.json` and `reports/` from cached `run.json`. Run records do **not** re-execute model calls. This is a deliberate consequence of the DAG split. --- @@ -533,16 +535,16 @@ Status legend: **2. 
Validator** — folded into Stage 2 - ✅ `gridworld/task_validator.py::TaskValidator` does exhaustive BFS over the full mechanism state space, plus `compute_fragility`, `validate_mechanism_necessity`, `validate_chain_ordering`, `validate_distractor_safety`. -- Delta: surface validation reports into `scored.json` instead of emitting a separate `validity.json`. +- Delta: surface validation reports into `scored_static.json` instead of emitting a separate `validity.json`. **3. Solver suite (canonical agents)** — Stage 2 -- ⚠️ BFS exists inside `TaskValidator._find_solution`. Greedy does not yet exist as a separate canonical agent. -- 🚧 Multi-tier solver suite pending; Greedy is the next addition, then heuristic, then random. -- Delta: extract BFS path emission as one canonical agent, add Greedy as a peer, write combined output to `canonical_paths.json`. +- ✅ `gridworld/baselines.py` exposes BFS and Greedy planners; `scorer/solvers.py` writes their combined output to `canonical_paths.json`. +- 🚧 Heuristic and random canonical-agent peers remain optional future additions. +- Delta: add calibration runs before extending the canonical-agent feature vector. **4. Static scorer** — Stage 2 -- ⚠️ `gridworld/scoring.py::compute_12d_score` exists with 12 dimensions matching dimensions 1–12 of §4 (modulo formula calibration). -- Delta: add dimension 13 (`greedy_solvability`), restructure output to `scored.json` sidecar, move composite weights to `calibration.yaml`, include validation reports. +- ✅ `scorer/scoring.py::compute_12d_score` exposes the public interface for the 12 calibrated dimensions and writes `scored_static.json` with validation reports plus `canonical_agent_features.greedy_solvability`. +- Delta: empirically calibrate the shipped placeholder weights. **5. `MiniGridBackend`** — backend axis - ✅ `gridworld/backends/minigrid_backend.py` implements `AbstractGridBackend` for square grids with discrete actions + RGB rendering. 
@@ -566,8 +568,8 @@ Status legend: - Delta: emit canonical `run.json`; remove inline scoring (move to Stage 4); add per-step trajectory recording. **10. Runtime scorer** — Stage 4 -- 🚧 Does not exist as a component. Some scoring logic lives inside `evaluation_harness.py`. -- Delta: new module that consumes `run.json` + `scored.json` + `canonical_paths.json` and produces `run_score.json`. +- ✅ `scorer/runtime.py` consumes `run.json` + `scored_static.json` + `canonical_paths.json` and produces `run_score.json`. +- Delta: populate optional interaction diagnostics in runtime producers and calibrate the suite-level difficulty maximum. **11. Aggregator / reporter** — Stage 5 - ⚠️ Partial. `evaluation_harness.py` produces some summary dicts; nothing matches the per-run-set artifact layout. @@ -597,7 +599,7 @@ Items the design intentionally defers. None block initial implementation. - DAG runner technology — Snakemake leading candidate; final pick deferred to implementation. - Token-efficiency baseline (`baseline_tokens`) — per-task vs global constant; needs a sensible default once a few model runs exist. -### 9.2 Calibration coefficients (live in `calibration.yaml`, default to placeholders) +### 9.2 Calibration coefficients (live in scorer config, default to placeholders) - Runtime composite blend weights `α, β, γ` (step ratio / cell overlap / token efficiency). - Greedy penalty coefficient `δ`. - `difficulty_weight` normalization function (currently `x / max_observed`; may switch to a percentile or log normalization). 
@@ -645,7 +647,7 @@ Mapping to the canonical pipeline: | JSON generator | Stage 1 (Generate) | §2.1 | | Task spec / Validator | folded into Stage 2 (Solve & Score-static) | §2.1 | | BFS-greedy agents | Multi-tier canonical agent suite (Stage 2) | §2.1, §4 | -| Score calculation (static) | Static scoring (13 dimensions) (Stage 2) | §4 | +| Score calculation (static) | Static scoring (12 dimensions plus canonical-agent features) (Stage 2) | §4 | | Backend Generator | Backend axis: `MiniGridBackend` / `MultiGridBackend` / `TextBackend` | §6 | | Inference scripts | Adapter axis: `ModelInterface` implementations | §6 | | Scoring code (final score, comparison) | Runtime scoring (Stage 4) + Aggregate (Stage 5) | §5, §7 | diff --git a/evaluation_harness.py b/evaluation_harness.py index 57fa3ee..55ea840 100644 --- a/evaluation_harness.py +++ b/evaluation_harness.py @@ -22,7 +22,8 @@ from .gridworld.task_spec import TaskSpecification from .gridworld.actions import ACTION_NAMES, ACTION_DESCRIPTIONS from .gridworld.task_validator import compute_difficulty - from .gridworld.scoring import compute_12d_score + from .scorer.io import json_default as _json_default + from .scorer.scoring import compute_12d_score except ImportError: from model_interface import ModelInterface, ModelInput, ModelOutput from gridworld.runner.grid_runner import GridRunner, EpisodeResult @@ -31,14 +32,8 @@ from gridworld.task_spec import TaskSpecification from gridworld.actions import ACTION_NAMES, ACTION_DESCRIPTIONS from gridworld.task_validator import compute_difficulty - from gridworld.scoring import compute_12d_score - - -def _json_default(value): - """Convert NumPy scalars to native Python types for JSON serialization.""" - if isinstance(value, np.generic): - return value.item() - raise TypeError(f"Object of type {value.__class__.__name__} is not JSON serializable") + from scorer.io import json_default as _json_default + from scorer.scoring import compute_12d_score @dataclass diff --git 
a/gridworld/__init__.py b/gridworld/__init__.py index 27425ed..fd567a0 100644 --- a/gridworld/__init__.py +++ b/gridworld/__init__.py @@ -1,7 +1,7 @@ """Gridworld domain for MultiNet-v2.0. -This module provides task schema, validation, and scoring utilities for -gridworld puzzle specifications. +This module provides task schema and validation utilities for gridworld +puzzle specifications. """ from .bootstrap import disable_gymnasium_env_plugins @@ -32,9 +32,6 @@ TaskValidator, compute_difficulty, ) -from .scoring import ScoredDifficulty, compute_12d_score - - __all__ = [ # Task specification "Position", @@ -58,6 +55,4 @@ "DifficultyReport", "FragilityReport", "compute_difficulty", - "ScoredDifficulty", - "compute_12d_score", ] diff --git a/gridworld/baselines.py b/gridworld/baselines.py index ee5c8b0..d81efcd 100644 --- a/gridworld/baselines.py +++ b/gridworld/baselines.py @@ -49,6 +49,17 @@ class Transition: next_state: PlannerState +@dataclass(frozen=True) +class PlannedPath: + """Planner output with replayed positions for scorer/reporting artifacts.""" + + success: bool + actions: list[int] + action_labels: list[str] + positions: list[tuple[int, int]] + states_explored: int = 0 + + class TaskPlanningContext: """Fast lookup tables derived from a ``TaskSpecification``.""" @@ -239,25 +250,6 @@ def _successors(ctx: TaskPlanningContext, state: PlannerState) -> Iterable[Trans ), ) - door = ctx.doors_by_pos.get(front) - if door and door["id"] not in state.open_doors and state.carrying_key is not None: - held_color = ctx.keys_by_id[state.carrying_key]["color"] - if held_color == door["color"]: - yield Transition( - action=int(MiniGridActions.TOGGLE), - label=f"open_door:{door['id']}", - next_state=PlannerState( - agent_pos=state.agent_pos, - agent_dir=state.agent_dir, - carrying_key=None if ctx.key_consumption else state.carrying_key, - collected_keys=state.collected_keys, - active_switches=state.active_switches, - used_switches=state.used_switches, - 
open_gates=state.open_gates, - open_doors=state.open_doors | {door["id"]}, - ), - ) - switch = ctx.switches_by_pos.get(state.agent_pos) if switch and switch["switch_type"] != "hold": toggled = _apply_switch(ctx, state, switch) @@ -268,6 +260,27 @@ def _successors(ctx: TaskPlanningContext, state: PlannerState) -> Iterable[Trans next_state=toggled, ) + # Runtime consumes TOGGLE on the current-cell switch before checking front doors. + if switch is None: + door = ctx.doors_by_pos.get(front) + if door and door["id"] not in state.open_doors and state.carrying_key is not None: + held_color = ctx.keys_by_id[state.carrying_key]["color"] + if held_color == door["color"]: + yield Transition( + action=int(MiniGridActions.TOGGLE), + label=f"open_door:{door['id']}", + next_state=PlannerState( + agent_pos=state.agent_pos, + agent_dir=state.agent_dir, + carrying_key=None if ctx.key_consumption else state.carrying_key, + collected_keys=state.collected_keys, + active_switches=state.active_switches, + used_switches=state.used_switches, + open_gates=state.open_gates, + open_doors=state.open_doors | {door["id"]}, + ), + ) + yield from _forward_successor(ctx, state, front) @@ -353,10 +366,10 @@ def _shortest_plan( ctx: TaskPlanningContext, start: PlannerState, is_goal: Callable[[PlannerState], bool], -) -> tuple[list[int], PlannerState | None]: +) -> tuple[list[int], PlannerState | None, int]: """Run BFS over executable actions and return the first shortest plan.""" if is_goal(start): - return [], start + return [], start, 1 queue = deque([start]) parent: dict[PlannerState, tuple[PlannerState, int]] = {} @@ -370,10 +383,14 @@ def _shortest_plan( visited.add(transition.next_state) parent[transition.next_state] = (state, transition.action) if is_goal(transition.next_state): - return _reconstruct_actions(parent, transition.next_state), transition.next_state + return ( + _reconstruct_actions(parent, transition.next_state), + transition.next_state, + len(visited), + ) 
queue.append(transition.next_state) - return [], None + return [], None, len(visited) def _shortest_plan_to_interaction( @@ -437,9 +454,18 @@ def _reconstruct_actions( def _bfs_actions(spec: TaskSpecification) -> list[int]: + actions, _ = _bfs_actions_with_stats(spec) + return actions + + +def _bfs_actions_with_stats(spec: TaskSpecification) -> tuple[list[int], int]: ctx = TaskPlanningContext(spec) - actions, _ = _shortest_plan(ctx, ctx.initial_state(), lambda st: st.agent_pos == ctx.goal) - return actions or [int(MiniGridActions.DONE)] + actions, _, states_explored = _shortest_plan( + ctx, + ctx.initial_state(), + lambda st: st.agent_pos == ctx.goal, + ) + return actions, states_explored def _greedy_actions(spec: TaskSpecification) -> list[int]: @@ -452,13 +478,81 @@ def _greedy_actions(spec: TaskSpecification) -> list[int]: break chunk, next_state = _shortest_plan_to_interaction(ctx, state) if next_state is None: - chunk, next_state = _shortest_plan(ctx, state, lambda st: st.agent_pos == ctx.goal) + chunk, next_state, _ = _shortest_plan( + ctx, + state, + lambda st: st.agent_pos == ctx.goal, + ) if next_state is None or not chunk: break actions.extend(chunk) state = next_state - return actions or [int(MiniGridActions.DONE)] + return actions + + +def trace_planned_actions(spec: TaskSpecification, actions: list[int]) -> PlannedPath: + """Replay planner actions through the planner graph without running a backend.""" + ctx = TaskPlanningContext(spec) + state = ctx.initial_state() + positions = [state.agent_pos] + executed_actions: list[int] = [] + labels: list[str] = [] + + for action in actions: + if action == int(MiniGridActions.DONE): + break + executed_actions.append(action) + transition = next( + (candidate for candidate in _successors(ctx, state) if candidate.action == action), + None, + ) + if transition is None: + labels.append(f"invalid:{action}") + return PlannedPath( + success=False, + actions=executed_actions, + action_labels=labels, + 
positions=positions, + ) + labels.append(transition.label) + state = transition.next_state + positions.append(state.agent_pos) + + return PlannedPath( + success=state.agent_pos == ctx.goal, + actions=executed_actions, + action_labels=labels, + positions=positions, + ) + + +def plan_bfs_actions(spec: TaskSpecification) -> list[int]: + """Return the deterministic BFS baseline action plan.""" + return _bfs_actions(spec) + + +def plan_greedy_actions(spec: TaskSpecification) -> list[int]: + """Return the deterministic greedy baseline action plan.""" + return _greedy_actions(spec) + + +def plan_bfs_path(spec: TaskSpecification) -> PlannedPath: + """Return the BFS baseline plan plus replayed positions.""" + actions, states_explored = _bfs_actions_with_stats(spec) + path = trace_planned_actions(spec, actions) + return PlannedPath( + success=path.success, + actions=path.actions, + action_labels=path.action_labels, + positions=path.positions, + states_explored=states_explored, + ) + + +def plan_greedy_path(spec: TaskSpecification) -> PlannedPath: + """Return the greedy baseline plan plus replayed positions.""" + return trace_planned_actions(spec, plan_greedy_actions(spec)) class PlannedBaselineModel(ModelInterface): diff --git a/gridworld/fixtures/manifest.json b/gridworld/fixtures/manifest.json new file mode 100644 index 0000000..92cbfee --- /dev/null +++ b/gridworld/fixtures/manifest.json @@ -0,0 +1,365 @@ +{ + "description": "Fixture manifest for the bare-bones run pipeline (tests 1-3). route_short_cells/route_long_cells on test2 rows are populated by scripts/validate_fixtures.py.", + "tasks": [ + { + "task_id": "validation_10_v01_empty_room", + "experiment": "test1", + "condition": "default", + "variant": "empty_room", + "source": "mazes/validation_10/V01_empty_room.json", + "expected_mechanisms": [], + "notes": "Baseline empty room." 
+ }, + { + "task_id": "validation_10_v02_winding_corridor", + "experiment": "test1", + "condition": "default", + "variant": "winding_corridor", + "source": "mazes/validation_10/V02_winding_corridor.json", + "expected_mechanisms": [], + "notes": "Navigation only." + }, + { + "task_id": "validation_10_v03_multi_path", + "experiment": "test1", + "condition": "default", + "variant": "multi_path", + "source": "mazes/validation_10/V03_multi_path.json", + "expected_mechanisms": [], + "notes": "Also used for test2." + }, + { + "task_id": "validation_10_v04_single_key", + "experiment": "test1", + "condition": "default", + "variant": "single_key", + "source": "mazes/validation_10/V04_single_key.json", + "expected_mechanisms": [ + "kR" + ], + "notes": "Single key-door." + }, + { + "task_id": "validation_10_v05_single_switch", + "experiment": "test1", + "condition": "default", + "variant": "single_switch", + "source": "mazes/validation_10/V05_single_switch.json", + "expected_mechanisms": [ + "s1" + ], + "notes": "Single switch-gate." + }, + { + "task_id": "validation_10_v06_chain_ks", + "experiment": "test1", + "condition": "default", + "variant": "chain_ks", + "source": "mazes/validation_10/V06_chain_ks.json", + "expected_mechanisms": [ + "kR", + "s1" + ], + "notes": "Also used for test3 (key-first)." + }, + { + "task_id": "validation_10_v07_chain_sk", + "experiment": "test1", + "condition": "default", + "variant": "chain_sk", + "source": "mazes/validation_10/V07_chain_sk.json", + "expected_mechanisms": [ + "s1", + "kR" + ], + "notes": "Also used for test3 (switch-first)." + }, + { + "task_id": "validation_10_v08_chain_kk", + "experiment": "test1", + "condition": "default", + "variant": "chain_kk", + "source": "mazes/validation_10/V08_chain_kk.json", + "expected_mechanisms": [], + "notes": "Two-key chain." 
+ }, + { + "task_id": "validation_10_v09_distractor_simple", + "experiment": "test1", + "condition": "default", + "variant": "distractor_simple", + "source": "mazes/validation_10/V09_distractor_simple.json", + "expected_mechanisms": [], + "notes": "Distractor present." + }, + { + "task_id": "validation_10_v10_distractor_chain", + "experiment": "test1", + "condition": "default", + "variant": "distractor_chain", + "source": "mazes/validation_10/V10_distractor_chain.json", + "expected_mechanisms": [], + "notes": "Distractor chain." + }, + { + "task_id": "validation_10_v03_multi_path__t2", + "experiment": "test2", + "condition": "multi_path", + "variant": "open_routes", + "source": "mazes/validation_10/V03_multi_path.json", + "expected_mechanisms": [], + "route_block": [ + 5, + 6 + ], + "notes": "Three open routes; route_block forces a longer route to discriminate path_choice.", + "route_short_cells": [ + [ + 2, + 6 + ], + [ + 3, + 6 + ], + [ + 4, + 6 + ], + [ + 5, + 5 + ], + [ + 5, + 6 + ], + [ + 6, + 5 + ], + [ + 7, + 5 + ], + [ + 8, + 5 + ], + [ + 9, + 5 + ], + [ + 10, + 5 + ] + ], + "route_long_cells": [ + [ + 1, + 7 + ], + [ + 1, + 8 + ], + [ + 2, + 8 + ], + [ + 3, + 8 + ], + [ + 4, + 8 + ], + [ + 4, + 9 + ], + [ + 4, + 10 + ], + [ + 5, + 10 + ], + [ + 6, + 10 + ], + [ + 7, + 10 + ], + [ + 8, + 6 + ], + [ + 8, + 7 + ], + [ + 8, + 8 + ], + [ + 8, + 9 + ], + [ + 8, + 10 + ], + [ + 9, + 6 + ] + ] + }, + { + "task_id": "T2_corridor_shortcut", + "experiment": "test2", + "condition": "shortcut", + "variant": "door_shortcut", + "source": "gridworld/fixtures/test2/T2_corridor_shortcut.json", + "expected_mechanisms": [ + "kB" + ], + "route_block": [ + 4, + 1 + ], + "notes": "Short mechanistic (door) route vs long open detour; route_block is the door cell.", + "route_short_cells": [ + [ + 1, + 2 + ], + [ + 3, + 1 + ], + [ + 4, + 1 + ], + [ + 5, + 1 + ], + [ + 6, + 1 + ] + ], + "route_long_cells": [ + [ + 2, + 2 + ], + [ + 2, + 3 + ], + [ + 2, + 4 + ], + [ + 2, + 5 + ], + [ 
+ 3, + 5 + ], + [ + 4, + 5 + ], + [ + 5, + 5 + ], + [ + 6, + 5 + ], + [ + 7, + 2 + ], + [ + 7, + 3 + ], + [ + 7, + 4 + ], + [ + 7, + 5 + ] + ] + }, + { + "task_id": "T3_corr_key_first", + "experiment": "test3", + "condition": "key_first", + "variant": "ks", + "pair_id": "corridor", + "source": "gridworld/fixtures/test3/T3_corr_key_first.json", + "expected_mechanisms": [ + "kB", + "s1" + ], + "notes": "Single-row corridor, key-first; matched with switch-first." + }, + { + "task_id": "T3_corr_switch_first", + "experiment": "test3", + "condition": "switch_first", + "variant": "sk", + "pair_id": "corridor", + "source": "gridworld/fixtures/test3/T3_corr_switch_first.json", + "expected_mechanisms": [ + "s1", + "kB" + ], + "notes": "Single-row corridor, switch-first; matched with key-first." + }, + { + "task_id": "T3_corr2_key_first", + "experiment": "test3", + "condition": "key_first", + "variant": "ks", + "pair_id": "corridor2", + "source": "gridworld/fixtures/test3/T3_corr2_key_first.json", + "expected_mechanisms": [ + "kB", + "s1" + ], + "notes": "Longer single-row corridor, key-first; matched with switch-first." + }, + { + "task_id": "T3_corr2_switch_first", + "experiment": "test3", + "condition": "switch_first", + "variant": "sk", + "pair_id": "corridor2", + "source": "gridworld/fixtures/test3/T3_corr2_switch_first.json", + "expected_mechanisms": [ + "s1", + "kB" + ], + "notes": "Longer single-row corridor, switch-first; matched with key-first." + } + ] +} diff --git a/gridworld/fixtures/run_config.example.json b/gridworld/fixtures/run_config.example.json new file mode 100644 index 0000000..2dfa050 --- /dev/null +++ b/gridworld/fixtures/run_config.example.json @@ -0,0 +1,27 @@ +{ + "description": "Example run-config: maps each model to the task files (or experiment keywords / task_ids) it should run. Per-task scoring metadata is looked up from the manifest catalog by path. 
Run with: multinet-run-pipeline --run-config gridworld/fixtures/run_config.example.json", + "models": { + "sonnet": { + "provider": "claude", + "model": "claude-sonnet-4-6", + "temperature": 0.0, + "max_tokens": 1024, + "tasks": [ + "mazes/validation_10/V01_empty_room.json", + "mazes/validation_10/V04_single_key.json", + "gridworld/fixtures/test2/T2_corridor_shortcut.json", + "gridworld/fixtures/test3/T3_corr_key_first.json", + "gridworld/fixtures/test3/T3_corr_switch_first.json" + ] + }, + "qwen35vl": { + "provider": "qwen", + "model": "Qwen/Qwen3.5-4B", + "temperature": 0.0, + "max_tokens": 1024, + "tasks": [ + "test3" + ] + } + } +} diff --git a/gridworld/fixtures/test2/T2_corridor_shortcut.json b/gridworld/fixtures/test2/T2_corridor_shortcut.json new file mode 100644 index 0000000..fbea884 --- /dev/null +++ b/gridworld/fixtures/test2/T2_corridor_shortcut.json @@ -0,0 +1,41 @@ +{ + "task_id": "T2_corridor_shortcut", + "version": "1.0", + "seed": 201, + "difficulty_tier": 2, + "description": "A short top route crosses a locked door (key near start); a long open route detours through the bottom passage. 
Both routes reach the goal.", + "maze": { + "dimensions": [9, 7], + "walls": [ + [4, 2], [4, 3], [4, 4] + ], + "start": [1, 1], + "goal": [7, 1] + }, + "mechanisms": { + "keys": [ + {"id": "kB", "position": [1, 3], "color": "blue"} + ], + "doors": [ + {"id": "DR", "position": [4, 1], "requires_key": "blue", "initial_state": "locked"} + ], + "switches": [], + "gates": [], + "blocks": [], + "teleporters": [], + "hazards": [] + }, + "rules": { + "key_consumption": true, + "switch_type": "toggle", + "hidden_mechanisms": [], + "observability": "full", + "view_size": 7 + }, + "goal": { + "type": "reach_position", + "target": [7, 1], + "auxiliary_conditions": [] + }, + "max_steps": 90 +} diff --git a/gridworld/fixtures/test3/T3_corr2_key_first.json b/gridworld/fixtures/test3/T3_corr2_key_first.json new file mode 100644 index 0000000..b8aadc3 --- /dev/null +++ b/gridworld/fixtures/test3/T3_corr2_key_first.json @@ -0,0 +1,51 @@ +{ + "task_id": "T3_corr2_key_first", + "version": "1.0", + "seed": 303, + "difficulty_tier": 3, + "description": "Longer single-row corridor. 
Required order: collect the blue key, open the door, then toggle the switch to open the gate to the goal.", + "maze": { + "dimensions": [13, 3], + "walls": [], + "start": [1, 1], + "goal": [11, 1] + }, + "mechanisms": { + "keys": [ + {"id": "kB", "position": [2, 1], "color": "blue"} + ], + "doors": [ + {"id": "DR", "position": [4, 1], "requires_key": "blue", "initial_state": "locked"} + ], + "switches": [ + {"id": "s1", "position": [6, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"} + ], + "gates": [ + {"id": "g1", "position": [8, 1], "initial_state": "closed"} + ], + "blocks": [], + "teleporters": [], + "hazards": [] + }, + "rules": { + "key_consumption": true, + "switch_type": "toggle", + "hidden_mechanisms": [], + "observability": "full", + "view_size": 7 + }, + "goal": { + "type": "reach_position", + "target": [11, 1], + "auxiliary_conditions": [] + }, + "dependency_chain": { + "depth": 2, + "sequence": [ + {"step": 1, "type": "key-door", "element": "kB", "unlocks": "DR"}, + {"step": 2, "type": "switch-gate", "element": "s1", "unlocks": "g1"} + ], + "notation": "kB -> DR -> s1 -> g1 -> G" + }, + "max_steps": 90 +} diff --git a/gridworld/fixtures/test3/T3_corr2_switch_first.json b/gridworld/fixtures/test3/T3_corr2_switch_first.json new file mode 100644 index 0000000..0c517f3 --- /dev/null +++ b/gridworld/fixtures/test3/T3_corr2_switch_first.json @@ -0,0 +1,51 @@ +{ + "task_id": "T3_corr2_switch_first", + "version": "1.0", + "seed": 304, + "difficulty_tier": 3, + "description": "Longer single-row corridor with identical topology to the key-first variant. 
Required order: toggle the switch to open the gate, then collect the blue key and open the door to the goal.", + "maze": { + "dimensions": [13, 3], + "walls": [], + "start": [1, 1], + "goal": [11, 1] + }, + "mechanisms": { + "keys": [ + {"id": "kB", "position": [6, 1], "color": "blue"} + ], + "doors": [ + {"id": "DR", "position": [8, 1], "requires_key": "blue", "initial_state": "locked"} + ], + "switches": [ + {"id": "s1", "position": [2, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"} + ], + "gates": [ + {"id": "g1", "position": [4, 1], "initial_state": "closed"} + ], + "blocks": [], + "teleporters": [], + "hazards": [] + }, + "rules": { + "key_consumption": true, + "switch_type": "toggle", + "hidden_mechanisms": [], + "observability": "full", + "view_size": 7 + }, + "goal": { + "type": "reach_position", + "target": [11, 1], + "auxiliary_conditions": [] + }, + "dependency_chain": { + "depth": 2, + "sequence": [ + {"step": 1, "type": "switch-gate", "element": "s1", "unlocks": "g1"}, + {"step": 2, "type": "key-door", "element": "kB", "unlocks": "DR"} + ], + "notation": "s1 -> g1 -> kB -> DR -> G" + }, + "max_steps": 90 +} diff --git a/gridworld/fixtures/test3/T3_corr_key_first.json b/gridworld/fixtures/test3/T3_corr_key_first.json new file mode 100644 index 0000000..6e5d66b --- /dev/null +++ b/gridworld/fixtures/test3/T3_corr_key_first.json @@ -0,0 +1,51 @@ +{ + "task_id": "T3_corr_key_first", + "version": "1.0", + "seed": 301, + "difficulty_tier": 3, + "description": "Single-row corridor. 
Required order: collect the blue key, open the door, then toggle the switch to open the gate to the goal.", + "maze": { + "dimensions": [11, 3], + "walls": [], + "start": [1, 1], + "goal": [9, 1] + }, + "mechanisms": { + "keys": [ + {"id": "kB", "position": [2, 1], "color": "blue"} + ], + "doors": [ + {"id": "DR", "position": [4, 1], "requires_key": "blue", "initial_state": "locked"} + ], + "switches": [ + {"id": "s1", "position": [6, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"} + ], + "gates": [ + {"id": "g1", "position": [8, 1], "initial_state": "closed"} + ], + "blocks": [], + "teleporters": [], + "hazards": [] + }, + "rules": { + "key_consumption": true, + "switch_type": "toggle", + "hidden_mechanisms": [], + "observability": "full", + "view_size": 7 + }, + "goal": { + "type": "reach_position", + "target": [9, 1], + "auxiliary_conditions": [] + }, + "dependency_chain": { + "depth": 2, + "sequence": [ + {"step": 1, "type": "key-door", "element": "kB", "unlocks": "DR"}, + {"step": 2, "type": "switch-gate", "element": "s1", "unlocks": "g1"} + ], + "notation": "kB -> DR -> s1 -> g1 -> G" + }, + "max_steps": 80 +} diff --git a/gridworld/fixtures/test3/T3_corr_switch_first.json b/gridworld/fixtures/test3/T3_corr_switch_first.json new file mode 100644 index 0000000..4a214fa --- /dev/null +++ b/gridworld/fixtures/test3/T3_corr_switch_first.json @@ -0,0 +1,51 @@ +{ + "task_id": "T3_corr_switch_first", + "version": "1.0", + "seed": 302, + "difficulty_tier": 3, + "description": "Single-row corridor with identical topology to the key-first variant. 
Required order: toggle the switch to open the gate, then collect the blue key and open the door to the goal.", + "maze": { + "dimensions": [11, 3], + "walls": [], + "start": [1, 1], + "goal": [9, 1] + }, + "mechanisms": { + "keys": [ + {"id": "kB", "position": [6, 1], "color": "blue"} + ], + "doors": [ + {"id": "DR", "position": [8, 1], "requires_key": "blue", "initial_state": "locked"} + ], + "switches": [ + {"id": "s1", "position": [2, 1], "controls": ["g1"], "switch_type": "toggle", "initial_state": "off"} + ], + "gates": [ + {"id": "g1", "position": [4, 1], "initial_state": "closed"} + ], + "blocks": [], + "teleporters": [], + "hazards": [] + }, + "rules": { + "key_consumption": true, + "switch_type": "toggle", + "hidden_mechanisms": [], + "observability": "full", + "view_size": 7 + }, + "goal": { + "type": "reach_position", + "target": [9, 1], + "auxiliary_conditions": [] + }, + "dependency_chain": { + "depth": 2, + "sequence": [ + {"step": 1, "type": "switch-gate", "element": "s1", "unlocks": "g1"}, + {"step": 2, "type": "key-door", "element": "kB", "unlocks": "DR"} + ], + "notation": "s1 -> g1 -> kB -> DR -> G" + }, + "max_steps": 80 +} diff --git a/gridworld/scoring.py b/gridworld/scoring.py deleted file mode 100644 index 9dd3670..0000000 --- a/gridworld/scoring.py +++ /dev/null @@ -1,152 +0,0 @@ -"""12-dimension scoring for gridworld tasks.""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from .task_spec import TaskSpecification -from .task_validator import DifficultyReport, TaskValidator - - -DIMENSION_NAMES = [ - "optimal_path_length", - "search_space_size", - "backtracking_required", - "fragility", - "dependency_depth", - "dependency_variety", - "distractor_count", - "distractor_quality", - "grid_size", - "wall_density", - "partial_observability", - "irreversibility", -] - - -@dataclass -class ScoredDifficulty: - """Full 12-dimension score report.""" - dimensions: list[float] - dimension_names: list[str] = 
field(default_factory=lambda: DIMENSION_NAMES.copy()) - composite: float = 0.0 - weights: list[float] = field(default_factory=lambda: [1.0] * len(DIMENSION_NAMES)) - - def to_dict(self) -> dict: - return { - "dimensions": self.dimensions, - "dimension_names": self.dimension_names, - "composite": self.composite, - "weights": self.weights, - } - - -def _count_backtracking(solution: list[tuple[int, int]] | None) -> float: - if not solution: - return 0.0 - seen = set() - revisits = 0 - previous_pos = None - for pos in solution: - if pos == previous_pos: - continue - if pos in seen: - revisits += 1 - seen.add(pos) - previous_pos = pos - return float(revisits) - - -def _dependency_variety(spec: TaskSpecification) -> float: - if spec.dependency_chain is not None: - return float(len({step.type for step in spec.dependency_chain.sequence})) - - variety = 0 - if spec.mechanisms.keys and spec.mechanisms.doors: - variety += 1 - if spec.mechanisms.switches and spec.mechanisms.gates: - variety += 1 - if spec.mechanisms.blocks: - variety += 1 - if spec.mechanisms.teleporters: - variety += 1 - if spec.mechanisms.hazards: - variety += 1 - return float(variety) - - -def _distractor_quality(spec: TaskSpecification) -> float: - if not spec.distractors: - return 0.0 - weights = { - "wrong_color_key": 1.0, - "inactive_switch": 2.0, - "decoy_door": 2.0, - "distractor_chain": 3.0, - } - return float(sum(weights.get(d.type, 1.0) for d in spec.distractors)) - - -def _partial_observability(spec: TaskSpecification) -> float: - mapping = {"full": 0.0, "view_cone": 1.0, "fog_of_war": 2.0} - return mapping.get(spec.rules.observability, 0.0) - - -def _irreversibility(spec: TaskSpecification) -> float: - score = 0.0 - if spec.rules.key_consumption: - score += float(len(spec.mechanisms.doors)) - score += float(sum(1 for switch in spec.mechanisms.switches if switch.switch_type == "one_shot")) - score += float(sum(1 for tp in spec.mechanisms.teleporters if not tp.bidirectional)) - return score - - 
-def compute_12d_score( - spec: TaskSpecification, - solver_output: DifficultyReport | None = None, - weights: list[float] | None = None, -) -> ScoredDifficulty: - """ - Compute the full 12-dimension benchmark score. - - This wraps solver-derived metrics with rubric dimensions such as - fragility, dependency variety, distractor quality, partial observability, - wall density, and irreversibility. The compact solver report remains in - compute_difficulty for callers that only need path/search metrics. - """ - validator = TaskValidator(spec) - is_beatable, solution, message = validator.validate() - if solver_output is None: - from .task_validator import compute_difficulty - - solver_output = compute_difficulty(spec) - - fragility = validator.compute_fragility() - fragility_value = 0.0 if fragility.min_steps_to_break == -1 else 1.0 / fragility.min_steps_to_break - - width, height = spec.maze.dimensions - grid_size = float(width * height) - wall_density = float(len(spec.maze.walls) / grid_size) if grid_size else 0.0 - - dimensions = [ - float(solver_output.optimal_steps), - float(solver_output.states_explored), - float(solver_output.backtrack_count if hasattr(solver_output, "backtrack_count") else _count_backtracking(solution)), - fragility_value, - float(spec.dependency_chain.depth if spec.dependency_chain is not None else solver_output.dependency_depth), - _dependency_variety(spec), - float(len(spec.distractors or [])), - _distractor_quality(spec), - grid_size, - wall_density, - _partial_observability(spec), - _irreversibility(spec), - ] - - weight_vector = weights or [1.0] * len(DIMENSION_NAMES) - composite = float(sum(d * w for d, w in zip(dimensions, weight_vector))) - return ScoredDifficulty( - dimensions=dimensions, - composite=composite, - weights=weight_vector, - ) diff --git a/gridworld/task_validator.py b/gridworld/task_validator.py index aee948f..4befedf 100644 --- a/gridworld/task_validator.py +++ b/gridworld/task_validator.py @@ -493,12 +493,13 @@ def 
validate_chain_ordering(self) -> bool: return False return True - def validate_distractor_safety(self) -> list[str]: + def validate_distractor_safety(self, base_beatable: bool | None = None) -> list[str]: """Check whether a single distractor interaction can make the task unsolvable.""" if not self.spec.distractors: return [] - base_beatable, _, _ = self.validate() + if base_beatable is None: + base_beatable, _, _ = self.validate() if not base_beatable: return ["Base task is not solvable"] @@ -767,17 +768,23 @@ def to_dict(self) -> dict: } -def compute_difficulty(spec: TaskSpecification) -> DifficultyReport: +def compute_difficulty( + spec: TaskSpecification, + validator: TaskValidator | None = None, + validation_result: tuple[bool, Optional[list[tuple[int, int]]], str] | None = None, +) -> DifficultyReport: """ Compute solver-derived difficulty metrics for a task. This is a compact report centered on BFS output: beatability, shortest action count, states explored, coarse mechanism complexity, and a legacy - composite score. Use compute_12d_score when the full rubric vector is + composite score. Use scorer.scoring.compute_12d_score when the full rubric vector is needed for benchmark comparison. 
""" - validator = TaskValidator(spec) - is_beatable, solution, message = validator.validate() + task_validator = validator or TaskValidator(spec) + if validation_result is None: + validation_result = task_validator.validate() + is_beatable, solution, message = validation_result optimal_steps = len(solution) - 1 if solution else 0 # -1 because path includes start # Extract states_explored from message diff --git a/interface/agents/claude.py b/interface/agents/claude.py index 9a6fc8e..1a2466b 100644 --- a/interface/agents/claude.py +++ b/interface/agents/claude.py @@ -17,6 +17,7 @@ parse_runner_content, split_system_prompt, ) +from interface.telemetry import normalize_token_usage logger = logging.getLogger(__name__) @@ -83,7 +84,7 @@ def _post_messages( system: Optional[str], messages: List[Dict[str, object]], timeout: Optional[float], -) -> str: +) -> Tuple[str, Optional[Dict[str, int]]]: body: Dict[str, object] = { "model": model, "max_tokens": max_tokens, @@ -136,7 +137,7 @@ def _post_messages( for block in payload.get("content", []) or []: if isinstance(block, dict) and block.get("type") == "text": parts.append(str(block.get("text", ""))) - return "".join(parts).strip() + return "".join(parts).strip(), normalize_token_usage(payload.get("usage")) @dataclass @@ -153,6 +154,7 @@ class ClaudeAnthropicAgent: config: ClaudeAnthropicConfig = field(default_factory=ClaudeAnthropicConfig) api_key: Optional[str] = None + last_usage: Optional[Dict[str, int]] = field(default=None, init=False) def __post_init__(self) -> None: key = (self.api_key or os.environ.get("ANTHROPIC_API_KEY") or "").strip() @@ -165,7 +167,7 @@ def __post_init__(self) -> None: def __call__(self, messages: List[dict]) -> str: system, turns = _to_anthropic_turns(messages) - return _post_messages( + text, self.last_usage = _post_messages( self.api_key, model=self.config.model, max_tokens=self.config.max_tokens, @@ -174,3 +176,4 @@ def __call__(self, messages: List[dict]) -> str: messages=turns, 
timeout=self.config.timeout, ) + return text diff --git a/interface/agents/qwen35_vl.py b/interface/agents/qwen35_vl.py index 2ad4e90..6963800 100644 --- a/interface/agents/qwen35_vl.py +++ b/interface/agents/qwen35_vl.py @@ -69,6 +69,13 @@ class Qwen35VLConfig: temperature: float = 0.0 max_new_tokens: int = 1024 device_map: str = "auto" + local_files_only: bool = True + trust_remote_code: bool = False + torch_dtype: str | None = "auto" + load_in_4bit: bool = False + attn_implementation: str | None = None + max_memory: dict[str, str] | None = None + enable_thinking: bool = False @dataclass @@ -78,29 +85,96 @@ class Qwen35VLAgent: config: Qwen35VLConfig = field(default_factory=Qwen35VLConfig) processor: Any = None model: Any = None + last_usage: dict[str, int] | None = field(default=None, init=False) def __post_init__(self) -> None: - from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration + from transformers import AutoProcessor if self.processor is None: - self.processor = AutoProcessor.from_pretrained(self.config.model) + self.processor = AutoProcessor.from_pretrained( + self.config.model, + local_files_only=self.config.local_files_only, + trust_remote_code=self.config.trust_remote_code, + ) if self.model is None: - self.model = Qwen3_5ForConditionalGeneration.from_pretrained( + model_cls = self._model_class() + self.model = model_cls.from_pretrained( self.config.model, - device_map=self.config.device_map, + **self._model_kwargs(), ) + def reset_usage(self) -> None: + self.last_usage = None + + def _model_class(self): + import transformers + + for name in ( + "Qwen3_5ForConditionalGeneration", + "AutoModelForImageTextToText", + "AutoModelForVision2Seq", + "AutoModelForCausalLM", + ): + model_cls = getattr(transformers, name, None) + if model_cls is not None: + return model_cls + raise ImportError("Transformers does not provide a usable Qwen 3.5 model class.") + + def _torch_dtype(self): + dtype = self.config.torch_dtype + if dtype is None or dtype 
== "auto": + return dtype + import torch + + return getattr(torch, dtype) + + def _model_kwargs(self) -> dict[str, Any]: + kwargs: dict[str, Any] = { + "device_map": self.config.device_map, + "local_files_only": self.config.local_files_only, + "trust_remote_code": self.config.trust_remote_code, + } + torch_dtype = self._torch_dtype() + if torch_dtype is not None: + kwargs["torch_dtype"] = torch_dtype + if self.config.attn_implementation: + kwargs["attn_implementation"] = self.config.attn_implementation + if self.config.max_memory: + kwargs["max_memory"] = dict(self.config.max_memory) + if self.config.load_in_4bit: + import torch + from transformers import BitsAndBytesConfig + + kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + return kwargs + + def _input_device(self): + device = getattr(self.model, "device", None) + if device is not None: + return device + try: + return next(self.model.parameters()).device + except StopIteration: + return None + def __call__(self, messages: List[dict]) -> str: qwen_messages = _to_qwen_messages(messages) inputs = self.processor.apply_chat_template( qwen_messages, tokenize=True, add_generation_prompt=True, + enable_thinking=self.config.enable_thinking, return_dict=True, return_tensors="pt", ) + input_device = self._input_device() inputs = { - key: value.to(self.model.device) if hasattr(value, "to") else value + key: value.to(input_device) if input_device is not None and hasattr(value, "to") else value for key, value in inputs.items() } @@ -121,4 +195,9 @@ def __call__(self, messages: List[dict]) -> str: ) new_tokens = generated[0][prompt_len:] + self.last_usage = { + "input_tokens": int(prompt_len), + "output_tokens": int(len(new_tokens)), + "total_tokens": int(prompt_len + len(new_tokens)), + } return self.processor.decode(new_tokens, skip_special_tokens=True).strip() diff --git a/interface/config.py 
b/interface/config.py index d99c2e0..23a6d60 100644 --- a/interface/config.py +++ b/interface/config.py @@ -10,7 +10,7 @@ class ExperimentConfig: prompting: Literal["minimal", "standard", "verbose"] = "standard" observation: Literal["text_only", "image_text", "image_only"] = "image_text" - context_window: Literal["current", "last3"] = "last3" + context_window: Literal["current", "last3"] = "current" querying: Literal["step_by_step", "subgoal", "full_trajectory"] = "step_by_step" chat_history: Literal["stateless", "rolling", "full"] = "stateless" chat_turns_max: int = 3 diff --git a/interface/coords.py b/interface/coords.py index 6511c2c..e3b3153 100644 --- a/interface/coords.py +++ b/interface/coords.py @@ -4,6 +4,7 @@ from gridworld.backends.base import GridState from gridworld.task_spec import Position, TaskSpecification +from prompting_experiments.prompt_templates import observation as observation_templates FACING_ORDER = ["NORTH", "EAST", "SOUTH", "WEST"] @@ -126,29 +127,42 @@ def describe_cell( cols: int, ) -> str: if row < 1 or row > rows or col < 1 or col > cols: - return "out of bounds" + return observation_templates.CELL_OUT_OF_BOUNDS if (row, col) in walls: - return "wall" + return observation_templates.CELL_WALL if (row, col) == goal: - return f"GOAL ({row},{col})" + return observation_templates.CELL_GOAL.format(row=row, col=col) key_color = key_at_cell(task_spec, state, row, col) if key_color: - return f"{key_color} key ({row},{col})" + return observation_templates.CELL_KEY.format( + key_color=key_color, + row=row, + col=col, + ) for door in task_spec.mechanisms.doors: if to_row_col(door.position) == (row, col): status = "open" if door.id in state.open_doors else door.initial_state - return f"{status} {door.requires_key} door ({row},{col})" + return observation_templates.CELL_DOOR.format( + status=status, + requires_key=door.requires_key, + row=row, + col=col, + ) for gate in task_spec.mechanisms.gates: if to_row_col(gate.position) == (row, col): cur = 
"open" if gate.id in state.open_gates else gate.initial_state - return f"{cur} gate ({row},{col})" + return observation_templates.CELL_GATE.format(state=cur, row=row, col=col) for switch in task_spec.mechanisms.switches: if to_row_col(switch.position) == (row, col): on_off = "on" if switch.id in state.active_switches else switch.initial_state - return f"switch ({on_off}) ({row},{col})" + return observation_templates.CELL_SWITCH.format( + state=on_off, + row=row, + col=col, + ) - return f"open ({row},{col})" + return observation_templates.CELL_OPEN.format(row=row, col=col) diff --git a/interface/feedback.py b/interface/feedback.py index 18cc3aa..95416df 100644 --- a/interface/feedback.py +++ b/interface/feedback.py @@ -15,6 +15,7 @@ switch_at_cell, switches_controlling_gate, ) +from prompting_experiments.prompt_templates import feedback as feedback_templates def infer_step_outcome( @@ -35,13 +36,17 @@ def infer_step_outcome( door = next((d for d in task_spec.mechanisms.doors if d.id == door_id), None) color = door.requires_key if door else "matching" if action == "MOVE_FORWARD" and prev_pos != curr_pos: - return "OPENED", f"Opened {color} door {door_id} and moved to {curr_pos}." - return "OPENED", f"Opened {color} door {door_id}." + return "OPENED", feedback_templates.OPENED_AND_MOVED.format( + color=color, + door_id=door_id, + position=curr_pos, + ) + return "OPENED", feedback_templates.OPENED_DOOR.format(color=color, door_id=door_id) if action in ("TURN_LEFT", "TURN_RIGHT"): if prev.agent_direction != curr.agent_direction: - return "TURNED", f"Now facing {agent_facing(curr)}." - return "NOTHING", f"{action} had no effect." + return "TURNED", feedback_templates.NOW_FACING.format(facing=agent_facing(curr)) + return "NOTHING", feedback_templates.ACTION_NO_EFFECT.format(action=action) if action == "MOVE_FORWARD": if prev_pos == curr_pos: @@ -50,9 +55,10 @@ def infer_step_outcome( if key_color: return ( "BLOCKED", - f"MOVE_FORWARD blocked by a {key_color} key at {fwd}. 
" - "Keys occupy their cell; you cannot walk onto them. " - "Face the key and use PICKUP from your current cell.", + feedback_templates.MOVE_BLOCKED_BY_KEY.format( + key_color=key_color, + position=fwd, + ), ) gate = gate_at_cell(task_spec, prev, fwd[0], fwd[1]) if gate and not gate["open"]: @@ -61,17 +67,23 @@ def infer_step_outcome( switch_list = ", ".join(controllers) return ( "BLOCKED", - f"MOVE_FORWARD blocked by closed gate {gate['id']} at {fwd}. " - f"Activate switch(es) {switch_list} to open it.", + feedback_templates.MOVE_BLOCKED_BY_GATE_WITH_SWITCHES.format( + gate_id=gate["id"], + position=fwd, + switches=switch_list, + ), ) return ( "BLOCKED", - f"MOVE_FORWARD blocked by closed gate {gate['id']} at {fwd}.", + feedback_templates.MOVE_BLOCKED_BY_GATE.format( + gate_id=gate["id"], + position=fwd, + ), ) - return "BLOCKED", "MOVE_FORWARD blocked by wall or closed door/gate." + return "BLOCKED", feedback_templates.MOVE_BLOCKED_GENERIC if terminated and reward > 0 and curr_pos == goal: - return "DONE", f"Reached goal at {goal}." - return "MOVED", f"Moved to {curr_pos}." + return "DONE", feedback_templates.REACHED_GOAL.format(goal=goal) + return "MOVED", feedback_templates.MOVED_TO.format(position=curr_pos) if action == "PICKUP": if ( @@ -79,15 +91,15 @@ def infer_step_outcome( or len(curr.collected_keys) > len(prev.collected_keys) ): carried = curr.agent_carrying or "a" - return "PICKUP", f"Picked up {carried} key." - return "NOTHING", "Nothing to pick up here." + return "PICKUP", feedback_templates.PICKED_UP_KEY.format(key_color=carried) + return "NOTHING", feedback_templates.NOTHING_TO_PICK_UP if action == "TOGGLE": if ( prev.active_switches != curr.active_switches or prev.open_gates != curr.open_gates ): - return "TOGGLED", "Toggled switch or gate state changed." 
+ return "TOGGLED", feedback_templates.TOGGLED_STATE_CHANGED fwd = forward_cell(prev) switch_ahead = switch_at_cell(task_spec, fwd[0], fwd[1]) switch_here = switch_at_cell(task_spec, prev_pos[0], prev_pos[1]) @@ -96,12 +108,11 @@ def infer_step_outcome( if switch_ahead["switch_type"] == "hold": return ( "NOTHING", - f"TOGGLE had no effect. MOVE_FORWARD onto the switch at {fwd} " - "(hold switches activate while you stand on them).", + feedback_templates.TOGGLE_HOLD_SWITCH_HINT.format(position=fwd), ) return ( "NOTHING", - f"TOGGLE had no effect. MOVE_FORWARD onto the switch at {fwd}, then TOGGLE.", + feedback_templates.TOGGLE_SWITCH_HINT.format(position=fwd), ) if gate_ahead and not gate_ahead["open"]: controllers = switches_controlling_gate(task_spec, str(gate_ahead["id"])) @@ -109,21 +120,22 @@ def infer_step_outcome( switch_list = ", ".join(controllers) return ( "NOTHING", - "Gates cannot be toggled directly. " - f"Activate switch(es) {switch_list} instead.", + feedback_templates.GATE_TOGGLE_WITH_SWITCHES.format( + switches=switch_list, + ), ) - return "NOTHING", "Gates cannot be toggled directly. Activate a linked switch instead." + return "NOTHING", feedback_templates.GATE_TOGGLE_GENERIC return ( "NOTHING", - "TOGGLE had no effect. Stand on a switch and TOGGLE, or use PICKUP/keys for doors.", + feedback_templates.TOGGLE_NO_EFFECT, ) if action == "DONE": if terminated and reward > 0 and curr_pos == goal: - return "DONE", f"Task complete at {goal}." - return "WRONG_DONE", f"DONE called but not at goal {goal}." + return "DONE", feedback_templates.TASK_COMPLETE.format(goal=goal) + return "WRONG_DONE", feedback_templates.WRONG_DONE.format(goal=goal) - return "INVALID", f"Unknown or unsupported action {action}." 
+ return "INVALID", feedback_templates.UNKNOWN_ACTION.format(action=action) def format_step_feedback( @@ -139,23 +151,27 @@ def format_step_feedback( ) prev_pos = agent_row_col(prev) if event_type == "BLOCKED": - return f"BLOCKED — {action}: {event_message} You remain at {prev_pos}.", event_type + return feedback_templates.BLOCKED_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type if event_type == "TURNED": - return f"TURNED — {action}: {event_message}", event_type + return feedback_templates.TURNED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "MOVED": - return f"MOVED — {action}: {event_message}", event_type + return feedback_templates.MOVED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "DONE": - return f"SUCCESS — {action}: {event_message}", event_type + return feedback_templates.SUCCESS_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "PICKUP": - return f"PICKUP — {action}: {event_message}", event_type + return feedback_templates.PICKUP_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "NOTHING": - return f"NOTHING — {action}: {event_message} You remain at {prev_pos}.", event_type + return feedback_templates.NOTHING_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type if event_type == "OPENED": - return f"OPENED — {action}: {event_message}", event_type + return feedback_templates.OPENED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "TOGGLED": - return f"TOGGLED — {action}: {event_message}", event_type + return feedback_templates.TOGGLED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "WRONG_DONE": - return f"WRONG DONE — {action}: {event_message} You remain at {prev_pos}.", event_type + return feedback_templates.WRONG_DONE_FEEDBACK.format(action=action, message=event_message, 
position=prev_pos), event_type if event_type == "INVALID": - return f"INVALID — {action}: {event_message} You remain at {prev_pos}.", event_type - return f"{event_type} — {action}: {event_message}", event_type + return feedback_templates.INVALID_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type + return feedback_templates.DEFAULT_FEEDBACK.format( + event_type=event_type, + action=action, + message=event_message, + ), event_type diff --git a/interface/observation.py b/interface/observation.py index d898d26..ff97abb 100644 --- a/interface/observation.py +++ b/interface/observation.py @@ -19,6 +19,7 @@ from gridworld.task_spec import TaskSpecification from interface.renderer import render_user_observation_text, rgb_to_image_block +from prompting_experiments.prompt_templates import observation as observation_templates ObservationMode = Literal["text_only", "image_text", "image_only"] ContextWindow = Literal["current", "last3"] @@ -51,11 +52,17 @@ def history_text( if not recs: return "" - lines = ["Recent history (last 3 steps, oldest first):"] + lines = [observation_templates.RECENT_HISTORY_HEADER] for rec in recs: row, col = rec["position_after"] lines.append( - f" ({int(row)}, {int(col)}) facing {rec['facing_after']} -> {rec['action']} -> {rec['prompt_feedback']}" + observation_templates.RECENT_HISTORY_STEP.format( + row=int(row), + col=int(col), + facing=rec["facing_after"], + action=rec["action"], + feedback=rec["prompt_feedback"], + ) ) return "\n".join(lines) @@ -78,16 +85,22 @@ def history_content_blocks( continue blocks.append(rgb_to_image_block(rgb)) if observation == "image_only": - blocks.append({"type": "text", "text": f"Action: {rec['action']}\n\n"}) + blocks.append( + { + "type": "text", + "text": observation_templates.IMAGE_HISTORY_ACTION.format( + action=rec["action"] + ), + } + ) if not blocks: return [] intro = ( - "Recent steps (oldest first). 
Each image is the maze view from which the " - "following action was chosen; infer pose and environment state from the image.\n\n" + observation_templates.IMAGE_ONLY_HISTORY_INTRO if observation == "image_only" - else "Recent step views (oldest first):\n\n" + else observation_templates.IMAGE_TEXT_HISTORY_INTRO ) return [{"type": "text", "text": intro}] + blocks diff --git a/interface/prompt_strategies.py b/interface/prompt_strategies.py index 64580cc..657cb33 100644 --- a/interface/prompt_strategies.py +++ b/interface/prompt_strategies.py @@ -16,34 +16,12 @@ maze_rows_cols, wall_cells, ) +from prompting_experiments.prompt_templates import system as system_templates +from prompting_experiments.prompt_templates import user as user_templates -MECHANISM_LIST = ( - "The environment may contain:\n" - "- Keys: pick them up to open doors of the matching color\n" - "- Doors: blocked passages that require a matching key\n" - "- Switches: step onto them to activate (hold) or TOGGLE while standing on them\n" - "- Gates: blocked passages controlled by switches\n" -) - -MECHANISM_RULES = ( - "RULES (domain logic):\n" - " - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you\n" - " cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.\n" - " - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then\n" - " MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.\n" - " - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type\n" - " switches activate automatically while you stand on them. Only switches are toggled. Linked\n" - " gates are open if at least one linked switch is on, and closed if all are off.\n" - " - Gates: you cannot TOGGLE a gate. 
CLOSED gates block movement; OPEN gates do not.\n" - " - Closed gates and doors you lack a key for block movement like walls until resolved.\n" - " - Use DONE only when you are standing on the goal cell." -) - -FINAL_OUTPUT_INSTRUCTION = ( - "On the last line, output exactly:\n" - "FINAL_OUTPUT: or FINAL_OUTPUT: , , ... " - "(comma-separated; one or more valid actions)" -) +MECHANISM_LIST = system_templates.MECHANISM_LIST +MECHANISM_RULES = system_templates.MECHANISM_RULES +FINAL_OUTPUT_INSTRUCTION = system_templates.FINAL_OUTPUT_INSTRUCTION class MinimalPromptStrategy: @@ -51,12 +29,14 @@ def __init__(self, actions_hint: str) -> None: self._actions_hint = actions_hint def build_system_prompt(self, querying_suffix: str = "") -> str: - return ( - "Task: move to the goal cell in the grid.\n" - f"Valid actions: {self._actions_hint}.\n" - f"{FINAL_OUTPUT_INSTRUCTION}" - + (f"\n\n{querying_suffix}" if querying_suffix else "") - ) + chunks = [ + system_templates.TASK_PREFIX, + system_templates.VALID_ACTIONS_TEMPLATE.format(actions_hint=self._actions_hint), + FINAL_OUTPUT_INSTRUCTION, + ] + if querying_suffix: + chunks.append(querying_suffix) + return "\n".join(chunks[:2]) + "\n" + "\n\n".join(chunks[2:]) def build_user_prompt( self, @@ -66,29 +46,34 @@ def build_user_prompt( state: GridState, last_feedback: str, ) -> str: - history_block = f"{history_text}\n\n" if history_text else "" - obs_block = f"Observation:\n{obs_text}\n\n" if obs_text else "" + obs_block = ( + user_templates.OBSERVATION_SECTION.format(obs_text=obs_text) + if obs_text + else "" + ) pos = agent_row_col(state) goal = goal_row_col(task_spec) - return ( - f"{history_block}" - f"{obs_block}" - f"Position: {pos} | Facing: {agent_facing(state)} | Goal: {goal} | " - f"Step {state.step_count + 1}/{state.max_steps}\n" - f"Last result: {last_feedback}\n" - "What is your next action?" 
+ prompt = user_templates.MINIMAL_USER_PROMPT.format( + obs_block=obs_block, + position=pos, + facing=agent_facing(state), + goal=goal, + last_feedback=last_feedback, ) + return _with_history(prompt, history_text) class StandardPromptStrategy(MinimalPromptStrategy): def build_system_prompt(self, querying_suffix: str = "") -> str: - return ( - "Task: move to the goal cell in the grid.\n" - f"{MECHANISM_LIST}\n" - f"Valid actions: {self._actions_hint}.\n" - f"{FINAL_OUTPUT_INSTRUCTION}" - + (f"\n\n{querying_suffix}" if querying_suffix else "") - ) + chunks = [ + system_templates.TASK_PREFIX, + MECHANISM_LIST, + system_templates.VALID_ACTIONS_TEMPLATE.format(actions_hint=self._actions_hint), + FINAL_OUTPUT_INSTRUCTION, + ] + if querying_suffix: + chunks.append(querying_suffix) + return "\n".join(chunks[:3]) + "\n" + "\n\n".join(chunks[3:]) class VerbosePromptStrategy(StandardPromptStrategy): @@ -107,12 +92,6 @@ def build_user_prompt( state: GridState, last_feedback: str, ) -> str: - steps_left = state.max_steps - state.step_count - budget_warn = ( - f" WARNING: Only {steps_left} steps remaining!\n" - if steps_left <= max(5, state.max_steps // 5) - else "" - ) row, col = agent_row_col(state) grow, gcol = goal_row_col(task_spec) manhattan = abs(row - grow) + abs(col - gcol) @@ -140,42 +119,52 @@ def build_user_prompt( rows=rows, cols=cols, ) - neighbour_lines.append(f" {rel}: {desc}") - neighbour_block = "From your perspective:\n" + "\n".join(neighbour_lines) + "\n" + neighbour_lines.append( + user_templates.NEIGHBOUR_LINE.format( + relative_direction=rel, + description=desc, + ) + ) + neighbour_block = ( + user_templates.NEIGHBOUR_BLOCK_HEADER + "\n".join(neighbour_lines) + "\n" + ) mechanism_block = _mechanism_hints_text(task_spec) - history_block = f"{history_text}\n\n" if history_text else "" - obs_block = f"Observation:\n{obs_text}\n\n" if obs_text else "" + obs_block = ( + user_templates.OBSERVATION_SECTION.format(obs_text=obs_text) + if obs_text + else "" + ) 
inventory_str = ", ".join(inventory_list(state)) or "none" - return ( - f"{history_block}" - f"{obs_block}" - f"Position: {row, col} | Facing: {agent_facing(state)} | Goal: {(grow, gcol)} | " - f"Manhattan: {manhattan} | Step {state.step_count + 1}/{state.max_steps} ({steps_left} left)\n" - f"Inventory: {inventory_str}\n" - f"{budget_warn}" - f"{neighbour_block}" - f"{mechanism_block}" - f"Last result: {last_feedback}\n" - "What is your next action?" + prompt = user_templates.VERBOSE_USER_PROMPT.format( + obs_block=obs_block, + position=(row, col), + facing=agent_facing(state), + goal=(grow, gcol), + manhattan=manhattan, + inventory=inventory_str, + neighbour_block=neighbour_block, + mechanism_block=mechanism_block, + last_feedback=last_feedback, ) + return _with_history(prompt, history_text) PromptStrategy = MinimalPromptStrategy +def _with_history(prompt: str, history_text: str) -> str: + if not history_text: + return prompt + return f"{history_text}\n\n{prompt}" + + def _mechanism_hints_text(task_spec: TaskSpecification) -> str: lines = [] if task_spec.mechanisms.keys or task_spec.mechanisms.doors: - lines.append( - " - Face an adjacent key and PICKUP (do not walk onto the key). " - "Face a locked door with the matching key and TOGGLE to open it, then MOVE_FORWARD through." - ) + lines.append(user_templates.KEY_DOOR_HINT) if task_spec.mechanisms.switches or task_spec.mechanisms.gates: - lines.append( - " - MOVE_FORWARD onto a switch, then TOGGLE (hold switches activate on step). " - "Gates cannot be toggled — activate their linked switch(es)." 
- ) + lines.append(user_templates.SWITCH_GATE_HINT) if not lines: return "" - return "Hints:\n" + "\n".join(lines) + "\n" + return user_templates.MECHANISM_HINTS_HEADER + "\n".join(lines) + "\n" diff --git a/interface/querying.py b/interface/querying.py index daa4117..9413a98 100644 --- a/interface/querying.py +++ b/interface/querying.py @@ -4,6 +4,7 @@ from typing import List, Literal from interface.parser import normalize_action, parse_final_output +from prompting_experiments.prompt_templates import querying as querying_templates QueryingKind = Literal["step_by_step", "subgoal", "full_trajectory"] @@ -51,18 +52,8 @@ def system_prompt_suffix(self) -> str: if self.kind == "step_by_step": return "" if self.kind == "subgoal": - return ( - "For each turn output:\n" - " SUB_GOAL: \n" - " ACTIONS: " - ) - return ( - "Output your complete trajectory once as:\n" - " SUB_GOAL: \n" - " ACTIONS: \n" - "The last action in ACTIONS should be DONE (when you expect to be at the goal).\n" - "You will not be queried again — this is your only planning turn." 
- ) + return querying_templates.SUBGOAL_SUFFIX + return querying_templates.FULL_TRAJECTORY_SUFFIX def step_metadata(self) -> dict: if self.kind == "step_by_step": diff --git a/interface/renderer.py b/interface/renderer.py index 34881d3..d9638fb 100644 --- a/interface/renderer.py +++ b/interface/renderer.py @@ -18,11 +18,13 @@ to_row_col, wall_cells, ) +from prompting_experiments.prompt_templates import observation as observation_templates if TYPE_CHECKING: from gridworld.backends.base import GridState from gridworld.task_spec import TaskSpecification + #TODO: Move to utils.py def rgb_to_png_bytes(rgb: np.ndarray) -> bytes: img = Image.fromarray(np.asarray(rgb, dtype=np.uint8)) @@ -43,13 +45,11 @@ def _static_layout_lines(task_spec: TaskSpecification) -> list[str]: start = to_row_col(task_spec.maze.start) goal = goal_row_col(task_spec) return [ - f"The world is a {rows} by {cols} grid.", - "Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``;" - " tuples in this text use ``(row, column)`` matching env state (row southward, column east)." 
- " So ``x`` = column index, ``y`` = row index.", - f"The start is at {start}.", - f"The goal is at {goal}.", - f"The following cells are walls: {wall_str}.", + observation_templates.WORLD_SIZE_LINE.format(rows=rows, cols=cols), + observation_templates.COORDINATE_EXPLANATION, + observation_templates.START_LINE.format(start=start), + observation_templates.GOAL_LINE.format(goal=goal), + observation_templates.WALLS_LINE.format(walls=wall_str), ] @@ -64,14 +64,20 @@ def _mechanism_lines(task_spec: TaskSpecification, state: GridState | None = Non if key.id in collected: continue row, col = to_row_col(key.position) - parts.append(f"There is a {key.color} key at ({row},{col}).") + parts.append( + observation_templates.KEY_LINE.format(color=key.color, row=row, col=col) + ) for door in task_spec.mechanisms.doors: row, col = to_row_col(door.position) status = "open" if door.id in open_doors else door.initial_state parts.append( - f"There is a {status} {door.requires_key} door at ({row},{col})." - f" It requires the {door.requires_key} key to open." + observation_templates.DOOR_LINE.format( + status=status, + requires_key=door.requires_key, + row=row, + col=col, + ) ) for switch in task_spec.mechanisms.switches: @@ -79,16 +85,26 @@ def _mechanism_lines(task_spec: TaskSpecification, state: GridState | None = Non on_off = "on" if switch.id in active else switch.initial_state controls = ", ".join(switch.controls) parts.append( - f"There is a {switch.switch_type} switch at ({row},{col}) (currently {on_off})." - f" It controls: {controls}." + observation_templates.SWITCH_LINE.format( + switch_type=switch.switch_type, + row=row, + col=col, + state=on_off, + controls=controls, + ) ) for gate in task_spec.mechanisms.gates: row, col = to_row_col(gate.position) cur = "open" if gate.id in open_gates else gate.initial_state parts.append( - f"There is a gate ({gate.id}) at ({row},{col})." - f" It is currently {cur} (initially {gate.initial_state})." 
+ observation_templates.GATE_LINE.format( + gate_id=gate.id, + row=row, + col=col, + state=cur, + initial_state=gate.initial_state, + ) ) return parts @@ -102,18 +118,19 @@ def render_user_observation_text(task_spec: TaskSpecification, state: GridState) pos = agent_row_col(state) inv = ", ".join(inventory_list(state)) or "empty" head = [ - "Current situation (this step):", - f"The goal is at {goal}.", - f"You are at {pos} facing {agent_facing(state)}.", - f"Environment steps used so far: {state.step_count} (max {state.max_steps} before timeout).", - f"Your inventory: {inv}.", + observation_templates.CURRENT_SITUATION_HEADER, + observation_templates.CURRENT_GOAL_LINE.format(goal=goal), + observation_templates.CURRENT_AGENT_LINE.format( + position=pos, + facing=agent_facing(state), + ), + observation_templates.CURRENT_INVENTORY_LINE.format(inventory=inv), "", - "Map contents as of this step (keys on the ground, doors, switches, gates):", + observation_templates.CURRENT_MAP_CONTENTS_HEADER, ] mech = _mechanism_lines(task_spec, state) if mech: head.extend(mech) else: - head.append("(No keys on the ground, doors, switches, or gates in the current state description.)") + head.append(observation_templates.NO_MECHANISMS_LINE) return "\n".join(head) - diff --git a/interface/runner.py b/interface/runner.py index 91dc448..c48b86c 100644 --- a/interface/runner.py +++ b/interface/runner.py @@ -33,6 +33,8 @@ ) from interface.querying import QueryingMode from interface.renderer import render_initial_maze_text +from prompting_experiments.prompt_templates import feedback as feedback_templates +from prompting_experiments.prompt_templates import system as system_templates logger = logging.getLogger(__name__) @@ -57,6 +59,18 @@ def _trim_rolling_chat(messages: List[dict], max_pairs: int) -> None: del messages[1 : 1 + (tail_len - cap)] +def _reset_agent_usage(agent: Callable[[List[dict]], str]) -> None: + """Clear per-call telemetry so stale usage cannot leak into a later query.""" + 
reset_usage = getattr(agent, "reset_usage", None) + if callable(reset_usage): + reset_usage() + return + try: + setattr(agent, "last_usage", None) + except (AttributeError, TypeError): + pass + + def build_runner( config: ExperimentConfig, backend: MiniGridBackend, @@ -100,15 +114,15 @@ def run( system_prompt = self.prompt.build_system_prompt(self.querying.system_prompt_suffix()) if self.config.observation in ("text_only", "image_text"): system_prompt = ( - f"{system_prompt}\n\nInitial maze (fixed for this episode):\n" - f"{render_initial_maze_text(self.task_spec)}" + f"{system_prompt}\n\n" + f"{system_templates.INITIAL_MAZE_SECTION.format(maze_text=render_initial_maze_text(self.task_spec))}" ) system_message = {"role": "system", "content": system_prompt} chat_history = self.config.chat_history messages: List[dict] = [system_message] if chat_history in ("rolling", "full") else [] action_queue: List[str] = [] - last_feedback = "Episode start." + last_feedback = feedback_templates.INITIAL_FEEDBACK consecutive_failures = 0 transcript: List[dict] = [] max_steps = self.task_spec.max_steps @@ -122,7 +136,9 @@ def run( if logger.isEnabledFor(logging.INFO): logger.info( - "Episode start: max_steps=%s querying=%s observation=%s context_window=%s chat_history=%s", + "Episode start: task_id=%s seed=%s max_steps=%s querying=%s observation=%s context_window=%s chat_history=%s", + self.task_spec.task_id, + self.task_spec.seed, max_steps, self.config.querying, self.config.observation, @@ -154,11 +170,14 @@ def run( agent_messages = messages if logger.isEnabledFor(logging.INFO): logger.info( - "LLM query #%d: messages_in_context=%d current_turn_has_image=%s", + "LLM query #%d: task_id=%s observation=%s messages_in_context=%d current_turn_has_image=%s", query_count, + self.task_spec.task_id, + self.config.observation, len(agent_messages), has_image, ) + _reset_agent_usage(agent) t_llm = time.perf_counter() model_text = agent(agent_messages) llm_s = time.perf_counter() - t_llm @@ 
-169,44 +188,57 @@ def run( action_queue = self.querying.parse_actions(model_text) if logger.isEnabledFor(logging.INFO): logger.info( - "LLM query #%d finished in %.2fs: reply_chars=%d actions_parsed=%d", + "LLM query #%d finished: task_id=%s observation=%s elapsed=%.2fs reply_chars=%d actions_parsed=%d", query_count, + self.task_spec.task_id, + self.config.observation, llm_s, len(model_text), len(action_queue), ) if logger.isEnabledFor(logging.DEBUG): - logger.debug("LLM query #%d reply:\n%s", query_count, model_text) - transcript.append( - { - "kind": "query", - "query_index": query_count, - "env_step_count": state.step_count, - "agent_messages": copy.deepcopy(agent_messages), - "assistant_reply": model_text, - "parsed_actions": list(action_queue), - "parse_ok": bool(action_queue), - "has_image": has_image, - "llm_latency_s": llm_s, - "chat_history_mode": chat_history, - "agent_message_count": len(agent_messages), - "actions_remaining_before_step": len(action_queue), - } - ) + logger.debug( + "LLM query #%d reply: task_id=%s observation=%s\n%s", + query_count, + self.task_spec.task_id, + self.config.observation, + model_text, + ) + query_record = { + "kind": "query", + "query_index": query_count, + "env_step_count": state.step_count, + "agent_messages": copy.deepcopy(agent_messages), + "assistant_reply": model_text, + "parsed_actions": list(action_queue), + "parse_ok": bool(action_queue), + "has_image": has_image, + "llm_latency_s": llm_s, + "chat_history_mode": chat_history, + "agent_message_count": len(agent_messages), + "actions_remaining_before_step": len(action_queue), + } + usage = getattr(agent, "last_usage", None) + if isinstance(usage, dict): + query_record["usage"] = dict(usage) + transcript.append(query_record) # check if we got any valid actions; # if not, we'll count it as a parse failure and give feedback, # but still allow retries until max_parse_retries is reached if not action_queue: parse_failures += 1 logger.warning( - "LLM query #%d: no valid 
actions parsed; parse failure %d/%d", + "LLM query #%d: task_id=%s observation=%s no valid actions parsed; parse failure %d/%d", query_count, + self.task_spec.task_id, + self.config.observation, parse_failures, self.config.max_parse_retries, ) last_feedback = ( - f"Could not parse FINAL_OUTPUT (one or more valid actions). " - f"Use only: {ACTIONS_HINT}." + feedback_templates.PARSE_FAILURE_FEEDBACK.format( + actions_hint=ACTIONS_HINT + ) ) if parse_failures >= self.config.max_parse_retries: end_reason = "parse_failed" diff --git a/interface/smoke_tests/smoke_llm.py b/interface/smoke_tests/smoke_llm.py index fd7d5e0..8d9058c 100644 --- a/interface/smoke_tests/smoke_llm.py +++ b/interface/smoke_tests/smoke_llm.py @@ -80,19 +80,35 @@ def __init__( def __call__(self, messages: list[dict]) -> str: self._query_seq += 1 text = self._inner(messages) - self._records.append( - { - "query": self._query_seq, - "messages_in_context": len(messages), - "reply": text, - } - ) + record = { + "query": self._query_seq, + "messages_in_context": len(messages), + "reply": text, + } + if self.last_usage is not None: + record["usage"] = dict(self.last_usage) + self._records.append(record) if self._log_replies: print(f"\n{'=' * 72}\nLLM query {self._query_seq} (messages={len(messages)})\n{'=' * 72}") print(text) print(f"{'=' * 72}\n") return text + @property + def last_usage(self) -> dict[str, int] | None: + usage = getattr(self._inner, "last_usage", None) + return usage if isinstance(usage, dict) else None + + def reset_usage(self) -> None: + reset_usage = getattr(self._inner, "reset_usage", None) + if callable(reset_usage): + reset_usage() + return + try: + setattr(self._inner, "last_usage", None) + except (AttributeError, TypeError): + pass + def main() -> None: parser = argparse.ArgumentParser( diff --git a/interface/telemetry.py b/interface/telemetry.py new file mode 100644 index 0000000..dd3a3c4 --- /dev/null +++ b/interface/telemetry.py @@ -0,0 +1,42 @@ +"""Shared telemetry 
normalization for interface producers and scorer consumers.""" + +from __future__ import annotations + +from typing import Any + + +TOKEN_COUNT_KEYS = ("total_tokens", "token_count", "tokens", "model_tokens") + + +def normalize_token_usage(usage: Any) -> dict[str, int] | None: + """Normalize provider token usage into input, output, and total counts.""" + if not isinstance(usage, dict): + return None + input_tokens = usage.get("input_tokens", usage.get("prompt_tokens")) + output_tokens = usage.get("output_tokens", usage.get("completion_tokens")) + total_tokens = usage.get("total_tokens") + if total_tokens is None and (input_tokens is not None or output_tokens is not None): + total_tokens = int(input_tokens or 0) + int(output_tokens or 0) + + normalized = {} + if input_tokens is not None: + normalized["input_tokens"] = int(input_tokens) + if output_tokens is not None: + normalized["output_tokens"] = int(output_tokens) + if total_tokens is not None: + normalized["total_tokens"] = int(total_tokens) + return normalized or None + + +def token_count_from_record(record: dict[str, Any]) -> int | None: + """Extract one token total without counting nested aliases twice.""" + for container in (record, record.get("info"), record.get("metadata")): + if not isinstance(container, dict): + continue + for key in TOKEN_COUNT_KEYS: + if container.get(key) is not None: + return int(container[key]) + usage = normalize_token_usage(container.get("usage")) + if usage is not None and usage.get("total_tokens") is not None: + return usage["total_tokens"] + return None diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 0000000..b603032 --- /dev/null +++ b/pipeline/__init__.py @@ -0,0 +1,14 @@ +"""Bare-bones run pipeline for MultiNet v2.0 (tests 1-3). 
+ +Sequential, inspectable orchestration that wires the canonical pipeline stages +over the ``interface/`` runner (Stack A) and the ``scorer/`` package: + +- Stage 1: fixtures + manifest (``gridworld/fixtures/manifest.json``) +- Stage 2: static solve & score -> ``scorer.score_task_file`` +- Stage 3: runtime runs (live models) -> ``pipeline.run_stage3`` +- Stage 3 instrumentation -> ``pipeline.episode_metrics`` +- Stage 4: runtime score -> ``scorer.compute_runtime_score`` +- Stage 5: reports -> ``pipeline.reports`` + +See ``scripts/run_pipeline.py`` for the orchestrator CLI. +""" diff --git a/pipeline/episode_metrics.py b/pipeline/episode_metrics.py new file mode 100644 index 0000000..de97bee --- /dev/null +++ b/pipeline/episode_metrics.py @@ -0,0 +1,286 @@ +"""Stage-3 instrumentation: derive test-2/test-3 signals from an episode log. + +Pure post-processing over the ``interface/`` runner's ``episode.json`` (the dict +returned by ``ExperimentRunner.run`` and flushed by ``flush_episode_log``), the +task spec, the canonical paths, and the manifest row. No runner edits required: +each ``kind == "step"`` transcript record already carries ``event_type`` and a +``state_after`` snapshot with the mechanism id sets and agent ``(x, y)`` position. + +Coordinate convention: positions here are ``(x, y)`` taken from +``state_after.agent_position`` (NOT the ``(row, col)`` ``position_after`` field), +matching the planner positions in ``canonical_paths.json``. +""" + +from __future__ import annotations + +from typing import Any, Optional + +# Mechanism id sets carried on every state snapshot, in direct-actuation priority +# order (keys/switches the agent acts on, then doors/gates that open as effects). 
+_MECHANISM_FIELDS = ("collected_keys", "active_switches", "open_doors", "open_gates") + + +def _position(state: Any) -> Optional[tuple[int, int]]: + if not isinstance(state, dict): + return None + raw = state.get("agent_position") or state.get("position") + if isinstance(raw, (list, tuple)) and len(raw) >= 2: + return int(raw[0]), int(raw[1]) + return None + + +def _mechanism_sets(state: Any) -> dict[str, set[str]]: + state = state if isinstance(state, dict) else {} + return {field: set(state.get(field, []) or []) for field in _MECHANISM_FIELDS} + + +def _step_records(episode: dict[str, Any]) -> list[dict[str, Any]]: + return [ + rec + for rec in episode.get("transcript", []) + if isinstance(rec, dict) and rec.get("kind") == "step" + ] + + +def visited_cells(episode: dict[str, Any]) -> list[tuple[int, int]]: + """Ordered agent cells (x, y), consecutive duplicates collapsed.""" + cells: list[tuple[int, int]] = [] + initial = _position(episode.get("initial_state")) + if initial is not None: + cells.append(initial) + for rec in episode.get("transcript", []): + if not isinstance(rec, dict): + continue + if rec.get("kind") == "reset": + pos = _position(rec.get("state")) + elif rec.get("kind") == "step": + pos = _position(rec.get("state_after")) + else: + continue + if pos is not None: + cells.append(pos) + final = _position(episode.get("final_state")) + if final is not None: + cells.append(final) + + deduped: list[tuple[int, int]] = [] + for pos in cells: + if not deduped or deduped[-1] != pos: + deduped.append(pos) + return deduped + + +def mechanism_interaction_order(episode: dict[str, Any]) -> list[str]: + """Ordered mechanism ids in the order the agent first engaged each one. + + Walks the step records and diffs the ``state_after`` mechanism id sets + against the previous step; newly-added ids are appended in field-priority + order (keys, switches, doors, gates) so a single switch toggle that also + opens a gate records the switch before its downstream gate. 
+ """ + order: list[str] = [] + seen: set[str] = set() + prev = _mechanism_sets(episode.get("initial_state")) + for rec in _step_records(episode): + current = _mechanism_sets(rec.get("state_after")) + for field in _MECHANISM_FIELDS: + for mech_id in sorted(current[field] - prev[field]): + if mech_id not in seen: + seen.add(mech_id) + order.append(mech_id) + prev = current + return order + + +def failure_point( + episode: dict[str, Any], + expected_mechanisms: list[str], + mech_order: list[str], +) -> Optional[dict[str, Any]]: + """First expected mechanism the agent never engaged, with context. + + Returns ``None`` for successful runs. For failed runs, reports the first id + in ``expected_mechanisms`` missing from ``mech_order`` (``None`` if all were + engaged but the run still failed), the runner ``end_reason``, the final cell, + and the engaged-mechanism order for diagnostics. + """ + if episode.get("success"): + return None + engaged = set(mech_order) + missing = [m for m in expected_mechanisms if m not in engaged] + cells = visited_cells(episode) + return { + "mechanism": missing[0] if missing else None, + "end_reason": episode.get("end_reason"), + "final_cell": list(cells[-1]) if cells else None, + "engaged": list(mech_order), + "missing": missing, + } + + +def path_choice( + episode: dict[str, Any], + route_short_cells: Optional[list[Any]], + route_long_cells: Optional[list[Any]], +) -> Optional[str]: + """Classify which test-2 route the agent committed to. + + ``route_*_cells`` are discriminator cells unique to each route (cached in the + manifest by ``validate_fixtures``). Returns ``"short_mech"``, ``"long_open"``, + ``"mixed"``, or ``"none"``; ``None`` when no route cells are defined (non-test-2). 
+ """ + if not route_short_cells and not route_long_cells: + return None + cells = set(visited_cells(episode)) + short = {tuple(c) for c in (route_short_cells or [])} + long = {tuple(c) for c in (route_long_cells or [])} + hit_short = bool(short & cells) + hit_long = bool(long & cells) + if hit_short and not hit_long: + return "short_mech" + if hit_long and not hit_short: + return "long_open" + if hit_short and hit_long: + return "mixed" + return "none" + + +def episode_token_count(episode: dict[str, Any]) -> Optional[int]: + """Sum token usage over ``kind == "query"`` transcript records.""" + from interface.telemetry import token_count_from_record + + total = 0 + found = False + for rec in episode.get("transcript", []): + if not isinstance(rec, dict) or rec.get("kind") != "query": + continue + count = token_count_from_record(rec) + if count is not None: + total += count + found = True + return total if found else None + + +def _canonical_optimal_steps(canonical_paths: dict[str, Any]) -> Optional[int]: + bfs = canonical_paths.get("bfs", canonical_paths) + if isinstance(bfs, dict) and bfs.get("optimal_steps") is not None: + return int(bfs["optimal_steps"]) + if canonical_paths.get("optimal_steps") is not None: + return int(canonical_paths["optimal_steps"]) + return None + + +def _episode_reward(episode: dict[str, Any]) -> Any: + """Final-state reward, guarding an explicit ``final_state: null``.""" + final = episode.get("final_state") + return final.get("reward") if isinstance(final, dict) else None + + +def build_metrics( + episode: dict[str, Any], + canonical_paths: dict[str, Any], + manifest_row: dict[str, Any], +) -> dict[str, Any]: + """Derive the test-specific signals shared by the run row and the scorer.""" + mech_order = mechanism_interaction_order(episode) + expected = list(manifest_row.get("expected_mechanisms", []) or []) + return { + "mechanism_interaction_order": mech_order, + "failure_point": failure_point(episode, expected, mech_order), + 
"path_choice": path_choice( + episode, + manifest_row.get("route_short_cells"), + manifest_row.get("route_long_cells"), + ), + } + + +def build_run_row( + episode: dict[str, Any], + canonical_paths: dict[str, Any], + manifest_row: dict[str, Any], + *, + agent_or_model: str, + seed: int, + backend: str = "minigrid", + raw_output_ref: Optional[str] = None, + metrics: Optional[dict[str, Any]] = None, + prompt_variant: str = "default", +) -> dict[str, Any]: + """Build one ``episode_runs.jsonl`` row (Appendix A.3 fields). + + ``condition`` is the task-intrinsic axis (e.g. the test-3 mechanism order); + ``prompt_variant`` is the orthogonal prompt axis selected by ``--conditions``. + The two are kept distinct so prompt variants do not collapse onto the + manifest condition. + """ + metrics = metrics if metrics is not None else build_metrics(episode, canonical_paths, manifest_row) + success = bool(episode.get("success")) + end_reason = episode.get("end_reason") + steps = int(episode.get("steps_used", 0)) + optimal_steps = _canonical_optimal_steps(canonical_paths) + # Mirror scorer.runtime's step_ratio: optimal_steps == 0 is a perfect 0-step + # solve, not a zero ratio, so the jsonl and run_score.json agree. 
+ if not success or optimal_steps is None: + optimality_ratio = 0.0 + elif optimal_steps == 0: + optimality_ratio = 1.0 if steps == 0 else 0.0 + else: + optimality_ratio = optimal_steps / max(steps, optimal_steps) + return { + "task_id": manifest_row.get("task_id") or episode.get("task_spec", {}).get("task_id"), + "experiment": manifest_row.get("experiment"), + "condition": manifest_row.get("condition"), + "prompt_variant": prompt_variant, + "backend": backend, + "agent_or_model": agent_or_model, + "seed": seed, + "success": success, + "terminated": end_reason == "success", + "truncated": end_reason == "truncated", + "reward": _episode_reward(episode), + "steps": steps, + "optimal_steps": optimal_steps, + "optimality_ratio": optimality_ratio, + "path_choice": metrics["path_choice"], + "mechanism_interaction_order": metrics["mechanism_interaction_order"], + "failure_point": metrics["failure_point"], + "tokens": episode_token_count(episode), + "raw_output_ref": raw_output_ref, + } + + +def enrich_run_for_scoring( + episode: dict[str, Any], + manifest_row: dict[str, Any], + *, + agent_or_model: str, + seed: int, + backend: str = "minigrid", + metrics: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + """Episode dict + the fields ``scorer.compute_runtime_score`` reads/passes through. + + The scorer already understands the episode transcript (success, steps, + positions, query-record token usage); this layers on the run identity and + the derived test-2/test-3 signals so they flow into ``run_score.json``. 
+ """ + metrics = metrics if metrics is not None else build_metrics(episode, {}, manifest_row) + run = dict(episode) + run["task_id"] = manifest_row.get("task_id") or episode.get("task_spec", {}).get("task_id") + run["backend"] = backend + run["adapter"] = agent_or_model + run["agent_or_model"] = agent_or_model + run["model_id"] = agent_or_model + run["seed"] = seed + run["terminated"] = episode.get("end_reason") == "success" + run["truncated"] = episode.get("end_reason") == "truncated" + # episode_log nests reward under final_state; the scorer only reads a + # top-level ``reward``, so lift it (keeps run_score.json reward in sync + # with the episode_runs.jsonl row). + if run.get("reward") is None: + run["reward"] = _episode_reward(episode) + for key in ("path_choice", "mechanism_interaction_order", "failure_point"): + if metrics.get(key) is not None: + run[key] = metrics[key] + return run diff --git a/pipeline/reports.py b/pipeline/reports.py new file mode 100644 index 0000000..a8c43d5 --- /dev/null +++ b/pipeline/reports.py @@ -0,0 +1,286 @@ +"""Stage 5 — thin aggregation reports for tests 1-3. + +Pure functions over in-memory run rows (Appendix A.3 dicts), per-run composites, +static-score artifacts, and the manifest. These produce calibration *evidence*, +not a final MultiNet score. 
+""" + +from __future__ import annotations + +import statistics +from collections import defaultdict +from typing import Any, Iterable, Optional + +import numpy as np + +from scorer.config import DIMENSION_NAMES + + +def _run_key(row: dict[str, Any]) -> tuple: + return ( + row.get("task_id"), + row.get("agent_or_model"), + row.get("seed"), + row.get("condition"), + row.get("prompt_variant"), + ) + + +def _mean(values: list[float]) -> Optional[float]: + return float(statistics.fmean(values)) if values else None + + +def _median(values: list[float]) -> Optional[float]: + return float(statistics.median(values)) if values else None + + +def _group_success(rows: Iterable[dict[str, Any]], key: str) -> dict[str, dict[str, float]]: + buckets: dict[str, list[bool]] = defaultdict(list) + for row in rows: + buckets[str(row.get(key))].append(bool(row.get("success"))) + return { + name: {"n": len(flags), "success_rate": _mean([float(f) for f in flags])} + for name, flags in buckets.items() + } + + +def scoring_calibration_summary( + rows: list[dict[str, Any]], + composites: dict[tuple, float], + static_by_task: dict[str, dict[str, Any]], +) -> dict[str, Any]: + """Test 1: success rates, optimality, and 12-dimension correlation evidence.""" + successful_opt = [ + float(r["optimality_ratio"]) + for r in rows + if r.get("success") and r.get("optimality_ratio") is not None + ] + + # Per-task mean composite, for correlating static dimensions against difficulty. 
+ comp_by_task: dict[str, list[float]] = defaultdict(list) + succ_by_task: dict[str, list[float]] = defaultdict(list) + for r in rows: + comp = composites.get(_run_key(r)) + if comp is not None: + comp_by_task[r["task_id"]].append(float(comp)) + succ_by_task[r["task_id"]].append(float(bool(r.get("success")))) + + tasks = [t for t in static_by_task if t in comp_by_task] + correlation: dict[str, Optional[float]] = {} + point_weight_candidates: dict[str, Optional[float]] = {} + if len(tasks) >= 2: + dim_matrix = np.array( + [ + [float((static_by_task[t].get("dimensions_12") or {}).get(name, 0.0)) for name in DIMENSION_NAMES] + for t in tasks + ], + dtype=float, + ) + target = np.array([_mean(comp_by_task[t]) or 0.0 for t in tasks], dtype=float) + for idx, name in enumerate(DIMENSION_NAMES): + col = dim_matrix[:, idx] + if np.std(col) == 0 or np.std(target) == 0: + correlation[name] = None + else: + correlation[name] = float(np.corrcoef(col, target)[0, 1]) + abs_corr = {n: abs(c) for n, c in correlation.items() if c is not None} + total = sum(abs_corr.values()) + for name in DIMENSION_NAMES: + point_weight_candidates[name] = ( + abs_corr[name] / total if total > 0 and name in abs_corr else None + ) + + static_scores = [ + float(static_by_task[t]["static_score"]) + for t in static_by_task + if static_by_task[t].get("static_score") is not None + ] + tier_boundary_candidates = ( + { + "p33": float(np.percentile(static_scores, 33)), + "p66": float(np.percentile(static_scores, 66)), + } + if static_scores + else {} + ) + + return { + "experiment": "test1", + "run_count": len(rows), + "task_count": len(static_by_task), + "ineligible_tasks": sorted( + t for t, s in static_by_task.items() if not s.get("is_beatable", True) + ), + "success_rate_by_task": _group_success(rows, "task_id"), + "success_rate_by_condition": _group_success(rows, "condition"), + "success_rate_by_prompt_variant": _group_success(rows, "prompt_variant"), + "success_rate_by_model": _group_success(rows, 
"agent_or_model"), + "optimality_ratio_mean": _mean(successful_opt), + "optimality_ratio_median": _median(successful_opt), + "dimension_correlation": correlation, + "point_weight_candidates": point_weight_candidates, + "tier_boundary_candidates": tier_boundary_candidates, + } + + +def complexity_distance_summary(rows: list[dict[str, Any]]) -> dict[str, Any]: + """Test 2: path-choice counts (short mechanistic vs long open route).""" + test2 = [r for r in rows if r.get("experiment") == "test2"] + by_group: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) + overall: dict[str, int] = defaultdict(int) + for r in test2: + choice = r.get("path_choice") or "none" + group = ( + f"{r.get('task_id')}|{r.get('condition')}|" + f"{r.get('prompt_variant')}|{r.get('agent_or_model')}" + ) + by_group[group][choice] += 1 + overall[choice] += 1 + return { + "experiment": "test2", + "run_count": len(test2), + "path_choice_overall": dict(overall), + "path_choice_by_group": {g: dict(c) for g, c in by_group.items()}, + "success_rate_by_path_choice": { + choice: _mean( + [float(bool(r.get("success"))) for r in test2 if (r.get("path_choice") or "none") == choice] + ) + for choice in set((r.get("path_choice") or "none") for r in test2) + }, + } + + +def mechanism_ordering_pairs( + rows: list[dict[str, Any]], + manifest_rows: list[dict[str, Any]], +) -> dict[str, Any]: + """Test 3: paired success deltas across matched mechanism-ordering pairs.""" + pair_of = {m.get("task_id"): m.get("pair_id") for m in manifest_rows} + expected_of = {m.get("task_id"): list(m.get("expected_mechanisms", []) or []) for m in manifest_rows} + + test3 = [r for r in rows if r.get("experiment") == "test3"] + pairs: dict[str, dict[str, list[dict[str, Any]]]] = defaultdict(lambda: defaultdict(list)) + for r in test3: + pid = pair_of.get(r.get("task_id")) + if pid is None: + continue + pairs[pid][str(r.get("condition"))].append(r) + + pair_reports: dict[str, Any] = {} + for pid, conditions in 
def _summary(
    rows: list[dict[str, Any]], composites: dict[tuple, Optional[float]]
) -> dict[str, Any]:
    """Aggregate model-performance metrics over a set of run rows.

    Args:
        rows: Run rows; the fields read are ``success``, ``optimality_ratio``,
            ``tokens`` and ``steps`` plus the identity fields used by
            ``_run_key``.
        composites: Composite score per run key; ``None`` or missing entries
            are left out of ``composite_mean``.

    Returns:
        A dict with ``n``, ``success_rate``, ``optimality_ratio_mean``,
        ``optimality_ratio_median``, ``steps_mean``, ``tokens_mean``,
        ``tokens_total`` and ``composite_mean``. Every aggregate is ``None``
        when it has no contributing rows.
    """

    def _rows_with(field: str) -> list[dict[str, Any]]:
        # Rows in which ``field`` is present and not None.
        return [row for row in rows if row.get(field) is not None]

    # Optimality only counts for runs that actually reached the goal.
    solved_ratios = [
        float(row["optimality_ratio"])
        for row in _rows_with("optimality_ratio")
        if row.get("success")
    ]
    token_counts = [int(row["tokens"]) for row in _rows_with("tokens")]
    raw_scores = [
        score
        for score in map(composites.get, map(_run_key, rows))
        if score is not None
    ]

    success_rate = _mean([float(bool(row.get("success"))) for row in rows])
    ratio_mean = _mean(solved_ratios)
    ratio_median = _median(solved_ratios)
    steps_mean = _mean([float(row["steps"]) for row in _rows_with("steps")])
    tokens_mean = _mean([float(count) for count in token_counts])
    tokens_total = None if not token_counts else float(sum(token_counts))
    composite_mean = _mean([float(score) for score in raw_scores])

    return {
        "n": len(rows),
        "success_rate": success_rate,
        "optimality_ratio_mean": ratio_mean,
        "optimality_ratio_median": ratio_median,
        "steps_mean": steps_mean,
        "tokens_mean": tokens_mean,
        "tokens_total": tokens_total,
        "composite_mean": composite_mean,
    }
composites.get(_run_key(r)), + } + for r in rows + ], + } diff --git a/pipeline/run_stage3.py b/pipeline/run_stage3.py new file mode 100644 index 0000000..f78b570 --- /dev/null +++ b/pipeline/run_stage3.py @@ -0,0 +1,56 @@ +"""Stage 3 — runtime runs on the ``interface/`` stack (Stack A, live models). + +Builds a MiniGrid backend + ``ExperimentRunner`` for one task, runs a single +episode with a live-model agent, and flushes the canonical ``episode.json`` +artifact (plus PNG frames). Baselines are NOT run here — they feed Stage-2 +difficulty/canonical paths via the scorer. +""" + +from __future__ import annotations + +import dataclasses +import json +from pathlib import Path +from typing import Any, Callable + +from interface.config import ExperimentConfig +from interface.episode_log import flush_episode_log +from interface.loader import load_task +from interface.runner import build_runner +from gridworld.task_spec import TaskSpecification + + +# An agent is any callable mapping chat messages -> model text (optionally +# exposing a ``last_usage`` attribute for token telemetry). +Agent = Callable[[list[dict]], str] + + +def _spec_with_seed(spec: TaskSpecification, seed: int) -> TaskSpecification: + """Return a copy of ``spec`` with ``seed`` overridden (runner seeds from it).""" + if spec.seed == seed: + return spec + return dataclasses.replace(spec, seed=seed) + + +def run_episode( + task_source: str | Path, + config: ExperimentConfig, + agent: Agent, + seed: int, + out_dir: str | Path, +) -> dict[str, Any]: + """Run one episode and flush ``episode.json`` into ``out_dir``. + + Returns the in-memory episode dict (the JSON-safe payload written to + ``out_dir/episode.json``), so callers can derive metrics without re-reading. 
+ """ + backend, spec = load_task(task_source) + spec = _spec_with_seed(spec, seed) + backend.configure(spec) + + runner = build_runner(config, backend, spec) + result = runner.run(agent, verbose=False, maze_path=str(task_source)) + + out_dir = Path(out_dir) + episode_path = flush_episode_log(result, out_dir) + return json.loads(episode_path.read_text(encoding="utf-8")) diff --git a/prompting_experiments/__init__.py b/prompting_experiments/__init__.py new file mode 100644 index 0000000..d7ccc0f --- /dev/null +++ b/prompting_experiments/__init__.py @@ -0,0 +1,5 @@ +"""Prompt condition-set configs for interface experiments.""" + +from .exp_design import CONDITION_SETS, ConditionSet, Variant, iter_condition_configs + +__all__ = ["CONDITION_SETS", "ConditionSet", "Variant", "iter_condition_configs"] diff --git a/prompting_experiments/condition_set_1_prompt.py b/prompting_experiments/condition_set_1_prompt.py new file mode 100644 index 0000000..8d0d312 --- /dev/null +++ b/prompting_experiments/condition_set_1_prompt.py @@ -0,0 +1,27 @@ +"""Condition set 1: prompt verbosity.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Prompt", + comparisons=( + "Standard: goal + mechanism descriptions + action list", + "Verbose: standard + explicit rules", + ), + decision="If delta < 5%, use standard. 
If > 5%, use verbose.", + variants={ + "standard": Variant( + name="standard", + description="Standard task prompt with mechanism descriptions.", + config_overrides={"prompting": "standard"}, + ), + "verbose": Variant( + name="verbose", + description="Standard prompt plus explicit domain rules and local hints.", + config_overrides={"prompting": "verbose"}, + ), + }, +) diff --git a/prompting_experiments/condition_set_2_observation_format.py b/prompting_experiments/condition_set_2_observation_format.py new file mode 100644 index 0000000..ccb3152 --- /dev/null +++ b/prompting_experiments/condition_set_2_observation_format.py @@ -0,0 +1,33 @@ +"""Condition set 2: observation format.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Observation format", + comparisons=( + "Text only", + "Image + text", + "Image only", + ), + decision="Measure whether text adds meaningful signal beyond image input.", + variants={ + "text_only": Variant( + name="text_only", + description="Natural-language current observation, no image blocks.", + config_overrides={"observation": "text_only"}, + ), + "image_text": Variant( + name="image_text", + description="Image block plus natural-language observation.", + config_overrides={"observation": "image_text"}, + ), + "image_only": Variant( + name="image_only", + description="Image block with no initial natural-language maze map.", + config_overrides={"observation": "image_only"}, + ), + }, +) diff --git a/prompting_experiments/condition_set_3_context_window.py b/prompting_experiments/condition_set_3_context_window.py new file mode 100644 index 0000000..9129c35 --- /dev/null +++ b/prompting_experiments/condition_set_3_context_window.py @@ -0,0 +1,33 @@ +"""Condition set 3: context window.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Context window", + comparisons=( + "0 history: current 
observation only", + "Last 3 executed steps", + "Current observation + text summary of prior actions", + ), + decision="Compare current-state-only prompting against recent history.", + variants={ + "current": Variant( + name="current", + description="Prompt only with the current observation.", + config_overrides={"context_window": "current"}, + ), + "last3": Variant( + name="last3", + description="Include up to the last three executed steps.", + config_overrides={"context_window": "last3"}, + ), + "text_summary": Variant( + name="text_summary", + description="PR #12 design axis; no ExperimentConfig summary mode exists yet.", + implemented=False, + ), + }, +) diff --git a/prompting_experiments/condition_set_4_querying_strategy.py b/prompting_experiments/condition_set_4_querying_strategy.py new file mode 100644 index 0000000..9f166e0 --- /dev/null +++ b/prompting_experiments/condition_set_4_querying_strategy.py @@ -0,0 +1,33 @@ +"""Condition set 5: querying strategy.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Querying strategy", + comparisons=( + "Step-by-step: one action per query", + "Subgoal planning: model outputs a subgoal and action chunk", + "Full trajectory: model outputs a complete plan once", + ), + decision="Determine whether chunked or one-shot planning improves performance.", + variants={ + "step_by_step": Variant( + name="step_by_step", + description="Ask for one action each query.", + config_overrides={"querying": "step_by_step"}, + ), + "subgoal": Variant( + name="subgoal", + description="Ask for a short subgoal and action chunk.", + config_overrides={"querying": "subgoal"}, + ), + "full_trajectory": Variant( + name="full_trajectory", + description="Ask once for a complete action trajectory.", + config_overrides={"querying": "full_trajectory"}, + ), + }, +) diff --git a/prompting_experiments/condition_set_5_in_context_learning.py 
@dataclass(frozen=True)
class Variant:
    """One experiment variant expressed as overrides to ``ExperimentConfig``.

    Attributes:
        name: Identifier of the variant.
        description: Human-readable summary of what the variant changes.
        config_overrides: Field/value pairs applied on top of the base config;
            ``None`` is treated as "no overrides".
        implemented: ``False`` for design-only variants that cannot be turned
            into a runnable config.
    """

    name: str
    description: str
    config_overrides: Mapping[str, object] | None = None
    implemented: bool = True

    def build_config(self, base: ExperimentConfig | None = None) -> ExperimentConfig:
        """Return ``base`` with this variant's overrides applied.

        Args:
            base: Config to start from. When ``None`` a default
                ``ExperimentConfig()`` is created.

        Returns:
            A new config object; ``base`` itself is not modified.

        Raises:
            ValueError: If the variant is marked ``implemented=False``.
        """
        if not self.implemented:
            raise ValueError(f"Variant is not implemented in ExperimentConfig: {self.name}")

        if base is None:
            # Imported lazily, and only when the default is needed, so a
            # caller that supplies its own base never pays for (or depends
            # on) importing the interface package.
            from interface.config import ExperimentConfig

            cfg = ExperimentConfig()
        else:
            # Explicit None check: a supplied base must be honoured even if
            # its class happens to evaluate as falsy.
            cfg = base
        return replace(cfg, **dict(self.config_overrides or {}))
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterator, Mapping + +if TYPE_CHECKING: + from interface.config import ExperimentConfig + +from .condition_set_1_prompt import CONDITION_SET as CONDITION_SET_1 +from .condition_set_2_observation_format import CONDITION_SET as CONDITION_SET_2 +from .condition_set_3_context_window import CONDITION_SET as CONDITION_SET_3 +from .condition_set_4_querying_strategy import CONDITION_SET as CONDITION_SET_5 +from .condition_set_5_in_context_learning import CONDITION_SET as CONDITION_SET_6 +from .core import ConditionSet, Variant, iter_condition_configs as _iter_condition_configs + + +CONDITION_SETS: Mapping[str, ConditionSet] = { + CONDITION_SET_1.name: CONDITION_SET_1, + CONDITION_SET_2.name: CONDITION_SET_2, + CONDITION_SET_3.name: CONDITION_SET_3, + CONDITION_SET_5.name: CONDITION_SET_5, + CONDITION_SET_6.name: CONDITION_SET_6, +} + + +def iter_condition_configs( + condition_name: str, + base: ExperimentConfig | None = None, +) -> Iterator[tuple[str, ExperimentConfig]]: + """Yield runnable ``(variant_name, config)`` pairs for one condition set.""" + + yield from _iter_condition_configs(CONDITION_SETS[condition_name], base) + + +__all__ = ["CONDITION_SETS", "ConditionSet", "Variant", "iter_condition_configs"] diff --git a/prompting_experiments/preview_prompts.py b/prompting_experiments/preview_prompts.py new file mode 100644 index 0000000..b1e82a8 --- /dev/null +++ b/prompting_experiments/preview_prompts.py @@ -0,0 +1,141 @@ +"""Generate a text preview of prompt experiment condition variants.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any + +from prompting_experiments import CONDITION_SETS +from prompting_experiments.prompt_templates import feedback as feedback_templates +from prompting_experiments.prompt_templates import system as system_templates + + +def _content_to_text(content: Any) -> str: + if isinstance(content, str): + 
return content + if not isinstance(content, list): + return str(content) + + lines: list[str] = [] + image_count = 0 + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") == "text": + lines.append(block.get("text", "")) + elif block.get("type") == "image_url": + image_count += 1 + lines.append(f"[image block {image_count}]") + return "\n".join(part for part in lines if part) + + +def _missing_dependency_message(exc: ModuleNotFoundError) -> str: + return ( + f"Missing dependency: {exc.name}. Install the project dependencies in this environment, " + "for example: python3 -m pip install -e '.[dev]'" + ) + + +def _prompt_preview(config, maze_path: Path, max_steps: int) -> tuple[str, str]: + try: + from interface.loader import load_task + from interface.runner import build_runner + from interface.renderer import render_initial_maze_text + except ModuleNotFoundError as exc: + raise SystemExit(_missing_dependency_message(exc)) from exc + + backend, spec = load_task(maze_path) + spec.max_steps = max_steps + runner = build_runner(config, backend, spec) + runner.last_rgb, state, _reset_info = backend.reset(seed=spec.seed) + + system_prompt = runner.prompt.build_system_prompt(runner.querying.system_prompt_suffix()) + if config.observation in ("text_only", "image_text"): + system_prompt = ( + f"{system_prompt}\n\n" + f"{system_templates.INITIAL_MAZE_SECTION.format(maze_text=render_initial_maze_text(spec))}" + ) + + user_message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, []) + return system_prompt, _content_to_text(user_message.get("content")) + + +def build_preview(maze_path: Path, max_steps: int) -> str: + chunks = [ + "Prompt Experiment Preview", + f"Maze: {maze_path}", + f"Max steps: {max_steps}", + "", + ] + + for idx, condition in enumerate(CONDITION_SETS.values(), start=1): + chunks.extend( + [ + "=" * 88, + f"condition set {idx}: {condition.name}", + "=" * 88, + ] + ) + for variant_name, variant in 
condition.variants.items(): + chunks.extend( + [ + f"variant name: {variant_name}", + f"description: {variant.description}", + "prompts:", + ] + ) + if not variant.implemented: + chunks.extend( + [ + "Status: not implemented in ExperimentConfig", + "-" * 88, + ] + ) + continue + + try: + config = variant.build_config() + except ModuleNotFoundError as exc: + raise SystemExit(_missing_dependency_message(exc)) from exc + system_prompt, user_prompt = _prompt_preview(config, maze_path, max_steps) + chunks.extend( + [ + "[system prompt]", + system_prompt, + "", + "[user prompt]", + user_prompt, + "-" * 88, + ] + ) + + return "\n".join(chunks).rstrip() + "\n" + + +def _default_maze_path(name: str) -> Path: + return Path(__file__).resolve().parents[1] / "mazes" / "validation_10" / name + + +def main() -> None: + parser = argparse.ArgumentParser(description="Write prompt experiment previews to prompts.txt.") + parser.add_argument("--maze", default="V01_empty_room.json") + parser.add_argument("--max-steps", type=int, default=5) + parser.add_argument( + "--output", + type=Path, + default=Path(__file__).resolve().parent / "prompts.txt", + ) + args = parser.parse_args() + + maze_path = Path(args.maze) + if not maze_path.is_file(): + maze_path = _default_maze_path(args.maze) + + preview = build_preview(maze_path, args.max_steps) + args.output.write_text(preview, encoding="utf-8") + print(f"wrote {args.output}") + + +if __name__ == "__main__": + main() diff --git a/prompting_experiments/prompt_templates/__init__.py b/prompting_experiments/prompt_templates/__init__.py new file mode 100644 index 0000000..344f4fc --- /dev/null +++ b/prompting_experiments/prompt_templates/__init__.py @@ -0,0 +1 @@ +"""Agent-facing prompt templates grouped by prompt surface.""" diff --git a/prompting_experiments/prompt_templates/feedback.py b/prompting_experiments/prompt_templates/feedback.py new file mode 100644 index 0000000..9fb29af --- /dev/null +++ 
b/prompting_experiments/prompt_templates/feedback.py @@ -0,0 +1,50 @@ +"""Step feedback templates.""" + +INITIAL_FEEDBACK = "Episode start." +OPENED_AND_MOVED = "Opened {color} door {door_id} and moved to {position}." +OPENED_DOOR = "Opened {color} door {door_id}." +NOW_FACING = "Now facing {facing}." +ACTION_NO_EFFECT = "{action} had no effect." +MOVE_BLOCKED_BY_KEY = ( + "MOVE_FORWARD blocked by a {key_color} key at {position}. " + "Keys occupy their cell; you cannot walk onto them. " + "Face the key and use PICKUP from your current cell." +) +MOVE_BLOCKED_BY_GATE_WITH_SWITCHES = ( + "MOVE_FORWARD blocked by closed gate {gate_id} at {position}. " + "Activate switch(es) {switches} to open it." +) +MOVE_BLOCKED_BY_GATE = "MOVE_FORWARD blocked by closed gate {gate_id} at {position}." +MOVE_BLOCKED_GENERIC = "MOVE_FORWARD blocked by wall or closed door/gate." +REACHED_GOAL = "Reached goal at {goal}." +MOVED_TO = "Moved to {position}." +PICKED_UP_KEY = "Picked up {key_color} key." +NOTHING_TO_PICK_UP = "Nothing to pick up here." +TOGGLED_STATE_CHANGED = "Toggled switch or gate state changed." +TOGGLE_HOLD_SWITCH_HINT = ( + "TOGGLE had no effect. MOVE_FORWARD onto the switch at {position} " + "(hold switches activate while you stand on them)." +) +TOGGLE_SWITCH_HINT = "TOGGLE had no effect. MOVE_FORWARD onto the switch at {position}, then TOGGLE." +GATE_TOGGLE_WITH_SWITCHES = "Gates cannot be toggled directly. Activate switch(es) {switches} instead." +GATE_TOGGLE_GENERIC = "Gates cannot be toggled directly. Activate a linked switch instead." +TOGGLE_NO_EFFECT = "TOGGLE had no effect. Stand on a switch and TOGGLE, or use PICKUP/keys for doors." +TASK_COMPLETE = "Task complete at {goal}." +WRONG_DONE = "DONE called but not at goal {goal}." +UNKNOWN_ACTION = "Unknown or unsupported action {action}." + +BLOCKED_FEEDBACK = "BLOCKED — {action}: {message} You remain at {position}." 
+TURNED_FEEDBACK = "TURNED — {action}: {message}" +MOVED_FEEDBACK = "MOVED — {action}: {message}" +SUCCESS_FEEDBACK = "SUCCESS — {action}: {message}" +PICKUP_FEEDBACK = "PICKUP — {action}: {message}" +NOTHING_FEEDBACK = "NOTHING — {action}: {message} You remain at {position}." +OPENED_FEEDBACK = "OPENED — {action}: {message}" +TOGGLED_FEEDBACK = "TOGGLED — {action}: {message}" +WRONG_DONE_FEEDBACK = "WRONG DONE — {action}: {message} You remain at {position}." +INVALID_FEEDBACK = "INVALID — {action}: {message} You remain at {position}." +DEFAULT_FEEDBACK = "{event_type} — {action}: {message}" +PARSE_FAILURE_FEEDBACK = ( + "Could not parse FINAL_OUTPUT. Do not explain. Reply exactly as one line: " + "FINAL_OUTPUT: ." +) diff --git a/prompting_experiments/prompt_templates/observation.py b/prompting_experiments/prompt_templates/observation.py new file mode 100644 index 0000000..c86a74d --- /dev/null +++ b/prompting_experiments/prompt_templates/observation.py @@ -0,0 +1,51 @@ +"""Observation and history prompt templates.""" + +RECENT_HISTORY_HEADER = "Recent history (last 3 steps, oldest first):" +RECENT_HISTORY_STEP = " ({row}, {col}) facing {facing} -> {action} -> {feedback}" + +IMAGE_HISTORY_ACTION = "Action: {action}\n\n" +IMAGE_ONLY_HISTORY_INTRO = ( + "Recent steps (oldest first). Each image is the maze view from which the " + "following action was chosen; infer pose and environment state from the image.\n\n" +) +IMAGE_TEXT_HISTORY_INTRO = "Recent step views (oldest first):\n\n" + +WORLD_SIZE_LINE = "The world is a {rows} by {cols} grid." +COORDINATE_EXPLANATION = ( + "Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``;" + " tuples in this text use ``(row, column)`` matching env state (row southward, column east)." + " So ``x`` = column index, ``y`` = row index." +) +START_LINE = "The start is at {start}." +GOAL_LINE = "The goal is at {goal}." +WALLS_LINE = "The following cells are walls: {walls}." 
+ +KEY_LINE = "There is a {color} key at ({row},{col})." +DOOR_LINE = ( + "There is a {status} {requires_key} door at ({row},{col})." + " It requires the {requires_key} key to open." +) +SWITCH_LINE = ( + "There is a {switch_type} switch at ({row},{col}) (currently {state})." + " It controls: {controls}." +) +GATE_LINE = ( + "There is a gate ({gate_id}) at ({row},{col})." + " It is currently {state} (initially {initial_state})." +) + +CURRENT_SITUATION_HEADER = "Current situation (this step):" +CURRENT_GOAL_LINE = "The goal is at {goal}." +CURRENT_AGENT_LINE = "You are at {position} facing {facing}." +CURRENT_INVENTORY_LINE = "Your inventory: {inventory}." +CURRENT_MAP_CONTENTS_HEADER = "Map contents as of this step (keys on the ground, doors, switches, gates):" +NO_MECHANISMS_LINE = "(No keys on the ground, doors, switches, or gates in the current state description.)" + +CELL_OUT_OF_BOUNDS = "out of bounds" +CELL_WALL = "wall" +CELL_GOAL = "GOAL ({row},{col})" +CELL_KEY = "{key_color} key ({row},{col})" +CELL_DOOR = "{status} {requires_key} door ({row},{col})" +CELL_GATE = "{state} gate ({row},{col})" +CELL_SWITCH = "switch ({state}) ({row},{col})" +CELL_OPEN = "open ({row},{col})" diff --git a/prompting_experiments/prompt_templates/querying.py b/prompting_experiments/prompt_templates/querying.py new file mode 100644 index 0000000..c878349 --- /dev/null +++ b/prompting_experiments/prompt_templates/querying.py @@ -0,0 +1,15 @@ +"""Querying strategy prompt templates.""" + +SUBGOAL_SUFFIX = ( + "For each turn output:\n" + " SUB_GOAL: \n" + " ACTIONS: " +) + +FULL_TRAJECTORY_SUFFIX = ( + "Output your complete trajectory once as:\n" + " SUB_GOAL: \n" + " ACTIONS: \n" + "The last action in ACTIONS should be DONE (when you expect to be at the goal).\n" + "You will not be queried again — this is your only planning turn." 
+) diff --git a/prompting_experiments/prompt_templates/system.py b/prompting_experiments/prompt_templates/system.py new file mode 100644 index 0000000..f96d774 --- /dev/null +++ b/prompting_experiments/prompt_templates/system.py @@ -0,0 +1,36 @@ +"""System prompt templates.""" + +TASK_PREFIX = "Task: move to the goal cell in the grid." + +MECHANISM_LIST = ( + "The environment may contain:\n" + "- Keys: pick them up to open doors of the matching color\n" + "- Doors: blocked passages that require a matching key\n" + "- Switches: step onto them to activate (hold) or TOGGLE while standing on them\n" + "- Gates: blocked passages controlled by switches\n" +) + +MECHANISM_RULES = ( + "RULES (domain logic):\n" + " - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you\n" + " cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.\n" + " - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then\n" + " MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.\n" + " - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type\n" + " switches activate automatically while you stand on them. Only switches are toggled. Linked\n" + " gates are open if at least one linked switch is on, and closed if all are off.\n" + " - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not.\n" + " - Closed gates and doors you lack a key for block movement like walls until resolved.\n" + " - Use DONE only when you are standing on the goal cell." +) + +VALID_ACTIONS_TEMPLATE = "Valid actions: {actions_hint}." + +FINAL_OUTPUT_INSTRUCTION = ( + "Do not explain, reason, summarize the map, or include any text before the answer.\n" + "On the last line, output exactly:\n" + "FINAL_OUTPUT: or FINAL_OUTPUT: , , ... 
" + "(comma-separated; one or more valid actions)" +) + +INITIAL_MAZE_SECTION = "Initial maze (fixed for this episode):\n{maze_text}" diff --git a/prompting_experiments/prompt_templates/user.py b/prompting_experiments/prompt_templates/user.py new file mode 100644 index 0000000..100429e --- /dev/null +++ b/prompting_experiments/prompt_templates/user.py @@ -0,0 +1,36 @@ +"""User prompt templates.""" + +OBSERVATION_SECTION = "Observation:\n{obs_text}\n\n" + +MINIMAL_USER_PROMPT = ( + "{obs_block}" + "Position: {position} | Facing: {facing} | Goal: {goal}\n" + "Last result: {last_feedback}\n" + "What is your next action?\n" + "Reply exactly as one line: FINAL_OUTPUT: " +) + +VERBOSE_USER_PROMPT = ( + "{obs_block}" + "Position: {position} | Facing: {facing} | Goal: {goal} | " + "Manhattan: {manhattan}\n" + "Inventory: {inventory}\n" + "{neighbour_block}" + "{mechanism_block}" + "Last result: {last_feedback}\n" + "What is your next action?\n" + "Reply exactly as one line: FINAL_OUTPUT: " +) + +NEIGHBOUR_BLOCK_HEADER = "From your perspective:\n" +NEIGHBOUR_LINE = " {relative_direction}: {description}" + +MECHANISM_HINTS_HEADER = "Hints:\n" +KEY_DOOR_HINT = ( + " - Face an adjacent key and PICKUP (do not walk onto the key). " + "Face a locked door with the matching key and TOGGLE to open it, then MOVE_FORWARD through." +) +SWITCH_GATE_HINT = ( + " - MOVE_FORWARD onto a switch, then TOGGLE (hold switches activate on step). " + "Gates cannot be toggled — activate their linked switch(es)." 
+) diff --git a/prompting_experiments/prompts.txt b/prompting_experiments/prompts.txt new file mode 100644 index 0000000..9511504 --- /dev/null +++ b/prompting_experiments/prompts.txt @@ -0,0 +1,447 @@ +Prompt Experiment Preview +Maze: /Users/helenlu/HRI/MultiNet-v2.0/mazes/validation_10/V01_empty_room.json +Max steps: 5 + +======================================================================================== +condition set 1: Prompt +======================================================================================== +variant name: standard +description: Standard task prompt with mechanism descriptions. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? 
+---------------------------------------------------------------------------------------- +variant name: verbose +description: Standard prompt plus explicit domain rules and local hints. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +RULES (domain logic): + - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you + cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP. + - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then + MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door. + - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type + switches activate automatically while you stand on them. Only switches are toggled. Linked + gates are open if at least one linked switch is on, and closed if all are off. + - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not. + - Closed gates and doors you lack a key for block movement like walls until resolved. + - Use DONE only when you are standing on the goal cell. + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. 
+ +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) | Manhattan: 10 +Inventory: none +From your perspective: + AHEAD: open (1,2) + RIGHT: open (2,1) + BEHIND: out of bounds + LEFT: out of bounds +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 2: Observation format +======================================================================================== +variant name: text_only +description: Natural-language current observation, no image blocks. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +Observation: +Current situation (this step): +The goal is at (6, 6). 
+You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +variant name: image_text +description: Image block plus natural-language observation. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? 
+---------------------------------------------------------------------------------------- +variant name: image_only +description: Image block with no initial natural-language maze map. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +[user prompt] +[image block 1] +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 3: Context window +======================================================================================== +variant name: current +description: Prompt only with the current observation. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. 
+Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +variant name: last3 +description: Include up to the last three executed steps. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. 
+Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +variant name: text_summary +description: PR #12 design axis; no ExperimentConfig summary mode exists yet. +prompts: +Status: not implemented in ExperimentConfig +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 4: Querying strategy +======================================================================================== +variant name: step_by_step +description: Ask for one action each query. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. 
+Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +variant name: subgoal +description: Ask for a short subgoal and action chunk. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +For each turn output: + SUB_GOAL: + ACTIONS: + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? 
+---------------------------------------------------------------------------------------- +variant name: full_trajectory +description: Ask once for a complete action trajectory. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Output your complete trajectory once as: + SUB_GOAL: + ACTIONS: +The last action in ACTIONS should be DONE (when you expect to be at the goal). +You will not be queried again — this is your only planning turn. + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? 
+---------------------------------------------------------------------------------------- +======================================================================================== +condition set 5: In-context learning +======================================================================================== +variant name: zero_shot +description: Current interface behavior. +prompts: +[system prompt] +Task: move to the goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: step onto them to activate (hold) or TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. +On the last line, output exactly: +FINAL_OUTPUT: or FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) + +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +[user prompt] +[image block 1] +Observation: +Current situation (this step): +The goal is at (6, 6). +You are at (1, 1) facing EAST. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +Position: (1, 1) | Facing: EAST | Goal: (6, 6) +Last result: Episode start. +What is your next action? +---------------------------------------------------------------------------------------- +variant name: one_shot +description: PR #12 design axis; example selection/injection is not implemented yet. 
+prompts: +Status: not implemented in ExperimentConfig +---------------------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 7ce045e..61a82de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,9 @@ multinet-probe-vlm = "scripts.probe_vlm:main" multinet-ollama-vision-check = "scripts.ollama_vision_check:main" multinet-ollama-maze-shape-check = "scripts.ollama_maze_shape_check:main" multinet-vlm-sanity = "scripts.vlm_sanity_check:main" +multinet-preview-prompts = "prompting_experiments.preview_prompts:main" +multinet-score-json = "scripts.score_json:main" +multinet-run-pipeline = "scripts.run_pipeline:main" [tool.setuptools] include-package-data = true @@ -63,9 +66,13 @@ include = [ "interface*", "mazes*", "multigrid*", + "pipeline*", + "prompting_experiments*", + "scorer*", "scripts*", ] [tool.setuptools.package-data] -gridworld = ["tasks/**/*.json", "tasks/*.json"] +gridworld = ["tasks/**/*.json", "tasks/*.json", "fixtures/**/*.json", "fixtures/*.json"] mazes = ["validation_10/**/*.json", "validation_10/*.json"] +scorer = ["scorer_config.json"] diff --git a/scorer/__init__.py b/scorer/__init__.py new file mode 100644 index 0000000..df4a4db --- /dev/null +++ b/scorer/__init__.py @@ -0,0 +1,33 @@ +"""Standalone scoring package for MultiNet task and run artifacts.""" + +from .scoring import ( + CanonicalPathReport, + RuntimeScoreArtifact, + ScoredDifficulty, + ScorerConfig, + StaticScoreArtifact, + compute_12d_score, + compute_canonical_paths, + compute_greedy_solvability, + compute_runtime_score, + compute_static_score_artifact, + load_scorer_config, + score_runtime_file, + score_task_file, +) + +__all__ = [ + "CanonicalPathReport", + "RuntimeScoreArtifact", + "ScoredDifficulty", + "ScorerConfig", + "StaticScoreArtifact", + "compute_12d_score", + "compute_canonical_paths", + "compute_greedy_solvability", + "compute_runtime_score", + "compute_static_score_artifact", + 
"load_scorer_config", + "score_runtime_file", + "score_task_file", +] diff --git a/scorer/artifacts.py b/scorer/artifacts.py new file mode 100644 index 0000000..165d147 --- /dev/null +++ b/scorer/artifacts.py @@ -0,0 +1,173 @@ +"""Dataclasses for scorer artifact payloads.""" + +from __future__ import annotations + +import copy +from dataclasses import dataclass, field +from typing import Any + +from .config import DIMENSION_NAMES, SCORER_VERSION + + +@dataclass +class ScoredDifficulty: + """Backward-compatible 12-dimension score report.""" + + dimensions: list[float] + dimension_names: list[str] = field(default_factory=lambda: DIMENSION_NAMES.copy()) + composite: float = 0.0 + weights: list[float] = field(default_factory=lambda: [1.0] * len(DIMENSION_NAMES)) + + @property + def dimensions_by_name(self) -> dict[str, float]: + return dict(zip(self.dimension_names, self.dimensions)) + + def to_dict(self) -> dict[str, Any]: + return { + "dimensions": list(self.dimensions), + "dimension_names": list(self.dimension_names), + "composite": self.composite, + "weights": list(self.weights), + } + + +@dataclass +class CanonicalPathReport: + """Canonical solver trace artifact for a task.""" + + task_id: str + success: bool + actions: list[str] + positions: list[tuple[int, int]] + optimal_steps: int + states_explored: int + message: str + greedy: dict[str, Any] | None = None + inputs_hash: str = "" + producer_version: str = SCORER_VERSION + + @property + def bfs(self) -> dict[str, Any]: + return { + "success": self.success, + "actions": list(self.actions), + "positions": [list(pos) for pos in self.positions], + "optimal_steps": self.optimal_steps, + "states_explored": self.states_explored, + "message": self.message, + } + + def to_dict(self) -> dict[str, Any]: + payload = { + "task_id": self.task_id, + "bfs": self.bfs, + "inputs_hash": self.inputs_hash, + "producer_version": self.producer_version, + } + if self.greedy is not None: + payload["greedy"] = copy.deepcopy(self.greedy) 
+ return payload + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "CanonicalPathReport": + bfs = data.get("bfs", data) + return cls( + task_id=str(data.get("task_id", "")), + success=bool(bfs.get("success", False)), + actions=[str(action) for action in bfs.get("actions", [])], + positions=[ + (int(pos[0]), int(pos[1])) + for pos in bfs.get("positions", []) + if isinstance(pos, (list, tuple)) and len(pos) >= 2 + ], + optimal_steps=int(bfs.get("optimal_steps", 0)), + states_explored=int(bfs.get("states_explored", 0)), + message=str(bfs.get("message", "")), + greedy=copy.deepcopy(data.get("greedy")), + inputs_hash=str(data.get("inputs_hash", "")), + producer_version=str(data.get("producer_version", SCORER_VERSION)), + ) + + +@dataclass +class StaticScoreArtifact: + """Stage 2 static score artifact.""" + + task_id: str + is_beatable: bool + message: str + dimensions: dict[str, float] + static_score_unweighted: float + static_score: float + weights: dict[str, float] + validation: dict[str, Any] + canonical_agent_features: dict[str, float | None] + calibration_version: str + inputs_hash: str + producer_version: str = SCORER_VERSION + + def to_dict(self) -> dict[str, Any]: + return { + "task_id": self.task_id, + "is_beatable": self.is_beatable, + "message": self.message, + "dimensions_12": dict(self.dimensions), + "static_score_unweighted": self.static_score_unweighted, + "static_score": self.static_score, + "weights": dict(self.weights), + "validation": copy.deepcopy(self.validation), + "canonical_agent_features": dict(self.canonical_agent_features), + "calibration_version": self.calibration_version, + "inputs_hash": self.inputs_hash, + "producer_version": self.producer_version, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "StaticScoreArtifact": + dimensions = data.get("dimensions_12", data.get("dimensions", {})) + if isinstance(dimensions, list): + dimensions = dict(zip(DIMENSION_NAMES, dimensions)) + return cls( + 
task_id=str(data.get("task_id", "")), + is_beatable=bool(data.get("is_beatable", False)), + message=str(data.get("message", "")), + dimensions={str(k): float(v) for k, v in dimensions.items()}, + static_score_unweighted=float(data.get("static_score_unweighted", 0.0)), + static_score=float(data.get("static_score", data.get("composite", 0.0))), + weights={str(k): float(v) for k, v in data.get("weights", {}).items()}, + validation=dict(data.get("validation", {})), + canonical_agent_features=dict(data.get("canonical_agent_features", {})), + calibration_version=str(data.get("calibration_version", "unknown")), + inputs_hash=str(data.get("inputs_hash", "")), + producer_version=str(data.get("producer_version", SCORER_VERSION)), + ) + + +@dataclass +class RuntimeScoreArtifact: + """Stage 4 runtime score artifact for one run.""" + + task_id: str + backend: str + adapter: str + model_id: str + seed: int | None + signals: dict[str, Any] + composite: float + calibration_version: str + inputs_hash: str + producer_version: str = SCORER_VERSION + + def to_dict(self) -> dict[str, Any]: + return { + "task_id": self.task_id, + "backend": self.backend, + "adapter": self.adapter, + "model_id": self.model_id, + "seed": self.seed, + "signals": copy.deepcopy(self.signals), + "composite": self.composite, + "calibration_version": self.calibration_version, + "inputs_hash": self.inputs_hash, + "producer_version": self.producer_version, + } diff --git a/scorer/config.py b/scorer/config.py new file mode 100644 index 0000000..cce1c45 --- /dev/null +++ b/scorer/config.py @@ -0,0 +1,146 @@ +"""Scorer configuration and calibration defaults.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from .io import load_json + + +SCORER_VERSION = "0.3.0" +DEFAULT_CONFIG_PATH = Path(__file__).with_name("scorer_config.json") + +DIMENSION_NAMES = [ + "optimal_path_length", + "search_space_size", + "backtracking_required", + 
"fragility", + "dependency_depth", + "dependency_variety", + "distractor_count", + "distractor_quality", + "grid_size", + "wall_density", + "partial_observability", + "irreversibility", +] + +GREEDY_SOLVABILITY_FEATURE = "greedy_solvability" + +CANONICAL_AGENT_FEATURE_NAMES = [ + GREEDY_SOLVABILITY_FEATURE, +] + +DEFAULT_DISTRACTOR_TYPE_WEIGHTS = { + "wrong_color_key": 1.0, + "inactive_switch": 2.0, + "decoy_door": 2.0, + "distractor_chain": 3.0, +} + +DEFAULT_RUNTIME_WEIGHTS = { + "step_ratio": 1.0, + "cell_overlap_bfs": 1.0, + "token_efficiency": 1.0, + "greedy_penalty": 0.5, +} + + +def _coerce_float_mapping( + values: dict[str, Any] | list[Any] | None, + names: list[str], + default: float = 1.0, +) -> dict[str, float]: + if values is None: + return {name: default for name in names} + if isinstance(values, list): + if len(values) != len(names): + raise ValueError(f"Expected {len(names)} weights, got {len(values)}") + result = {name: default for name in names} + for name, value in zip(names, values): + result[name] = float(value) + return result + return {name: float(values.get(name, default)) for name in names} + + +@dataclass +class ScorerConfig: + """Weights and runtime coefficients used by the standalone scorer.""" + + version: str = "default" + static_dimension_weights: dict[str, float] = field( + default_factory=lambda: {name: 1.0 for name in DIMENSION_NAMES} + ) + distractor_type_weights: dict[str, float] = field( + default_factory=lambda: DEFAULT_DISTRACTOR_TYPE_WEIGHTS.copy() + ) + runtime_weights: dict[str, float] = field( + default_factory=lambda: DEFAULT_RUNTIME_WEIGHTS.copy() + ) + baseline_tokens: float = 1000.0 + difficulty_max_static_score: float | None = None + + @classmethod + def default(cls) -> "ScorerConfig": + return cls() + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ScorerConfig": + static_weights = data.get("static_dimension_weights", data.get("static_weights")) + runtime_weights = data.get("runtime_weights") + 
distractor_weights = data.get("distractor_type_weights", data.get("distractor_weights")) + + difficulty_max = data.get("difficulty_max_static_score") + return cls( + version=str(data.get("version", "default")), + static_dimension_weights=_coerce_float_mapping(static_weights, DIMENSION_NAMES), + distractor_type_weights={ + **DEFAULT_DISTRACTOR_TYPE_WEIGHTS, + **{k: float(v) for k, v in (distractor_weights or {}).items()}, + }, + runtime_weights={ + **DEFAULT_RUNTIME_WEIGHTS, + **{k: float(v) for k, v in (runtime_weights or {}).items()}, + }, + baseline_tokens=float(data.get("baseline_tokens", 1000.0)), + difficulty_max_static_score=( + None if difficulty_max is None else float(difficulty_max) + ), + ) + + def to_dict(self) -> dict[str, Any]: + return { + "version": self.version, + "static_dimension_weights": dict(self.static_dimension_weights), + "distractor_type_weights": dict(self.distractor_type_weights), + "runtime_weights": dict(self.runtime_weights), + "baseline_tokens": self.baseline_tokens, + "difficulty_max_static_score": self.difficulty_max_static_score, + } + + def static_weight_list(self) -> list[float]: + return [self.static_dimension_weights.get(name, 1.0) for name in DIMENSION_NAMES] + + +def load_scorer_config(path: str | Path | None = None) -> ScorerConfig: + """Load scorer weights from JSON, or return defaults if no file exists.""" + config_path = Path(path) if path is not None else DEFAULT_CONFIG_PATH + if not config_path.exists(): + if path is not None: + raise FileNotFoundError(f"Scorer config not found: {config_path}") + return ScorerConfig.default() + if config_path.suffix.lower() in {".yaml", ".yml"}: + try: + import yaml # type: ignore + except ImportError as exc: + raise ImportError( + "YAML scorer configs require PyYAML. Use JSON or install PyYAML." 
+ ) from exc + with open(config_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + if not isinstance(data, dict): + raise ValueError(f"Expected a YAML object in {config_path}") + return ScorerConfig.from_dict(data) + return ScorerConfig.from_dict(load_json(config_path)) diff --git a/scorer/io.py b/scorer/io.py new file mode 100644 index 0000000..6d929f2 --- /dev/null +++ b/scorer/io.py @@ -0,0 +1,62 @@ +"""JSON and hash helpers for scorer artifacts.""" + +from __future__ import annotations + +import hashlib +import json +from pathlib import Path +from typing import Any + +from gridworld.task_spec import TaskSpecification + + +def json_default(value: Any) -> Any: + if hasattr(value, "item"): + return value.item() + raise TypeError(f"Object of type {value.__class__.__name__} is not JSON serializable") + + +def load_json(path: str | Path) -> dict[str, Any]: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict): + raise ValueError(f"Expected a JSON object in {path}") + return data + + +def dump_json(path: str | Path, payload: dict[str, Any]) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, default=json_default) + f.write("\n") + + +def json_files(paths: list[str]) -> list[Path]: + """Expand JSON files and directories into a stable file list.""" + files: list[Path] = [] + for value in paths: + path = Path(value) + if path.is_dir(): + files.extend(sorted(path.rglob("*.json"))) + else: + files.append(path) + return files + + +def stable_hash(payload: Any) -> str: + encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=json_default) + return hashlib.sha256(encoded.encode("utf-8")).hexdigest() + + +def task_spec_from_payload(data: dict[str, Any]) -> TaskSpecification: + if "task_spec" in data and isinstance(data["task_spec"], dict): + return 
TaskSpecification.from_dict(data["task_spec"]) + if "TaskSpecification" in data and isinstance(data["TaskSpecification"], dict): + return TaskSpecification.from_dict(data) + required_fields = {"task_id", "maze", "goal", "max_steps"} + if not required_fields.issubset(data): + raise ValueError( + "Input JSON is not a task artifact. Expected task fields or a nested task_spec." + ) + return TaskSpecification.from_dict(data) diff --git a/scorer/runtime.py b/scorer/runtime.py new file mode 100644 index 0000000..a1567ab --- /dev/null +++ b/scorer/runtime.py @@ -0,0 +1,363 @@ +"""Runtime scoring for run and episode JSON artifacts.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .artifacts import CanonicalPathReport, RuntimeScoreArtifact, StaticScoreArtifact +from .config import SCORER_VERSION, ScorerConfig +from .io import dump_json, load_json, stable_hash + + +def _artifact_dict(value: dict[str, Any] | StaticScoreArtifact | CanonicalPathReport) -> dict[str, Any]: + if hasattr(value, "to_dict"): + return value.to_dict() # type: ignore[no-any-return] + return value + + +def _lookup_path(data: dict[str, Any], *keys: str) -> Any: + current: Any = data + for key in keys: + if not isinstance(current, dict) or key not in current: + return None + current = current[key] + return current + + +def _extract_task_id(run: dict[str, Any], fallback: str = "") -> str: + return str( + run.get("task_id") + or _lookup_path(run, "task_spec", "task_id") + or _lookup_path(run, "episode", "task_id") + or fallback + ) + + +def _extract_bool(run: dict[str, Any], *keys: str, default: bool = False) -> bool: + for key in keys: + value = run.get(key) + if value is not None: + return bool(value) + return default + + +def _extract_steps(run: dict[str, Any]) -> int | None: + for key in ("steps", "steps_taken", "steps_used"): + if run.get(key) is not None: + return int(run[key]) + signal_steps = _lookup_path(run, "signals", "steps") + if signal_steps is 
not None: + return int(signal_steps) + final_step = _lookup_path(run, "final_state", "step_count") + if final_step is not None: + return int(final_step) + return None + + +def _extract_token_count(run: dict[str, Any]) -> int | None: + for key in ("total_tokens", "token_count", "tokens"): + if run.get(key) is not None: + return int(run[key]) + signal_tokens = _lookup_path(run, "signals", "token_count") + if signal_tokens is not None: + return int(signal_tokens) + + trajectory_total = _sum_record_tokens(run.get("trajectory", [])) + if trajectory_total is not None: + return trajectory_total + return _sum_record_tokens(run.get("transcript", []), kind="query") + + +def _sum_record_tokens(records: Any, kind: str | None = None) -> int | None: + from interface.telemetry import token_count_from_record + + if not isinstance(records, list): + return None + total = 0 + found = False + for item in records: + if not isinstance(item, dict): + continue + if kind is not None and item.get("kind") != kind: + continue + item_tokens = token_count_from_record(item) + if item_tokens is not None: + total += item_tokens + found = True + return total if found else None + + +def _state_position(state: Any) -> tuple[int, int] | None: + if not isinstance(state, dict): + return None + raw = state.get("agent_position") or state.get("position") + if isinstance(raw, (list, tuple)) and len(raw) >= 2: + return int(raw[0]), int(raw[1]) + return None + + +def _extract_run_positions(run: dict[str, Any]) -> list[tuple[int, int]]: + positions: list[tuple[int, int]] = [] + + initial_pos = _state_position(run.get("initial_state")) + if initial_pos is not None: + positions.append(initial_pos) + + for item in run.get("trajectory", []): + if not isinstance(item, dict): + continue + pos = _state_position(item.get("state")) + if pos is not None: + positions.append(pos) + + for item in run.get("transcript", []): + if not isinstance(item, dict): + continue + if item.get("kind") == "reset": + pos = 
def _extract_canonical_positions(
    canonical_paths: dict[str, Any],
    agent: str = "bfs",
) -> list[tuple[int, int]]:
    """Return the canonical ``(int, int)`` cells recorded for ``agent``.

    The agent's entry is looked up by name. For ``"bfs"`` only, a payload with
    no ``"bfs"`` key is itself treated as the entry (flat layout). Entries that
    are not dicts give an empty list; position items that are not a list/tuple
    of at least two values are skipped.
    """
    fallback = canonical_paths if agent == "bfs" else {}
    entry = canonical_paths.get(agent, fallback)
    if not isinstance(entry, dict):
        return []
    return [
        (int(cell[0]), int(cell[1]))
        for cell in entry.get("positions", [])
        if isinstance(cell, (list, tuple)) and len(cell) >= 2
    ]
def compute_runtime_score(
    run: dict[str, Any],
    static_score: dict[str, Any] | StaticScoreArtifact,
    canonical_paths: dict[str, Any] | CanonicalPathReport,
    config: ScorerConfig | None = None,
    difficulty_max_static_score: float | None = None,
) -> RuntimeScoreArtifact:
    """Compute the Stage 4 runtime score for one run JSON payload.

    Args:
        run: Decoded run/episode payload. Steps, tokens, success and positions
            are read through the ``_extract_*`` helpers in this module.
        static_score: ``scored_static.json`` content, either as a dict or as
            an artifact object exposing ``to_dict()``.
        canonical_paths: ``canonical_paths.json`` content, same two forms.
        config: Scorer configuration; ``ScorerConfig.default()`` when omitted.
        difficulty_max_static_score: Normalizer for the difficulty weight.
            When given it overrides ``config.difficulty_max_static_score``.

    Returns:
        A ``RuntimeScoreArtifact`` carrying the per-run signals, the composite
        score, and a hash of the inputs that produced them.

    Raises:
        ValueError: If the static artifact is explicitly flagged
            schema-invalid; if optimal steps, step telemetry, token telemetry
            or the difficulty normalizer are missing or out of range; or if
            greedy solvability is missing or outside ``[0, 1]``.
    """
    scorer_config = config or ScorerConfig.default()
    static_data = _artifact_dict(static_score)
    canonical_data = _artifact_dict(canonical_paths)
    # Only an explicit ``False`` blocks scoring; a missing flag is tolerated.
    if _lookup_path(static_data, "validation", "schema_valid") is False:
        raise ValueError("Runtime scoring requires a schema-valid scored_static.json artifact")

    task_id = _extract_task_id(run, fallback=str(static_data.get("task_id", "")))
    success = _extract_bool(
        run,
        "success",
        default=bool(_lookup_path(run, "signals", "success")),
    )
    steps = _extract_steps(run)
    token_count = _extract_token_count(run)
    canonical_positions = _extract_canonical_positions(canonical_data)
    greedy_positions = _extract_canonical_positions(canonical_data, agent="greedy")
    run_positions = _extract_run_positions(run)

    # Accept the nested layout first, then the two flat fallbacks.
    optimal_steps_value = _first_present(
        _lookup_path(canonical_data, "bfs", "optimal_steps"),
        canonical_data.get("optimal_steps"),
        static_data.get("optimal_steps"),
    )
    if optimal_steps_value is None:
        raise ValueError("Runtime scoring requires bfs.optimal_steps in canonical_paths.json")
    optimal_steps = int(optimal_steps_value)
    if steps is None:
        raise ValueError("Runtime scoring requires step telemetry")
    if steps < 0:
        raise ValueError("steps must not be negative")

    # Failed runs keep step_ratio at 0. A zero-step optimum only earns full
    # credit when the run also took zero steps.
    step_ratio = 0.0
    if success and optimal_steps == 0:
        step_ratio = 1.0 if steps == 0 else 0.0
    elif success:
        # The max() keeps the ratio within (0, 1] even if the run reports
        # fewer steps than the optimum.
        step_ratio = optimal_steps / max(float(steps), float(optimal_steps), 1.0)

    cell_overlap_bfs = _cell_overlap(run_positions, canonical_positions)
    # Greedy overlap is reported only when a greedy trace is present at all.
    cell_overlap_greedy = (
        _cell_overlap(run_positions, greedy_positions)
        if isinstance(canonical_data.get("greedy"), dict)
        else None
    )
    if token_count is None:
        raise ValueError("Runtime scoring requires positive token telemetry")
    if token_count <= 0:
        raise ValueError("token_count must be greater than zero")
    token_efficiency = min(1.0, scorer_config.baseline_tokens / float(token_count))

    static_composite = _extract_static_score(static_data)
    normalizer = (
        difficulty_max_static_score
        if difficulty_max_static_score is not None
        else scorer_config.difficulty_max_static_score
    )
    if normalizer is None:
        raise ValueError(
            "Runtime scoring requires difficulty_max_static_score from the task suite "
            "or scorer config"
        )
    if normalizer <= 0:
        raise ValueError("difficulty_max_static_score must be greater than zero")
    if static_composite > normalizer:
        raise ValueError("difficulty_max_static_score must be at least the task static score")
    difficulty_weight = static_composite / normalizer
    success_factor = 1.0 if success else 0.0
    efficiency_signals = {
        "step_ratio": step_ratio,
        "cell_overlap_bfs": cell_overlap_bfs,
        "token_efficiency": token_efficiency,
    }
    efficiency_factor = _runtime_weighted_average(
        efficiency_signals,
        scorer_config.runtime_weights,
    )
    greedy_solvability = _extract_greedy_solvability(static_data)
    greedy_penalty = (
        scorer_config.runtime_weights.get("greedy_penalty", 0.0)
        * greedy_solvability
        * success_factor
    )
    composite = max(0.0, success_factor * efficiency_factor * difficulty_weight - greedy_penalty)

    # Resolve identity/reward fields once, skipping explicit nulls. With
    # ``run.get(key, fallback)`` a JSON ``null`` suppressed the fallback alias
    # and was later stringified to the literal "None" in the artifact.
    backend = run.get("backend")
    adapter = _first_present(run.get("adapter"), run.get("agent_or_model"))
    model_id = _first_present(
        run.get("model_id"),
        run.get("model_name"),
        run.get("agent_or_model"),
    )
    reward = _first_present(run.get("reward"), run.get("total_reward"))

    signals: dict[str, Any] = {
        "success": success,
        "steps": steps,
        "terminated": _extract_bool(run, "terminated", default=False),
        "truncated": _extract_bool(run, "truncated", default=False),
        "terminated_reason": run.get("terminated_reason") or run.get("end_reason") or ("success" if success else "unknown"),
        "reward": reward,
        "token_count": token_count,
        "optimal_steps": optimal_steps,
        "step_ratio": step_ratio,
        "cell_overlap_bfs": cell_overlap_bfs,
        "cell_overlap_greedy": cell_overlap_greedy,
        "token_efficiency": token_efficiency,
        "difficulty_weight": difficulty_weight,
        "efficiency_factor": efficiency_factor,
        "greedy_penalty": greedy_penalty,
    }
    # Optional diagnostics are copied through verbatim when the run has them.
    for key in (
        "distractor_interactions",
        "irreversible_failures",
        "path_choice",
        "mechanism_interaction_order",
        "failure_point",
    ):
        if run.get(key) is not None:
            signals[key] = run[key]

    inputs_hash = stable_hash(
        {
            "run": {
                "task_id": task_id,
                "backend": backend,
                "adapter": adapter,
                "model_id": model_id,
                "seed": run.get("seed"),
                "positions": run_positions,
                "signals": signals,
            },
            "static_score": {
                "task_id": static_data.get("task_id"),
                "static_score": static_composite,
                "greedy_solvability": greedy_solvability,
            },
            "canonical_paths": {
                "bfs_positions": canonical_positions,
                "greedy_positions": greedy_positions,
                "optimal_steps": optimal_steps,
            },
            "config": scorer_config.to_dict(),
            "scorer_version": SCORER_VERSION,
        }
    )

    return RuntimeScoreArtifact(
        task_id=task_id,
        backend="" if backend is None else str(backend),
        adapter="" if adapter is None else str(adapter),
        model_id="" if model_id is None else str(model_id),
        seed=int(run["seed"]) if run.get("seed") is not None else None,
        signals=signals,
        composite=composite,
        calibration_version=scorer_config.version,
        inputs_hash=inputs_hash,
    )
None = None, + config: ScorerConfig | None = None, + difficulty_max_static_score: float | None = None, +) -> RuntimeScoreArtifact: + """Score one run JSON file and optionally write run_score.json.""" + run = load_json(run_path) + static_score = load_json(static_score_path) + canonical_paths = load_json(canonical_paths_path) + score = compute_runtime_score( + run, + static_score=static_score, + canonical_paths=canonical_paths, + config=config, + difficulty_max_static_score=difficulty_max_static_score, + ) + if output_path is not None: + dump_json(output_path, score.to_dict()) + return score diff --git a/scorer/scorer_config.json b/scorer/scorer_config.json new file mode 100644 index 0000000..fb7ed8f --- /dev/null +++ b/scorer/scorer_config.json @@ -0,0 +1,31 @@ +{ + "version": "default-v2", + "static_dimension_weights": { + "optimal_path_length": 1.0, + "search_space_size": 1.0, + "backtracking_required": 1.0, + "fragility": 1.0, + "dependency_depth": 1.0, + "dependency_variety": 1.0, + "distractor_count": 1.0, + "distractor_quality": 1.0, + "grid_size": 1.0, + "wall_density": 1.0, + "partial_observability": 1.0, + "irreversibility": 1.0 + }, + "distractor_type_weights": { + "wrong_color_key": 1.0, + "inactive_switch": 2.0, + "decoy_door": 2.0, + "distractor_chain": 3.0 + }, + "runtime_weights": { + "step_ratio": 1.0, + "cell_overlap_bfs": 1.0, + "token_efficiency": 1.0, + "greedy_penalty": 0.5 + }, + "baseline_tokens": 1000.0, + "difficulty_max_static_score": null +} diff --git a/scorer/scoring.py b/scorer/scoring.py new file mode 100644 index 0000000..6d12100 --- /dev/null +++ b/scorer/scoring.py @@ -0,0 +1,45 @@ +"""Public scorer interface for static and runtime analysis.""" + +from __future__ import annotations + +from .artifacts import ( + CanonicalPathReport, + RuntimeScoreArtifact, + ScoredDifficulty, + StaticScoreArtifact, +) +from .config import ( + CANONICAL_AGENT_FEATURE_NAMES, + DEFAULT_CONFIG_PATH, + DEFAULT_DISTRACTOR_TYPE_WEIGHTS, + 
def compute_canonical_paths(
    spec: TaskSpecification,
    bfs_path: PlannedPath | None = None,
    greedy_path: PlannedPath | None = None,
) -> CanonicalPathReport:
    """Build the canonical-path report (BFS trace plus greedy payload) for a task.

    The spec is schema-checked first. Either planned path may be supplied by
    the caller; a missing one is planned here with the baseline solver.
    """
    require_scorable_spec(spec)
    bfs = plan_bfs_path(spec) if bfs_path is None else bfs_path
    greedy = plan_greedy_path(spec) if greedy_path is None else greedy_path

    bfs_actions = list(bfs.action_labels)
    explored = bfs.states_explored
    if bfs.success:
        message = f"Solution found in {len(bfs_actions)} steps ({explored} states explored)"
    elif explored:
        message = f"No solution found ({explored} states explored, all reachable states checked)"
    else:
        message = "No solution found"

    # The hash covers the task content and scorer version only.
    digest = stable_hash({"task": spec.to_dict(), "scorer_version": SCORER_VERSION})

    return CanonicalPathReport(
        task_id=spec.task_id,
        success=bfs.success,
        actions=bfs_actions,
        positions=list(bfs.positions),
        # An unsolved task reports 0 optimal steps rather than a partial count.
        optimal_steps=len(bfs_actions) if bfs.success else 0,
        states_explored=explored,
        message=message,
        greedy=_path_payload(greedy),
        inputs_hash=digest,
    )
def compute_12d_score(
    spec: TaskSpecification,
    solver_output: DifficultyReport | None = None,
    weights: list[float] | None = None,
    config: ScorerConfig | None = None,
    validator: TaskValidator | None = None,
    bfs_path: PlannedPath | None = None,
) -> ScoredDifficulty:
    """
    Compute the 12-dimension static benchmark score.

    This keeps the old call shape while calibration and artifact generation
    live in the standalone scorer package.

    Args:
        spec: Task to score; it must pass ``require_scorable_spec``.
        solver_output: Precomputed difficulty report; computed here if omitted.
        weights: Explicit per-dimension weights. When omitted the weights come
            from ``config.static_weight_list()``.
        config: Scorer configuration; ``ScorerConfig.default()`` when omitted.
        validator: Reusable ``TaskValidator`` for ``spec``; built if omitted.
        bfs_path: Precomputed BFS plan; planned here if omitted.

    Returns:
        A ``ScoredDifficulty`` with the raw dimensions, their names, the
        weights used and the weighted-sum composite.

    Raises:
        ValueError: If the spec fails schema validation, if the validator
            reports ``min_steps_to_break == 0`` (fragility would divide by
            zero), or if the weight vector length differs from the number of
            dimensions.
    """
    require_scorable_spec(spec)
    scorer_config = config or ScorerConfig.default()
    task_validator = validator or TaskValidator(spec)
    if solver_output is None:
        solver_output = compute_difficulty(spec, validator=task_validator)
    if bfs_path is None:
        bfs_path = plan_bfs_path(spec)

    fragility = task_validator.compute_fragility()
    min_steps_to_break = fragility.min_steps_to_break
    if min_steps_to_break == -1:
        # NOTE(review): -1 looks like the validator's "cannot be broken"
        # sentinel; confirm against TaskValidator.compute_fragility.
        fragility_value = 0.0
    elif min_steps_to_break == 0:
        # Previously this escaped as a bare ZeroDivisionError with no task
        # context. The reciprocal is undefined here, so fail explicitly.
        raise ValueError(
            f"Task {spec.task_id!r} reported min_steps_to_break=0; "
            "fragility (1 / min_steps_to_break) is undefined"
        )
    else:
        fragility_value = 1.0 / min_steps_to_break

    width, height = spec.maze.dimensions
    grid_size = float(width * height)
    wall_density = float(len(spec.maze.walls) / grid_size) if grid_size else 0.0

    # Entry order is positional: it is zipped with the weight vector below.
    # NOTE(review): the labels follow the key order of static_dimension_weights
    # in scorer_config.json; confirm DIMENSION_NAMES uses the same order.
    dimensions = [
        float(len(bfs_path.action_labels) if bfs_path.success else 0),  # optimal_path_length
        float(bfs_path.states_explored),  # search_space_size
        _count_backtracking(bfs_path.positions),  # backtracking_required
        fragility_value,  # fragility
        # An explicit dependency chain wins over the solver's estimate.
        float(spec.dependency_chain.depth if spec.dependency_chain is not None else solver_output.dependency_depth),
        _dependency_variety(spec),  # dependency_variety
        float(len(spec.distractors or [])),  # distractor_count
        _distractor_quality(spec, scorer_config.distractor_type_weights),  # distractor_quality
        grid_size,  # grid_size
        wall_density,  # wall_density
        _partial_observability(spec),  # partial_observability
        _irreversibility(spec),  # irreversibility
    ]

    weight_vector = (
        scorer_config.static_weight_list()
        if weights is None
        else [float(weight) for weight in weights]
    )
    if len(weight_vector) != len(dimensions):
        raise ValueError(f"Expected {len(dimensions)} static weights, got {len(weight_vector)}")
    composite = float(sum(d * w for d, w in zip(dimensions, weight_vector)))
    return ScoredDifficulty(
        dimensions=dimensions,
        dimension_names=DIMENSION_NAMES.copy(),
        composite=composite,
        weights=weight_vector,
    )
def score_task_file(
    task_path: str | Path,
    output_dir: str | Path | None = None,
    config: ScorerConfig | None = None,
):
    """Score one task JSON file and, if asked, write its Stage 2 artifacts.

    The validator result, difficulty report and both planned paths are
    computed once here and handed to the canonical-path and static-score
    builders. When ``output_dir`` is given, ``canonical_paths.json`` and
    ``scored_static.json`` are written there (the directory is created).

    Returns:
        ``(canonical_paths, static_score)`` artifact objects.
    """
    spec = task_spec_from_payload(load_json(task_path))
    require_scorable_spec(spec)

    task_validator = TaskValidator(spec)
    validated = task_validator.validate()
    difficulty_report = compute_difficulty(
        spec,
        validator=task_validator,
        validation_result=validated,
    )

    # BFS is planned before greedy, as keyword arguments for both builders.
    planned = {
        "bfs_path": plan_bfs_path(spec),
        "greedy_path": plan_greedy_path(spec),
    }
    canonical_report = compute_canonical_paths(spec, **planned)
    static_artifact = compute_static_score_artifact(
        spec,
        config=config,
        solver_output=difficulty_report,
        validator=task_validator,
        validation_result=validated,
        **planned,
    )

    if output_dir is None:
        return canonical_report, static_artifact

    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)
    dump_json(target / "canonical_paths.json", canonical_report.to_dict())
    dump_json(target / "scored_static.json", static_artifact.to_dict())
    return canonical_report, static_artifact
b/scripts/run_pipeline.py @@ -0,0 +1,603 @@ +"""Bare-bones run-pipeline orchestrator for MultiNet v2.0 (tests 1-3). + +Sequential, inspectable Stage 1->5 driver. No DAG runner. Writes the +``artifacts/`` tree: + + artifacts/ + tasks//{canonical_paths.json, scored_static.json} + tasks/_suite.json + runs////seed_//{episode.json, run_inputs.json, run_score.json} + episode_runs.jsonl + reports//{scoring_calibration_summary,complexity_distance_summary,mechanism_ordering_pairs}.json + +Selection is data-driven via a **run-config** that maps each model to the task +files it should run (plus its provider/params); the **manifest** is a separate +task *catalog* that supplies per-task scoring metadata (experiment, condition, +expected_mechanisms, test-2 route cells). Stage 3 uses the ``interface/`` runner +(Stack A) with a live-model agent. Programmatic callers can inject any agent +callable, e.g. a stub for testing. +""" + +from __future__ import annotations + +import argparse +import json +import re +from pathlib import Path +from typing import Any, Callable, Iterable, Optional + +from prompting_experiments import CONDITION_SETS, iter_condition_configs +from scorer import compute_runtime_score, load_scorer_config, score_task_file +from scorer.config import SCORER_VERSION, ScorerConfig +from scorer.io import stable_hash, task_spec_from_payload + +from pipeline import episode_metrics, reports + +# Bump when Stage-3 run production changes in a way that invalidates cached episodes. +PIPELINE_VERSION = "0.1.0" + +Agent = Callable[[list[dict]], str] +# A factory used by tests to supply stub agents: (model_name, model_cfg) -> (agent, label). 
+AgentFactory = Callable[[str, dict[str, Any]], "tuple[Agent, str]"] + +_REPO_ROOT = Path(__file__).resolve().parents[1] +_DEFAULT_MANIFEST = _REPO_ROOT / "gridworld" / "fixtures" / "manifest.json" +_EXPERIMENT_KEYWORDS = {"test1", "test2", "test3", "all"} + + +def _sanitize(name: str) -> str: + return re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_") or "model" + + +# --------------------------------------------------------------------------- # +# Manifest catalog + task resolution +# --------------------------------------------------------------------------- # +def load_manifest(manifest_path: str | Path) -> list[dict[str, Any]]: + data = json.loads(Path(manifest_path).read_text(encoding="utf-8")) + rows = data["tasks"] if isinstance(data, dict) else data + if not isinstance(rows, list): + raise ValueError("Manifest must be a list of task rows or {'tasks': [...]}.") + return rows + + +def _resolve_path(source: str, manifest_path: Path) -> Optional[Path]: + candidate = Path(source) + if candidate.is_absolute(): + return candidate if candidate.exists() else None + for base in (Path.cwd(), manifest_path.parent, _REPO_ROOT): + resolved = (base / source).resolve() + if resolved.exists(): + return resolved + return None + + +def _resolve_source(row: dict[str, Any], manifest_path: Path) -> Path: + resolved = _resolve_path(row["source"], manifest_path) + if resolved is None: + raise FileNotFoundError(f"Task source not found for {row.get('task_id')}: {row['source']}") + return resolved + + +def _synth_row(path: Path) -> dict[str, Any]: + """A plain task file with no catalog entry runs as a test-1 nav task.""" + return { + "task_id": path.stem, + "experiment": "test1", + "condition": "default", + "variant": path.stem, + "source": str(path), + "expected_mechanisms": [], + "notes": "Synthesized (not in manifest catalog).", + } + + +def resolve_task_rows( + entries: Iterable[str], + catalog: list[dict[str, Any]], + manifest_path: Path, +) -> list[dict[str, Any]]: + """Resolve 
def _condition_configs(conditions: Optional[str]) -> list[tuple[str, ExperimentConfig]]:
    """Expand a ``--conditions`` value into ``(variant, ExperimentConfig)`` pairs.

    An empty or missing value yields one ``"default"`` variant with a default
    ``ExperimentConfig``. A named set is expanded with
    ``iter_condition_configs``; an unknown name raises ``ValueError``.
    """
    # Imported here, as in the original, not at module level.
    from interface.config import ExperimentConfig

    if conditions:
        if conditions in CONDITION_SETS:
            return list(iter_condition_configs(conditions, ExperimentConfig()))
        raise ValueError(
            f"Unknown --conditions {conditions!r}; available: {sorted(CONDITION_SETS)}."
        )
    return [("default", ExperimentConfig())]
def _score_suite(
    rows: list[dict[str, Any]],
    manifest_path: Path,
    artifacts_root: Path,
    config: ScorerConfig,
    force: bool,
) -> tuple[dict[str, dict[str, Any]], float]:
    """Run Stage 2 for the suite and persist its difficulty normalizer.

    Every row is scored through ``score_tasks``. The largest ``static_score``
    becomes ``difficulty_max_static_score`` (``1.0`` when nothing was scored).
    That value and the per-task scores are written to ``tasks/_suite.json``
    under ``artifacts_root``.

    Returns:
        ``(task_id -> scored_static dict, difficulty_max_static_score)``.
    """
    scored = score_tasks(rows, manifest_path, artifacts_root, config, force=force)
    difficulty_max = max(
        (float(entry.get("static_score", 0.0)) for entry in scored.values()),
        default=1.0,
    )

    summary = {
        "difficulty_max_static_score": difficulty_max,
        "tasks": {task_id: entry.get("static_score") for task_id, entry in scored.items()},
    }
    tasks_dir = artifacts_root / "tasks"
    tasks_dir.mkdir(parents=True, exist_ok=True)
    (tasks_dir / "_suite.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
    return scored, difficulty_max
def _run_one_model(
    rows: list[dict[str, Any]],
    agent: Agent,
    model_name: str,
    *,
    manifest_path: Path,
    artifacts_root: Path,
    static_by_task: dict[str, dict[str, Any]],
    difficulty_max: float,
    config: ScorerConfig,
    seeds: Iterable[int],
    conditions: Optional[str],
    force: bool,
) -> tuple[list[dict[str, Any]], dict[tuple, Optional[float]]]:
    """Run Stages 3-4 for one model over ``rows``.

    For every beatable task, seed and prompt variant this reuses or produces
    the Stage-3 episode, stamps its ``run_inputs.json`` sidecar, and then
    (re)computes the Stage-4 ``run_score.json``.

    Args:
        rows: Manifest rows to run; each needs a ``task_id``.
        agent: Agent handed to ``run_episode``.
        model_name: Label used in the run directory, sidecar and report rows.
        manifest_path: Manifest used to resolve each row's task source.
        artifacts_root: Root of the artifact tree.
        static_by_task: ``task_id -> scored_static dict`` from Stage 2.
        difficulty_max: Suite maximum static score passed to the runtime scorer.
        config: Scorer configuration.
        seeds: Seeds to run; may be any iterable, including a one-shot iterator.
        conditions: Prompt condition-set name, or ``None``.
        force: When true, never reuse a cached episode.

    Returns:
        ``(run_rows, composites)`` where ``composites`` maps
        ``(task_id, model_name, seed, manifest condition, prompt variant)`` to
        the run's composite score.
    """
    # NOTE(review): imported at call time rather than module level, presumably
    # to keep importing this module light — confirm before hoisting.
    from pipeline.run_stage3 import run_episode

    condition_configs = _condition_configs(conditions)
    # ``seeds`` is only promised to be an Iterable, yet it is walked once per
    # task below. A one-shot iterator would silently produce runs for the
    # first task only, so materialize it up front.
    seed_list = list(seeds)
    run_rows: list[dict[str, Any]] = []
    composites: dict[tuple, Optional[float]] = {}

    for row in rows:
        task_id = row["task_id"]
        scored_static = static_by_task[task_id]
        # Tasks Stage 2 marks unbeatable are ineligible: skip the expensive
        # Stage 3/4 work (model/API calls + scoring) entirely. The reports
        # surface them via scoring_calibration_summary's ineligible_tasks.
        if not scored_static.get("is_beatable", True):
            continue
        source = _resolve_source(row, manifest_path)
        spec = task_spec_from_payload(json.loads(Path(source).read_text(encoding="utf-8")))
        canonical = json.loads(
            (artifacts_root / "tasks" / task_id / "canonical_paths.json").read_text(encoding="utf-8")
        )

        for seed in seed_list:
            # The run-inputs hash ignores the prompt variant (variants are
            # separated by directory), so compute it once per seed.
            expected_hash = _expected_run_hash(spec, model_name, seed, "minigrid")

            for variant, cfg in condition_configs:
                run_dir = _run_dir(artifacts_root, task_id, model_name, seed, variant)
                episode_path = run_dir / "episode.json"
                sidecar_path = run_dir / "run_inputs.json"
                run_score_path = run_dir / "run_score.json"

                # ``condition`` is the task-intrinsic axis (test-3 mechanism
                # order, carried by the manifest); ``variant`` is the orthogonal
                # prompt axis from --conditions. Keep them separate so prompt
                # variants do not collapse onto the manifest condition.
                manifest_row = dict(row)

                # Stage 3 (expensive: model calls) is hash-cached. Reuse a cached
                # episode only when its stamped run-inputs hash still matches.
                reuse = False
                if not force and episode_path.exists() and sidecar_path.exists():
                    try:
                        stamped = json.loads(sidecar_path.read_text(encoding="utf-8"))
                    except (OSError, json.JSONDecodeError):
                        # An unreadable or truncated sidecar (e.g. an interrupted
                        # earlier run) is a cache miss, not a fatal error.
                        stamped = None
                    reuse = isinstance(stamped, dict) and stamped.get("inputs_hash") == expected_hash

                if reuse:
                    episode = json.loads(episode_path.read_text(encoding="utf-8"))
                else:
                    episode = run_episode(source, cfg, agent, seed, run_dir)
                    sidecar_path.write_text(
                        json.dumps(
                            {
                                "inputs_hash": expected_hash,
                                "producer_version": PIPELINE_VERSION,
                                "task_id": task_id,
                                "model_id": model_name,
                                "seed": seed,
                                "backend": "minigrid",
                                "condition": variant,
                            },
                            indent=2,
                        ),
                        encoding="utf-8",
                    )

                # Derive the test-2/test-3 signals once and share them between the
                # scorer-facing dict and the jsonl row (each call would otherwise
                # re-walk the whole transcript).
                metrics = episode_metrics.build_metrics(episode, canonical, manifest_row)

                # Stage 4 is cheap + deterministic: always (re)score from the
                # episode so scorer-config / static / canonical changes propagate.
                enriched = episode_metrics.enrich_run_for_scoring(
                    episode, manifest_row, agent_or_model=model_name, seed=seed, metrics=metrics
                )
                run_score = compute_runtime_score(
                    enriched,
                    static_score=scored_static,
                    canonical_paths=canonical,
                    config=config,
                    difficulty_max_static_score=difficulty_max,
                ).to_dict()
                run_score_path.write_text(json.dumps(run_score, indent=2), encoding="utf-8")

                run_rows.append(
                    episode_metrics.build_run_row(
                        episode,
                        canonical,
                        manifest_row,
                        agent_or_model=model_name,
                        seed=seed,
                        raw_output_ref=str(episode_path.relative_to(artifacts_root)),
                        metrics=metrics,
                        prompt_variant=variant,
                    )
                )
                composites[
                    (task_id, model_name, seed, manifest_row.get("condition"), variant)
                ] = run_score.get("composite")

    return run_rows, composites
def run_from_config(
    *,
    run_config_path: str | Path,
    manifest_path: str | Path = _DEFAULT_MANIFEST,
    seeds: Iterable[int] = (0,),
    conditions: Optional[str] = None,
    artifacts_root: str | Path = "artifacts",
    run_set_id: str = "default",
    scorer_config: Optional[ScorerConfig] = None,
    force: bool = False,
    agent_factory: Optional[AgentFactory] = None,
) -> dict[str, Any]:
    """Run-config entry: each model runs its own task selection (model -> task files).

    Static scoring runs once over the union of every model's tasks, so all
    models share one ``difficulty_max``; Stages 3-4 then run per model and the
    combined rows are aggregated into one report set.

    Args:
        run_config_path: JSON run-config with a ``models`` mapping.
        manifest_path: Task catalog used to resolve task rows.
        seeds: Seeds to run for every model; any iterable is accepted.
        conditions: Prompt condition-set name, or ``None``.
        artifacts_root: Root of the artifact tree.
        run_set_id: Report subdirectory name.
        scorer_config: Scorer config; loaded via ``load_scorer_config`` if omitted.
        force: Recompute instead of reusing cached artifacts.
        agent_factory: ``(name, model_cfg) -> (agent, label)``; defaults to
            ``_build_agent_from_spec``.

    Returns:
        The report payloads produced by ``_write_aggregate``.

    Raises:
        ValueError: If a model entry lists neither ``tasks`` nor ``runs``.
    """
    manifest_path = Path(manifest_path)
    artifacts_root = Path(artifacts_root)
    config = scorer_config or load_scorer_config()
    factory = agent_factory or _build_agent_from_spec
    # The same seeds are replayed for every model. Materialize them so a
    # one-shot iterator is not exhausted by the first model's run.
    seed_list = list(seeds)

    run_config = load_run_config(run_config_path)
    catalog = load_manifest(manifest_path)

    # Resolve each model's task rows + build its agent.
    plans: list[tuple[str, Agent, list[dict[str, Any]]]] = []
    union: dict[str, dict[str, Any]] = {}
    for name, model_cfg in run_config["models"].items():
        entries = model_cfg.get("tasks") or model_cfg.get("runs") or []
        if not entries:
            raise ValueError(f"Model {name!r} lists no tasks/runs.")
        rows = resolve_task_rows(entries, catalog, manifest_path)
        agent, label = factory(name, model_cfg)
        plans.append((_sanitize(label), agent, rows))
        for r in rows:
            # First occurrence wins; later models reuse the same task row.
            union.setdefault(r["task_id"], r)

    union_rows = list(union.values())
    static_by_task, difficulty_max = _score_suite(union_rows, manifest_path, artifacts_root, config, force)

    all_run_rows: list[dict[str, Any]] = []
    composites: dict[tuple, Optional[float]] = {}
    for model_name, agent, rows in plans:
        rr, comp = _run_one_model(
            rows,
            agent,
            model_name,
            manifest_path=manifest_path,
            artifacts_root=artifacts_root,
            static_by_task=static_by_task,
            difficulty_max=difficulty_max,
            config=config,
            seeds=seed_list,
            conditions=conditions,
            force=force,
        )
        all_run_rows.extend(rr)
        composites.update(comp)

    return _write_aggregate(all_run_rows, composites, static_by_task, union_rows, artifacts_root, run_set_id)
def load_run_config(path: str | Path) -> dict[str, Any]:
    """Read a JSON run-config and check its top-level shape.

    Returns the parsed object unchanged.

    Raises:
        ValueError: If the document is not an object whose ``models`` entry is
            itself a mapping.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    well_formed = isinstance(payload, dict) and isinstance(payload.get("models"), dict)
    if well_formed:
        return payload
    raise ValueError("Run-config must be an object with a 'models' mapping.")
(metadata).") + parser.add_argument("--seeds", type=int, nargs="+", default=[0]) + parser.add_argument("--conditions", default=None, help="Prompt condition-set name (optional).") + parser.add_argument("--artifacts-root", default=str(_REPO_ROOT / "artifacts")) + parser.add_argument("--run-set-id", default="default") + parser.add_argument("--force", action="store_true", help="Recompute existing artifacts.") + # Single-model fallback (when --run-config is not supplied): + parser.add_argument("--experiment", choices=["test1", "test2", "test3", "all"], default="all") + parser.add_argument("--agent", choices=["claude", "qwen"], help="Single-model provider.") + args = parser.parse_args(argv) + + if args.run_config: + payloads = run_from_config( + run_config_path=args.run_config, + manifest_path=args.manifest, + seeds=args.seeds, + conditions=args.conditions, + artifacts_root=args.artifacts_root, + run_set_id=args.run_set_id, + force=args.force, + ) + else: + if not args.agent: + parser.error("provide --run-config, or --agent for a single-model run.") + agent, label = _build_agent_from_spec(args.agent, {"provider": args.agent}) + payloads = run_pipeline( + manifest_path=args.manifest, + experiment=args.experiment, + agent=agent, + agent_name=label, + seeds=args.seeds, + conditions=args.conditions, + artifacts_root=args.artifacts_root, + run_set_id=args.run_set_id, + force=args.force, + ) + + summary = payloads["scoring_calibration_summary"] + print( + f"Pipeline complete: {summary['run_count']} runs over {summary['task_count']} tasks " + f"-> {args.artifacts_root}/reports/{args.run_set_id}/" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/score_json.py b/scripts/score_json.py new file mode 100644 index 0000000..a39707c --- /dev/null +++ b/scripts/score_json.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""CLI for scoring task and run JSON artifacts.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from 
scorer.io import dump_json, json_files, load_json +from scorer.scoring import ( + ScorerConfig, + compute_runtime_score, + load_scorer_config, + score_runtime_file, + score_task_file, +) + + +def _load_config(args: argparse.Namespace) -> ScorerConfig: + return load_scorer_config(args.config) + + +def _static_target_dirs(files: list[Path], output_root: Path | None) -> list[Path]: + if output_root is None: + return [path.with_suffix("").with_name(f"{path.stem}_score") for path in files] + if len(files) == 1: + return [output_root] + + target_dirs = [output_root / path.stem for path in files] + duplicates = sorted( + { + str(target) + for target in target_dirs + if target_dirs.count(target) > 1 + } + ) + if duplicates: + raise ValueError( + "Static output directories collide for same-stem inputs: " + f"{', '.join(duplicates)}. Score those inputs separately or use distinct filenames." + ) + return target_dirs + + +def _default_runtime_output(run_path: str | Path) -> Path: + path = Path(run_path) + return path.with_name(f"{path.stem}_score.json") + + +def _static(args: argparse.Namespace) -> int: + config = _load_config(args) + files = json_files(args.inputs) + if not files: + raise FileNotFoundError("No JSON files matched the static scoring inputs") + + output_root = Path(args.output_dir) if args.output_dir else None + succeeded = 0 + failed = 0 + for task_path, target_dir in zip(files, _static_target_dirs(files, output_root)): + try: + canonical, static_score = score_task_file( + task_path, + output_dir=target_dir, + config=config, + ) + except Exception as exc: + failed += 1 + print( + "static: error " + f"input={task_path} output_dir={target_dir} " + f"error={exc.__class__.__name__}: {exc}", + file=sys.stderr, + flush=True, + ) + continue + + succeeded += 1 + print( + "static: ok " + f"input={task_path} task_id={static_score.task_id} " + f"static_score={static_score.static_score:.3f} " + f"beatable={static_score.is_beatable} " + 
def _runtime(args: argparse.Namespace) -> int:
    """Handle the ``runtime`` subcommand: score one run/episode JSON.

    Either both ``--static-score`` and ``--canonical-paths`` are supplied and
    scoring is delegated to ``score_runtime_file``, or ``--task`` is supplied
    and the static artifacts are computed first, after which the score is
    written to the output path here.

    Returns:
        0 on success.

    Raises:
        ValueError: If only one of the two static artifacts is given, if no
            suite maximum is available from the CLI or the config, or if
            neither the static artifacts nor ``--task`` are provided.
    """
    config = _load_config(args)
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = _default_runtime_output(args.run)

    static_missing = args.static_score is None
    canonical_missing = args.canonical_paths is None
    if static_missing != canonical_missing:
        raise ValueError("--static-score and --canonical-paths must be provided together")

    suite_max = args.difficulty_max_static_score
    if suite_max is None and config.difficulty_max_static_score is None:
        raise ValueError(
            "Runtime scoring needs a suite maximum. Pass --difficulty-max-static-score "
            "or set difficulty_max_static_score in scorer config."
        )

    if args.static_score and args.canonical_paths:
        # Pre-computed static artifacts: the file-level helper receives the
        # output path directly.
        score = score_runtime_file(
            args.run,
            static_score_path=args.static_score,
            canonical_paths_path=args.canonical_paths,
            output_path=output_path,
            config=config,
            difficulty_max_static_score=suite_max,
        )
    elif not args.task:
        raise ValueError(
            "Runtime scoring needs --static-score and --canonical-paths, "
            "or --task so those artifacts can be computed."
        )
    else:
        # Derive the static artifacts from the task, then score in memory and
        # persist the result ourselves.
        canonical, static_score = score_task_file(
            args.task,
            output_dir=args.artifact_dir,
            config=config,
        )
        score = compute_runtime_score(
            load_json(args.run),
            static_score=static_score,
            canonical_paths=canonical,
            config=config,
            difficulty_max_static_score=suite_max,
        )
        dump_json(output_path, score.to_dict())

    print(f"{score.task_id}: runtime_score={score.composite:.3f} -> {output_path}")
    return 0
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the chosen subcommand handler.

    Returns the handler's result coerced to ``int`` for use as an exit status.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return int(handler(namespace))
+ +Checks: + * every fixture passes ``TaskSpecification.validate()`` and BFS-solves; + * test-2 rows: both the short route and the route forced by walling off + ``route_block`` are solvable and visit distinct cells; + * test-3 rows: members sharing a ``pair_id`` have identical maze topology + (dimensions + walls) and equal BFS optimal step counts. +""" + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path +from typing import Any + +from gridworld.baselines import plan_bfs_path +from gridworld.task_spec import TaskSpecification + +_REPO_ROOT = Path(__file__).resolve().parents[1] + + +def _resolve(source: str, manifest_path: Path) -> Path: + candidate = Path(source) + if candidate.is_absolute() and candidate.exists(): + return candidate + for base in (manifest_path.parent, _REPO_ROOT): + resolved = (base / source).resolve() + if resolved.exists(): + return resolved + raise FileNotFoundError(f"Task source not found: {source}") + + +def _load_spec(path: Path) -> TaskSpecification: + return TaskSpecification.from_json(str(path)) + + +def _spec_with_extra_wall(spec: TaskSpecification, cell: list[int]) -> TaskSpecification: + data = spec.to_dict() + walls = [list(w) for w in data["maze"].get("walls", [])] + if list(cell) not in walls: + walls.append(list(cell)) + data["maze"]["walls"] = walls + return TaskSpecification.from_dict(data) + + +def _validate_one(row: dict[str, Any], manifest_path: Path) -> list[str]: + errors: list[str] = [] + source = _resolve(row["source"], manifest_path) + spec = _load_spec(source) + ok, messages = spec.validate() + if not ok: + errors.append(f"{row['task_id']}: validate() failed: {messages}") + return errors + bfs = plan_bfs_path(spec) + if not bfs.success: + errors.append(f"{row['task_id']}: BFS could not solve the task") + return errors + + +def _derive_test2_routes(row: dict[str, Any], manifest_path: Path) -> list[str]: + errors: list[str] = [] + source = 
_resolve(row["source"], manifest_path) + spec = _load_spec(source) + + short = plan_bfs_path(spec) + if not short.success: + return [f"{row['task_id']}: short route unsolvable"] + + block = row.get("route_block") + if block is None: + interior = [p for p in short.positions[1:-1]] + if not interior: + return [f"{row['task_id']}: no interior cell to block; set route_block explicitly"] + block = list(interior[len(interior) // 2]) + + long = plan_bfs_path(_spec_with_extra_wall(spec, block)) + if not long.success: + errors.append( + f"{row['task_id']}: blocking {block} leaves no alternate route (pick a different route_block)" + ) + return errors + + short_cells = {tuple(p) for p in short.positions} + long_cells = {tuple(p) for p in long.positions} + short_only = sorted(short_cells - long_cells) + long_only = sorted(long_cells - short_cells) + if not short_only or not long_only: + errors.append(f"{row['task_id']}: routes do not diverge enough to discriminate path_choice") + + row["route_block"] = list(block) + row["route_short_cells"] = [list(c) for c in short_only] + row["route_long_cells"] = [list(c) for c in long_only] + return errors + + +def _check_test3_pairs(rows: list[dict[str, Any]], manifest_path: Path) -> list[str]: + errors: list[str] = [] + pairs: dict[str, list[dict[str, Any]]] = defaultdict(list) + for row in rows: + if row.get("experiment") == "test3" and row.get("pair_id"): + pairs[row["pair_id"]].append(row) + + for pair_id, members in pairs.items(): + if len(members) < 2: + errors.append(f"pair {pair_id}: needs >= 2 members, found {len(members)}") + continue + specs = [_load_spec(_resolve(m["source"], manifest_path)) for m in members] + dims = {tuple(s.maze.dimensions) for s in specs} + walls = {frozenset((w.x, w.y) for w in s.maze.walls) for s in specs} + if len(dims) != 1: + errors.append(f"pair {pair_id}: maze dimensions differ: {dims}") + if len(walls) != 1: + errors.append(f"pair {pair_id}: wall layouts differ across members") + optimal = [] + for 
member, spec in zip(members, specs): + bfs = plan_bfs_path(spec) + if not bfs.success: + errors.append(f"{member['task_id']}: BFS could not solve the task") + else: + optimal.append(len(bfs.action_labels)) + if len(set(optimal)) > 1: + errors.append( + f"pair {pair_id}: BFS optimal step counts differ {optimal} " + "(test3 requires equal path length within a pair)" + ) + return errors + + +def validate_manifest(manifest_path: Path) -> tuple[dict[str, Any], list[str]]: + data = json.loads(manifest_path.read_text(encoding="utf-8")) + rows = data["tasks"] if isinstance(data, dict) else data + + errors: list[str] = [] + for row in rows: + errors.extend(_validate_one(row, manifest_path)) + if row.get("experiment") == "test2": + errors.extend(_derive_test2_routes(row, manifest_path)) + errors.extend(_check_test3_pairs(rows, manifest_path)) + return data, errors + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Validate pipeline fixtures and derive test-2 routes.") + parser.add_argument( + "--manifest", default=str(_REPO_ROOT / "gridworld" / "fixtures" / "manifest.json") + ) + parser.add_argument("--write", action="store_true", help="Persist derived route cells to the manifest.") + args = parser.parse_args(argv) + + manifest_path = Path(args.manifest) + data, errors = validate_manifest(manifest_path) + + if errors: + print("Fixture validation FAILED:") + for err in errors: + print(f" - {err}") + return 1 + + if args.write: + manifest_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8") + print(f"Validated OK; route discriminators written to {manifest_path}") + else: + print("Validated OK (use --write to cache test-2 route discriminators).") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_episode_metrics.py b/tests/test_episode_metrics.py new file mode 100644 index 0000000..b00fa52 --- /dev/null +++ b/tests/test_episode_metrics.py @@ -0,0 +1,163 @@ +"""Unit tests 
for Stage-3 instrumentation (pipeline.episode_metrics).""" + +from __future__ import annotations + +from pipeline import episode_metrics as em + + +def _state(pos, *, keys=(), switches=(), doors=(), gates=(), reward=0.0): + return { + "agent_position": list(pos), + "collected_keys": list(keys), + "active_switches": list(switches), + "open_doors": list(doors), + "open_gates": list(gates), + "reward": reward, + } + + +def _step(pos, event_type="MOVED", **state_kwargs): + return {"kind": "step", "event_type": event_type, "state_after": _state(pos, **state_kwargs)} + + +def _episode(steps, *, success, end_reason, initial_pos=(1, 1)): + initial = _state(initial_pos) + final = steps[-1]["state_after"] if steps else initial + return { + "success": success, + "end_reason": end_reason, + "steps_used": len(steps), + "initial_state": initial, + "final_state": final, + "transcript": [{"kind": "reset", "state": initial}, *steps], + } + + +# --------------------------------------------------------------------------- # +# visited_cells: uses state_after.agent_position (x, y), collapses duplicates +# --------------------------------------------------------------------------- # +def test_visited_cells_uses_agent_position_and_dedupes(): + ep = _episode( + [_step((1, 1)), _step((2, 1)), _step((2, 1)), _step((3, 1))], + success=True, + end_reason="success", + ) + assert em.visited_cells(ep) == [(1, 1), (2, 1), (3, 1)] + + +# --------------------------------------------------------------------------- # +# mechanism_interaction_order +# --------------------------------------------------------------------------- # +def test_mechanism_order_key_then_switch(): + ep = _episode( + [ + _step((2, 1), "PICKUP", keys=("kB",)), + _step((2, 1), "TOGGLED", keys=("kB",), switches=("s1",), gates=("g1",)), + ], + success=True, + end_reason="success", + ) + # switch (active_switches) ranks before its downstream gate (open_gates). 
def test_failure_point_reports_first_missing_expected_mechanism():
    """A failed run is attributed to the first expected mechanism it never reached."""
    # The agent picks up kB at (2, 1) and then runs out of steps.
    ep = _episode(
        [_step((2, 1), "PICKUP", keys=("kB",))],
        success=False,
        end_reason="max_steps",
    )
    order = em.mechanism_interaction_order(ep)
    # Expected order is kB then s1; only kB was interacted with, so s1 is the
    # mechanism the failure should be pinned on.
    fp = em.failure_point(ep, ["kB", "s1"], order)
    assert fp["mechanism"] == "s1"
    assert fp["end_reason"] == "max_steps"
    # final_cell mirrors the last step's agent position.
    assert fp["final_cell"] == [2, 1]
def test_episode_token_count_sums_query_usage():
    """Token count adds up usage across query records in the transcript."""
    ep = {
        "transcript": [
            # Carries an explicit total.
            {"kind": "query", "usage": {"total_tokens": 10}},
            # Non-query record with no usage at all.
            {"kind": "step"},
            # No total_tokens here; the expected sum implies input + output
            # (5 + 3) is counted for this record.
            {"kind": "query", "usage": {"input_tokens": 5, "output_tokens": 3}},
        ]
    }
    # 10 + (5 + 3)
    assert em.episode_token_count(ep) == 18
def _pulls_interface(module: str) -> bool:
    """Report whether importing ``module`` in a fresh interpreter loads ``interface``.

    The probe runs in a child Python process started from the repo root,
    imports ``module`` and prints ``IFACE`` if ``interface`` or any
    ``interface.*`` submodule ended up in ``sys.modules``, else ``CLEAN``.
    Asserts that the child exited cleanly, surfacing its stderr otherwise.
    """
    probe_statements = [
        f"import {module}, sys",
        "hit = [m for m in sys.modules if m == 'interface' or m.startswith('interface.')]",
        "print('IFACE' if hit else 'CLEAN')",
    ]
    command = [sys.executable, "-c", "; ".join(probe_statements)]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        cwd=str(_REPO_ROOT),
    )
    assert completed.returncode == 0, completed.stderr
    return "IFACE" in completed.stdout
"input_tokens": 8, + "output_tokens": 2, + "total_tokens": 10, + } + return f"FINAL_OUTPUT: {next(self._actions)}" + + +class FirstQueryUsageReplayAgent(UsageReplayAgent): + def __init__(self): + super().__init__() + self._calls = 0 + + def __call__(self, messages): + self._calls += 1 + if self._calls == 1: + self.last_usage = { + "input_tokens": 8, + "output_tokens": 2, + "total_tokens": 10, + } + return f"FINAL_OUTPUT: {next(self._actions)}" + + +def test_normalized_usage_accepts_provider_token_keys(): + assert normalize_token_usage({"input_tokens": 8, "output_tokens": 2}) == { + "input_tokens": 8, + "output_tokens": 2, + "total_tokens": 10, + } + + +def test_agent_recorder_forwards_usage_metadata(): + records = [] + recorder = _AgentRecorder(UsageReplayAgent(), records) + + recorder([]) + + assert recorder.last_usage == { + "input_tokens": 8, + "output_tokens": 2, + "total_tokens": 10, + } + assert records[0]["usage"]["total_tokens"] == 10 + + +def test_runner_persists_agent_usage_in_query_transcript(): + maze_path = default_maze_path("V01_empty_room.json") + backend, spec = load_task(maze_path) + runner = build_runner( + ExperimentConfig( + observation="text_only", + context_window="current", + querying="step_by_step", + chat_history="stateless", + ), + backend, + spec, + ) + + result = runner.run(UsageReplayAgent(), verbose=False, maze_path=maze_path) + query_records = [item for item in result["transcript"] if item.get("kind") == "query"] + + assert result["success"] is True + assert query_records + assert query_records[0]["usage"]["total_tokens"] == 10 + + +def test_runner_clears_stale_usage_between_queries(): + maze_path = default_maze_path("V01_empty_room.json") + backend, spec = load_task(maze_path) + runner = build_runner( + ExperimentConfig( + observation="text_only", + context_window="current", + querying="step_by_step", + chat_history="stateless", + ), + backend, + spec, + ) + + result = runner.run(FirstQueryUsageReplayAgent(), verbose=False, 
maze_path=maze_path) + query_records = [item for item in result["transcript"] if item.get("kind") == "query"] + + assert query_records[0]["usage"]["total_tokens"] == 10 + assert "usage" not in query_records[1] diff --git a/tests/test_reports.py b/tests/test_reports.py new file mode 100644 index 0000000..f146880 --- /dev/null +++ b/tests/test_reports.py @@ -0,0 +1,153 @@ +"""Unit tests for Stage-5 report aggregators (pipeline.reports).""" + +from __future__ import annotations + +from pipeline import reports + + +def _row(**kw): + base = { + "task_id": "t", + "experiment": "test1", + "condition": "default", + "prompt_variant": "default", + "agent_or_model": "m", + "seed": 0, + "success": True, + "optimality_ratio": 1.0, + "path_choice": None, + "mechanism_interaction_order": [], + "failure_point": None, + } + base.update(kw) + return base + + +def test_scoring_calibration_summary_groups_and_correlates(): + rows = [ + _row(task_id="a", success=True, optimality_ratio=1.0), + _row(task_id="b", success=False, optimality_ratio=0.0), + ] + composites = { + ("a", "m", 0, "default", "default"): 0.2, + ("b", "m", 0, "default", "default"): 0.8, + } + static_by_task = { + "a": {"static_score": 1.0, "dimensions_12": {"grid_size": 9.0, "optimal_path_length": 3.0}}, + "b": {"static_score": 5.0, "dimensions_12": {"grid_size": 25.0, "optimal_path_length": 9.0}}, + } + summary = reports.scoring_calibration_summary(rows, composites, static_by_task) + + assert summary["run_count"] == 2 + assert summary["task_count"] == 2 + assert summary["success_rate_by_task"]["a"]["success_rate"] == 1.0 + assert summary["success_rate_by_task"]["b"]["success_rate"] == 0.0 + # Only successful runs feed optimality. + assert summary["optimality_ratio_mean"] == 1.0 + # Two tasks with variance -> correlation defined for the populated dims. 
+ assert summary["dimension_correlation"]["grid_size"] is not None + assert "p33" in summary["tier_boundary_candidates"] + + +def test_prompt_variants_do_not_collide(): + # Same task + manifest condition, two prompt variants: their composites and + # success must stay distinct (regression for the setdefault collapse bug). + rows = [ + _row(task_id="a", prompt_variant="step_by_step", success=True), + _row(task_id="a", prompt_variant="bulk", success=False), + ] + composites = { + ("a", "m", 0, "default", "step_by_step"): 0.9, + ("a", "m", 0, "default", "bulk"): 0.1, + } + static_by_task = {"a": {"static_score": 1.0, "dimensions_12": {}}} + summary = reports.scoring_calibration_summary(rows, composites, static_by_task) + + by_variant = summary["success_rate_by_prompt_variant"] + assert by_variant["step_by_step"]["success_rate"] == 1.0 + assert by_variant["bulk"]["success_rate"] == 0.0 + # Both per-variant composites are reachable (neither overwrote the other). + assert summary["run_count"] == 2 + + +def test_complexity_distance_summary_counts_path_choice(): + rows = [ + _row(experiment="test2", task_id="T2", condition="shortcut", path_choice="short_mech", success=True), + _row(experiment="test2", task_id="T2", condition="shortcut", path_choice="long_open", success=False), + _row(experiment="test2", task_id="T2", condition="shortcut", path_choice="short_mech", success=True), + ] + summary = reports.complexity_distance_summary(rows) + assert summary["run_count"] == 3 + assert summary["path_choice_overall"]["short_mech"] == 2 + assert summary["path_choice_overall"]["long_open"] == 1 + assert summary["success_rate_by_path_choice"]["short_mech"] == 1.0 + assert summary["success_rate_by_path_choice"]["long_open"] == 0.0 + + +def test_mechanism_ordering_pairs_paired_delta(): + rows = [ + _row(experiment="test3", task_id="k", condition="key_first", success=True, + mechanism_interaction_order=["kB", "s1"]), + _row(experiment="test3", task_id="s", condition="switch_first", 
success=False, + mechanism_interaction_order=["s1"], failure_point={"mechanism": "kB"}), + ] + manifest = [ + {"task_id": "k", "pair_id": "corridor", "expected_mechanisms": ["kB", "s1"]}, + {"task_id": "s", "pair_id": "corridor", "expected_mechanisms": ["s1", "kB"]}, + ] + summary = reports.mechanism_ordering_pairs(rows, manifest) + pair = summary["pairs"]["corridor"] + assert pair["conditions"]["key_first"]["success_rate"] == 1.0 + assert pair["conditions"]["switch_first"]["success_rate"] == 0.0 + assert pair["conditions"]["key_first"]["expected_order_match_rate"] == 1.0 + assert pair["conditions"]["switch_first"]["failure_point_counts"]["kB"] == 1 + # sorted conditions: ["key_first", "switch_first"] -> delta = 1.0 - 0.0 + assert pair["paired_success_delta"]["success_delta"] == 1.0 + + +def test_model_report_aggregates_per_model(): + rows = [ + _row(task_id="a", agent_or_model="m1", experiment="test1", + success=True, optimality_ratio=1.0, steps=3, optimal_steps=3, tokens=10), + _row(task_id="b", agent_or_model="m1", experiment="test1", + success=False, optimality_ratio=0.0, steps=9, optimal_steps=3, tokens=20), + _row(task_id="a", agent_or_model="m2", experiment="test1", + success=True, optimality_ratio=0.5, steps=6, optimal_steps=3, tokens=5), + ] + composites = { + ("a", "m1", 0, "default", "default"): 0.4, + ("b", "m1", 0, "default", "default"): 0.0, + ("a", "m2", 0, "default", "default"): 0.2, + } + + rep = reports.model_report(rows, composites, "m1", "rs") + assert rep["schema_version"] == "0.1.0" + assert rep["model_id"] == "m1" + assert rep["run_set_id"] == "rs" + assert rep["provisional"] is True + assert rep["run_count"] == 2 + assert rep["task_count"] == 2 + assert rep["overall"]["success_rate"] == 0.5 + assert rep["overall"]["optimality_ratio_mean"] == 1.0 # successful runs only + assert rep["overall"]["tokens_total"] == 30.0 + assert rep["overall"]["composite_mean"] == 0.2 # mean(0.4, 0.0) + assert "test1" in rep["by_experiment"] + assert "default" in 
rep["by_prompt_variant"] + assert len(rep["tasks"]) == 2 + assert {t["task_id"] for t in rep["tasks"]} == {"a", "b"} + + # A second model is fully independent (no collision). + rep2 = reports.model_report(rows, composites, "m2", "rs") + assert rep2["run_count"] == 1 + assert rep2["overall"]["success_rate"] == 1.0 + + +def test_scoring_calibration_summary_lists_ineligible_tasks(): + rows = [_row(task_id="a", success=True)] + composites = {("a", "m", 0, "default", "default"): 0.5} + static_by_task = { + "a": {"static_score": 1.0, "dimensions_12": {}, "is_beatable": True}, + "dead": {"static_score": 0.0, "dimensions_12": {}, "is_beatable": False}, + } + summary = reports.scoring_calibration_summary(rows, composites, static_by_task) + assert summary["ineligible_tasks"] == ["dead"] diff --git a/tests/test_run_pipeline.py b/tests/test_run_pipeline.py new file mode 100644 index 0000000..6af8e9e --- /dev/null +++ b/tests/test_run_pipeline.py @@ -0,0 +1,379 @@ +"""End-to-end test for the bare-bones run pipeline using a replay stub agent. + +Runs are live-model-only in production, but the runner accepts any callable +``messages -> str`` agent, so a deterministic replay stub exercises the full +Stage 1->5 chain (real MiniGrid backend, episode log, and scorer) with no API. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +from interface.loader import default_maze_path +from interface.smoke_tests.plans import v01_empty_room_trajectory + +from scripts.run_pipeline import resolve_task_rows, run_from_config, run_pipeline + +_MANIFEST = Path(__file__).resolve().parents[1] / "gridworld" / "fixtures" / "manifest.json" + + +class ReplayAgent: + """Replays a fixed action plan and reports token usage (scorer needs >0).""" + + def __init__(self, actions): + self._actions = iter(actions) + self.last_usage = None + + def __call__(self, messages): + self.last_usage = {"input_tokens": 8, "output_tokens": 2, "total_tokens": 10} + try: + action = next(self._actions) + except StopIteration: + action = "DONE" + return f"FINAL_OUTPUT: {action}" + + +def _write_manifest(tmp_path: Path) -> Path: + manifest = { + "tasks": [ + { + "task_id": "validation_10_v01_empty_room", + "experiment": "test1", + "condition": "default", + "variant": "empty_room", + "source": str(default_maze_path("V01_empty_room.json")), + "expected_mechanisms": [], + "notes": "E2E smoke task.", + } + ] + } + path = tmp_path / "manifest.json" + path.write_text(json.dumps(manifest), encoding="utf-8") + return path + + +def test_pipeline_writes_full_artifact_tree(tmp_path): + manifest_path = _write_manifest(tmp_path) + artifacts = tmp_path / "artifacts" + + payloads = run_pipeline( + manifest_path=manifest_path, + experiment="test1", + agent=ReplayAgent(v01_empty_room_trajectory()), + agent_name="replay-stub", + seeds=[0], + conditions=None, + artifacts_root=artifacts, + run_set_id="smoke", + ) + + task_id = "validation_10_v01_empty_room" + task_dir = artifacts / "tasks" / task_id + assert (task_dir / "canonical_paths.json").exists() + assert (task_dir / "scored_static.json").exists() + assert (artifacts / "tasks" / "_suite.json").exists() + + run_dir = artifacts / "runs" / task_id / "minigrid" / "replay-stub" / "seed_0" / "default" + assert (run_dir / 
"episode.json").exists() + run_score = json.loads((run_dir / "run_score.json").read_text()) + assert "composite" in run_score + assert run_score["signals"]["success"] is True + + jsonl = (artifacts / "episode_runs.jsonl").read_text().strip().splitlines() + assert len(jsonl) == 1 + row = json.loads(jsonl[0]) + for field in ( + "task_id", "experiment", "condition", "prompt_variant", "backend", + "agent_or_model", "seed", "success", "terminated", "truncated", "reward", + "steps", "optimal_steps", "optimality_ratio", "path_choice", + "mechanism_interaction_order", "failure_point", "tokens", "raw_output_ref", + ): + assert field in row, f"missing episode_runs field: {field}" + assert row["prompt_variant"] == "default" + assert row["tokens"] and row["tokens"] > 0 + + report_dir = artifacts / "reports" / "smoke" + for name in ( + "scoring_calibration_summary", + "complexity_distance_summary", + "mechanism_ordering_pairs", + ): + assert (report_dir / f"{name}.json").exists() + assert payloads["scoring_calibration_summary"]["run_count"] == 1 + + +# --------------------------------------------------------------------------- # +# Task resolution (run-config entries -> catalog rows with metadata) +# --------------------------------------------------------------------------- # +def _catalog(): + return json.loads(_MANIFEST.read_text())["tasks"] + + +def test_resolve_experiment_keyword_expands_from_catalog(): + rows = resolve_task_rows(["test3"], _catalog(), _MANIFEST) + assert rows and all(r["experiment"] == "test3" for r in rows) + assert {"T3_corr_key_first", "T3_corr_switch_first"} <= {r["task_id"] for r in rows} + + +def test_resolve_task_file_attaches_catalog_metadata(): + rows = resolve_task_rows( + ["gridworld/fixtures/test3/T3_corr_key_first.json"], _catalog(), _MANIFEST + ) + assert len(rows) == 1 + assert rows[0]["task_id"] == "T3_corr_key_first" + assert rows[0]["expected_mechanisms"] == ["kB", "s1"] + assert rows[0]["pair_id"] == "corridor" + + +def 
test_resolve_unknown_file_synthesizes_test1_row(tmp_path): + task_file = str(default_maze_path("V01_empty_room.json")) + rows = resolve_task_rows([task_file], _catalog(), _MANIFEST) + # V01 is in the catalog by path -> keeps its catalog task_id. + assert rows[0]["task_id"] == "validation_10_v01_empty_room" + + +# --------------------------------------------------------------------------- # +# Config-driven multi-model run (stub agent factory, no API) +# --------------------------------------------------------------------------- # +def test_run_from_config_drives_per_model_tasks(tmp_path): + run_config = { + "models": { + "stub": { + "provider": "claude", + "model": "stub-model", + "tasks": [str(default_maze_path("V01_empty_room.json"))], + } + } + } + cfg_path = tmp_path / "run_config.json" + cfg_path.write_text(json.dumps(run_config), encoding="utf-8") + artifacts = tmp_path / "artifacts" + + def factory(name, model_cfg): + return ReplayAgent(v01_empty_room_trajectory()), model_cfg["model"] + + payloads = run_from_config( + run_config_path=cfg_path, + manifest_path=_MANIFEST, + seeds=[0], + artifacts_root=artifacts, + run_set_id="cfg", + agent_factory=factory, + ) + + run_dir = ( + artifacts / "runs" / "validation_10_v01_empty_room" / "minigrid" / "stub-model" / "seed_0" / "default" + ) + assert (run_dir / "episode.json").exists() + assert (run_dir / "run_score.json").exists() + assert payloads["scoring_calibration_summary"]["run_count"] == 1 + + +# --------------------------------------------------------------------------- # +# Content-hash invalidation +# --------------------------------------------------------------------------- # +import itertools +import shutil + +from scorer import load_scorer_config, score_task_file +from scorer.io import load_json, task_spec_from_payload +from scripts.run_pipeline import _expected_static_hash + + +class CountingReplayAgent: + """Cycles a fixed plan (one full pass per episode) and counts model calls.""" + + def 
__init__(self, actions): + self._actions = itertools.cycle(actions) + self.last_usage = None + self.calls = 0 + + def __call__(self, messages): + self.calls += 1 + self.last_usage = {"input_tokens": 8, "output_tokens": 2, "total_tokens": 10} + return f"FINAL_OUTPUT: {next(self._actions)}" + + +def _single_task_manifest(tmp_path, source): + manifest = {"tasks": [{ + "task_id": "copy_v01", "experiment": "test1", "condition": "default", + "variant": "copy", "source": str(source), "expected_mechanisms": [], + }]} + path = tmp_path / "manifest.json" + path.write_text(json.dumps(manifest), encoding="utf-8") + return path + + +def test_expected_static_hash_matches_scorer(tmp_path): + source = default_maze_path("V06_chain_ks.json") + cfg = load_scorer_config() + _, static = score_task_file(source, output_dir=tmp_path / "t", config=cfg) + spec = task_spec_from_payload(load_json(source)) + assert _expected_static_hash(spec, cfg) == static.to_dict()["inputs_hash"] + + +def test_canonical_paths_carry_inputs_hash(tmp_path): + source = default_maze_path("V06_chain_ks.json") + score_task_file(source, output_dir=tmp_path / "t") + canonical = load_json(tmp_path / "t" / "canonical_paths.json") + assert canonical.get("inputs_hash") + + +def test_unchanged_rerun_reuses_episode_and_static(tmp_path): + task_file = tmp_path / "task.json" + shutil.copy(default_maze_path("V01_empty_room.json"), task_file) + manifest = _single_task_manifest(tmp_path, task_file) + artifacts = tmp_path / "artifacts" + agent = CountingReplayAgent(v01_empty_room_trajectory()) + + run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, + agent_name="stub", artifacts_root=artifacts, run_set_id="r") + calls_after_first = agent.calls + assert calls_after_first > 0 + + # Second identical run: episode cache hit -> agent not called again. 
+ run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, + agent_name="stub", artifacts_root=artifacts, run_set_id="r") + assert agent.calls == calls_after_first + + +def test_task_edit_invalidates_static_and_episode(tmp_path): + task_file = tmp_path / "task.json" + shutil.copy(default_maze_path("V01_empty_room.json"), task_file) + manifest = _single_task_manifest(tmp_path, task_file) + artifacts = tmp_path / "artifacts" + agent = CountingReplayAgent(v01_empty_room_trajectory()) + + run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, + agent_name="stub", artifacts_root=artifacts, run_set_id="r") + first_calls = agent.calls + first_static_hash = load_json(artifacts / "tasks" / "copy_v01" / "scored_static.json")["inputs_hash"] + + # Mutate the task spec -> both static and run hashes must change. + data = json.loads(task_file.read_text()) + data["max_steps"] = data["max_steps"] + 5 + task_file.write_text(json.dumps(data), encoding="utf-8") + + run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, + agent_name="stub", artifacts_root=artifacts, run_set_id="r") + new_static_hash = load_json(artifacts / "tasks" / "copy_v01" / "scored_static.json")["inputs_hash"] + assert new_static_hash != first_static_hash # Stage 2 recomputed + assert agent.calls > first_calls # Stage 3 episode re-run + + +def test_scorer_config_change_rescore_without_rerunning_model(tmp_path): + task_file = tmp_path / "task.json" + shutil.copy(default_maze_path("V01_empty_room.json"), task_file) + manifest = _single_task_manifest(tmp_path, task_file) + artifacts = tmp_path / "artifacts" + agent = CountingReplayAgent(v01_empty_room_trajectory()) + + # Small baselines (below the run's token count) so token_efficiency stays < 1 + # and actually moves with the config. 
+ cfg_a = load_scorer_config() + cfg_a.baseline_tokens = 1.0 + run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, agent_name="stub", + artifacts_root=artifacts, run_set_id="r", scorer_config=cfg_a) + calls_after_first = agent.calls + run_dir = artifacts / "runs" / "copy_v01" / "minigrid" / "stub" / "seed_0" / "default" + eff_a = load_json(run_dir / "run_score.json")["signals"]["token_efficiency"] + + cfg_b = load_scorer_config() + cfg_b.baseline_tokens = 5.0 + run_pipeline(manifest_path=manifest, experiment="test1", agent=agent, agent_name="stub", + artifacts_root=artifacts, run_set_id="r", scorer_config=cfg_b) + + # Episode reused (model not re-called) but run_score reflects the new config. + assert agent.calls == calls_after_first + eff_b = load_json(run_dir / "run_score.json")["signals"]["token_efficiency"] + assert eff_b != eff_a + + +# --------------------------------------------------------------------------- # +# Prompt variants are an axis distinct from the manifest condition +# --------------------------------------------------------------------------- # +def test_pipeline_keeps_prompt_variants_distinct(tmp_path): + # Two prompt variants over one task must produce two distinct runs that do + # not collapse onto the manifest condition (regression for the setdefault bug). 
+ manifest_path = _write_manifest(tmp_path) + artifacts = tmp_path / "artifacts" + + payloads = run_pipeline( + manifest_path=manifest_path, + experiment="test1", + agent=CountingReplayAgent(v01_empty_room_trajectory()), + agent_name="replay-stub", + seeds=[0], + conditions="Prompt", # implemented variants: standard, verbose + artifacts_root=artifacts, + run_set_id="variants", + ) + + task_id = "validation_10_v01_empty_room" + base = artifacts / "runs" / task_id / "minigrid" / "replay-stub" / "seed_0" + assert (base / "standard" / "episode.json").exists() + assert (base / "verbose" / "episode.json").exists() + + rows = [ + json.loads(line) + for line in (artifacts / "episode_runs.jsonl").read_text().strip().splitlines() + ] + assert {r["prompt_variant"] for r in rows} == {"standard", "verbose"} + # Same task-intrinsic condition, distinct prompt variants -> distinct rows. + assert all(r["condition"] == "default" for r in rows) + summary = payloads["scoring_calibration_summary"] + assert summary["run_count"] == 2 + assert set(summary["success_rate_by_prompt_variant"]) == {"standard", "verbose"} + + +def test_pipeline_writes_per_model_report(tmp_path): + manifest_path = _write_manifest(tmp_path) + artifacts = tmp_path / "artifacts" + + payloads = run_pipeline( + manifest_path=manifest_path, + experiment="test1", + agent=ReplayAgent(v01_empty_room_trajectory()), + agent_name="replay-stub", + seeds=[0], + artifacts_root=artifacts, + run_set_id="smoke", + ) + + report_path = artifacts / "reports" / "smoke" / "models" / "replay-stub.json" + assert report_path.exists() + rep = json.loads(report_path.read_text()) + assert rep["schema_version"] == "0.1.0" + assert rep["model_id"] == "replay-stub" + assert rep["provisional"] is True + assert rep["run_count"] == 1 + assert "overall" in rep and "by_experiment" in rep and "tasks" in rep + assert payloads["model_reports"]["replay-stub"]["run_count"] == 1 + + +def test_run_one_model_skips_unbeatable_tasks(tmp_path): + # A task 
Stage 2 marks unbeatable must not enter Stage 3/4: no model call, + # no run rows, no composites — without even resolving its (missing) source. + from scripts.run_pipeline import _run_one_model + + calls = [] + + def agent(messages): + calls.append(messages) + return "FINAL_OUTPUT: DONE" + + rows = [{"task_id": "dead", "source": "missing.json", + "experiment": "test1", "condition": "default"}] + run_rows, composites = _run_one_model( + rows, agent, "m", + manifest_path=tmp_path / "manifest.json", + artifacts_root=tmp_path / "artifacts", + static_by_task={"dead": {"is_beatable": False}}, + difficulty_max=1.0, + config=load_scorer_config(), + seeds=[0], conditions=None, force=False, + ) + assert run_rows == [] + assert composites == {} + assert calls == [] # ineligible task -> model never invoked diff --git a/tests/test_scoring_system.py b/tests/test_scoring_system.py new file mode 100644 index 0000000..b58b8ac --- /dev/null +++ b/tests/test_scoring_system.py @@ -0,0 +1,648 @@ +import argparse +import json + +import pytest + +from gridworld.actions import MiniGridActions +from gridworld.baselines import plan_bfs_path, trace_planned_actions +from gridworld.task_spec import TaskSpecification +from gridworld.task_validator import TaskValidator +from scorer.artifacts import CanonicalPathReport, ScoredDifficulty +from scorer.config import ( + DEFAULT_CONFIG_PATH, + DEFAULT_DISTRACTOR_TYPE_WEIGHTS, + DEFAULT_RUNTIME_WEIGHTS, + DIMENSION_NAMES, + load_scorer_config, +) +from scorer.io import dump_json, load_json +from scorer.scoring import ( + ScorerConfig, + compute_12d_score, + compute_canonical_paths, + compute_runtime_score, + compute_static_score_artifact, + score_task_file, +) +from scripts.score_json import _default_runtime_output, _runtime, _static, _static_target_dirs + + +def make_spec(**overrides): + data = { + "task_id": "scorer_case", + "seed": 7, + "difficulty_tier": 1, + "maze": { + "dimensions": [5, 5], + "walls": [], + "start": [1, 1], + "goal": [3, 1], + 
}, + "mechanisms": {}, + "rules": {"observability": "full", "view_size": 7}, + "goal": {"type": "reach_position", "target": [3, 1]}, + "max_steps": 20, + } + data.update(overrides) + return TaskSpecification.from_dict(data) + + +def test_canonical_paths_include_bfs_actions_and_positions(): + spec = make_spec() + + report = compute_canonical_paths(spec) + + assert report.success is True + assert report.actions == ["move_forward", "move_forward"] + assert report.positions == [(1, 1), (2, 1), (3, 1)] + assert report.optimal_steps == 2 + assert report.states_explored > 0 + assert report.greedy is not None + assert report.greedy["success"] is True + + +def test_planner_toggle_trace_matches_current_cell_switch_precedence(): + spec = make_spec( + maze={ + "dimensions": [7, 5], + "walls": [[1, 2], [2, 2], [3, 2], [4, 2], [5, 2]], + "start": [1, 1], + "goal": [5, 1], + }, + mechanisms={ + "keys": [{"id": "k1", "position": [2, 1], "color": "red"}], + "doors": [ + { + "id": "d1", + "position": [4, 1], + "requires_key": "red", + "initial_state": "locked", + } + ], + "switches": [ + { + "id": "s1", + "position": [3, 1], + "controls": [], + "switch_type": "toggle", + "initial_state": "off", + } + ], + }, + goal={"type": "reach_position", "target": [5, 1]}, + max_steps=30, + ) + + traced = trace_planned_actions( + spec, + [ + int(MiniGridActions.PICKUP), + int(MiniGridActions.MOVE_FORWARD), + int(MiniGridActions.MOVE_FORWARD), + int(MiniGridActions.TOGGLE), + ], + ) + bfs_path = plan_bfs_path(spec) + + assert traced.action_labels[-1] == "toggle:s1" + assert "open_door:d1" not in traced.action_labels + assert bfs_path.success is False + + +def test_static_score_uses_configurable_weights(): + spec = make_spec() + default_score = compute_12d_score(spec) + config = ScorerConfig.from_dict( + { + "version": "unit", + "static_dimension_weights": { + "optimal_path_length": 2.0, + "grid_size": 0.0, + }, + } + ) + + weighted = compute_12d_score(spec, config=config) + + assert 
weighted.weights[0] == 2.0 + assert weighted.weights[8] == 0.0 + assert weighted.composite != default_score.composite + + +def test_static_score_rejects_partial_explicit_weight_vectors(): + spec = make_spec() + + with pytest.raises(ValueError, match="Expected 12 static weights"): + compute_12d_score(spec, weights=[1.0, 2.0]) + with pytest.raises(ValueError, match="Expected 12 static weights"): + compute_12d_score(spec, weights=[]) + + +def test_shipped_config_matches_code_defaults(): + config = load_scorer_config(DEFAULT_CONFIG_PATH) + + assert list(config.static_dimension_weights) == DIMENSION_NAMES + assert config.distractor_type_weights == DEFAULT_DISTRACTOR_TYPE_WEIGHTS + assert config.runtime_weights == DEFAULT_RUNTIME_WEIGHTS + + +def test_explicit_missing_config_path_fails(tmp_path): + with pytest.raises(FileNotFoundError, match="Scorer config not found"): + load_scorer_config(tmp_path / "missing_config.json") + + +def test_score_task_file_writes_stage_two_artifacts(tmp_path): + spec = make_spec() + task_path = tmp_path / "task.json" + spec.to_json(str(task_path)) + + canonical, static_score = score_task_file(task_path, output_dir=tmp_path / "artifacts") + + assert canonical.success is True + assert static_score.is_beatable is True + assert (tmp_path / "artifacts" / "canonical_paths.json").exists() + scored_path = tmp_path / "artifacts" / "scored_static.json" + assert scored_path.exists() + with open(scored_path, encoding="utf-8") as f: + payload = json.load(f) + assert payload["task_id"] == spec.task_id + assert "dimensions_12" in payload + assert "dimensions" not in payload + assert "composite" not in payload + assert payload["validation"]["schema_valid"] is True + assert payload["canonical_agent_features"]["greedy_solvability"] == 1.0 + + +def test_scorer_json_io_uses_utf8_encoding(tmp_path, monkeypatch): + real_open = open + observed: list[tuple[str, str, str | None]] = [] + + def tracking_open(path, mode="r", *args, **kwargs): + 
observed.append((str(path), mode, kwargs.get("encoding"))) + return real_open(path, mode, *args, **kwargs) + + monkeypatch.setattr("builtins.open", tracking_open) + + payload = {"message": "reach \u2192 caf\u00e9", "label": "caf\u00e9"} + path = tmp_path / "unicode.json" + dump_json(path, payload) + + assert load_json(path) == payload + assert (str(path), "w", "utf-8") in observed + assert (str(path), "r", "utf-8") in observed + + +def test_score_task_file_reuses_primary_validator_result(tmp_path, monkeypatch): + spec = make_spec() + task_path = tmp_path / "task.json" + spec.to_json(str(task_path)) + calls = 0 + original_validate = TaskValidator.validate + + def count_validate(self, *args, **kwargs): + nonlocal calls + calls += 1 + return original_validate(self, *args, **kwargs) + + monkeypatch.setattr(TaskValidator, "validate", count_validate) + + score_task_file(task_path) + + assert calls == 1 + + +def test_score_task_file_rejects_invalid_schema_before_planning(tmp_path, monkeypatch): + spec = make_spec( + maze={ + "dimensions": [5, 5], + "walls": [], + "start": [1, 1], + "goal": [9, 9], + }, + goal={"type": "reach_position", "target": [9, 9]}, + ) + task_path = tmp_path / "task.json" + spec.to_json(str(task_path)) + + def fail_if_called(*args, **kwargs): + raise AssertionError("planner must not execute for schema-invalid tasks") + + monkeypatch.setattr("scorer.static.plan_bfs_path", fail_if_called) + monkeypatch.setattr("scorer.static.plan_greedy_path", fail_if_called) + + with pytest.raises(ValueError, match="failed schema validation"): + score_task_file(task_path) + + +def test_static_score_uses_canonical_bfs_metrics(): + spec = make_spec() + bfs_path = plan_bfs_path(spec) + score = compute_12d_score(spec, bfs_path=bfs_path) + + assert score.dimensions[0] == len(bfs_path.action_labels) + assert score.dimensions[1] == bfs_path.states_explored + + +def test_runtime_score_from_episode_json_payload(): + spec = make_spec() + canonical = 
compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + run = { + "task_id": spec.task_id, + "backend": "minigrid", + "adapter": "unit", + "model_id": "unit-model", + "seed": 7, + "success": True, + "steps_taken": 2, + "terminated": True, + "truncated": False, + "total_tokens": 500, + "trajectory": [ + {"state": {"agent_position": [1, 1]}}, + {"state": {"agent_position": [2, 1]}}, + ], + "final_state": {"agent_position": [3, 1], "step_count": 2}, + } + + config = ScorerConfig.from_dict({"runtime_weights": {"greedy_penalty": 0.0}}) + score = compute_runtime_score( + run, + static_score=static_score, + canonical_paths=canonical, + config=config, + difficulty_max_static_score=static_score.static_score, + ) + + assert score.task_id == spec.task_id + assert score.composite == 1.0 + assert score.signals["step_ratio"] == 1.0 + assert score.signals["cell_overlap_bfs"] == 1.0 + assert score.signals["cell_overlap_greedy"] == 1.0 + assert score.signals["token_efficiency"] == 1.0 + assert "path_choice" not in score.signals + assert "distractor_interactions" not in score.signals + + +def test_runtime_score_prefers_interface_state_after_over_row_col_position_after(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + run = { + "success": True, + "steps_used": 2, + "total_tokens": 100, + "end_reason": "success", + "task_spec": spec.to_dict(), + "initial_state": {"agent_position": [1, 1]}, + "final_state": {"agent_position": [3, 1], "step_count": 2}, + "transcript": [ + { + "kind": "reset", + "state": {"agent_position": [1, 1]}, + }, + { + "kind": "step", + "position_after": [1, 2], + "state_after": {"agent_position": [2, 1]}, + }, + { + "kind": "step", + "position_after": [1, 3], + "state_after": {"agent_position": [3, 1]}, + }, + ], + } + + config = ScorerConfig.from_dict({"runtime_weights": {"greedy_penalty": 0.0}}) + score = compute_runtime_score( + run, + 
static_score=static_score, + canonical_paths=canonical, + config=config, + difficulty_max_static_score=static_score.static_score, + ) + + assert score.signals["cell_overlap_bfs"] == 1.0 + + +def test_runtime_score_requires_suite_difficulty_normalizer(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + + with pytest.raises(ValueError, match="difficulty_max_static_score"): + compute_runtime_score( + {"success": True, "steps": 2, "total_tokens": 100}, + static_score=static_score, + canonical_paths=canonical, + ) + + +def test_runtime_score_rejects_suite_max_smaller_than_task_score(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + + with pytest.raises(ValueError, match="at least the task static score"): + compute_runtime_score( + {"success": True, "steps": 2, "total_tokens": 100}, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score.static_score - 1, + ) + + +def test_runtime_score_rejects_unevaluated_greedy_solvability(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec).to_dict() + static_score["canonical_agent_features"]["greedy_solvability"] = None + + with pytest.raises(ValueError, match="greedy_solvability"): + compute_runtime_score( + {"success": True, "steps": 2, "total_tokens": 100}, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score["static_score"], + ) + + +def test_runtime_score_rejects_schema_invalid_static_artifact_clearly(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec).to_dict() + static_score["validation"]["schema_valid"] = False + + with pytest.raises(ValueError, match="schema-valid"): + compute_runtime_score( + {"success": True, "steps": 2, "total_tokens": 100}, + 
static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score["static_score"], + ) + + +def test_runtime_token_count_does_not_double_count_nested_step_tokens(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + score = compute_runtime_score( + { + "success": True, + "steps": 2, + "trajectory": [{"tokens": 100, "info": {"tokens": 100}}], + }, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score.static_score, + ) + + assert score.signals["token_count"] == 100 + + +def test_runtime_token_count_reads_query_transcript_usage(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + score = compute_runtime_score( + { + "success": True, + "steps": 2, + "transcript": [ + { + "kind": "query", + "usage": {"input_tokens": 80, "output_tokens": 20}, + } + ], + }, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score.static_score, + ) + + assert score.signals["token_count"] == 100 + + +def test_runtime_hash_ignores_non_scoring_transcript_context(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + base_run = { + "success": True, + "steps": 2, + "total_tokens": 100, + "transcript": [ + { + "kind": "query", + "agent_messages": [{"role": "user", "content": "first"}], + } + ], + } + changed_context = { + **base_run, + "transcript": [ + { + "kind": "query", + "agent_messages": [{"role": "user", "content": "second"}], + } + ], + } + + first = compute_runtime_score( + base_run, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score.static_score, + ) + second = compute_runtime_score( + changed_context, + static_score=static_score, + canonical_paths=canonical, + 
difficulty_max_static_score=static_score.static_score, + ) + + assert first.inputs_hash == second.inputs_hash + + +@pytest.mark.parametrize("token_count", [None, 0]) +def test_runtime_score_rejects_missing_or_zero_token_telemetry(token_count): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + run = {"success": True, "steps": 2} + if token_count is not None: + run["total_tokens"] = token_count + + with pytest.raises(ValueError, match="token"): + compute_runtime_score( + run, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score.static_score, + ) + + +def test_runtime_score_rejects_missing_step_telemetry(): + spec = make_spec() + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + + with pytest.raises(ValueError, match="step telemetry"): + compute_runtime_score( + {"success": True, "total_tokens": 100}, + static_score=static_score, + canonical_paths=canonical, + difficulty_max_static_score=static_score.static_score, + ) + + +def test_zero_step_plans_do_not_inflate_optimal_steps_with_done(): + spec = make_spec( + maze={ + "dimensions": [5, 5], + "walls": [], + "start": [1, 1], + "goal": [1, 1], + }, + goal={"type": "reach_position", "target": [1, 1]}, + ) + + path = plan_bfs_path(spec) + traced_done = trace_planned_actions(spec, [int(MiniGridActions.DONE)]) + + assert path.success is True + assert path.action_labels == [] + assert traced_done.success is True + assert traced_done.action_labels == [] + + +def test_runtime_zero_step_success_gets_full_step_credit(): + spec = make_spec( + maze={ + "dimensions": [5, 5], + "walls": [], + "start": [1, 1], + "goal": [1, 1], + }, + goal={"type": "reach_position", "target": [1, 1]}, + ) + canonical = compute_canonical_paths(spec) + static_score = compute_static_score_artifact(spec) + score = compute_runtime_score( + { + "success": True, + "steps": 0, + "total_tokens": 
100, + "initial_state": {"agent_position": [1, 1]}, + "final_state": {"agent_position": [1, 1], "step_count": 0}, + }, + static_score=static_score, + canonical_paths=canonical, + config=ScorerConfig.from_dict({"runtime_weights": {"greedy_penalty": 0.0}}), + difficulty_max_static_score=static_score.static_score, + ) + + assert score.signals["step_ratio"] == 1.0 + assert score.composite == 1.0 + + +def test_static_cli_target_dirs_reject_same_stem_collisions(tmp_path): + files = [tmp_path / "a" / "task.json", tmp_path / "b" / "task.json"] + + with pytest.raises(ValueError, match="collide"): + _static_target_dirs(files, tmp_path / "scores") + + +def test_static_cli_continues_after_file_failure_and_summarizes(tmp_path, capsys): + task_a = tmp_path / "task_a.json" + task_b = tmp_path / "task_b.json" + bad_task = tmp_path / "bad.json" + dump_json(task_a, make_spec(task_id="ok_a").to_dict()) + dump_json(task_b, make_spec(task_id="ok_b").to_dict()) + bad_task.write_text("{", encoding="utf-8") + + exit_code = _static( + argparse.Namespace( + config=None, + inputs=[str(task_a), str(bad_task), str(task_b)], + output_dir=str(tmp_path / "scores"), + ) + ) + captured = capsys.readouterr() + + assert exit_code == 1 + assert "static: ok input=" in captured.out + assert "task_id=ok_a" in captured.out + assert "task_id=ok_b" in captured.out + assert "static: error input=" in captured.err + assert "bad.json" in captured.err + assert "JSONDecodeError" in captured.err + assert "Traceback" not in captured.err + assert "static: summary scored=2 failed=1 total=3" in captured.err + assert (tmp_path / "scores" / "task_a" / "scored_static.json").exists() + assert (tmp_path / "scores" / "task_b" / "scored_static.json").exists() + assert not (tmp_path / "scores" / "bad" / "scored_static.json").exists() + + +def test_runtime_cli_default_output_uses_source_stem(tmp_path): + assert _default_runtime_output(tmp_path / "run.json") == tmp_path / "run_score.json" + assert 
_default_runtime_output(tmp_path / "episode.json") == tmp_path / "episode_score.json" + + +def test_runtime_cli_rejects_half_specified_artifacts(tmp_path): + args = argparse.Namespace( + config=None, + run=str(tmp_path / "episode.json"), + output=None, + static_score=str(tmp_path / "scored_static.json"), + canonical_paths=None, + task=str(tmp_path / "task.json"), + artifact_dir=None, + difficulty_max_static_score=100.0, + ) + + with pytest.raises(ValueError, match="provided together"): + _runtime(args) + + +def test_runtime_cli_explains_missing_suite_maximum(tmp_path): + args = argparse.Namespace( + config=None, + run=str(tmp_path / "episode.json"), + output=None, + static_score=str(tmp_path / "scored_static.json"), + canonical_paths=str(tmp_path / "canonical_paths.json"), + task=None, + artifact_dir=None, + difficulty_max_static_score=None, + ) + + with pytest.raises(ValueError, match="--difficulty-max-static-score"): + _runtime(args) + + +def test_artifact_serialization_returns_detached_data(): + scored = ScoredDifficulty(dimensions=[1.0], dimension_names=["only"], weights=[2.0]) + scored_payload = scored.to_dict() + scored_payload["dimensions"][0] = 9.0 + scored_payload["weights"][0] = 9.0 + + canonical = CanonicalPathReport( + task_id="task", + success=True, + actions=["move_forward"], + positions=[(1, 1), (2, 1)], + optimal_steps=1, + states_explored=2, + message="ok", + greedy={"actions": ["move_forward"]}, + ) + canonical_payload = canonical.to_dict() + canonical_payload["bfs"]["actions"][0] = "mutated" + canonical_payload["greedy"]["actions"][0] = "mutated" + + assert scored.dimensions == [1.0] + assert scored.weights == [2.0] + assert canonical.actions == ["move_forward"] + assert canonical.greedy == {"actions": ["move_forward"]}