Agenta-AI · junaway · Jun 11, 2026 · Jun 11, 2026
diff --git a/api/oss/src/core/evaluations/service.py b/api/oss/src/core/evaluations/service.py
@@ -22,6 +22,7 @@
     EvaluationRunDataStep,
     EvaluationRunDataConcurrency,
     EvaluationRunData,
+    JsonSchemas,
     EvaluationRun,
     EvaluationRunCreate,
     EvaluationRunEdit,
@@ -1428,6 +1429,9 @@ async def _refresh_metrics(
         # Resolved metric keys per step (declared schema, else trace-inferred);
         # become the run's `mappings`. Rewrite only when something was inferred.
         metrics_keys_by_step: Dict[str, List[Dict[str, str]]] = {}
+        # Trace-inferred outputs schema per step, persisted onto the run step so
+        # the UI can assign types to filter columns for schema-less evaluators.
+        inferred_schemas_by_step: Dict[str, Dict[str, Any]] = {}
         any_inferred = False
 
         for step in refreshable_steps:
@@ -1487,6 +1491,7 @@ async def _refresh_metrics(
 
                     if metrics_keys:
                         any_inferred = True
+                        inferred_schemas_by_step[step.key] = inferred_schema
 
                 # Record declared + inferred keys; skip [] (would wipe the
                 # step's existing mapping without replacing it).
@@ -1510,6 +1515,7 @@ async def _refresh_metrics(
                 user_id=user_id,
                 run=run,
                 inferred_metrics_keys_by_step=metrics_keys_by_step,
+                inferred_schemas_by_step=inferred_schemas_by_step,
             )
 
         steps_specs: Dict[str, List[MetricSpec]] = dict()
@@ -1699,6 +1705,7 @@ async def _update_run_mappings_from_inferred_metrics(
         user_id: UUID,
         run: EvaluationRun,
         inferred_metrics_keys_by_step: Dict[str, List[Dict[str, str]]],
+        inferred_schemas_by_step: Optional[Dict[str, Dict[str, Any]]] = None,
     ) -> None:
         existing_mappings = list(run.data.mappings or [])
         updated_mappings: List[EvaluationRunDataMapping] = []
@@ -1764,9 +1771,24 @@ def mapping_key(
                     )
                 )
 
-        if updated_mappings != existing_mappings:
+        existing_steps = list(run.data.steps or [])
+        updated_steps = existing_steps
+        if inferred_schemas_by_step:
+            updated_steps = []
+            for step in existing_steps:
+                inferred_outputs = inferred_schemas_by_step.get(step.key)
+                if inferred_outputs and (not step.schemas or not step.schemas.outputs):
+                    updated_steps.append(
+                        step.model_copy(
+                            update={"schemas": JsonSchemas(outputs=inferred_outputs)}
+                        )
+                    )
+                else:
+                    updated_steps.append(step)
+
+        if updated_mappings != existing_mappings or updated_steps != existing_steps:
             run_data = EvaluationRunData(
-                steps=run.data.steps,
+                steps=updated_steps,
                 repeats=run.data.repeats,
                 mappings=updated_mappings,
             )

diff --git a/api/oss/src/core/evaluations/types.py b/api/oss/src/core/evaluations/types.py
@@ -35,6 +35,7 @@
 # engine is the package that ships it. Importers keep using
 # `core.evaluations.types.EvaluationStatus` unchanged.
 from agenta.sdk.models.evaluations import EvaluationStatus  # noqa: E402
+from agenta.sdk.models.workflows import JsonSchemas  # noqa: E402
 
 
 class EvaluationClosedConflict(Exception):
@@ -257,6 +258,10 @@ class EvaluationRunDataStep(BaseModel):
     origin: Origin
     references: Dict[str, Reference]
     inputs: Optional[List[EvaluationRunDataStepInput]] = None
+    # Outputs schema inferred from traces when the evaluator declares none.
+    # Run-scoped (reflects this run's observed outputs), so the immutable
+    # evaluator revision is never rewritten. Only `outputs` is populated.
+    schemas: Optional[JsonSchemas] = None
 
 
 class EvaluationRunDataMappingColumn(BaseModel):

diff --git a/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts b/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
@@ -1,3 +1,4 @@
+import {extractMetrics} from "@agenta/entities/workflow"
 import {atom} from "jotai"
 import {atomFamily} from "jotai/utils"
 
@@ -329,6 +330,24 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
             ? runData.camelRun.data.mappings
             : []
 
+        // Per-step metric definitions derived from the outputs schema the
+        // backend inferred from traces and stored on the run step. This is the
+        // type source for schema-less evaluators, whose immutable revision
+        // carries no schema. Keyed by step key.
+        const stepSchemaMetricsByStepKey = new Map<string, ReturnType<typeof extractMetrics>>()
+        const runSteps = Array.isArray(runData.camelRun?.data?.steps)
+            ? runData.camelRun.data.steps
+            : []
+        for (const step of runSteps) {
+            const outputs = (step as {schemas?: {outputs?: unknown}})?.schemas?.outputs
+            const stepKey = (step as {key?: string})?.key
+            if (!outputs || !stepKey) continue
+            stepSchemaMetricsByStepKey.set(
+                stepKey,
+                extractMetrics({id: stepKey, slug: stepKey, data: {schemas: {outputs}}}),
+            )
+        }
+
         const counters: Record<"input" | "invocation" | "annotation", number> = {
             input: 0,
             invocation: 0,
@@ -506,13 +525,22 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
             // canonical-key-only match misses and `metricType` falls back
             // to "string", mis-typing the column (e.g. a boolean output).
             const metricKey = column.metricKey || column.valueKey
-            const metricDefinition = evaluator?.metrics.find(
-                (metric) =>
-                    metric.name === metricKey ||
-                    metric.path === metricKey ||
-                    metric.name === column.valueKey ||
-                    metric.path === column.valueKey,
-            )
+            const matchMetric = (metrics: ReturnType<typeof extractMetrics> | undefined) =>
+                metrics?.find(
+                    (metric) =>
+                        metric.name === metricKey ||
+                        metric.path === metricKey ||
+                        metric.name === column.valueKey ||
+                        metric.path === column.valueKey,
+                )
+            // Schema-declared evaluator metrics first; else fall back to the
+            // outputs schema the backend inferred from traces and stored on the
+            // run step — the only type source for schema-less evaluators, whose
+            // immutable revision carries no schema. If neither matches, use
+            // `column.metricType`, then the "string" cold-start fallback.
+            const metricDefinition =
+                matchMetric(evaluator?.metrics) ??
+                matchMetric(stepSchemaMetricsByStepKey.get(column.stepKey ?? ""))
             const metricType =
                 metricDefinition?.metricType || column.metricType || METRIC_TYPE_FALLBACK
             const evaluatorLabel = evaluator?.name || column.evaluatorSlug || "Annotations"