Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions api/oss/src/core/evaluations/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
EvaluationRunDataStep,
EvaluationRunDataConcurrency,
EvaluationRunData,
JsonSchemas,
EvaluationRun,
EvaluationRunCreate,
EvaluationRunEdit,
Expand Down Expand Up @@ -1428,6 +1429,9 @@ async def _refresh_metrics(
# Resolved metric keys per step (declared schema, else trace-inferred);
# become the run's `mappings`. Rewrite only when something was inferred.
metrics_keys_by_step: Dict[str, List[Dict[str, str]]] = {}
# Trace-inferred outputs schema per step, persisted onto the run step so
# the UI can type filter columns for schema-less evaluators.
inferred_schemas_by_step: Dict[str, Dict[str, Any]] = {}
any_inferred = False

for step in refreshable_steps:
Expand Down Expand Up @@ -1487,6 +1491,7 @@ async def _refresh_metrics(

if metrics_keys:
any_inferred = True
inferred_schemas_by_step[step.key] = inferred_schema

# Record declared + inferred keys; skip [] (would wipe the
# step's existing mapping without replacing it).
Expand All @@ -1510,6 +1515,7 @@ async def _refresh_metrics(
user_id=user_id,
run=run,
inferred_metrics_keys_by_step=metrics_keys_by_step,
inferred_schemas_by_step=inferred_schemas_by_step,
)

steps_specs: Dict[str, List[MetricSpec]] = dict()
Expand Down Expand Up @@ -1699,6 +1705,7 @@ async def _update_run_mappings_from_inferred_metrics(
user_id: UUID,
run: EvaluationRun,
inferred_metrics_keys_by_step: Dict[str, List[Dict[str, str]]],
inferred_schemas_by_step: Optional[Dict[str, Dict[str, Any]]] = None,
) -> None:
existing_mappings = list(run.data.mappings or [])
updated_mappings: List[EvaluationRunDataMapping] = []
Expand Down Expand Up @@ -1764,9 +1771,24 @@ def mapping_key(
)
)

if updated_mappings != existing_mappings:
existing_steps = list(run.data.steps or [])
updated_steps = existing_steps
if inferred_schemas_by_step:
updated_steps = []
for step in existing_steps:
inferred_outputs = inferred_schemas_by_step.get(step.key)
if inferred_outputs and (not step.schemas or not step.schemas.outputs):
updated_steps.append(
step.model_copy(
update={"schemas": JsonSchemas(outputs=inferred_outputs)}
)
)
Comment thread
junaway marked this conversation as resolved.
else:
updated_steps.append(step)

if updated_mappings != existing_mappings or updated_steps != existing_steps:
run_data = EvaluationRunData(
steps=run.data.steps,
steps=updated_steps,
repeats=run.data.repeats,
mappings=updated_mappings,
)
Expand Down
5 changes: 5 additions & 0 deletions api/oss/src/core/evaluations/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
# engine is the package that ships it. Importers keep using
# `core.evaluations.types.EvaluationStatus` unchanged.
from agenta.sdk.models.evaluations import EvaluationStatus # noqa: E402
from agenta.sdk.models.workflows import JsonSchemas # noqa: E402


class EvaluationClosedConflict(Exception):
Expand Down Expand Up @@ -257,6 +258,10 @@ class EvaluationRunDataStep(BaseModel):
origin: Origin
references: Dict[str, Reference]
inputs: Optional[List[EvaluationRunDataStepInput]] = None
# Outputs schema inferred from traces when the evaluator declares none.
# Run-scoped (reflects this run's observed outputs), so the immutable
# evaluator revision is never rewritten. Only `outputs` is populated.
schemas: Optional[JsonSchemas] = None


class EvaluationRunDataMappingColumn(BaseModel):
Expand Down
42 changes: 35 additions & 7 deletions web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import {extractMetrics} from "@agenta/entities/workflow"
import {atom} from "jotai"
import {atomFamily} from "jotai/utils"

Expand Down Expand Up @@ -329,6 +330,24 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
? runData.camelRun.data.mappings
: []

// Per-step metric definitions derived from the outputs schema the
// backend inferred from traces and stored on the run step. This is the
// type source for schema-less evaluators, whose immutable revision
// carries no schema. Keyed by step key.
const stepSchemaMetricsByStepKey = new Map<string, ReturnType<typeof extractMetrics>>()
const runSteps = Array.isArray(runData.camelRun?.data?.steps)
? runData.camelRun.data.steps
: []
for (const step of runSteps) {
const outputs = (step as {schemas?: {outputs?: unknown}})?.schemas?.outputs
const stepKey = (step as {key?: string})?.key
if (!outputs || !stepKey) continue
stepSchemaMetricsByStepKey.set(
stepKey,
extractMetrics({id: stepKey, slug: stepKey, data: {schemas: {outputs}}}),
)
}

const counters: Record<"input" | "invocation" | "annotation", number> = {
input: 0,
invocation: 0,
Expand Down Expand Up @@ -506,13 +525,22 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
// canonical-key-only match misses and `metricType` falls back
// to "string", mis-typing the column (e.g. a boolean output).
const metricKey = column.metricKey || column.valueKey
const metricDefinition = evaluator?.metrics.find(
(metric) =>
metric.name === metricKey ||
metric.path === metricKey ||
metric.name === column.valueKey ||
metric.path === column.valueKey,
)
const matchMetric = (metrics: ReturnType<typeof extractMetrics> | undefined) =>
metrics?.find(
(metric) =>
metric.name === metricKey ||
metric.path === metricKey ||
metric.name === column.valueKey ||
metric.path === column.valueKey,
)
// Schema-declared evaluator metrics first; else fall back to the
// outputs schema the backend inferred from traces and stored on the
// run step — the only type source for schema-less evaluators, whose
// immutable revision carries no schema. "string" is the cold-start
// fallback before any type is known.
const metricDefinition =
matchMetric(evaluator?.metrics) ??
matchMetric(stepSchemaMetricsByStepKey.get(column.stepKey ?? ""))
const metricType =
metricDefinition?.metricType || column.metricType || METRIC_TYPE_FALLBACK
const evaluatorLabel = evaluator?.name || column.evaluatorSlug || "Annotations"
Expand Down
Loading