def _scenario_means(per_record_values: dict[str, float | None]) -> np.ndarray:
    """Average per-record values within each base scenario.

    Each record id is reduced to its base scenario id with
    ``parse_trial_record_id``. ``None`` values are skipped, so a scenario
    whose records are all ``None`` never gets a bucket and is absent from the
    result.

    Args:
        per_record_values: Mapping of record id to that record's value
            (``None`` when the record has no usable value).

    Returns:
        1-D float array holding one mean per scenario, in first-seen scenario
        order; a zero-length array when no record contributed a value.
    """
    buckets: dict[str, list[float]] = {}
    for rid, value in per_record_values.items():
        if value is not None:
            scenario_id, _trial = parse_trial_record_id(rid)
            if scenario_id not in buckets:
                buckets[scenario_id] = []
            buckets[scenario_id].append(float(value))

    if len(buckets) == 0:
        return np.array([], dtype=float)

    means = [sum(bucket) / len(bucket) for bucket in buckets.values()]
    return np.array(means, dtype=float)


def scenario_means_for_metric(
    all_metrics: dict[str, RecordMetrics],
    metric_name: str,
) -> np.ndarray:
    """Return the per-scenario mean (over trials) of one metric's score.

    The per-record value is whatever ``RecordMetrics.get_score`` returns for
    ``metric_name``; records for which it returns ``None`` are ignored, and a
    scenario with no remaining records is dropped.
    NOTE(review): ``get_score`` is defined outside this view — presumably it
    prefers ``normalized_score`` and falls back to ``score``; confirm there.

    Args:
        all_metrics: Mapping of record id to its ``RecordMetrics``.
        metric_name: Name of the metric to aggregate.

    Returns:
        1-D float array with one mean per scenario that has at least one
        usable record.
    """
    scores: dict[str, float | None] = {}
    for record_id, record_metrics in all_metrics.items():
        scores[record_id] = record_metrics.get_score(metric_name)
    return _scenario_means(scores)


def _scenario_values_for_composite(
    all_metrics: dict[str, RecordMetrics],
    comp: EVACompositeDefinition,
) -> np.ndarray:
    """Return the per-scenario mean (over trials) of a composite's value.

    The per-record value is read from ``aggregate_metrics`` under
    ``comp.name``; a missing key or a ``None`` entry is skipped, and a
    scenario with no remaining records is dropped.

    Args:
        all_metrics: Mapping of record id to its ``RecordMetrics``.
        comp: Composite definition whose ``name`` keys ``aggregate_metrics``.

    Returns:
        1-D float array with one mean per scenario that has at least one
        non-None composite value.
    """
    composite_values: dict[str, float | None] = {}
    for record_id, record_metrics in all_metrics.items():
        composite_values[record_id] = record_metrics.aggregate_metrics.get(comp.name)
    return _scenario_means(composite_values)
+ entry.update(mean_ci_fields(_scenario_values_for_composite(all_metrics, comp), seed=seed)) + result[comp.name] = entry # pass_k for aggregate metrics if multi-trial if num_draws > 1: - pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites) + pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites, seed=seed) if pass_k_data: result["pass_k"] = pass_k_data @@ -221,6 +281,8 @@ def _compute_aggregate_pass_k( all_metrics: dict[str, RecordMetrics], num_draws: int, composites: list[EVACompositeDefinition] | None = None, + *, + seed: int, ) -> dict: """Compute pass@1, pass@k, pass^k (observed), and pass^k (theoretical) for aggregate metrics across trials.""" composites = composites or EVA_COMPOSITES @@ -264,7 +326,7 @@ def _compute_aggregate_pass_k( if pass_at_k_values: count = len(pass_at_k_values) - result[comp.name] = { + entry = { "pass_at_1": round(sum(pass_at_1_values) / count, 4), "pass_at_k": round(sum(pass_at_k_values) / count, 4), "pass_power_k_observed": round(sum(pass_power_k_observed_values) / count, 4), @@ -272,5 +334,16 @@ def _compute_aggregate_pass_k( "k": num_draws, "count": count, } + entry.update( + bootstrap_ci_fields( + { + "pass_at_1": pass_at_1_values, + "pass_at_k": pass_at_k_values, + "pass_power_k_observed": pass_power_k_observed_values, + }, + seed=seed, + ) + ) + result[comp.name] = entry return result diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 470ec6fc..aad09079 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -10,7 +10,11 @@ import yaml from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric -from eva.metrics.aggregation import compute_record_aggregates, compute_run_level_aggregates +from eva.metrics.aggregation import ( + compute_record_aggregates, + compute_run_level_aggregates, + scenario_means_for_metric, +) from eva.metrics.base import BaseMetric, MetricContext from eva.metrics.legacy_aliases import 
rename_metric_keys from eva.metrics.processor import MetricsContextProcessor @@ -20,6 +24,7 @@ from eva.models.config import PipelineType, get_pipeline_type from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics +from eva.utils.bootstrap import bootstrap_ci_fields, mean_ci_fields, run_seed from eva.utils.hash_utils import get_dict_hash from eva.utils.logging import get_logger from eva.utils.pass_at_k import ( @@ -632,6 +637,8 @@ def _build_per_metric_aggregates( metric_names: list[str], pass_at_k_results: dict[str, dict[str, PassAtKResult]] | None = None, num_draws: int = 1, + *, + seed: int, ) -> dict[str, dict[str, Any]]: """Build per-metric aggregate stats including pass_k. @@ -640,6 +647,9 @@ def _build_per_metric_aggregates( metric_names: List of metric names to aggregate. pass_at_k_results: Per-record pass@k results (if multi-trial). num_draws: Number of draws (k) for pass@k. + seed: Bootstrap seed for CI computation. Keyword-only and required; + production callers pass ``run_seed(run_dir.name)`` for within-run + determinism. Returns: Dict mapping metric name to aggregate stats. @@ -698,6 +708,9 @@ def _build_per_metric_aggregates( coverage["not_applicable_turns"] = total_not_applicable_across_records entry["per_turn_coverage"] = coverage + # Bootstrap CI on the per-scenario mean. 
+ entry.update(mean_ci_fields(scenario_means_for_metric(all_metrics, name), seed=seed)) + entry["higher_is_better"] = _metric_higher_is_better(name) metric_aggregates[name] = entry @@ -720,7 +733,7 @@ def _build_per_metric_aggregates( if pass_at_k_values: count = len(pass_at_k_values) - metric_aggregates[name]["pass_k"] = { + pass_k_block: dict[str, Any] = { "pass_at_1": round(sum(pass_at_1_values) / count, 4), "pass_at_k": round(sum(pass_at_k_values) / count, 4), "pass_power_k_observed": round(sum(pass_power_k_obs_values) / count, 4), @@ -728,6 +741,17 @@ def _build_per_metric_aggregates( "k": num_draws, "count": count, } + pass_k_block.update( + bootstrap_ci_fields( + { + "pass_at_1": pass_at_1_values, + "pass_at_k": pass_at_k_values, + "pass_power_k_observed": pass_power_k_obs_values, + }, + seed=seed, + ) + ) + metric_aggregates[name]["pass_k"] = pass_k_block # Generic sub-metric aggregation. # Sub-keys are collected in first-seen insertion order so each metric controls @@ -920,8 +944,13 @@ async def _save_summary( # Aggregate per_metric for ALL metrics present across records (not just those just run), # so that a partial re-run (e.g. --metrics response_speed) preserves other metrics. 
all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics}) + seed = run_seed(self.run_dir.name) metric_aggregates = self._build_per_metric_aggregates( - all_metrics, all_metric_names, pass_at_k_results, self.num_draws + all_metrics, + all_metric_names, + pass_at_k_results, + self.num_draws, + seed=seed, ) # Compute metric failures for MetricsRunResult (only for metrics just run) @@ -934,7 +963,7 @@ async def _save_summary( metric_failures.setdefault(name, []).append(record_id) # Compute EVA composite run-level aggregates - overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws) + overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws, seed=seed) # Load existing summary to preserve fields for metrics not being re-run summary_path = self.run_dir / "metrics_summary.json" @@ -1038,12 +1067,17 @@ async def run_aggregate_only(cls, run_dir: Path, num_draws: int = 1) -> None: all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics}) # Compute per-metric aggregates (including pass_k) + seed = run_seed(run_dir.name) metric_aggregates = cls._build_per_metric_aggregates( - all_metrics, all_metric_names, pass_at_k_results or None, num_draws + all_metrics, + all_metric_names, + pass_at_k_results or None, + num_draws, + seed=seed, ) # Compute run-level aggregates - overall_scores = compute_run_level_aggregates(all_metrics, num_draws) + overall_scores = compute_run_level_aggregates(all_metrics, num_draws, seed=seed) # Update metrics_summary.json (preserve existing fields, replace computed sections) summary_path = run_dir / "metrics_summary.json" diff --git a/src/eva/utils/bootstrap.py b/src/eva/utils/bootstrap.py new file mode 100644 index 00000000..073dfd14 --- /dev/null +++ b/src/eva/utils/bootstrap.py @@ -0,0 +1,99 @@ +"""Percentile bootstrap primitives for sample-mean confidence intervals. + +This module is pure: numpy in, numpy/floats out. 
import hashlib
import math
from collections.abc import Sequence
from typing import Any

import numpy as np

# Default number of bootstrap resamples and default two-sided significance level.
N_BOOT = 2000
ALPHA = 0.05


def run_seed(run_id: str) -> int:
    """Derive a stable, run-dependent seed from a run identifier.

    Uses ``hashlib.sha256`` rather than Python's built-in ``hash()`` because
    the latter is salted per interpreter process, so re-running on the same
    run id would otherwise yield slightly different CI bounds.

    Args:
        run_id: Run identifier (callers pass the run directory name).

    Returns:
        Integer in ``[0, 2**31)`` built from the first four digest bytes.
    """
    digest = hashlib.sha256(run_id.encode()).digest()
    return int.from_bytes(digest[:4], "big") % (2**31)


def bootstrap_resample(values: np.ndarray, n_boot: int, seed: int) -> np.ndarray:
    """Return ``n_boot`` bootstrap means of ``values``.

    Args:
        values: Sample to resample; converted with ``np.asarray(..., dtype=float)``.
        n_boot: Number of resamples to draw.
        seed: Seed for a fresh ``np.random.default_rng``; the same seed and
            input always give the same output.

    Returns:
        Array with one resampled mean per draw; a zero-length array when
        ``values`` is empty.
    """
    arr = np.asarray(values, dtype=float)
    n = len(arr)
    if n == 0:
        return np.array([], dtype=float)
    rng = np.random.default_rng(seed)
    # Each row holds the indices of one with-replacement resample of size n.
    idx = rng.integers(0, n, size=(n_boot, n))
    return arr[idx].mean(axis=1)


def bootstrap_ci(
    values: np.ndarray,
    n_boot: int = N_BOOT,
    *,
    seed: int,
    alpha: float = ALPHA,
) -> tuple[float, float]:
    """Percentile bootstrap ``(1 - alpha)`` CI on the mean.

    With the default ``alpha=0.05`` this is a 95% interval.

    Args:
        values: Sample whose mean is being bracketed (any array-like).
        n_boot: Number of bootstrap resamples.
        seed: Keyword-only and required, so callers must pick a deliberate
            seed and the result is deterministic.
        alpha: Two-sided significance level, strictly between 0 and 1.

    Returns:
        ``(lower, upper)``; ``(nan, nan)`` when no resampled means are
        available (empty input).

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1. Values in
            ``(1, 2)`` would otherwise silently produce inverted bounds.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in the open interval (0, 1), got {alpha!r}")
    boot = bootstrap_resample(values, n_boot=n_boot, seed=seed)
    if len(boot) == 0:
        return float("nan"), float("nan")
    lower = float(np.percentile(boot, 100 * alpha / 2))
    upper = float(np.percentile(boot, 100 * (1 - alpha / 2)))
    return lower, upper


def _round_or_none(value: float, decimals: int) -> Any:
    """Round a CI bound, mapping NaN to ``None`` so the field stays valid JSON."""
    return None if math.isnan(value) else round(value, decimals)


def bootstrap_ci_fields(
    samples: dict[str, Sequence[float]],
    *,
    seed: int,
    decimals: int = 4,
) -> dict[str, Any]:
    """Return ``{name}_ci_lower`` / ``{name}_ci_upper`` for each named sample.

    Every sample is bootstrapped with the same ``seed``; since each call
    builds a fresh generator, equal-length samples share resample indices.

    Args:
        samples: Mapping of statistic name to its per-unit values.
        seed: Bootstrap seed (keyword-only, required).
        decimals: Number of decimals the bounds are rounded to.

    Returns:
        Flat dict of rounded bounds. A sample with no computable CI (empty,
        or containing NaN) yields ``None`` bounds rather than NaN, matching
        ``mean_ci_fields`` and keeping the output strictly JSON-serializable.
    """
    out: dict[str, Any] = {}
    for name, sample in samples.items():
        lower, upper = bootstrap_ci(sample, seed=seed)
        out[f"{name}_ci_lower"] = _round_or_none(lower, decimals)
        out[f"{name}_ci_upper"] = _round_or_none(upper, decimals)
    return out


def mean_ci_fields(
    scenario_values: np.ndarray,
    *,
    seed: int,
    decimals: int = 4,
) -> dict[str, Any]:
    """Return ``mean_ci_lower`` / ``mean_ci_upper`` / ``mean_ci_n_scenarios``.

    Args:
        scenario_values: One value per scenario (the bootstrap unit).
        seed: Bootstrap seed (keyword-only, required).
        decimals: Number of decimals the bounds are rounded to.

    Returns:
        For empty input, ``None`` bounds and ``mean_ci_n_scenarios == 0``.
        Otherwise a percentile bootstrap CI on the mean plus the number of
        scenarios; bounds that come out as NaN are reported as ``None``.
    """
    n_scenarios = len(scenario_values)
    if n_scenarios == 0:
        return {"mean_ci_lower": None, "mean_ci_upper": None, "mean_ci_n_scenarios": 0}
    lower, upper = bootstrap_ci(scenario_values, seed=seed)
    return {
        "mean_ci_lower": _round_or_none(lower, decimals),
        "mean_ci_upper": _round_or_none(upper, decimals),
        "mean_ci_n_scenarios": n_scenarios,
    }
compute_run_level_aggregates({"1.1.1": r1, "1.1.2": r2}) + result = compute_run_level_aggregates({"1.1.1": r1, "1.1.2": r2}, seed=42) # EVA-A_pass: r1=1.0, r2=0.0 -> mean=0.5 assert result["EVA-A_pass"]["mean"] == 0.5 @@ -287,7 +318,7 @@ def test_mean_success_rate(self): ) r2.aggregate_metrics = compute_record_aggregates(r2) - result = compute_run_level_aggregates({"1": r1, "2": r2}) + result = compute_run_level_aggregates({"1": r1, "2": r2}, seed=42) # EVA-A_mean: r1=1.0, r2=0.0 -> mean=0.5, success_rate=0.5 (1 of 2 >= 0.5) assert result["EVA-A_mean"]["mean"] == 0.5 @@ -295,7 +326,7 @@ def test_mean_success_rate(self): def test_empty_metrics(self): """No records -> empty result.""" - result = compute_run_level_aggregates({}) + result = compute_run_level_aggregates({}, seed=42) assert result == {} def test_records_with_none_aggregates_excluded(self): @@ -305,7 +336,7 @@ def test_records_with_none_aggregates_excluded(self): # EVA-A_pass should be None (missing faithfulness, agent_speech_fidelity) assert r1.aggregate_metrics["EVA-A_pass"] is None - result = compute_run_level_aggregates({"1": r1}) + result = compute_run_level_aggregates({"1": r1}, seed=42) # EVA-A_pass present but with None mean and none_count tracking assert result["EVA-A_pass"]["mean"] is None @@ -331,7 +362,7 @@ def test_pass_at_k_with_multi_trial(self): rm.aggregate_metrics = compute_record_aggregates(rm) all_metrics[f"1.1.1/trial_{trial_idx}"] = rm - result = compute_run_level_aggregates(all_metrics, num_draws=3) + result = compute_run_level_aggregates(all_metrics, num_draws=3, seed=42) assert "pass_k" in result eva_a = result["pass_k"]["EVA-A_pass"] @@ -369,7 +400,317 @@ def test_pass_at_k_excludes_record_with_none_trial(self): # Verify trial 2 has None for EVA-A_pass assert all_metrics["1.1.1/trial_2"].aggregate_metrics["EVA-A_pass"] is None - result = compute_run_level_aggregates(all_metrics, num_draws=3) + result = compute_run_level_aggregates(all_metrics, num_draws=3, seed=42) # Record 
should be excluded from pass_k since not all 3 trials are valid assert "pass_k" not in result or "EVA-A_pass" not in result.get("pass_k", {}) + + +class TestScenarioGrouping: + def test_per_metric_k1_record_equals_scenario(self): + r1 = make_record_metrics({"task_completion": 1.0}, record_id="1.1.1") + r2 = make_record_metrics({"task_completion": 0.5}, record_id="1.1.2") + vals = scenario_means_for_metric({"1.1.1": r1, "1.1.2": r2}, "task_completion") + np.testing.assert_allclose(sorted(vals.tolist()), [0.5, 1.0]) + + def test_per_metric_k3_collapses_trials(self): + # Same scenario id "1.1.1", three trials with scores 0.0, 0.5, 1.0 → scenario mean 0.5 + r0 = make_record_metrics({"task_completion": 0.0}, record_id="1.1.1/trial_0") + r1 = make_record_metrics({"task_completion": 0.5}, record_id="1.1.1/trial_1") + r2 = make_record_metrics({"task_completion": 1.0}, record_id="1.1.1/trial_2") + all_m = {"1.1.1/trial_0": r0, "1.1.1/trial_1": r1, "1.1.1/trial_2": r2} + vals = scenario_means_for_metric(all_m, "task_completion") + np.testing.assert_allclose(vals.tolist(), [0.5]) + + def test_per_metric_skips_errored_trials(self): + # One scenario, two trials; one trial has the metric errored + r0 = make_record_metrics({"task_completion": 1.0}, record_id="1.1.1/trial_0") + r1 = RecordMetrics( + record_id="1.1.1/trial_1", + metrics={"task_completion": MetricScore(name="task_completion", score=0.0, error="boom")}, + ) + vals = scenario_means_for_metric({"1.1.1/trial_0": r0, "1.1.1/trial_1": r1}, "task_completion") + np.testing.assert_allclose(vals.tolist(), [1.0]) # mean over the 1 valid trial + + def test_per_metric_drops_all_none_scenarios(self): + # Scenario with all trials errored is dropped from the bootstrap unit count. 
+ r0 = RecordMetrics( + record_id="1.1.1/trial_0", + metrics={"task_completion": MetricScore(name="task_completion", score=0.0, error="boom")}, + ) + r1 = RecordMetrics( + record_id="1.1.1/trial_1", + metrics={"task_completion": MetricScore(name="task_completion", score=0.0, error="boom")}, + ) + r2 = make_record_metrics({"task_completion": 0.5}, record_id="1.1.2/trial_0") + all_m = {"1.1.1/trial_0": r0, "1.1.1/trial_1": r1, "1.1.2/trial_0": r2} + vals = scenario_means_for_metric(all_m, "task_completion") + np.testing.assert_allclose(vals.tolist(), [0.5]) + + def test_composite_k3_collapses_trials(self): + # EVA-A_pass scenario value = mean over trials of per-trial 0/1 + comp = _composite_by_name("EVA-A_pass") + r0 = make_record_metrics( + {"task_completion": 1.0, "faithfulness": 0.5, "agent_speech_fidelity": 0.95}, + record_id="1.1.1/trial_0", + ) + r0.aggregate_metrics = compute_record_aggregates(r0) + r1 = make_record_metrics( + {"task_completion": 0.0, "faithfulness": 0.5, "agent_speech_fidelity": 0.95}, + record_id="1.1.1/trial_1", + ) + r1.aggregate_metrics = compute_record_aggregates(r1) + all_m = {"1.1.1/trial_0": r0, "1.1.1/trial_1": r1} + vals = _scenario_values_for_composite(all_m, comp) + # trial 0 passes (1.0), trial 1 fails (0.0) → scenario mean 0.5 + np.testing.assert_allclose(vals.tolist(), [0.5]) + + def test_composite_empty_returns_empty_array(self): + comp = _composite_by_name("EVA-A_pass") + vals = _scenario_values_for_composite({}, comp) + assert vals.shape == (0,) + + +class TestRunLevelCompositeCIs: + def test_emits_ci_fields_for_all_composites(self): + records = _make_clean_records(n=20, passing=10) + result = compute_run_level_aggregates(records, seed=42) + for comp_name in [ + "EVA-A_pass", + "EVA-X_pass", + "EVA-A_mean", + "EVA-X_mean", + "EVA-overall_mean", + "EVA-overall_pass", + ]: + assert "mean_ci_lower" in result[comp_name], f"missing mean_ci_lower on {comp_name}" + assert "mean_ci_upper" in result[comp_name], f"missing 
mean_ci_upper on {comp_name}" + assert "mean_ci_n_scenarios" in result[comp_name], f"missing mean_ci_n_scenarios on {comp_name}" + + def test_ci_brackets_point_estimate(self): + records = _make_clean_records(n=50, passing=25) + result = compute_run_level_aggregates(records, seed=42) + entry = result["EVA-A_pass"] + assert entry["mean_ci_lower"] <= entry["mean"] <= entry["mean_ci_upper"] + + def test_n_scenarios_equals_count_for_k1(self): + records = _make_clean_records(n=20, passing=10) + result = compute_run_level_aggregates(records, seed=42) + assert result["EVA-A_pass"]["mean_ci_n_scenarios"] == result["EVA-A_pass"]["count"] + + def test_empty_run_returns_empty_dict(self): + result = compute_run_level_aggregates({}, seed=42) + # The existing function already early-returns {} for empty input; CI + # addition must not change this. + assert result == {} + + def test_composite_with_no_valid_data_emits_null_ci(self): + # A record where every component has an error → composite is None + r = RecordMetrics( + record_id="1.1.1", + metrics={ + "task_completion": MetricScore(name="task_completion", score=0.0, error="boom"), + "faithfulness": MetricScore(name="faithfulness", score=0.0, error="boom"), + "agent_speech_fidelity": MetricScore(name="agent_speech_fidelity", score=0.0, error="boom"), + }, + ) + r.aggregate_metrics = compute_record_aggregates(r) + # Sanity: composite is None for this record + assert r.aggregate_metrics["EVA-A_pass"] is None + + result = compute_run_level_aggregates({"1.1.1": r}, seed=42) + entry = result["EVA-A_pass"] + assert entry["mean_ci_lower"] is None + assert entry["mean_ci_upper"] is None + assert entry["mean_ci_n_scenarios"] == 0 + + +class TestRunLevelPassKCIs: + def _make_multi_trial_records(self, scenario_pass_pattern: list[tuple[int, int]]): + """For each ``(n_scenarios, n_passing_trials_per_scenario)`` group, build records. + + Always uses k=3 trials per scenario. 
+ """ + records = {} + sid = 0 + for n_scen, n_pass in scenario_pass_pattern: + for _ in range(n_scen): + sid += 1 + for trial in range(3): + is_pass = trial < n_pass + r = make_record_metrics( + { + "task_completion": 1.0 if is_pass else 0.0, + "faithfulness": 0.5, + "agent_speech_fidelity": 0.95, + "conversation_progression": 0.5, + "turn_taking": 0.8, + "conciseness": 0.5, + }, + record_id=f"1.1.{sid}/trial_{trial}", + ) + r.aggregate_metrics = compute_record_aggregates(r) + records[f"1.1.{sid}/trial_{trial}"] = r + return records + + def test_pass_k_block_has_ci_fields(self): + records = self._make_multi_trial_records([(10, 3), (10, 1), (10, 0)]) + result = compute_run_level_aggregates(records, num_draws=3, seed=42) + block = result["pass_k"]["EVA-A_pass"] + for stat in ["pass_at_1", "pass_at_k", "pass_power_k_observed"]: + assert f"{stat}_ci_lower" in block, f"missing {stat}_ci_lower" + assert f"{stat}_ci_upper" in block, f"missing {stat}_ci_upper" + # pass_power_k_theoretical stays bare + assert "pass_power_k_theoretical_ci_lower" not in block + assert "pass_power_k_theoretical_ci_upper" not in block + + def test_pass_k_ci_brackets_point(self): + records = self._make_multi_trial_records([(10, 3), (10, 1), (10, 0)]) + result = compute_run_level_aggregates(records, num_draws=3, seed=42) + block = result["pass_k"]["EVA-A_pass"] + assert block["pass_at_1_ci_lower"] <= block["pass_at_1"] <= block["pass_at_1_ci_upper"] + assert block["pass_at_k_ci_lower"] <= block["pass_at_k"] <= block["pass_at_k_ci_upper"] + assert ( + block["pass_power_k_observed_ci_lower"] + <= block["pass_power_k_observed"] + <= block["pass_power_k_observed_ci_upper"] + ) + + +class TestPerMetricCIs: + def _records_with_metric(self, name: str, values: list[tuple[str, float | None]]): + """Build a dict[record_id, RecordMetrics] from (record_id, value) pairs. + + ``None`` value means the metric is errored for that record. 
+ """ + out = {} + for rid, v in values: + if v is None: + m = MetricScore(name=name, score=0.0, error="boom") + else: + m = MetricScore(name=name, score=v, normalized_score=v) + out[rid] = RecordMetrics(record_id=rid, metrics={name: m}) + return out + + def test_per_metric_mean_ci_fields(self): + records = self._records_with_metric( + "task_completion", + [(f"1.1.{i}", float(i) / 10) for i in range(20)], + ) + agg = MetricsRunner._build_per_metric_aggregates( + records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=42 + ) + entry = agg["task_completion"] + assert "mean_ci_lower" in entry + assert "mean_ci_upper" in entry + assert "mean_ci_n_scenarios" in entry + assert entry["mean_ci_lower"] <= entry["mean"] <= entry["mean_ci_upper"] + # n_scenarios == count for k=1 + assert entry["mean_ci_n_scenarios"] == entry["count"] + + def test_per_metric_no_valid_records_emits_null_ci(self): + records = self._records_with_metric( + "task_completion", + [("1.1.1", None), ("1.1.2", None)], + ) + agg = MetricsRunner._build_per_metric_aggregates( + records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=42 + ) + entry = agg["task_completion"] + assert entry["mean_ci_lower"] is None + assert entry["mean_ci_upper"] is None + assert entry["mean_ci_n_scenarios"] == 0 + + def test_per_metric_pass_k_ci_fields(self): + # Build per-scenario PassAtKResult fixtures and confirm pass_k CI fields appear. 
+ records = {} + for sid in range(10): + for trial in range(3): + m = MetricScore( + name="task_completion", score=1.0 if trial < 2 else 0.0, normalized_score=1.0 if trial < 2 else 0.0 + ) + records[f"1.1.{sid}/trial_{trial}"] = RecordMetrics( + record_id=f"1.1.{sid}/trial_{trial}", + metrics={"task_completion": m}, + ) + pass_at_k_results = { + f"1.1.{sid}": { + "task_completion": PassAtKResult( + metric_name="task_completion", + n=3, + k=3, + c=2, + pass_at_k=1.0, + pass_power_k=0.0, + threshold=0.5, + ) + } + for sid in range(10) + } + agg = MetricsRunner._build_per_metric_aggregates( + records, ["task_completion"], pass_at_k_results=pass_at_k_results, num_draws=3, seed=42 + ) + block = agg["task_completion"]["pass_k"] + for stat in ["pass_at_1", "pass_at_k", "pass_power_k_observed"]: + assert f"{stat}_ci_lower" in block + assert f"{stat}_ci_upper" in block + + +class TestRunSeedIntegration: + def test_within_run_byte_identical(self): + records = _make_clean_records(n=20, passing=10) + seed = run_seed("2026-04-16_18-55-44.848147_gpt-realtime-1.5") + a = compute_run_level_aggregates(records, seed=seed) + b = compute_run_level_aggregates(records, seed=seed) + assert a == b + + def test_across_run_independence(self): + records = _make_clean_records(n=20, passing=10) + # Seed strings chosen empirically: the bimodal n=20 fixture gives a low-variance + # bootstrap distribution where many seed pairs land on identical percentile bounds. + # The "x"/"y" pair produces differing CI bounds for both EVA-A_pass and EVA-A_mean. + seed_a = run_seed("x") + seed_b = run_seed("y") + a = compute_run_level_aggregates(records, seed=seed_a) + b = compute_run_level_aggregates(records, seed=seed_b) + # Point estimates are identical (same data); CI bounds differ (different MC noise). + for comp_name in ["EVA-A_pass", "EVA-A_mean"]: + assert a[comp_name]["mean"] == b[comp_name]["mean"] + # At least one of (lower, upper) must differ across runs. 
+ assert ( + a[comp_name]["mean_ci_lower"] != b[comp_name]["mean_ci_lower"] + or a[comp_name]["mean_ci_upper"] != b[comp_name]["mean_ci_upper"] + ) + + def test_per_metric_seed_propagation(self): + # The seed kwarg added in Task 5 to _build_per_metric_aggregates must actually + # change the CI bounds; same data + same seed must be deterministic. + records = {} + for i in range(20): + value = float(i) / 20.0 + m = MetricScore(name="task_completion", score=value, normalized_score=value) + records[f"1.1.{i}"] = RecordMetrics(record_id=f"1.1.{i}", metrics={"task_completion": m}) + + seed_a = run_seed("run-a") + seed_b = run_seed("run-b") + + agg_a1 = MetricsRunner._build_per_metric_aggregates( + records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=seed_a + ) + agg_a2 = MetricsRunner._build_per_metric_aggregates( + records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=seed_a + ) + agg_b = MetricsRunner._build_per_metric_aggregates( + records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=seed_b + ) + + # Same seed → byte-identical + assert agg_a1["task_completion"] == agg_a2["task_completion"] + # Different seed → at least one bound differs. The n=20 continuous-value fixture + # produces enough bootstrap variance for bounds to differ across seeds. 
+ entry_a = agg_a1["task_completion"] + entry_b = agg_b["task_completion"] + assert entry_a["mean"] == entry_b["mean"] + assert ( + entry_a["mean_ci_lower"] != entry_b["mean_ci_lower"] or entry_a["mean_ci_upper"] != entry_b["mean_ci_upper"] + ) diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py index 29101483..e50057ac 100644 --- a/tests/unit/metrics/test_runner.py +++ b/tests/unit/metrics/test_runner.py @@ -764,7 +764,7 @@ def test_all_successful(self): "r1": RecordMetrics(record_id="r1", metrics={"m": _ms("m", 0.8)}), "r2": RecordMetrics(record_id="r2", metrics={"m": _ms("m", 0.6)}), } - result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"]) + result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42) assert result["m"]["count"] == 2 assert result["m"]["none_count"] == 0 assert result["m"]["error_count"] == 0 @@ -778,7 +778,7 @@ def test_errors_and_missing_split(self): "r2": RecordMetrics(record_id="r2", metrics={"m": _ms("m", 0.0, error="JSON parse failed")}), "r3": RecordMetrics(record_id="r3", metrics={}), # metric missing entirely } - result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"]) + result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42) assert result["m"]["count"] == 1 assert result["m"]["error_count"] == 1 assert result["m"]["missing_count"] == 1 @@ -791,7 +791,7 @@ def test_only_errors(self): "r1": RecordMetrics(record_id="r1", metrics={"m": _ms("m", 0.0, error="fail1")}), "r2": RecordMetrics(record_id="r2", metrics={"m": _ms("m", 0.0, error="fail2")}), } - result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"]) + result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42) assert result["m"]["count"] == 0 assert result["m"]["error_count"] == 2 assert result["m"]["missing_count"] == 0 @@ -807,7 +807,7 @@ def test_higher_is_better_read_from_registered_metric(self): metrics={"response_speed": 
MetricScore(name="response_speed", score=1.2, normalized_score=None)}, ), } - result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["response_speed"]) + result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["response_speed"], seed=42) assert result["response_speed"]["higher_is_better"] is False def test_higher_is_better_defaults_true_for_unknown_metric(self): @@ -815,7 +815,7 @@ def test_higher_is_better_defaults_true_for_unknown_metric(self): all_metrics = { "r1": RecordMetrics(record_id="r1", metrics={"m": MetricScore(name="m", score=0.3)}), } - result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"]) + result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42) assert result["m"]["higher_is_better"] is True def test_sub_metric_direction_derived_from_suffix(self): @@ -848,7 +848,7 @@ def test_sub_metric_direction_derived_from_suffix(self): ), } result = MetricsRunner._build_per_metric_aggregates( - all_metrics, ["faithfulness", "transcription_accuracy_key_entities"] + all_metrics, ["faithfulness", "transcription_accuracy_key_entities"], seed=42 ) assert result["faithfulness"]["sub_metrics"]["hallucination_rate"]["higher_is_better"] is False assert result["transcription_accuracy_key_entities"]["sub_metrics"]["name_accuracy"]["higher_is_better"] is True diff --git a/tests/unit/utils/test_bootstrap.py b/tests/unit/utils/test_bootstrap.py new file mode 100644 index 00000000..361975bb --- /dev/null +++ b/tests/unit/utils/test_bootstrap.py @@ -0,0 +1,101 @@ +"""Unit tests for src/eva/utils/bootstrap.py.""" + +from __future__ import annotations + +import math +import subprocess +import sys +import textwrap + +import numpy as np + +from eva.utils.bootstrap import ( + ALPHA, + N_BOOT, + bootstrap_ci, + bootstrap_resample, + run_seed, +) + + +class TestBootstrapResample: + def test_shape_and_determinism(self): + values = np.array([0.0, 0.5, 1.0, 0.25, 0.75]) + a = bootstrap_resample(values, n_boot=100, seed=42) + 
b = bootstrap_resample(values, n_boot=100, seed=42) + assert a.shape == (100,) + np.testing.assert_array_equal(a, b) + + def test_different_seeds_differ(self): + values = np.array([0.0, 0.5, 1.0]) + a = bootstrap_resample(values, n_boot=100, seed=1) + b = bootstrap_resample(values, n_boot=100, seed=2) + assert not np.array_equal(a, b) + + def test_constant_input_constant_output(self): + values = np.full(10, 0.7) + boot = bootstrap_resample(values, n_boot=50, seed=0) + np.testing.assert_allclose(boot, 0.7) + + def test_empty_input(self): + boot = bootstrap_resample(np.array([]), n_boot=10, seed=0) + assert boot.shape == (0,) + + +class TestBootstrapCI: + def test_brackets_mean(self): + rng = np.random.default_rng(0) + values = rng.normal(loc=0.5, scale=0.1, size=100) + lower, upper = bootstrap_ci(values, n_boot=2000, seed=42, alpha=0.05) + assert lower < values.mean() < upper + assert upper - lower < 0.1 + + def test_narrower_alpha_widens(self): + rng = np.random.default_rng(0) + values = rng.normal(loc=0.5, scale=0.1, size=100) + lo_90, hi_90 = bootstrap_ci(values, n_boot=2000, seed=42, alpha=0.10) + lo_95, hi_95 = bootstrap_ci(values, n_boot=2000, seed=42, alpha=0.05) + assert (hi_95 - lo_95) > (hi_90 - lo_90) + + def test_empty_input_returns_nans(self): + lower, upper = bootstrap_ci(np.array([]), n_boot=100, seed=0) + assert math.isnan(lower) + assert math.isnan(upper) + + def test_single_value(self): + lower, upper = bootstrap_ci(np.array([0.42]), n_boot=100, seed=0) + assert lower == upper == 0.42 + + def test_n_boot_and_alpha_defaults_match_module_constants(self): + # bootstrap_ci's optional n_boot/alpha defaults should match the module constants. 
+ values = np.array([0.1, 0.2, 0.3, 0.4, 0.5]) + a = bootstrap_ci(values, seed=0) + b = bootstrap_ci(values, n_boot=N_BOOT, seed=0, alpha=ALPHA) + assert a == b + + +class TestRunSeed: + def test_deterministic_same_input(self): + assert run_seed("abc") == run_seed("abc") + + def test_different_inputs_differ(self): + assert run_seed("abc") != run_seed("def") + + def test_returns_nonnegative_int(self): + s = run_seed("any-run-id") + assert isinstance(s, int) + assert s >= 0 + assert s < 2**31 + + def test_cross_process_stable(self): + """run_seed must NOT use Python's salted hash(); spawn a subprocess and check equality.""" + in_process = run_seed("cross-process-check") + script = textwrap.dedent( + """ + from eva.utils.bootstrap import run_seed + print(run_seed("cross-process-check")) + """ + ) + result = subprocess.run([sys.executable, "-c", script], capture_output=True, text=True, check=True) + subprocess_value = int(result.stdout.strip()) + assert in_process == subprocess_value