diff --git a/src/eva/__init__.py b/src/eva/__init__.py
index ecc5f2a8..cf0ea3e5 100644
--- a/src/eva/__init__.py
+++ b/src/eva/__init__.py
@@ -11,4 +11,4 @@
 
 # Bump metrics_version when changes affect metric computation (metrics code,
 # judge prompts, pricing tables, postprocessor).
-metrics_version = "2.0.0"
+metrics_version = "2.1.0"
diff --git a/src/eva/metrics/aggregation.py b/src/eva/metrics/aggregation.py
index 7a19a9c5..7217c4dc 100644
--- a/src/eva/metrics/aggregation.py
+++ b/src/eva/metrics/aggregation.py
@@ -9,7 +9,10 @@
 from dataclasses import dataclass, field
 from typing import Literal
 
+import numpy as np
+
 from eva.models.results import RecordMetrics
+from eva.utils.bootstrap import bootstrap_ci_fields, mean_ci_fields
 from eva.utils.pass_at_k import (
     compute_pass_at_k,
     compute_pass_power_k,
@@ -83,6 +86,55 @@ class EVACompositeDefinition:
 ]
 
 
+def _scenario_means(per_record_values: dict[str, float | None]) -> np.ndarray:
+    """Average per-record values within each base scenario id (one mean per scenario).
+
+    ``None`` values are skipped; a scenario left with no values is dropped entirely.
+    """
+    grouped: dict[str, list[float]] = {}
+    for record_id, val in per_record_values.items():
+        if val is None:
+            continue
+        base_id, _ = parse_trial_record_id(record_id)  # only the base id is needed for grouping
+        grouped.setdefault(base_id, []).append(float(val))
+    if not grouped:
+        return np.array([], dtype=float)  # nothing usable: empty float array
+    return np.array([sum(vs) / len(vs) for vs in grouped.values()], dtype=float)
+
+
+def scenario_means_for_metric(
+    all_metrics: dict[str, RecordMetrics],
+    metric_name: str,
+) -> np.ndarray:
+    """Return the per-scenario mean (over trials) of one metric's score.
+
+    Each record's value is ``get_score(metric_name)``, i.e. ``normalized_score``
+    falling back to ``score``. Scenarios whose trials are all missing/errored are
+    dropped. For k=1 runs each record is its own scenario.
+    """
+    return _scenario_means(
+        {record_id: record_metrics.get_score(metric_name) for record_id, record_metrics in all_metrics.items()}
+    )
+
+
+def _scenario_values_for_composite(
+    all_metrics: dict[str, RecordMetrics],
+    comp: EVACompositeDefinition,
+) -> np.ndarray:
+    """Return the per-scenario mean (over trials) of one composite's per-trial value.
+
+    Values are read from each record's ``aggregate_metrics[comp.name]``; a missing
+    key counts as ``None``. For pass/derived composites the mean is the scenario
+    pass rate. Scenarios where every trial is ``None`` are dropped.
+    """
+    return _scenario_means(
+        {
+            record_id: record_metrics.aggregate_metrics.get(comp.name)
+            for record_id, record_metrics in all_metrics.items()
+        }
+    )
+
+
 def _check_threshold(value: float, operator: str, threshold: float) -> bool:
     """Check whether a value passes the given threshold comparison."""
     if operator == "==":
@@ -159,6 +211,8 @@ def compute_run_level_aggregates(
     all_metrics: dict[str, RecordMetrics],
     num_draws: int = 1,
     composites: list[EVACompositeDefinition] | None = None,
+    *,
+    seed: int,
 ) -> dict:
     """Compute run-level aggregate scores from all records.
 
@@ -166,9 +220,12 @@ def compute_run_level_aggregates(
         all_metrics: Dict mapping record ID to RecordMetrics (must have aggregate_metrics populated).
         num_draws: Number of draws (k) for pass@k computation.
         composites: Custom composite definitions. Defaults to EVA_COMPOSITES.
+        seed: Bootstrap seed for CI computation. Keyword-only and required.
+            Production callers pass ``run_seed(run_dir.name)`` for within-run
+            determinism.
 
     Returns:
-        Dict with per-composite statistics and optional pass@k data.
+        Dict with per-composite statistics, CI fields, and optional pass@k data.
     """
     composites = composites or EVA_COMPOSITES
 
@@ -206,11 +263,14 @@ def compute_run_level_aggregates(
             else:
                 entry["success_rate"] = round(sum(1 for v in values if v >= 0.5) / len(values), 4)
 
+        # Bootstrap CI on the per-scenario mean.
+        entry.update(mean_ci_fields(_scenario_values_for_composite(all_metrics, comp), seed=seed))
+
         result[comp.name] = entry
 
     # pass_k for aggregate metrics if multi-trial
     if num_draws > 1:
-        pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites)
+        pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites, seed=seed)
         if pass_k_data:
             result["pass_k"] = pass_k_data
 
@@ -221,6 +281,8 @@ def _compute_aggregate_pass_k(
     all_metrics: dict[str, RecordMetrics],
     num_draws: int,
     composites: list[EVACompositeDefinition] | None = None,
+    *,
+    seed: int,
 ) -> dict:
     """Compute pass@1, pass@k, pass^k (observed), and pass^k (theoretical) for aggregate metrics across trials."""
     composites = composites or EVA_COMPOSITES
@@ -264,7 +326,7 @@ def _compute_aggregate_pass_k(
 
         if pass_at_k_values:
             count = len(pass_at_k_values)
-            result[comp.name] = {
+            entry = {
                 "pass_at_1": round(sum(pass_at_1_values) / count, 4),
                 "pass_at_k": round(sum(pass_at_k_values) / count, 4),
                 "pass_power_k_observed": round(sum(pass_power_k_observed_values) / count, 4),
@@ -272,5 +334,16 @@ def _compute_aggregate_pass_k(
                 "k": num_draws,
                 "count": count,
             }
+            entry.update(
+                bootstrap_ci_fields(
+                    {
+                        "pass_at_1": pass_at_1_values,
+                        "pass_at_k": pass_at_k_values,
+                        "pass_power_k_observed": pass_power_k_observed_values,
+                    },
+                    seed=seed,
+                )
+            )
+            result[comp.name] = entry
 
     return result
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index 470ec6fc..aad09079 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -10,7 +10,11 @@
 import yaml
 
 from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric
-from eva.metrics.aggregation import compute_record_aggregates, compute_run_level_aggregates
+from eva.metrics.aggregation import (
+    compute_record_aggregates,
+    compute_run_level_aggregates,
+    scenario_means_for_metric,
+)
 from eva.metrics.base import BaseMetric, MetricContext
 from eva.metrics.legacy_aliases import rename_metric_keys
 from eva.metrics.processor import MetricsContextProcessor
@@ -20,6 +24,7 @@
 from eva.models.config import PipelineType, get_pipeline_type
 from eva.models.record import EvaluationRecord
 from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
+from eva.utils.bootstrap import bootstrap_ci_fields, mean_ci_fields, run_seed
 from eva.utils.hash_utils import get_dict_hash
 from eva.utils.logging import get_logger
 from eva.utils.pass_at_k import (
@@ -632,6 +637,8 @@ def _build_per_metric_aggregates(
         metric_names: list[str],
         pass_at_k_results: dict[str, dict[str, PassAtKResult]] | None = None,
         num_draws: int = 1,
+        *,
+        seed: int,
     ) -> dict[str, dict[str, Any]]:
         """Build per-metric aggregate stats including pass_k.
 
@@ -640,6 +647,9 @@ def _build_per_metric_aggregates(
             metric_names: List of metric names to aggregate.
             pass_at_k_results: Per-record pass@k results (if multi-trial).
             num_draws: Number of draws (k) for pass@k.
+            seed: Bootstrap seed for CI computation. Keyword-only and required;
+                production callers pass ``run_seed(run_dir.name)`` for within-run
+                determinism.
 
         Returns:
             Dict mapping metric name to aggregate stats.
@@ -698,6 +708,9 @@ def _build_per_metric_aggregates(
                         coverage["not_applicable_turns"] = total_not_applicable_across_records
                     entry["per_turn_coverage"] = coverage
 
+                # Bootstrap CI on the per-scenario mean.
+                entry.update(mean_ci_fields(scenario_means_for_metric(all_metrics, name), seed=seed))
+
                 entry["higher_is_better"] = _metric_higher_is_better(name)
                 metric_aggregates[name] = entry
 
@@ -720,7 +733,7 @@ def _build_per_metric_aggregates(
 
                 if pass_at_k_values:
                     count = len(pass_at_k_values)
-                    metric_aggregates[name]["pass_k"] = {
+                    pass_k_block: dict[str, Any] = {
                         "pass_at_1": round(sum(pass_at_1_values) / count, 4),
                         "pass_at_k": round(sum(pass_at_k_values) / count, 4),
                         "pass_power_k_observed": round(sum(pass_power_k_obs_values) / count, 4),
@@ -728,6 +741,17 @@ def _build_per_metric_aggregates(
                         "k": num_draws,
                         "count": count,
                     }
+                    pass_k_block.update(
+                        bootstrap_ci_fields(
+                            {
+                                "pass_at_1": pass_at_1_values,
+                                "pass_at_k": pass_at_k_values,
+                                "pass_power_k_observed": pass_power_k_obs_values,
+                            },
+                            seed=seed,
+                        )
+                    )
+                    metric_aggregates[name]["pass_k"] = pass_k_block
 
         # Generic sub-metric aggregation.
         # Sub-keys are collected in first-seen insertion order so each metric controls
@@ -920,8 +944,13 @@ async def _save_summary(
         # Aggregate per_metric for ALL metrics present across records (not just those just run),
         # so that a partial re-run (e.g. --metrics response_speed) preserves other metrics.
         all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
+        seed = run_seed(self.run_dir.name)
         metric_aggregates = self._build_per_metric_aggregates(
-            all_metrics, all_metric_names, pass_at_k_results, self.num_draws
+            all_metrics,
+            all_metric_names,
+            pass_at_k_results,
+            self.num_draws,
+            seed=seed,
         )
 
         # Compute metric failures for MetricsRunResult (only for metrics just run)
@@ -934,7 +963,7 @@ async def _save_summary(
                         metric_failures.setdefault(name, []).append(record_id)
 
         # Compute EVA composite run-level aggregates
-        overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws)
+        overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws, seed=seed)
 
         # Load existing summary to preserve fields for metrics not being re-run
         summary_path = self.run_dir / "metrics_summary.json"
@@ -1038,12 +1067,17 @@ async def run_aggregate_only(cls, run_dir: Path, num_draws: int = 1) -> None:
         all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
 
         # Compute per-metric aggregates (including pass_k)
+        seed = run_seed(run_dir.name)
         metric_aggregates = cls._build_per_metric_aggregates(
-            all_metrics, all_metric_names, pass_at_k_results or None, num_draws
+            all_metrics,
+            all_metric_names,
+            pass_at_k_results or None,
+            num_draws,
+            seed=seed,
         )
 
         # Compute run-level aggregates
-        overall_scores = compute_run_level_aggregates(all_metrics, num_draws)
+        overall_scores = compute_run_level_aggregates(all_metrics, num_draws, seed=seed)
 
         # Update metrics_summary.json (preserve existing fields, replace computed sections)
         summary_path = run_dir / "metrics_summary.json"
diff --git a/src/eva/utils/bootstrap.py b/src/eva/utils/bootstrap.py
new file mode 100644
index 00000000..073dfd14
--- /dev/null
+++ b/src/eva/utils/bootstrap.py
@@ -0,0 +1,99 @@
+"""Percentile bootstrap primitives for sample-mean confidence intervals.
+
+This module is pure: arrays/sequences in; arrays, floats, or plain dicts out.
+It has no eva imports and is safe to use from anywhere in the package.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from collections.abc import Sequence
+from typing import Any
+
+import numpy as np
+
+N_BOOT = 2000
+ALPHA = 0.05
+
+
+def run_seed(run_id: str) -> int:
+    """Return a stable seed in ``[0, 2**31)`` derived from ``run_id`` (the run directory name).
+
+    Uses ``hashlib.sha256`` rather than Python's built-in ``hash()`` because the
+    latter is salted per interpreter process — re-invoking ``eva metrics`` on the
+    same run would otherwise yield slightly different CI bounds. SHA-based hashing
+    is byte-stable across processes.
+    """
+    h = hashlib.sha256(run_id.encode()).digest()
+    return int.from_bytes(h[:4], "big") % (2**31)  # first 4 digest bytes, big-endian, reduced mod 2**31
+
+
+def bootstrap_resample(values: np.ndarray, n_boot: int, seed: int) -> np.ndarray:
+    """Return the means of ``n_boot`` same-size resamples (with replacement) of ``values``.
+
+    Returns a zero-length array for empty input.
+    """
+    values = np.asarray(values, dtype=float)
+    if len(values) == 0:
+        return np.array([], dtype=float)
+    rng = np.random.default_rng(seed)
+    idx = rng.integers(0, len(values), size=(n_boot, len(values)))  # one row of indices per resample
+    return values[idx].mean(axis=1)
+
+
+def bootstrap_ci(
+    values: np.ndarray,
+    n_boot: int = N_BOOT,
+    *,
+    seed: int,
+    alpha: float = ALPHA,
+) -> tuple[float, float]:
+    """Percentile bootstrap CI on the mean at level ``1 - alpha`` (95% by default).
+
+    ``seed`` is keyword-only and required: callers must supply a deliberate
+    seed (typically from ``run_seed(run_dir.name)``) so behavior is deterministic.
+
+    Returns ``(lower, upper)``; ``(nan, nan)`` if the input is empty.
+    """
+    boot = bootstrap_resample(values, n_boot=n_boot, seed=seed)
+    if len(boot) == 0:
+        return float("nan"), float("nan")
+    lower = float(np.percentile(boot, 100 * alpha / 2))  # lower tail of the resampled means
+    upper = float(np.percentile(boot, 100 * (1 - alpha / 2)))  # upper tail of the resampled means
+    return lower, upper
+
+
+def bootstrap_ci_fields(
+    samples: dict[str, Sequence[float]],
+    *,
+    seed: int,
+    decimals: int = 4,
+) -> dict[str, float | None]:
+    """Return ``{name}_ci_lower`` / ``{name}_ci_upper`` per sample (``None`` bounds for an empty sample)."""
+    out: dict[str, float | None] = {}
+    for name, sample in samples.items():
+        lower, upper = bootstrap_ci(sample, seed=seed) if len(sample) else (None, None)  # NaN is not valid JSON
+        out[f"{name}_ci_lower"] = None if lower is None else round(lower, decimals)
+        out[f"{name}_ci_upper"] = None if upper is None else round(upper, decimals)
+    return out
+
+
+def mean_ci_fields(
+    scenario_values: np.ndarray,
+    *,
+    seed: int,
+    decimals: int = 4,
+) -> dict[str, Any]:
+    """Return ``mean_ci_lower`` / ``mean_ci_upper`` / ``mean_ci_n_scenarios``.
+
+    Empty ``scenario_values`` yields ``None`` bounds and ``n_scenarios=0``; otherwise
+    the bounds are a percentile bootstrap CI on the mean, rounded to ``decimals``.
+    """
+    if len(scenario_values) == 0:
+        return {"mean_ci_lower": None, "mean_ci_upper": None, "mean_ci_n_scenarios": 0}
+    lower, upper = bootstrap_ci(scenario_values, seed=seed)
+    return {
+        "mean_ci_lower": round(lower, decimals),
+        "mean_ci_upper": round(upper, decimals),
+        "mean_ci_n_scenarios": len(scenario_values),
+    }
diff --git a/tests/unit/metrics/test_aggregation.py b/tests/unit/metrics/test_aggregation.py
index 93ff83c5..738412b6 100644
--- a/tests/unit/metrics/test_aggregation.py
+++ b/tests/unit/metrics/test_aggregation.py
@@ -1,17 +1,48 @@
 """Unit tests for EVA composite metric aggregation."""
 
+import numpy as np
 import pytest
 
 from eva.metrics.aggregation import (
+    EVA_COMPOSITES,
     _check_threshold,
+    _scenario_values_for_composite,
     compute_record_aggregates,
     compute_run_level_aggregates,
+    scenario_means_for_metric,
 )
-from eva.models.results import MetricScore, RecordMetrics
+from eva.metrics.runner import MetricsRunner
+from eva.models.results import MetricScore, PassAtKResult, RecordMetrics
+from eva.utils.bootstrap import run_seed
 
 from .conftest import make_record_metrics
 
 
+def _composite_by_name(name: str):
+    return next(c for c in EVA_COMPOSITES if c.name == name)  # StopIteration if no composite has this name
+
+
+def _make_clean_records(n: int, passing: int) -> dict[str, RecordMetrics]:
+    """Return ``n`` records keyed ``1.1.{i}``; the first ``passing`` (task_completion=1.0) pass EVA-A_pass."""
+    records: dict[str, RecordMetrics] = {}
+    for i in range(n):
+        is_pass = i < passing
+        r = make_record_metrics(
+            {
+                "task_completion": 1.0 if is_pass else 0.0,
+                "faithfulness": 0.5,
+                "agent_speech_fidelity": 0.95,
+                "conversation_progression": 0.5,
+                "turn_taking": 0.8,
+                "conciseness": 0.5,
+            },
+            record_id=f"1.1.{i}",
+        )
+        r.aggregate_metrics = compute_record_aggregates(r)
+        records[f"1.1.{i}"] = r
+    return records
+
+
 class TestCheckThreshold:
     def test_eq_exact(self):
         assert _check_threshold(1.0, "==", 1.0) is True
@@ -255,7 +286,7 @@ def test_basic_run_level(self):
         )
         r2.aggregate_metrics = compute_record_aggregates(r2)
 
-        result = compute_run_level_aggregates({"1.1.1": r1, "1.1.2": r2})
+        result = compute_run_level_aggregates({"1.1.1": r1, "1.1.2": r2}, seed=42)
 
         # EVA-A_pass: r1=1.0, r2=0.0 -> mean=0.5
         assert result["EVA-A_pass"]["mean"] == 0.5
@@ -287,7 +318,7 @@ def test_mean_success_rate(self):
         )
         r2.aggregate_metrics = compute_record_aggregates(r2)
 
-        result = compute_run_level_aggregates({"1": r1, "2": r2})
+        result = compute_run_level_aggregates({"1": r1, "2": r2}, seed=42)
 
         # EVA-A_mean: r1=1.0, r2=0.0 -> mean=0.5, success_rate=0.5 (1 of 2 >= 0.5)
         assert result["EVA-A_mean"]["mean"] == 0.5
@@ -295,7 +326,7 @@ def test_mean_success_rate(self):
 
     def test_empty_metrics(self):
         """No records -> empty result."""
-        result = compute_run_level_aggregates({})
+        result = compute_run_level_aggregates({}, seed=42)
         assert result == {}
 
     def test_records_with_none_aggregates_excluded(self):
@@ -305,7 +336,7 @@ def test_records_with_none_aggregates_excluded(self):
         # EVA-A_pass should be None (missing faithfulness, agent_speech_fidelity)
         assert r1.aggregate_metrics["EVA-A_pass"] is None
 
-        result = compute_run_level_aggregates({"1": r1})
+        result = compute_run_level_aggregates({"1": r1}, seed=42)
 
         # EVA-A_pass present but with None mean and none_count tracking
         assert result["EVA-A_pass"]["mean"] is None
@@ -331,7 +362,7 @@ def test_pass_at_k_with_multi_trial(self):
             rm.aggregate_metrics = compute_record_aggregates(rm)
             all_metrics[f"1.1.1/trial_{trial_idx}"] = rm
 
-        result = compute_run_level_aggregates(all_metrics, num_draws=3)
+        result = compute_run_level_aggregates(all_metrics, num_draws=3, seed=42)
 
         assert "pass_k" in result
         eva_a = result["pass_k"]["EVA-A_pass"]
@@ -369,7 +400,317 @@ def test_pass_at_k_excludes_record_with_none_trial(self):
         # Verify trial 2 has None for EVA-A_pass
         assert all_metrics["1.1.1/trial_2"].aggregate_metrics["EVA-A_pass"] is None
 
-        result = compute_run_level_aggregates(all_metrics, num_draws=3)
+        result = compute_run_level_aggregates(all_metrics, num_draws=3, seed=42)
 
         # Record should be excluded from pass_k since not all 3 trials are valid
         assert "pass_k" not in result or "EVA-A_pass" not in result.get("pass_k", {})
+
+
+class TestScenarioGrouping:  # grouping of trial records into per-scenario means
+    def test_per_metric_k1_record_equals_scenario(self):
+        r1 = make_record_metrics({"task_completion": 1.0}, record_id="1.1.1")
+        r2 = make_record_metrics({"task_completion": 0.5}, record_id="1.1.2")
+        vals = scenario_means_for_metric({"1.1.1": r1, "1.1.2": r2}, "task_completion")
+        np.testing.assert_allclose(sorted(vals.tolist()), [0.5, 1.0])  # sorted: output order is not under test
+
+    def test_per_metric_k3_collapses_trials(self):
+        # Same scenario id "1.1.1", three trials with scores 0.0, 0.5, 1.0 → scenario mean 0.5
+        r0 = make_record_metrics({"task_completion": 0.0}, record_id="1.1.1/trial_0")
+        r1 = make_record_metrics({"task_completion": 0.5}, record_id="1.1.1/trial_1")
+        r2 = make_record_metrics({"task_completion": 1.0}, record_id="1.1.1/trial_2")
+        all_m = {"1.1.1/trial_0": r0, "1.1.1/trial_1": r1, "1.1.1/trial_2": r2}
+        vals = scenario_means_for_metric(all_m, "task_completion")
+        np.testing.assert_allclose(vals.tolist(), [0.5])
+
+    def test_per_metric_skips_errored_trials(self):
+        # One scenario, two trials; the errored trial must not contribute its placeholder score of 0.0
+        r0 = make_record_metrics({"task_completion": 1.0}, record_id="1.1.1/trial_0")
+        r1 = RecordMetrics(
+            record_id="1.1.1/trial_1",
+            metrics={"task_completion": MetricScore(name="task_completion", score=0.0, error="boom")},
+        )
+        vals = scenario_means_for_metric({"1.1.1/trial_0": r0, "1.1.1/trial_1": r1}, "task_completion")
+        np.testing.assert_allclose(vals.tolist(), [1.0])  # mean over the 1 valid trial
+
+    def test_per_metric_drops_all_none_scenarios(self):
+        # Scenario with all trials errored is dropped from the bootstrap unit count.
+        r0 = RecordMetrics(
+            record_id="1.1.1/trial_0",
+            metrics={"task_completion": MetricScore(name="task_completion", score=0.0, error="boom")},
+        )
+        r1 = RecordMetrics(
+            record_id="1.1.1/trial_1",
+            metrics={"task_completion": MetricScore(name="task_completion", score=0.0, error="boom")},
+        )
+        r2 = make_record_metrics({"task_completion": 0.5}, record_id="1.1.2/trial_0")
+        all_m = {"1.1.1/trial_0": r0, "1.1.1/trial_1": r1, "1.1.2/trial_0": r2}
+        vals = scenario_means_for_metric(all_m, "task_completion")
+        np.testing.assert_allclose(vals.tolist(), [0.5])
+
+    def test_composite_k3_collapses_trials(self):
+        # EVA-A_pass scenario value = mean over trials of per-trial 0/1
+        comp = _composite_by_name("EVA-A_pass")
+        r0 = make_record_metrics(
+            {"task_completion": 1.0, "faithfulness": 0.5, "agent_speech_fidelity": 0.95},
+            record_id="1.1.1/trial_0",
+        )
+        r0.aggregate_metrics = compute_record_aggregates(r0)
+        r1 = make_record_metrics(
+            {"task_completion": 0.0, "faithfulness": 0.5, "agent_speech_fidelity": 0.95},
+            record_id="1.1.1/trial_1",
+        )
+        r1.aggregate_metrics = compute_record_aggregates(r1)
+        all_m = {"1.1.1/trial_0": r0, "1.1.1/trial_1": r1}
+        vals = _scenario_values_for_composite(all_m, comp)
+        # trial 0 passes (1.0), trial 1 fails (0.0) → scenario mean 0.5
+        np.testing.assert_allclose(vals.tolist(), [0.5])
+
+    def test_composite_empty_returns_empty_array(self):
+        comp = _composite_by_name("EVA-A_pass")
+        vals = _scenario_values_for_composite({}, comp)
+        assert vals.shape == (0,)
+
+
+class TestRunLevelCompositeCIs:  # mean_ci_* fields emitted by compute_run_level_aggregates
+    def test_emits_ci_fields_for_all_composites(self):
+        records = _make_clean_records(n=20, passing=10)
+        result = compute_run_level_aggregates(records, seed=42)
+        for comp_name in [
+            "EVA-A_pass",
+            "EVA-X_pass",
+            "EVA-A_mean",
+            "EVA-X_mean",
+            "EVA-overall_mean",
+            "EVA-overall_pass",
+        ]:
+            assert "mean_ci_lower" in result[comp_name], f"missing mean_ci_lower on {comp_name}"
+            assert "mean_ci_upper" in result[comp_name], f"missing mean_ci_upper on {comp_name}"
+            assert "mean_ci_n_scenarios" in result[comp_name], f"missing mean_ci_n_scenarios on {comp_name}"
+
+    def test_ci_brackets_point_estimate(self):
+        records = _make_clean_records(n=50, passing=25)
+        result = compute_run_level_aggregates(records, seed=42)
+        entry = result["EVA-A_pass"]
+        assert entry["mean_ci_lower"] <= entry["mean"] <= entry["mean_ci_upper"]
+
+    def test_n_scenarios_equals_count_for_k1(self):
+        records = _make_clean_records(n=20, passing=10)
+        result = compute_run_level_aggregates(records, seed=42)
+        assert result["EVA-A_pass"]["mean_ci_n_scenarios"] == result["EVA-A_pass"]["count"]
+
+    def test_empty_run_returns_empty_dict(self):
+        result = compute_run_level_aggregates({}, seed=42)
+        # The function already returned {} for empty input before CIs were added;
+        # adding the CI fields must not change that.
+        assert result == {}
+
+    def test_composite_with_no_valid_data_emits_null_ci(self):
+        # A record where every component has an error → composite is None
+        r = RecordMetrics(
+            record_id="1.1.1",
+            metrics={
+                "task_completion": MetricScore(name="task_completion", score=0.0, error="boom"),
+                "faithfulness": MetricScore(name="faithfulness", score=0.0, error="boom"),
+                "agent_speech_fidelity": MetricScore(name="agent_speech_fidelity", score=0.0, error="boom"),
+            },
+        )
+        r.aggregate_metrics = compute_record_aggregates(r)
+        # Sanity: composite is None for this record
+        assert r.aggregate_metrics["EVA-A_pass"] is None
+
+        result = compute_run_level_aggregates({"1.1.1": r}, seed=42)
+        entry = result["EVA-A_pass"]
+        assert entry["mean_ci_lower"] is None
+        assert entry["mean_ci_upper"] is None
+        assert entry["mean_ci_n_scenarios"] == 0
+
+
+class TestRunLevelPassKCIs:  # CI fields inside the run-level ``pass_k`` block
+    def _make_multi_trial_records(self, scenario_pass_pattern: list[tuple[int, int]]):
+        """For each ``(n_scenarios, n_passing_trials_per_scenario)`` group, build records.
+
+        Always uses k=3 trials per scenario; the first ``n_pass`` trials get task_completion=1.0.
+        """
+        records = {}
+        sid = 0
+        for n_scen, n_pass in scenario_pass_pattern:
+            for _ in range(n_scen):
+                sid += 1
+                for trial in range(3):
+                    is_pass = trial < n_pass
+                    r = make_record_metrics(
+                        {
+                            "task_completion": 1.0 if is_pass else 0.0,
+                            "faithfulness": 0.5,
+                            "agent_speech_fidelity": 0.95,
+                            "conversation_progression": 0.5,
+                            "turn_taking": 0.8,
+                            "conciseness": 0.5,
+                        },
+                        record_id=f"1.1.{sid}/trial_{trial}",
+                    )
+                    r.aggregate_metrics = compute_record_aggregates(r)
+                    records[f"1.1.{sid}/trial_{trial}"] = r
+        return records
+
+    def test_pass_k_block_has_ci_fields(self):
+        records = self._make_multi_trial_records([(10, 3), (10, 1), (10, 0)])
+        result = compute_run_level_aggregates(records, num_draws=3, seed=42)
+        block = result["pass_k"]["EVA-A_pass"]
+        for stat in ["pass_at_1", "pass_at_k", "pass_power_k_observed"]:
+            assert f"{stat}_ci_lower" in block, f"missing {stat}_ci_lower"
+            assert f"{stat}_ci_upper" in block, f"missing {stat}_ci_upper"
+        # pass_power_k_theoretical is reported without CI fields
+        assert "pass_power_k_theoretical_ci_lower" not in block
+        assert "pass_power_k_theoretical_ci_upper" not in block
+
+    def test_pass_k_ci_brackets_point(self):
+        records = self._make_multi_trial_records([(10, 3), (10, 1), (10, 0)])
+        result = compute_run_level_aggregates(records, num_draws=3, seed=42)
+        block = result["pass_k"]["EVA-A_pass"]
+        assert block["pass_at_1_ci_lower"] <= block["pass_at_1"] <= block["pass_at_1_ci_upper"]
+        assert block["pass_at_k_ci_lower"] <= block["pass_at_k"] <= block["pass_at_k_ci_upper"]
+        assert (
+            block["pass_power_k_observed_ci_lower"]
+            <= block["pass_power_k_observed"]
+            <= block["pass_power_k_observed_ci_upper"]
+        )
+
+
+class TestPerMetricCIs:  # CI fields emitted by MetricsRunner._build_per_metric_aggregates
+    def _records_with_metric(self, name: str, values: list[tuple[str, float | None]]):
+        """Build a dict[record_id, RecordMetrics] from (record_id, value) pairs.
+
+        ``None`` value means the metric is errored for that record (score 0.0 plus an error).
+        """
+        out = {}
+        for rid, v in values:
+            if v is None:
+                m = MetricScore(name=name, score=0.0, error="boom")
+            else:
+                m = MetricScore(name=name, score=v, normalized_score=v)
+            out[rid] = RecordMetrics(record_id=rid, metrics={name: m})
+        return out
+
+    def test_per_metric_mean_ci_fields(self):
+        records = self._records_with_metric(
+            "task_completion",
+            [(f"1.1.{i}", float(i) / 10) for i in range(20)],
+        )
+        agg = MetricsRunner._build_per_metric_aggregates(
+            records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=42
+        )
+        entry = agg["task_completion"]
+        assert "mean_ci_lower" in entry
+        assert "mean_ci_upper" in entry
+        assert "mean_ci_n_scenarios" in entry
+        assert entry["mean_ci_lower"] <= entry["mean"] <= entry["mean_ci_upper"]
+        # n_scenarios == count for k=1
+        assert entry["mean_ci_n_scenarios"] == entry["count"]
+
+    def test_per_metric_no_valid_records_emits_null_ci(self):
+        records = self._records_with_metric(
+            "task_completion",
+            [("1.1.1", None), ("1.1.2", None)],
+        )
+        agg = MetricsRunner._build_per_metric_aggregates(
+            records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=42
+        )
+        entry = agg["task_completion"]
+        assert entry["mean_ci_lower"] is None
+        assert entry["mean_ci_upper"] is None
+        assert entry["mean_ci_n_scenarios"] == 0
+
+    def test_per_metric_pass_k_ci_fields(self):
+        # Build per-scenario PassAtKResult fixtures and confirm pass_k CI fields appear.
+        records = {}
+        for sid in range(10):
+            for trial in range(3):
+                m = MetricScore(
+                    name="task_completion", score=1.0 if trial < 2 else 0.0, normalized_score=1.0 if trial < 2 else 0.0
+                )
+                records[f"1.1.{sid}/trial_{trial}"] = RecordMetrics(
+                    record_id=f"1.1.{sid}/trial_{trial}",
+                    metrics={"task_completion": m},
+                )
+        pass_at_k_results = {
+            f"1.1.{sid}": {
+                "task_completion": PassAtKResult(
+                    metric_name="task_completion",
+                    n=3,
+                    k=3,
+                    c=2,
+                    pass_at_k=1.0,
+                    pass_power_k=0.0,
+                    threshold=0.5,
+                )
+            }
+            for sid in range(10)
+        }
+        agg = MetricsRunner._build_per_metric_aggregates(
+            records, ["task_completion"], pass_at_k_results=pass_at_k_results, num_draws=3, seed=42
+        )
+        block = agg["task_completion"]["pass_k"]
+        for stat in ["pass_at_1", "pass_at_k", "pass_power_k_observed"]:
+            assert f"{stat}_ci_lower" in block
+            assert f"{stat}_ci_upper" in block
+
+
+class TestRunSeedIntegration:  # run_seed-derived seeds: deterministic within a run, varying across runs
+    def test_within_run_byte_identical(self):
+        records = _make_clean_records(n=20, passing=10)
+        seed = run_seed("2026-04-16_18-55-44.848147_gpt-realtime-1.5")
+        a = compute_run_level_aggregates(records, seed=seed)
+        b = compute_run_level_aggregates(records, seed=seed)
+        assert a == b
+
+    def test_across_run_independence(self):
+        records = _make_clean_records(n=20, passing=10)
+        # Seed strings chosen empirically: the bimodal n=20 fixture gives a low-variance
+        # bootstrap distribution where many seed pairs land on identical percentile bounds.
+        # The "x"/"y" pair produces differing CI bounds for both EVA-A_pass and EVA-A_mean.
+        seed_a = run_seed("x")
+        seed_b = run_seed("y")
+        a = compute_run_level_aggregates(records, seed=seed_a)
+        b = compute_run_level_aggregates(records, seed=seed_b)
+        # Point estimates are identical (same data); CI bounds differ (different MC noise).
+        for comp_name in ["EVA-A_pass", "EVA-A_mean"]:
+            assert a[comp_name]["mean"] == b[comp_name]["mean"]
+            # At least one of (lower, upper) must differ across runs.
+            assert (
+                a[comp_name]["mean_ci_lower"] != b[comp_name]["mean_ci_lower"]
+                or a[comp_name]["mean_ci_upper"] != b[comp_name]["mean_ci_upper"]
+            )
+
+    def test_per_metric_seed_propagation(self):
+        # The seed kwarg of _build_per_metric_aggregates must actually reach the bootstrap:
+        # different seeds change the CI bounds; same data + same seed must be deterministic.
+        records = {}
+        for i in range(20):
+            value = float(i) / 20.0
+            m = MetricScore(name="task_completion", score=value, normalized_score=value)
+            records[f"1.1.{i}"] = RecordMetrics(record_id=f"1.1.{i}", metrics={"task_completion": m})
+
+        seed_a = run_seed("run-a")
+        seed_b = run_seed("run-b")
+
+        agg_a1 = MetricsRunner._build_per_metric_aggregates(
+            records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=seed_a
+        )
+        agg_a2 = MetricsRunner._build_per_metric_aggregates(
+            records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=seed_a
+        )
+        agg_b = MetricsRunner._build_per_metric_aggregates(
+            records, ["task_completion"], pass_at_k_results=None, num_draws=1, seed=seed_b
+        )
+
+        # Same seed → byte-identical
+        assert agg_a1["task_completion"] == agg_a2["task_completion"]
+        # Different seed → at least one bound differs. The n=20 continuous-value fixture
+        # produces enough bootstrap variance for bounds to differ across seeds.
+        entry_a = agg_a1["task_completion"]
+        entry_b = agg_b["task_completion"]
+        assert entry_a["mean"] == entry_b["mean"]
+        assert (
+            entry_a["mean_ci_lower"] != entry_b["mean_ci_lower"] or entry_a["mean_ci_upper"] != entry_b["mean_ci_upper"]
+        )
diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py
index 29101483..e50057ac 100644
--- a/tests/unit/metrics/test_runner.py
+++ b/tests/unit/metrics/test_runner.py
@@ -764,7 +764,7 @@ def test_all_successful(self):
             "r1": RecordMetrics(record_id="r1", metrics={"m": _ms("m", 0.8)}),
             "r2": RecordMetrics(record_id="r2", metrics={"m": _ms("m", 0.6)}),
         }
-        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"])
+        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42)
         assert result["m"]["count"] == 2
         assert result["m"]["none_count"] == 0
         assert result["m"]["error_count"] == 0
@@ -778,7 +778,7 @@ def test_errors_and_missing_split(self):
             "r2": RecordMetrics(record_id="r2", metrics={"m": _ms("m", 0.0, error="JSON parse failed")}),
             "r3": RecordMetrics(record_id="r3", metrics={}),  # metric missing entirely
         }
-        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"])
+        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42)
         assert result["m"]["count"] == 1
         assert result["m"]["error_count"] == 1
         assert result["m"]["missing_count"] == 1
@@ -791,7 +791,7 @@ def test_only_errors(self):
             "r1": RecordMetrics(record_id="r1", metrics={"m": _ms("m", 0.0, error="fail1")}),
             "r2": RecordMetrics(record_id="r2", metrics={"m": _ms("m", 0.0, error="fail2")}),
         }
-        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"])
+        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42)
         assert result["m"]["count"] == 0
         assert result["m"]["error_count"] == 2
         assert result["m"]["missing_count"] == 0
@@ -807,7 +807,7 @@ def test_higher_is_better_read_from_registered_metric(self):
                 metrics={"response_speed": MetricScore(name="response_speed", score=1.2, normalized_score=None)},
             ),
         }
-        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["response_speed"])
+        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["response_speed"], seed=42)
         assert result["response_speed"]["higher_is_better"] is False
 
     def test_higher_is_better_defaults_true_for_unknown_metric(self):
@@ -815,7 +815,7 @@ def test_higher_is_better_defaults_true_for_unknown_metric(self):
         all_metrics = {
             "r1": RecordMetrics(record_id="r1", metrics={"m": MetricScore(name="m", score=0.3)}),
         }
-        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"])
+        result = MetricsRunner._build_per_metric_aggregates(all_metrics, ["m"], seed=42)
         assert result["m"]["higher_is_better"] is True
 
     def test_sub_metric_direction_derived_from_suffix(self):
@@ -848,7 +848,7 @@ def test_sub_metric_direction_derived_from_suffix(self):
             ),
         }
         result = MetricsRunner._build_per_metric_aggregates(
-            all_metrics, ["faithfulness", "transcription_accuracy_key_entities"]
+            all_metrics, ["faithfulness", "transcription_accuracy_key_entities"], seed=42
         )
         assert result["faithfulness"]["sub_metrics"]["hallucination_rate"]["higher_is_better"] is False
         assert result["transcription_accuracy_key_entities"]["sub_metrics"]["name_accuracy"]["higher_is_better"] is True
diff --git a/tests/unit/utils/test_bootstrap.py b/tests/unit/utils/test_bootstrap.py
new file mode 100644
index 00000000..361975bb
--- /dev/null
+++ b/tests/unit/utils/test_bootstrap.py
@@ -0,0 +1,101 @@
+"""Unit tests for src/eva/utils/bootstrap.py."""
+
+from __future__ import annotations
+
+import math
+import subprocess
+import sys
+import textwrap
+
+import numpy as np
+
+from eva.utils.bootstrap import (
+    ALPHA,
+    N_BOOT,
+    bootstrap_ci,
+    bootstrap_resample,
+    run_seed,
+)
+
+
+class TestBootstrapResample:
+    """Behavioral checks for ``bootstrap_resample``."""
+
+    def test_shape_and_determinism(self):
+        sample = np.array([0.0, 0.5, 1.0, 0.25, 0.75])
+        first, second = (bootstrap_resample(sample, n_boot=100, seed=42) for _ in range(2))
+        assert first.shape == (100,)
+        np.testing.assert_array_equal(first, second)
+
+    def test_different_seeds_differ(self):
+        sample = np.array([0.0, 0.5, 1.0])
+        with_seed_1 = bootstrap_resample(sample, n_boot=100, seed=1)
+        with_seed_2 = bootstrap_resample(sample, n_boot=100, seed=2)
+        assert not np.array_equal(with_seed_1, with_seed_2)
+
+    def test_constant_input_constant_output(self):
+        draws = bootstrap_resample(np.full(10, 0.7), n_boot=50, seed=0)
+        np.testing.assert_allclose(draws, 0.7)
+
+    def test_empty_input(self):
+        draws = bootstrap_resample(np.array([]), n_boot=10, seed=0)
+        assert draws.shape == (0,)
+
+
+class TestBootstrapCI:
+    """Behavioral checks for ``bootstrap_ci``."""
+
+    def test_brackets_mean(self):
+        sample = np.random.default_rng(0).normal(loc=0.5, scale=0.1, size=100)
+        ci_low, ci_high = bootstrap_ci(sample, n_boot=2000, seed=42, alpha=0.05)
+        assert ci_low < sample.mean() < ci_high
+        assert ci_high - ci_low < 0.1
+
+    def test_narrower_alpha_widens(self):
+        # The alpha=0.05 (95%) interval must come out wider than the alpha=0.10 (90%) one.
+        sample = np.random.default_rng(0).normal(loc=0.5, scale=0.1, size=100)
+        low_90, high_90 = bootstrap_ci(sample, n_boot=2000, seed=42, alpha=0.10)
+        low_95, high_95 = bootstrap_ci(sample, n_boot=2000, seed=42, alpha=0.05)
+        width_90, width_95 = high_90 - low_90, high_95 - low_95
+        assert width_95 > width_90
+
+    def test_empty_input_returns_nans(self):
+        ci_low, ci_high = bootstrap_ci(np.array([]), n_boot=100, seed=0)
+        assert math.isnan(ci_low)
+        assert math.isnan(ci_high)
+
+    def test_single_value(self):
+        ci_low, ci_high = bootstrap_ci(np.array([0.42]), n_boot=100, seed=0)
+        assert ci_low == ci_high == 0.42
+
+    def test_n_boot_and_alpha_defaults_match_module_constants(self):
+        # bootstrap_ci's optional n_boot/alpha defaults should match the module constants.
+        sample = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
+        assert bootstrap_ci(sample, seed=0) == bootstrap_ci(sample, n_boot=N_BOOT, seed=0, alpha=ALPHA)
+
+
+class TestRunSeed:
+    def test_deterministic_same_input(self):
+        assert run_seed("abc") == run_seed("abc")
+
+    def test_different_inputs_differ(self):
+        assert run_seed("abc") != run_seed("def")
+
+    def test_returns_nonnegative_int(self):
+        s = run_seed("any-run-id")
+        assert isinstance(s, int)
+        assert 0 <= s < 2**31
+
+    def test_cross_process_stable(self):
+        """run_seed must NOT use Python's salted hash(); children with pinned, distinct salts must agree."""
+        import os
+        script = textwrap.dedent(
+            """
+            from eva.utils.bootstrap import run_seed
+            print(run_seed("cross-process-check"))
+            """
+        )
+        for salt in ("1", "2"):  # the parent may export PYTHONHASHSEED (tox does); don't inherit it
+            env = {**os.environ, "PYTHONHASHSEED": salt}
+            out = subprocess.run([sys.executable, "-c", script], capture_output=True, text=True, check=True, env=env)
+            assert int(out.stdout.strip()) == run_seed("cross-process-check")