From b743bf10a1366c3e65a135343a6f831808f0f741 Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Fri, 17 Apr 2026 17:48:58 -0500
Subject: [PATCH 1/3] feat: add operational counters for sampling,
 requirements, and tools (#467)

Adds six new OpenTelemetry counters giving operators visibility into
retry behaviour, validation failure rates, and tool call health:
mellea.sampling.attempts/successes/failures, mellea.requirement.checks/failures,
and mellea.tool.calls.

Follows the established lazy-init globals + record_* helpers + Plugin hooks
pattern. Extends SamplingIterationPayload and SamplingLoopEndPayload with a
strategy_name field so plugins can tag counters by strategy class.

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 .../evaluation-and-observability/metrics.md   |  43 ++
 .../evaluation-and-observability/telemetry.md |   5 +
 mellea/plugins/hooks/sampling.py              |   4 +
 mellea/stdlib/sampling/base.py                |   3 +
 mellea/telemetry/__init__.py                  |  10 +
 mellea/telemetry/metrics.py                   | 177 +++++++++
 mellea/telemetry/metrics_plugins.py           | 111 +++++-
 test/telemetry/test_metrics.py                |  67 ++++
 test/telemetry/test_metrics_backend.py        |  49 +++
 test/telemetry/test_metrics_operational.py    | 376 ++++++++++++++++++
 test/telemetry/test_metrics_plugins.py        | 236 ++++++++++-
 11 files changed, 1079 insertions(+), 2 deletions(-)
 create mode 100644 test/telemetry/test_metrics_operational.py

diff --git a/docs/docs/evaluation-and-observability/metrics.md b/docs/docs/evaluation-and-observability/metrics.md
index ac87d3661..f6e198f23 100644
--- a/docs/docs/evaluation-and-observability/metrics.md
+++ b/docs/docs/evaluation-and-observability/metrics.md
@@ -173,6 +173,49 @@ Error metrics are recorded when a backend raises an exception during generation,
 after the request has been dispatched to the provider. Construction-time errors
 (e.g. missing API key) are not captured by the error counter.
 
+## Operational metrics
+
+Mellea records metrics for its internal sampling, validation, and tool execution
+loops. These counters give visibility into retry behavior, validation failure
+rates, and tool call health — independent of the underlying LLM provider.
+
+### Sampling counters
+
+| Metric Name | Type | Unit | Description |
+| ----------- | ---- | ---- | ----------- |
+| `mellea.sampling.attempts` | Counter | `{attempt}` | Sampling attempts (one recorded per loop iteration) |
+| `mellea.sampling.successes` | Counter | `{sample}` | Sampling loops that produced a passing sample |
+| `mellea.sampling.failures` | Counter | `{failure}` | Sampling loops that exhausted the loop budget without success |
+
+All sampling metrics include:
+
+| Attribute | Description | Example Values |
+| --------- | ----------- | -------------- |
+| `strategy` | Sampling strategy class name | `RejectionSamplingStrategy`, `MultiTurnStrategy`, `RepairTemplateStrategy` |
+
+### Requirement counters
+
+| Metric Name | Type | Unit | Description |
+| ----------- | ---- | ---- | ----------- |
+| `mellea.requirement.checks` | Counter | `{check}` | Requirement validation checks performed |
+| `mellea.requirement.failures` | Counter | `{failure}` | Requirement validation checks that failed |
+
+| Attribute | Description | Example Values |
+| --------- | ----------- | -------------- |
+| `requirement` | Requirement class name | `LLMaJRequirement`, `PythonExecutionReq`, `ALoraRequirement`, `GuardianCheck` |
+| `reason` | Free-form, human-readable failure reason from the validation result, or `"unknown"` when none is available (`mellea.requirement.failures` only) | `"Output did not satisfy constraint"`, `"unknown"` |
+
+### Tool counter
+
+| Metric Name | Type | Unit | Description |
+| ----------- | ---- | ---- | ----------- |
+| `mellea.tool.calls` | Counter | `{call}` | Tool invocations by name and status |
+
+| Attribute | Description | Example Values |
+| --------- | ----------- | -------------- |
+| `tool` | Name of the invoked tool | `"search"`, `"calculator"` |
+| `status` | Execution outcome | `"success"`, `"failure"` |
+
 ## Metrics export configuration
 
 Mellea supports multiple metrics exporters that can be used independently or
diff --git a/docs/docs/evaluation-and-observability/telemetry.md b/docs/docs/evaluation-and-observability/telemetry.md
index 4e592f2cb..ee19e1076 100644
--- a/docs/docs/evaluation-and-observability/telemetry.md
+++ b/docs/docs/evaluation-and-observability/telemetry.md
@@ -125,6 +125,11 @@ OpenTelemetry. No code changes are required:
   `mellea.llm.ttfb` (streaming requests only).
 - **Error counter** — `mellea.llm.errors` on each failed backend call,
   classified by semantic error type.
+- **Sampling counters** — `mellea.sampling.attempts`, `mellea.sampling.successes`,
+  and `mellea.sampling.failures` per strategy.
+- **Requirement counters** — `mellea.requirement.checks` and
+  `mellea.requirement.failures` per requirement type.
+- **Tool counter** — `mellea.tool.calls` by tool name and status.
 
 The metrics API also exposes `create_counter`, `create_histogram`, and
 `create_up_down_counter` for instrumenting your own application code.
diff --git a/mellea/plugins/hooks/sampling.py b/mellea/plugins/hooks/sampling.py
index 4e0bceefe..8488e9745 100644
--- a/mellea/plugins/hooks/sampling.py
+++ b/mellea/plugins/hooks/sampling.py
@@ -31,6 +31,7 @@ class SamplingIterationPayload(MelleaBasePayload):
     """Payload for ``sampling_iteration`` — after each sampling attempt.
 
     Attributes:
+        strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``).
         iteration: 1-based iteration number within the sampling loop.
         action: The ``Component`` used for this attempt.
 
@@ -42,6 +43,7 @@ class SamplingIterationPayload(MelleaBasePayload):
         total_count: Total number of requirements evaluated.
     """
 
+    strategy_name: str = ""
     iteration: int = 0
     action: Any = None
     result: Any = None
@@ -78,6 +80,7 @@ class SamplingLoopEndPayload(MelleaBasePayload):
     """Payload for ``sampling_loop_end`` — when sampling completes.
 
     Attributes:
+        strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``).
         success: ``True`` if at least one attempt passed all requirements.
         iterations_used: Total number of iterations the loop executed.
         final_result: The selected ``ModelOutputThunk`` (best success or best failure).
@@ -91,6 +94,7 @@ class SamplingLoopEndPayload(MelleaBasePayload):
             ``(Requirement, ValidationResult)`` tuples for iteration *i*.
     """
 
+    strategy_name: str = ""
     success: bool = False
     iterations_used: int = 0
     final_result: Any = None
diff --git a/mellea/stdlib/sampling/base.py b/mellea/stdlib/sampling/base.py
index b843028cb..67a116d00 100644
--- a/mellea/stdlib/sampling/base.py
+++ b/mellea/stdlib/sampling/base.py
@@ -247,6 +247,7 @@ async def sample(
                         from ...plugins.hooks.sampling import SamplingIterationPayload
 
                         iter_payload = SamplingIterationPayload(
+                            strategy_name=type(self).__name__,
                             iteration=loop_count,
                             action=next_action,
                             result=result,
@@ -272,6 +273,7 @@ async def sample(
                             from ...plugins.hooks.sampling import SamplingLoopEndPayload
 
                             end_payload = SamplingLoopEndPayload(
+                                strategy_name=type(self).__name__,
                                 success=True,
                                 iterations_used=loop_count,
                                 final_result=result,
@@ -362,6 +364,7 @@ async def sample(
                     sample_contexts[best_failed_index] if sample_contexts else context
                 )
                 end_payload = SamplingLoopEndPayload(
+                    strategy_name=type(self).__name__,
                     success=False,
                     iterations_used=loop_count,
                     final_result=sampled_results[best_failed_index],
diff --git a/mellea/telemetry/__init__.py b/mellea/telemetry/__init__.py
index 288e404db..62449b892 100644
--- a/mellea/telemetry/__init__.py
+++ b/mellea/telemetry/__init__.py
@@ -79,7 +79,12 @@ def my_function():
     create_up_down_counter,
     is_metrics_enabled,
     record_request_duration,
+    record_requirement_check,
+    record_requirement_failure,
+    record_sampling_attempt,
+    record_sampling_outcome,
     record_token_usage_metrics,
+    record_tool_call,
     record_ttfb,
 )
 from .tracing import (
@@ -111,7 +116,12 @@ def my_function():
     "is_backend_tracing_enabled",
     "is_metrics_enabled",
     "record_request_duration",
+    "record_requirement_check",
+    "record_requirement_failure",
+    "record_sampling_attempt",
+    "record_sampling_outcome",
     "record_token_usage_metrics",
+    "record_tool_call",
     "record_ttfb",
     "set_span_attribute",
     "set_span_error",
diff --git a/mellea/telemetry/metrics.py b/mellea/telemetry/metrics.py
index eeb4f2ab0..199d3c054 100644
--- a/mellea/telemetry/metrics.py
+++ b/mellea/telemetry/metrics.py
@@ -54,6 +54,9 @@
 - Token counters: mellea.llm.tokens.input, mellea.llm.tokens.output (unit: tokens)
 - Latency histograms: mellea.llm.request.duration (unit: s), mellea.llm.ttfb (unit: s, streaming only)
 - Error counter: mellea.llm.errors (unit: {error}), categorized by semantic error type
+- Sampling counters: mellea.sampling.attempts (unit: {attempt}), mellea.sampling.successes (unit: {sample}), mellea.sampling.failures (unit: {failure})
+- Requirement counters: mellea.requirement.checks (unit: {check}), mellea.requirement.failures (unit: {failure})
+- Tool counter: mellea.tool.calls (unit: {call}), tagged by tool name and status
 
 Programmatic usage:
     from mellea.telemetry.metrics import create_counter, create_histogram
@@ -719,6 +722,175 @@ def record_error(
     )
 
 
+_sampling_attempts_counter: Any = None
+_sampling_successes_counter: Any = None
+_sampling_failures_counter: Any = None
+
+
+def _get_sampling_attempts_counter() -> Any:
+    """Get or create the sampling attempts counter (internal use only)."""
+    global _sampling_attempts_counter
+
+    if _sampling_attempts_counter is None:
+        _sampling_attempts_counter = create_counter(
+            "mellea.sampling.attempts",
+            description="Total number of sampling attempts per strategy",
+            unit="{attempt}",
+        )
+    return _sampling_attempts_counter
+
+
+def _get_sampling_successes_counter() -> Any:
+    """Get or create the sampling successes counter (internal use only)."""
+    global _sampling_successes_counter
+
+    if _sampling_successes_counter is None:
+        _sampling_successes_counter = create_counter(
+            "mellea.sampling.successes",
+            description="Total number of successful sampling loops per strategy",
+            unit="{sample}",
+        )
+    return _sampling_successes_counter
+
+
+def _get_sampling_failures_counter() -> Any:
+    """Get or create the sampling failures counter (internal use only)."""
+    global _sampling_failures_counter
+
+    if _sampling_failures_counter is None:
+        _sampling_failures_counter = create_counter(
+            "mellea.sampling.failures",
+            description="Total number of failed sampling loops (budget exhausted) per strategy",
+            unit="{failure}",
+        )
+    return _sampling_failures_counter
+
+
+def record_sampling_attempt(strategy: str) -> None:
+    """Record one sampling attempt for the given strategy.
+
+    This is a no-op when metrics are disabled, ensuring zero overhead.
+
+    Args:
+        strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``).
+    """
+    if not _METRICS_ENABLED:
+        return
+
+    _get_sampling_attempts_counter().add(1, {"strategy": strategy})
+
+
+def record_sampling_outcome(strategy: str, success: bool) -> None:
+    """Record the final outcome (success or failure) of a sampling loop.
+
+    This is a no-op when metrics are disabled, ensuring zero overhead.
+
+    Args:
+        strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``).
+        success: ``True`` if at least one attempt passed all requirements.
+    """
+    if not _METRICS_ENABLED:
+        return
+
+    if success:
+        _get_sampling_successes_counter().add(1, {"strategy": strategy})
+    else:
+        _get_sampling_failures_counter().add(1, {"strategy": strategy})
+
+
+_requirement_checks_counter: Any = None
+_requirement_failures_counter: Any = None
+
+
+def _get_requirement_checks_counter() -> Any:
+    """Get or create the requirement checks counter (internal use only)."""
+    global _requirement_checks_counter
+
+    if _requirement_checks_counter is None:
+        _requirement_checks_counter = create_counter(
+            "mellea.requirement.checks",
+            description="Total number of requirement validation checks",
+            unit="{check}",
+        )
+    return _requirement_checks_counter
+
+
+def _get_requirement_failures_counter() -> Any:
+    """Get or create the requirement failures counter (internal use only)."""
+    global _requirement_failures_counter
+
+    if _requirement_failures_counter is None:
+        _requirement_failures_counter = create_counter(
+            "mellea.requirement.failures",
+            description="Total number of requirement validation failures",
+            unit="{failure}",
+        )
+    return _requirement_failures_counter
+
+
+def record_requirement_check(requirement: str) -> None:
+    """Record one requirement validation check.
+
+    This is a no-op when metrics are disabled, ensuring zero overhead.
+
+    Args:
+        requirement: Requirement class name (e.g. ``"LLMaJRequirement"``).
+    """
+    if not _METRICS_ENABLED:
+        return
+
+    _get_requirement_checks_counter().add(1, {"requirement": requirement})
+
+
+def record_requirement_failure(requirement: str, reason: str) -> None:
+    """Record one requirement validation failure.
+
+    This is a no-op when metrics are disabled, ensuring zero overhead.
+
+    Args:
+        requirement: Requirement class name (e.g. ``"LLMaJRequirement"``).
+        reason: Human-readable failure reason from ``ValidationResult.reason``.
+    """
+    if not _METRICS_ENABLED:
+        return
+
+    _get_requirement_failures_counter().add(
+        1, {"requirement": requirement, "reason": reason}
+    )
+
+
+_tool_calls_counter: Any = None
+
+
+def _get_tool_calls_counter() -> Any:
+    """Get or create the tool calls counter (internal use only)."""
+    global _tool_calls_counter
+
+    if _tool_calls_counter is None:
+        _tool_calls_counter = create_counter(
+            "mellea.tool.calls",
+            description="Total number of tool invocations by name and status",
+            unit="{call}",
+        )
+    return _tool_calls_counter
+
+
+def record_tool_call(tool: str, status: str) -> None:
+    """Record one tool invocation.
+
+    This is a no-op when metrics are disabled, ensuring zero overhead.
+
+    Args:
+        tool: Name of the tool that was invoked.
+        status: ``"success"`` if the tool executed without error, ``"failure"`` otherwise.
+    """
+    if not _METRICS_ENABLED:
+        return
+
+    counter = _get_tool_calls_counter()
+    counter.add(1, {"tool": tool, "status": status})
+
+
 __all__ = [
     "classify_error",
     "create_counter",
@@ -727,6 +899,11 @@ def record_error(
     "is_metrics_enabled",
     "record_error",
     "record_request_duration",
+    "record_requirement_check",
+    "record_requirement_failure",
+    "record_sampling_attempt",
+    "record_sampling_outcome",
     "record_token_usage_metrics",
+    "record_tool_call",
     "record_ttfb",
 ]
diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py
index 1c0561f58..3d71d783b 100644
--- a/mellea/telemetry/metrics_plugins.py
+++ b/mellea/telemetry/metrics_plugins.py
@@ -6,6 +6,9 @@
 - TokenMetricsPlugin: Records token usage statistics from ModelOutputThunk.usage
 - LatencyMetricsPlugin: Records request duration and TTFB latency histograms
 - ErrorMetricsPlugin: Records LLM error counts categorized by semantic error type
+- SamplingMetricsPlugin: Records sampling attempt/success/failure counts per strategy
+- RequirementMetricsPlugin: Records requirement validation check and failure counts
+- ToolMetricsPlugin: Records tool invocation counts by name and status
 """
 
 from __future__ import annotations
@@ -21,6 +24,12 @@
         GenerationErrorPayload,
         GenerationPostCallPayload,
     )
+    from mellea.plugins.hooks.sampling import (
+        SamplingIterationPayload,
+        SamplingLoopEndPayload,
+    )
+    from mellea.plugins.hooks.tool import ToolPostInvokePayload
+    from mellea.plugins.hooks.validation import ValidationPostCheckPayload
 
 
 class TokenMetricsPlugin(Plugin, name="token_metrics", priority=50):
@@ -127,5 +136,105 @@ async def record_error_metrics(
         )
 
 
+class SamplingMetricsPlugin(Plugin, name="sampling_metrics", priority=54):
+    """Records sampling loop attempt and outcome metrics.
+
+    Hooks into ``sampling_iteration`` to count attempts per strategy and
+    ``sampling_loop_end`` to count successes and failures.
+    """
+
+    @hook("sampling_iteration", mode=PluginMode.FIRE_AND_FORGET)
+    async def record_sampling_attempt(
+        self, payload: SamplingIterationPayload, context: dict[str, Any]
+    ) -> None:
+        """Record one sampling attempt after each iteration.
+
+        Args:
+            payload: Contains strategy_name and iteration metadata.
+            context: Plugin context (unused).
+        """
+        from mellea.telemetry.metrics import record_sampling_attempt
+
+        record_sampling_attempt(payload.strategy_name or "unknown")
+
+    @hook("sampling_loop_end", mode=PluginMode.FIRE_AND_FORGET)
+    async def record_sampling_outcome(
+        self, payload: SamplingLoopEndPayload, context: dict[str, Any]
+    ) -> None:
+        """Record success or failure when the sampling loop ends.
+
+        Args:
+            payload: Contains strategy_name and success flag.
+            context: Plugin context (unused).
+        """
+        from mellea.telemetry.metrics import record_sampling_outcome
+
+        record_sampling_outcome(payload.strategy_name or "unknown", payload.success)
+
+
+class RequirementMetricsPlugin(Plugin, name="requirement_metrics", priority=55):
+    """Records requirement validation check and failure metrics.
+
+    Hooks into ``validation_post_check`` to count checks and failures per
+    requirement type after each validation batch.
+    """
+
+    @hook("validation_post_check", mode=PluginMode.FIRE_AND_FORGET)
+    async def record_requirement_metrics(
+        self, payload: ValidationPostCheckPayload, context: dict[str, Any]
+    ) -> None:
+        """Record validation checks and failures for each requirement.
+
+        Args:
+            payload: Contains requirements list and corresponding results.
+            context: Plugin context (unused).
+        """
+        from mellea.telemetry.metrics import (
+            record_requirement_check,
+            record_requirement_failure,
+        )
+
+        for req, result in zip(payload.requirements, payload.results):
+            req_name = type(req).__name__
+            record_requirement_check(req_name)
+            if not bool(result):
+                reason = getattr(result, "reason", None) or "unknown"
+                record_requirement_failure(req_name, reason)
+
+
+class ToolMetricsPlugin(Plugin, name="tool_metrics", priority=56):
+    """Records tool invocation metrics.
+
+    Hooks into ``tool_post_invoke`` to count tool calls by name and success/failure status.
+    """
+
+    @hook("tool_post_invoke", mode=PluginMode.FIRE_AND_FORGET)
+    async def record_tool_call(
+        self, payload: ToolPostInvokePayload, context: dict[str, Any]
+    ) -> None:
+        """Record one tool invocation after it completes.
+
+        Args:
+            payload: Contains model_tool_call (with name) and success flag.
+            context: Plugin context (unused).
+        """
+        from mellea.telemetry.metrics import record_tool_call
+
+        tool_name = (
+            payload.model_tool_call.name
+            if payload.model_tool_call is not None
+            else "unknown"
+        )
+        status = "success" if payload.success else "failure"
+        record_tool_call(tool_name, status)
+
+
 # All metrics plugins to auto-register when metrics are enabled
-_METRICS_PLUGIN_CLASSES = (TokenMetricsPlugin, LatencyMetricsPlugin, ErrorMetricsPlugin)
+_METRICS_PLUGIN_CLASSES = (
+    TokenMetricsPlugin,
+    LatencyMetricsPlugin,
+    ErrorMetricsPlugin,
+    SamplingMetricsPlugin,
+    RequirementMetricsPlugin,
+    ToolMetricsPlugin,
+)
diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py
index d96d71687..e5a9360ad 100644
--- a/test/telemetry/test_metrics.py
+++ b/test/telemetry/test_metrics.py
@@ -574,6 +574,12 @@ def test_metric_instruments_lazy_initialization(enable_metrics):
         _duration_histogram,
         _input_token_counter,
         _output_token_counter,
+        _requirement_checks_counter,
+        _requirement_failures_counter,
+        _sampling_attempts_counter,
+        _sampling_failures_counter,
+        _sampling_successes_counter,
+        _tool_calls_counter,
         _ttfb_histogram,
     )
 
@@ -582,21 +588,44 @@ def test_metric_instruments_lazy_initialization(enable_metrics):
     assert _output_token_counter is None
     assert _duration_histogram is None
     assert _ttfb_histogram is None
+    assert _sampling_attempts_counter is None
+    assert _sampling_successes_counter is None
+    assert _sampling_failures_counter is None
+    assert _requirement_checks_counter is None
+    assert _requirement_failures_counter is None
+    assert _tool_calls_counter is None
 
     from mellea.telemetry.metrics import (
         record_request_duration,
+        record_requirement_check,
+        record_requirement_failure,
+        record_sampling_attempt,
+        record_sampling_outcome,
         record_token_usage_metrics,
+        record_tool_call,
     )
 
     record_token_usage_metrics(
         input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama"
     )
     record_request_duration(duration_s=1.0, model="llama2:7b", provider="ollama")
+    record_sampling_attempt("RejectionSamplingStrategy")
+    record_sampling_outcome("RejectionSamplingStrategy", success=True)
+    record_sampling_outcome("RejectionSamplingStrategy", success=False)
+    record_requirement_check("LLMaJRequirement")
+    record_requirement_failure("LLMaJRequirement", "constraint not met")
+    record_tool_call("search", "success")
 
     from mellea.telemetry.metrics import (
         _duration_histogram,
         _input_token_counter,
         _output_token_counter,
+        _requirement_checks_counter,
+        _requirement_failures_counter,
+        _sampling_attempts_counter,
+        _sampling_failures_counter,
+        _sampling_successes_counter,
+        _tool_calls_counter,
         _ttfb_histogram,
     )
 
@@ -606,6 +635,12 @@ def test_metric_instruments_lazy_initialization(enable_metrics):
     assert (
         _ttfb_histogram is not None
     )  # initialized together via _get_latency_histograms
+    assert _sampling_attempts_counter is not None
+    assert _sampling_successes_counter is not None
+    assert _sampling_failures_counter is not None
+    assert _requirement_checks_counter is not None
+    assert _requirement_failures_counter is not None
+    assert _tool_calls_counter is not None
 
 
 def test_record_metrics_noop_when_disabled(clean_metrics_env):
@@ -613,7 +648,12 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env):
     from mellea.telemetry.metrics import (
         record_error,
         record_request_duration,
+        record_requirement_check,
+        record_requirement_failure,
+        record_sampling_attempt,
+        record_sampling_outcome,
         record_token_usage_metrics,
+        record_tool_call,
     )
 
     record_token_usage_metrics(
@@ -626,6 +666,11 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env):
         provider="ollama",
         exception_class="TimeoutError",
     )
+    record_sampling_attempt("RejectionSamplingStrategy")
+    record_sampling_outcome("RejectionSamplingStrategy", success=True)
+    record_requirement_check("LLMaJRequirement")
+    record_requirement_failure("LLMaJRequirement", "constraint not met")
+    record_tool_call("search", "success")
 
     # No instruments should have been initialized
     from mellea.telemetry.metrics import (
@@ -633,6 +678,12 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env):
         _error_counter,
         _input_token_counter,
         _output_token_counter,
+        _requirement_checks_counter,
+        _requirement_failures_counter,
+        _sampling_attempts_counter,
+        _sampling_failures_counter,
+        _sampling_successes_counter,
+        _tool_calls_counter,
         _ttfb_histogram,
     )
 
@@ -641,19 +692,35 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env):
     assert _duration_histogram is None
     assert _ttfb_histogram is None
     assert _error_counter is None
+    assert _sampling_attempts_counter is None
+    assert _sampling_successes_counter is None
+    assert _sampling_failures_counter is None
+    assert _requirement_checks_counter is None
+    assert _requirement_failures_counter is None
+    assert _tool_calls_counter is None
 
 
 def test_record_functions_exported_in_public_api():
     """Test that all record functions are exported in the public API."""
     from mellea.telemetry import (
         record_request_duration,
+        record_requirement_check,
+        record_requirement_failure,
+        record_sampling_attempt,
+        record_sampling_outcome,
         record_token_usage_metrics,
+        record_tool_call,
         record_ttfb,
     )
 
     assert callable(record_token_usage_metrics)
     assert callable(record_request_duration)
     assert callable(record_ttfb)
+    assert callable(record_sampling_attempt)
+    assert callable(record_sampling_outcome)
+    assert callable(record_requirement_check)
+    assert callable(record_requirement_failure)
+    assert callable(record_tool_call)
 
 
 # Token Counter Tests
diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py
index cd34ce194..95a40b5f7 100644
--- a/test/telemetry/test_metrics_backend.py
+++ b/test/telemetry/test_metrics_backend.py
@@ -89,6 +89,12 @@ def _setup_metrics_provider(metrics_module, metric_reader):
     metrics_module._duration_histogram = None
     metrics_module._ttfb_histogram = None
     metrics_module._error_counter = None
+    metrics_module._sampling_attempts_counter = None
+    metrics_module._sampling_successes_counter = None
+    metrics_module._sampling_failures_counter = None
+    metrics_module._requirement_checks_counter = None
+    metrics_module._requirement_failures_counter = None
+    metrics_module._tool_calls_counter = None
     return provider
 
 
@@ -480,3 +486,46 @@ async def test_error_metrics_on_backend_failure(enable_metrics, metric_reader):
 
     assert error_count is not None, "Error counter should have been recorded"
     assert error_count == 1, f"Expected 1 error, got {error_count}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.ollama
+async def test_ollama_sampling_metrics_integration(enable_metrics, metric_reader):
+    """Test that sampling metrics are recorded through a full RejectionSamplingStrategy loop."""
+    from mellea.backends.ollama import OllamaModelBackend
+    from mellea.stdlib.components import Instruction
+    from mellea.stdlib.context import SimpleContext
+    from mellea.stdlib.sampling import RejectionSamplingStrategy
+    from mellea.telemetry import metrics as metrics_module
+
+    provider = _setup_metrics_provider(metrics_module, metric_reader)
+
+    backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name)  # type: ignore
+    strategy = RejectionSamplingStrategy(loop_budget=1)
+    ctx = SimpleContext()
+
+    result = await strategy.sample(
+        action=Instruction("Say hello"), context=ctx, backend=backend, requirements=None
+    )
+
+    # Yield to event loop so FIRE_AND_FORGET plugin tasks complete
+    await asyncio.sleep(0.05)
+    provider.force_flush()
+    metrics_data = metric_reader.get_metrics_data()
+
+    attempts = get_metric_value(
+        metrics_data,
+        "mellea.sampling.attempts",
+        {"strategy": "RejectionSamplingStrategy"},
+    )
+    assert attempts is not None, "Sampling attempts should be recorded"
+    assert attempts >= 1, f"Expected at least 1 attempt, got {attempts}"
+
+    # With no requirements and loop_budget=1 the loop always succeeds
+    successes = get_metric_value(
+        metrics_data,
+        "mellea.sampling.successes",
+        {"strategy": "RejectionSamplingStrategy"},
+    )
+    assert result.success
+    assert successes == 1, f"Expected 1 success, got {successes}"
diff --git a/test/telemetry/test_metrics_operational.py b/test/telemetry/test_metrics_operational.py
new file mode 100644
index 000000000..0d2187133
--- /dev/null
+++ b/test/telemetry/test_metrics_operational.py
@@ -0,0 +1,376 @@
+"""Tests for operational counter metrics (sampling, requirement, tool).
+
+Integration tests use InMemoryMetricReader to verify counter values and attributes.
+Unit tests verify no-op behaviour when metrics are disabled.
+"""
+
+import pytest
+
+try:
+    from opentelemetry.sdk.metrics import MeterProvider
+    from opentelemetry.sdk.metrics.export import InMemoryMetricReader
+
+    OTEL_AVAILABLE = True
+except ImportError:
+    OTEL_AVAILABLE = False
+
+pytestmark = pytest.mark.skipif(
+    not OTEL_AVAILABLE, reason="OpenTelemetry not installed"
+)
+
+
+@pytest.fixture
+def clean_metrics_env(monkeypatch):
+    """Enable metrics and reset all module state for each test."""
+    monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true")
+    monkeypatch.delenv("MELLEA_METRICS_CONSOLE", raising=False)
+
+    import importlib
+
+    import mellea.telemetry.metrics
+
+    importlib.reload(mellea.telemetry.metrics)
+    yield
+    importlib.reload(mellea.telemetry.metrics)
+
+
+def _setup_in_memory_provider(metrics_module):
+    """Wire an InMemoryMetricReader into the metrics module globals."""
+    reader = InMemoryMetricReader()
+    provider = MeterProvider(metric_readers=[reader])
+    metrics_module._meter_provider = provider
+    metrics_module._meter = provider.get_meter("mellea")
+    # Reset all operational counter globals so they bind to the new meter
+    metrics_module._sampling_attempts_counter = None
+    metrics_module._sampling_successes_counter = None
+    metrics_module._sampling_failures_counter = None
+    metrics_module._requirement_checks_counter = None
+    metrics_module._requirement_failures_counter = None
+    metrics_module._tool_calls_counter = None
+    return reader, provider
+
+
+def _data_points_for(metrics_data, metric_name):
+    """Gather, in export order, every data point of metrics named ``metric_name``."""
+    resource_metrics = [] if metrics_data is None else metrics_data.resource_metrics
+    return [
+        point
+        for resource_metric in resource_metrics
+        for scope_metric in resource_metric.scope_metrics
+        for metric in scope_metric.metrics
+        if metric.name == metric_name
+        for point in metric.data.data_points
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Sampling — attempts
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.integration
+def test_record_sampling_attempt_basic(clean_metrics_env):
+    """Sampling attempt counter records correct value and strategy attribute."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_sampling_attempt("RejectionSamplingStrategy")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.sampling.attempts")
+
+    assert len(dps) == 1
+    assert dps[0].value == 1
+    assert dict(dps[0].attributes)["strategy"] == "RejectionSamplingStrategy"
+
+
+@pytest.mark.integration
+def test_record_sampling_attempt_accumulation(clean_metrics_env):
+    """Multiple attempts accumulate correctly."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    for _ in range(3):
+        m.record_sampling_attempt("RejectionSamplingStrategy")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.sampling.attempts")
+
+    assert len(dps) == 1
+    assert dps[0].value == 3
+
+
+@pytest.mark.integration
+def test_record_sampling_attempt_multiple_strategies(clean_metrics_env):
+    """Different strategies are tracked as separate attribute sets."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_sampling_attempt("RejectionSamplingStrategy")
+    m.record_sampling_attempt("MultiTurnStrategy")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.sampling.attempts")
+
+    assert len(dps) == 2
+    strategies = {dict(dp.attributes)["strategy"] for dp in dps}
+    assert strategies == {"RejectionSamplingStrategy", "MultiTurnStrategy"}
+
+
+# ---------------------------------------------------------------------------
+# Sampling — outcomes
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.integration
+def test_record_sampling_outcome_success(clean_metrics_env):
+    """A successful outcome increments successes and leaves failures untouched."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_sampling_outcome("RejectionSamplingStrategy", success=True)
+
+    provider.force_flush()
+    # Snapshot once: every get_metrics_data() call runs a new collection, so
+    # both counters must be read from the same MetricsData to be comparable.
+    metrics_data = reader.get_metrics_data()
+
+    success_dps = _data_points_for(metrics_data, "mellea.sampling.successes")
+    failure_dps = _data_points_for(metrics_data, "mellea.sampling.failures")
+
+    assert len(success_dps) == 1
+    assert success_dps[0].value == 1
+    assert len(failure_dps) == 0
+
+
+@pytest.mark.integration
+def test_record_sampling_outcome_failure(clean_metrics_env):
+    """A failed outcome increments failures and leaves successes untouched."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_sampling_outcome("RejectionSamplingStrategy", success=False)
+
+    provider.force_flush()
+    # Snapshot once: every get_metrics_data() call runs a new collection, so
+    # both counters must be read from the same MetricsData to be comparable.
+    metrics_data = reader.get_metrics_data()
+
+    success_dps = _data_points_for(metrics_data, "mellea.sampling.successes")
+    failure_dps = _data_points_for(metrics_data, "mellea.sampling.failures")
+
+    assert len(success_dps) == 0
+    assert len(failure_dps) == 1
+    assert failure_dps[0].value == 1
+
+
+# ---------------------------------------------------------------------------
+# Requirement checks
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.integration
+def test_record_requirement_check_basic(clean_metrics_env):
+    """Requirement check counter records correct value and requirement attribute."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_requirement_check("LLMaJRequirement")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.checks")
+
+    assert len(dps) == 1
+    assert dps[0].value == 1
+    assert dict(dps[0].attributes)["requirement"] == "LLMaJRequirement"
+
+
+@pytest.mark.integration
+def test_record_requirement_check_multiple_types(clean_metrics_env):
+    """Different requirement types are tracked separately."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_requirement_check("LLMaJRequirement")
+    m.record_requirement_check("PythonExecutionReq")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.checks")
+
+    assert len(dps) == 2
+    req_names = {dict(dp.attributes)["requirement"] for dp in dps}
+    assert req_names == {"LLMaJRequirement", "PythonExecutionReq"}
+
+
+@pytest.mark.integration
+def test_record_requirement_failure_attributes(clean_metrics_env):
+    """Requirement failure counter records requirement and reason attributes."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_requirement_failure(
+        "LLMaJRequirement", "Output did not satisfy constraint"
+    )
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.failures")
+
+    assert len(dps) == 1
+    attrs = dict(dps[0].attributes)
+    assert attrs["requirement"] == "LLMaJRequirement"
+    assert attrs["reason"] == "Output did not satisfy constraint"
+
+
+@pytest.mark.integration
+def test_record_requirement_failure_accumulation(clean_metrics_env):
+    """Multiple failures with the same attributes accumulate correctly."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_requirement_failure("LLMaJRequirement", "unknown")
+    m.record_requirement_failure("LLMaJRequirement", "unknown")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.failures")
+
+    assert len(dps) == 1
+    assert dps[0].value == 2
+
+
+# ---------------------------------------------------------------------------
+# Tool calls
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.integration
+def test_record_tool_call_success(clean_metrics_env):
+    """Tool call counter records name and success status."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_tool_call("search", "success")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.tool.calls")
+
+    assert len(dps) == 1
+    attrs = dict(dps[0].attributes)
+    assert attrs["tool"] == "search"
+    assert attrs["status"] == "success"
+
+
+@pytest.mark.integration
+def test_record_tool_call_failure(clean_metrics_env):
+    """Tool call counter records failure status separately from success."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_tool_call("search", "success")
+    m.record_tool_call("search", "failure")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.tool.calls")
+
+    assert len(dps) == 2
+    statuses = {dict(dp.attributes)["status"] for dp in dps}
+    assert statuses == {"success", "failure"}
+
+
+@pytest.mark.integration
+def test_record_tool_call_multiple_tools(clean_metrics_env):
+    """Different tool names are tracked as separate attribute sets."""
+    from mellea.telemetry import metrics as m
+
+    reader, provider = _setup_in_memory_provider(m)
+
+    m.record_tool_call("search", "success")
+    m.record_tool_call("calculator", "success")
+
+    provider.force_flush()
+    dps = _data_points_for(reader.get_metrics_data(), "mellea.tool.calls")
+
+    assert len(dps) == 2
+    tools = {dict(dp.attributes)["tool"] for dp in dps}
+    assert tools == {"search", "calculator"}
+
+
+# ---------------------------------------------------------------------------
+# Unit: no-op when metrics disabled
+# ---------------------------------------------------------------------------
+
+
+def test_record_sampling_attempt_noop_when_disabled(monkeypatch):
+    """record_sampling_attempt is a no-op when metrics are disabled."""
+    import importlib
+
+    import mellea.telemetry.metrics as m
+
+    importlib.reload(m)
+    monkeypatch.setattr(m, "_METRICS_ENABLED", False)
+
+    # Should not raise and should not create any counter
+    m.record_sampling_attempt("RejectionSamplingStrategy")
+    assert m._sampling_attempts_counter is None
+
+
+def test_record_sampling_outcome_noop_when_disabled(monkeypatch):
+    """record_sampling_outcome is a no-op when metrics are disabled."""
+    import importlib
+
+    import mellea.telemetry.metrics as m
+
+    importlib.reload(m)
+    monkeypatch.setattr(m, "_METRICS_ENABLED", False)
+
+    m.record_sampling_outcome("RejectionSamplingStrategy", success=True)
+    assert m._sampling_successes_counter is None
+
+
+def test_record_requirement_check_noop_when_disabled(monkeypatch):
+    """record_requirement_check is a no-op when metrics are disabled."""
+    import importlib
+
+    import mellea.telemetry.metrics as m
+
+    importlib.reload(m)
+    monkeypatch.setattr(m, "_METRICS_ENABLED", False)
+
+    m.record_requirement_check("LLMaJRequirement")
+    assert m._requirement_checks_counter is None
+
+
+def test_record_requirement_failure_noop_when_disabled(monkeypatch):
+    """record_requirement_failure is a no-op when metrics are disabled."""
+    import importlib
+
+    import mellea.telemetry.metrics as m
+
+    importlib.reload(m)
+    monkeypatch.setattr(m, "_METRICS_ENABLED", False)
+
+    m.record_requirement_failure("LLMaJRequirement", "reason")
+    assert m._requirement_failures_counter is None
+
+
+def test_record_tool_call_noop_when_disabled(monkeypatch):
+    """record_tool_call is a no-op when metrics are disabled."""
+    import importlib
+
+    import mellea.telemetry.metrics as m
+
+    importlib.reload(m)
+    monkeypatch.setattr(m, "_METRICS_ENABLED", False)
+
+    m.record_tool_call("search", "success")
+    assert m._tool_calls_counter is None
diff --git a/test/telemetry/test_metrics_plugins.py b/test/telemetry/test_metrics_plugins.py
index d78feea9d..aee8e0a2a 100644
--- a/test/telemetry/test_metrics_plugins.py
+++ b/test/telemetry/test_metrics_plugins.py
@@ -1,4 +1,4 @@
-"""Unit tests for TokenMetricsPlugin, LatencyMetricsPlugin, and ErrorMetricsPlugin."""
+"""Unit tests for metrics plugins."""
 
 from unittest.mock import patch
 
@@ -11,6 +11,12 @@
     GenerationErrorPayload,
     GenerationPostCallPayload,
 )
+from mellea.plugins.hooks.sampling import (
+    SamplingIterationPayload,
+    SamplingLoopEndPayload,
+)
+from mellea.plugins.hooks.tool import ToolPostInvokePayload
+from mellea.plugins.hooks.validation import ValidationPostCheckPayload
 from mellea.telemetry.metrics import (
     ERROR_TYPE_TIMEOUT,
     ERROR_TYPE_TRANSPORT_ERROR,
@@ -19,7 +25,10 @@
 from mellea.telemetry.metrics_plugins import (
     ErrorMetricsPlugin,
     LatencyMetricsPlugin,
+    RequirementMetricsPlugin,
+    SamplingMetricsPlugin,
     TokenMetricsPlugin,
+    ToolMetricsPlugin,
 )
 
 
@@ -267,3 +276,228 @@ async def test_error_plugin_handles_none_model_output(error_plugin):
             provider="unknown",
             exception_class="RuntimeError",
         )
+
+
+# SamplingMetricsPlugin tests
+
+
+class _PassResult:
+    reason = None
+
+    def __bool__(self) -> bool:
+        return True
+
+
+class _FailResult:
+    def __init__(self, reason: str = "constraint not met") -> None:
+        self.reason = reason
+
+    def __bool__(self) -> bool:
+        return False
+
+
+class _FakeReq:
+    pass
+
+
+@pytest.fixture
+def sampling_plugin():
+    return SamplingMetricsPlugin()
+
+
+@pytest.mark.asyncio
+async def test_sampling_plugin_records_attempt(sampling_plugin):
+    """Plugin calls record_sampling_attempt with the strategy name on each iteration."""
+    payload = SamplingIterationPayload(strategy_name="RejectionSamplingStrategy")
+
+    with patch("mellea.telemetry.metrics.record_sampling_attempt") as mock_record:
+        await sampling_plugin.record_sampling_attempt(payload, {})
+
+        mock_record.assert_called_once_with("RejectionSamplingStrategy")
+
+
+@pytest.mark.asyncio
+async def test_sampling_plugin_attempt_empty_name_falls_back_to_unknown(
+    sampling_plugin,
+):
+    """Empty strategy_name falls back to 'unknown'."""
+    payload = SamplingIterationPayload(strategy_name="")
+
+    with patch("mellea.telemetry.metrics.record_sampling_attempt") as mock_record:
+        await sampling_plugin.record_sampling_attempt(payload, {})
+
+        mock_record.assert_called_once_with("unknown")
+
+
+@pytest.mark.asyncio
+async def test_sampling_plugin_records_success_outcome(sampling_plugin):
+    """Plugin calls record_sampling_outcome(success=True) on a successful loop end."""
+    payload = SamplingLoopEndPayload(
+        strategy_name="RejectionSamplingStrategy", success=True
+    )
+
+    with patch("mellea.telemetry.metrics.record_sampling_outcome") as mock_record:
+        await sampling_plugin.record_sampling_outcome(payload, {})
+
+        mock_record.assert_called_once_with("RejectionSamplingStrategy", True)
+
+
+@pytest.mark.asyncio
+async def test_sampling_plugin_records_failure_outcome(sampling_plugin):
+    """Plugin calls record_sampling_outcome(success=False) on a failed loop end."""
+    payload = SamplingLoopEndPayload(strategy_name="MultiTurnStrategy", success=False)
+
+    with patch("mellea.telemetry.metrics.record_sampling_outcome") as mock_record:
+        await sampling_plugin.record_sampling_outcome(payload, {})
+
+        mock_record.assert_called_once_with("MultiTurnStrategy", False)
+
+
+# RequirementMetricsPlugin tests
+
+
+@pytest.fixture
+def requirement_plugin():
+    return RequirementMetricsPlugin()
+
+
+@pytest.mark.asyncio
+async def test_requirement_plugin_records_checks_and_no_failures_when_all_pass(
+    requirement_plugin,
+):
+    """When all requirements pass, only checks are recorded."""
+    req = _FakeReq()
+    payload = ValidationPostCheckPayload(
+        requirements=[req],
+        results=[_PassResult()],
+        all_validations_passed=True,
+        passed_count=1,
+        failed_count=0,
+    )
+
+    with (
+        patch("mellea.telemetry.metrics.record_requirement_check") as mock_check,
+        patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail,
+    ):
+        await requirement_plugin.record_requirement_metrics(payload, {})
+
+        mock_check.assert_called_once_with("_FakeReq")
+        mock_fail.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_requirement_plugin_records_failure_with_reason(requirement_plugin):
+    """Failed requirements record both a check and a failure with the reason."""
+    req = _FakeReq()
+    payload = ValidationPostCheckPayload(
+        requirements=[req],
+        results=[_FailResult("output too short")],
+        all_validations_passed=False,
+        passed_count=0,
+        failed_count=1,
+    )
+
+    with (
+        patch("mellea.telemetry.metrics.record_requirement_check") as mock_check,
+        patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail,
+    ):
+        await requirement_plugin.record_requirement_metrics(payload, {})
+
+        mock_check.assert_called_once_with("_FakeReq")
+        mock_fail.assert_called_once_with("_FakeReq", "output too short")
+
+
+@pytest.mark.asyncio
+async def test_requirement_plugin_mixed_pass_fail(requirement_plugin):
+    """Mixed results record a check for each and a failure only for failing ones."""
+    req_a = _FakeReq()
+    req_b = _FakeReq()
+    payload = ValidationPostCheckPayload(
+        requirements=[req_a, req_b],
+        results=[_PassResult(), _FailResult("constraint not met")],
+        all_validations_passed=False,
+        passed_count=1,
+        failed_count=1,
+    )
+
+    with (
+        patch("mellea.telemetry.metrics.record_requirement_check") as mock_check,
+        patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail,
+    ):
+        await requirement_plugin.record_requirement_metrics(payload, {})
+
+        assert mock_check.call_count == 2
+        mock_fail.assert_called_once_with("_FakeReq", "constraint not met")
+
+
+@pytest.mark.asyncio
+async def test_requirement_plugin_failure_with_no_reason_falls_back_to_unknown(
+    requirement_plugin,
+):
+    """A None reason falls back to 'unknown'."""
+    req = _FakeReq()
+    payload = ValidationPostCheckPayload(
+        requirements=[req],
+        results=[_FailResult(reason=None)],  # type: ignore[arg-type]
+        all_validations_passed=False,
+        passed_count=0,
+        failed_count=1,
+    )
+
+    with (
+        patch("mellea.telemetry.metrics.record_requirement_check"),
+        patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail,
+    ):
+        await requirement_plugin.record_requirement_metrics(payload, {})
+
+        mock_fail.assert_called_once_with("_FakeReq", "unknown")
+
+
+# ToolMetricsPlugin tests
+
+
+class _MockToolCall:
+    def __init__(self, name: str) -> None:
+        self.name = name
+
+
+@pytest.fixture
+def tool_plugin():
+    return ToolMetricsPlugin()
+
+
+@pytest.mark.asyncio
+async def test_tool_plugin_records_success(tool_plugin):
+    """Successful tool calls are recorded with status='success'."""
+    payload = ToolPostInvokePayload(
+        model_tool_call=_MockToolCall("search"), success=True
+    )
+
+    with patch("mellea.telemetry.metrics.record_tool_call") as mock_record:
+        await tool_plugin.record_tool_call(payload, {})
+
+        mock_record.assert_called_once_with("search", "success")
+
+
+@pytest.mark.asyncio
+async def test_tool_plugin_records_failure(tool_plugin):
+    """Failed tool calls are recorded with status='failure'."""
+    payload = ToolPostInvokePayload(
+        model_tool_call=_MockToolCall("calculator"), success=False
+    )
+
+    with patch("mellea.telemetry.metrics.record_tool_call") as mock_record:
+        await tool_plugin.record_tool_call(payload, {})
+
+        mock_record.assert_called_once_with("calculator", "failure")
+
+
+@pytest.mark.asyncio
+async def test_tool_plugin_none_tool_call_falls_back_to_unknown(tool_plugin):
+    """A None model_tool_call falls back to tool name 'unknown'."""
+    payload = ToolPostInvokePayload(model_tool_call=None, success=True)
+
+    with patch("mellea.telemetry.metrics.record_tool_call") as mock_record:
+        await tool_plugin.record_tool_call(payload, {})
+
+        mock_record.assert_called_once_with("unknown", "success")

From a5827b7f543824773fdf071e155f7e7ee7d2386d Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Fri, 17 Apr 2026 18:19:46 -0500
Subject: [PATCH 2/3] test: shut down MeterProvider after exporter tests

PeriodicExportingMetricReader background threads (60 s default) would
fire after pytest closed stdout once the suite crossed the 60 s mark,
causing "I/O operation on closed file" and OTLP UNAVAILABLE errors.

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 test/telemetry/test_metrics.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py
index e5a9360ad..e71a378fe 100644
--- a/test/telemetry/test_metrics.py
+++ b/test/telemetry/test_metrics.py
@@ -65,6 +65,20 @@ def enable_metrics(monkeypatch):
     importlib.reload(mellea.telemetry.metrics)
 
 
+@pytest.fixture
+def shutdown_meter_provider():
+    """Shut down the MeterProvider after tests that reload with real exporters enabled.
+
+    Prevents PeriodicExportingMetricReader background threads from firing after
+    pytest closes stdout (60 s default interval).
+    """
+    yield
+    import mellea.telemetry.metrics as _m
+
+    if _m._meter_provider is not None:
+        _m._meter_provider.shutdown()
+
+
 # Configuration Tests
 
 
@@ -364,7 +378,7 @@ def test_default_service_name(enable_metrics):
 # Console Exporter Tests
 
 
-def test_console_exporter_enabled(monkeypatch):
+def test_console_exporter_enabled(monkeypatch, shutdown_meter_provider):
     """Test that console exporter can be enabled."""
     monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true")
     monkeypatch.setenv("MELLEA_METRICS_CONSOLE", "true")
@@ -390,7 +404,7 @@ def test_console_exporter_disabled_by_default(enable_metrics):
 # OTLP Exporter Tests
 
 
-def test_otlp_explicit_enablement(monkeypatch):
+def test_otlp_explicit_enablement(monkeypatch, shutdown_meter_provider):
     """Test that OTLP exporter requires explicit enablement via MELLEA_METRICS_OTLP."""
     monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true")
     monkeypatch.setenv("MELLEA_METRICS_OTLP", "true")
@@ -521,7 +535,7 @@ def test_prometheus_exporter_import_error_warning(monkeypatch):
         sys.modules.update(original_modules)
 
 
-def test_prometheus_and_otlp_exporters_together(monkeypatch):
+def test_prometheus_and_otlp_exporters_together(monkeypatch, shutdown_meter_provider):
     """Test that Prometheus and OTLP exporters can run simultaneously."""
     monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true")
     monkeypatch.setenv("MELLEA_METRICS_PROMETHEUS", "true")
@@ -547,7 +561,9 @@ def test_prometheus_exporter_disabled_by_default(enable_metrics):
     assert _METRICS_PROMETHEUS is False
 
 
-def test_prometheus_exporter_with_console_exporter(monkeypatch):
+def test_prometheus_exporter_with_console_exporter(
+    monkeypatch, shutdown_meter_provider
+):
     """Test that Prometheus works alongside console exporter."""
     monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true")
     monkeypatch.setenv("MELLEA_METRICS_PROMETHEUS", "true")

From e0b49290dbc139339feeea02bf489fff85194cf3 Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Tue, 21 Apr 2026 14:53:52 -0500
Subject: [PATCH 3/3] fix: use bounded reason for requirement.failures metric

Replace unbounded ValidationResult.reason (which can be raw LLM output)
with a bounded value: rule-based requirements pass their result.reason
through directly; model-based requirements (no validation_fn) emit
"LLM judgment" to prevent metric cardinality explosion.

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 mellea/telemetry/metrics_plugins.py    | 6 +++++-
 test/telemetry/test_metrics_plugins.py | 9 +++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py
index 3d71d783b..1639c5f2b 100644
--- a/mellea/telemetry/metrics_plugins.py
+++ b/mellea/telemetry/metrics_plugins.py
@@ -198,7 +198,11 @@ async def record_requirement_metrics(
             req_name = type(req).__name__
             record_requirement_check(req_name)
             if not bool(result):
-                reason = getattr(result, "reason", None) or "unknown"
+                reason = (
+                    getattr(result, "reason", None)
+                    if req.validation_fn is not None
+                    else None
+                ) or "LLM judgment"
                 record_requirement_failure(req_name, reason)
 
 
diff --git a/test/telemetry/test_metrics_plugins.py b/test/telemetry/test_metrics_plugins.py
index aee8e0a2a..1ae691464 100644
--- a/test/telemetry/test_metrics_plugins.py
+++ b/test/telemetry/test_metrics_plugins.py
@@ -297,7 +297,8 @@ def __bool__(self) -> bool:
 
 
 class _FakeReq:
-    pass
+    def validation_fn(self, _ctx):
+        return None
 
 
 @pytest.fixture
@@ -431,10 +432,10 @@ async def test_requirement_plugin_mixed_pass_fail(requirement_plugin):
 
 
 @pytest.mark.asyncio
-async def test_requirement_plugin_failure_with_no_reason_falls_back_to_unknown(
+async def test_requirement_plugin_failure_with_no_reason_uses_default(
     requirement_plugin,
 ):
-    """A None reason falls back to 'unknown'."""
+    """A None reason falls back to the default reason."""
     req = _FakeReq()
     payload = ValidationPostCheckPayload(
         requirements=[req],
@@ -450,7 +451,7 @@ async def test_requirement_plugin_failure_with_no_reason_falls_back_to_unknown(
     ):
         await requirement_plugin.record_requirement_metrics(payload, {})
 
-        mock_fail.assert_called_once_with("_FakeReq", "unknown")
+        mock_fail.assert_called_once_with("_FakeReq", "LLM judgment")
 
 
 # ToolMetricsPlugin tests