From b743bf10a1366c3e65a135343a6f831808f0f741 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 17 Apr 2026 17:48:58 -0500 Subject: [PATCH 1/3] feat: add operational counters for sampling, requirements, and tools (#467) Adds six new OpenTelemetry counters giving operators visibility into retry behaviour, validation failure rates, and tool call health: mellea.sampling.attempts/successes/failures, mellea.requirement.checks/failures, and mellea.tool.calls. Follows the established lazy-init globals + record_* helpers + Plugin hooks pattern. Extends SamplingIterationPayload and SamplingLoopEndPayload with a strategy_name field so plugins can tag counters by strategy class. Assisted-by: Claude Code Signed-off-by: Alex Bozarth --- .../evaluation-and-observability/metrics.md | 43 ++ .../evaluation-and-observability/telemetry.md | 5 + mellea/plugins/hooks/sampling.py | 4 + mellea/stdlib/sampling/base.py | 3 + mellea/telemetry/__init__.py | 10 + mellea/telemetry/metrics.py | 177 +++++++++ mellea/telemetry/metrics_plugins.py | 111 +++++- test/telemetry/test_metrics.py | 67 ++++ test/telemetry/test_metrics_backend.py | 49 +++ test/telemetry/test_metrics_operational.py | 376 ++++++++++++++++++ test/telemetry/test_metrics_plugins.py | 236 ++++++++++- 11 files changed, 1079 insertions(+), 2 deletions(-) create mode 100644 test/telemetry/test_metrics_operational.py diff --git a/docs/docs/evaluation-and-observability/metrics.md b/docs/docs/evaluation-and-observability/metrics.md index ac87d3661..f6e198f23 100644 --- a/docs/docs/evaluation-and-observability/metrics.md +++ b/docs/docs/evaluation-and-observability/metrics.md @@ -173,6 +173,49 @@ Error metrics are recorded when a backend raises an exception during generation, after the request has been dispatched to the provider. Construction-time errors (e.g. missing API key) are not captured by the error counter. +## Operational metrics + +Mellea records metrics for its internal sampling, validation, and tool execution +loops. 
These counters give visibility into retry behavior, validation failure +rates, and tool call health — independent of the underlying LLM provider. + +### Sampling counters + +| Metric Name | Type | Unit | Description | +| ----------- | ---- | ---- | ----------- | +| `mellea.sampling.attempts` | Counter | `{attempt}` | Sampling attempts (one per loop iteration) | +| `mellea.sampling.successes` | Counter | `{sample}` | Sampling loops that produced a passing sample | +| `mellea.sampling.failures` | Counter | `{failure}` | Sampling loops that exhausted the loop budget without success | + +All sampling metrics include: + +| Attribute | Description | Example Values | +| --------- | ----------- | -------------- | +| `strategy` | Sampling strategy class name | `RejectionSamplingStrategy`, `MultiTurnStrategy`, `RepairTemplateStrategy` | + +### Requirement counters + +| Metric Name | Type | Unit | Description | +| ----------- | ---- | ---- | ----------- | +| `mellea.requirement.checks` | Counter | `{check}` | Requirement validation checks performed | +| `mellea.requirement.failures` | Counter | `{failure}` | Requirement validation checks that failed | + +| Attribute | Description | Example Values | +| --------- | ----------- | -------------- | +| `requirement` | Requirement class name | `LLMaJRequirement`, `PythonExecutionReq`, `ALoraRequirement`, `GuardianCheck` | +| `reason` | Free-form failure reason from `ValidationResult.reason`, or `"unknown"` when none is given (`mellea.requirement.failures` only) | `"Output did not satisfy constraint"`, `"unknown"` | + +### Tool counter + +| Metric Name | Type | Unit | Description | +| ----------- | ---- | ---- | ----------- | +| `mellea.tool.calls` | Counter | `{call}` | Tool invocations by name and status | + +| Attribute | Description | Example Values | +| --------- | ----------- | -------------- | +| `tool` | Name of the invoked tool | `"search"`, `"calculator"` | +| `status` | Execution outcome | `"success"`, `"failure"` | + ## Metrics export configuration Mellea supports multiple metrics exporters 
that can be used independently or diff --git a/docs/docs/evaluation-and-observability/telemetry.md b/docs/docs/evaluation-and-observability/telemetry.md index 4e592f2cb..ee19e1076 100644 --- a/docs/docs/evaluation-and-observability/telemetry.md +++ b/docs/docs/evaluation-and-observability/telemetry.md @@ -125,6 +125,11 @@ OpenTelemetry. No code changes are required: `mellea.llm.ttfb` (streaming requests only). - **Error counter** — `mellea.llm.errors` on each failed backend call, classified by semantic error type. +- **Sampling counters** — `mellea.sampling.attempts`, `mellea.sampling.successes`, + and `mellea.sampling.failures` per strategy. +- **Requirement counters** — `mellea.requirement.checks` and + `mellea.requirement.failures` per requirement type. +- **Tool counter** — `mellea.tool.calls` by tool name and status. The metrics API also exposes `create_counter`, `create_histogram`, and `create_up_down_counter` for instrumenting your own application code. diff --git a/mellea/plugins/hooks/sampling.py b/mellea/plugins/hooks/sampling.py index 4e0bceefe..8488e9745 100644 --- a/mellea/plugins/hooks/sampling.py +++ b/mellea/plugins/hooks/sampling.py @@ -31,6 +31,7 @@ class SamplingIterationPayload(MelleaBasePayload): """Payload for ``sampling_iteration`` — after each sampling attempt. Attributes: + strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``). iteration: 1-based iteration number within the sampling loop. action: The ``Component`` used for this attempt. @@ -42,6 +43,7 @@ class SamplingIterationPayload(MelleaBasePayload): total_count: Total number of requirements evaluated. """ + strategy_name: str = "" iteration: int = 0 action: Any = None result: Any = None @@ -78,6 +80,7 @@ class SamplingLoopEndPayload(MelleaBasePayload): """Payload for ``sampling_loop_end`` — when sampling completes. Attributes: + strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``). 
success: ``True`` if at least one attempt passed all requirements. iterations_used: Total number of iterations the loop executed. final_result: The selected ``ModelOutputThunk`` (best success or best failure). @@ -91,6 +94,7 @@ class SamplingLoopEndPayload(MelleaBasePayload): ``(Requirement, ValidationResult)`` tuples for iteration *i*. """ + strategy_name: str = "" success: bool = False iterations_used: int = 0 final_result: Any = None diff --git a/mellea/stdlib/sampling/base.py b/mellea/stdlib/sampling/base.py index b843028cb..67a116d00 100644 --- a/mellea/stdlib/sampling/base.py +++ b/mellea/stdlib/sampling/base.py @@ -247,6 +247,7 @@ async def sample( from ...plugins.hooks.sampling import SamplingIterationPayload iter_payload = SamplingIterationPayload( + strategy_name=type(self).__name__, iteration=loop_count, action=next_action, result=result, @@ -272,6 +273,7 @@ async def sample( from ...plugins.hooks.sampling import SamplingLoopEndPayload end_payload = SamplingLoopEndPayload( + strategy_name=type(self).__name__, success=True, iterations_used=loop_count, final_result=result, @@ -362,6 +364,7 @@ async def sample( sample_contexts[best_failed_index] if sample_contexts else context ) end_payload = SamplingLoopEndPayload( + strategy_name=type(self).__name__, success=False, iterations_used=loop_count, final_result=sampled_results[best_failed_index], diff --git a/mellea/telemetry/__init__.py b/mellea/telemetry/__init__.py index 288e404db..62449b892 100644 --- a/mellea/telemetry/__init__.py +++ b/mellea/telemetry/__init__.py @@ -79,7 +79,12 @@ def my_function(): create_up_down_counter, is_metrics_enabled, record_request_duration, + record_requirement_check, + record_requirement_failure, + record_sampling_attempt, + record_sampling_outcome, record_token_usage_metrics, + record_tool_call, record_ttfb, ) from .tracing import ( @@ -111,7 +116,12 @@ def my_function(): "is_backend_tracing_enabled", "is_metrics_enabled", "record_request_duration", + 
"record_requirement_check", + "record_requirement_failure", + "record_sampling_attempt", + "record_sampling_outcome", "record_token_usage_metrics", + "record_tool_call", "record_ttfb", "set_span_attribute", "set_span_error", diff --git a/mellea/telemetry/metrics.py b/mellea/telemetry/metrics.py index eeb4f2ab0..199d3c054 100644 --- a/mellea/telemetry/metrics.py +++ b/mellea/telemetry/metrics.py @@ -54,6 +54,9 @@ - Token counters: mellea.llm.tokens.input, mellea.llm.tokens.output (unit: tokens) - Latency histograms: mellea.llm.request.duration (unit: s), mellea.llm.ttfb (unit: s, streaming only) - Error counter: mellea.llm.errors (unit: {error}), categorized by semantic error type +- Sampling counters: mellea.sampling.attempts, mellea.sampling.successes, mellea.sampling.failures (unit: {attempt}/{sample}/{failure}) +- Requirement counters: mellea.requirement.checks (unit: {check}), mellea.requirement.failures (unit: {failure}) +- Tool counter: mellea.tool.calls (unit: {call}), tagged by tool name and status Programmatic usage: from mellea.telemetry.metrics import create_counter, create_histogram @@ -719,6 +722,175 @@ def record_error( ) +_sampling_attempts_counter: Any = None +_sampling_successes_counter: Any = None +_sampling_failures_counter: Any = None + + +def _get_sampling_attempts_counter() -> Any: + """Get or create the sampling attempts counter (internal use only).""" + global _sampling_attempts_counter + + if _sampling_attempts_counter is None: + _sampling_attempts_counter = create_counter( + "mellea.sampling.attempts", + description="Total number of sampling attempts per strategy", + unit="{attempt}", + ) + return _sampling_attempts_counter + + +def _get_sampling_successes_counter() -> Any: + """Get or create the sampling successes counter (internal use only).""" + global _sampling_successes_counter + + if _sampling_successes_counter is None: + _sampling_successes_counter = create_counter( + "mellea.sampling.successes", + description="Total number of 
successful sampling loops per strategy", + unit="{sample}", + ) + return _sampling_successes_counter + + +def _get_sampling_failures_counter() -> Any: + """Get or create the sampling failures counter (internal use only).""" + global _sampling_failures_counter + + if _sampling_failures_counter is None: + _sampling_failures_counter = create_counter( + "mellea.sampling.failures", + description="Total number of failed sampling loops (budget exhausted) per strategy", + unit="{failure}", + ) + return _sampling_failures_counter + + +def record_sampling_attempt(strategy: str) -> None: + """Record one sampling attempt for the given strategy. + + This is a no-op when metrics are disabled, ensuring zero overhead. + + Args: + strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``). + """ + if not _METRICS_ENABLED: + return + + _get_sampling_attempts_counter().add(1, {"strategy": strategy}) + + +def record_sampling_outcome(strategy: str, success: bool) -> None: + """Record the final outcome (success or failure) of a sampling loop. + + This is a no-op when metrics are disabled, ensuring zero overhead. + + Args: + strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``). + success: ``True`` if at least one attempt passed all requirements. 
+ """ + if not _METRICS_ENABLED: + return + + if success: + _get_sampling_successes_counter().add(1, {"strategy": strategy}) + else: + _get_sampling_failures_counter().add(1, {"strategy": strategy}) + + +_requirement_checks_counter: Any = None +_requirement_failures_counter: Any = None + + +def _get_requirement_checks_counter() -> Any: + """Get or create the requirement checks counter (internal use only).""" + global _requirement_checks_counter + + if _requirement_checks_counter is None: + _requirement_checks_counter = create_counter( + "mellea.requirement.checks", + description="Total number of requirement validation checks", + unit="{check}", + ) + return _requirement_checks_counter + + +def _get_requirement_failures_counter() -> Any: + """Get or create the requirement failures counter (internal use only).""" + global _requirement_failures_counter + + if _requirement_failures_counter is None: + _requirement_failures_counter = create_counter( + "mellea.requirement.failures", + description="Total number of requirement validation failures", + unit="{failure}", + ) + return _requirement_failures_counter + + +def record_requirement_check(requirement: str) -> None: + """Record one requirement validation check. + + This is a no-op when metrics are disabled, ensuring zero overhead. + + Args: + requirement: Requirement class name (e.g. ``"LLMaJRequirement"``). + """ + if not _METRICS_ENABLED: + return + + _get_requirement_checks_counter().add(1, {"requirement": requirement}) + + +def record_requirement_failure(requirement: str, reason: str) -> None: + """Record one requirement validation failure. + + This is a no-op when metrics are disabled, ensuring zero overhead. + + Args: + requirement: Requirement class name (e.g. ``"LLMaJRequirement"``). + reason: Human-readable failure reason from ``ValidationResult.reason``. 
+ """ + if not _METRICS_ENABLED: + return + + _get_requirement_failures_counter().add( + 1, {"requirement": requirement, "reason": reason} + ) + + +_tool_calls_counter: Any = None + + +def _get_tool_calls_counter() -> Any: + """Get or create the tool calls counter (internal use only).""" + global _tool_calls_counter + + if _tool_calls_counter is None: + _tool_calls_counter = create_counter( + "mellea.tool.calls", + description="Total number of tool invocations by name and status", + unit="{call}", + ) + return _tool_calls_counter + + +def record_tool_call(tool: str, status: str) -> None: + """Record one tool invocation. + + This is a no-op when metrics are disabled, ensuring zero overhead. + + Args: + tool: Name of the tool that was invoked. + status: ``"success"`` if the tool executed without error, ``"failure"`` otherwise. + """ + if not _METRICS_ENABLED: + return + + counter = _get_tool_calls_counter() + counter.add(1, {"tool": tool, "status": status}) + + __all__ = [ "classify_error", "create_counter", @@ -727,6 +899,11 @@ def record_error( "is_metrics_enabled", "record_error", "record_request_duration", + "record_requirement_check", + "record_requirement_failure", + "record_sampling_attempt", + "record_sampling_outcome", "record_token_usage_metrics", + "record_tool_call", "record_ttfb", ] diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py index 1c0561f58..3d71d783b 100644 --- a/mellea/telemetry/metrics_plugins.py +++ b/mellea/telemetry/metrics_plugins.py @@ -6,6 +6,9 @@ - TokenMetricsPlugin: Records token usage statistics from ModelOutputThunk.usage - LatencyMetricsPlugin: Records request duration and TTFB latency histograms - ErrorMetricsPlugin: Records LLM error counts categorized by semantic error type +- SamplingMetricsPlugin: Records sampling attempt/success/failure counts per strategy +- RequirementMetricsPlugin: Records requirement validation check and failure counts +- ToolMetricsPlugin: Records tool invocation 
counts by name and status """ from __future__ import annotations @@ -21,6 +24,12 @@ GenerationErrorPayload, GenerationPostCallPayload, ) + from mellea.plugins.hooks.sampling import ( + SamplingIterationPayload, + SamplingLoopEndPayload, + ) + from mellea.plugins.hooks.tool import ToolPostInvokePayload + from mellea.plugins.hooks.validation import ValidationPostCheckPayload class TokenMetricsPlugin(Plugin, name="token_metrics", priority=50): @@ -127,5 +136,105 @@ async def record_error_metrics( ) +class SamplingMetricsPlugin(Plugin, name="sampling_metrics", priority=54): + """Records sampling loop attempt and outcome metrics. + + Hooks into ``sampling_iteration`` to count attempts per strategy and + ``sampling_loop_end`` to count successes and failures. + """ + + @hook("sampling_iteration", mode=PluginMode.FIRE_AND_FORGET) + async def record_sampling_attempt( + self, payload: SamplingIterationPayload, context: dict[str, Any] + ) -> None: + """Record one sampling attempt after each iteration. + + Args: + payload: Contains strategy_name and iteration metadata. + context: Plugin context (unused). + """ + from mellea.telemetry.metrics import record_sampling_attempt + + record_sampling_attempt(payload.strategy_name or "unknown") + + @hook("sampling_loop_end", mode=PluginMode.FIRE_AND_FORGET) + async def record_sampling_outcome( + self, payload: SamplingLoopEndPayload, context: dict[str, Any] + ) -> None: + """Record success or failure when the sampling loop ends. + + Args: + payload: Contains strategy_name and success flag. + context: Plugin context (unused). + """ + from mellea.telemetry.metrics import record_sampling_outcome + + record_sampling_outcome(payload.strategy_name or "unknown", payload.success) + + +class RequirementMetricsPlugin(Plugin, name="requirement_metrics", priority=55): + """Records requirement validation check and failure metrics. 
+ + Hooks into ``validation_post_check`` to count checks and failures per + requirement type after each validation batch. + """ + + @hook("validation_post_check", mode=PluginMode.FIRE_AND_FORGET) + async def record_requirement_metrics( + self, payload: ValidationPostCheckPayload, context: dict[str, Any] + ) -> None: + """Record validation checks and failures for each requirement. + + Args: + payload: Contains requirements list and corresponding results. + context: Plugin context (unused). + """ + from mellea.telemetry.metrics import ( + record_requirement_check, + record_requirement_failure, + ) + + for req, result in zip(payload.requirements, payload.results): + req_name = type(req).__name__ + record_requirement_check(req_name) + if not bool(result): + reason = getattr(result, "reason", None) or "unknown" + record_requirement_failure(req_name, reason) + + +class ToolMetricsPlugin(Plugin, name="tool_metrics", priority=56): + """Records tool invocation metrics. + + Hooks into ``tool_post_invoke`` to count tool calls by name and success/failure status. + """ + + @hook("tool_post_invoke", mode=PluginMode.FIRE_AND_FORGET) + async def record_tool_call( + self, payload: ToolPostInvokePayload, context: dict[str, Any] + ) -> None: + """Record one tool invocation after it completes. + + Args: + payload: Contains model_tool_call (with name) and success flag. + context: Plugin context (unused). 
+ """ + from mellea.telemetry.metrics import record_tool_call + + tool_name = ( + payload.model_tool_call.name + if payload.model_tool_call is not None + else "unknown" + ) + status = "success" if payload.success else "failure" + record_tool_call(tool_name, status) + + # All metrics plugins to auto-register when metrics are enabled -_METRICS_PLUGIN_CLASSES = (TokenMetricsPlugin, LatencyMetricsPlugin, ErrorMetricsPlugin) +_METRICS_PLUGIN_CLASSES = ( + TokenMetricsPlugin, + LatencyMetricsPlugin, + ErrorMetricsPlugin, + SamplingMetricsPlugin, + RequirementMetricsPlugin, + ToolMetricsPlugin, +) diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py index d96d71687..e5a9360ad 100644 --- a/test/telemetry/test_metrics.py +++ b/test/telemetry/test_metrics.py @@ -574,6 +574,12 @@ def test_metric_instruments_lazy_initialization(enable_metrics): _duration_histogram, _input_token_counter, _output_token_counter, + _requirement_checks_counter, + _requirement_failures_counter, + _sampling_attempts_counter, + _sampling_failures_counter, + _sampling_successes_counter, + _tool_calls_counter, _ttfb_histogram, ) @@ -582,21 +588,44 @@ def test_metric_instruments_lazy_initialization(enable_metrics): assert _output_token_counter is None assert _duration_histogram is None assert _ttfb_histogram is None + assert _sampling_attempts_counter is None + assert _sampling_successes_counter is None + assert _sampling_failures_counter is None + assert _requirement_checks_counter is None + assert _requirement_failures_counter is None + assert _tool_calls_counter is None from mellea.telemetry.metrics import ( record_request_duration, + record_requirement_check, + record_requirement_failure, + record_sampling_attempt, + record_sampling_outcome, record_token_usage_metrics, + record_tool_call, ) record_token_usage_metrics( input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) record_request_duration(duration_s=1.0, model="llama2:7b", provider="ollama") + 
record_sampling_attempt("RejectionSamplingStrategy") + record_sampling_outcome("RejectionSamplingStrategy", success=True) + record_sampling_outcome("RejectionSamplingStrategy", success=False) + record_requirement_check("LLMaJRequirement") + record_requirement_failure("LLMaJRequirement", "constraint not met") + record_tool_call("search", "success") from mellea.telemetry.metrics import ( _duration_histogram, _input_token_counter, _output_token_counter, + _requirement_checks_counter, + _requirement_failures_counter, + _sampling_attempts_counter, + _sampling_failures_counter, + _sampling_successes_counter, + _tool_calls_counter, _ttfb_histogram, ) @@ -606,6 +635,12 @@ def test_metric_instruments_lazy_initialization(enable_metrics): assert ( _ttfb_histogram is not None ) # initialized together via _get_latency_histograms + assert _sampling_attempts_counter is not None + assert _sampling_successes_counter is not None + assert _sampling_failures_counter is not None + assert _requirement_checks_counter is not None + assert _requirement_failures_counter is not None + assert _tool_calls_counter is not None def test_record_metrics_noop_when_disabled(clean_metrics_env): @@ -613,7 +648,12 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env): from mellea.telemetry.metrics import ( record_error, record_request_duration, + record_requirement_check, + record_requirement_failure, + record_sampling_attempt, + record_sampling_outcome, record_token_usage_metrics, + record_tool_call, ) record_token_usage_metrics( @@ -626,6 +666,11 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env): provider="ollama", exception_class="TimeoutError", ) + record_sampling_attempt("RejectionSamplingStrategy") + record_sampling_outcome("RejectionSamplingStrategy", success=True) + record_requirement_check("LLMaJRequirement") + record_requirement_failure("LLMaJRequirement", "constraint not met") + record_tool_call("search", "success") # No instruments should have been initialized from 
mellea.telemetry.metrics import ( @@ -633,6 +678,12 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env): _error_counter, _input_token_counter, _output_token_counter, + _requirement_checks_counter, + _requirement_failures_counter, + _sampling_attempts_counter, + _sampling_failures_counter, + _sampling_successes_counter, + _tool_calls_counter, _ttfb_histogram, ) @@ -641,19 +692,35 @@ def test_record_metrics_noop_when_disabled(clean_metrics_env): assert _duration_histogram is None assert _ttfb_histogram is None assert _error_counter is None + assert _sampling_attempts_counter is None + assert _sampling_successes_counter is None + assert _sampling_failures_counter is None + assert _requirement_checks_counter is None + assert _requirement_failures_counter is None + assert _tool_calls_counter is None def test_record_functions_exported_in_public_api(): """Test that all record functions are exported in the public API.""" from mellea.telemetry import ( record_request_duration, + record_requirement_check, + record_requirement_failure, + record_sampling_attempt, + record_sampling_outcome, record_token_usage_metrics, + record_tool_call, record_ttfb, ) assert callable(record_token_usage_metrics) assert callable(record_request_duration) assert callable(record_ttfb) + assert callable(record_sampling_attempt) + assert callable(record_sampling_outcome) + assert callable(record_requirement_check) + assert callable(record_requirement_failure) + assert callable(record_tool_call) # Token Counter Tests diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py index cd34ce194..95a40b5f7 100644 --- a/test/telemetry/test_metrics_backend.py +++ b/test/telemetry/test_metrics_backend.py @@ -89,6 +89,12 @@ def _setup_metrics_provider(metrics_module, metric_reader): metrics_module._duration_histogram = None metrics_module._ttfb_histogram = None metrics_module._error_counter = None + metrics_module._sampling_attempts_counter = None + 
metrics_module._sampling_successes_counter = None + metrics_module._sampling_failures_counter = None + metrics_module._requirement_checks_counter = None + metrics_module._requirement_failures_counter = None + metrics_module._tool_calls_counter = None return provider @@ -480,3 +486,46 @@ async def test_error_metrics_on_backend_failure(enable_metrics, metric_reader): assert error_count is not None, "Error counter should have been recorded" assert error_count == 1, f"Expected 1 error, got {error_count}" + + +@pytest.mark.asyncio +@pytest.mark.ollama +async def test_ollama_sampling_metrics_integration(enable_metrics, metric_reader): + """Test that sampling metrics are recorded through a full RejectionSamplingStrategy loop.""" + from mellea.backends.ollama import OllamaModelBackend + from mellea.stdlib.components import Instruction + from mellea.stdlib.context import SimpleContext + from mellea.stdlib.sampling import RejectionSamplingStrategy + from mellea.telemetry import metrics as metrics_module + + provider = _setup_metrics_provider(metrics_module, metric_reader) + + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + strategy = RejectionSamplingStrategy(loop_budget=1) + ctx = SimpleContext() + + result = await strategy.sample( + action=Instruction("Say hello"), context=ctx, backend=backend, requirements=None + ) + + # Yield to event loop so FIRE_AND_FORGET plugin tasks complete + await asyncio.sleep(0.05) + provider.force_flush() + metrics_data = metric_reader.get_metrics_data() + + attempts = get_metric_value( + metrics_data, + "mellea.sampling.attempts", + {"strategy": "RejectionSamplingStrategy"}, + ) + assert attempts is not None, "Sampling attempts should be recorded" + assert attempts >= 1, f"Expected at least 1 attempt, got {attempts}" + + # With no requirements and loop_budget=1 the loop always succeeds + successes = get_metric_value( + metrics_data, + "mellea.sampling.successes", + {"strategy": 
"RejectionSamplingStrategy"}, + ) + assert result.success + assert successes == 1, f"Expected 1 success, got {successes}" diff --git a/test/telemetry/test_metrics_operational.py b/test/telemetry/test_metrics_operational.py new file mode 100644 index 000000000..0d2187133 --- /dev/null +++ b/test/telemetry/test_metrics_operational.py @@ -0,0 +1,376 @@ +"""Tests for operational counter metrics (sampling, requirement, tool). + +Integration tests use InMemoryMetricReader to verify counter values and attributes. +Unit tests verify no-op behaviour when metrics are disabled. +""" + +import pytest + +try: + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import InMemoryMetricReader + + OTEL_AVAILABLE = True +except ImportError: + OTEL_AVAILABLE = False + +pytestmark = pytest.mark.skipif( + not OTEL_AVAILABLE, reason="OpenTelemetry not installed" +) + + +@pytest.fixture +def clean_metrics_env(monkeypatch): + """Enable metrics and reset all module state for each test.""" + monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true") + monkeypatch.delenv("MELLEA_METRICS_CONSOLE", raising=False) + + import importlib + + import mellea.telemetry.metrics + + importlib.reload(mellea.telemetry.metrics) + yield + importlib.reload(mellea.telemetry.metrics) + + +def _setup_in_memory_provider(metrics_module): + """Wire an InMemoryMetricReader into the metrics module globals.""" + reader = InMemoryMetricReader() + provider = MeterProvider(metric_readers=[reader]) + metrics_module._meter_provider = provider + metrics_module._meter = provider.get_meter("mellea") + # Reset all operational counter globals so they bind to the new meter + metrics_module._sampling_attempts_counter = None + metrics_module._sampling_successes_counter = None + metrics_module._sampling_failures_counter = None + metrics_module._requirement_checks_counter = None + metrics_module._requirement_failures_counter = None + metrics_module._tool_calls_counter = None + return reader, 
provider + + +def _data_points_for(metrics_data, metric_name): + """Return all data points for the named metric.""" + if metrics_data is None: + return [] + data_points = [] + for rm in metrics_data.resource_metrics: + for sm in rm.scope_metrics: + for metric in sm.metrics: + if metric.name == metric_name: + data_points.extend(metric.data.data_points) + return data_points + + +# --------------------------------------------------------------------------- +# Sampling — attempts +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +def test_record_sampling_attempt_basic(clean_metrics_env): + """Sampling attempt counter records correct value and strategy attribute.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_sampling_attempt("RejectionSamplingStrategy") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.sampling.attempts") + + assert len(dps) == 1 + assert dps[0].value == 1 + assert dict(dps[0].attributes)["strategy"] == "RejectionSamplingStrategy" + + +@pytest.mark.integration +def test_record_sampling_attempt_accumulation(clean_metrics_env): + """Multiple attempts accumulate correctly.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + for _ in range(3): + m.record_sampling_attempt("RejectionSamplingStrategy") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.sampling.attempts") + + assert len(dps) == 1 + assert dps[0].value == 3 + + +@pytest.mark.integration +def test_record_sampling_attempt_multiple_strategies(clean_metrics_env): + """Different strategies are tracked as separate attribute sets.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_sampling_attempt("RejectionSamplingStrategy") + m.record_sampling_attempt("MultiTurnStrategy") + + 
provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.sampling.attempts") + + assert len(dps) == 2 + strategies = {dict(dp.attributes)["strategy"] for dp in dps} + assert strategies == {"RejectionSamplingStrategy", "MultiTurnStrategy"} + + +# --------------------------------------------------------------------------- +# Sampling — outcomes +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +def test_record_sampling_outcome_success(clean_metrics_env): + """Success outcome increments the successes counter.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_sampling_outcome("RejectionSamplingStrategy", success=True) + + provider.force_flush() + success_dps = _data_points_for( + reader.get_metrics_data(), "mellea.sampling.successes" + ) + failure_dps = _data_points_for( + reader.get_metrics_data(), "mellea.sampling.failures" + ) + + assert len(success_dps) == 1 + assert success_dps[0].value == 1 + assert len(failure_dps) == 0 + + +@pytest.mark.integration +def test_record_sampling_outcome_failure(clean_metrics_env): + """Failure outcome increments the failures counter.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_sampling_outcome("RejectionSamplingStrategy", success=False) + + provider.force_flush() + success_dps = _data_points_for( + reader.get_metrics_data(), "mellea.sampling.successes" + ) + failure_dps = _data_points_for( + reader.get_metrics_data(), "mellea.sampling.failures" + ) + + assert len(success_dps) == 0 + assert len(failure_dps) == 1 + assert failure_dps[0].value == 1 + + +# --------------------------------------------------------------------------- +# Requirement checks +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +def test_record_requirement_check_basic(clean_metrics_env): + 
"""Requirement check counter records correct value and requirement attribute.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_requirement_check("LLMaJRequirement") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.checks") + + assert len(dps) == 1 + assert dps[0].value == 1 + assert dict(dps[0].attributes)["requirement"] == "LLMaJRequirement" + + +@pytest.mark.integration +def test_record_requirement_check_multiple_types(clean_metrics_env): + """Different requirement types are tracked separately.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_requirement_check("LLMaJRequirement") + m.record_requirement_check("PythonExecutionReq") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.checks") + + assert len(dps) == 2 + req_names = {dict(dp.attributes)["requirement"] for dp in dps} + assert req_names == {"LLMaJRequirement", "PythonExecutionReq"} + + +@pytest.mark.integration +def test_record_requirement_failure_attributes(clean_metrics_env): + """Requirement failure counter records requirement and reason attributes.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_requirement_failure( + "LLMaJRequirement", "Output did not satisfy constraint" + ) + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.failures") + + assert len(dps) == 1 + attrs = dict(dps[0].attributes) + assert attrs["requirement"] == "LLMaJRequirement" + assert attrs["reason"] == "Output did not satisfy constraint" + + +@pytest.mark.integration +def test_record_requirement_failure_accumulation(clean_metrics_env): + """Multiple failures with the same attributes accumulate correctly.""" + from mellea.telemetry import metrics as m + + reader, provider = 
_setup_in_memory_provider(m) + + m.record_requirement_failure("LLMaJRequirement", "unknown") + m.record_requirement_failure("LLMaJRequirement", "unknown") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.requirement.failures") + + assert len(dps) == 1 + assert dps[0].value == 2 + + +# --------------------------------------------------------------------------- +# Tool calls +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +def test_record_tool_call_success(clean_metrics_env): + """Tool call counter records name and success status.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_tool_call("search", "success") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.tool.calls") + + assert len(dps) == 1 + attrs = dict(dps[0].attributes) + assert attrs["tool"] == "search" + assert attrs["status"] == "success" + + +@pytest.mark.integration +def test_record_tool_call_failure(clean_metrics_env): + """Tool call counter records failure status separately from success.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_tool_call("search", "success") + m.record_tool_call("search", "failure") + + provider.force_flush() + dps = _data_points_for(reader.get_metrics_data(), "mellea.tool.calls") + + assert len(dps) == 2 + statuses = {dict(dp.attributes)["status"] for dp in dps} + assert statuses == {"success", "failure"} + + +@pytest.mark.integration +def test_record_tool_call_multiple_tools(clean_metrics_env): + """Different tool names are tracked as separate attribute sets.""" + from mellea.telemetry import metrics as m + + reader, provider = _setup_in_memory_provider(m) + + m.record_tool_call("search", "success") + m.record_tool_call("calculator", "success") + + provider.force_flush() + dps = 
_data_points_for(reader.get_metrics_data(), "mellea.tool.calls") + + assert len(dps) == 2 + tools = {dict(dp.attributes)["tool"] for dp in dps} + assert tools == {"search", "calculator"} + + +# --------------------------------------------------------------------------- +# Unit: no-op when metrics disabled +# --------------------------------------------------------------------------- + + +def test_record_sampling_attempt_noop_when_disabled(monkeypatch): + """record_sampling_attempt is a no-op when metrics are disabled.""" + import importlib + + import mellea.telemetry.metrics as m + + importlib.reload(m) + monkeypatch.setattr(m, "_METRICS_ENABLED", False) + + # Should not raise and should not create any counter + m.record_sampling_attempt("RejectionSamplingStrategy") + assert m._sampling_attempts_counter is None + + +def test_record_sampling_outcome_noop_when_disabled(monkeypatch): + """record_sampling_outcome is a no-op when metrics are disabled.""" + import importlib + + import mellea.telemetry.metrics as m + + importlib.reload(m) + monkeypatch.setattr(m, "_METRICS_ENABLED", False) + + m.record_sampling_outcome("RejectionSamplingStrategy", success=True) + assert m._sampling_successes_counter is None + + +def test_record_requirement_check_noop_when_disabled(monkeypatch): + """record_requirement_check is a no-op when metrics are disabled.""" + import importlib + + import mellea.telemetry.metrics as m + + importlib.reload(m) + monkeypatch.setattr(m, "_METRICS_ENABLED", False) + + m.record_requirement_check("LLMaJRequirement") + assert m._requirement_checks_counter is None + + +def test_record_requirement_failure_noop_when_disabled(monkeypatch): + """record_requirement_failure is a no-op when metrics are disabled.""" + import importlib + + import mellea.telemetry.metrics as m + + importlib.reload(m) + monkeypatch.setattr(m, "_METRICS_ENABLED", False) + + m.record_requirement_failure("LLMaJRequirement", "reason") + assert m._requirement_failures_counter is None + + 
+def test_record_tool_call_noop_when_disabled(monkeypatch): + """record_tool_call is a no-op when metrics are disabled.""" + import importlib + + import mellea.telemetry.metrics as m + + importlib.reload(m) + monkeypatch.setattr(m, "_METRICS_ENABLED", False) + + m.record_tool_call("search", "success") + assert m._tool_calls_counter is None diff --git a/test/telemetry/test_metrics_plugins.py b/test/telemetry/test_metrics_plugins.py index d78feea9d..aee8e0a2a 100644 --- a/test/telemetry/test_metrics_plugins.py +++ b/test/telemetry/test_metrics_plugins.py @@ -1,4 +1,4 @@ -"""Unit tests for TokenMetricsPlugin, LatencyMetricsPlugin, and ErrorMetricsPlugin.""" +"""Unit tests for metrics plugins.""" from unittest.mock import patch @@ -11,6 +11,12 @@ GenerationErrorPayload, GenerationPostCallPayload, ) +from mellea.plugins.hooks.sampling import ( + SamplingIterationPayload, + SamplingLoopEndPayload, +) +from mellea.plugins.hooks.tool import ToolPostInvokePayload +from mellea.plugins.hooks.validation import ValidationPostCheckPayload from mellea.telemetry.metrics import ( ERROR_TYPE_TIMEOUT, ERROR_TYPE_TRANSPORT_ERROR, @@ -19,7 +25,10 @@ from mellea.telemetry.metrics_plugins import ( ErrorMetricsPlugin, LatencyMetricsPlugin, + RequirementMetricsPlugin, + SamplingMetricsPlugin, TokenMetricsPlugin, + ToolMetricsPlugin, ) @@ -267,3 +276,228 @@ async def test_error_plugin_handles_none_model_output(error_plugin): provider="unknown", exception_class="RuntimeError", ) + + +# SamplingMetricsPlugin tests + + +class _PassResult: + reason = None + + def __bool__(self) -> bool: + return True + + +class _FailResult: + def __init__(self, reason: str = "constraint not met") -> None: + self.reason = reason + + def __bool__(self) -> bool: + return False + + +class _FakeReq: + pass + + +@pytest.fixture +def sampling_plugin(): + return SamplingMetricsPlugin() + + +@pytest.mark.asyncio +async def test_sampling_plugin_records_attempt(sampling_plugin): + """Plugin calls record_sampling_attempt 
with the strategy name on each iteration.""" + payload = SamplingIterationPayload(strategy_name="RejectionSamplingStrategy") + + with patch("mellea.telemetry.metrics.record_sampling_attempt") as mock_record: + await sampling_plugin.record_sampling_attempt(payload, {}) + + mock_record.assert_called_once_with("RejectionSamplingStrategy") + + +@pytest.mark.asyncio +async def test_sampling_plugin_attempt_empty_name_falls_back_to_unknown( + sampling_plugin, +): + """Empty strategy_name falls back to 'unknown'.""" + payload = SamplingIterationPayload(strategy_name="") + + with patch("mellea.telemetry.metrics.record_sampling_attempt") as mock_record: + await sampling_plugin.record_sampling_attempt(payload, {}) + + mock_record.assert_called_once_with("unknown") + + +@pytest.mark.asyncio +async def test_sampling_plugin_records_success_outcome(sampling_plugin): + """Plugin calls record_sampling_outcome(success=True) on a successful loop end.""" + payload = SamplingLoopEndPayload( + strategy_name="RejectionSamplingStrategy", success=True + ) + + with patch("mellea.telemetry.metrics.record_sampling_outcome") as mock_record: + await sampling_plugin.record_sampling_outcome(payload, {}) + + mock_record.assert_called_once_with("RejectionSamplingStrategy", True) + + +@pytest.mark.asyncio +async def test_sampling_plugin_records_failure_outcome(sampling_plugin): + """Plugin calls record_sampling_outcome(success=False) on a failed loop end.""" + payload = SamplingLoopEndPayload(strategy_name="MultiTurnStrategy", success=False) + + with patch("mellea.telemetry.metrics.record_sampling_outcome") as mock_record: + await sampling_plugin.record_sampling_outcome(payload, {}) + + mock_record.assert_called_once_with("MultiTurnStrategy", False) + + +# RequirementMetricsPlugin tests + + +@pytest.fixture +def requirement_plugin(): + return RequirementMetricsPlugin() + + +@pytest.mark.asyncio +async def test_requirement_plugin_records_checks_and_no_failures_when_all_pass( + requirement_plugin, +): 
+ """When all requirements pass, only checks are recorded.""" + req = _FakeReq() + payload = ValidationPostCheckPayload( + requirements=[req], + results=[_PassResult()], + all_validations_passed=True, + passed_count=1, + failed_count=0, + ) + + with ( + patch("mellea.telemetry.metrics.record_requirement_check") as mock_check, + patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail, + ): + await requirement_plugin.record_requirement_metrics(payload, {}) + + mock_check.assert_called_once_with("_FakeReq") + mock_fail.assert_not_called() + + +@pytest.mark.asyncio +async def test_requirement_plugin_records_failure_with_reason(requirement_plugin): + """Failed requirements record both a check and a failure with the reason.""" + req = _FakeReq() + payload = ValidationPostCheckPayload( + requirements=[req], + results=[_FailResult("output too short")], + all_validations_passed=False, + passed_count=0, + failed_count=1, + ) + + with ( + patch("mellea.telemetry.metrics.record_requirement_check") as mock_check, + patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail, + ): + await requirement_plugin.record_requirement_metrics(payload, {}) + + mock_check.assert_called_once_with("_FakeReq") + mock_fail.assert_called_once_with("_FakeReq", "output too short") + + +@pytest.mark.asyncio +async def test_requirement_plugin_mixed_pass_fail(requirement_plugin): + """Mixed results record a check for each and a failure only for failing ones.""" + req_a = _FakeReq() + req_b = _FakeReq() + payload = ValidationPostCheckPayload( + requirements=[req_a, req_b], + results=[_PassResult(), _FailResult("constraint not met")], + all_validations_passed=False, + passed_count=1, + failed_count=1, + ) + + with ( + patch("mellea.telemetry.metrics.record_requirement_check") as mock_check, + patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail, + ): + await requirement_plugin.record_requirement_metrics(payload, {}) + + assert mock_check.call_count 
== 2 + mock_fail.assert_called_once_with("_FakeReq", "constraint not met") + + +@pytest.mark.asyncio +async def test_requirement_plugin_failure_with_no_reason_falls_back_to_unknown( + requirement_plugin, +): + """A None reason falls back to 'unknown'.""" + req = _FakeReq() + payload = ValidationPostCheckPayload( + requirements=[req], + results=[_FailResult(reason=None)], # type: ignore[arg-type] + all_validations_passed=False, + passed_count=0, + failed_count=1, + ) + + with ( + patch("mellea.telemetry.metrics.record_requirement_check"), + patch("mellea.telemetry.metrics.record_requirement_failure") as mock_fail, + ): + await requirement_plugin.record_requirement_metrics(payload, {}) + + mock_fail.assert_called_once_with("_FakeReq", "unknown") + + +# ToolMetricsPlugin tests + + +class _MockToolCall: + def __init__(self, name: str) -> None: + self.name = name + + +@pytest.fixture +def tool_plugin(): + return ToolMetricsPlugin() + + +@pytest.mark.asyncio +async def test_tool_plugin_records_success(tool_plugin): + """Successful tool calls are recorded with status='success'.""" + payload = ToolPostInvokePayload( + model_tool_call=_MockToolCall("search"), success=True + ) + + with patch("mellea.telemetry.metrics.record_tool_call") as mock_record: + await tool_plugin.record_tool_call(payload, {}) + + mock_record.assert_called_once_with("search", "success") + + +@pytest.mark.asyncio +async def test_tool_plugin_records_failure(tool_plugin): + """Failed tool calls are recorded with status='failure'.""" + payload = ToolPostInvokePayload( + model_tool_call=_MockToolCall("calculator"), success=False + ) + + with patch("mellea.telemetry.metrics.record_tool_call") as mock_record: + await tool_plugin.record_tool_call(payload, {}) + + mock_record.assert_called_once_with("calculator", "failure") + + +@pytest.mark.asyncio +async def test_tool_plugin_none_tool_call_falls_back_to_unknown(tool_plugin): + """A None model_tool_call falls back to tool name 'unknown'.""" + payload = 
ToolPostInvokePayload(model_tool_call=None, success=True) + + with patch("mellea.telemetry.metrics.record_tool_call") as mock_record: + await tool_plugin.record_tool_call(payload, {}) + + mock_record.assert_called_once_with("unknown", "success") From a5827b7f543824773fdf071e155f7e7ee7d2386d Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 17 Apr 2026 18:19:46 -0500 Subject: [PATCH 2/3] test: shut down MeterProvider after exporter tests PeriodicExportingMetricReader background threads (60 s default) would fire after pytest closed stdout once the suite crossed the 60 s mark, causing "I/O operation on closed file" and OTLP UNAVAILABLE errors. Assisted-by: Claude Code Signed-off-by: Alex Bozarth --- test/telemetry/test_metrics.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py index e5a9360ad..e71a378fe 100644 --- a/test/telemetry/test_metrics.py +++ b/test/telemetry/test_metrics.py @@ -65,6 +65,20 @@ def enable_metrics(monkeypatch): importlib.reload(mellea.telemetry.metrics) +@pytest.fixture +def shutdown_meter_provider(): + """Shut down the MeterProvider after tests that reload with real exporters enabled. + + Prevents PeriodicExportingMetricReader background threads from firing after + pytest closes stdout (60 s default interval). 
+ """ + yield + import mellea.telemetry.metrics as _m + + if _m._meter_provider is not None: + _m._meter_provider.shutdown() + + # Configuration Tests @@ -364,7 +378,7 @@ def test_default_service_name(enable_metrics): # Console Exporter Tests -def test_console_exporter_enabled(monkeypatch): +def test_console_exporter_enabled(monkeypatch, shutdown_meter_provider): """Test that console exporter can be enabled.""" monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true") monkeypatch.setenv("MELLEA_METRICS_CONSOLE", "true") @@ -390,7 +404,7 @@ def test_console_exporter_disabled_by_default(enable_metrics): # OTLP Exporter Tests -def test_otlp_explicit_enablement(monkeypatch): +def test_otlp_explicit_enablement(monkeypatch, shutdown_meter_provider): """Test that OTLP exporter requires explicit enablement via MELLEA_METRICS_OTLP.""" monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true") monkeypatch.setenv("MELLEA_METRICS_OTLP", "true") @@ -521,7 +535,7 @@ def test_prometheus_exporter_import_error_warning(monkeypatch): sys.modules.update(original_modules) -def test_prometheus_and_otlp_exporters_together(monkeypatch): +def test_prometheus_and_otlp_exporters_together(monkeypatch, shutdown_meter_provider): """Test that Prometheus and OTLP exporters can run simultaneously.""" monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true") monkeypatch.setenv("MELLEA_METRICS_PROMETHEUS", "true") @@ -547,7 +561,9 @@ def test_prometheus_exporter_disabled_by_default(enable_metrics): assert _METRICS_PROMETHEUS is False -def test_prometheus_exporter_with_console_exporter(monkeypatch): +def test_prometheus_exporter_with_console_exporter( + monkeypatch, shutdown_meter_provider +): """Test that Prometheus works alongside console exporter.""" monkeypatch.setenv("MELLEA_METRICS_ENABLED", "true") monkeypatch.setenv("MELLEA_METRICS_PROMETHEUS", "true") From e0b49290dbc139339feeea02bf489fff85194cf3 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Tue, 21 Apr 2026 14:53:52 -0500 Subject: [PATCH 3/3] fix: use 
bounded reason for requirement.failures metric Replace unbounded ValidationResult.reason (which can be raw LLM output) with a bounded value: rule-based requirements pass their result.reason through directly; model-based requirements (no validation_fn) emit "LLM judgment" to prevent metric cardinality explosion. Assisted-by: Claude Code Signed-off-by: Alex Bozarth --- mellea/telemetry/metrics_plugins.py | 6 +++++- test/telemetry/test_metrics_plugins.py | 9 +++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py index 3d71d783b..1639c5f2b 100644 --- a/mellea/telemetry/metrics_plugins.py +++ b/mellea/telemetry/metrics_plugins.py @@ -198,7 +198,11 @@ async def record_requirement_metrics( req_name = type(req).__name__ record_requirement_check(req_name) if not bool(result): - reason = getattr(result, "reason", None) or "unknown" + reason = ( + getattr(result, "reason", None) + if req.validation_fn is not None + else None + ) or "LLM judgment" record_requirement_failure(req_name, reason) diff --git a/test/telemetry/test_metrics_plugins.py b/test/telemetry/test_metrics_plugins.py index aee8e0a2a..1ae691464 100644 --- a/test/telemetry/test_metrics_plugins.py +++ b/test/telemetry/test_metrics_plugins.py @@ -297,7 +297,8 @@ def __bool__(self) -> bool: class _FakeReq: - pass + def validation_fn(self, _ctx): + return None @pytest.fixture @@ -431,10 +432,10 @@ async def test_requirement_plugin_mixed_pass_fail(requirement_plugin): @pytest.mark.asyncio -async def test_requirement_plugin_failure_with_no_reason_falls_back_to_unknown( +async def test_requirement_plugin_failure_with_no_reason_uses_default( requirement_plugin, ): - """A None reason falls back to 'unknown'.""" + """A None reason falls back to the default reason.""" req = _FakeReq() payload = ValidationPostCheckPayload( requirements=[req], @@ -450,7 +451,7 @@ async def 
test_requirement_plugin_failure_with_no_reason_falls_back_to_unknown( ): await requirement_plugin.record_requirement_metrics(payload, {}) - mock_fail.assert_called_once_with("_FakeReq", "unknown") + mock_fail.assert_called_once_with("_FakeReq", "LLM judgment") # ToolMetricsPlugin tests