Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docs/docs/evaluation-and-observability/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,49 @@ Error metrics are recorded when a backend raises an exception during generation,
after the request has been dispatched to the provider. Construction-time errors
(e.g. missing API key) are not captured by the error counter.

## Operational metrics

Mellea records metrics for its internal sampling, validation, and tool execution
loops. These counters give visibility into retry behavior, validation failure
rates, and tool call health — independent of the underlying LLM provider.

### Sampling counters

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.sampling.attempts` | Counter | `{attempt}` | Sampling attempts per loop iteration |
| `mellea.sampling.successes` | Counter | `{sample}` | Sampling loops that produced a passing sample |
| `mellea.sampling.failures` | Counter | `{failure}` | Sampling loops that exhausted the loop budget without success |

All sampling metrics include:

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `strategy` | Sampling strategy class name | `RejectionSamplingStrategy`, `MultiTurnStrategy`, `RepairTemplateStrategy` |

### Requirement counters

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.requirement.checks` | Counter | `{check}` | Requirement validation checks performed |
| `mellea.requirement.failures` | Counter | `{failure}` | Requirement validation checks that failed |

Both requirement metrics include:

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `requirement` | Requirement class name | `LLMaJRequirement`, `PythonExecutionReq`, `ALoraRequirement`, `GuardianCheck` |
| `reason` | Human-readable failure reason (`mellea.requirement.failures` only) | `"Output did not satisfy constraint"`, `"unknown"` |

### Tool counter

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.tool.calls` | Counter | `{call}` | Tool invocations by name and status |

The tool counter includes:

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `tool` | Name of the invoked tool | `"search"`, `"calculator"` |
| `status` | Execution outcome | `success`, `failure` |

## Metrics export configuration

Mellea supports multiple metrics exporters that can be used independently or
Expand Down
5 changes: 5 additions & 0 deletions docs/docs/evaluation-and-observability/telemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ OpenTelemetry. No code changes are required:
`mellea.llm.ttfb` (streaming requests only).
- **Error counter** — `mellea.llm.errors` on each failed backend call,
classified by semantic error type.
- **Sampling counters** — `mellea.sampling.attempts`, `mellea.sampling.successes`,
and `mellea.sampling.failures` per strategy.
- **Requirement counters** — `mellea.requirement.checks` and
`mellea.requirement.failures` per requirement type.
- **Tool counter** — `mellea.tool.calls` by tool name and status.

The metrics API also exposes `create_counter`, `create_histogram`, and
`create_up_down_counter` for instrumenting your own application code.
Expand Down
4 changes: 4 additions & 0 deletions mellea/plugins/hooks/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class SamplingIterationPayload(MelleaBasePayload):
"""Payload for ``sampling_iteration`` — after each sampling attempt.

Attributes:
strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``).
iteration: 1-based iteration number within the sampling loop.
action: The ``Component`` used for this attempt.

Expand All @@ -42,6 +43,7 @@ class SamplingIterationPayload(MelleaBasePayload):
total_count: Total number of requirements evaluated.
"""

strategy_name: str = ""
iteration: int = 0
action: Any = None
result: Any = None
Expand Down Expand Up @@ -78,6 +80,7 @@ class SamplingLoopEndPayload(MelleaBasePayload):
"""Payload for ``sampling_loop_end`` — when sampling completes.

Attributes:
strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``).
success: ``True`` if at least one attempt passed all requirements.
iterations_used: Total number of iterations the loop executed.
final_result: The selected ``ModelOutputThunk`` (best success or best failure).
Expand All @@ -91,6 +94,7 @@ class SamplingLoopEndPayload(MelleaBasePayload):
``(Requirement, ValidationResult)`` tuples for iteration *i*.
"""

strategy_name: str = ""
success: bool = False
iterations_used: int = 0
final_result: Any = None
Expand Down
3 changes: 3 additions & 0 deletions mellea/stdlib/sampling/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ async def sample(
from ...plugins.hooks.sampling import SamplingIterationPayload

iter_payload = SamplingIterationPayload(
strategy_name=type(self).__name__,
iteration=loop_count,
action=next_action,
result=result,
Expand All @@ -272,6 +273,7 @@ async def sample(
from ...plugins.hooks.sampling import SamplingLoopEndPayload

end_payload = SamplingLoopEndPayload(
strategy_name=type(self).__name__,
success=True,
iterations_used=loop_count,
final_result=result,
Expand Down Expand Up @@ -362,6 +364,7 @@ async def sample(
sample_contexts[best_failed_index] if sample_contexts else context
)
end_payload = SamplingLoopEndPayload(
strategy_name=type(self).__name__,
success=False,
iterations_used=loop_count,
final_result=sampled_results[best_failed_index],
Expand Down
10 changes: 10 additions & 0 deletions mellea/telemetry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ def my_function():
create_up_down_counter,
is_metrics_enabled,
record_request_duration,
record_requirement_check,
record_requirement_failure,
record_sampling_attempt,
record_sampling_outcome,
record_token_usage_metrics,
record_tool_call,
record_ttfb,
)
from .tracing import (
Expand Down Expand Up @@ -111,7 +116,12 @@ def my_function():
"is_backend_tracing_enabled",
"is_metrics_enabled",
"record_request_duration",
"record_requirement_check",
"record_requirement_failure",
"record_sampling_attempt",
"record_sampling_outcome",
"record_token_usage_metrics",
"record_tool_call",
"record_ttfb",
"set_span_attribute",
"set_span_error",
Expand Down
177 changes: 177 additions & 0 deletions mellea/telemetry/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
- Token counters: mellea.llm.tokens.input, mellea.llm.tokens.output (unit: tokens)
- Latency histograms: mellea.llm.request.duration (unit: s), mellea.llm.ttfb (unit: s, streaming only)
- Error counter: mellea.llm.errors (unit: {error}), categorized by semantic error type
- Sampling counters: mellea.sampling.attempts, mellea.sampling.successes, mellea.sampling.failures (unit: {attempt}/{sample}/{failure})
- Requirement counters: mellea.requirement.checks (unit: {check}), mellea.requirement.failures (unit: {failure})
- Tool counter: mellea.tool.calls (unit: {call}), tagged by tool name and status

Programmatic usage:
from mellea.telemetry.metrics import create_counter, create_histogram
Expand Down Expand Up @@ -719,6 +722,175 @@ def record_error(
)


# Lazily-initialized counter instances for the sampling-loop metrics; each is
# created on first use by the corresponding _get_sampling_*_counter helper.
_sampling_attempts_counter: Any = None
_sampling_successes_counter: Any = None
_sampling_failures_counter: Any = None


def _get_sampling_attempts_counter() -> Any:
    """Return the sampling attempts counter, creating it on first access (internal)."""
    global _sampling_attempts_counter

    if _sampling_attempts_counter is not None:
        return _sampling_attempts_counter
    _sampling_attempts_counter = create_counter(
        "mellea.sampling.attempts",
        description="Total number of sampling attempts per strategy",
        unit="{attempt}",
    )
    return _sampling_attempts_counter


def _get_sampling_successes_counter() -> Any:
    """Return the sampling successes counter, creating it on first access (internal)."""
    global _sampling_successes_counter

    if _sampling_successes_counter is not None:
        return _sampling_successes_counter
    _sampling_successes_counter = create_counter(
        "mellea.sampling.successes",
        description="Total number of successful sampling loops per strategy",
        unit="{sample}",
    )
    return _sampling_successes_counter


def _get_sampling_failures_counter() -> Any:
    """Return the sampling failures counter, creating it on first access (internal)."""
    global _sampling_failures_counter

    if _sampling_failures_counter is not None:
        return _sampling_failures_counter
    _sampling_failures_counter = create_counter(
        "mellea.sampling.failures",
        description="Total number of failed sampling loops (budget exhausted) per strategy",
        unit="{failure}",
    )
    return _sampling_failures_counter


def record_sampling_attempt(strategy: str) -> None:
    """Increment the sampling-attempts counter for the given strategy.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``).
    """
    if _METRICS_ENABLED:
        _get_sampling_attempts_counter().add(1, {"strategy": strategy})


def record_sampling_outcome(strategy: str, success: bool) -> None:
    """Increment the success or failure counter for a finished sampling loop.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``).
        success: ``True`` if at least one attempt passed all requirements.
    """
    if not _METRICS_ENABLED:
        return

    counter = (
        _get_sampling_successes_counter()
        if success
        else _get_sampling_failures_counter()
    )
    counter.add(1, {"strategy": strategy})


# Lazily-initialized counter instances for the requirement-validation metrics;
# each is created on first use by the corresponding _get_requirement_*_counter helper.
_requirement_checks_counter: Any = None
_requirement_failures_counter: Any = None


def _get_requirement_checks_counter() -> Any:
    """Return the requirement checks counter, creating it on first access (internal)."""
    global _requirement_checks_counter

    if _requirement_checks_counter is not None:
        return _requirement_checks_counter
    _requirement_checks_counter = create_counter(
        "mellea.requirement.checks",
        description="Total number of requirement validation checks",
        unit="{check}",
    )
    return _requirement_checks_counter


def _get_requirement_failures_counter() -> Any:
    """Return the requirement failures counter, creating it on first access (internal)."""
    global _requirement_failures_counter

    if _requirement_failures_counter is not None:
        return _requirement_failures_counter
    _requirement_failures_counter = create_counter(
        "mellea.requirement.failures",
        description="Total number of requirement validation failures",
        unit="{failure}",
    )
    return _requirement_failures_counter


def record_requirement_check(requirement: str) -> None:
    """Increment the requirement-checks counter for one validation check.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        requirement: Requirement class name (e.g. ``"LLMaJRequirement"``).
    """
    if _METRICS_ENABLED:
        _get_requirement_checks_counter().add(1, {"requirement": requirement})


def record_requirement_failure(requirement: str, reason: str) -> None:
    """Increment the requirement-failures counter for one failed validation.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        requirement: Requirement class name (e.g. ``"LLMaJRequirement"``).
        reason: Human-readable failure reason from ``ValidationResult.reason``.
    """
    if not _METRICS_ENABLED:
        return

    attributes = {"requirement": requirement, "reason": reason}
    _get_requirement_failures_counter().add(1, attributes)


_tool_calls_counter: Any = None


def _get_tool_calls_counter() -> Any:
    """Return the tool calls counter, creating it on first access (internal)."""
    global _tool_calls_counter

    if _tool_calls_counter is not None:
        return _tool_calls_counter
    _tool_calls_counter = create_counter(
        "mellea.tool.calls",
        description="Total number of tool invocations by name and status",
        unit="{call}",
    )
    return _tool_calls_counter


def record_tool_call(tool: str, status: str) -> None:
    """Record one tool invocation.

    This is a no-op when metrics are disabled, ensuring zero overhead.

    Args:
        tool: Name of the tool that was invoked.
        status: ``"success"`` if the tool executed without error, ``"failure"`` otherwise.
    """
    if not _METRICS_ENABLED:
        return

    # Inline counter lookup + add, matching the style of the sibling record_*
    # functions (record_sampling_attempt, record_requirement_check, ...).
    _get_tool_calls_counter().add(1, {"tool": tool, "status": status})


__all__ = [
"classify_error",
"create_counter",
Expand All @@ -727,6 +899,11 @@ def record_error(
"is_metrics_enabled",
"record_error",
"record_request_duration",
"record_requirement_check",
"record_requirement_failure",
"record_sampling_attempt",
"record_sampling_outcome",
"record_token_usage_metrics",
"record_tool_call",
"record_ttfb",
]
Loading
Loading