Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docs/docs/evaluation-and-observability/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,49 @@ Error metrics are recorded when a backend raises an exception during generation,
after the request has been dispatched to the provider. Construction-time errors
(e.g. missing API key) are not captured by the error counter.

## Operational metrics

Mellea records metrics for its internal sampling, validation, and tool execution
loops. These counters give visibility into retry behavior, validation failure
rates, and tool call health — independent of the underlying LLM provider.

### Sampling counters

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.sampling.attempts` | Counter | `{attempt}` | Sampling attempts per loop iteration |
| `mellea.sampling.successes` | Counter | `{sample}` | Sampling loops that produced a passing sample |
| `mellea.sampling.failures` | Counter | `{failure}` | Sampling loops that exhausted the loop budget without success |

All sampling metrics include:

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `strategy` | Sampling strategy class name | `RejectionSamplingStrategy`, `MultiTurnStrategy`, `RepairTemplateStrategy` |

### Requirement counters

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.requirement.checks` | Counter | `{check}` | Requirement validation checks performed |
| `mellea.requirement.failures` | Counter | `{failure}` | Requirement validation checks that failed |

Both requirement metrics include:

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `requirement` | Requirement class name | `LLMaJRequirement`, `PythonExecutionReq`, `ALoraRequirement`, `GuardianCheck` |
| `reason` | Human-readable failure reason (`mellea.requirement.failures` only) | `"Output did not satisfy constraint"`, `"unknown"` |

### Tool counter

| Metric Name | Type | Unit | Description |
| ----------- | ---- | ---- | ----------- |
| `mellea.tool.calls` | Counter | `{call}` | Tool invocations by name and status |

The tool counter includes:

| Attribute | Description | Example Values |
| --------- | ----------- | -------------- |
| `tool` | Name of the invoked tool | `"search"`, `"calculator"` |
| `status` | Execution outcome | `success`, `failure` |

## Metrics export configuration

Mellea supports multiple metrics exporters that can be used independently or
Expand Down
5 changes: 5 additions & 0 deletions docs/docs/evaluation-and-observability/telemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ OpenTelemetry. No code changes are required:
`mellea.llm.ttfb` (streaming requests only).
- **Error counter** — `mellea.llm.errors` on each failed backend call,
classified by semantic error type.
- **Sampling counters** — `mellea.sampling.attempts`, `mellea.sampling.successes`,
and `mellea.sampling.failures` per strategy.
- **Requirement counters** — `mellea.requirement.checks` and
`mellea.requirement.failures` per requirement type.
- **Tool counter** — `mellea.tool.calls` by tool name and status.

The metrics API also exposes `create_counter`, `create_histogram`, and
`create_up_down_counter` for instrumenting your own application code.
Expand Down
4 changes: 4 additions & 0 deletions mellea/plugins/hooks/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class SamplingIterationPayload(MelleaBasePayload):
"""Payload for ``sampling_iteration`` — after each sampling attempt.

Attributes:
strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``).
iteration: 1-based iteration number within the sampling loop.
action: The ``Component`` used for this attempt.

Expand All @@ -42,6 +43,7 @@ class SamplingIterationPayload(MelleaBasePayload):
total_count: Total number of requirements evaluated.
"""

strategy_name: str = ""
iteration: int = 0
action: Any = None
result: Any = None
Expand Down Expand Up @@ -78,6 +80,7 @@ class SamplingLoopEndPayload(MelleaBasePayload):
"""Payload for ``sampling_loop_end`` — when sampling completes.

Attributes:
strategy_name: Class name of the sampling strategy (e.g. ``"RejectionSamplingStrategy"``).
success: ``True`` if at least one attempt passed all requirements.
iterations_used: Total number of iterations the loop executed.
final_result: The selected ``ModelOutputThunk`` (best success or best failure).
Expand All @@ -91,6 +94,7 @@ class SamplingLoopEndPayload(MelleaBasePayload):
``(Requirement, ValidationResult)`` tuples for iteration *i*.
"""

strategy_name: str = ""
success: bool = False
iterations_used: int = 0
final_result: Any = None
Expand Down
3 changes: 3 additions & 0 deletions mellea/stdlib/sampling/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ async def sample(
from ...plugins.hooks.sampling import SamplingIterationPayload

iter_payload = SamplingIterationPayload(
strategy_name=type(self).__name__,
iteration=loop_count,
action=next_action,
result=result,
Expand All @@ -272,6 +273,7 @@ async def sample(
from ...plugins.hooks.sampling import SamplingLoopEndPayload

end_payload = SamplingLoopEndPayload(
strategy_name=type(self).__name__,
success=True,
iterations_used=loop_count,
final_result=result,
Expand Down Expand Up @@ -362,6 +364,7 @@ async def sample(
sample_contexts[best_failed_index] if sample_contexts else context
)
end_payload = SamplingLoopEndPayload(
strategy_name=type(self).__name__,
success=False,
iterations_used=loop_count,
final_result=sampled_results[best_failed_index],
Expand Down
10 changes: 10 additions & 0 deletions mellea/telemetry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ def my_function():
create_up_down_counter,
is_metrics_enabled,
record_request_duration,
record_requirement_check,
record_requirement_failure,
record_sampling_attempt,
record_sampling_outcome,
record_token_usage_metrics,
record_tool_call,
record_ttfb,
)
from .tracing import (
Expand Down Expand Up @@ -111,7 +116,12 @@ def my_function():
"is_backend_tracing_enabled",
"is_metrics_enabled",
"record_request_duration",
"record_requirement_check",
"record_requirement_failure",
"record_sampling_attempt",
"record_sampling_outcome",
"record_token_usage_metrics",
"record_tool_call",
"record_ttfb",
"set_span_attribute",
"set_span_error",
Expand Down
177 changes: 177 additions & 0 deletions mellea/telemetry/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
- Token counters: mellea.llm.tokens.input, mellea.llm.tokens.output (unit: tokens)
- Latency histograms: mellea.llm.request.duration (unit: s), mellea.llm.ttfb (unit: s, streaming only)
- Error counter: mellea.llm.errors (unit: {error}), categorized by semantic error type
- Sampling counters: mellea.sampling.attempts, mellea.sampling.successes, mellea.sampling.failures (unit: {attempt}/{sample}/{failure})
- Requirement counters: mellea.requirement.checks (unit: {check}), mellea.requirement.failures (unit: {failure})
- Tool counter: mellea.tool.calls (unit: {call}), tagged by tool name and status

Programmatic usage:
from mellea.telemetry.metrics import create_counter, create_histogram
Expand Down Expand Up @@ -719,6 +722,175 @@ def record_error(
)


# Lazily-initialized counter instances for the sampling-loop metrics; each is
# created on first use by the corresponding _get_sampling_*_counter helper.
_sampling_attempts_counter: Any = None
_sampling_successes_counter: Any = None
_sampling_failures_counter: Any = None


def _get_sampling_attempts_counter() -> Any:
    """Return the sampling attempts counter, creating it on first access (internal)."""
    global _sampling_attempts_counter

    if _sampling_attempts_counter is not None:
        return _sampling_attempts_counter
    _sampling_attempts_counter = create_counter(
        "mellea.sampling.attempts",
        description="Total number of sampling attempts per strategy",
        unit="{attempt}",
    )
    return _sampling_attempts_counter


def _get_sampling_successes_counter() -> Any:
    """Return the sampling successes counter, creating it on first access (internal)."""
    global _sampling_successes_counter

    if _sampling_successes_counter is not None:
        return _sampling_successes_counter
    _sampling_successes_counter = create_counter(
        "mellea.sampling.successes",
        description="Total number of successful sampling loops per strategy",
        unit="{sample}",
    )
    return _sampling_successes_counter


def _get_sampling_failures_counter() -> Any:
    """Return the sampling failures counter, creating it on first access (internal)."""
    global _sampling_failures_counter

    if _sampling_failures_counter is not None:
        return _sampling_failures_counter
    _sampling_failures_counter = create_counter(
        "mellea.sampling.failures",
        description="Total number of failed sampling loops (budget exhausted) per strategy",
        unit="{failure}",
    )
    return _sampling_failures_counter


def record_sampling_attempt(strategy: str) -> None:
    """Increment the sampling-attempts counter for the given strategy.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``).
    """
    if _METRICS_ENABLED:
        _get_sampling_attempts_counter().add(1, {"strategy": strategy})


def record_sampling_outcome(strategy: str, success: bool) -> None:
    """Increment the success or failure counter for a finished sampling loop.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        strategy: Sampling strategy class name (e.g. ``"RejectionSamplingStrategy"``).
        success: ``True`` if at least one attempt passed all requirements.
    """
    if not _METRICS_ENABLED:
        return

    counter = (
        _get_sampling_successes_counter()
        if success
        else _get_sampling_failures_counter()
    )
    counter.add(1, {"strategy": strategy})


# Lazily-initialized counter instances for the requirement-validation metrics;
# each is created on first use by the corresponding _get_requirement_*_counter helper.
_requirement_checks_counter: Any = None
_requirement_failures_counter: Any = None


def _get_requirement_checks_counter() -> Any:
    """Return the requirement checks counter, creating it on first access (internal)."""
    global _requirement_checks_counter

    if _requirement_checks_counter is not None:
        return _requirement_checks_counter
    _requirement_checks_counter = create_counter(
        "mellea.requirement.checks",
        description="Total number of requirement validation checks",
        unit="{check}",
    )
    return _requirement_checks_counter


def _get_requirement_failures_counter() -> Any:
    """Return the requirement failures counter, creating it on first access (internal)."""
    global _requirement_failures_counter

    if _requirement_failures_counter is not None:
        return _requirement_failures_counter
    _requirement_failures_counter = create_counter(
        "mellea.requirement.failures",
        description="Total number of requirement validation failures",
        unit="{failure}",
    )
    return _requirement_failures_counter


def record_requirement_check(requirement: str) -> None:
    """Increment the requirement-checks counter for one validation check.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        requirement: Requirement class name (e.g. ``"LLMaJRequirement"``).
    """
    if _METRICS_ENABLED:
        _get_requirement_checks_counter().add(1, {"requirement": requirement})


def record_requirement_failure(requirement: str, reason: str) -> None:
    """Increment the requirement-failures counter for one failed validation.

    Returns immediately when metrics are disabled, so the call costs nothing
    in that case.

    Args:
        requirement: Requirement class name (e.g. ``"LLMaJRequirement"``).
        reason: Human-readable failure reason from ``ValidationResult.reason``.
    """
    if not _METRICS_ENABLED:
        return

    attributes = {"requirement": requirement, "reason": reason}
    _get_requirement_failures_counter().add(1, attributes)


_tool_calls_counter: Any = None


def _get_tool_calls_counter() -> Any:
    """Return the tool calls counter, creating it on first access (internal)."""
    global _tool_calls_counter

    if _tool_calls_counter is not None:
        return _tool_calls_counter
    _tool_calls_counter = create_counter(
        "mellea.tool.calls",
        description="Total number of tool invocations by name and status",
        unit="{call}",
    )
    return _tool_calls_counter


def record_tool_call(tool: str, status: str) -> None:
    """Record one tool invocation.

    This is a no-op when metrics are disabled, ensuring zero overhead.

    Args:
        tool: Name of the tool that was invoked.
        status: ``"success"`` if the tool executed without error, ``"failure"`` otherwise.
    """
    if not _METRICS_ENABLED:
        return

    # Inline counter lookup + add, matching the style of the sibling record_*
    # functions (record_sampling_attempt, record_requirement_check, ...).
    _get_tool_calls_counter().add(1, {"tool": tool, "status": status})


__all__ = [
"classify_error",
"create_counter",
Expand All @@ -727,6 +899,11 @@ def record_error(
"is_metrics_enabled",
"record_error",
"record_request_duration",
"record_requirement_check",
"record_requirement_failure",
"record_sampling_attempt",
"record_sampling_outcome",
"record_token_usage_metrics",
"record_tool_call",
"record_ttfb",
]
Loading
Loading