Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ Tests/examples automatically skip if system lacks required resources. Heavy exam
- Use `...` in `@generative` function bodies
- Prefer primitives over classes
- **Friendly Dependency Errors**: Wraps optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples.
- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. Metrics are automatically recorded by `TokenMetricsPlugin` — don't add manual `record_token_usage_metrics()` calls.

## 5. Commits & Hooks
[Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:`
Expand Down
23 changes: 20 additions & 3 deletions docs/dev/telemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,8 @@ All token metrics include these attributes following Gen-AI semantic conventions

| Attribute | Description | Example Values |
|-----------|-------------|----------------|
| `gen_ai.system` | Backend system name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
| `gen_ai.provider.name` | Backend provider name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
| `gen_ai.request.model` | Model identifier | `gpt-4`, `llama3.2:7b`, `granite-3.1-8b-instruct` |
| `mellea.backend` | Backend class name | `OpenAIBackend`, `OllamaBackend`, `WatsonxBackend` |

#### Backend Support

Expand Down Expand Up @@ -365,12 +364,30 @@ if is_metrics_enabled():
print("Token metrics are being collected")
```

Access token usage data from `ModelOutputThunk`:

```python
from mellea import start_session

with start_session() as m:
result = m.instruct("Write a haiku about programming")

# Access token usage (follows OpenAI API format)
if result.usage:
print(f"Prompt tokens: {result.usage['prompt_tokens']}")
print(f"Completion tokens: {result.usage['completion_tokens']}")
print(f"Total tokens: {result.usage['total_tokens']}")
```

The `usage` field is a dictionary with three keys: `prompt_tokens`, `completion_tokens`, and `total_tokens`. All backends populate this field in the same format whenever the underlying API reports token usage; it is `None` when usage data is unavailable, so guard access with `if result.usage:` as shown above.

#### Performance

- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), `record_token_usage_metrics()` returns immediately with no processing
- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), the TokenMetricsPlugin is not registered and has no overhead
- **Minimal overhead when enabled**: Counter increments are extremely fast (~nanoseconds per operation)
- **Async export**: Metrics are batched and exported asynchronously (default: every 60 seconds)
- **Non-blocking**: Metric recording never blocks LLM calls
- **Automatic collection**: Metrics are recorded via hooks after generation completes—no manual instrumentation needed

#### Use Cases

Expand Down
6 changes: 6 additions & 0 deletions docs/examples/telemetry/metrics_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ def main():
)
print(f"Email: {str(email)[:100]}...")

# Token usage is available on the result from instruct()
if email.usage:
print(f" → Prompt tokens: {email.usage['prompt_tokens']}")
print(f" → Completion tokens: {email.usage['completion_tokens']}")
print(f" → Total tokens: {email.usage['total_tokens']}")

# Example 3: Multiple operations
print("\n3. Multiple operations...")
text = "Hello, how are you today?"
Expand Down
29 changes: 14 additions & 15 deletions mellea/backends/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,21 +1152,20 @@ class used during generation, if any.
except Exception:
pass

# Record metrics if enabled
if metrics_enabled and n_prompt is not None:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics

record_token_usage_metrics(
input_tokens=n_prompt,
output_tokens=n_completion,
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate standardized usage field (convert to OpenAI format)
if n_prompt is not None and n_completion is not None:
mot.usage = {
"prompt_tokens": n_prompt,
"completion_tokens": n_completion,
"total_tokens": n_prompt + n_completion,
}

# Populate model and provider metadata
if hasattr(self.model_id, "hf_model_name"):
mot.model = str(self.model_id.hf_model_name) # type: ignore
else:
mot.model = str(self.model_id)
mot.provider = "huggingface"

# Record tracing if span exists
if span is not None:
Expand Down
23 changes: 6 additions & 17 deletions mellea/backends/litellm.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,24 +535,13 @@ async def post_processing(
if usage is None:
usage = mot._meta.get("litellm_streaming_usage")

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (LiteLLM uses OpenAI format)
if usage:
mot.usage = usage

if is_metrics_enabled() and usage:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics
from .utils import get_value

record_token_usage_metrics(
input_tokens=get_value(usage, "prompt_tokens"),
output_tokens=get_value(usage, "completion_tokens"),
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "litellm"

# Record telemetry now that response is available
span = mot._meta.get("_telemetry_span")
Expand Down
26 changes: 10 additions & 16 deletions mellea/backends/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,23 +696,17 @@ async def post_processing(
)
completion_tokens = getattr(response, "eval_count", None) if response else None

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (convert to OpenAI format)
if prompt_tokens is not None or completion_tokens is not None:
mot.usage = {
"prompt_tokens": prompt_tokens or 0,
"completion_tokens": completion_tokens or 0,
"total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
Comment thread
ajbozarth marked this conversation as resolved.
Outdated
}

if is_metrics_enabled():
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics

record_token_usage_metrics(
input_tokens=prompt_tokens,
output_tokens=completion_tokens,
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "ollama"

# Record telemetry and close span now that response is available
span = mot._meta.get("_telemetry_span")
Expand Down
23 changes: 6 additions & 17 deletions mellea/backends/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,24 +716,13 @@ async def post_processing(
if usage is None:
usage = mot._meta.get("oai_streaming_usage")

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (OpenAI format already matches)
if usage:
mot.usage = usage

if is_metrics_enabled() and usage:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics
from .utils import get_value

record_token_usage_metrics(
input_tokens=get_value(usage, "prompt_tokens"),
output_tokens=get_value(usage, "completion_tokens"),
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "openai"

# Record telemetry now that response is available
span = mot._meta.get("_telemetry_span")
Expand Down
23 changes: 6 additions & 17 deletions mellea/backends/watsonx.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,24 +576,13 @@ async def post_processing(
else getattr(response, "usage", None)
)

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (WatsonX uses OpenAI format)
if usage:
mot.usage = usage

if is_metrics_enabled() and usage:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics
from .utils import get_value

record_token_usage_metrics(
input_tokens=get_value(usage, "prompt_tokens"),
output_tokens=get_value(usage, "completion_tokens"),
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "watsonx"

# Record tracing if span exists
span = mot._meta.get("_telemetry_span")
Expand Down
32 changes: 32 additions & 0 deletions mellea/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,29 @@ def __init__(
# Additional fields that should be standardized across apis.
self.tool_calls = tool_calls
self._thinking: str | None = None
self.usage: dict[str, int] | None = None
Comment thread
ajbozarth marked this conversation as resolved.
Outdated
"""Usage information following OpenAI API standard.

Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'.
Populated by backends during post_processing. None if unavailable.

Future: May include optional breakdown fields like 'completion_tokens_details'
and 'prompt_tokens_details' for advanced features (reasoning, audio, caching).
"""

self.model: str | None = None
"""Model identifier that generated this output.

Examples: 'gpt-4', 'llama2:7b', 'meta-llama/Llama-2-7b-hf'.
Populated by backends. None if unavailable.
"""

self.provider: str | None = None
"""Provider that generated this output.

Examples: 'openai', 'ollama', 'huggingface', 'watsonx'.
Populated by backends. None if unavailable.
"""

# Used for tracking generation.
self._context: list[Component | CBlock] | None = None
Expand Down Expand Up @@ -319,6 +342,9 @@ def _copy_from(self, other: ModelOutputThunk) -> None:
self.parsed_repr = other.parsed_repr
self.tool_calls = other.tool_calls
self._thinking = other._thinking
self.usage = other.usage
self.model = other.model
self.provider = other.provider
self._generate_log = other._generate_log

def is_computed(self) -> bool:
Expand Down Expand Up @@ -528,6 +554,9 @@ def __copy__(self) -> ModelOutputThunk:
copied._context = self._context
copied._generate_log = self._generate_log
copied._model_options = self._model_options
copied.usage = self.usage
copied.model = self.model
copied.provider = self.provider
return copied

def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
Expand Down Expand Up @@ -557,6 +586,9 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
) # The items in a context should be immutable.
deepcopied._generate_log = copy(self._generate_log)
deepcopied._model_options = copy(self._model_options)
deepcopied.usage = deepcopy(self.usage) if self.usage else None
deepcopied.model = self.model
deepcopied.provider = self.provider
return deepcopied


Expand Down
46 changes: 30 additions & 16 deletions mellea/telemetry/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,30 +407,24 @@ def _get_token_counters() -> tuple[Any, Any]:


def record_token_usage_metrics(
input_tokens: int | None,
output_tokens: int | None,
model: str,
backend: str,
system: str,
input_tokens: int | None, output_tokens: int | None, model: str, provider: str
) -> None:
"""Record token usage metrics following Gen-AI semantic conventions.
"""Record token usage metrics following OpenTelemetry Gen-AI semantic conventions.

This is a no-op when metrics are disabled, ensuring zero overhead.

Args:
input_tokens: Number of input tokens (prompt tokens), or None if unavailable
output_tokens: Number of output tokens (completion tokens), or None if unavailable
model: Model identifier (e.g., "gpt-4", "llama2:7b")
backend: Backend class name (e.g., "OpenAIBackend", "OllamaBackend")
system: Gen-AI system name (e.g., "openai", "ollama", "watsonx")
provider: Provider name (e.g., "openai", "ollama", "watsonx")

Example:
record_token_usage_metrics(
input_tokens=150,
output_tokens=50,
model="llama2:7b",
backend="OllamaBackend",
system="ollama"
provider="ollama"
)
"""
# Early return if metrics are disabled (zero overhead)
Expand All @@ -440,12 +434,8 @@ def record_token_usage_metrics(
# Get the token counters (lazily initialized)
input_counter, output_counter = _get_token_counters()

# Prepare attributes following Gen-AI semantic conventions
attributes = {
"gen_ai.system": system,
"gen_ai.request.model": model,
"mellea.backend": backend,
}
# Prepare attributes following OTel Gen-AI semantic conventions
attributes = {"gen_ai.provider.name": provider, "gen_ai.request.model": model}

# Record input tokens if available
if input_tokens is not None and input_tokens > 0:
Expand All @@ -456,6 +446,30 @@ def record_token_usage_metrics(
output_counter.add(output_tokens, attributes)


# Auto-register TokenMetricsPlugin when metrics are enabled
if _OTEL_AVAILABLE and _METRICS_ENABLED:
try:
from mellea.plugins.registry import register
from mellea.telemetry.metrics_plugins import TokenMetricsPlugin

# Idempotent registration (supports module reloads in tests)
try:
register(TokenMetricsPlugin())
except ValueError as e:
# Already registered (expected during module reloads in tests)
warnings.warn(
f"TokenMetricsPlugin already registered: {e}", UserWarning, stacklevel=2
)
except ImportError:
warnings.warn(
"Metrics are enabled but the plugin framework is not installed. "
"Token usage metrics will not be recorded automatically. "
"Install with: pip install mellea[telemetry]",
UserWarning,
stacklevel=2,
)


__all__ = [
"create_counter",
"create_histogram",
Expand Down
Loading
Loading