From 69534b6908771764da66d4ca983247d3e373ac02 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 11 May 2026 18:39:27 +0530 Subject: [PATCH 01/17] Add token usage telemetry and dashboard for LLM requests - Implemented TokenUsageAccumulator to track per-request, per-agent, and per-model token usage. - Emitted custom events to Azure Application Insights for monitoring. - Created KQL queries for visualizing token usage metrics in Application Insights. - Developed a workbook for easy access to token usage insights. - Updated orchestrator to integrate token usage tracking during message processing and response handling. --- docs/TokenUsageTelemetry.md | 103 ++++++ infra/dashboards/token-usage-queries.kql | 218 ++++++++++++ infra/dashboards/token-usage-workbook.json | 160 +++++++++ src/backend/orchestrator.py | 130 +++++++ src/backend/token_usage.py | 396 +++++++++++++++++++++ 5 files changed, 1007 insertions(+) create mode 100644 docs/TokenUsageTelemetry.md create mode 100644 infra/dashboards/token-usage-queries.kql create mode 100644 infra/dashboards/token-usage-workbook.json create mode 100644 src/backend/token_usage.py diff --git a/docs/TokenUsageTelemetry.md b/docs/TokenUsageTelemetry.md new file mode 100644 index 000000000..f48f62c54 --- /dev/null +++ b/docs/TokenUsageTelemetry.md @@ -0,0 +1,103 @@ +# Token Usage Telemetry & Dashboard + +The Content Generation backend emits **per-request, per-agent, and per-model** +LLM token-usage metrics to **Azure Application Insights** as custom events. +This page describes what is emitted, how to enable it, and how to visualize it. 
+ +## What is emitted + +Three types of custom events are emitted for every request that consumes LLM tokens +(see `src/backend/token_usage.py`): + +| Event | When | Custom dimensions | +|---|---|---| +| `LLM_Token_Usage_Summary` | Once per request | `total_input_tokens`, `total_output_tokens`, `total_tokens`, `agent_count`, `model_count`, `user_id`, `conversation_id`, `source` | +| `LLM_Agent_Token_Usage` | Per agent that ran | `agent_name`, `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`, `user_id`, `conversation_id`, `source` | +| `LLM_Model_Token_Usage` | Per model deployment used | `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`, `user_id`, `conversation_id`, `source` | + +**Agents covered:** `triage_agent`, `planning_agent`, `research_agent`, +`text_content_agent`, `image_content_agent`, `compliance_agent`, `rai_agent`. + +**Sources** (carried in the `source` dimension): +- `process_message` — main HandoffBuilder workflow (`process_message:error` if the request fails) +- `send_user_response` — workflow continuations (`send_user_response:error` if the request fails) +- `parse_brief` — RAI + planning agent calls (`parse_brief:rai_blocked` when the RAI check rejects the brief) +- `generate_content` — text/image/compliance agent calls +- `regenerate_image` — direct-mode image agent +- `foundry_image_generation` — direct REST call to Azure OpenAI Image API + +> **Note:** All numeric values are stored as strings in `customDimensions` +> (App Insights requirement). Always cast with `tolong()` / `toint()` in KQL. + +## Enabling telemetry + +Set `APPLICATIONINSIGHTS_CONNECTION_STRING` in the backend environment. +Application Insights wiring is already configured in `src/backend/app.py` +via `configure_azure_monitor()`. If the env var is unset, telemetry calls +are no-ops — no events are sent, and the only remaining cost is in-process token counting. + +When deploying via `azd up`, the Bicep templates create an Application +Insights instance and pass the connection string to the App Service. 
+ +## Viewing the dashboard + +A ready-to-use KQL query pack lives at: + +``` +infra/dashboards/token-usage-queries.kql +``` + +It contains 12 queries: + +1. Overall token usage (last 24h) +2. Token usage by agent +3. Token usage by model deployment +4. Top users by token spend (last 7d) +5. Hourly trend (last 24h, time chart) +6. Per-agent daily trend (last 7d, time chart) +7. Per-model daily trend (last 7d, time chart) +8. Token usage by request source +9. Top conversations by token spend +10. Avg input/output token ratio per agent +11. Heaviest individual requests +12. OpenTelemetry-instrumented OpenAI dependency calls (cross-check) + +### Run a query + +1. Open the **Application Insights** resource in the Azure portal. +2. Go to **Monitoring → Logs**. +3. Paste any query from the file above and click **Run**. + +### Build a workbook + +1. Open **Application Insights → Workbooks → + New** (to start from the prebuilt template instead, paste `infra/dashboards/token-usage-workbook.json` into the workbook's **Advanced Editor**). +2. Add a **Query** step and paste a query from `token-usage-queries.kql`. +3. Pick a visualization (bar, time chart, table) and pin it to a dashboard. +4. Repeat for each query you want as a tile. +5. Save the workbook to make it reusable across the team. + +## Verifying locally + +After triggering a brief generation in a dev environment with a valid +`APPLICATIONINSIGHTS_CONNECTION_STRING`, custom events typically appear in +Application Insights within ~2 minutes: + +```kusto +customEvents +| where timestamp > ago(15m) +| where name startswith "LLM_" +| project timestamp, name, customDimensions +| order by timestamp desc +``` + +## Design notes + +- **Best-effort by design.** Every extraction and every emit call is wrapped + in `try/except`. Telemetry failures are logged at `DEBUG`/`WARNING` and + never break the user flow. +- **No PII.** Only `user_id` and `conversation_id` are included as + dimensions; no prompt or response text is sent. 
+- **Out of scope (intentional).** The current implementation does not persist + token totals to Cosmos DB and does not push real-time updates to the + frontend. Operators can add cost-estimation queries as needed, multiplying + token counts by their negotiated per-1K-token rates. diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql new file mode 100644 index 000000000..b1b4af4f4 --- /dev/null +++ b/infra/dashboards/token-usage-queries.kql @@ -0,0 +1,218 @@ +// ============================================================================= +// Token Usage Dashboard — Application Insights / Log Analytics KQL queries +// ============================================================================= +// Run these in: App Insights -> Logs for the Content Generation backend. In +// Log Analytics -> Logs the same events are in AppEvents (Properties, TimeGenerated). +// +// Custom events emitted by the backend (see src/backend/token_usage.py): +// * LLM_Token_Usage_Summary — one per request; aggregate totals +// * LLM_Agent_Token_Usage — one per agent that consumed tokens in the request +// * LLM_Model_Token_Usage — one per model deployment that was hit +// +// Common custom dimensions on every event: +// user_id, conversation_id, source +// Plus event-specific numeric dimensions stored as STRINGS — always cast with +// toint() / tolong() in KQL. +// ============================================================================= + + +// ----------------------------------------------------------------------------- +// 1. 
Overall token usage — last 24 hours +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(24h) +| where name == "LLM_Token_Usage_Summary" +| extend + input_tokens = tolong(customDimensions["total_input_tokens"]), + output_tokens = tolong(customDimensions["total_output_tokens"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| summarize + Requests = count(), + TotalInputTokens = sum(input_tokens), + TotalOutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + AvgTokensPerRequest = avg(total_tokens) + + +// ----------------------------------------------------------------------------- +// 2. Token usage by agent (last 24 hours) +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(24h) +| where name == "LLM_Agent_Token_Usage" +| extend + agent_name = tostring(customDimensions["agent_name"]), + input_tokens = tolong(customDimensions["input_tokens"]), + output_tokens = tolong(customDimensions["output_tokens"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| summarize + Calls = count(), + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens) + by agent_name +| order by TotalTokens desc + + +// ----------------------------------------------------------------------------- +// 3. 
Token usage by model deployment (last 24 hours) +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(24h) +| where name == "LLM_Model_Token_Usage" +| extend + model = tostring(customDimensions["model_deployment_name"]), + input_tokens = tolong(customDimensions["input_tokens"]), + output_tokens = tolong(customDimensions["output_tokens"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| summarize + Calls = count(), + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens) + by model +| order by TotalTokens desc + + +// ----------------------------------------------------------------------------- +// 4. Token usage by user (last 7 days) — top 50 +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(7d) +| where name == "LLM_Token_Usage_Summary" +| extend + user_id = tostring(customDimensions["user_id"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| where isnotempty(user_id) +| summarize + Requests = count(), + TotalTokens = sum(total_tokens) + by user_id +| top 50 by TotalTokens desc + + +// ----------------------------------------------------------------------------- +// 5. Token usage over time — hourly trend (last 24 hours) +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(24h) +| where name == "LLM_Token_Usage_Summary" +| extend total_tokens = tolong(customDimensions["total_tokens"]) +| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1h) +| order by timestamp asc +| render timechart + + +// ----------------------------------------------------------------------------- +// 6. 
Per-agent token trend (last 7 days) — daily +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(7d) +| where name == "LLM_Agent_Token_Usage" +| extend + agent_name = tostring(customDimensions["agent_name"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1d), agent_name +| order by timestamp asc +| render timechart + + +// ----------------------------------------------------------------------------- +// 7. Per-model token trend (last 7 days) — daily +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(7d) +| where name == "LLM_Model_Token_Usage" +| extend + model = tostring(customDimensions["model_deployment_name"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1d), model +| order by timestamp asc +| render timechart + + +// ----------------------------------------------------------------------------- +// 8. Token usage by request source (process_message / send_user_response / +// parse_brief / generate_content / regenerate_image / foundry_image_generation) +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(24h) +| where name == "LLM_Token_Usage_Summary" +| extend + source = tostring(customDimensions["source"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| summarize + Requests = count(), + TotalTokens = sum(total_tokens) + by source +| order by TotalTokens desc + + +// ----------------------------------------------------------------------------- +// 9. 
Top conversations by token spend (last 7 days) +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(7d) +| where name == "LLM_Token_Usage_Summary" +| extend + conversation_id = tostring(customDimensions["conversation_id"]), + total_tokens = tolong(customDimensions["total_tokens"]) +| where isnotempty(conversation_id) +| summarize + Requests = count(), + TotalTokens = sum(total_tokens) + by conversation_id +| top 25 by TotalTokens desc + + +// ----------------------------------------------------------------------------- +// 10. Avg input vs output token ratio per agent (last 7 days) +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(7d) +| where name == "LLM_Agent_Token_Usage" +| extend + agent_name = tostring(customDimensions["agent_name"]), + input_tokens = tolong(customDimensions["input_tokens"]), + output_tokens = tolong(customDimensions["output_tokens"]) +| summarize + AvgInput = avg(input_tokens), + AvgOutput = avg(output_tokens), + InputOutputRatio = round(todouble(sum(input_tokens)) / todouble(iif(sum(output_tokens) == 0, 1, sum(output_tokens))), 2) + by agent_name +| order by InputOutputRatio desc + + +// ----------------------------------------------------------------------------- +// 11. 
Heaviest individual requests (last 24 hours) — top 25 +// ----------------------------------------------------------------------------- +customEvents +| where timestamp > ago(24h) +| where name == "LLM_Token_Usage_Summary" +| extend + conversation_id = tostring(customDimensions["conversation_id"]), + source = tostring(customDimensions["source"]), + total_tokens = tolong(customDimensions["total_tokens"]), + input_tokens = tolong(customDimensions["total_input_tokens"]), + output_tokens = tolong(customDimensions["total_output_tokens"]), + agent_count = toint(customDimensions["agent_count"]), + model_count = toint(customDimensions["model_count"]) +| project timestamp, conversation_id, source, input_tokens, output_tokens, total_tokens, agent_count, model_count +| top 25 by total_tokens desc + + +// ----------------------------------------------------------------------------- +// 12. OpenTelemetry auto-instrumented dependencies (Azure OpenAI calls) +// Useful as a cross-check against our custom events. Note that auto-instrumented +// data does NOT include token counts unless GenAI semantic-conv attributes are +// enabled in the OpenAI/Azure SDK. 
+// ----------------------------------------------------------------------------- +dependencies +| where timestamp > ago(24h) +| where target has "openai" or name has "chat" or name has "completions" +| summarize + Calls = count(), + AvgDurMs = avg(duration), + Failures = countif(success == false) + by name, target +| order by Calls desc diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json new file mode 100644 index 000000000..fe97e525f --- /dev/null +++ b/infra/dashboards/token-usage-workbook.json @@ -0,0 +1,160 @@ +{ + "version": "Notebook/1.0", + "items": [ + { + "type": 1, + "content": { + "json": "# Token Usage — Content Generation Solution Accelerator\n---\n\nThis workbook visualizes LLM token consumption emitted by the orchestrator as Application Insights custom events:\n\n- **`LLM_Token_Usage_Summary`** — one event per user turn / brief parse / regeneration. Carries `total_input_tokens`, `total_output_tokens`, `total_tokens`, `source`, `conversation_id`, `user_id`.\n- **`LLM_Agent_Token_Usage`** — one event per agent per turn. Carries `agent_name`, `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`.\n- **`LLM_Model_Token_Usage`** — one event per model deployment per turn. Carries `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`.\n\nUse the time-range selector at the top to widen or narrow the window." 
+ }, + "name": "header-markdown" + }, + { + "type": 9, + "content": { + "version": "KqlParameterItem/1.0", + "parameters": [ + { + "id": "time-range", + "version": "KqlParameterItem/1.0", + "name": "TimeRange", + "label": "Time range", + "type": 4, + "isRequired": true, + "value": { "durationMs": 604800000 }, + "typeSettings": { + "selectableValues": [ + { "durationMs": 3600000 }, + { "durationMs": 14400000 }, + { "durationMs": 43200000 }, + { "durationMs": 86400000 }, + { "durationMs": 172800000 }, + { "durationMs": 604800000 }, + { "durationMs": 2592000000 } + ] + } + } + ] + }, + "name": "parameters" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n TotalRequests = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerRequest = round(avg(total_tokens), 0)", + "size": 1, + "title": "Total token usage", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "name": "tile-total-usage", + "styleSettings": { "showBorder": true } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize TotalTokens = sum(total_tokens), InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\n| render timechart", + "size": 0, + "title": "Token usage over time", + "timeContextFromParameter": "TimeRange", + 
"queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "100", + "name": "chart-usage-over-time" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Agent = agent\n| order by TotalTokens desc", + "size": 0, + "title": "Token usage by agent", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "50", + "name": "table-by-agent", + "styleSettings": { "showBorder": true } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "let StageMapping = datatable(agent:string, Stage:string) [\n 'rai_agent', 'Safety & RAI',\n 'planning_agent', 'Brief Parsing',\n 'text_content_agent', 'Text Generation',\n 'image_content_agent', 'Image Generation',\n 'compliance_agent', 'Compliance'\n];\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| lookup kind=leftouter StageMapping on agent\n| extend Stage = iff(isempty(Stage), 'Other', Stage)\n| summarize\n TotalRequests = count(),\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerRequest = round(avg(total_tokens), 0)\n by Stage\n| order by TotalTokens desc", + "size": 0, + "title": "Token usage by pipeline 
stage", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "50", + "name": "table-by-stage", + "styleSettings": { "showBorder": true } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Model_Token_Usage'\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Model = model\n| order by TotalTokens desc", + "size": 0, + "title": "Token usage by model deployment", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "50", + "name": "table-by-model", + "styleSettings": { "showBorder": true } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend source = tostring(customDimensions['source'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n Requests = count(),\n TotalTokens = sum(total_tokens),\n AvgTokensPerRequest = round(avg(total_tokens), 0)\n by Source = source\n| order by TotalTokens desc", + "size": 0, + "title": "Token usage by source (entry point)", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "50", + "name": "table-by-source", + "styleSettings": { "showBorder": true } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend user_id = tostring(customDimensions['user_id'])\n| extend total_tokens = 
toint(customDimensions['total_tokens'])\n| where isnotempty(user_id)\n| summarize Requests = count(), TotalTokens = sum(total_tokens) by User = user_id\n| order by TotalTokens desc\n| take 25", + "size": 0, + "title": "Top users by token consumption", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "50", + "name": "table-top-users", + "styleSettings": { "showBorder": true } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1h), agent\n| render timechart", + "size": 0, + "title": "Per-agent token usage over time", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "100", + "name": "chart-per-agent-over-time" + } + ], + "isLocked": false, + "fallbackResourceIds": [] +} diff --git a/src/backend/orchestrator.py b/src/backend/orchestrator.py index b150f00cf..0a39739f4 100644 --- a/src/backend/orchestrator.py +++ b/src/backend/orchestrator.py @@ -42,6 +42,7 @@ from models import CreativeBrief from settings import app_settings +from token_usage import TokenUsageAccumulator logger = logging.getLogger(__name__) @@ -506,6 +507,11 @@ def __init__(self): self._initialized = False self._use_foundry = app_settings.ai_foundry.use_foundry self._credential = None + # agent_name -> deployment name, populated in initialize(). + # Used to attach a model dimension to LLM_*_Token_Usage events. 
+ self._agent_model_map: dict[str, str] = {} + self._default_model: str = "" + self._image_model: str = "" def _get_chat_client(self): """Get or create the chat client (Azure OpenAI or Foundry).""" @@ -680,8 +686,42 @@ def initialize(self) -> None: ) self._initialized = True + + # Build the agent_name -> model deployment map used for token-usage + # telemetry. All chat agents share the same chat client deployment. + chat_model = ( + app_settings.ai_foundry.model_deployment + if self._use_foundry + else app_settings.azure_openai.gpt_model + ) or app_settings.azure_openai.gpt_model or "" + image_model = ( + app_settings.ai_foundry.image_deployment + if self._use_foundry + else app_settings.azure_openai.image_model + ) or app_settings.azure_openai.image_model or "" + self._default_model = chat_model + self._image_model = image_model + self._agent_model_map = { + f"triage{name_sep}agent": chat_model, + f"planning{name_sep}agent": chat_model, + f"research{name_sep}agent": chat_model, + f"text{name_sep}content{name_sep}agent": chat_model, + f"image{name_sep}content{name_sep}agent": chat_model, + f"compliance{name_sep}agent": chat_model, + f"rai{name_sep}agent": chat_model, + } + logger.info(f"Content Generation Orchestrator initialized successfully ({mode_str} mode)") + def _new_token_accumulator(self, conversation_id: str = "") -> TokenUsageAccumulator: + """Create a TokenUsageAccumulator pre-populated with this orchestrator's + agent->model map and default chat model. Telemetry is best-effort.""" + return TokenUsageAccumulator( + conversation_id=conversation_id, + agent_model_map=self._agent_model_map, + default_model=self._default_model, + ) + async def process_message( self, message: str, @@ -730,11 +770,20 @@ async def process_message( full_input = f"Context:\n{json.dumps(context, indent=2)}\n\nUser Message:\n{message}" try: + # Per-request token usage accumulator for App Insights telemetry. 
+ token_acc = self._new_token_accumulator(conversation_id) + # Collect events from the workflow stream events = [] async for event in self._workflow.run_stream(full_input): events.append(event) + # Best-effort token-usage capture; never break the user flow. + try: + token_acc.record_event(event) + except Exception as _tu_err: + logger.debug("token_usage record_event failed: %s", _tu_err) + # Handle different event types from the workflow if isinstance(event, WorkflowStatusEvent): yield { @@ -794,8 +843,18 @@ async def process_message( "metadata": {"conversation_id": conversation_id} } + # Emit aggregated LLM_*_Token_Usage events for the request. + try: + token_acc.flush(source="process_message") + except Exception as _tu_err: + logger.debug("token_usage flush failed: %s", _tu_err) + except Exception as e: logger.exception(f"Error processing message: {e}") + try: + token_acc.flush(source="process_message:error") + except Exception: + pass yield { "type": "error", "content": f"An error occurred: {str(e)}", @@ -839,8 +898,15 @@ async def send_user_response( return # Exit immediately - do not continue workflow try: + token_acc = self._new_token_accumulator(conversation_id) + responses = {request_id: user_response} async for event in self._workflow.send_responses_streaming(responses): + try: + token_acc.record_event(event) + except Exception as _tu_err: + logger.debug("token_usage record_event failed: %s", _tu_err) + if isinstance(event, WorkflowStatusEvent): yield { "type": "status", @@ -890,8 +956,17 @@ async def send_user_response( "metadata": {"conversation_id": conversation_id} } + try: + token_acc.flush(source="send_user_response") + except Exception as _tu_err: + logger.debug("token_usage flush failed: %s", _tu_err) + except Exception as e: logger.exception(f"Error sending user response: {e}") + try: + token_acc.flush(source="send_user_response:error") + except Exception: + pass yield { "type": "error", "content": f"An error occurred: {str(e)}", @@ -938,8 
+1013,13 @@ async def parse_brief( return empty_brief, RAI_HARMFUL_CONTENT_RESPONSE, True # SECONDARY RAI CHECK - Use LLM-based classifier for comprehensive safety/scope validation + token_acc = self._new_token_accumulator() try: rai_response = await self._rai_agent.run(brief_text) + try: + token_acc.record_response(agent_name="rai_agent", response=rai_response) + except Exception as _tu_err: + logger.debug("token_usage record (rai_agent) failed: %s", _tu_err) rai_result = str(rai_response).strip().upper() logger.info(f"RAI agent response for parse_brief: {rai_result}") @@ -956,6 +1036,10 @@ async def parse_brief( visual_guidelines="", cta="" ) + try: + token_acc.flush(source="parse_brief:rai_blocked") + except Exception: + pass return empty_brief, RAI_HARMFUL_CONTENT_RESPONSE, True except Exception as rai_error: # Log the error but continue - don't block legitimate requests due to RAI agent failures @@ -1009,6 +1093,14 @@ async def parse_brief( # Use the agent's run method response = await planning_agent.run(analysis_prompt) + try: + token_acc.record_response(agent_name="planning_agent", response=response) + except Exception as _tu_err: + logger.debug("token_usage record (planning_agent) failed: %s", _tu_err) + try: + token_acc.flush(source="parse_brief") + except Exception: + pass # Parse the analysis response try: @@ -1293,6 +1385,19 @@ async def _generate_foundry_image(self, image_prompt: str, results: dict) -> Non response_data = response.json() + # Capture token usage from image API response (gpt-image-1 returns + # a 'usage' field with input/output/total token counts). 
+ try: + img_acc = self._new_token_accumulator() + img_acc.record_image_api_response( + agent_name="image_content_agent", + response_json=response_data, + model=image_deployment or self._image_model, + ) + img_acc.flush(source="foundry_image_generation") + except Exception as _tu_err: + logger.debug("token_usage capture (foundry image) failed: %s", _tu_err) + # Extract image data from response data = response_data.get("data", []) if not data: @@ -1423,8 +1528,13 @@ async def generate_content( """ try: + token_acc = self._new_token_accumulator() # Generate text content text_response = await self._agents["text_content"].run(text_request) + try: + token_acc.record_response(agent_name="text_content_agent", response=text_response) + except Exception as _tu_err: + logger.debug("token_usage record (text_content_agent) failed: %s", _tu_err) results["text_content"] = str(text_response) # Generate image prompt if requested @@ -1504,6 +1614,10 @@ async def generate_content( else: # Direct mode: use image agent to create prompt, then generate via image generation model image_response = await self._agents["image_content"].run(image_request) + try: + token_acc.record_response(agent_name="image_content_agent", response=image_response) + except Exception as _tu_err: + logger.debug("token_usage record (image_content_agent) failed: %s", _tu_err) results["image_prompt"] = str(image_response) # Extract clean prompt from the response and generate actual image @@ -1574,6 +1688,10 @@ async def generate_content( Check against brand guidelines and flag any issues. 
""" compliance_response = await self._agents["compliance"].run(compliance_request) + try: + token_acc.record_response(agent_name="compliance_agent", response=compliance_response) + except Exception as _tu_err: + logger.debug("token_usage record (compliance_agent) failed: %s", _tu_err) results["compliance"] = str(compliance_response) # Try to parse compliance violations @@ -1608,6 +1726,12 @@ async def generate_content( logger.exception(f"Error generating content: {e}") results["error"] = str(e) + # Emit aggregated token usage events for the generate_content request. + try: + token_acc.flush(source="generate_content") + except Exception as _tu_err: + logger.debug("token_usage flush (generate_content) failed: %s", _tu_err) + # Log results summary before returning logger.info(f"Orchestrator returning results with keys: {list(results.keys())}") has_image = bool(results.get("image_base64")) @@ -1753,6 +1877,12 @@ async def regenerate_image( else: # Direct mode: use image agent to interpret the modification image_response = await self._agents["image_content"].run(modification_prompt) + try: + regen_acc = self._new_token_accumulator() + regen_acc.record_response(agent_name="image_content_agent", response=image_response) + regen_acc.flush(source="regenerate_image") + except Exception as _tu_err: + logger.debug("token_usage capture (regenerate_image) failed: %s", _tu_err) prompt_text = str(image_response) # Extract the prompt from JSON response diff --git a/src/backend/token_usage.py b/src/backend/token_usage.py new file mode 100644 index 000000000..6aab3d2e3 --- /dev/null +++ b/src/backend/token_usage.py @@ -0,0 +1,396 @@ +""" +Token usage tracking for the Content Generation orchestrator. + +Captures LLM token usage from Microsoft Agent Framework agent runs and workflow +streams (Azure OpenAI / Azure AI Foundry) and emits per-agent and per-model +custom events to Application Insights via ``event_utils.track_event_if_configured``. 
+ +Usage: + from token_usage import TokenUsageAccumulator, extract_usage_from_response + + acc = TokenUsageAccumulator(user_id="abc", conversation_id="xyz", + agent_model_map={"planning_agent": "gpt-5"}) + response = await agent.run(prompt) + acc.record_response(agent_name="planning_agent", response=response) + acc.flush() # emits LLM_*_Token_Usage events +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any, Iterable, Optional + +from event_utils import track_event_if_configured + +logger = logging.getLogger(__name__) + +# Custom Application Insights event names (shared with KQL dashboard queries). +EVENT_SUMMARY = "LLM_Token_Usage_Summary" +EVENT_AGENT = "LLM_Agent_Token_Usage" +EVENT_MODEL = "LLM_Model_Token_Usage" + + +@dataclass(slots=True) +class _Counts: + input_tokens: int = 0 + output_tokens: int = 0 + total_tokens: int = 0 + model_deployment_name: str = "" + + def add(self, inp: int, out: int, tot: int) -> None: + self.input_tokens += inp + self.output_tokens += out + self.total_tokens += tot + + +def _coerce_int(value: Any) -> int: + try: + if value is None: + return 0 + return int(value) + except (TypeError, ValueError): + return 0 + + +def _from_dict(d: dict) -> Optional[tuple[int, int, int]]: + """Pull (input, output, total) out of a usage-shaped dict. + + Handles both the Microsoft Agent Framework ``UsageDetails`` shape + (``input_token_count`` / ``output_token_count`` / ``total_token_count``) + and the OpenAI SDK shape (``prompt_tokens`` / ``completion_tokens`` / + ``total_tokens``). 
+ """ + inp = _coerce_int( + d.get("input_token_count") + or d.get("prompt_tokens") + or d.get("input_tokens") + ) + out = _coerce_int( + d.get("output_token_count") + or d.get("completion_tokens") + or d.get("output_tokens") + ) + tot = _coerce_int(d.get("total_token_count") or d.get("total_tokens")) or ( + inp + out + ) + if tot <= 0: + return None + return (inp, out, tot) + + +def _from_usage_details(details: Any) -> Optional[tuple[int, int, int]]: + """Extract counts from a ``UsageDetails`` object, dict, or similar.""" + if details is None: + return None + if isinstance(details, dict): + return _from_dict(details) + inp = _coerce_int( + getattr(details, "input_token_count", None) + or getattr(details, "prompt_tokens", None) + or getattr(details, "input_tokens", None) + ) + out = _coerce_int( + getattr(details, "output_token_count", None) + or getattr(details, "completion_tokens", None) + or getattr(details, "output_tokens", None) + ) + tot = _coerce_int( + getattr(details, "total_token_count", None) + or getattr(details, "total_tokens", None) + ) or (inp + out) + if tot <= 0: + return None + return (inp, out, tot) + + +def _scan_contents(contents: Optional[Iterable]) -> Optional[tuple[int, int, int]]: + """Look for ``UsageContent`` entries in a contents list.""" + if not contents: + return None + for item in contents: + # Framework UsageContent: has .details (UsageDetails) + details = getattr(item, "details", None) + if details is not None: + result = _from_usage_details(details) + if result: + return result + # Some shapes expose .usage_details directly + usage_details = getattr(item, "usage_details", None) + if usage_details is not None: + result = _from_usage_details(usage_details) + if result: + return result + # Plain dict content + if isinstance(item, dict): + if isinstance(item.get("details"), dict): + result = _from_dict(item["details"]) + if result: + return result + if isinstance(item.get("usage_details"), dict): + result = 
_from_dict(item["usage_details"]) + if result: + return result + return None + + +def extract_usage_from_response(response: Any) -> Optional[tuple[int, int, int]]: + """Extract ``(input, output, total)`` token counts from an ``AgentResponse``. + + Checks (in order): + 1. ``response.usage_details`` + 2. ``response.messages[*].contents[*]`` for ``UsageContent`` items + 3. ``response.raw_representation.usage`` (OpenAI SDK fallback) + Returns ``None`` if no usage information is present. + """ + if response is None: + return None + + result = _from_usage_details(getattr(response, "usage_details", None)) + if result: + return result + + messages = getattr(response, "messages", None) or [] + for msg in messages: + result = _scan_contents(getattr(msg, "contents", None)) + if result: + return result + + raw = getattr(response, "raw_representation", None) + if raw is not None: + usage = getattr(raw, "usage", None) or ( + raw.get("usage") if isinstance(raw, dict) else None + ) + if usage is not None: + result = _from_usage_details(usage) + if result: + return result + return None + + +def extract_usage_from_update(update: Any) -> Optional[tuple[int, int, int]]: + """Extract token counts from a streaming ``AgentResponseUpdate``.""" + if update is None: + return None + + result = _scan_contents(getattr(update, "contents", None)) + if result: + return result + + raw = getattr(update, "raw_representation", None) + if raw is not None: + usage = getattr(raw, "usage", None) or ( + raw.get("usage") if isinstance(raw, dict) else None + ) + if usage is not None: + result = _from_usage_details(usage) + if result: + return result + return None + + +def extract_usage_from_event(event: Any) -> tuple[Optional[str], Optional[tuple[int, int, int]]]: + """Extract ``(executor_id, usage_tuple)`` from a workflow stream event. + + Used while iterating ``workflow.run_stream(...)``: returns the executor / + agent name plus the usage tuple when present, or ``(None, None)`` for + unrelated events. 
+ """ + if event is None: + return (None, None) + + executor_id = getattr(event, "executor_id", None) + data = getattr(event, "data", None) + if data is None: + return (executor_id, None) + + # AgentRunUpdateEvent → data is AgentResponseUpdate + usage = extract_usage_from_update(data) + if usage: + return (executor_id, usage) + + # AgentRunEvent → data is AgentResponse + usage = extract_usage_from_response(data) + if usage: + return (executor_id, usage) + + return (executor_id, None) + + +class TokenUsageAccumulator: + """Accumulates per-agent and per-model token usage for a single request. + + Call ``record_*`` as agent invocations complete, then ``flush()`` once at + the end of the request to emit Application Insights custom events. + Telemetry failures are logged but never raised — never break the user + flow on a telemetry error. + """ + + __slots__ = ( + "user_id", + "conversation_id", + "agent_model_map", + "default_model", + "by_agent", + "by_model", + "totals", + ) + + def __init__( + self, + *, + user_id: str = "", + conversation_id: str = "", + agent_model_map: Optional[dict[str, str]] = None, + default_model: str = "", + ) -> None: + self.user_id = user_id or "" + self.conversation_id = conversation_id or "" + self.agent_model_map: dict[str, str] = dict(agent_model_map or {}) + self.default_model = default_model or "" + self.by_agent: dict[str, _Counts] = {} + self.by_model: dict[str, _Counts] = {} + self.totals: _Counts = _Counts() + + def _resolve_model(self, agent_name: str) -> str: + return ( + self.agent_model_map.get(agent_name) + or self.agent_model_map.get(agent_name or "", "") + or self.default_model + ) + + def record(self, agent_name: str, usage: Optional[tuple[int, int, int]]) -> None: + """Record an extracted usage tuple for the named agent (no-op if None/zero).""" + if not usage: + return + inp, out, tot = usage + if tot <= 0: + return + agent = agent_name or "unknown_agent" + model = self._resolve_model(agent) + + agent_counts = 
self.by_agent.setdefault( + agent, _Counts(model_deployment_name=model) + ) + if not agent_counts.model_deployment_name and model: + agent_counts.model_deployment_name = model + agent_counts.add(inp, out, tot) + + if model: + self.by_model.setdefault(model, _Counts()).add(inp, out, tot) + + self.totals.add(inp, out, tot) + + def record_response(self, *, agent_name: str, response: Any) -> bool: + """Extract usage from an ``AgentResponse`` and record it. Returns True on success.""" + usage = extract_usage_from_response(response) + if usage: + self.record(agent_name, usage) + return True + return False + + def record_update(self, *, executor_id: str, update: Any) -> bool: + """Extract usage from an ``AgentResponseUpdate`` and record it.""" + usage = extract_usage_from_update(update) + if usage: + self.record(executor_id, usage) + return True + return False + + def record_event(self, event: Any) -> bool: + """Extract usage from a workflow ``run_stream`` event and record it.""" + executor_id, usage = extract_usage_from_event(event) + if usage and executor_id: + self.record(executor_id, usage) + return True + return False + + def record_image_api_response( + self, *, agent_name: str, response_json: Optional[dict], model: str = "" + ) -> bool: + """Record token usage from an image-generation REST response (OpenAI shape).""" + if not isinstance(response_json, dict): + return False + usage = response_json.get("usage") + if not isinstance(usage, dict): + return False + if model and agent_name not in self.agent_model_map: + self.agent_model_map[agent_name] = model + result = _from_dict(usage) + if result: + self.record(agent_name, result) + return True + return False + + def has_data(self) -> bool: + return self.totals.total_tokens > 0 + + def flush(self, *, source: str = "") -> None: + """Emit aggregated events to Application Insights. 
Safe to call once per request.""" + if not self.has_data(): + return + + base_dims = { + "user_id": self.user_id, + "conversation_id": self.conversation_id, + "source": source, + } + + try: + track_event_if_configured( + EVENT_SUMMARY, + { + **base_dims, + "total_input_tokens": str(self.totals.input_tokens), + "total_output_tokens": str(self.totals.output_tokens), + "total_tokens": str(self.totals.total_tokens), + "agent_count": str(len(self.by_agent)), + "model_count": str(len(self.by_model)), + }, + ) + except Exception as e: + logger.warning("Failed to emit %s: %s", EVENT_SUMMARY, e) + + for agent_name, c in self.by_agent.items(): + try: + track_event_if_configured( + EVENT_AGENT, + { + **base_dims, + "agent_name": agent_name, + "model_deployment_name": c.model_deployment_name or self.default_model, + "input_tokens": str(c.input_tokens), + "output_tokens": str(c.output_tokens), + "total_tokens": str(c.total_tokens), + }, + ) + except Exception as e: + logger.warning("Failed to emit %s for %s: %s", EVENT_AGENT, agent_name, e) + + for model_name, c in self.by_model.items(): + try: + track_event_if_configured( + EVENT_MODEL, + { + **base_dims, + "model_deployment_name": model_name, + "input_tokens": str(c.input_tokens), + "output_tokens": str(c.output_tokens), + "total_tokens": str(c.total_tokens), + }, + ) + except Exception as e: + logger.warning("Failed to emit %s for %s: %s", EVENT_MODEL, model_name, e) + + logger.info( + "[TOKEN USAGE] source=%s user=%s conv=%s total=%d (in=%d, out=%d) " + "agents=%s models=%s", + source, + self.user_id, + self.conversation_id, + self.totals.total_tokens, + self.totals.input_tokens, + self.totals.output_tokens, + {k: v.total_tokens for k, v in self.by_agent.items()}, + {k: v.total_tokens for k, v in self.by_model.items()}, + ) From ff168de33d77dbe2875c047dad55c2dd151285c9 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Tue, 12 May 2026 16:02:04 +0530 Subject: [PATCH 02/17] feat: add Token Usage Application Insights 
workbook for LLM monitoring --- infra/main.bicep | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/infra/main.bicep b/infra/main.bicep index 221d2ef6a..d3b83c4a5 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -360,6 +360,25 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = if (en } } +// ========== Token Usage Workbook ========== // +// Provisions the "Token Usage" Application Insights workbook that visualises +// LLM_Token_Usage_Summary / LLM_Agent_Token_Usage / LLM_Model_Token_Usage +// custom events emitted by the orchestrator. +// Template lives in infra/dashboards/token-usage-workbook.json. +resource tokenUsageWorkbook 'Microsoft.Insights/workbooks@2023-06-01' = if (enableMonitoring) { + name: guid(resourceGroup().id, applicationInsightsResourceName, 'token-usage') + location: solutionLocation + tags: tags + kind: 'shared' + properties: { + displayName: 'Token Usage' + category: 'workbook' + sourceId: applicationInsights!.outputs.resourceId + version: 'Notebook/1.0' + serializedData: loadTextContent('dashboards/token-usage-workbook.json') + } +} + // ========== User Assigned Identity ========== // var userAssignedIdentityResourceName = 'id-${solutionSuffix}' module userAssignedIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.5.0' = { From db73a16caf5402869019f8c8416182560abfea1d Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Tue, 12 May 2026 17:57:28 +0530 Subject: [PATCH 03/17] feat: add monitoring configuration hash to container instance for dynamic tagging --- infra/main.bicep | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/infra/main.bicep b/infra/main.bicep index d3b83c4a5..c02bc60c6 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -1021,12 +1021,24 @@ module webSite 'modules/web-sites.bicep' = { // ========== Container Instance (Backend API) ========== // var containerInstanceName = 'aci-${solutionSuffix}' +// Hash that changes 
whenever the monitoring config (enableMonitoring + connection string) changes. +// Used as an ACI tag so that toggling enableMonitoring (or rotating the App Insights component) +// forces ARM to detect drift on the container group, triggering a restart and re-applying env vars +// like APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag. +var monitoringConfigHash = uniqueString( + string(enableMonitoring), + enableMonitoring ? applicationInsights!.outputs.connectionString : 'monitoring-disabled' +) + module containerInstance 'modules/container-instance.bicep' = { name: take('module.container-instance.${containerInstanceName}', 64) params: { name: containerInstanceName location: solutionLocation - tags: tags + tags: union(tags, { + 'monitoring-enabled': string(enableMonitoring) + 'monitoring-config-hash': monitoringConfigHash + }) containerImage: '${acrResourceName}.azurecr.io/content-gen-api:${imageTag}' cpu: 2 memoryInGB: 4 From 5f5737b93579cc73c16e4e8f4ce8583a38e23cfc Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Tue, 12 May 2026 18:04:40 +0530 Subject: [PATCH 04/17] sync main_custom.bicep with main.bicep --- infra/main_custom.bicep | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/infra/main_custom.bicep b/infra/main_custom.bicep index 99713345c..34b39f0f0 100644 --- a/infra/main_custom.bicep +++ b/infra/main_custom.bicep @@ -369,6 +369,25 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = if (en } } +// ========== Token Usage Workbook ========== // +// Provisions the "Token Usage" Application Insights workbook that visualises +// LLM_Token_Usage_Summary / LLM_Agent_Token_Usage / LLM_Model_Token_Usage +// custom events emitted by the orchestrator. +// Template lives in infra/dashboards/token-usage-workbook.json. 
+resource tokenUsageWorkbook 'Microsoft.Insights/workbooks@2023-06-01' = if (enableMonitoring) { + name: guid(resourceGroup().id, applicationInsightsResourceName, 'token-usage') + location: solutionLocation + tags: tags + kind: 'shared' + properties: { + displayName: 'Token Usage' + category: 'workbook' + sourceId: applicationInsights!.outputs.resourceId + version: 'Notebook/1.0' + serializedData: loadTextContent('dashboards/token-usage-workbook.json') + } +} + // ========== User Assigned Identity ========== // var userAssignedIdentityResourceName = 'id-${solutionSuffix}' module userAssignedIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.5.0' = { @@ -1068,10 +1087,20 @@ resource aciTelemetry 'Microsoft.Resources/deployments@2025-04-01' = if (enableT } } +// Hash that changes whenever the monitoring config is toggled. +// Used as an ACI tag so that toggling enableMonitoring forces ARM to detect drift on the +// container group, triggering a restart and re-applying env vars like +// APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag, +// and tags must be calculatable at deployment-start (no runtime references allowed). 
+var monitoringConfigHash = uniqueString(string(enableMonitoring)) + resource containerInstance 'Microsoft.ContainerInstance/containerGroups@2025-09-01' = if (shouldDeployACI) { name: containerInstanceName location: solutionLocation - tags: tags + tags: union(tags, { + 'monitoring-enabled': string(enableMonitoring) + 'monitoring-config-hash': monitoringConfigHash + }) identity: { type: 'UserAssigned' userAssignedIdentities: { From 52a76174d956e8ecba8a4be8431b7906787c2c34 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Thu, 14 May 2026 16:22:21 +0530 Subject: [PATCH 05/17] feat: separate Token Usage Application Insights workbook deployment into its own template --- infra/main.bicep | 20 ++----------- infra/main_custom.bicep | 20 ++----------- infra/workbook/workbook.bicep | 53 +++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 34 deletions(-) create mode 100644 infra/workbook/workbook.bicep diff --git a/infra/main.bicep b/infra/main.bicep index c02bc60c6..40da9f64b 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -361,23 +361,9 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = if (en } // ========== Token Usage Workbook ========== // -// Provisions the "Token Usage" Application Insights workbook that visualises -// LLM_Token_Usage_Summary / LLM_Agent_Token_Usage / LLM_Model_Token_Usage -// custom events emitted by the orchestrator. -// Template lives in infra/dashboards/token-usage-workbook.json. 
-resource tokenUsageWorkbook 'Microsoft.Insights/workbooks@2023-06-01' = if (enableMonitoring) { - name: guid(resourceGroup().id, applicationInsightsResourceName, 'token-usage') - location: solutionLocation - tags: tags - kind: 'shared' - properties: { - displayName: 'Token Usage' - category: 'workbook' - sourceId: applicationInsights!.outputs.resourceId - version: 'Notebook/1.0' - serializedData: loadTextContent('dashboards/token-usage-workbook.json') - } -} +// The "Token Usage" Application Insights workbook is now deployed separately +// via infra/workbook/workbook.bicep so it can target an Application Insights +// resource in any resource group / subscription. See infra/workbook/README.md. // ========== User Assigned Identity ========== // var userAssignedIdentityResourceName = 'id-${solutionSuffix}' diff --git a/infra/main_custom.bicep b/infra/main_custom.bicep index 34b39f0f0..a59c9ccfb 100644 --- a/infra/main_custom.bicep +++ b/infra/main_custom.bicep @@ -370,23 +370,9 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = if (en } // ========== Token Usage Workbook ========== // -// Provisions the "Token Usage" Application Insights workbook that visualises -// LLM_Token_Usage_Summary / LLM_Agent_Token_Usage / LLM_Model_Token_Usage -// custom events emitted by the orchestrator. -// Template lives in infra/dashboards/token-usage-workbook.json. 
-resource tokenUsageWorkbook 'Microsoft.Insights/workbooks@2023-06-01' = if (enableMonitoring) { - name: guid(resourceGroup().id, applicationInsightsResourceName, 'token-usage') - location: solutionLocation - tags: tags - kind: 'shared' - properties: { - displayName: 'Token Usage' - category: 'workbook' - sourceId: applicationInsights!.outputs.resourceId - version: 'Notebook/1.0' - serializedData: loadTextContent('dashboards/token-usage-workbook.json') - } -} +// The "Token Usage" Application Insights workbook is now deployed separately +// via infra/workbook/workbook.bicep so it can target an Application Insights +// resource in any resource group / subscription. See infra/workbook/README.md. // ========== User Assigned Identity ========== // var userAssignedIdentityResourceName = 'id-${solutionSuffix}' diff --git a/infra/workbook/workbook.bicep b/infra/workbook/workbook.bicep new file mode 100644 index 000000000..a59f8e72d --- /dev/null +++ b/infra/workbook/workbook.bicep @@ -0,0 +1,53 @@ +// ============================================================================ +// Token Usage Workbook (standalone deployment) +// ---------------------------------------------------------------------------- +// Provisions the "Token Usage" Application Insights workbook that visualises +// LLM_Token_Usage_Summary / LLM_Agent_Token_Usage / LLM_Model_Token_Usage +// custom events emitted by the Content Generation Solution Accelerator +// orchestrator. +// +// This template is deployed independently of the main solution so it can +// target an existing Application Insights instance that lives in a different +// resource group (or subscription) from the rest of the accelerator. +// +// Scope: resourceGroup (the workbook resource is created in the resource +// group passed to the deployment command - it does NOT need to be the same +// resource group as the Application Insights instance). 
+// ============================================================================ + +targetScope = 'resourceGroup' + +@description('Optional. Full resource ID of the Application Insights instance the workbook should query. Leave as the default ("Azure Monitor") to deploy an unbound workbook and pick the App Insights instance later from the Azure portal. Re-deploy with a real resource ID to (re)bind the workbook to a specific App Insights instance.') +param applicationInsightsResourceId string = 'Azure Monitor' + +@description('Optional. Stable name used for the workbook. Keep the default to allow re-deployments to update the SAME workbook even when applicationInsightsResourceId changes. Override only if you want multiple independent copies in the same resource group.') +param workbookName string = guid(resourceGroup().id, 'token-usage-workbook') + +@description('Optional. Azure region for the workbook resource. Defaults to the resource group location.') +param location string = resourceGroup().location + +@description('Optional. Display name shown in the Azure portal workbook gallery.') +param displayName string = 'Token Usage' + +@description('Optional. 
Tags applied to the workbook resource.') +param tags object = {} + +resource tokenUsageWorkbook 'Microsoft.Insights/workbooks@2023-06-01' = { + name: workbookName + location: location + tags: tags + kind: 'shared' + properties: { + displayName: displayName + category: 'workbook' + sourceId: applicationInsightsResourceId + version: 'Notebook/1.0' + serializedData: loadTextContent('../dashboards/token-usage-workbook.json') + } +} + +@description('Resource ID of the deployed workbook.') +output workbookResourceId string = tokenUsageWorkbook.id + +@description('Name (GUID) of the deployed workbook.') +output workbookName string = tokenUsageWorkbook.name From cebc62e7d2a7f16f74f8ad8c399906baef2c7fc1 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Tue, 19 May 2026 18:50:59 +0530 Subject: [PATCH 06/17] Refactor code structure for improved readability and maintainability --- azure.yaml | 343 ++- infra/dashboards/token-usage-workbook.json | 6 +- infra/main.bicep | 278 +- infra/main.json | 2 +- infra/main_custom.bicep | 2 +- infra/monitoring/README.md | 91 + infra/monitoring/monitoring.bicep | 97 + infra/workbook/README.md | 109 + src/.coverage | Bin 0 -> 53248 bytes src/backend/app.py | 17 +- src/backend/orchestrator.py | 92 +- src/coverage.xml | 2850 ++++++++++++++++++++ 12 files changed, 3675 insertions(+), 212 deletions(-) create mode 100644 infra/monitoring/README.md create mode 100644 infra/monitoring/monitoring.bicep create mode 100644 infra/workbook/README.md create mode 100644 src/.coverage create mode 100644 src/coverage.xml diff --git a/azure.yaml b/azure.yaml index 23f3ced27..8ba99fac6 100644 --- a/azure.yaml +++ b/azure.yaml @@ -1,7 +1,3 @@ -environment: - name: content-generation - location: eastus - name: content-generation metadata: template: content-generation@1.22 @@ -9,144 +5,281 @@ metadata: requiredVersions: azd: '>= 1.18.0 != 1.23.9' -parameters: - solutionPrefix: - type: string - default: contentgen - displayName: Solution Prefix - description: A 
unique prefix for all resources (3-15 chars) - azureAiServiceLocation: - type: string - default: eastus - displayName: AI Services Location - description: Location for Azure AI Services deployments - enableMonitoring: - type: boolean - default: false - displayName: Enable Monitoring (WAF) - description: Enable Log Analytics and Application Insights - enableScalability: - type: boolean - default: false - displayName: Enable Scalability (WAF) - description: Enable auto-scaling and higher SKUs - enableRedundancy: - type: boolean - default: false - displayName: Enable Redundancy (WAF) - description: Enable zone redundancy and geo-replication - enablePrivateNetworking: - type: boolean - default: false - displayName: Enable Private Networking (WAF) - description: Enable VNet integration and private endpoints - infra: - provider: bicep path: ./infra module: main -workflows: - up: - steps: - - azd: provision +services: + frontend: + project: ./src/App/server + language: js + host: appservice + dist: ./dist + resourceName: ${APP_SERVICE_NAME} + hooks: + prepackage: + windows: + shell: pwsh + run: ../../../infra/scripts/package_frontend.ps1 + continueOnError: false + posix: + shell: sh + run: chmod +x ../../../infra/scripts/package_frontend.sh && ../../../infra/scripts/package_frontend.sh + continueOnError: false hooks: + preprovision: + windows: + shell: pwsh + run: | + Write-Host "Preparing deployment..." 
-ForegroundColor Cyan + + # Check if image exists in the ACR (handles fresh deploy AND deployment mode switch) + $acrName = azd env get-value AZURE_ENV_CONTAINER_REGISTRY_NAME 2>$null + $global:LASTEXITCODE = 0 + $skipAci = $false + + if (-not $acrName) { + Write-Host "First deployment - ACI will be deployed after image build" -ForegroundColor Yellow + $skipAci = $true + } elseif ($acrName -eq 'contentgencontainerreg') { + # Switching from standard (shared ACR) to custom (own ACR) deployment + # Clear ACI name so postprovision deploys fresh with new ACR + Write-Host "Switching to custom deployment - ACI will be redeployed with new ACR" -ForegroundColor Yellow + azd env set CONTAINER_INSTANCE_NAME "" + $skipAci = $true + } else { + # Custom ACR exists - check if image is present + Write-Host "Checking for existing image in $acrName..." -ForegroundColor Cyan + $imageCheck = az acr repository show --name $acrName --image "content-gen-api:latest" 2>$null + $global:LASTEXITCODE = 0 + + if (-not $imageCheck) { + Write-Host "Image not found in ACR - will build in postprovision" -ForegroundColor Yellow + $skipAci = $true + } else { + Write-Host "Image found - ACI deployment will proceed" -ForegroundColor Green + } + } + + if ($skipAci) { + azd env set AZURE_ENV_IMAGE_TAG none + } + continueOnError: false + posix: + shell: sh + run: | + echo "Preparing deployment..." 
+ + # Check if image exists in the ACR (handles fresh deploy AND deployment mode switch) + ACR_NAME=$(azd env get-value AZURE_ENV_CONTAINER_REGISTRY_NAME 2>/dev/null || echo "") + SKIP_ACI=false + + if [ -z "$ACR_NAME" ]; then + echo "First deployment - ACI will be deployed after image build" + SKIP_ACI=true + elif [ "$ACR_NAME" = "contentgencontainerreg" ]; then + # Switching from standard (shared ACR) to custom (own ACR) deployment + # Clear ACI name so postprovision deploys fresh with new ACR + echo "Switching to custom deployment - ACI will be redeployed with new ACR" + azd env set CONTAINER_INSTANCE_NAME "" + SKIP_ACI=true + else + # Custom ACR exists - check if image is present + echo "Checking for existing image in $ACR_NAME..." + if az acr repository show --name "$ACR_NAME" --image "content-gen-api:latest" >/dev/null 2>&1; then + echo "Image found - ACI deployment will proceed" + else + echo "Image not found in ACR - will build in postprovision" + SKIP_ACI=true + fi + fi + + if [ "$SKIP_ACI" = "true" ]; then + azd env set AZURE_ENV_IMAGE_TAG none + fi + continueOnError: false + postprovision: windows: + shell: pwsh run: | - Write-Host "===== Provision Complete =====" -ForegroundColor Green + $acrName = $env:AZURE_ENV_CONTAINER_REGISTRY_NAME + $resourceGroup = $env:RESOURCE_GROUP_NAME + $backendImage = $env:BACKEND_IMAGE_NAME + $appServiceName = $env:APP_SERVICE_NAME + + if (-not $acrName -or -not $resourceGroup -or -not $appServiceName) { + Write-Host "ERROR: Missing required environment variables" -ForegroundColor Red + exit 1 + } + + # Check if ACI already exists (reads from persisted azd env) + $aciName = azd env get-value CONTAINER_INSTANCE_NAME 2>$null + $global:LASTEXITCODE = 0 + + # ===== Build Backend Image (ACR Build) ===== Write-Host "" - Write-Host "Web App URL: " -NoNewline - Write-Host "$env:WEB_APP_URL" -ForegroundColor Cyan - Write-Host "Storage Account: " -NoNewline - Write-Host "$env:AZURE_BLOB_ACCOUNT_NAME" -ForegroundColor Cyan - 
Write-Host "AI Search Service: " -NoNewline - Write-Host "$env:AI_SEARCH_SERVICE_NAME" -ForegroundColor Cyan - Write-Host "AI Search Index: " -NoNewline - Write-Host "$env:AZURE_AI_SEARCH_PRODUCTS_INDEX" -ForegroundColor Cyan - Write-Host "AI Service Location: " -NoNewline - Write-Host "$env:AZURE_ENV_AI_SERVICE_LOCATION" -ForegroundColor Cyan - Write-Host "Container Instance: " -NoNewline - Write-Host "$env:CONTAINER_INSTANCE_NAME" -ForegroundColor Cyan - - # Run post-deploy script to upload sample data and create search index + Write-Host "===== Building Backend Image =====" -ForegroundColor Yellow + Write-Host "Registry: $acrName" -ForegroundColor Cyan + Write-Host "Image: ${backendImage}:latest" -ForegroundColor Cyan + + az acr login --name $acrName 2>$null + az acr build --registry $acrName --image "${backendImage}:latest" --file ./src/backend/ApiApp.Dockerfile ./src/backend + if ($LASTEXITCODE -ne 0) { + Write-Host "Failed to build container image" -ForegroundColor Red + exit 1 + } + Write-Host "Container image built and pushed successfully!" -ForegroundColor Green + + # ===== Deploy ACI if not already deployed ===== + if (-not $aciName) { + Write-Host "" + Write-Host "===== Deploying Container Instance =====" -ForegroundColor Yellow + azd env set AZURE_ENV_IMAGE_TAG latest + + # Use az deployment instead of azd provision to avoid hook recursion + # Pass parameters inline (main.parameters.json uses AZD ${VAR} syntax not supported by az CLI) + Write-Host "Deploying ACI via Bicep..." 
-ForegroundColor Cyan + + az deployment group create ` + --resource-group $resourceGroup ` + --template-file ./infra/main.bicep ` + --parameters solutionName=$env:AZURE_ENV_NAME ` + --parameters location=$env:AZURE_LOCATION ` + --parameters azureAiServiceLocation=$env:AZURE_ENV_AI_SERVICE_LOCATION ` + --parameters imageTag=latest ` + --query "properties.outputs" -o json | Out-Null + + if ($LASTEXITCODE -eq 0) { + # Refresh azd env with new outputs + azd env refresh --no-prompt 2>$null + Write-Host "Container Instance deployed successfully!" -ForegroundColor Green + } else { + Write-Host "Container Instance deployment failed" -ForegroundColor Red + } + } else { + Write-Host "" + Write-Host "Restarting Container Instance to pick up new image..." -ForegroundColor Yellow + az container restart --name $aciName --resource-group $resourceGroup 2>$null + Write-Host "Container Instance: $aciName (restarted)" -ForegroundColor Cyan + } + Write-Host "" - # Note: Cosmos DB role is assigned to deployer via Bicep. - Write-Host "===== Running Post-Deploy Script =====" -ForegroundColor Yellow - Write-Host "This will upload sample data and create the search index..." 
+ Write-Host "===== Postprovision Complete - Frontend will deploy next =====" -ForegroundColor Green + exit 0 + interactive: true + continueOnError: false - # Ensure post-deploy Python dependencies are installed + posix: + shell: sh + run: | + ACR_NAME="$AZURE_ENV_CONTAINER_REGISTRY_NAME" + RESOURCE_GROUP="$RESOURCE_GROUP_NAME" + BACKEND_IMAGE="$BACKEND_IMAGE_NAME" + APP_SERVICE="$APP_SERVICE_NAME" + + if [ -z "$ACR_NAME" ] || [ -z "$RESOURCE_GROUP" ] || [ -z "$APP_SERVICE" ]; then + echo "ERROR: Missing required environment variables" + exit 1 + fi + + # Check if ACI already exists (reads from persisted azd env) + ACI_NAME=$(azd env get-value CONTAINER_INSTANCE_NAME 2>/dev/null || echo "") + + # ===== Build Backend Image (ACR Build) ===== + echo "" + echo "===== Building Backend Image =====" + echo "Registry: $ACR_NAME" + echo "Image: $BACKEND_IMAGE:latest" + + if az acr build --registry "$ACR_NAME" --image "$BACKEND_IMAGE:latest" --file ./src/backend/ApiApp.Dockerfile ./src/backend; then + echo "Container image built and pushed successfully!" + else + echo "Failed to build container image" + exit 1 + fi + + # ===== Deploy ACI if not already deployed ===== + if [ -z "$ACI_NAME" ]; then + echo "" + echo "===== Deploying Container Instance =====" + azd env set AZURE_ENV_IMAGE_TAG latest + + # Use az deployment instead of azd provision to avoid hook recursion + # Pass parameters inline (main.parameters.json uses AZD ${VAR} syntax not supported by az CLI) + echo "Deploying ACI via Bicep..." 
+ if az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file ./infra/main.bicep \ + --parameters solutionName="$AZURE_ENV_NAME" \ + --parameters location="$AZURE_LOCATION" \ + --parameters azureAiServiceLocation="$AZURE_ENV_AI_SERVICE_LOCATION" \ + --parameters imageTag=latest \ + --query "properties.outputs" -o json > /dev/null; then + # Refresh azd env with new outputs + azd env refresh --no-prompt 2>/dev/null + echo "Container Instance deployed successfully!" + else + echo "Container Instance deployment failed" + fi + else + echo "" + echo "Restarting Container Instance to pick up new image..." + az container restart --name "$ACI_NAME" --resource-group "$RESOURCE_GROUP" 2>/dev/null + echo "Container Instance: $ACI_NAME (restarted)" + fi + + echo "" + echo "===== Postprovision Complete - Frontend will deploy next =====" + + # Ensure postprovision exits successfully so frontend deploys + exit 0 + interactive: true + continueOnError: false + + postdeploy: + windows: + shell: pwsh + run: | + Write-Host "===== Running Post-Deploy Script =====" -ForegroundColor Yellow $python = "python" if (Test-Path "./.venv/Scripts/python.exe") { $python = "./.venv/Scripts/python.exe" } - elseif (Test-Path "../.venv/Scripts/python.exe") { $python = "../.venv/Scripts/python.exe" } - elseif (Test-Path "./.venv/bin/python") { $python = "./.venv/bin/python" } - elseif (Test-Path "../.venv/bin/python") { $python = "../.venv/bin/python" } - & $python -m pip install -r ./scripts/requirements-post-deploy.txt --quiet | Out-Null + & $python -m pip install -r ./scripts/requirements-post-deploy.txt --quiet 2>$null if (Test-Path "./scripts/post_deploy.py") { & $python ./scripts/post_deploy.py --skip-tests - if ($LASTEXITCODE -eq 0) { Write-Host "Post-deploy script completed successfully!" 
-ForegroundColor Green } else { - Write-Host "Post-deploy script completed with warnings (some steps may have failed)" -ForegroundColor Yellow + Write-Host "Post-deploy script completed with warnings" -ForegroundColor Yellow } - } else { - Write-Host "Warning: post_deploy.py not found, skipping sample data upload" -ForegroundColor Yellow } Write-Host "" Write-Host "===== Deployment Complete =====" -ForegroundColor Green - Write-Host "" - Write-Host "Access the web application:" + Write-Host "Access the web application:" -ForegroundColor White Write-Host " $env:WEB_APP_URL" -ForegroundColor Cyan - shell: pwsh - continueOnError: false interactive: true + continueOnError: false + posix: + shell: sh run: | - echo "===== Provision Complete =====" - echo "" - echo "Web App URL: $WEB_APP_URL" - echo "Storage Account: $AZURE_BLOB_ACCOUNT_NAME" - echo "AI Search Service: $AI_SEARCH_SERVICE_NAME" - echo "AI Search Index: $AZURE_AI_SEARCH_PRODUCTS_INDEX" - echo "AI Service Location: $AZURE_ENV_AI_SERVICE_LOCATION" - echo "Container Instance: $CONTAINER_INSTANCE_NAME" - - echo "" - echo "Container Registry: $AZURE_ENV_CONTAINER_REGISTRY_NAME" - - # Run post-deploy script to upload sample data and create search index - echo "" - # Note: Cosmos DB role is assigned to deployer via Bicep. echo "===== Running Post-Deploy Script =====" - echo "This will upload sample data and create the search index..." 
+ PYTHON="python3" + if [ -f "./.venv/bin/python" ]; then PYTHON="./.venv/bin/python"; fi + $PYTHON -m pip install -r ./scripts/requirements-post-deploy.txt --quiet 2>/dev/null if [ -f "./scripts/post_deploy.py" ]; then - # Prefer local venv if present (repo root or content-gen) - if [ -x "./.venv/bin/python" ]; then - PYTHON_BIN="./.venv/bin/python" - elif [ -x "../.venv/bin/python" ]; then - PYTHON_BIN="../.venv/bin/python" - else - PYTHON_BIN="python3" - fi - - "$PYTHON_BIN" -m pip install -r ./scripts/requirements-post-deploy.txt --quiet > /dev/null \ - && "$PYTHON_BIN" ./scripts/post_deploy.py --skip-tests \ + $PYTHON ./scripts/post_deploy.py --skip-tests \ && echo "Post-deploy script completed successfully!" \ - || echo "Post-deploy script completed with warnings (some steps may have failed)" - else - echo "Warning: post_deploy.py not found, skipping sample data upload" + || echo "Post-deploy script completed with warnings" fi echo "" echo "===== Deployment Complete =====" - echo "" echo "Access the web application:" echo " $WEB_APP_URL" - shell: sh - continueOnError: false interactive: true + continueOnError: false diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index fe97e525f..c962e0ff5 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -55,7 +55,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize TotalTokens = sum(total_tokens), InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\n| render timechart", + "query": "let _range = totimespan('{TimeRange:duration}');\nlet _bin = iff(_range > 1d, 1d, 1h);\ncustomEvents\n| where 
name == 'LLM_Token_Usage_Summary'\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize TotalTokens = sum(total_tokens), InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, _bin)\n| render timechart", "size": 0, "title": "Token usage over time", "timeContextFromParameter": "TimeRange", @@ -129,7 +129,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend user_id = tostring(customDimensions['user_id'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| where isnotempty(user_id)\n| summarize Requests = count(), TotalTokens = sum(total_tokens) by User = user_id\n| order by TotalTokens desc\n| take 25", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend user_id = tostring(customDimensions['user_id'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend User = iff(isempty(user_id) or user_id == 'anonymous', 'anonymous', user_id)\n| summarize Requests = count(), TotalTokens = sum(total_tokens) by User\n| order by TotalTokens desc\n| take 25", "size": 0, "title": "Top users by token consumption", "timeContextFromParameter": "TimeRange", @@ -144,7 +144,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1h), agent\n| render timechart", + "query": "let _range = totimespan('{TimeRange:duration}');\nlet _bin = iff(_range > 1d, 1d, 1h);\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = 
toint(customDimensions['total_tokens'])\n| summarize TotalTokens = sum(total_tokens) by bin(timestamp, _bin), agent\n| render timechart", "size": 0, "title": "Per-agent token usage over time", "timeContextFromParameter": "TimeRange", diff --git a/infra/main.bicep b/infra/main.bicep index 40da9f64b..a16b90d67 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -10,6 +10,7 @@ metadata description = '''Solution Accelerator for multimodal marketing content @description('Optional. A unique application/solution name for all resources in this deployment.') param solutionName string = 'contentgen' +@minLength(3) @maxLength(5) @description('Optional. A unique text value for the solution.') param solutionUniqueText string = substring(uniqueString(subscription().id, resourceGroup().name, solutionName), 0, 5) @@ -92,7 +93,7 @@ param azureOpenaiAPIVersion string = '2025-01-01-preview' @minValue(10) @description('Optional. AI model deployment token capacity.') -param gptModelCapacity int = 150 +param gptModelCapacity int = 50 @minValue(1) @description('Optional. Image model deployment capacity (RPM).') @@ -121,7 +122,7 @@ param vmAdminPassword string = '' param tags object = {} @description('Optional. Enable monitoring for applicable resources (WAF-aligned).') -param enableMonitoring bool = false +param enableMonitoring bool = true @description('Optional. Enable Azure AI Foundry mode for multi-agent orchestration.') param useFoundryMode bool = true @@ -135,15 +136,22 @@ param enableRedundancy bool = false @description('Optional. Enable private networking for applicable resources (WAF-aligned).') param enablePrivateNetworking bool = false -@description('Optional. The existing Container Registry name (without .azurecr.io). Must contain pre-built images: content-gen-app and content-gen-api.') -param acrName string = 'contentgencontainerreg' - -@description('Optional. Image Tag.') -param imageTag string = 'latest' - @description('Optional. 
Enable/Disable usage telemetry for module.') param enableTelemetry bool = true +@description('Optional. Frontend image name (without tag).') +param frontendImageName string = 'content-gen-app' + +@description('Optional. Backend image name (without tag).') +param backendImageName string = 'content-gen-api' + +@description('Optional. Image tag for container deployment. Leave empty to skip ACI deployment.') +param imageTag string + +@description('Optional. Azure Container Registry name (unused - ACR name is auto-generated). Declared for parameter file compatibility.') +#disable-next-line no-unused-params +param acrName string = '' + @description('Optional. Created by user name.') param createdBy string = contains(deployer(), 'userPrincipalName')? split(deployer().userPrincipalName, '@')[0]: deployer().objectId @@ -153,8 +161,6 @@ param createdBy string = contains(deployer(), 'userPrincipalName')? split(deploy var solutionLocation = empty(location) ? resourceGroup().location : location -// acrName is required - points to existing ACR with pre-built images -var acrResourceName = acrName var solutionSuffix = toLower(trim(replace( replace( replace(replace(replace(replace('${solutionName}${solutionUniqueText}', '-', ''), '_', ''), '.', ''), '/', ''), @@ -165,6 +171,9 @@ var solutionSuffix = toLower(trim(replace( '' ))) +// ACR name is always auto-generated in custom deployment +var acrResourceName = 'cr${solutionSuffix}' + var cosmosDbZoneRedundantHaRegionPairs = { australiaeast: 'uksouth' centralus: 'eastus2' @@ -377,6 +386,30 @@ module userAssignedIdentity 'br/public:avm/res/managed-identity/user-assigned-id } } +// ========== Azure Container Registry ========== // +// CUSTOM DEPLOYMENT: ACR for remote Docker builds (AZD pushes images here) +module containerRegistry 'br/public:avm/res/container-registry/registry:0.9.0' = { + name: take('avm.res.container-registry.registry.${acrResourceName}', 64) + params: { + name: acrResourceName + location: solutionLocation + tags: 
tags + enableTelemetry: enableTelemetry + acrSku: 'Standard' + acrAdminUserEnabled: false + anonymousPullEnabled: false + publicNetworkAccess: 'Enabled' + networkRuleBypassOptions: 'AzureServices' + roleAssignments: [ + { + principalId: userAssignedIdentity.outputs.principalId + roleDefinitionIdOrName: '7f951dda-4ed3-4680-a7ca-43fe172d538d' // AcrPull + principalType: 'ServicePrincipal' + } + ] + } +} + // ========== Virtual Network and Networking Components ========== // var deployAdminAccessResources = enablePrivateNetworking && deployBastionAndJumpbox && !empty(vmAdminPassword) module virtualNetwork 'modules/virtualNetwork.bicep' = if (enablePrivateNetworking) { @@ -953,35 +986,39 @@ module webServerFarm 'br/public:avm/res/web/serverfarm:0.7.0' = { var webSiteResourceName = 'app-${solutionSuffix}' // Backend URL: Use actual ACI IP/FQDN from deployment outputs // This also creates an implicit dependency ensuring ACI deploys before the web app -var aciBackendUrl = enablePrivateNetworking - ? 'http://${containerInstance.outputs.ipAddress}:8000' - : 'http://${containerInstance.outputs.fqdn}:8000' +var aciBackendUrl = shouldDeployACI + ? (enablePrivateNetworking + ? 
'http://${containerInstance!.properties.ipAddress.ip}:8000' + : 'http://${containerInstance!.properties.ipAddress.fqdn}:8000') + : '' module webSite 'modules/web-sites.bicep' = { name: take('module.web-sites.${webSiteResourceName}', 64) params: { name: webSiteResourceName - tags: tags + tags: union(tags, { 'azd-service-name': 'frontend' }) location: solutionLocation - kind: 'app,linux,container' + kind: 'app,linux' serverFarmResourceId: webServerFarm.outputs.resourceId managedIdentities: { userAssignedResourceIds: [userAssignedIdentity!.outputs.resourceId] } siteConfig: { - // Frontend container - same for both modes - linuxFxVersion: 'DOCKER|${acrResourceName}.azurecr.io/content-gen-app:${imageTag}' + // Node.js runtime for frontend server (code deployment via AZD) + linuxFxVersion: 'NODE|22-lts' minTlsVersion: '1.2' alwaysOn: true ftpsState: 'FtpsOnly' + appCommandLine: 'node server.js' } virtualNetworkSubnetId: enablePrivateNetworking ? virtualNetwork!.outputs.webSubnetResourceId : null configs: concat( [ { - // Frontend container proxies to ACI backend (both modes) + // Frontend server proxies to ACI backend name: 'appsettings' properties: { - DOCKER_REGISTRY_SERVER_URL: 'https://${acrResourceName}.azurecr.io' + WEBSITES_PORT: '8080' BACKEND_URL: aciBackendUrl AZURE_CLIENT_ID: userAssignedIdentity.outputs.clientId + SCM_DO_BUILD_DURING_DEPLOYMENT: 'true' // Run npm install during deployment } applicationInsightResourceId: enableMonitoring ? applicationInsights!.outputs.resourceId : null } @@ -1006,69 +1043,130 @@ module webSite 'modules/web-sites.bicep' = { } // ========== Container Instance (Backend API) ========== // +// CUSTOM DEPLOYMENT: Inline ACI definition with managed identity auth for ACR var containerInstanceName = 'aci-${solutionSuffix}' -// Hash that changes whenever the monitoring config (enableMonitoring + connection string) changes. 
-// Used as an ACI tag so that toggling enableMonitoring (or rotating the App Insights component) -// forces ARM to detect drift on the container group, triggering a restart and re-applying env vars -// like APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag. -var monitoringConfigHash = uniqueString( - string(enableMonitoring), - enableMonitoring ? applicationInsights!.outputs.connectionString : 'monitoring-disabled' -) - -module containerInstance 'modules/container-instance.bicep' = { - name: take('module.container-instance.${containerInstanceName}', 64) - params: { - name: containerInstanceName - location: solutionLocation - tags: union(tags, { - 'monitoring-enabled': string(enableMonitoring) - 'monitoring-config-hash': monitoringConfigHash - }) - containerImage: '${acrResourceName}.azurecr.io/content-gen-api:${imageTag}' - cpu: 2 - memoryInGB: 4 - port: 8000 - // Only pass subnetResourceId when private networking is enabled - subnetResourceId: enablePrivateNetworking ? virtualNetwork!.outputs.aciSubnetResourceId : '' - userAssignedIdentityResourceId: userAssignedIdentity.outputs.resourceId - enableTelemetry: enableTelemetry - environmentVariables: [ - // Azure OpenAI Settings - { name: 'AZURE_OPENAI_ENDPOINT', value: 'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' } - { name: 'AZURE_ENV_GPT_MODEL_NAME', value: gptModelName } - { name: 'AZURE_ENV_IMAGE_MODEL_NAME', value: imageModelConfig[imageModelChoice].name } - { name: 'AZURE_OPENAI_GPT_IMAGE_ENDPOINT', value: imageModelChoice != 'none' ? 
'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' : '' } - { name: 'AZURE_ENV_OPENAI_API_VERSION', value: azureOpenaiAPIVersion } - // Azure Cosmos DB Settings - { name: 'AZURE_COSMOS_ENDPOINT', value: 'https://cosmos-${solutionSuffix}.documents.azure.com:443/' } - { name: 'AZURE_COSMOS_DATABASE_NAME', value: cosmosDBDatabaseName } - { name: 'AZURE_COSMOS_PRODUCTS_CONTAINER', value: cosmosDBProductsContainer } - { name: 'AZURE_COSMOS_CONVERSATIONS_CONTAINER', value: cosmosDBConversationsContainer } - // Azure Blob Storage Settings - { name: 'AZURE_BLOB_ACCOUNT_NAME', value: storageAccountName } - { name: 'AZURE_BLOB_PRODUCT_IMAGES_CONTAINER', value: productImagesContainer } - { name: 'AZURE_BLOB_GENERATED_IMAGES_CONTAINER', value: generatedImagesContainer } - // Azure AI Search Settings - { name: 'AZURE_AI_SEARCH_ENDPOINT', value: 'https://${aiSearchName}.search.windows.net' } - { name: 'AZURE_AI_SEARCH_PRODUCTS_INDEX', value: azureSearchIndex } - { name: 'AZURE_AI_SEARCH_IMAGE_INDEX', value: 'product-images' } - // App Settings - { name: 'AZURE_CLIENT_ID', value: userAssignedIdentity.outputs.clientId } - { name: 'PORT', value: '8000' } - { name: 'WORKERS', value: '4' } - { name: 'RUNNING_IN_PRODUCTION', value: 'true' } - // Azure AI Foundry Settings - { name: 'USE_FOUNDRY', value: useFoundryMode ? 'true' : 'false' } - { name: 'AZURE_AI_PROJECT_ENDPOINT', value: aiFoundryAiProjectEndpoint } - { name: 'AZURE_AI_MODEL_DEPLOYMENT_NAME', value: gptModelName } - { name: 'AZURE_AI_IMAGE_MODEL_DEPLOYMENT', value: imageModelConfig[imageModelChoice].name } - // Logging Settings - { name: 'AZURE_BASIC_LOGGING_LEVEL', value: 'INFO' } - { name: 'AZURE_PACKAGE_LOGGING_LEVEL', value: 'WARNING' } - { name: 'AZURE_LOGGING_PACKAGES', value: '' } - // Application Insights - { name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', value: enableMonitoring ? 
applicationInsights!.outputs.connectionString : '' } +var backendImageUrl = '${containerRegistry.outputs.loginServer}/${backendImageName}:${imageTag}' +var aciPort = 8000 +var isPrivateNetworking = enablePrivateNetworking +// Construct identity resource ID from known values (required for deployment-time calculation) +var userAssignedIdentityResourceIdForACI = '/subscriptions/${subscription().subscriptionId}/resourceGroups/${resourceGroup().name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${userAssignedIdentityResourceName}' +// Deploy ACI only when imageTag is set to a real tag (not 'none') +var shouldDeployACI = !empty(imageTag) && imageTag != 'none' + +#disable-next-line no-deployments-resources +resource aciTelemetry 'Microsoft.Resources/deployments@2025-04-01' = if (enableTelemetry && shouldDeployACI) { + name: '46d3xbcp.res.containerinstance.${replace('-..--..-', '.', '-')}.${substring(uniqueString(deployment().name, solutionLocation), 0, 4)}' + properties: { + mode: 'Incremental' + template: { + '$schema': 'https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#' + contentVersion: '1.0.0.0' + resources: [] + } + } +} + +// Hash that changes whenever the monitoring config is toggled. +// Used as an ACI tag so that toggling enableMonitoring forces ARM to detect drift on the +// container group, triggering a restart and re-applying env vars like +// APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag, +// and tags must be calculatable at deployment-start (no runtime references allowed). 
+var monitoringConfigHash = uniqueString(string(enableMonitoring)) + +resource containerInstance 'Microsoft.ContainerInstance/containerGroups@2025-09-01' = if (shouldDeployACI) { + name: containerInstanceName + location: solutionLocation + tags: union(tags, { + 'monitoring-enabled': string(enableMonitoring) + 'monitoring-config-hash': monitoringConfigHash + }) + identity: { + type: 'UserAssigned' + userAssignedIdentities: { + '${userAssignedIdentityResourceIdForACI}': {} + } + } + properties: { + containers: [ + { + name: containerInstanceName + properties: { + image: backendImageUrl + resources: { + requests: { + cpu: 2 + memoryInGB: 4 + } + } + ports: [ + { + port: aciPort + protocol: 'TCP' + } + ] + environmentVariables: [ + // Azure OpenAI Settings + { name: 'AZURE_OPENAI_ENDPOINT', value: 'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' } + { name: 'AZURE_ENV_GPT_MODEL_NAME', value: gptModelName } + { name: 'AZURE_ENV_IMAGE_MODEL_NAME', value: imageModelConfig[imageModelChoice].name } + { name: 'AZURE_OPENAI_GPT_IMAGE_ENDPOINT', value: imageModelChoice != 'none' ? 
'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' : '' } + { name: 'AZURE_ENV_OPENAI_API_VERSION', value: azureOpenaiAPIVersion } + // Azure Cosmos DB Settings + { name: 'AZURE_COSMOS_ENDPOINT', value: 'https://cosmos-${solutionSuffix}.documents.azure.com:443/' } + { name: 'AZURE_COSMOS_DATABASE_NAME', value: cosmosDBDatabaseName } + { name: 'AZURE_COSMOS_PRODUCTS_CONTAINER', value: cosmosDBProductsContainer } + { name: 'AZURE_COSMOS_CONVERSATIONS_CONTAINER', value: cosmosDBConversationsContainer } + // Azure Blob Storage Settings + { name: 'AZURE_BLOB_ACCOUNT_NAME', value: storageAccountName } + { name: 'AZURE_BLOB_PRODUCT_IMAGES_CONTAINER', value: productImagesContainer } + { name: 'AZURE_BLOB_GENERATED_IMAGES_CONTAINER', value: generatedImagesContainer } + // Azure AI Search Settings + { name: 'AZURE_AI_SEARCH_ENDPOINT', value: 'https://${aiSearchName}.search.windows.net' } + { name: 'AZURE_AI_SEARCH_PRODUCTS_INDEX', value: azureSearchIndex } + { name: 'AZURE_AI_SEARCH_IMAGE_INDEX', value: 'product-images' } + // App Settings + { name: 'AZURE_CLIENT_ID', value: userAssignedIdentity.outputs.clientId } + { name: 'PORT', value: '8000' } + { name: 'WORKERS', value: '4' } + { name: 'RUNNING_IN_PRODUCTION', value: 'true' } + // Azure AI Foundry Settings + { name: 'USE_FOUNDRY', value: useFoundryMode ? 'true' : 'false' } + { name: 'AZURE_AI_PROJECT_ENDPOINT', value: aiFoundryAiProjectEndpoint } + { name: 'AZURE_AI_MODEL_DEPLOYMENT_NAME', value: gptModelName } + { name: 'AZURE_AI_IMAGE_MODEL_DEPLOYMENT', value: imageModelConfig[imageModelChoice].name } + // Logging Settings + { name: 'AZURE_BASIC_LOGGING_LEVEL', value: 'INFO' } + { name: 'AZURE_PACKAGE_LOGGING_LEVEL', value: 'WARNING' } + { name: 'AZURE_LOGGING_PACKAGES', value: '' } + // Application Insights + { name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', value: enableMonitoring ? 
applicationInsights!.outputs.connectionString : '' } + ] + } + } + ] + osType: 'Linux' + restartPolicy: 'Always' + subnetIds: isPrivateNetworking ? [ + { + id: virtualNetwork!.outputs.aciSubnetResourceId + } + ] : null + ipAddress: { + type: isPrivateNetworking ? 'Private' : 'Public' + ports: [ + { + port: aciPort + protocol: 'TCP' + } + ] + dnsNameLabel: isPrivateNetworking ? null : containerInstanceName + } + // Managed identity auth for ACR (instead of anonymous pull) + imageRegistryCredentials: [ + { + server: containerRegistry.outputs.loginServer + identity: userAssignedIdentityResourceIdForACI + } ] } } @@ -1150,13 +1248,13 @@ output AZURE_APPLICATION_INSIGHTS_CONNECTION_STRING string = (enableMonitoring & output AZURE_ENV_AI_SERVICE_LOCATION string = azureAiServiceLocation @description('Contains Container Instance Name') -output CONTAINER_INSTANCE_NAME string = containerInstance.outputs.name +output CONTAINER_INSTANCE_NAME string = shouldDeployACI ? containerInstance!.name : '' @description('Contains Container Instance FQDN (only for non-private networking)') -output CONTAINER_INSTANCE_FQDN string = enablePrivateNetworking ? '' : containerInstance.outputs.fqdn +output CONTAINER_INSTANCE_FQDN string = (shouldDeployACI && !isPrivateNetworking) ? containerInstance!.properties.ipAddress.fqdn : '' @description('Contains ACR Name') -output AZURE_ENV_CONTAINER_REGISTRY_NAME string = acrResourceName +output AZURE_ENV_CONTAINER_REGISTRY_NAME string = containerRegistry.outputs.name @description('Contains flag for Azure AI Foundry usage') output USE_FOUNDRY bool = useFoundryMode ? 
true : false @@ -1169,3 +1267,15 @@ output AZURE_AI_MODEL_DEPLOYMENT_NAME string = gptModelName @description('Contains Azure AI Image Model Deployment Name (empty if none selected)') output AZURE_AI_IMAGE_MODEL_DEPLOYMENT string = imageModelConfig[imageModelChoice].name + +@description('Contains Managed Identity Client ID') +output AZURE_CLIENT_ID string = userAssignedIdentity.outputs.clientId + +@description('Frontend image name') +output FRONTEND_IMAGE_NAME string = frontendImageName + +@description('Backend image name') +output BACKEND_IMAGE_NAME string = backendImageName + +@description('Image tag') +output AZURE_ENV_IMAGE_TAG string = imageTag diff --git a/infra/main.json b/infra/main.json index 9e101d340..11ca660d2 100644 --- a/infra/main.json +++ b/infra/main.json @@ -134,7 +134,7 @@ }, "gptModelCapacity": { "type": "int", - "defaultValue": 150, + "defaultValue": 50, "minValue": 10, "metadata": { "description": "Optional. AI model deployment token capacity." diff --git a/infra/main_custom.bicep b/infra/main_custom.bicep index a59c9ccfb..873ad5869 100644 --- a/infra/main_custom.bicep +++ b/infra/main_custom.bicep @@ -93,7 +93,7 @@ param azureOpenaiAPIVersion string = '2025-01-01-preview' @minValue(10) @description('Optional. AI model deployment token capacity.') -param gptModelCapacity int = 150 +param gptModelCapacity int = 50 @minValue(1) @description('Optional. Image model deployment capacity (RPM).') diff --git a/infra/monitoring/README.md b/infra/monitoring/README.md new file mode 100644 index 000000000..d0ea34fc2 --- /dev/null +++ b/infra/monitoring/README.md @@ -0,0 +1,91 @@ +# Add monitoring after deployment (standalone) + +Use this when the main accelerator was deployed with +`enableMonitoring=false` and you now want to add **Log Analytics + Application +Insights** without re-running the full `azd up`. 
+ +Resource names match `infra/main.bicep` exactly: + +- Log Analytics: `log-${solutionSuffix}` +- App Insights: `appi-${solutionSuffix}` + +…where `solutionSuffix = toLower("${solutionName}${solutionUniqueText}")` +(symbols stripped). So after this runs, the rest of the solution can find +them by name with no further changes. + +## Parameters + +| Name | Required | Description | +| --- | --- | --- | +| `solutionName` | Yes | Same value you passed to `main.bicep` / `azd` (3-15 chars). | +| `solutionUniqueText` | No | Same as main; defaults to the same `uniqueString(...)` expression. Override only if the original deployment used a custom value. | +| `location` | No | Region for the resources. Defaults to RG location. | +| `tags` | No | Tags applied to both resources. | +| `retentionInDays` | No | Defaults to 365 (matches main). | + +## Deploy + +```bash +RG="<resource-group-name>" +SOLUTION_NAME="<solution-name>" + +az deployment group create \ + --resource-group "$RG" \ + --name monitoring \ + --template-file infra/monitoring/monitoring.bicep \ + --parameters solutionName="$SOLUTION_NAME" +``` + +Capture the outputs: + +```bash +APPI_ID=$(az deployment group show -g "$RG" -n monitoring \ + --query properties.outputs.applicationInsightsResourceId.value -o tsv) +APPI_CS=$(az deployment group show -g "$RG" -n monitoring \ + --query properties.outputs.applicationInsightsConnectionString.value -o tsv) +``` + +## Wire the app to send telemetry + +Set the connection string on the running app(s): + +```bash +# App Service example +az webapp config appsettings set -g "$RG" -n "<app-service-name>" \ + --settings APPLICATIONINSIGHTS_CONNECTION_STRING="$APPI_CS" + +# Container App example +az containerapp update -g "$RG" -n "<container-app-name>" \ + --set-env-vars APPLICATIONINSIGHTS_CONNECTION_STRING="$APPI_CS" +``` + +Or, if managed via azd: + +```bash +azd env set APPLICATIONINSIGHTS_CONNECTION_STRING "$APPI_CS" +azd deploy # re-deploy app code only, no infra changes +``` + +## Then deploy the workbook + +```bash +az deployment group create \ +
--resource-group "$RG" \ + --template-file infra/workbook/workbook.bicep \ + --parameters applicationInsightsResourceId="$APPI_ID" +``` + +## Idempotency / re-runs + +Re-running this deployment against the same RG is safe — AVM modules use +stable names so existing resources are updated in place rather than +duplicated. + +## Caveat + +This template **only** creates Log Analytics + App Insights. It does **not** +re-wire `main.bicep`'s diagnostic settings on other resources (Storage, Key +Vault, Cosmos, etc.) which are normally created by `main.bicep` when +`enableMonitoring=true`. If you need those too, the cleanest fix is still to +re-run `azd provision` with `enableMonitoring=true` — Bicep will add only +the missing diagnostic settings without recreating existing resources. diff --git a/infra/monitoring/monitoring.bicep b/infra/monitoring/monitoring.bicep new file mode 100644 index 000000000..029b4ee36 --- /dev/null +++ b/infra/monitoring/monitoring.bicep @@ -0,0 +1,97 @@ +// ============================================================================ +// Monitoring add-on (standalone deployment) +// ---------------------------------------------------------------------------- +// Deploys Log Analytics Workspace + Application Insights into an EXISTING +// resource group, using the same naming convention as infra/main.bicep +// (`log-${solutionSuffix}` and `appi-${solutionSuffix}`). +// +// Use this when the main accelerator was deployed with `enableMonitoring=false` +// and you want to add monitoring afterwards WITHOUT re-running the full +// deployment. +// +// After this deployment completes, set the App Service / Container App +// `APPLICATIONINSIGHTS_CONNECTION_STRING` setting to the value emitted by the +// `applicationInsightsConnectionString` output (or run `azd env set` and +// re-deploy the app code only).
+// +// Scope: resourceGroup +// ============================================================================ + +targetScope = 'resourceGroup' + +@minLength(3) +@maxLength(15) +@description('Required. Same `solutionName` you passed to main.bicep / azd. Used to derive the resource names.') +param solutionName string + +@description('Optional. Same `solutionUniqueText` used by main.bicep. Defaults to the same expression: substring(uniqueString(subscription().id, resourceGroup().name, solutionName), 0, 5).') +param solutionUniqueText string = substring(uniqueString(subscription().id, resourceGroup().name, solutionName), 0, 5) + +@description('Optional. Azure region for the new resources. Defaults to the resource group location.') +param location string = resourceGroup().location + +@description('Optional. Tags applied to both resources.') +param tags object = {} + +@description('Optional. Data retention (days) for both Log Analytics and Application Insights.') +@minValue(30) +@maxValue(730) +param retentionInDays int = 365 + +// Mirror the suffix logic from main.bicep so names line up exactly. 
+var solutionSuffix = toLower(trim(replace( + replace( + replace(replace(replace(replace('${solutionName}${solutionUniqueText}', '-', ''), '_', ''), '.', ''), '/', ''), + ' ', + '' + ), + '*', + '' +))) + +var logAnalyticsWorkspaceResourceName = 'log-${solutionSuffix}' +var applicationInsightsResourceName = 'appi-${solutionSuffix}' + +// ========== Log Analytics Workspace ========== +module logAnalyticsWorkspace 'br/public:avm/res/operational-insights/workspace:0.15.0' = { + name: take('avm.res.operational-insights.workspace.${logAnalyticsWorkspaceResourceName}', 64) + params: { + name: logAnalyticsWorkspaceResourceName + tags: tags + location: location + skuName: 'PerGB2018' + dataRetention: retentionInDays + features: { enableLogAccessUsingOnlyResourcePermissions: true } + diagnosticSettings: [{ useThisWorkspace: true }] + } +} + +// ========== Application Insights ========== +module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = { + name: take('avm.res.insights.component.${applicationInsightsResourceName}', 64) + params: { + name: applicationInsightsResourceName + tags: tags + location: location + retentionInDays: retentionInDays + kind: 'web' + disableIpMasking: false + flowType: 'Bluefield' + workspaceResourceId: logAnalyticsWorkspace.outputs.resourceId + } +} + +@description('Resource ID of the Log Analytics workspace.') +output logAnalyticsWorkspaceResourceId string = logAnalyticsWorkspace.outputs.resourceId + +@description('Name of the Log Analytics workspace.') +output logAnalyticsWorkspaceName string = logAnalyticsWorkspaceResourceName + +@description('Resource ID of the Application Insights component (pass this to infra/workbook/workbook.bicep).') +output applicationInsightsResourceId string = applicationInsights.outputs.resourceId + +@description('Name of the Application Insights component.') +output applicationInsightsName string = applicationInsightsResourceName + +@description('Connection string for Application Insights. 
Set this on your app as APPLICATIONINSIGHTS_CONNECTION_STRING.') +output applicationInsightsConnectionString string = applicationInsights.outputs.connectionString diff --git a/infra/workbook/README.md b/infra/workbook/README.md new file mode 100644 index 000000000..bb0476d1f --- /dev/null +++ b/infra/workbook/README.md @@ -0,0 +1,109 @@ +# Token Usage Workbook (standalone deployment) + +This folder contains a **standalone** Bicep template that deploys the +**Token Usage** Application Insights workbook used by the Content Generation +Solution Accelerator. + +The workbook visualises the custom events emitted by the orchestrator: + +- `LLM_Token_Usage_Summary` +- `LLM_Agent_Token_Usage` +- `LLM_Model_Token_Usage` + +It is deployed **separately** from `infra/main.bicep` / +`infra/main_custom.bicep` so it can target an Application Insights instance +that lives in a **different resource group** (or subscription) from the rest +of the accelerator — for example, a shared observability workspace. + +## Files + +| File | Purpose | +| --- | --- | +| `workbook.bicep` | Bicep template that creates the workbook resource. | +| `../dashboards/token-usage-workbook.json` | Serialized workbook definition (loaded by the template). | + +## Parameters + +| Name | Required | Description | +| --- | --- | --- | +| `applicationInsightsResourceId` | No | Full resource ID of the Application Insights instance to query. Defaults to `Azure Monitor` (unbound — pick the instance later in the portal). Re-deploy with a real ID to (re)bind. | +| `workbookName` | No | Stable GUID name. Keep the default so re-deployments update the SAME workbook even when the App Insights ID changes. | +| `location` | No | Azure region for the workbook resource. Defaults to the resource group location. | +| `displayName` | No | Display name in the Azure portal. Defaults to `Token Usage`. | +| `tags` | No | Tags applied to the workbook resource. 
| + +## Bind / change Application Insights after deployment + +Because `workbookName` is stable by default, you can: + +1. **Deploy now without an App Insights ID** — workbook is created in + "Azure Monitor" scope and shows up under *Monitor → Workbooks*. Open it, + then use the resource picker at the top to point at any App Insights + instance ad-hoc. +2. **Bind / re-bind later by re-deploying** with a new + `applicationInsightsResourceId`. The same workbook resource is updated + in place; no duplicate is created. + +```bash +# 1) Deploy unbound first +az deployment group create \ + --resource-group rg-observability \ + --template-file infra/workbook/workbook.bicep + +# 2) Later, bind it to App Insights instance A +az deployment group create \ + --resource-group rg-observability \ + --template-file infra/workbook/workbook.bicep \ + --parameters applicationInsightsResourceId="$APPI_ID_A" + +# 3) Switch it to App Insights instance B - same workbook, new source +az deployment group create \ + --resource-group rg-observability \ + --template-file infra/workbook/workbook.bicep \ + --parameters applicationInsightsResourceId="$APPI_ID_B" +``` + +You can also change the binding from the **Azure portal**: open the +workbook → *Edit* → *Settings* → change the linked resource → *Save*. 
+ +## Deploy with Azure CLI + +```bash +# Resource group where the WORKBOOK will live +WORKBOOK_RG="rg-observability" + +# Full resource ID of the EXISTING Application Insights instance +# (can be in a different resource group / subscription) +APPI_ID="/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Insights/components/<app-insights-name>" + +az deployment group create \ + --resource-group "$WORKBOOK_RG" \ + --template-file infra/workbook/workbook.bicep \ + --parameters applicationInsightsResourceId="$APPI_ID" +``` + +## Deploy with Azure PowerShell + +```powershell +$workbookRg = "rg-observability" +$appiId = "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Insights/components/<app-insights-name>" + +New-AzResourceGroupDeployment ` + -ResourceGroupName $workbookRg ` + -TemplateFile infra/workbook/workbook.bicep ` + -applicationInsightsResourceId $appiId +``` + +## Notes + +- The workbook resource itself can live in any resource group; only the + `sourceId` it points at needs to be a valid Application Insights resource + ID. This is what allows the workbook to fetch token-count telemetry from + Application Insights deployed elsewhere. +- The default `workbookName` is a stable GUID that does not change with the + Application Insights resource ID, so re-running the deployment (even with a + different `applicationInsightsResourceId`) updates the same workbook in place. +- Required permission: `Microsoft.Insights/workbooks/write` on the target + resource group. **No** permissions are required on the Application Insights + resource group at deployment time — the workbook only references it; the + user *viewing* the workbook needs read access to the App Insights instance. 
diff --git a/src/.coverage b/src/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..4f70513dd57d28b86d1fc7d9e3d3258fc53217c0 GIT binary patch literal 53248 zcmeI5e{dAl9l+o2ZEkPx*CtZtj7YPN*pQMR1RNPCm>^Pzsg$aqRZ=gTyG^p@?l#=+ zC4@$D7pzpLKb$&PY0=PjI{f2U+bYToHF2CNTB*@iJC52RhykW5P*6!in(O!N-QEu% zj(>Ek#_!JE?%TKTz3=z?dEf7Q_inS7TW-2hH592;i$`TcT1-j_!;nTvB7_L=_QBiQ zct~*82^^V%^mIvq)OOwH6K^G6=Z}ear*EBD-nZ)J)hpMpme#Ml?#9(pCQq90mSC%?kyc7^ zty}6;;!>*`Q6x1MRztF(#@ZyKU9n0hbR|qDI>>{mjuxyn>(f+F4MQzMX@e#@<7!ln z_edQ|PqkSfE6q?g8<`AfL{Zyfw1hN2TfIt(E3HagiG>tBqs4qRT;;CX*5~E}vt}{d z71Jg;9)g#r$QsDVo3f@A8VYIgFx9>#F2_Rcie4?rISWG?=(*X*SR2=(b%rKI)R<{z zOjiw6i%H66C6q9f@I?)*rqHYn;1yZ}Z8cManlm-CvI^+rD_^8$ws^L3D<~J!EUfv8 zR5VmJb&e&XEr9&4M6{E34%Em9Hniz#PFL5^vc`VuzPU(W=It%&P768h(M+vGOx=`F za-Cl-96`ik*Z;Q6wU1t!Bv3+ES)-z?%zWrp|&P z$7!HRTDBl)`k8tgRTgxRS|VCYv9FmPqbm>~Fcw(FS`)Dl4Hcc*5YoDoxZI}HX0K+3 zh0ofNz;)^cXJB>3bgq+gfna?u=}wh9`M{hxOut~dQMR|T2foOKte(8cFlt?jDzcu4 zTLZ}_4U$%k{3vO~NcIYtQ^Pw0E9XpSPPSaIKAUi*c#aRup3S5(5kbc#`YtIlg68By z;Jk_8)2rVp8|~7Cy3=e>yt2d@_{Qw%6sIZ#>*?OPZ6j-{yuI9{AB91mz8ys>)117@ z-%@&Ny5tCqCY`bxhmeIaI5PuCv|w01+-xhJo0rh)vq#U4sknZ+Tz zzoY583ezf`w)ACIu2k7B>vR;-E2S1qizsp|GdDnuH9E*tb;;D>!da;y--g~p+i2^m zG@BlT7|Kn1}Bx18APC>R0%<;Lx`T&rqyWnDjl(` z_G>$V32ikhhcn?$JEfq-?snyZ<}$O<^|GNDYE+SE5mOn{vAShl3dI$;2yR7d`X@d?AjW%`F$UX9wj&?cLv@#S@BJgy# zp+Ua+WO2->X;We}3mz`5(X~jzOvw3yO?o`k)FOvElvub48thWxM;#ymPsz3iv*+-v zX}A=GuC+yIFsQ~RePlxssWF? 
z=c8bg05aR`l}Ufpp+KbW0+BXHdq~rxnqD9@%7f6Ly>=EjP@5CP-C~P4r3=D72)+~1 zAtu^NY!hT?FoFgXI{j%42tE!3*%N#k49toSztaPwJVf&`EN5zm-d^vq@ON`roH=sSWmtDKN;J;8Ib0B;_3P!u7vL=TfT8 zQqN!iyW?D{&1RYBt^bApL$qJ8{&&T=RM2KS3mk}V=Tf)WBd&P;@3c=)@%q2SzTi7= z{m+HC)Eax(<<|cW`v(7l^*KE*k4VM z(#5;CUAtnglp>|1c4SM^KXmXe(o;>VG{$fDP^}iZ*tZ}0)2;6vypaJ*KBSkDe%MKH zlan+;IY`ev$ByYmk?n^j(c^WD(M1Iy>TL7STecK-rT3Yyvt zme2;takxq@%l@Z#NRmsO0+jUgcl2{E(B>W&$C63*Xs2);T%1Q9tE2xHejYhEn@K_f zHrfDemt+of?;=;g{vI>WOHK`REUmranY|T(tvlAdy!6=H<5xa2xOMN2%_sYojr^S_ zm!#{S-gUSVYCQQ?#gXyX_{PcL650$Q3(3%NBHh-VeyJ_3T~kK0M!bCwr=NW6_4L@l zibLxj`zoor3No)B`BzE-+nSRHs1Sh&#Hg%BcqLjL;qa*xJdR*G@SnVBk%UL z#_veJ`B36$_%uV_J;RmW_u#n?Nb>OOB6(oX)~U&{LqFR0&U&`-4{P5$`d|Zj_@i;> z_vVS^Jm9$6!EU~`LGx})9~>DTo*X};jSNrnJoKqh>U`OeQcFq%VcUAP?`+?CZC&e6 z2LjER=N9|!uZ+}>G7wx=7@)xR%mJgtxK864X)df?chv&Yhh9|V7%yNsr!rj3N8O4)U;err})k$;$M(t#33;t>|rVkj6um4O9HoYX`^^*QS4PxOHOJ zbECV|lXIP?Cf?n0VDFtyvgIZ67If&PLvPl<@z8Tq<;iGzF(E^QIFibac<_UnRifn-hOro3yDs#|5nz#GDdwkXK}9Th!uso zoFvcC7X1BAdjDS(cM|cCxLbTx{EN6#d_#O5z6sbb?hzjs-xo*43Gqy>A|{Xk5a6 None: logger.info(f"Content Generation Orchestrator initialized successfully ({mode_str} mode)") - def _new_token_accumulator(self, conversation_id: str = "") -> TokenUsageAccumulator: + def _new_token_accumulator( + self, conversation_id: str = "", user_id: str = "" + ) -> TokenUsageAccumulator: """Create a TokenUsageAccumulator pre-populated with this orchestrator's - agent->model map and default chat model. Telemetry is best-effort.""" + agent->model map and default chat model. Telemetry is best-effort. + + If ``user_id`` is not provided, falls back to the per-request value + stored in the ``_current_user_id`` ContextVar so accumulators created + deep inside the workflow still carry the caller's user id. 
+ """ return TokenUsageAccumulator( conversation_id=conversation_id, + user_id=user_id or _current_user_id.get(""), agent_model_map=self._agent_model_map, default_model=self._default_model, ) @@ -726,7 +741,8 @@ async def process_message( self, message: str, conversation_id: str, - context: Optional[dict] = None + context: Optional[dict] = None, + user_id: str = "" ) -> AsyncIterator[dict]: """ Process a user message through the orchestrated workflow. @@ -769,9 +785,10 @@ async def process_message( if context: full_input = f"Context:\n{json.dumps(context, indent=2)}\n\nUser Message:\n{message}" + _ctx_token = _current_user_id.set(user_id or "") try: # Per-request token usage accumulator for App Insights telemetry. - token_acc = self._new_token_accumulator(conversation_id) + token_acc = self._new_token_accumulator(conversation_id, user_id) # Collect events from the workflow stream events = [] @@ -861,12 +878,15 @@ async def process_message( "is_final": True, "metadata": {"conversation_id": conversation_id} } + finally: + _current_user_id.reset(_ctx_token) async def send_user_response( self, request_id: str, user_response: str, - conversation_id: str + conversation_id: str, + user_id: str = "" ) -> AsyncIterator[dict]: """ Send a user response to a pending workflow request. 
@@ -898,7 +918,8 @@ async def send_user_response( return # Exit immediately - do not continue workflow try: - token_acc = self._new_token_accumulator(conversation_id) + token_acc = self._new_token_accumulator(conversation_id, user_id) + _ctx_token = _current_user_id.set(user_id or "") responses = {request_id: user_response} async for event in self._workflow.send_responses_streaming(responses): @@ -973,10 +994,16 @@ async def send_user_response( "is_final": True, "metadata": {"conversation_id": conversation_id} } + finally: + try: + _current_user_id.reset(_ctx_token) + except (LookupError, ValueError, NameError): + pass async def parse_brief( self, - brief_text: str + brief_text: str, + user_id: str = "" ) -> tuple[CreativeBrief, str | None, bool]: """ Parse a free-text creative brief into structured format. @@ -994,6 +1021,17 @@ async def parse_brief( if not self._initialized: self.initialize() + _ctx_token = _current_user_id.set(user_id or "") + try: + return await self._parse_brief_impl(brief_text, user_id) + finally: + _current_user_id.reset(_ctx_token) + + async def _parse_brief_impl( + self, + brief_text: str, + user_id: str = "" + ) -> tuple[CreativeBrief, str | None, bool]: # PROACTIVE CONTENT SAFETY CHECK - Block harmful content at input layer is_harmful, matched_pattern = _check_input_for_harmful_content(brief_text) if is_harmful: @@ -1013,7 +1051,7 @@ async def parse_brief( return empty_brief, RAI_HARMFUL_CONTENT_RESPONSE, True # SECONDARY RAI CHECK - Use LLM-based classifier for comprehensive safety/scope validation - token_acc = self._new_token_accumulator() + token_acc = self._new_token_accumulator(user_id=user_id) try: rai_response = await self._rai_agent.run(brief_text) try: @@ -1488,7 +1526,8 @@ async def generate_content( self, brief: CreativeBrief, products: list = None, - generate_images: bool = True + generate_images: bool = True, + user_id: str = "" ) -> dict: """ Generate complete content package from a confirmed creative brief. 
@@ -1497,6 +1536,7 @@ async def generate_content( brief: Confirmed creative brief products: List of products to feature generate_images: Whether to generate images + user_id: Optional caller's user id, propagated to token usage telemetry Returns: dict: Generated content with compliance results @@ -1504,6 +1544,19 @@ async def generate_content( if not self._initialized: self.initialize() + _ctx_token = _current_user_id.set(user_id or "") + try: + return await self._generate_content_impl(brief, products, generate_images, user_id) + finally: + _current_user_id.reset(_ctx_token) + + async def _generate_content_impl( + self, + brief: CreativeBrief, + products: list = None, + generate_images: bool = True, + user_id: str = "" + ) -> dict: results = { "text_content": None, "image_prompt": None, @@ -1528,7 +1581,7 @@ async def generate_content( """ try: - token_acc = self._new_token_accumulator() + token_acc = self._new_token_accumulator(user_id=user_id) # Generate text content text_response = await self._agents["text_content"].run(text_request) try: @@ -1745,7 +1798,8 @@ async def regenerate_image( modification_request: str, brief: CreativeBrief, products: list = None, - previous_image_prompt: str = None + previous_image_prompt: str = None, + user_id: str = "" ) -> dict: """ Regenerate just the image based on a user modification request. 
@@ -1758,6 +1812,7 @@ async def regenerate_image( brief: The confirmed creative brief products: List of products to feature previous_image_prompt: The previous image prompt (if available) + user_id: Optional caller's user id, propagated to token usage telemetry Returns: dict: Regenerated image with updated prompt @@ -1765,6 +1820,21 @@ async def regenerate_image( if not self._initialized: self.initialize() + _ctx_token = _current_user_id.set(user_id or "") + try: + return await self._regenerate_image_impl( + modification_request, brief, products, previous_image_prompt + ) + finally: + _current_user_id.reset(_ctx_token) + + async def _regenerate_image_impl( + self, + modification_request: str, + brief: CreativeBrief, + products: list = None, + previous_image_prompt: str = None + ) -> dict: logger.info(f"Regenerating image with modification: {modification_request[:100]}...") # PROACTIVE CONTENT SAFETY CHECK diff --git a/src/coverage.xml b/src/coverage.xml new file mode 100644 index 000000000..396bc0f8c --- /dev/null +++ b/src/coverage.xml @@ -0,0 +1,2850 @@ + + + + + + C:\Users\v-ayazkhan\Accelerators\content-generation-solution-accelerator\src\backend + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From e5f3d81cc4b6c63bbff3b8633f805ba63c14ff64 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 25 May 2026 16:39:52 +0530 Subject: [PATCH 07/17] restored main.bicep and azure.yaml --- azure.yaml | 343 +++++++++++++++-------------------------------- infra/main.bicep | 278 ++++++++++++-------------------------- 2 files changed, 189 insertions(+), 432 deletions(-) diff --git a/azure.yaml b/azure.yaml index 8ba99fac6..23f3ced27 100644 --- a/azure.yaml +++ b/azure.yaml @@ -1,3 +1,7 @@ +environment: + name: content-generation + location: eastus + name: content-generation metadata: template: content-generation@1.22 @@ -5,281 +9,144 @@ metadata: requiredVersions: azd: '>= 1.18.0 != 1.23.9' +parameters: + solutionPrefix: + type: string + default: contentgen + displayName: Solution Prefix + description: A unique prefix for all resources (3-15 chars) + azureAiServiceLocation: + type: string + default: eastus + displayName: AI Services Location + description: Location for Azure AI Services deployments + enableMonitoring: + type: boolean + default: false + displayName: Enable Monitoring (WAF) + description: Enable Log Analytics 
and Application Insights + enableScalability: + type: boolean + default: false + displayName: Enable Scalability (WAF) + description: Enable auto-scaling and higher SKUs + enableRedundancy: + type: boolean + default: false + displayName: Enable Redundancy (WAF) + description: Enable zone redundancy and geo-replication + enablePrivateNetworking: + type: boolean + default: false + displayName: Enable Private Networking (WAF) + description: Enable VNet integration and private endpoints + infra: + provider: bicep path: ./infra module: main -services: - frontend: - project: ./src/App/server - language: js - host: appservice - dist: ./dist - resourceName: ${APP_SERVICE_NAME} - hooks: - prepackage: - windows: - shell: pwsh - run: ../../../infra/scripts/package_frontend.ps1 - continueOnError: false - posix: - shell: sh - run: chmod +x ../../../infra/scripts/package_frontend.sh && ../../../infra/scripts/package_frontend.sh - continueOnError: false +workflows: + up: + steps: + - azd: provision hooks: - preprovision: - windows: - shell: pwsh - run: | - Write-Host "Preparing deployment..." -ForegroundColor Cyan - - # Check if image exists in the ACR (handles fresh deploy AND deployment mode switch) - $acrName = azd env get-value AZURE_ENV_CONTAINER_REGISTRY_NAME 2>$null - $global:LASTEXITCODE = 0 - $skipAci = $false - - if (-not $acrName) { - Write-Host "First deployment - ACI will be deployed after image build" -ForegroundColor Yellow - $skipAci = $true - } elseif ($acrName -eq 'contentgencontainerreg') { - # Switching from standard (shared ACR) to custom (own ACR) deployment - # Clear ACI name so postprovision deploys fresh with new ACR - Write-Host "Switching to custom deployment - ACI will be redeployed with new ACR" -ForegroundColor Yellow - azd env set CONTAINER_INSTANCE_NAME "" - $skipAci = $true - } else { - # Custom ACR exists - check if image is present - Write-Host "Checking for existing image in $acrName..." 
-ForegroundColor Cyan - $imageCheck = az acr repository show --name $acrName --image "content-gen-api:latest" 2>$null - $global:LASTEXITCODE = 0 - - if (-not $imageCheck) { - Write-Host "Image not found in ACR - will build in postprovision" -ForegroundColor Yellow - $skipAci = $true - } else { - Write-Host "Image found - ACI deployment will proceed" -ForegroundColor Green - } - } - - if ($skipAci) { - azd env set AZURE_ENV_IMAGE_TAG none - } - continueOnError: false - posix: - shell: sh - run: | - echo "Preparing deployment..." - - # Check if image exists in the ACR (handles fresh deploy AND deployment mode switch) - ACR_NAME=$(azd env get-value AZURE_ENV_CONTAINER_REGISTRY_NAME 2>/dev/null || echo "") - SKIP_ACI=false - - if [ -z "$ACR_NAME" ]; then - echo "First deployment - ACI will be deployed after image build" - SKIP_ACI=true - elif [ "$ACR_NAME" = "contentgencontainerreg" ]; then - # Switching from standard (shared ACR) to custom (own ACR) deployment - # Clear ACI name so postprovision deploys fresh with new ACR - echo "Switching to custom deployment - ACI will be redeployed with new ACR" - azd env set CONTAINER_INSTANCE_NAME "" - SKIP_ACI=true - else - # Custom ACR exists - check if image is present - echo "Checking for existing image in $ACR_NAME..." 
- if az acr repository show --name "$ACR_NAME" --image "content-gen-api:latest" >/dev/null 2>&1; then - echo "Image found - ACI deployment will proceed" - else - echo "Image not found in ACR - will build in postprovision" - SKIP_ACI=true - fi - fi - - if [ "$SKIP_ACI" = "true" ]; then - azd env set AZURE_ENV_IMAGE_TAG none - fi - continueOnError: false - postprovision: windows: - shell: pwsh run: | - $acrName = $env:AZURE_ENV_CONTAINER_REGISTRY_NAME - $resourceGroup = $env:RESOURCE_GROUP_NAME - $backendImage = $env:BACKEND_IMAGE_NAME - $appServiceName = $env:APP_SERVICE_NAME - - if (-not $acrName -or -not $resourceGroup -or -not $appServiceName) { - Write-Host "ERROR: Missing required environment variables" -ForegroundColor Red - exit 1 - } - - # Check if ACI already exists (reads from persisted azd env) - $aciName = azd env get-value CONTAINER_INSTANCE_NAME 2>$null - $global:LASTEXITCODE = 0 - - # ===== Build Backend Image (ACR Build) ===== + Write-Host "===== Provision Complete =====" -ForegroundColor Green Write-Host "" - Write-Host "===== Building Backend Image =====" -ForegroundColor Yellow - Write-Host "Registry: $acrName" -ForegroundColor Cyan - Write-Host "Image: ${backendImage}:latest" -ForegroundColor Cyan - - az acr login --name $acrName 2>$null - az acr build --registry $acrName --image "${backendImage}:latest" --file ./src/backend/ApiApp.Dockerfile ./src/backend - if ($LASTEXITCODE -ne 0) { - Write-Host "Failed to build container image" -ForegroundColor Red - exit 1 - } - Write-Host "Container image built and pushed successfully!" 
-ForegroundColor Green - - # ===== Deploy ACI if not already deployed ===== - if (-not $aciName) { - Write-Host "" - Write-Host "===== Deploying Container Instance =====" -ForegroundColor Yellow - azd env set AZURE_ENV_IMAGE_TAG latest - - # Use az deployment instead of azd provision to avoid hook recursion - # Pass parameters inline (main.parameters.json uses AZD ${VAR} syntax not supported by az CLI) - Write-Host "Deploying ACI via Bicep..." -ForegroundColor Cyan - - az deployment group create ` - --resource-group $resourceGroup ` - --template-file ./infra/main.bicep ` - --parameters solutionName=$env:AZURE_ENV_NAME ` - --parameters location=$env:AZURE_LOCATION ` - --parameters azureAiServiceLocation=$env:AZURE_ENV_AI_SERVICE_LOCATION ` - --parameters imageTag=latest ` - --query "properties.outputs" -o json | Out-Null - - if ($LASTEXITCODE -eq 0) { - # Refresh azd env with new outputs - azd env refresh --no-prompt 2>$null - Write-Host "Container Instance deployed successfully!" -ForegroundColor Green - } else { - Write-Host "Container Instance deployment failed" -ForegroundColor Red - } - } else { - Write-Host "" - Write-Host "Restarting Container Instance to pick up new image..." 
-ForegroundColor Yellow - az container restart --name $aciName --resource-group $resourceGroup 2>$null - Write-Host "Container Instance: $aciName (restarted)" -ForegroundColor Cyan - } - + Write-Host "Web App URL: " -NoNewline + Write-Host "$env:WEB_APP_URL" -ForegroundColor Cyan + Write-Host "Storage Account: " -NoNewline + Write-Host "$env:AZURE_BLOB_ACCOUNT_NAME" -ForegroundColor Cyan + Write-Host "AI Search Service: " -NoNewline + Write-Host "$env:AI_SEARCH_SERVICE_NAME" -ForegroundColor Cyan + Write-Host "AI Search Index: " -NoNewline + Write-Host "$env:AZURE_AI_SEARCH_PRODUCTS_INDEX" -ForegroundColor Cyan + Write-Host "AI Service Location: " -NoNewline + Write-Host "$env:AZURE_ENV_AI_SERVICE_LOCATION" -ForegroundColor Cyan + Write-Host "Container Instance: " -NoNewline + Write-Host "$env:CONTAINER_INSTANCE_NAME" -ForegroundColor Cyan + + # Run post-deploy script to upload sample data and create search index Write-Host "" - Write-Host "===== Postprovision Complete - Frontend will deploy next =====" -ForegroundColor Green - exit 0 - interactive: true - continueOnError: false - - posix: - shell: sh - run: | - ACR_NAME="$AZURE_ENV_CONTAINER_REGISTRY_NAME" - RESOURCE_GROUP="$RESOURCE_GROUP_NAME" - BACKEND_IMAGE="$BACKEND_IMAGE_NAME" - APP_SERVICE="$APP_SERVICE_NAME" - - if [ -z "$ACR_NAME" ] || [ -z "$RESOURCE_GROUP" ] || [ -z "$APP_SERVICE" ]; then - echo "ERROR: Missing required environment variables" - exit 1 - fi - - # Check if ACI already exists (reads from persisted azd env) - ACI_NAME=$(azd env get-value CONTAINER_INSTANCE_NAME 2>/dev/null || echo "") - - # ===== Build Backend Image (ACR Build) ===== - echo "" - echo "===== Building Backend Image =====" - echo "Registry: $ACR_NAME" - echo "Image: $BACKEND_IMAGE:latest" - - if az acr build --registry "$ACR_NAME" --image "$BACKEND_IMAGE:latest" --file ./src/backend/ApiApp.Dockerfile ./src/backend; then - echo "Container image built and pushed successfully!" 
- else - echo "Failed to build container image" - exit 1 - fi - - # ===== Deploy ACI if not already deployed ===== - if [ -z "$ACI_NAME" ]; then - echo "" - echo "===== Deploying Container Instance =====" - azd env set AZURE_ENV_IMAGE_TAG latest - - # Use az deployment instead of azd provision to avoid hook recursion - # Pass parameters inline (main.parameters.json uses AZD ${VAR} syntax not supported by az CLI) - echo "Deploying ACI via Bicep..." - if az deployment group create \ - --resource-group "$RESOURCE_GROUP" \ - --template-file ./infra/main.bicep \ - --parameters solutionName="$AZURE_ENV_NAME" \ - --parameters location="$AZURE_LOCATION" \ - --parameters azureAiServiceLocation="$AZURE_ENV_AI_SERVICE_LOCATION" \ - --parameters imageTag=latest \ - --query "properties.outputs" -o json > /dev/null; then - # Refresh azd env with new outputs - azd env refresh --no-prompt 2>/dev/null - echo "Container Instance deployed successfully!" - else - echo "Container Instance deployment failed" - fi - else - echo "" - echo "Restarting Container Instance to pick up new image..." - az container restart --name "$ACI_NAME" --resource-group "$RESOURCE_GROUP" 2>/dev/null - echo "Container Instance: $ACI_NAME (restarted)" - fi - - echo "" - echo "===== Postprovision Complete - Frontend will deploy next =====" - - # Ensure postprovision exits successfully so frontend deploys - exit 0 - interactive: true - continueOnError: false - - postdeploy: - windows: - shell: pwsh - run: | + # Note: Cosmos DB role is assigned to deployer via Bicep. Write-Host "===== Running Post-Deploy Script =====" -ForegroundColor Yellow + Write-Host "This will upload sample data and create the search index..." 
+ + # Ensure post-deploy Python dependencies are installed $python = "python" if (Test-Path "./.venv/Scripts/python.exe") { $python = "./.venv/Scripts/python.exe" } - & $python -m pip install -r ./scripts/requirements-post-deploy.txt --quiet 2>$null + elseif (Test-Path "../.venv/Scripts/python.exe") { $python = "../.venv/Scripts/python.exe" } + elseif (Test-Path "./.venv/bin/python") { $python = "./.venv/bin/python" } + elseif (Test-Path "../.venv/bin/python") { $python = "../.venv/bin/python" } + & $python -m pip install -r ./scripts/requirements-post-deploy.txt --quiet | Out-Null if (Test-Path "./scripts/post_deploy.py") { & $python ./scripts/post_deploy.py --skip-tests + if ($LASTEXITCODE -eq 0) { Write-Host "Post-deploy script completed successfully!" -ForegroundColor Green } else { - Write-Host "Post-deploy script completed with warnings" -ForegroundColor Yellow + Write-Host "Post-deploy script completed with warnings (some steps may have failed)" -ForegroundColor Yellow } + } else { + Write-Host "Warning: post_deploy.py not found, skipping sample data upload" -ForegroundColor Yellow } Write-Host "" Write-Host "===== Deployment Complete =====" -ForegroundColor Green - Write-Host "Access the web application:" -ForegroundColor White + Write-Host "" + Write-Host "Access the web application:" Write-Host " $env:WEB_APP_URL" -ForegroundColor Cyan - interactive: true + shell: pwsh continueOnError: false - + interactive: true posix: - shell: sh run: | + echo "===== Provision Complete =====" + echo "" + echo "Web App URL: $WEB_APP_URL" + echo "Storage Account: $AZURE_BLOB_ACCOUNT_NAME" + echo "AI Search Service: $AI_SEARCH_SERVICE_NAME" + echo "AI Search Index: $AZURE_AI_SEARCH_PRODUCTS_INDEX" + echo "AI Service Location: $AZURE_ENV_AI_SERVICE_LOCATION" + echo "Container Instance: $CONTAINER_INSTANCE_NAME" + + echo "" + echo "Container Registry: $AZURE_ENV_CONTAINER_REGISTRY_NAME" + + # Run post-deploy script to upload sample data and create search index + echo "" + # 
Note: Cosmos DB role is assigned to deployer via Bicep. echo "===== Running Post-Deploy Script =====" - PYTHON="python3" - if [ -f "./.venv/bin/python" ]; then PYTHON="./.venv/bin/python"; fi - $PYTHON -m pip install -r ./scripts/requirements-post-deploy.txt --quiet 2>/dev/null + echo "This will upload sample data and create the search index..." if [ -f "./scripts/post_deploy.py" ]; then - $PYTHON ./scripts/post_deploy.py --skip-tests \ + # Prefer local venv if present (repo root or content-gen) + if [ -x "./.venv/bin/python" ]; then + PYTHON_BIN="./.venv/bin/python" + elif [ -x "../.venv/bin/python" ]; then + PYTHON_BIN="../.venv/bin/python" + else + PYTHON_BIN="python3" + fi + + "$PYTHON_BIN" -m pip install -r ./scripts/requirements-post-deploy.txt --quiet > /dev/null \ + && "$PYTHON_BIN" ./scripts/post_deploy.py --skip-tests \ && echo "Post-deploy script completed successfully!" \ - || echo "Post-deploy script completed with warnings" + || echo "Post-deploy script completed with warnings (some steps may have failed)" + else + echo "Warning: post_deploy.py not found, skipping sample data upload" fi echo "" echo "===== Deployment Complete =====" + echo "" echo "Access the web application:" echo " $WEB_APP_URL" - interactive: true + shell: sh continueOnError: false + interactive: true diff --git a/infra/main.bicep b/infra/main.bicep index a16b90d67..40da9f64b 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -10,7 +10,6 @@ metadata description = '''Solution Accelerator for multimodal marketing content @description('Optional. A unique application/solution name for all resources in this deployment.') param solutionName string = 'contentgen' -@minLength(3) @maxLength(5) @description('Optional. A unique text value for the solution.') param solutionUniqueText string = substring(uniqueString(subscription().id, resourceGroup().name, solutionName), 0, 5) @@ -93,7 +92,7 @@ param azureOpenaiAPIVersion string = '2025-01-01-preview' @minValue(10) @description('Optional. 
AI model deployment token capacity.') -param gptModelCapacity int = 50 +param gptModelCapacity int = 150 @minValue(1) @description('Optional. Image model deployment capacity (RPM).') @@ -122,7 +121,7 @@ param vmAdminPassword string = '' param tags object = {} @description('Optional. Enable monitoring for applicable resources (WAF-aligned).') -param enableMonitoring bool = true +param enableMonitoring bool = false @description('Optional. Enable Azure AI Foundry mode for multi-agent orchestration.') param useFoundryMode bool = true @@ -136,21 +135,14 @@ param enableRedundancy bool = false @description('Optional. Enable private networking for applicable resources (WAF-aligned).') param enablePrivateNetworking bool = false -@description('Optional. Enable/Disable usage telemetry for module.') -param enableTelemetry bool = true - -@description('Optional. Frontend image name (without tag).') -param frontendImageName string = 'content-gen-app' +@description('Optional. The existing Container Registry name (without .azurecr.io). Must contain pre-built images: content-gen-app and content-gen-api.') +param acrName string = 'contentgencontainerreg' -@description('Optional. Backend image name (without tag).') -param backendImageName string = 'content-gen-api' +@description('Optional. Image Tag.') +param imageTag string = 'latest' -@description('Optional. Image tag for container deployment. Leave empty to skip ACI deployment.') -param imageTag string - -@description('Optional. Azure Container Registry name (unused - ACR name is auto-generated). Declared for parameter file compatibility.') -#disable-next-line no-unused-params -param acrName string = '' +@description('Optional. Enable/Disable usage telemetry for module.') +param enableTelemetry bool = true @description('Optional. Created by user name.') param createdBy string = contains(deployer(), 'userPrincipalName')? 
split(deployer().userPrincipalName, '@')[0]: deployer().objectId @@ -161,6 +153,8 @@ param createdBy string = contains(deployer(), 'userPrincipalName')? split(deploy var solutionLocation = empty(location) ? resourceGroup().location : location +// acrName is required - points to existing ACR with pre-built images +var acrResourceName = acrName var solutionSuffix = toLower(trim(replace( replace( replace(replace(replace(replace('${solutionName}${solutionUniqueText}', '-', ''), '_', ''), '.', ''), '/', ''), @@ -171,9 +165,6 @@ var solutionSuffix = toLower(trim(replace( '' ))) -// ACR name is always auto-generated in custom deployment -var acrResourceName = 'cr${solutionSuffix}' - var cosmosDbZoneRedundantHaRegionPairs = { australiaeast: 'uksouth' centralus: 'eastus2' @@ -386,30 +377,6 @@ module userAssignedIdentity 'br/public:avm/res/managed-identity/user-assigned-id } } -// ========== Azure Container Registry ========== // -// CUSTOM DEPLOYMENT: ACR for remote Docker builds (AZD pushes images here) -module containerRegistry 'br/public:avm/res/container-registry/registry:0.9.0' = { - name: take('avm.res.container-registry.registry.${acrResourceName}', 64) - params: { - name: acrResourceName - location: solutionLocation - tags: tags - enableTelemetry: enableTelemetry - acrSku: 'Standard' - acrAdminUserEnabled: false - anonymousPullEnabled: false - publicNetworkAccess: 'Enabled' - networkRuleBypassOptions: 'AzureServices' - roleAssignments: [ - { - principalId: userAssignedIdentity.outputs.principalId - roleDefinitionIdOrName: '7f951dda-4ed3-4680-a7ca-43fe172d538d' // AcrPull - principalType: 'ServicePrincipal' - } - ] - } -} - // ========== Virtual Network and Networking Components ========== // var deployAdminAccessResources = enablePrivateNetworking && deployBastionAndJumpbox && !empty(vmAdminPassword) module virtualNetwork 'modules/virtualNetwork.bicep' = if (enablePrivateNetworking) { @@ -986,39 +953,35 @@ module webServerFarm 
'br/public:avm/res/web/serverfarm:0.7.0' = { var webSiteResourceName = 'app-${solutionSuffix}' // Backend URL: Use actual ACI IP/FQDN from deployment outputs // This also creates an implicit dependency ensuring ACI deploys before the web app -var aciBackendUrl = shouldDeployACI - ? (enablePrivateNetworking - ? 'http://${containerInstance!.properties.ipAddress.ip}:8000' - : 'http://${containerInstance!.properties.ipAddress.fqdn}:8000') - : '' +var aciBackendUrl = enablePrivateNetworking + ? 'http://${containerInstance.outputs.ipAddress}:8000' + : 'http://${containerInstance.outputs.fqdn}:8000' module webSite 'modules/web-sites.bicep' = { name: take('module.web-sites.${webSiteResourceName}', 64) params: { name: webSiteResourceName - tags: union(tags, { 'azd-service-name': 'frontend' }) + tags: tags location: solutionLocation - kind: 'app,linux' + kind: 'app,linux,container' serverFarmResourceId: webServerFarm.outputs.resourceId managedIdentities: { userAssignedResourceIds: [userAssignedIdentity!.outputs.resourceId] } siteConfig: { - // Node.js runtime for frontend server (code deployment via AZD) - linuxFxVersion: 'NODE|22-lts' + // Frontend container - same for both modes + linuxFxVersion: 'DOCKER|${acrResourceName}.azurecr.io/content-gen-app:${imageTag}' minTlsVersion: '1.2' alwaysOn: true ftpsState: 'FtpsOnly' - appCommandLine: 'node server.js' } virtualNetworkSubnetId: enablePrivateNetworking ? virtualNetwork!.outputs.webSubnetResourceId : null configs: concat( [ { - // Frontend server proxies to ACI backend + // Frontend container proxies to ACI backend (both modes) name: 'appsettings' properties: { - WEBSITES_PORT: '8080' + DOCKER_REGISTRY_SERVER_URL: 'https://${acrResourceName}.azurecr.io' BACKEND_URL: aciBackendUrl AZURE_CLIENT_ID: userAssignedIdentity.outputs.clientId - SCM_DO_BUILD_DURING_DEPLOYMENT: 'true' // Run npm install during deployment } applicationInsightResourceId: enableMonitoring ? 
applicationInsights!.outputs.resourceId : null } @@ -1043,130 +1006,69 @@ module webSite 'modules/web-sites.bicep' = { } // ========== Container Instance (Backend API) ========== // -// CUSTOM DEPLOYMENT: Inline ACI definition with managed identity auth for ACR var containerInstanceName = 'aci-${solutionSuffix}' -var backendImageUrl = '${containerRegistry.outputs.loginServer}/${backendImageName}:${imageTag}' -var aciPort = 8000 -var isPrivateNetworking = enablePrivateNetworking -// Construct identity resource ID from known values (required for deployment-time calculation) -var userAssignedIdentityResourceIdForACI = '/subscriptions/${subscription().subscriptionId}/resourceGroups/${resourceGroup().name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${userAssignedIdentityResourceName}' -// Deploy ACI only when imageTag is set to a real tag (not 'none') -var shouldDeployACI = !empty(imageTag) && imageTag != 'none' - -#disable-next-line no-deployments-resources -resource aciTelemetry 'Microsoft.Resources/deployments@2025-04-01' = if (enableTelemetry && shouldDeployACI) { - name: '46d3xbcp.res.containerinstance.${replace('-..--..-', '.', '-')}.${substring(uniqueString(deployment().name, solutionLocation), 0, 4)}' - properties: { - mode: 'Incremental' - template: { - '$schema': 'https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#' - contentVersion: '1.0.0.0' - resources: [] - } - } -} - -// Hash that changes whenever the monitoring config is toggled. -// Used as an ACI tag so that toggling enableMonitoring forces ARM to detect drift on the -// container group, triggering a restart and re-applying env vars like -// APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag, -// and tags must be calculatable at deployment-start (no runtime references allowed). 
-var monitoringConfigHash = uniqueString(string(enableMonitoring)) - -resource containerInstance 'Microsoft.ContainerInstance/containerGroups@2025-09-01' = if (shouldDeployACI) { - name: containerInstanceName - location: solutionLocation - tags: union(tags, { - 'monitoring-enabled': string(enableMonitoring) - 'monitoring-config-hash': monitoringConfigHash - }) - identity: { - type: 'UserAssigned' - userAssignedIdentities: { - '${userAssignedIdentityResourceIdForACI}': {} - } - } - properties: { - containers: [ - { - name: containerInstanceName - properties: { - image: backendImageUrl - resources: { - requests: { - cpu: 2 - memoryInGB: 4 - } - } - ports: [ - { - port: aciPort - protocol: 'TCP' - } - ] - environmentVariables: [ - // Azure OpenAI Settings - { name: 'AZURE_OPENAI_ENDPOINT', value: 'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' } - { name: 'AZURE_ENV_GPT_MODEL_NAME', value: gptModelName } - { name: 'AZURE_ENV_IMAGE_MODEL_NAME', value: imageModelConfig[imageModelChoice].name } - { name: 'AZURE_OPENAI_GPT_IMAGE_ENDPOINT', value: imageModelChoice != 'none' ? 
'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' : '' } - { name: 'AZURE_ENV_OPENAI_API_VERSION', value: azureOpenaiAPIVersion } - // Azure Cosmos DB Settings - { name: 'AZURE_COSMOS_ENDPOINT', value: 'https://cosmos-${solutionSuffix}.documents.azure.com:443/' } - { name: 'AZURE_COSMOS_DATABASE_NAME', value: cosmosDBDatabaseName } - { name: 'AZURE_COSMOS_PRODUCTS_CONTAINER', value: cosmosDBProductsContainer } - { name: 'AZURE_COSMOS_CONVERSATIONS_CONTAINER', value: cosmosDBConversationsContainer } - // Azure Blob Storage Settings - { name: 'AZURE_BLOB_ACCOUNT_NAME', value: storageAccountName } - { name: 'AZURE_BLOB_PRODUCT_IMAGES_CONTAINER', value: productImagesContainer } - { name: 'AZURE_BLOB_GENERATED_IMAGES_CONTAINER', value: generatedImagesContainer } - // Azure AI Search Settings - { name: 'AZURE_AI_SEARCH_ENDPOINT', value: 'https://${aiSearchName}.search.windows.net' } - { name: 'AZURE_AI_SEARCH_PRODUCTS_INDEX', value: azureSearchIndex } - { name: 'AZURE_AI_SEARCH_IMAGE_INDEX', value: 'product-images' } - // App Settings - { name: 'AZURE_CLIENT_ID', value: userAssignedIdentity.outputs.clientId } - { name: 'PORT', value: '8000' } - { name: 'WORKERS', value: '4' } - { name: 'RUNNING_IN_PRODUCTION', value: 'true' } - // Azure AI Foundry Settings - { name: 'USE_FOUNDRY', value: useFoundryMode ? 'true' : 'false' } - { name: 'AZURE_AI_PROJECT_ENDPOINT', value: aiFoundryAiProjectEndpoint } - { name: 'AZURE_AI_MODEL_DEPLOYMENT_NAME', value: gptModelName } - { name: 'AZURE_AI_IMAGE_MODEL_DEPLOYMENT', value: imageModelConfig[imageModelChoice].name } - // Logging Settings - { name: 'AZURE_BASIC_LOGGING_LEVEL', value: 'INFO' } - { name: 'AZURE_PACKAGE_LOGGING_LEVEL', value: 'WARNING' } - { name: 'AZURE_LOGGING_PACKAGES', value: '' } - // Application Insights - { name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', value: enableMonitoring ? 
applicationInsights!.outputs.connectionString : '' } - ] - } - } - ] - osType: 'Linux' - restartPolicy: 'Always' - subnetIds: isPrivateNetworking ? [ - { - id: virtualNetwork!.outputs.aciSubnetResourceId - } - ] : null - ipAddress: { - type: isPrivateNetworking ? 'Private' : 'Public' - ports: [ - { - port: aciPort - protocol: 'TCP' - } - ] - dnsNameLabel: isPrivateNetworking ? null : containerInstanceName - } - // Managed identity auth for ACR (instead of anonymous pull) - imageRegistryCredentials: [ - { - server: containerRegistry.outputs.loginServer - identity: userAssignedIdentityResourceIdForACI - } +// Hash that changes whenever the monitoring config (enableMonitoring + connection string) changes. +// Used as an ACI tag so that toggling enableMonitoring (or rotating the App Insights component) +// forces ARM to detect drift on the container group, triggering a restart and re-applying env vars +// like APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag. +var monitoringConfigHash = uniqueString( + string(enableMonitoring), + enableMonitoring ? applicationInsights!.outputs.connectionString : 'monitoring-disabled' +) + +module containerInstance 'modules/container-instance.bicep' = { + name: take('module.container-instance.${containerInstanceName}', 64) + params: { + name: containerInstanceName + location: solutionLocation + tags: union(tags, { + 'monitoring-enabled': string(enableMonitoring) + 'monitoring-config-hash': monitoringConfigHash + }) + containerImage: '${acrResourceName}.azurecr.io/content-gen-api:${imageTag}' + cpu: 2 + memoryInGB: 4 + port: 8000 + // Only pass subnetResourceId when private networking is enabled + subnetResourceId: enablePrivateNetworking ? 
virtualNetwork!.outputs.aciSubnetResourceId : '' + userAssignedIdentityResourceId: userAssignedIdentity.outputs.resourceId + enableTelemetry: enableTelemetry + environmentVariables: [ + // Azure OpenAI Settings + { name: 'AZURE_OPENAI_ENDPOINT', value: 'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' } + { name: 'AZURE_ENV_GPT_MODEL_NAME', value: gptModelName } + { name: 'AZURE_ENV_IMAGE_MODEL_NAME', value: imageModelConfig[imageModelChoice].name } + { name: 'AZURE_OPENAI_GPT_IMAGE_ENDPOINT', value: imageModelChoice != 'none' ? 'https://${aiFoundryAiServicesResourceName}.openai.azure.com/' : '' } + { name: 'AZURE_ENV_OPENAI_API_VERSION', value: azureOpenaiAPIVersion } + // Azure Cosmos DB Settings + { name: 'AZURE_COSMOS_ENDPOINT', value: 'https://cosmos-${solutionSuffix}.documents.azure.com:443/' } + { name: 'AZURE_COSMOS_DATABASE_NAME', value: cosmosDBDatabaseName } + { name: 'AZURE_COSMOS_PRODUCTS_CONTAINER', value: cosmosDBProductsContainer } + { name: 'AZURE_COSMOS_CONVERSATIONS_CONTAINER', value: cosmosDBConversationsContainer } + // Azure Blob Storage Settings + { name: 'AZURE_BLOB_ACCOUNT_NAME', value: storageAccountName } + { name: 'AZURE_BLOB_PRODUCT_IMAGES_CONTAINER', value: productImagesContainer } + { name: 'AZURE_BLOB_GENERATED_IMAGES_CONTAINER', value: generatedImagesContainer } + // Azure AI Search Settings + { name: 'AZURE_AI_SEARCH_ENDPOINT', value: 'https://${aiSearchName}.search.windows.net' } + { name: 'AZURE_AI_SEARCH_PRODUCTS_INDEX', value: azureSearchIndex } + { name: 'AZURE_AI_SEARCH_IMAGE_INDEX', value: 'product-images' } + // App Settings + { name: 'AZURE_CLIENT_ID', value: userAssignedIdentity.outputs.clientId } + { name: 'PORT', value: '8000' } + { name: 'WORKERS', value: '4' } + { name: 'RUNNING_IN_PRODUCTION', value: 'true' } + // Azure AI Foundry Settings + { name: 'USE_FOUNDRY', value: useFoundryMode ? 
'true' : 'false' } + { name: 'AZURE_AI_PROJECT_ENDPOINT', value: aiFoundryAiProjectEndpoint } + { name: 'AZURE_AI_MODEL_DEPLOYMENT_NAME', value: gptModelName } + { name: 'AZURE_AI_IMAGE_MODEL_DEPLOYMENT', value: imageModelConfig[imageModelChoice].name } + // Logging Settings + { name: 'AZURE_BASIC_LOGGING_LEVEL', value: 'INFO' } + { name: 'AZURE_PACKAGE_LOGGING_LEVEL', value: 'WARNING' } + { name: 'AZURE_LOGGING_PACKAGES', value: '' } + // Application Insights + { name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', value: enableMonitoring ? applicationInsights!.outputs.connectionString : '' } ] } } @@ -1248,13 +1150,13 @@ output AZURE_APPLICATION_INSIGHTS_CONNECTION_STRING string = (enableMonitoring & output AZURE_ENV_AI_SERVICE_LOCATION string = azureAiServiceLocation @description('Contains Container Instance Name') -output CONTAINER_INSTANCE_NAME string = shouldDeployACI ? containerInstance!.name : '' +output CONTAINER_INSTANCE_NAME string = containerInstance.outputs.name @description('Contains Container Instance FQDN (only for non-private networking)') -output CONTAINER_INSTANCE_FQDN string = (shouldDeployACI && !isPrivateNetworking) ? containerInstance!.properties.ipAddress.fqdn : '' +output CONTAINER_INSTANCE_FQDN string = enablePrivateNetworking ? '' : containerInstance.outputs.fqdn @description('Contains ACR Name') -output AZURE_ENV_CONTAINER_REGISTRY_NAME string = containerRegistry.outputs.name +output AZURE_ENV_CONTAINER_REGISTRY_NAME string = acrResourceName @description('Contains flag for Azure AI Foundry usage') output USE_FOUNDRY bool = useFoundryMode ? 
true : false @@ -1267,15 +1169,3 @@ output AZURE_AI_MODEL_DEPLOYMENT_NAME string = gptModelName @description('Contains Azure AI Image Model Deployment Name (empty if none selected)') output AZURE_AI_IMAGE_MODEL_DEPLOYMENT string = imageModelConfig[imageModelChoice].name - -@description('Contains Managed Identity Client ID') -output AZURE_CLIENT_ID string = userAssignedIdentity.outputs.clientId - -@description('Frontend image name') -output FRONTEND_IMAGE_NAME string = frontendImageName - -@description('Backend image name') -output BACKEND_IMAGE_NAME string = backendImageName - -@description('Image tag') -output AZURE_ENV_IMAGE_TAG string = imageTag From cd71bad62e70dc146967d839237aabf5d1a67a9f Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 25 May 2026 17:38:18 +0530 Subject: [PATCH 08/17] remove unused field import from dataclass in token_usage.py --- src/backend/token_usage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/token_usage.py b/src/backend/token_usage.py index 6aab3d2e3..94d2566ca 100644 --- a/src/backend/token_usage.py +++ b/src/backend/token_usage.py @@ -18,7 +18,7 @@ from __future__ import annotations import logging -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Iterable, Optional from event_utils import track_event_if_configured From ee5b3f4e569e1c1f064d2413fe27707ccc33f459 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 25 May 2026 18:08:34 +0530 Subject: [PATCH 09/17] Refactor code --- .gitignore | 6 +- infra/main.json | 2 +- src/.coverage | Bin 53248 -> 0 bytes src/coverage.xml | 2850 ---------------------------------------------- 4 files changed, 6 insertions(+), 2852 deletions(-) delete mode 100644 src/.coverage delete mode 100644 src/coverage.xml diff --git a/.gitignore b/.gitignore index 310b95883..227e83aa6 100644 --- a/.gitignore +++ b/.gitignore @@ -70,4 +70,8 @@ pdf # RAI evaluation results rai_results/ -**/rai_results/ \ No newline at 
end of file +**/rai_results/ +# Python test coverage +.coverage +coverage.xml +htmlcov/ diff --git a/infra/main.json b/infra/main.json index 11ca660d2..9e101d340 100644 --- a/infra/main.json +++ b/infra/main.json @@ -134,7 +134,7 @@ }, "gptModelCapacity": { "type": "int", - "defaultValue": 50, + "defaultValue": 150, "minValue": 10, "metadata": { "description": "Optional. AI model deployment token capacity." diff --git a/src/.coverage b/src/.coverage deleted file mode 100644 index 4f70513dd57d28b86d1fc7d9e3d3258fc53217c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 53248 zcmeI5e{dAl9l+o2ZEkPx*CtZtj7YPN*pQMR1RNPCm>^Pzsg$aqRZ=gTyG^p@?l#=+ zC4@$D7pzpLKb$&PY0=PjI{f2U+bYToHF2CNTB*@iJC52RhykW5P*6!in(O!N-QEu% zj(>Ek#_!JE?%TKTz3=z?dEf7Q_inS7TW-2hH592;i$`TcT1-j_!;nTvB7_L=_QBiQ zct~*82^^V%^mIvq)OOwH6K^G6=Z}ear*EBD-nZ)J)hpMpme#Ml?#9(pCQq90mSC%?kyc7^ zty}6;;!>*`Q6x1MRztF(#@ZyKU9n0hbR|qDI>>{mjuxyn>(f+F4MQzMX@e#@<7!ln z_edQ|PqkSfE6q?g8<`AfL{Zyfw1hN2TfIt(E3HagiG>tBqs4qRT;;CX*5~E}vt}{d z71Jg;9)g#r$QsDVo3f@A8VYIgFx9>#F2_Rcie4?rISWG?=(*X*SR2=(b%rKI)R<{z zOjiw6i%H66C6q9f@I?)*rqHYn;1yZ}Z8cManlm-CvI^+rD_^8$ws^L3D<~J!EUfv8 zR5VmJb&e&XEr9&4M6{E34%Em9Hniz#PFL5^vc`VuzPU(W=It%&P768h(M+vGOx=`F za-Cl-96`ik*Z;Q6wU1t!Bv3+ES)-z?%zWrp|&P z$7!HRTDBl)`k8tgRTgxRS|VCYv9FmPqbm>~Fcw(FS`)Dl4Hcc*5YoDoxZI}HX0K+3 zh0ofNz;)^cXJB>3bgq+gfna?u=}wh9`M{hxOut~dQMR|T2foOKte(8cFlt?jDzcu4 zTLZ}_4U$%k{3vO~NcIYtQ^Pw0E9XpSPPSaIKAUi*c#aRup3S5(5kbc#`YtIlg68By z;Jk_8)2rVp8|~7Cy3=e>yt2d@_{Qw%6sIZ#>*?OPZ6j-{yuI9{AB91mz8ys>)117@ z-%@&Ny5tCqCY`bxhmeIaI5PuCv|w01+-xhJo0rh)vq#U4sknZ+Tz zzoY583ezf`w)ACIu2k7B>vR;-E2S1qizsp|GdDnuH9E*tb;;D>!da;y--g~p+i2^m zG@BlT7|Kn1}Bx18APC>R0%<;Lx`T&rqyWnDjl(` z_G>$V32ikhhcn?$JEfq-?snyZ<}$O<^|GNDYE+SE5mOn{vAShl3dI$;2yR7d`X@d?AjW%`F$UX9wj&?cLv@#S@BJgy# zp+Ua+WO2->X;We}3mz`5(X~jzOvw3yO?o`k)FOvElvub48thWxM;#ymPsz3iv*+-v zX}A=GuC+yIFsQ~RePlxssWF? 
z=c8bg05aR`l}Ufpp+KbW0+BXHdq~rxnqD9@%7f6Ly>=EjP@5CP-C~P4r3=D72)+~1 zAtu^NY!hT?FoFgXI{j%42tE!3*%N#k49toSztaPwJVf&`EN5zm-d^vq@ON`roH=sSWmtDKN;J;8Ib0B;_3P!u7vL=TfT8 zQqN!iyW?D{&1RYBt^bApL$qJ8{&&T=RM2KS3mk}V=Tf)WBd&P;@3c=)@%q2SzTi7= z{m+HC)Eax(<<|cW`v(7l^*KE*k4VM z(#5;CUAtnglp>|1c4SM^KXmXe(o;>VG{$fDP^}iZ*tZ}0)2;6vypaJ*KBSkDe%MKH zlan+;IY`ev$ByYmk?n^j(c^WD(M1Iy>TL7STecK-rT3Yyvt zme2;takxq@%l@Z#NRmsO0+jUgcl2{E(B>W&$C63*Xs2);T%1Q9tE2xHejYhEn@K_f zHrfDemt+of?;=;g{vI>WOHK`REUmranY|T(tvlAdy!6=H<5xa2xOMN2%_sYojr^S_ zm!#{S-gUSVYCQQ?#gXyX_{PcL650$Q3(3%NBHh-VeyJ_3T~kK0M!bCwr=NW6_4L@l zibLxj`zoor3No)B`BzE-+nSRHs1Sh&#Hg%BcqLjL;qa*xJdR*G@SnVBk%UL z#_veJ`B36$_%uV_J;RmW_u#n?Nb>OOB6(oX)~U&{LqFR0&U&`-4{P5$`d|Zj_@i;> z_vVS^Jm9$6!EU~`LGx})9~>DTo*X};jSNrnJoKqh>U`OeQcFq%VcUAP?`+?CZC&e6 z2LjER=N9|!uZ+}>G7wx=7@)xR%mJgtxK864X)df?chv&Yhh9|V7%yNsr!rj3N8O4)U;err})k$;$M(t#33;t>|rVkj6um4O9HoYX`^^*QS4PxOHOJ zbECV|lXIP?Cf?n0VDFtyvgIZ67If&PLvPl<@z8Tq<;iGzF(E^QIFibac<_UnRifn-hOro3yDs#|5nz#GDdwkXK}9Th!uso zoFvcC7X1BAdjDS(cM|cCxLbTx{EN6#d_#O5z6sbb?hzjs-xo*43Gqy>A|{Xk5a6 - - - - - C:\Users\v-ayazkhan\Accelerators\content-generation-solution-accelerator\src\backend - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From df02a01d29840417c491ec4e820ef33c812dd71a Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 25 May 2026 19:17:00 +0530 Subject: [PATCH 10/17] feat: enhance token usage telemetry with conversation ID tracking and improve Application Insights event emission --- .gitignore | 1 + docs/TokenUsageTelemetry.md | 8 +- infra/dashboards/token-usage-workbook.json | 2 +- infra/main_custom.bicep | 2 +- infra/workbook/README.md | 8 +- src/backend/app.py | 10 ++- src/backend/orchestrator.py | 95 ++++++++++++++++------ src/backend/token_usage.py | 88 +++++++++++--------- 8 files changed, 137 insertions(+), 77 deletions(-) diff --git a/.gitignore b/.gitignore index 227e83aa6..7e0f17ada 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,4 @@ rai_results/ .coverage coverage.xml htmlcov/ +coverage_html/ diff --git a/docs/TokenUsageTelemetry.md b/docs/TokenUsageTelemetry.md index f48f62c54..e787daa01 100644 --- a/docs/TokenUsageTelemetry.md +++ b/docs/TokenUsageTelemetry.md @@ -33,8 +33,12 @@ Three custom events are sent on every request that consumes LLM tokens Set `APPLICATIONINSIGHTS_CONNECTION_STRING` in the backend environment. 
Application Insights wiring is already configured in `src/backend/app.py` -via `configure_azure_monitor()`. If the env var is unset, telemetry calls -are no-ops — token tracking has zero runtime impact when not configured. +via `configure_azure_monitor()`. If the env var is unset, no telemetry is +sent to Application Insights — `TokenUsageAccumulator.flush()` short-circuits +the network emit path. Aggregated per-request totals are still written to +the local logger at `INFO` level (one `[TOKEN USAGE] ...` line per flush) +so token tracking remains useful for local debugging without a connection +string. When deploying via `azd up`, the Bicep templates create an Application Insights instance and pass the connection string to the App Service. diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index c962e0ff5..59d8cf4d0 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -84,7 +84,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let StageMapping = datatable(agent:string, Stage:string) [\n 'rai_agent', 'Safety & RAI',\n 'planning_agent', 'Brief Parsing',\n 'text_content_agent', 'Text Generation',\n 'image_content_agent', 'Image Generation',\n 'compliance_agent', 'Compliance'\n];\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| lookup kind=leftouter StageMapping on agent\n| extend Stage = iff(isempty(Stage), 'Other', Stage)\n| summarize\n TotalRequests = count(),\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerRequest = round(avg(total_tokens), 0)\n by Stage\n| order by TotalTokens desc", + "query": "let StageMapping 
= datatable(agent:string, Stage:string) [\n 'rai_agent', 'Safety & RAI',\n 'planning_agent', 'Brief Parsing',\n 'text_content_agent', 'Text Generation',\n 'image_content_agent', 'Image Generation',\n 'compliance_agent', 'Compliance'\n];\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| lookup kind=leftouter StageMapping on agent\n| extend Stage = iff(isempty(Stage), 'Other', Stage)\n| summarize\n Invocations = count(),\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerInvocation = round(avg(total_tokens), 0)\n by Stage\n| order by TotalTokens desc", "size": 0, "title": "Token usage by pipeline stage", "timeContextFromParameter": "TimeRange", diff --git a/infra/main_custom.bicep b/infra/main_custom.bicep index 873ad5869..a59c9ccfb 100644 --- a/infra/main_custom.bicep +++ b/infra/main_custom.bicep @@ -93,7 +93,7 @@ param azureOpenaiAPIVersion string = '2025-01-01-preview' @minValue(10) @description('Optional. AI model deployment token capacity.') -param gptModelCapacity int = 50 +param gptModelCapacity int = 150 @minValue(1) @description('Optional. Image model deployment capacity (RPM).') diff --git a/infra/workbook/README.md b/infra/workbook/README.md index bb0476d1f..ddacce9b2 100644 --- a/infra/workbook/README.md +++ b/infra/workbook/README.md @@ -100,9 +100,11 @@ New-AzResourceGroupDeployment ` `sourceId` it points at needs to be a valid Application Insights resource ID. This is what allows the workbook to fetch token-count telemetry from Application Insights deployed elsewhere. 
-- The workbook name is a stable GUID derived from the resource group and the - Application Insights resource ID, so re-running the deployment updates the - same workbook in place rather than creating duplicates. +- The workbook name is a stable GUID derived from the resource group ID and + a fixed seed (`'token-usage-workbook'`), so re-running the deployment updates + the same workbook in place rather than creating duplicates. The Application + Insights resource ID is **not** part of the name, which lets you re-point + the workbook at a different App Insights instance without renaming it. - Required permission: `Microsoft.Insights/workbooks/write` on the target resource group. **No** permissions are required on the Application Insights resource group at deployment time — the workbook only references it; the diff --git a/src/backend/app.py b/src/backend/app.py index 81be73866..2bd4e509f 100644 --- a/src/backend/app.py +++ b/src/backend/app.py @@ -353,7 +353,7 @@ async def _handle_parse_brief( logger.exception(f"Failed to save message to CosmosDB: {e}") # Parse the brief - brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id) + brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id, conversation_id=conversation_id) if blocked: track_event_if_configured("Error_RAI_Check_Failed", {"conversation_id": conversation_id, "user_id": user_id, "status": "Brief parse blocked by RAI"}) @@ -537,7 +537,7 @@ async def _handle_refine_brief( logger.exception(f"Failed to save refinement message: {e}") # Use orchestrator to refine the brief - brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id) + brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id, conversation_id=conversation_id) if blocked: track_event_if_configured("Error_RAI_Check_Failed", {"conversation_id": conversation_id, "user_id": user_id, "status": "Brief refinement blocked by RAI"}) @@ -944,7 +944,8 
@@ async def _run_regeneration_task( brief=brief, products=products_data, previous_image_prompt=previous_image_prompt, - user_id=user_id + user_id=user_id, + conversation_id=conversation_id ) # Check for RAI block @@ -1200,7 +1201,8 @@ async def _run_generation_task(task_id: str, brief: CreativeBrief, products_data brief=brief, products=products_data, generate_images=generate_images, - user_id=user_id + user_id=user_id, + conversation_id=conversation_id ) logger.info(f"Generation task {task_id} completed. Response keys: {list(response.keys()) if response else 'None'}") diff --git a/src/backend/orchestrator.py b/src/backend/orchestrator.py index 1ad617b78..f78ae3fb7 100644 --- a/src/backend/orchestrator.py +++ b/src/backend/orchestrator.py @@ -56,6 +56,12 @@ # orchestrator methods; read by ``_new_token_accumulator``. _current_user_id: ContextVar[str] = ContextVar("_current_user_id", default="") +# Per-request conversation_id propagated the same way as ``_current_user_id`` +# so token-usage telemetry emitted from deep helpers (image generation, +# regenerate flows, etc.) can be correlated by conversation in Application +# Insights / KQL even when the helper isn't directly given a conversation_id. +_current_conversation_id: ContextVar[str] = ContextVar("_current_conversation_id", default="") + # Harmful content patterns to detect in USER INPUT before processing # This provides proactive content safety by blocking harmful requests at the input layer @@ -726,12 +732,13 @@ def _new_token_accumulator( """Create a TokenUsageAccumulator pre-populated with this orchestrator's agent->model map and default chat model. Telemetry is best-effort. - If ``user_id`` is not provided, falls back to the per-request value - stored in the ``_current_user_id`` ContextVar so accumulators created - deep inside the workflow still carry the caller's user id. 
+ If ``user_id`` / ``conversation_id`` are not provided, falls back to + the per-request values stored in the ``_current_user_id`` / + ``_current_conversation_id`` ContextVars so accumulators created deep + inside the workflow still carry the caller's correlation ids. """ return TokenUsageAccumulator( - conversation_id=conversation_id, + conversation_id=conversation_id or _current_conversation_id.get(""), user_id=user_id or _current_user_id.get(""), agent_model_map=self._agent_model_map, default_model=self._default_model, @@ -786,8 +793,12 @@ async def process_message( full_input = f"Context:\n{json.dumps(context, indent=2)}\n\nUser Message:\n{message}" _ctx_token = _current_user_id.set(user_id or "") + _ctx_conv = _current_conversation_id.set(conversation_id or "") + # Defined outside the try so the except/finally branches can safely + # reference ``token_acc`` even if creation fails. Each flush call is + # guarded by ``if token_acc is not None`` to avoid NoneType errors. + token_acc: Optional[TokenUsageAccumulator] = None try: - # Per-request token usage accumulator for App Insights telemetry. token_acc = self._new_token_accumulator(conversation_id, user_id) # Collect events from the workflow stream @@ -861,17 +872,19 @@ async def process_message( } # Emit aggregated LLM_*_Token_Usage events for the request. 
- try: - token_acc.flush(source="process_message") - except Exception as _tu_err: - logger.debug("token_usage flush failed: %s", _tu_err) + if token_acc is not None: + try: + token_acc.flush(source="process_message") + except Exception as _tu_err: + logger.debug("token_usage flush failed: %s", _tu_err) except Exception as e: logger.exception(f"Error processing message: {e}") - try: - token_acc.flush(source="process_message:error") - except Exception: - pass + if token_acc is not None: + try: + token_acc.flush(source="process_message:error") + except Exception: + pass yield { "type": "error", "content": f"An error occurred: {str(e)}", @@ -880,6 +893,7 @@ async def process_message( } finally: _current_user_id.reset(_ctx_token) + _current_conversation_id.reset(_ctx_conv) async def send_user_response( self, @@ -917,10 +931,12 @@ async def send_user_response( } return # Exit immediately - do not continue workflow + _ctx_token = _current_user_id.set(user_id or "") + _ctx_conv = _current_conversation_id.set(conversation_id or "") + # See process_message for the rationale of the None-init pattern. 
+ token_acc: Optional[TokenUsageAccumulator] = None try: token_acc = self._new_token_accumulator(conversation_id, user_id) - _ctx_token = _current_user_id.set(user_id or "") - responses = {request_id: user_response} async for event in self._workflow.send_responses_streaming(responses): try: @@ -977,17 +993,19 @@ async def send_user_response( "metadata": {"conversation_id": conversation_id} } - try: - token_acc.flush(source="send_user_response") - except Exception as _tu_err: - logger.debug("token_usage flush failed: %s", _tu_err) + if token_acc is not None: + try: + token_acc.flush(source="send_user_response") + except Exception as _tu_err: + logger.debug("token_usage flush failed: %s", _tu_err) except Exception as e: logger.exception(f"Error sending user response: {e}") - try: - token_acc.flush(source="send_user_response:error") - except Exception: - pass + if token_acc is not None: + try: + token_acc.flush(source="send_user_response:error") + except Exception: + pass yield { "type": "error", "content": f"An error occurred: {str(e)}", @@ -999,11 +1017,16 @@ async def send_user_response( _current_user_id.reset(_ctx_token) except (LookupError, ValueError, NameError): pass + try: + _current_conversation_id.reset(_ctx_conv) + except (LookupError, ValueError, NameError): + pass async def parse_brief( self, brief_text: str, - user_id: str = "" + user_id: str = "", + conversation_id: str = "" ) -> tuple[CreativeBrief, str | None, bool]: """ Parse a free-text creative brief into structured format. @@ -1011,6 +1034,9 @@ async def parse_brief( Args: brief_text: Free-text creative brief from user + user_id: Optional caller's user id, propagated to token usage telemetry + conversation_id: Optional conversation id, propagated to token usage + telemetry for correlation in Application Insights. 
Returns: tuple: (CreativeBrief, clarifying_questions_or_none, is_blocked) @@ -1022,10 +1048,12 @@ async def parse_brief( self.initialize() _ctx_token = _current_user_id.set(user_id or "") + _ctx_conv = _current_conversation_id.set(conversation_id or "") try: return await self._parse_brief_impl(brief_text, user_id) finally: _current_user_id.reset(_ctx_token) + _current_conversation_id.reset(_ctx_conv) async def _parse_brief_impl( self, @@ -1527,7 +1555,8 @@ async def generate_content( brief: CreativeBrief, products: list = None, generate_images: bool = True, - user_id: str = "" + user_id: str = "", + conversation_id: str = "" ) -> dict: """ Generate complete content package from a confirmed creative brief. @@ -1537,6 +1566,9 @@ async def generate_content( products: List of products to feature generate_images: Whether to generate images user_id: Optional caller's user id, propagated to token usage telemetry + conversation_id: Optional conversation id, propagated to token usage + telemetry (including from deep helpers like ``_generate_foundry_image``) + so image-generation events can be correlated by conversation in KQL. Returns: dict: Generated content with compliance results @@ -1545,10 +1577,12 @@ async def generate_content( self.initialize() _ctx_token = _current_user_id.set(user_id or "") + _ctx_conv = _current_conversation_id.set(conversation_id or "") try: return await self._generate_content_impl(brief, products, generate_images, user_id) finally: _current_user_id.reset(_ctx_token) + _current_conversation_id.reset(_ctx_conv) async def _generate_content_impl( self, @@ -1580,8 +1614,10 @@ async def _generate_content_impl( Products to feature: {json.dumps(products or [])} """ + # Created outside the try so the post-try flush is safe even if an + # exception fires before the first assignment inside the try block. 
+ token_acc = self._new_token_accumulator(user_id=user_id) try: - token_acc = self._new_token_accumulator(user_id=user_id) # Generate text content text_response = await self._agents["text_content"].run(text_request) try: @@ -1799,7 +1835,8 @@ async def regenerate_image( brief: CreativeBrief, products: list = None, previous_image_prompt: str = None, - user_id: str = "" + user_id: str = "", + conversation_id: str = "" ) -> dict: """ Regenerate just the image based on a user modification request. @@ -1813,6 +1850,8 @@ async def regenerate_image( products: List of products to feature previous_image_prompt: The previous image prompt (if available) user_id: Optional caller's user id, propagated to token usage telemetry + conversation_id: Optional conversation id, propagated to token usage + telemetry for correlation in Application Insights. Returns: dict: Regenerated image with updated prompt @@ -1821,12 +1860,14 @@ async def regenerate_image( self.initialize() _ctx_token = _current_user_id.set(user_id or "") + _ctx_conv = _current_conversation_id.set(conversation_id or "") try: return await self._regenerate_image_impl( modification_request, brief, products, previous_image_prompt ) finally: _current_user_id.reset(_ctx_token) + _current_conversation_id.reset(_ctx_conv) async def _regenerate_image_impl( self, diff --git a/src/backend/token_usage.py b/src/backend/token_usage.py index 94d2566ca..c7a39c07b 100644 --- a/src/backend/token_usage.py +++ b/src/backend/token_usage.py @@ -18,6 +18,7 @@ from __future__ import annotations import logging +import os from dataclasses import dataclass from typing import Any, Iterable, Optional @@ -326,61 +327,70 @@ def has_data(self) -> bool: return self.totals.total_tokens > 0 def flush(self, *, source: str = "") -> None: - """Emit aggregated events to Application Insights. Safe to call once per request.""" + """Emit aggregated events to Application Insights. Safe to call once per request. 
+ + Short-circuits when ``APPLICATIONINSIGHTS_CONNECTION_STRING`` is unset so + we don't fan out 1+N+M no-op ``track_event_if_configured`` calls (each of + which currently emits a WARNING log line). The summary log at the bottom + of this method is still useful for local debugging and is left in place. + """ if not self.has_data(): return + ai_configured = bool(os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")) + base_dims = { "user_id": self.user_id, "conversation_id": self.conversation_id, "source": source, } - try: - track_event_if_configured( - EVENT_SUMMARY, - { - **base_dims, - "total_input_tokens": str(self.totals.input_tokens), - "total_output_tokens": str(self.totals.output_tokens), - "total_tokens": str(self.totals.total_tokens), - "agent_count": str(len(self.by_agent)), - "model_count": str(len(self.by_model)), - }, - ) - except Exception as e: - logger.warning("Failed to emit %s: %s", EVENT_SUMMARY, e) - - for agent_name, c in self.by_agent.items(): - try: - track_event_if_configured( - EVENT_AGENT, - { - **base_dims, - "agent_name": agent_name, - "model_deployment_name": c.model_deployment_name or self.default_model, - "input_tokens": str(c.input_tokens), - "output_tokens": str(c.output_tokens), - "total_tokens": str(c.total_tokens), - }, - ) - except Exception as e: - logger.warning("Failed to emit %s for %s: %s", EVENT_AGENT, agent_name, e) - - for model_name, c in self.by_model.items(): + if ai_configured: try: track_event_if_configured( - EVENT_MODEL, + EVENT_SUMMARY, { **base_dims, - "model_deployment_name": model_name, - "input_tokens": str(c.input_tokens), - "output_tokens": str(c.output_tokens), - "total_tokens": str(c.total_tokens), + "total_input_tokens": str(self.totals.input_tokens), + "total_output_tokens": str(self.totals.output_tokens), + "total_tokens": str(self.totals.total_tokens), + "agent_count": str(len(self.by_agent)), + "model_count": str(len(self.by_model)), }, ) except Exception as e: - logger.warning("Failed to emit %s for 
%s: %s", EVENT_MODEL, model_name, e) + logger.warning("Failed to emit %s: %s", EVENT_SUMMARY, e) + + for agent_name, c in self.by_agent.items(): + try: + track_event_if_configured( + EVENT_AGENT, + { + **base_dims, + "agent_name": agent_name, + "model_deployment_name": c.model_deployment_name or self.default_model, + "input_tokens": str(c.input_tokens), + "output_tokens": str(c.output_tokens), + "total_tokens": str(c.total_tokens), + }, + ) + except Exception as e: + logger.warning("Failed to emit %s for %s: %s", EVENT_AGENT, agent_name, e) + + for model_name, c in self.by_model.items(): + try: + track_event_if_configured( + EVENT_MODEL, + { + **base_dims, + "model_deployment_name": model_name, + "input_tokens": str(c.input_tokens), + "output_tokens": str(c.output_tokens), + "total_tokens": str(c.total_tokens), + }, + ) + except Exception as e: + logger.warning("Failed to emit %s for %s: %s", EVENT_MODEL, model_name, e) logger.info( "[TOKEN USAGE] source=%s user=%s conv=%s total=%d (in=%d, out=%d) " From 6ea5f41a3ae582ba2b9f8c82c1f4190de4c77b65 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Wed, 27 May 2026 21:43:57 +0530 Subject: [PATCH 11/17] aligned code with other GSAs --- docs/TokenUsageTelemetry.md | 12 +- infra/dashboards/token-usage-queries.kql | 4 +- infra/dashboards/token-usage-workbook.json | 160 --- infra/main.bicep | 5 - infra/main_custom.bicep | 5 - infra/monitoring/README.md | 9 - infra/monitoring/monitoring.bicep | 2 +- infra/workbook/README.md | 111 --- infra/workbook/workbook.bicep | 53 - src/backend/llm_token_telemetry.py | 1037 ++++++++++++++++++++ src/backend/orchestrator.py | 177 +++- src/backend/telemetry.py | 90 ++ src/backend/token_usage.py | 406 -------- 13 files changed, 1299 insertions(+), 772 deletions(-) delete mode 100644 infra/dashboards/token-usage-workbook.json delete mode 100644 infra/workbook/README.md delete mode 100644 infra/workbook/workbook.bicep create mode 100644 src/backend/llm_token_telemetry.py create mode 100644 
src/backend/telemetry.py delete mode 100644 src/backend/token_usage.py diff --git a/docs/TokenUsageTelemetry.md b/docs/TokenUsageTelemetry.md index e787daa01..0cabc2435 100644 --- a/docs/TokenUsageTelemetry.md +++ b/docs/TokenUsageTelemetry.md @@ -7,7 +7,7 @@ This page describes what is emitted, how to enable it, and how to visualize it. ## What is emitted Three custom events are sent on every request that consumes LLM tokens -(see `src/backend/token_usage.py`): +(see `src/backend/llm_token_telemetry.py`): | Event | When | Custom dimensions | |---|---|---| @@ -34,7 +34,7 @@ Three custom events are sent on every request that consumes LLM tokens Set `APPLICATIONINSIGHTS_CONNECTION_STRING` in the backend environment. Application Insights wiring is already configured in `src/backend/app.py` via `configure_azure_monitor()`. If the env var is unset, no telemetry is -sent to Application Insights — `TokenUsageAccumulator.flush()` short-circuits +sent to Application Insights — `_RequestTokenTracker.flush()` short-circuits the network emit path. Aggregated per-request totals are still written to the local logger at `INFO` level (one `[TOKEN USAGE] ...` line per flush) so token tracking remains useful for local debugging without a connection @@ -72,14 +72,6 @@ It contains 12 queries: 2. Go to **Monitoring → Logs**. 3. Paste any query from the file above and click **Run**. -### Build a workbook - -1. Open **Application Insights → Workbooks → + New**. -2. Add a **Query** step and paste a query from `token-usage-queries.kql`. -3. Pick a visualization (bar, time chart, table) and pin to a dashboard. -4. Repeat for each query you want as a tile. -5. Save the workbook to make it reusable across the team. 
- ## Verifying locally After triggering a brief generation in a dev environment with a valid diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql index b1b4af4f4..22008109e 100644 --- a/infra/dashboards/token-usage-queries.kql +++ b/infra/dashboards/token-usage-queries.kql @@ -4,13 +4,13 @@ // Run these in: App Insights -> Logs (or Log Analytics -> Logs) for the // workspace attached to the Content Generation backend. // -// Custom events emitted by the backend (see src/backend/token_usage.py): +// Custom events emitted by the backend (see src/backend/llm_token_telemetry.py): // * LLM_Token_Usage_Summary — one per request; aggregate totals // * LLM_Agent_Token_Usage — one per agent that consumed tokens in the request // * LLM_Model_Token_Usage — one per model deployment that was hit // // Common custom dimensions on every event: -// user_id, conversation_id, source +// user_id, conversation_id, source, app // Plus event-specific numeric dimensions stored as STRINGS — always cast with // toint() / tolong() in KQL. // ============================================================================= diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json deleted file mode 100644 index 59d8cf4d0..000000000 --- a/infra/dashboards/token-usage-workbook.json +++ /dev/null @@ -1,160 +0,0 @@ -{ - "version": "Notebook/1.0", - "items": [ - { - "type": 1, - "content": { - "json": "# Token Usage — Content Generation Solution Accelerator\n---\n\nThis workbook visualizes LLM token consumption emitted by the orchestrator as Application Insights custom events:\n\n- **`LLM_Token_Usage_Summary`** — one event per user turn / brief parse / regeneration. Carries `total_input_tokens`, `total_output_tokens`, `total_tokens`, `source`, `conversation_id`, `user_id`.\n- **`LLM_Agent_Token_Usage`** — one event per agent per turn. 
Carries `agent_name`, `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`.\n- **`LLM_Model_Token_Usage`** — one event per model deployment per turn. Carries `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`.\n\nUse the time-range selector at the top to widen or narrow the window." - }, - "name": "header-markdown" - }, - { - "type": 9, - "content": { - "version": "KqlParameterItem/1.0", - "parameters": [ - { - "id": "time-range", - "version": "KqlParameterItem/1.0", - "name": "TimeRange", - "label": "Time range", - "type": 4, - "isRequired": true, - "value": { "durationMs": 604800000 }, - "typeSettings": { - "selectableValues": [ - { "durationMs": 3600000 }, - { "durationMs": 14400000 }, - { "durationMs": 43200000 }, - { "durationMs": 86400000 }, - { "durationMs": 172800000 }, - { "durationMs": 604800000 }, - { "durationMs": 2592000000 } - ] - } - } - ] - }, - "name": "parameters" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n TotalRequests = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerRequest = round(avg(total_tokens), 0)", - "size": 1, - "title": "Total token usage", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "name": "tile-total-usage", - "styleSettings": { "showBorder": true } - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let _range = totimespan('{TimeRange:duration}');\nlet _bin = iff(_range > 1d, 1d, 1h);\ncustomEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend total_tokens = 
toint(customDimensions['total_tokens'])\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize TotalTokens = sum(total_tokens), InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, _bin)\n| render timechart", - "size": 0, - "title": "Token usage over time", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "100", - "name": "chart-usage-over-time" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Agent = agent\n| order by TotalTokens desc", - "size": 0, - "title": "Token usage by agent", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "50", - "name": "table-by-agent", - "styleSettings": { "showBorder": true } - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let StageMapping = datatable(agent:string, Stage:string) [\n 'rai_agent', 'Safety & RAI',\n 'planning_agent', 'Brief Parsing',\n 'text_content_agent', 'Text Generation',\n 'image_content_agent', 'Image Generation',\n 'compliance_agent', 'Compliance'\n];\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend 
total_tokens = toint(customDimensions['total_tokens'])\n| lookup kind=leftouter StageMapping on agent\n| extend Stage = iff(isempty(Stage), 'Other', Stage)\n| summarize\n Invocations = count(),\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerInvocation = round(avg(total_tokens), 0)\n by Stage\n| order by TotalTokens desc", - "size": 0, - "title": "Token usage by pipeline stage", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "50", - "name": "table-by-stage", - "styleSettings": { "showBorder": true } - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Model_Token_Usage'\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Model = model\n| order by TotalTokens desc", - "size": 0, - "title": "Token usage by model deployment", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "50", - "name": "table-by-model", - "styleSettings": { "showBorder": true } - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend source = tostring(customDimensions['source'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n Requests = count(),\n TotalTokens = sum(total_tokens),\n AvgTokensPerRequest = round(avg(total_tokens), 0)\n by Source = source\n| order by TotalTokens desc", - "size": 0, - "title": "Token usage by source 
(entry point)", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "50", - "name": "table-by-source", - "styleSettings": { "showBorder": true } - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| extend user_id = tostring(customDimensions['user_id'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend User = iff(isempty(user_id) or user_id == 'anonymous', 'anonymous', user_id)\n| summarize Requests = count(), TotalTokens = sum(total_tokens) by User\n| order by TotalTokens desc\n| take 25", - "size": 0, - "title": "Top users by token consumption", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "50", - "name": "table-top-users", - "styleSettings": { "showBorder": true } - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let _range = totimespan('{TimeRange:duration}');\nlet _bin = iff(_range > 1d, 1d, 1h);\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize TotalTokens = sum(total_tokens) by bin(timestamp, _bin), agent\n| render timechart", - "size": 0, - "title": "Per-agent token usage over time", - "timeContextFromParameter": "TimeRange", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "customWidth": "100", - "name": "chart-per-agent-over-time" - } - ], - "isLocked": false, - "fallbackResourceIds": [] -} diff --git a/infra/main.bicep b/infra/main.bicep index 40da9f64b..8ef058e12 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -360,11 +360,6 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = if (en } } -// ========== Token Usage Workbook ========== // -// The "Token Usage" 
Application Insights workbook is now deployed separately -// via infra/workbook/workbook.bicep so it can target an Application Insights -// resource in any resource group / subscription. See infra/workbook/README.md. - // ========== User Assigned Identity ========== // var userAssignedIdentityResourceName = 'id-${solutionSuffix}' module userAssignedIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.5.0' = { diff --git a/infra/main_custom.bicep b/infra/main_custom.bicep index a59c9ccfb..ef6332e2a 100644 --- a/infra/main_custom.bicep +++ b/infra/main_custom.bicep @@ -369,11 +369,6 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = if (en } } -// ========== Token Usage Workbook ========== // -// The "Token Usage" Application Insights workbook is now deployed separately -// via infra/workbook/workbook.bicep so it can target an Application Insights -// resource in any resource group / subscription. See infra/workbook/README.md. - // ========== User Assigned Identity ========== // var userAssignedIdentityResourceName = 'id-${solutionSuffix}' module userAssignedIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.5.0' = { diff --git a/infra/monitoring/README.md b/infra/monitoring/README.md index d0ea34fc2..3976fa378 100644 --- a/infra/monitoring/README.md +++ b/infra/monitoring/README.md @@ -66,15 +66,6 @@ azd env set APPLICATIONINSIGHTS_CONNECTION_STRING "$APPI_CS" azd deploy # re-deploy app code only, no infra changes ``` -## Then deploy the workbook - -```bash -az deployment group create \ - --resource-group \ - --template-file infra/workbook/workbook.bicep \ - --parameters applicationInsightsResourceId="$APPI_ID" -``` - ## Idempotency / re-runs Re-running this deployment against the same RG is safe — AVM modules use diff --git a/infra/monitoring/monitoring.bicep b/infra/monitoring/monitoring.bicep index 029b4ee36..05440ad0c 100644 --- a/infra/monitoring/monitoring.bicep +++ 
b/infra/monitoring/monitoring.bicep @@ -87,7 +87,7 @@ output logAnalyticsWorkspaceResourceId string = logAnalyticsWorkspace.outputs.re @description('Name of the Log Analytics workspace.') output logAnalyticsWorkspaceName string = logAnalyticsWorkspaceResourceName -@description('Resource ID of the Application Insights component (pass this to infra/workbook/workbook.bicep).') +@description('Resource ID of the Application Insights component.') output applicationInsightsResourceId string = applicationInsights.outputs.resourceId @description('Name of the Application Insights component.') diff --git a/infra/workbook/README.md b/infra/workbook/README.md deleted file mode 100644 index ddacce9b2..000000000 --- a/infra/workbook/README.md +++ /dev/null @@ -1,111 +0,0 @@ -# Token Usage Workbook (standalone deployment) - -This folder contains a **standalone** Bicep template that deploys the -**Token Usage** Application Insights workbook used by the Content Generation -Solution Accelerator. - -The workbook visualises the custom events emitted by the orchestrator: - -- `LLM_Token_Usage_Summary` -- `LLM_Agent_Token_Usage` -- `LLM_Model_Token_Usage` - -It is deployed **separately** from `infra/main.bicep` / -`infra/main_custom.bicep` so it can target an Application Insights instance -that lives in a **different resource group** (or subscription) from the rest -of the accelerator — for example, a shared observability workspace. - -## Files - -| File | Purpose | -| --- | --- | -| `workbook.bicep` | Bicep template that creates the workbook resource. | -| `../dashboards/token-usage-workbook.json` | Serialized workbook definition (loaded by the template). | - -## Parameters - -| Name | Required | Description | -| --- | --- | --- | -| `applicationInsightsResourceId` | No | Full resource ID of the Application Insights instance to query. Defaults to `Azure Monitor` (unbound — pick the instance later in the portal). Re-deploy with a real ID to (re)bind. 
| -| `workbookName` | No | Stable GUID name. Keep the default so re-deployments update the SAME workbook even when the App Insights ID changes. | -| `location` | No | Azure region for the workbook resource. Defaults to the resource group location. | -| `displayName` | No | Display name in the Azure portal. Defaults to `Token Usage`. | -| `tags` | No | Tags applied to the workbook resource. | - -## Bind / change Application Insights after deployment - -Because `workbookName` is stable by default, you can: - -1. **Deploy now without an App Insights ID** — workbook is created in - "Azure Monitor" scope and shows up under *Monitor → Workbooks*. Open it, - then use the resource picker at the top to point at any App Insights - instance ad-hoc. -2. **Bind / re-bind later by re-deploying** with a new - `applicationInsightsResourceId`. The same workbook resource is updated - in place; no duplicate is created. - -```bash -# 1) Deploy unbound first -az deployment group create \ - --resource-group rg-observability \ - --template-file infra/workbook/workbook.bicep - -# 2) Later, bind it to App Insights instance A -az deployment group create \ - --resource-group rg-observability \ - --template-file infra/workbook/workbook.bicep \ - --parameters applicationInsightsResourceId="$APPI_ID_A" - -# 3) Switch it to App Insights instance B - same workbook, new source -az deployment group create \ - --resource-group rg-observability \ - --template-file infra/workbook/workbook.bicep \ - --parameters applicationInsightsResourceId="$APPI_ID_B" -``` - -You can also change the binding from the **Azure portal**: open the -workbook → *Edit* → *Settings* → change the linked resource → *Save*. 
- -## Deploy with Azure CLI - -```bash -# Resource group where the WORKBOOK will live -WORKBOOK_RG="rg-observability" - -# Full resource ID of the EXISTING Application Insights instance -# (can be in a different resource group / subscription) -APPI_ID="/subscriptions//resourceGroups//providers/Microsoft.Insights/components/" - -az deployment group create \ - --resource-group "$WORKBOOK_RG" \ - --template-file infra/workbook/workbook.bicep \ - --parameters applicationInsightsResourceId="$APPI_ID" -``` - -## Deploy with Azure PowerShell - -```powershell -$workbookRg = "rg-observability" -$appiId = "/subscriptions//resourceGroups//providers/Microsoft.Insights/components/" - -New-AzResourceGroupDeployment ` - -ResourceGroupName $workbookRg ` - -TemplateFile infra/workbook/workbook.bicep ` - -applicationInsightsResourceId $appiId -``` - -## Notes - -- The workbook resource itself can live in any resource group; only the - `sourceId` it points at needs to be a valid Application Insights resource - ID. This is what allows the workbook to fetch token-count telemetry from - Application Insights deployed elsewhere. -- The workbook name is a stable GUID derived from the resource group ID and - a fixed seed (`'token-usage-workbook'`), so re-running the deployment updates - the same workbook in place rather than creating duplicates. The Application - Insights resource ID is **not** part of the name, which lets you re-point - the workbook at a different App Insights instance without renaming it. -- Required permission: `Microsoft.Insights/workbooks/write` on the target - resource group. **No** permissions are required on the Application Insights - resource group at deployment time — the workbook only references it; the - user *viewing* the workbook needs read access to the App Insights instance. 
diff --git a/infra/workbook/workbook.bicep b/infra/workbook/workbook.bicep deleted file mode 100644 index a59f8e72d..000000000 --- a/infra/workbook/workbook.bicep +++ /dev/null @@ -1,53 +0,0 @@ -// ============================================================================ -// Token Usage Workbook (standalone deployment) -// ---------------------------------------------------------------------------- -// Provisions the "Token Usage" Application Insights workbook that visualises -// LLM_Token_Usage_Summary / LLM_Agent_Token_Usage / LLM_Model_Token_Usage -// custom events emitted by the Content Generation Solution Accelerator -// orchestrator. -// -// This template is deployed independently of the main solution so it can -// target an existing Application Insights instance that lives in a different -// resource group (or subscription) from the rest of the accelerator. -// -// Scope: resourceGroup (the workbook resource is created in the resource -// group passed to the deployment command - it does NOT need to be the same -// resource group as the Application Insights instance). -// ============================================================================ - -targetScope = 'resourceGroup' - -@description('Optional. Full resource ID of the Application Insights instance the workbook should query. Leave as the default ("Azure Monitor") to deploy an unbound workbook and pick the App Insights instance later from the Azure portal. Re-deploy with a real resource ID to (re)bind the workbook to a specific App Insights instance.') -param applicationInsightsResourceId string = 'Azure Monitor' - -@description('Optional. Stable name used for the workbook. Keep the default to allow re-deployments to update the SAME workbook even when applicationInsightsResourceId changes. Override only if you want multiple independent copies in the same resource group.') -param workbookName string = guid(resourceGroup().id, 'token-usage-workbook') - -@description('Optional. 
Azure region for the workbook resource. Defaults to the resource group location.') -param location string = resourceGroup().location - -@description('Optional. Display name shown in the Azure portal workbook gallery.') -param displayName string = 'Token Usage' - -@description('Optional. Tags applied to the workbook resource.') -param tags object = {} - -resource tokenUsageWorkbook 'Microsoft.Insights/workbooks@2023-06-01' = { - name: workbookName - location: location - tags: tags - kind: 'shared' - properties: { - displayName: displayName - category: 'workbook' - sourceId: applicationInsightsResourceId - version: 'Notebook/1.0' - serializedData: loadTextContent('../dashboards/token-usage-workbook.json') - } -} - -@description('Resource ID of the deployed workbook.') -output workbookResourceId string = tokenUsageWorkbook.id - -@description('Name (GUID) of the deployed workbook.') -output workbookName string = tokenUsageWorkbook.name diff --git a/src/backend/llm_token_telemetry.py b/src/backend/llm_token_telemetry.py new file mode 100644 index 000000000..91f670c5c --- /dev/null +++ b/src/backend/llm_token_telemetry.py @@ -0,0 +1,1037 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Cross-accelerator LLM token-usage telemetry helpers. + +A single, dependency-light helper module that can be dropped into any Microsoft +Solution Accelerator to capture LLM token usage and emit standardized custom +events to Application Insights. + +Why this file exists +-------------------- +Seven solution accelerators have independently shipped near-identical +``token_usage_utils.py`` modules (see PRs: content-generation #860, CKM #933, +content-processing #586, Container-Migration #257, agentic-data-foundation +#383, customer-chatbot #218, MACAE #1003). 
They all: + +* extract token counts from agent_framework / Azure OpenAI responses, +* emit the same three custom events (``LLM_Token_Usage_Summary``, + ``LLM_Agent_Token_Usage``, ``LLM_Model_Token_Usage``), +* defensively swallow telemetry errors, +* duplicate the same KQL queries and Azure Workbook. + +This module consolidates the union of those behaviours behind one stable API +so each accelerator can replace its bespoke helper with an import. + +Public API +---------- +- ``TokenUsage`` -- immutable dataclass for counts +- ``extract_usage(obj)`` -- agent_framework run result / message +- ``extract_usage_from_dict(d)`` -- raw dict from any SDK +- ``extract_usage_from_stream_chunk`` -- streaming chunks +- ``extract_realtime_usage(resp)`` -- Azure AI Voice Live response.done +- ``TokenUsageEmitter`` -- emits the three events + optional + per-user / per-team / speech events +- ``TokenUsageScope`` -- context-manager that accumulates and + auto-emits on exit +- ``track_tokens`` -- decorator wrapper around the scope + +Design rules +------------ +* Telemetry NEVER raises. Extraction failures return ``None``; emission + failures are logged at WARNING. +* No hard dependency on ``azure-monitor-events-extension``; if absent the + emitter degrades to logging only. +* Arbitrary correlation dimensions are passed as ``**dimensions`` kwargs and + surface verbatim as custom-event properties. This is how each accelerator + attaches its own keys (``conversation_id``, ``process_id``, ``team_name``, + ``file_name``, ``tenant``, etc.) without forking the helper. 
+""" +from __future__ import annotations + +import asyncio +import functools +import logging +import os +import random +import time +from contextlib import AbstractContextManager +from dataclasses import dataclass, field +from typing import Any, Callable, Iterable, Mapping, Optional +from unittest.mock import NonCallableMock + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Event-name constants -- keep these stable; KQL queries and workbooks bind +# to these exact strings. +# --------------------------------------------------------------------------- +EVENT_SUMMARY = "LLM_Token_Usage_Summary" +EVENT_AGENT = "LLM_Agent_Token_Usage" +EVENT_MODEL = "LLM_Model_Token_Usage" +EVENT_USER = "LLM_User_Token_Usage" +EVENT_TEAM = "LLM_Team_Token_Usage" +EVENT_SPEECH = "Speech_Usage" + + +# Token-count field aliases observed across model providers / SDK versions. +_INPUT_KEYS = ( + "input_token_count", + "input_tokens", + "prompt_tokens", + "promptTokens", +) +_OUTPUT_KEYS = ( + "output_token_count", + "output_tokens", + "completion_tokens", + "completionTokens", +) +_TOTAL_KEYS = ( + "total_token_count", + "total_tokens", + "totalTokens", +) + + +# --------------------------------------------------------------------------- +# Data model +# --------------------------------------------------------------------------- +@dataclass(frozen=True) +class TokenUsage: + """Normalized token-usage record.""" + + input_tokens: int = 0 + output_tokens: int = 0 + total_tokens: int = 0 + + # Optional realtime / voice fields (None unless populated) + input_audio_tokens: Optional[int] = None + input_text_tokens: Optional[int] = None + input_cached_tokens: Optional[int] = None + output_audio_tokens: Optional[int] = None + output_text_tokens: Optional[int] = None + + @property + def has_any(self) -> bool: + return bool(self.input_tokens or self.output_tokens or self.total_tokens) + + def __add__(self, other: "TokenUsage") 
-> "TokenUsage": + if not isinstance(other, TokenUsage): + return NotImplemented + + def _sum(a: Optional[int], b: Optional[int]) -> Optional[int]: + if a is None and b is None: + return None + return (a or 0) + (b or 0) + + return TokenUsage( + input_tokens=self.input_tokens + other.input_tokens, + output_tokens=self.output_tokens + other.output_tokens, + total_tokens=self.total_tokens + other.total_tokens, + input_audio_tokens=_sum(self.input_audio_tokens, other.input_audio_tokens), + input_text_tokens=_sum(self.input_text_tokens, other.input_text_tokens), + input_cached_tokens=_sum(self.input_cached_tokens, other.input_cached_tokens), + output_audio_tokens=_sum(self.output_audio_tokens, other.output_audio_tokens), + output_text_tokens=_sum(self.output_text_tokens, other.output_text_tokens), + ) + + def to_event_props(self) -> dict[str, str]: + """Stringified property bag suitable for App Insights custom events.""" + props: dict[str, str] = { + "input_tokens": str(self.input_tokens), + "output_tokens": str(self.output_tokens), + "total_tokens": str(self.total_tokens), + } + for name in ( + "input_audio_tokens", + "input_text_tokens", + "input_cached_tokens", + "output_audio_tokens", + "output_text_tokens", + ): + value = getattr(self, name) + if value is not None: + props[name] = str(value) + return props + + +# --------------------------------------------------------------------------- +# Low-level coercion helpers +# --------------------------------------------------------------------------- +def _to_int(value: Any, default: int = 0) -> int: + """Best-effort int conversion; bool excluded; never raises.""" + if value is None or isinstance(value, bool): + return default + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + if isinstance(value, str): + s = value.strip() + if s.isdigit(): + return int(s) + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _get(obj: Any, key: str, default: 
Any = None) -> Any: + """Read an attribute or dict key uniformly.""" + if obj is None: + return default + if isinstance(obj, Mapping): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _is_iterable(obj: Any) -> bool: + """True only for real iterables (lists/tuples/sets/generators), NOT for + arbitrary objects (e.g. ``unittest.mock.Mock``) that happen to expose + ``__iter__`` but blow up on iteration.""" + if obj is None: + return False + if isinstance(obj, (list, tuple, set, frozenset)): + return True + # Strings are iterable but never the right answer for "messages". + if isinstance(obj, (str, bytes, bytearray, Mapping)): + return False + # Fall back to a duck-typed check, but reject Mock instances which would + # otherwise pretend to support iteration. + if isinstance(obj, NonCallableMock): + return False + return hasattr(obj, "__iter__") + + +def _read_counts(usage_obj: Any) -> Optional[TokenUsage]: + """Read ``input/output/total`` from any usage-bearing object/dict.""" + if usage_obj is None: + return None + + inp = out = tot = 0 + for k in _INPUT_KEYS: + v = _get(usage_obj, k) + if v: + inp = _to_int(v) + break + for k in _OUTPUT_KEYS: + v = _get(usage_obj, k) + if v: + out = _to_int(v) + break + for k in _TOTAL_KEYS: + v = _get(usage_obj, k) + if v: + tot = _to_int(v) + break + + if tot == 0 and (inp or out): + tot = inp + out + if not (inp or out or tot): + return None + return TokenUsage(input_tokens=inp, output_tokens=out, total_tokens=tot) + + +# --------------------------------------------------------------------------- +# Extraction -- public +# --------------------------------------------------------------------------- +def extract_usage(result: Any) -> Optional[TokenUsage]: + """Extract usage from an agent_framework run result, ChatMessage, or + OpenAI-style ChatCompletion. + + Checks (in order): + 1. ``result.usage_details`` or ``result.usage`` + 2. ``result.raw_representation.usage`` (OpenAI ChatCompletion shape) + 3. 
Aggregated ``result.messages[*].contents[*].usage_details`` + + Never raises -- returns ``None`` on any unexpected shape. + """ + if result is None: + return None + + try: + for attr in ("usage_details", "usage"): + found = _read_counts(_get(result, attr)) + if found: + return found + + raw = _get(result, "raw_representation") + if raw is not None: + found = _read_counts(_get(raw, "usage")) + if found: + return found + + aggregated = TokenUsage() + found_any = False + messages = _get(result, "messages") + if not _is_iterable(messages): + return None + for msg in messages: + contents = _get(msg, "contents") + if not _is_iterable(contents): + continue + for content in contents: + usage = _get(content, "usage_details") or _get(content, "usage") + piece = _read_counts(usage) + if piece: + aggregated = aggregated + piece + found_any = True + return aggregated if found_any else None + except Exception as exc: + logger.debug("extract_usage failed: %s", exc, exc_info=True) + return None + + +def extract_usage_from_dict(data: Any) -> Optional[TokenUsage]: + """Extract from a raw dict / SDK usage object.""" + return _read_counts(data) + + +def extract_usage_from_stream_chunk(chunk: Any) -> Optional[TokenUsage]: + """Streaming chunks: try the top-level shape, then ``chunk.metadata.usage``.""" + found = extract_usage(chunk) + if found: + return found + metadata = _get(chunk, "metadata") + if metadata is not None: + return _read_counts(_get(metadata, "usage")) + return None + + +def extract_realtime_usage(response_obj: Any) -> Optional[TokenUsage]: + """Azure AI Voice Live ``response.done`` payload extractor. + + Includes audio / text / cached sub-counts when present. 
+ """ + usage = _get(response_obj, "usage") + if usage is None: + return None + + inp = _to_int(_get(usage, "input_tokens")) + out = _to_int(_get(usage, "output_tokens")) + tot = _to_int(_get(usage, "total_tokens")) + if tot == 0 and (inp or out): + tot = inp + out + + in_details = _get(usage, "input_token_details") or {} + out_details = _get(usage, "output_token_details") or {} + + record = TokenUsage( + input_tokens=inp, + output_tokens=out, + total_tokens=tot, + input_audio_tokens=_to_int(_get(in_details, "audio_tokens")), + input_text_tokens=_to_int(_get(in_details, "text_tokens")), + input_cached_tokens=_to_int(_get(in_details, "cached_tokens")), + output_audio_tokens=_to_int(_get(out_details, "audio_tokens")), + output_text_tokens=_to_int(_get(out_details, "text_tokens")), + ) + # Only return if at least one non-zero count surfaced. + if record.has_any or any( + v for v in ( + record.input_audio_tokens, + record.input_text_tokens, + record.input_cached_tokens, + record.output_audio_tokens, + record.output_text_tokens, + ) + ): + return record + return None + + +# --------------------------------------------------------------------------- +# Tool / sub-agent attribution +# --------------------------------------------------------------------------- +def detect_invoked_tools(result: Any) -> set[str]: + """Return the set of tool/function names invoked in an agent result, + inferred from ``function_call`` content items. + + Used by orchestrators that expose sub-agents via ``.as_tool()`` to attribute + token usage only to the sub-agents that were actually called. Never raises. 
+ """ + invoked: set[str] = set() + try: + messages = _get(result, "messages") + if not _is_iterable(messages): + return invoked + for msg in messages: + contents = _get(msg, "contents") + if not _is_iterable(contents): + continue + for content in contents: + if _get(content, "type") == "function_call": + name = _get(content, "name") + if name: + invoked.add(str(name)) + except Exception as exc: + logger.debug("detect_invoked_tools failed: %s", exc, exc_info=True) + return invoked + + +# --------------------------------------------------------------------------- +# Event sink (optional Application Insights dependency) +# --------------------------------------------------------------------------- +EventSink = Callable[[str, Mapping[str, str]], None] + + +def _default_event_sink() -> Optional[EventSink]: + """Return ``azure.monitor.events.extension.track_event`` if importable, + else ``None``. Resolved lazily so the helper still works in unit tests + without the dependency installed.""" + try: + from azure.monitor.events.extension import track_event # type: ignore + except Exception: # pragma: no cover - optional dep + return None + return track_event + + +# --------------------------------------------------------------------------- +# Emitter +# --------------------------------------------------------------------------- +class TokenUsageEmitter: + """Emit standardized token-usage custom events. + + Parameters + ---------- + connection_string: + Application Insights connection string. If ``None`` (default), the + ``APPLICATIONINSIGHTS_CONNECTION_STRING`` env var is consulted. When + no connection string is configured the emitter logs and skips the + ``track_event`` call. + static_dimensions: + Properties merged into every event (e.g. ``{"app": "customer-chatbot"}``). + event_sink: + Callable ``(event_name, props_dict) -> None``. Defaults to + ``azure.monitor.events.extension.track_event``. Override in tests. 
+ pricing: + Optional mapping ``{model_deployment_name -> (usd_per_1k_input, + usd_per_1k_output)}``. When provided, an ``estimated_cost_usd`` + property is attached to agent / model / summary events. Model lookup + is case-insensitive. Use this to avoid hard-coding rates in KQL. + user_id_hasher: + Optional callable ``str -> str`` applied to any ``user_id`` value + before it leaves the emitter. Use this to satisfy PII / GDPR + requirements (e.g. HMAC-SHA256 with a tenant-scoped salt). Applied + to both ``static_dimensions['user_id']`` (at construction) and + per-call ``user_id`` kwargs. + sample_rate: + Fraction of high-cardinality events (agent / model / user / team / + speech) actually shipped, in ``[0.0, 1.0]``. The cheap **summary + event always fires** regardless of sample_rate so per-request totals + remain accurate; only the per-dimension breakdown is sampled. + Defaults to ``1.0`` (no sampling). + logger: + Override the module logger. + """ + + def __init__( + self, + *, + connection_string: Optional[str] = None, + static_dimensions: Optional[Mapping[str, Any]] = None, + event_sink: Optional[EventSink] = None, + pricing: Optional[Mapping[str, tuple[float, float]]] = None, + user_id_hasher: Optional[Callable[[str], str]] = None, + sample_rate: float = 1.0, + logger: Optional[logging.Logger] = None, + ) -> None: + self._cs = connection_string if connection_string is not None else os.getenv( + "APPLICATIONINSIGHTS_CONNECTION_STRING" + ) + self._sink = event_sink if event_sink is not None else _default_event_sink() + self._log = logger or logging.getLogger(__name__) + + # PII hashing applied to user_id everywhere. + self._user_id_hasher = user_id_hasher + + # Sampling clamp to [0, 1]. + try: + sr = float(sample_rate) + except (TypeError, ValueError): + sr = 1.0 + self._sample_rate = max(0.0, min(1.0, sr)) + + # Case-insensitive pricing lookup. Values stored as a (in, out) tuple. 
+ self._pricing: dict[str, tuple[float, float]] = {} + for model, rates in (pricing or {}).items(): + if not model or rates is None: + continue + try: + inp, out = rates + self._pricing[str(model).lower()] = (float(inp), float(out)) + except (TypeError, ValueError): + self._log.warning("Ignoring malformed pricing entry: %s=%r", model, rates) + + # Pre-stringify static dims once. user_id (if present) is hashed here + # so the raw value is never retained on the emitter. + raw_static = dict(static_dimensions or {}) + if "user_id" in raw_static: + raw_static["user_id"] = self._apply_user_id_hash(raw_static["user_id"]) + self._static: dict[str, str] = { + k: ("" if v is None else str(v)) for k, v in raw_static.items() + } + + # Performance counters. ``perf_*`` accumulate wall-clock nanoseconds + # spent inside ``emit()`` so callers can verify telemetry overhead is + # negligible. ``perf_slow_emit_threshold_ms`` is the soft threshold + # above which a WARNING is logged for an individual emit (default + # 50 ms -- emits should normally take well under 1 ms). 
+ self._perf_total_ns: int = 0 + self._perf_emit_count: int = 0 + self._perf_max_ns: int = 0 + self.perf_slow_emit_threshold_ms: float = 50.0 + + # -- public surface --------------------------------------------------- + @property + def enabled(self) -> bool: + return bool(self._cs) and self._sink is not None + + @property + def sample_rate(self) -> float: + return self._sample_rate + + # -- internal helpers ------------------------------------------------- + def _apply_user_id_hash(self, value: Any) -> Any: + """Apply the configured user_id_hasher; never raises.""" + if value is None or value == "" or self._user_id_hasher is None: + return value + try: + return self._user_id_hasher(str(value)) + except Exception as exc: # never let hashing break telemetry + self._log.warning("user_id_hasher raised: %s", exc) + return value + + def _should_sample(self) -> bool: + """Sampling decision for high-cardinality events.""" + if self._sample_rate >= 1.0: + return True + if self._sample_rate <= 0.0: + return False + return random.random() < self._sample_rate + + def _cost_props( + self, model_deployment_name: Optional[str], usage: TokenUsage + ) -> dict[str, str]: + """Return ``{'estimated_cost_usd': '...'}`` when pricing is configured + for the given model, else ``{}``. 6-decimal formatting.""" + if not self._pricing or not model_deployment_name: + return {} + rate = self._pricing.get(model_deployment_name.lower()) + if not rate: + return {} + inp_rate, out_rate = rate + cost = (usage.input_tokens * inp_rate + usage.output_tokens * out_rate) / 1000.0 + return {"estimated_cost_usd": f"{cost:.6f}"} + + def _summary_cost_props( + self, + primary_model: Optional[str], + additional_agents: Mapping[str, str], + usage: TokenUsage, + ) -> dict[str, str]: + """Best-effort cost for the summary event: charge full usage at the + primary model's rate (the SDK aggregates sub-agent tokens to the + orchestrator, so apportioning is not possible without per-agent + usage). 
Falls back to silent skip when no rate is known.""" + if primary_model: + cost = self._cost_props(primary_model, usage) + if cost: + return cost + for m in additional_agents.values(): + cost = self._cost_props(m, usage) + if cost: + return cost + return {} + + def emit(self, event_name: str, **dimensions: Any) -> None: + """Low-level: emit an event with arbitrary properties. + + Non-string values are stringified. ``None`` values are dropped. Any + ``user_id`` value is passed through the configured hasher. + Never raises. Wall-clock duration is recorded for performance audit + (see :meth:`perf_stats`). + """ + start_ns = time.perf_counter_ns() + try: + props = dict(self._static) # cheap shallow copy of pre-stringified dims + for k, v in dimensions.items(): + if v is None: + continue + if k == "user_id": + v = self._apply_user_id_hash(v) + if v is None or v == "": + continue + props[k] = v if isinstance(v, str) else str(v) + + if not self.enabled: + self._log.debug( + "App Insights not configured -- skipping event %s (%s)", + event_name, props, + ) + return + try: + self._sink(event_name, props) # type: ignore[misc] + except Exception as exc: # never break the caller + self._log.warning("track_event(%s) failed: %s", event_name, exc) + finally: + elapsed_ns = time.perf_counter_ns() - start_ns + self._perf_total_ns += elapsed_ns + self._perf_emit_count += 1 + if elapsed_ns > self._perf_max_ns: + self._perf_max_ns = elapsed_ns + elapsed_ms = elapsed_ns / 1_000_000.0 + if elapsed_ms > self.perf_slow_emit_threshold_ms: + self._log.warning( + "Token telemetry emit slow: event=%s duration_ms=%.3f", + event_name, elapsed_ms, + ) + else: + self._log.debug( + "Token telemetry emit: event=%s duration_ms=%.3f", + event_name, elapsed_ms, + ) + + # -- performance audit ------------------------------------------------ + def perf_stats(self) -> dict[str, float]: + """Return cumulative telemetry-overhead stats since process start + (or since :meth:`reset_perf_stats`). 
+ + Keys: + ``emit_count`` -- number of events emitted + ``total_ms`` -- total wall-clock time spent inside ``emit`` + ``avg_ms`` -- mean per-event duration + ``max_ms`` -- slowest single emit observed + """ + count = self._perf_emit_count + total_ms = self._perf_total_ns / 1_000_000.0 + return { + "emit_count": float(count), + "total_ms": total_ms, + "avg_ms": (total_ms / count) if count else 0.0, + "max_ms": self._perf_max_ns / 1_000_000.0, + } + + def reset_perf_stats(self) -> None: + """Zero the perf counters (useful for tests and load-tests).""" + self._perf_total_ns = 0 + self._perf_emit_count = 0 + self._perf_max_ns = 0 + + # -- typed convenience emitters -------------------------------------- + def emit_agent( + self, + *, + agent_name: str, + model_deployment_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + if not usage.has_any or not self._should_sample(): + return + self.emit( + EVENT_AGENT, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_model( + self, + *, + model_deployment_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + if not usage.has_any or not self._should_sample(): + return + self.emit( + EVENT_MODEL, + model_deployment_name=model_deployment_name, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_user( + self, + *, + user_id: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + if not usage.has_any or not user_id or not self._should_sample(): + return + self.emit( + EVENT_USER, + user_id=user_id, + **usage.to_event_props(), + **dimensions, + ) + + def emit_team( + self, + *, + team_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + if not usage.has_any or not team_name or not self._should_sample(): + return + self.emit( + EVENT_TEAM, + team_name=team_name, + **usage.to_event_props(), + 
**dimensions, + ) + + def emit_summary( + self, + *, + usage: TokenUsage, + agent_count: int = 1, + model_count: int = 1, + primary_model: Optional[str] = None, + additional_agents: Optional[Mapping[str, str]] = None, + **dimensions: Any, + ) -> None: + """The summary event always fires (ignores ``sample_rate``) so per- + request totals remain accurate even when high-cardinality events are + sampled.""" + if not usage.has_any: + return + # Summary historically uses ``total_input_tokens`` / ``total_output_tokens`` + # field names; preserve that wire format for backward compatibility. + props = { + "total_input_tokens": str(usage.input_tokens), + "total_output_tokens": str(usage.output_tokens), + "total_tokens": str(usage.total_tokens), + "agent_count": str(agent_count), + "model_count": str(model_count), + "sample_rate": f"{self._sample_rate:.4f}", + } + # Carry over realtime sub-counts if present. + for k, v in usage.to_event_props().items(): + props.setdefault(k, v) + # Optional total cost. + props.update(self._summary_cost_props(primary_model, additional_agents or {}, usage)) + self.emit(EVENT_SUMMARY, **props, **dimensions) + + def emit_speech( + self, + *, + model_deployment_name: str, + source: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Voice-Live / realtime speech usage event.""" + if not self._should_sample(): + return + self.emit( + EVENT_SPEECH, + model_deployment_name=model_deployment_name, + source=source, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + # -- combined emit: summary + agent + per-distinct-model --------------- + def emit_all( + self, + *, + agent_name: str, + model_deployment_name: str, + usage: TokenUsage, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, + **dimensions: Any, + ) -> None: + """Convenience: emit summary, agent, and one model event per distinct + model deployment in one shot. 
+ + ``additional_agents`` maps sub-agent name -> its model deployment name + so callers can describe orchestrators that involve multiple agents. + + ``emit_user_event`` / ``emit_team_event`` opt in to the user/team + events; ``user_id`` / ``team_name`` must be present in dimensions for + those to fire. + """ + if not usage.has_any: + return + + agents = {agent_name: model_deployment_name} + if additional_agents: + agents.update({k: v for k, v in additional_agents.items() if k}) + models = {m for m in agents.values() if m} + + # Wall-clock timing of the whole emit_all path so callers (or tests) + # can verify the telemetry path stays cheap relative to the LLM call + # it instruments. + batch_start_ns = time.perf_counter_ns() + + # Defer summary until last so we can stamp the batch overhead on it. + self.emit_agent( + agent_name=agent_name, + model_deployment_name=model_deployment_name, + usage=usage, + **dimensions, + ) + for model in models: + self.emit_model( + model_deployment_name=model, + usage=usage, + **dimensions, + ) + if emit_user_event and dimensions.get("user_id"): + self.emit_user( + user_id=str(dimensions["user_id"]), + usage=usage, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + ) + if emit_team_event and dimensions.get("team_name"): + self.emit_team( + team_name=str(dimensions["team_name"]), + usage=usage, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + ) + + batch_overhead_ms = (time.perf_counter_ns() - batch_start_ns) / 1_000_000.0 + self.emit_summary( + usage=usage, + agent_count=len(agents), + model_count=len(models) or 1, + primary_model=model_deployment_name, + additional_agents=additional_agents, + telemetry_overhead_ms=f"{batch_overhead_ms:.3f}", + **dimensions, + ) + + self._log.info( + "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d %s", + agent_name, + model_deployment_name, + usage.input_tokens, + usage.output_tokens, + usage.total_tokens, + " ".join(f"{k}={v}" for k, v in 
dimensions.items() if v), + ) + + +# --------------------------------------------------------------------------- +# Scope / decorator sugar +# --------------------------------------------------------------------------- +@dataclass +class TokenUsageScope(AbstractContextManager): + """Accumulate usage across multiple results, then emit on exit. + + Example:: + + with TokenUsageScope(emitter, + agent_name="chat", + model_deployment_name=cfg.model, + user_id=user_id) as scope: + result = await agent.run(prompt) + scope.add(result) # extracts and accumulates + """ + + emitter: TokenUsageEmitter + agent_name: str + model_deployment_name: str + dimensions: dict[str, Any] = field(default_factory=dict) + additional_agents: dict[str, str] = field(default_factory=dict) + emit_user_event: bool = False + emit_team_event: bool = False + usage: TokenUsage = field(default_factory=TokenUsage) + + def __init__( + self, + emitter: TokenUsageEmitter, + *, + agent_name: str, + model_deployment_name: str, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, + **dimensions: Any, + ) -> None: + self.emitter = emitter + self.agent_name = agent_name + self.model_deployment_name = model_deployment_name + self.additional_agents = dict(additional_agents or {}) + self.emit_user_event = emit_user_event + self.emit_team_event = emit_team_event + self.dimensions = dict(dimensions) + self.usage = TokenUsage() + # Wall-clock nanoseconds spent inside extraction (``add*``) and the + # final ``__exit__`` emit, respectively. Surfaced for callers that + # want to verify the helper doesn't add measurable latency. Available + # as ``scope.extract_ms`` / ``scope.emit_ms`` after the scope closes. 
+ self._extract_ns: int = 0 + self._emit_ns: int = 0 + + # -- accumulation ----------------------------------------------------- + def add(self, source: Any) -> Optional[TokenUsage]: + """Extract usage from any supported shape and add to the running total. + + Never raises -- extraction failures return ``None`` and are logged + at DEBUG. + """ + start_ns = time.perf_counter_ns() + try: + found = extract_usage(source) or extract_usage_from_stream_chunk(source) + except Exception as exc: # belt + braces; extractors are already safe + logger.debug("TokenUsageScope.add failed: %s", exc, exc_info=True) + return None + finally: + self._extract_ns += time.perf_counter_ns() - start_ns + if found: + self.usage = self.usage + found + return found + + def add_usage(self, usage: TokenUsage) -> None: + self.usage = self.usage + usage + + def add_chunks(self, chunks: Iterable[Any]) -> None: + for c in chunks: + self.add(c) + + # -- timing properties ----------------------------------------------- + @property + def extract_ms(self) -> float: + """Total ms spent inside :meth:`add` / :meth:`add_chunks`.""" + return self._extract_ns / 1_000_000.0 + + @property + def emit_ms(self) -> float: + """Total ms spent in the on-exit emit batch.""" + return self._emit_ns / 1_000_000.0 + + @property + def total_overhead_ms(self) -> float: + """Total telemetry overhead added by this scope (extract + emit).""" + return self.extract_ms + self.emit_ms + + # -- context manager -------------------------------------------------- + def __exit__(self, exc_type, exc, tb) -> None: + # Always emit (best-effort) regardless of exception status. 
+ emit_start_ns = time.perf_counter_ns() + try: + self.emitter.emit_all( + agent_name=self.agent_name, + model_deployment_name=self.model_deployment_name, + usage=self.usage, + additional_agents=self.additional_agents, + emit_user_event=self.emit_user_event, + emit_team_event=self.emit_team_event, + **self.dimensions, + ) + except Exception as emit_exc: # pragma: no cover - belt + braces + logger.warning("TokenUsageScope emit failed: %s", emit_exc) + finally: + self._emit_ns += time.perf_counter_ns() - emit_start_ns + logger.debug( + "TokenUsageScope overhead: agent=%s extract_ms=%.3f " + "emit_ms=%.3f total_ms=%.3f", + self.agent_name, + self.extract_ms, + self.emit_ms, + self.total_overhead_ms, + ) + return None # do not suppress exceptions + + +def track_tokens( + emitter: TokenUsageEmitter, + *, + agent_name: str, + model_deployment_name: str, + dimension_args: Optional[Mapping[str, str]] = None, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, +): + """Decorator: wrap an async or sync function that returns an LLM result. + + ``dimension_args`` maps emitted-property-name -> callable-keyword-argument + name so per-call values (e.g. ``user_id``) are forwarded to the event. + + Example:: + + @track_tokens(emitter, + agent_name="chat", + model_deployment_name=settings.model, + dimension_args={"user_id": "user_id", + "session_id": "session_id"}) + async def run_chat(prompt, *, user_id, session_id): ... 
+ """ + + dim_args = dict(dimension_args or {}) + + def _decorator(fn: Callable[..., Any]): + is_coro = _is_coroutine_function(fn) + + if is_coro: + @functools.wraps(fn) + async def _aw(*args, **kwargs) -> Any: + with _scope_for(kwargs) as scope: + result = await fn(*args, **kwargs) + scope.add(result) + return result + return _aw + + @functools.wraps(fn) + def _sw(*args, **kwargs) -> Any: + with _scope_for(kwargs) as scope: + result = fn(*args, **kwargs) + scope.add(result) + return result + return _sw + + def _scope_for(call_kwargs: Mapping[str, Any]) -> TokenUsageScope: + dimensions = { + prop: call_kwargs.get(kw) + for prop, kw in dim_args.items() + if call_kwargs.get(kw) is not None + } + return TokenUsageScope( + emitter, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + additional_agents=additional_agents, + emit_user_event=emit_user_event, + emit_team_event=emit_team_event, + **dimensions, + ) + + return _decorator + + +def _is_coroutine_function(fn: Callable[..., Any]) -> bool: + return asyncio.iscoroutinefunction(fn) + + +__all__ = [ + "EVENT_SUMMARY", + "EVENT_AGENT", + "EVENT_MODEL", + "EVENT_USER", + "EVENT_TEAM", + "EVENT_SPEECH", + "TokenUsage", + "TokenUsageEmitter", + "TokenUsageScope", + "track_tokens", + "extract_usage", + "extract_usage_from_dict", + "extract_usage_from_stream_chunk", + "extract_realtime_usage", + "detect_invoked_tools", +] diff --git a/src/backend/orchestrator.py b/src/backend/orchestrator.py index f78ae3fb7..63b55832e 100644 --- a/src/backend/orchestrator.py +++ b/src/backend/orchestrator.py @@ -20,7 +20,7 @@ import logging import re from contextvars import ContextVar -from typing import AsyncIterator, Optional, cast +from typing import Any, AsyncIterator, Mapping, Optional, cast from agent_framework import ( ChatMessage, @@ -41,16 +41,23 @@ FOUNDRY_AVAILABLE = False AIProjectClient = None +from llm_token_telemetry import ( + TokenUsage, + TokenUsageEmitter, + extract_usage, + extract_usage_from_dict, + 
extract_usage_from_stream_chunk, +) from models import CreativeBrief from settings import app_settings -from token_usage import TokenUsageAccumulator +from telemetry import token_emitter logger = logging.getLogger(__name__) # Token endpoint for Azure Cognitive Services (used for Azure OpenAI) TOKEN_ENDPOINT = "https://cognitiveservices.azure.com/.default" -# Per-request user_id propagated to TokenUsageAccumulator instances created +# Per-request user_id propagated to _RequestTokenTracker instances created # anywhere inside the request (including deep workflow helpers like # ``_generate_foundry_image``). Set at the entry points of the public # orchestrator methods; read by ``_new_token_accumulator``. @@ -493,6 +500,154 @@ def _filter_system_prompt_from_response(response_text: str) -> str: """ +class _RequestTokenTracker: + """Per-request multi-agent token accumulator. + + Aggregates ``TokenUsage`` per agent and per model deployment over the + lifetime of a single orchestrator request, then emits the standardized + ``LLM_Token_Usage_Summary`` / ``LLM_Agent_Token_Usage`` / + ``LLM_Model_Token_Usage`` custom events via the shared + :class:`TokenUsageEmitter` on :meth:`flush`. Identical event names and + dimension keys to the cross-accelerator helper in + :mod:`llm_token_telemetry`. Telemetry failures are logged but never + raised. 
+ """ + + __slots__ = ( + "_emitter", + "_user_id", + "_conversation_id", + "_agent_model_map", + "_default_model", + "by_agent", + "by_model", + "total", + ) + + def __init__( + self, + emitter: TokenUsageEmitter, + *, + user_id: str = "", + conversation_id: str = "", + agent_model_map: Optional[Mapping[str, str]] = None, + default_model: str = "", + ) -> None: + self._emitter = emitter + self._user_id = user_id or "" + self._conversation_id = conversation_id or "" + self._agent_model_map: dict[str, str] = dict(agent_model_map or {}) + self._default_model = default_model or "" + self.by_agent: dict[str, tuple[TokenUsage, str]] = {} + self.by_model: dict[str, TokenUsage] = {} + self.total: TokenUsage = TokenUsage() + + def _resolve_model(self, agent_name: str) -> str: + return self._agent_model_map.get(agent_name) or self._default_model + + def _add(self, agent_name: str, usage: TokenUsage) -> None: + if not usage.has_any: + return + agent = agent_name or "unknown_agent" + model = self._resolve_model(agent) + prev_usage, prev_model = self.by_agent.get(agent, (TokenUsage(), model)) + self.by_agent[agent] = (prev_usage + usage, prev_model or model) + if model: + self.by_model[model] = self.by_model.get(model, TokenUsage()) + usage + self.total = self.total + usage + + def record(self, agent_name: str, usage: TokenUsage) -> None: + """Record a pre-extracted :class:`TokenUsage` for the named agent.""" + self._add(agent_name, usage) + + def record_response(self, *, agent_name: str, response: Any) -> bool: + """Extract usage from an ``AgentResponse`` and record it. Returns True on success.""" + usage = extract_usage(response) + if usage: + self._add(agent_name, usage) + return True + return False + + def record_event(self, event: Any) -> bool: + """Extract usage from a workflow ``run_stream`` event and record it. 
+ + Reads ``event.executor_id`` for per-agent attribution and falls back + through ``extract_usage_from_stream_chunk`` then ``extract_usage`` to + cover both ``AgentRunUpdateEvent`` and ``AgentRunEvent`` data shapes. + """ + if event is None: + return False + executor_id = getattr(event, "executor_id", None) + data = getattr(event, "data", None) + if data is None or not executor_id: + return False + usage = extract_usage_from_stream_chunk(data) or extract_usage(data) + if usage: + self._add(executor_id, usage) + return True + return False + + def record_image_api_response( + self, *, agent_name: str, response_json: Optional[dict], model: str = "" + ) -> bool: + """Record token usage from an image-generation REST response (OpenAI shape).""" + if not isinstance(response_json, dict): + return False + usage = extract_usage_from_dict(response_json.get("usage")) + if not usage: + return False + if model and agent_name not in self._agent_model_map: + self._agent_model_map[agent_name] = model + self._add(agent_name, usage) + return True + + def has_data(self) -> bool: + return self.total.has_any + + def flush(self, *, source: str = "") -> None: + """Emit aggregated LLM_*_Token_Usage events. 
Safe to call once per request.""" + if not self.has_data(): + return + dims = { + "user_id": self._user_id, + "conversation_id": self._conversation_id, + "source": source, + } + for agent_name, (usage, model) in self.by_agent.items(): + self._emitter.emit_agent( + agent_name=agent_name, + model_deployment_name=model or self._default_model, + usage=usage, + **dims, + ) + for model_name, usage in self.by_model.items(): + self._emitter.emit_model( + model_deployment_name=model_name, + usage=usage, + **dims, + ) + primary_model = next(iter(self.by_model), self._default_model) + self._emitter.emit_summary( + usage=self.total, + agent_count=len(self.by_agent), + model_count=len(self.by_model) or 1, + primary_model=primary_model, + **dims, + ) + logger.info( + "[TOKEN USAGE] source=%s user=%s conv=%s total=%d (in=%d, out=%d) " + "agents=%s models=%s", + source, + self._user_id, + self._conversation_id, + self.total.total_tokens, + self.total.input_tokens, + self.total.output_tokens, + {k: v[0].total_tokens for k, v in self.by_agent.items()}, + {k: v.total_tokens for k, v in self.by_model.items()}, + ) + + class ContentGenerationOrchestrator: """ Orchestrates the multi-agent content generation workflow using @@ -728,16 +883,18 @@ def initialize(self) -> None: def _new_token_accumulator( self, conversation_id: str = "", user_id: str = "" - ) -> TokenUsageAccumulator: - """Create a TokenUsageAccumulator pre-populated with this orchestrator's - agent->model map and default chat model. Telemetry is best-effort. + ) -> _RequestTokenTracker: + """Create a :class:`_RequestTokenTracker` pre-populated with this + orchestrator's agent->model map and default chat model. Telemetry + is best-effort. 
If ``user_id`` / ``conversation_id`` are not provided, falls back to the per-request values stored in the ``_current_user_id`` / - ``_current_conversation_id`` ContextVars so accumulators created deep + ``_current_conversation_id`` ContextVars so trackers created deep inside the workflow still carry the caller's correlation ids. """ - return TokenUsageAccumulator( + return _RequestTokenTracker( + token_emitter, conversation_id=conversation_id or _current_conversation_id.get(""), user_id=user_id or _current_user_id.get(""), agent_model_map=self._agent_model_map, @@ -797,7 +954,7 @@ async def process_message( # Defined outside the try so the except/finally branches can safely # reference ``token_acc`` even if creation fails. Each flush call is # guarded by ``if token_acc is not None`` to avoid NoneType errors. - token_acc: Optional[TokenUsageAccumulator] = None + token_acc: Optional[_RequestTokenTracker] = None try: token_acc = self._new_token_accumulator(conversation_id, user_id) @@ -934,7 +1091,7 @@ async def send_user_response( _ctx_token = _current_user_id.set(user_id or "") _ctx_conv = _current_conversation_id.set(conversation_id or "") # See process_message for the rationale of the None-init pattern. - token_acc: Optional[TokenUsageAccumulator] = None + token_acc: Optional[_RequestTokenTracker] = None try: token_acc = self._new_token_accumulator(conversation_id, user_id) responses = {request_id: user_response} diff --git a/src/backend/telemetry.py b/src/backend/telemetry.py new file mode 100644 index 000000000..b5af65460 --- /dev/null +++ b/src/backend/telemetry.py @@ -0,0 +1,90 @@ +"""Process-wide telemetry singletons. + +A single :class:`TokenUsageEmitter` is constructed at import time so every +router/utility shares the same App Insights connection-string resolution and +static dimensions. Importing this module has no side effects beyond reading +``APPLICATIONINSIGHTS_CONNECTION_STRING`` and the env vars documented below. 
+ +Optional environment variables +------------------------------ +LLM_TOKEN_SAMPLE_RATE + Float in [0, 1]. Fraction of high-cardinality token events + (agent/model/user/team/speech) to ship. The summary event always fires. + Defaults to ``1.0``. + +LLM_TOKEN_USER_ID_HMAC_KEY + When set, ``user_id`` values are replaced with an HMAC-SHA256 hex digest + (truncated to 16 chars) before leaving the process. Use to satisfy + GDPR / PII handling requirements without modifying call sites. + +LLM_TOKEN_PRICING + Optional comma-separated list of ``model=in_per_1k:out_per_1k`` entries, + e.g. ``gpt-4o=0.0025:0.01,gpt-4o-mini=0.00015:0.0006``. When set the + emitter attaches ``estimated_cost_usd`` to agent / model / summary + events so dashboards can group by cost without hard-coded KQL rates. +""" +from __future__ import annotations + +import hashlib +import hmac +import logging +import os +from typing import Callable, Optional + +from llm_token_telemetry import TokenUsageEmitter + +_log = logging.getLogger(__name__) + + +def _parse_sample_rate() -> float: + raw = os.getenv("LLM_TOKEN_SAMPLE_RATE") + if not raw: + return 1.0 + try: + return max(0.0, min(1.0, float(raw))) + except ValueError: + _log.warning("Invalid LLM_TOKEN_SAMPLE_RATE=%r; defaulting to 1.0", raw) + return 1.0 + + +def _build_user_id_hasher() -> Optional[Callable[[str], str]]: + key = os.getenv("LLM_TOKEN_USER_ID_HMAC_KEY") + if not key: + return None + key_bytes = key.encode("utf-8") + + def _hash(value: str) -> str: + digest = hmac.new(key_bytes, value.encode("utf-8"), hashlib.sha256).hexdigest() + return digest[:16] + + return _hash + + +def _parse_pricing() -> dict[str, tuple[float, float]]: + raw = os.getenv("LLM_TOKEN_PRICING") + if not raw: + return {} + pricing: dict[str, tuple[float, float]] = {} + for entry in raw.split(","): + entry = entry.strip() + if not entry or "=" not in entry: + continue + model, rates = entry.split("=", 1) + if ":" not in rates: + continue + in_s, out_s = rates.split(":", 
1) + try: + pricing[model.strip().lower()] = (float(in_s), float(out_s)) + except ValueError: + _log.warning("Ignoring malformed pricing entry: %s", entry) + return pricing + + +token_emitter = TokenUsageEmitter( + static_dimensions={"app": "content-generation"}, + sample_rate=_parse_sample_rate(), + user_id_hasher=_build_user_id_hasher(), + pricing=_parse_pricing(), +) + +__all__ = ["token_emitter"] diff --git a/src/backend/token_usage.py b/src/backend/token_usage.py deleted file mode 100644 index c7a39c07b..000000000 --- a/src/backend/token_usage.py +++ /dev/null @@ -1,406 +0,0 @@ -""" -Token usage tracking for the Content Generation orchestrator. - -Captures LLM token usage from Microsoft Agent Framework agent runs and workflow -streams (Azure OpenAI / Azure AI Foundry) and emits per-agent and per-model -custom events to Application Insights via ``event_utils.track_event_if_configured``. - -Usage: - from token_usage import TokenUsageAccumulator, extract_usage_from_response - - acc = TokenUsageAccumulator(user_id="abc", conversation_id="xyz", - agent_model_map={"planning_agent": "gpt-5"}) - response = await agent.run(prompt) - acc.record_response(agent_name="planning_agent", response=response) - acc.flush() # emits LLM_*_Token_Usage events -""" - -from __future__ import annotations - -import logging -import os -from dataclasses import dataclass -from typing import Any, Iterable, Optional - -from event_utils import track_event_if_configured - -logger = logging.getLogger(__name__) - -# Custom Application Insights event names (shared with KQL dashboard queries). 
-EVENT_SUMMARY = "LLM_Token_Usage_Summary" -EVENT_AGENT = "LLM_Agent_Token_Usage" -EVENT_MODEL = "LLM_Model_Token_Usage" - - -@dataclass(slots=True) -class _Counts: - input_tokens: int = 0 - output_tokens: int = 0 - total_tokens: int = 0 - model_deployment_name: str = "" - - def add(self, inp: int, out: int, tot: int) -> None: - self.input_tokens += inp - self.output_tokens += out - self.total_tokens += tot - - -def _coerce_int(value: Any) -> int: - try: - if value is None: - return 0 - return int(value) - except (TypeError, ValueError): - return 0 - - -def _from_dict(d: dict) -> Optional[tuple[int, int, int]]: - """Pull (input, output, total) out of a usage-shaped dict. - - Handles both the Microsoft Agent Framework ``UsageDetails`` shape - (``input_token_count`` / ``output_token_count`` / ``total_token_count``) - and the OpenAI SDK shape (``prompt_tokens`` / ``completion_tokens`` / - ``total_tokens``). - """ - inp = _coerce_int( - d.get("input_token_count") - or d.get("prompt_tokens") - or d.get("input_tokens") - ) - out = _coerce_int( - d.get("output_token_count") - or d.get("completion_tokens") - or d.get("output_tokens") - ) - tot = _coerce_int(d.get("total_token_count") or d.get("total_tokens")) or ( - inp + out - ) - if tot <= 0: - return None - return (inp, out, tot) - - -def _from_usage_details(details: Any) -> Optional[tuple[int, int, int]]: - """Extract counts from a ``UsageDetails`` object, dict, or similar.""" - if details is None: - return None - if isinstance(details, dict): - return _from_dict(details) - inp = _coerce_int( - getattr(details, "input_token_count", None) - or getattr(details, "prompt_tokens", None) - or getattr(details, "input_tokens", None) - ) - out = _coerce_int( - getattr(details, "output_token_count", None) - or getattr(details, "completion_tokens", None) - or getattr(details, "output_tokens", None) - ) - tot = _coerce_int( - getattr(details, "total_token_count", None) - or getattr(details, "total_tokens", None) - ) or (inp + out) 
- if tot <= 0: - return None - return (inp, out, tot) - - -def _scan_contents(contents: Optional[Iterable]) -> Optional[tuple[int, int, int]]: - """Look for ``UsageContent`` entries in a contents list.""" - if not contents: - return None - for item in contents: - # Framework UsageContent: has .details (UsageDetails) - details = getattr(item, "details", None) - if details is not None: - result = _from_usage_details(details) - if result: - return result - # Some shapes expose .usage_details directly - usage_details = getattr(item, "usage_details", None) - if usage_details is not None: - result = _from_usage_details(usage_details) - if result: - return result - # Plain dict content - if isinstance(item, dict): - if isinstance(item.get("details"), dict): - result = _from_dict(item["details"]) - if result: - return result - if isinstance(item.get("usage_details"), dict): - result = _from_dict(item["usage_details"]) - if result: - return result - return None - - -def extract_usage_from_response(response: Any) -> Optional[tuple[int, int, int]]: - """Extract ``(input, output, total)`` token counts from an ``AgentResponse``. - - Checks (in order): - 1. ``response.usage_details`` - 2. ``response.messages[*].contents[*]`` for ``UsageContent`` items - 3. ``response.raw_representation.usage`` (OpenAI SDK fallback) - Returns ``None`` if no usage information is present. 
- """ - if response is None: - return None - - result = _from_usage_details(getattr(response, "usage_details", None)) - if result: - return result - - messages = getattr(response, "messages", None) or [] - for msg in messages: - result = _scan_contents(getattr(msg, "contents", None)) - if result: - return result - - raw = getattr(response, "raw_representation", None) - if raw is not None: - usage = getattr(raw, "usage", None) or ( - raw.get("usage") if isinstance(raw, dict) else None - ) - if usage is not None: - result = _from_usage_details(usage) - if result: - return result - return None - - -def extract_usage_from_update(update: Any) -> Optional[tuple[int, int, int]]: - """Extract token counts from a streaming ``AgentResponseUpdate``.""" - if update is None: - return None - - result = _scan_contents(getattr(update, "contents", None)) - if result: - return result - - raw = getattr(update, "raw_representation", None) - if raw is not None: - usage = getattr(raw, "usage", None) or ( - raw.get("usage") if isinstance(raw, dict) else None - ) - if usage is not None: - result = _from_usage_details(usage) - if result: - return result - return None - - -def extract_usage_from_event(event: Any) -> tuple[Optional[str], Optional[tuple[int, int, int]]]: - """Extract ``(executor_id, usage_tuple)`` from a workflow stream event. - - Used while iterating ``workflow.run_stream(...)``: returns the executor / - agent name plus the usage tuple when present, or ``(None, None)`` for - unrelated events. 
- """ - if event is None: - return (None, None) - - executor_id = getattr(event, "executor_id", None) - data = getattr(event, "data", None) - if data is None: - return (executor_id, None) - - # AgentRunUpdateEvent → data is AgentResponseUpdate - usage = extract_usage_from_update(data) - if usage: - return (executor_id, usage) - - # AgentRunEvent → data is AgentResponse - usage = extract_usage_from_response(data) - if usage: - return (executor_id, usage) - - return (executor_id, None) - - -class TokenUsageAccumulator: - """Accumulates per-agent and per-model token usage for a single request. - - Call ``record_*`` as agent invocations complete, then ``flush()`` once at - the end of the request to emit Application Insights custom events. - Telemetry failures are logged but never raised — never break the user - flow on a telemetry error. - """ - - __slots__ = ( - "user_id", - "conversation_id", - "agent_model_map", - "default_model", - "by_agent", - "by_model", - "totals", - ) - - def __init__( - self, - *, - user_id: str = "", - conversation_id: str = "", - agent_model_map: Optional[dict[str, str]] = None, - default_model: str = "", - ) -> None: - self.user_id = user_id or "" - self.conversation_id = conversation_id or "" - self.agent_model_map: dict[str, str] = dict(agent_model_map or {}) - self.default_model = default_model or "" - self.by_agent: dict[str, _Counts] = {} - self.by_model: dict[str, _Counts] = {} - self.totals: _Counts = _Counts() - - def _resolve_model(self, agent_name: str) -> str: - return ( - self.agent_model_map.get(agent_name) - or self.agent_model_map.get(agent_name or "", "") - or self.default_model - ) - - def record(self, agent_name: str, usage: Optional[tuple[int, int, int]]) -> None: - """Record an extracted usage tuple for the named agent (no-op if None/zero).""" - if not usage: - return - inp, out, tot = usage - if tot <= 0: - return - agent = agent_name or "unknown_agent" - model = self._resolve_model(agent) - - agent_counts = 
self.by_agent.setdefault( - agent, _Counts(model_deployment_name=model) - ) - if not agent_counts.model_deployment_name and model: - agent_counts.model_deployment_name = model - agent_counts.add(inp, out, tot) - - if model: - self.by_model.setdefault(model, _Counts()).add(inp, out, tot) - - self.totals.add(inp, out, tot) - - def record_response(self, *, agent_name: str, response: Any) -> bool: - """Extract usage from an ``AgentResponse`` and record it. Returns True on success.""" - usage = extract_usage_from_response(response) - if usage: - self.record(agent_name, usage) - return True - return False - - def record_update(self, *, executor_id: str, update: Any) -> bool: - """Extract usage from an ``AgentResponseUpdate`` and record it.""" - usage = extract_usage_from_update(update) - if usage: - self.record(executor_id, usage) - return True - return False - - def record_event(self, event: Any) -> bool: - """Extract usage from a workflow ``run_stream`` event and record it.""" - executor_id, usage = extract_usage_from_event(event) - if usage and executor_id: - self.record(executor_id, usage) - return True - return False - - def record_image_api_response( - self, *, agent_name: str, response_json: Optional[dict], model: str = "" - ) -> bool: - """Record token usage from an image-generation REST response (OpenAI shape).""" - if not isinstance(response_json, dict): - return False - usage = response_json.get("usage") - if not isinstance(usage, dict): - return False - if model and agent_name not in self.agent_model_map: - self.agent_model_map[agent_name] = model - result = _from_dict(usage) - if result: - self.record(agent_name, result) - return True - return False - - def has_data(self) -> bool: - return self.totals.total_tokens > 0 - - def flush(self, *, source: str = "") -> None: - """Emit aggregated events to Application Insights. Safe to call once per request. 
- - Short-circuits when ``APPLICATIONINSIGHTS_CONNECTION_STRING`` is unset so - we don't fan out 1+N+M no-op ``track_event_if_configured`` calls (each of - which currently emits a WARNING log line). The summary log at the bottom - of this method is still useful for local debugging and is left in place. - """ - if not self.has_data(): - return - - ai_configured = bool(os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")) - - base_dims = { - "user_id": self.user_id, - "conversation_id": self.conversation_id, - "source": source, - } - - if ai_configured: - try: - track_event_if_configured( - EVENT_SUMMARY, - { - **base_dims, - "total_input_tokens": str(self.totals.input_tokens), - "total_output_tokens": str(self.totals.output_tokens), - "total_tokens": str(self.totals.total_tokens), - "agent_count": str(len(self.by_agent)), - "model_count": str(len(self.by_model)), - }, - ) - except Exception as e: - logger.warning("Failed to emit %s: %s", EVENT_SUMMARY, e) - - for agent_name, c in self.by_agent.items(): - try: - track_event_if_configured( - EVENT_AGENT, - { - **base_dims, - "agent_name": agent_name, - "model_deployment_name": c.model_deployment_name or self.default_model, - "input_tokens": str(c.input_tokens), - "output_tokens": str(c.output_tokens), - "total_tokens": str(c.total_tokens), - }, - ) - except Exception as e: - logger.warning("Failed to emit %s for %s: %s", EVENT_AGENT, agent_name, e) - - for model_name, c in self.by_model.items(): - try: - track_event_if_configured( - EVENT_MODEL, - { - **base_dims, - "model_deployment_name": model_name, - "input_tokens": str(c.input_tokens), - "output_tokens": str(c.output_tokens), - "total_tokens": str(c.total_tokens), - }, - ) - except Exception as e: - logger.warning("Failed to emit %s for %s: %s", EVENT_MODEL, model_name, e) - - logger.info( - "[TOKEN USAGE] source=%s user=%s conv=%s total=%d (in=%d, out=%d) " - "agents=%s models=%s", - source, - self.user_id, - self.conversation_id, - self.totals.total_tokens, - 
self.totals.input_tokens, - self.totals.output_tokens, - {k: v.total_tokens for k, v in self.by_agent.items()}, - {k: v.total_tokens for k, v in self.by_model.items()}, - ) From 8590077aef057b9264548b284ba685d52ee7fad1 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Thu, 4 Jun 2026 11:58:44 +0530 Subject: [PATCH 12/17] chore: scope PR to code-only integration Remove infra, dashboard, and documentation files to align with the standard cross-accelerator token-telemetry integration pattern (per customer-chatbot PR #236). Infra/monitoring/dashboard work will be tracked in a follow-up PR. Removed: - docs/TokenUsageTelemetry.md - infra/dashboards/token-usage-queries.kql - infra/monitoring/ (README.md, monitoring.bicep) Reverted to dev: - .gitignore - infra/main.bicep - infra/main_custom.bicep Retained code changes: - src/backend/llm_token_telemetry.py (new) - src/backend/telemetry.py (new) - src/backend/orchestrator.py - src/backend/app.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 7 +- docs/TokenUsageTelemetry.md | 99 ---------- infra/dashboards/token-usage-queries.kql | 218 ----------------------- infra/main.bicep | 14 +- infra/main_custom.bicep | 12 +- infra/monitoring/README.md | 82 --------- infra/monitoring/monitoring.bicep | 97 ---------- 7 files changed, 3 insertions(+), 526 deletions(-) delete mode 100644 docs/TokenUsageTelemetry.md delete mode 100644 infra/dashboards/token-usage-queries.kql delete mode 100644 infra/monitoring/README.md delete mode 100644 infra/monitoring/monitoring.bicep diff --git a/.gitignore b/.gitignore index 7e0f17ada..310b95883 100644 --- a/.gitignore +++ b/.gitignore @@ -70,9 +70,4 @@ pdf # RAI evaluation results rai_results/ -**/rai_results/ -# Python test coverage -.coverage -coverage.xml -htmlcov/ -coverage_html/ +**/rai_results/ \ No newline at end of file diff --git a/docs/TokenUsageTelemetry.md b/docs/TokenUsageTelemetry.md deleted file mode 100644 index 0cabc2435..000000000 --- 
a/docs/TokenUsageTelemetry.md +++ /dev/null @@ -1,99 +0,0 @@ -# Token Usage Telemetry & Dashboard - -The Content Generation backend emits **per-request, per-agent, and per-model** -LLM token-usage metrics to **Azure Application Insights** as custom events. -This page describes what is emitted, how to enable it, and how to visualize it. - -## What is emitted - -Three custom events are sent on every request that consumes LLM tokens -(see `src/backend/llm_token_telemetry.py`): - -| Event | When | Custom dimensions | -|---|---|---| -| `LLM_Token_Usage_Summary` | Once per request | `total_input_tokens`, `total_output_tokens`, `total_tokens`, `agent_count`, `model_count`, `user_id`, `conversation_id`, `source` | -| `LLM_Agent_Token_Usage` | Per agent that ran | `agent_name`, `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`, `user_id`, `conversation_id`, `source` | -| `LLM_Model_Token_Usage` | Per model deployment used | `model_deployment_name`, `input_tokens`, `output_tokens`, `total_tokens`, `user_id`, `conversation_id`, `source` | - -**Agents covered:** `triage_agent`, `planning_agent`, `research_agent`, -`text_content_agent`, `image_content_agent`, `compliance_agent`, `rai_agent`. - -**Sources** (carried in the `source` dimension): -- `process_message` — main HandoffBuilder workflow -- `send_user_response` — workflow continuations -- `parse_brief` — RAI + planning agent calls -- `generate_content` — text/image/compliance agent calls -- `regenerate_image` — direct-mode image agent -- `foundry_image_generation` — direct REST call to Azure OpenAI Image API - -> **Note:** All numeric values are stored as strings in `customDimensions` -> (App Insights requirement). Always cast with `tolong()` / `toint()` in KQL. - -## Enabling telemetry - -Set `APPLICATIONINSIGHTS_CONNECTION_STRING` in the backend environment. -Application Insights wiring is already configured in `src/backend/app.py` -via `configure_azure_monitor()`. 
If the env var is unset, no telemetry is -sent to Application Insights — `_RequestTokenTracker.flush()` short-circuits -the network emit path. Aggregated per-request totals are still written to -the local logger at `INFO` level (one `[TOKEN USAGE] ...` line per flush) -so token tracking remains useful for local debugging without a connection -string. - -When deploying via `azd up`, the Bicep templates create an Application -Insights instance and pass the connection string to the App Service. - -## Viewing the dashboard - -A ready-to-use KQL query pack lives at: - -``` -infra/dashboards/token-usage-queries.kql -``` - -It contains 12 queries: - -1. Overall token usage (last 24h) -2. Token usage by agent -3. Token usage by model deployment -4. Top users by token spend (last 7d) -5. Hourly trend (last 24h, time chart) -6. Per-agent daily trend (last 7d, time chart) -7. Per-model daily trend (last 7d, time chart) -8. Token usage by request source -9. Top conversations by token spend -10. Avg input/output token ratio per agent -11. Heaviest individual requests -12. OpenTelemetry-instrumented OpenAI dependency calls (cross-check) - -### Run a query - -1. Open the **Application Insights** resource in the Azure portal. -2. Go to **Monitoring → Logs**. -3. Paste any query from the file above and click **Run**. - -## Verifying locally - -After triggering a brief generation in a dev environment with a valid -`APPLICATIONINSIGHTS_CONNECTION_STRING`, custom events typically appear in -Application Insights within ~2 minutes: - -```kusto -customEvents -| where timestamp > ago(15m) -| where name startswith "LLM_" -| project timestamp, name, customDimensions -| order by timestamp desc -``` - -## Design notes - -- **Best-effort by design.** Every extraction and every emit call is wrapped - in `try/except`. Telemetry failures are logged at `DEBUG`/`WARNING` and - never break the user flow. 
-- **No PII.** Only `user_id` and `conversation_id` are included as - dimensions; no prompt or response text is sent. -- **Out of scope (intentional).** The current implementation does not persist - token totals to Cosmos DB and does not push real-time updates to the - frontend. Operators add cost-estimation queries as needed by multiplying - token counts by their negotiated per-1K-token rates. diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql deleted file mode 100644 index 22008109e..000000000 --- a/infra/dashboards/token-usage-queries.kql +++ /dev/null @@ -1,218 +0,0 @@ -// ============================================================================= -// Token Usage Dashboard — Application Insights / Log Analytics KQL queries -// ============================================================================= -// Run these in: App Insights -> Logs (or Log Analytics -> Logs) for the -// workspace attached to the Content Generation backend. -// -// Custom events emitted by the backend (see src/backend/llm_token_telemetry.py): -// * LLM_Token_Usage_Summary — one per request; aggregate totals -// * LLM_Agent_Token_Usage — one per agent that consumed tokens in the request -// * LLM_Model_Token_Usage — one per model deployment that was hit -// -// Common custom dimensions on every event: -// user_id, conversation_id, source, app -// Plus event-specific numeric dimensions stored as STRINGS — always cast with -// toint() / tolong() in KQL. -// ============================================================================= - - -// ----------------------------------------------------------------------------- -// 1. 
Overall token usage — last 24 hours -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(24h) -| where name == "LLM_Token_Usage_Summary" -| extend - input_tokens = tolong(customDimensions["total_input_tokens"]), - output_tokens = tolong(customDimensions["total_output_tokens"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| summarize - Requests = count(), - TotalInputTokens = sum(input_tokens), - TotalOutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - AvgTokensPerRequest = avg(total_tokens) - - -// ----------------------------------------------------------------------------- -// 2. Token usage by agent (last 24 hours) -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(24h) -| where name == "LLM_Agent_Token_Usage" -| extend - agent_name = tostring(customDimensions["agent_name"]), - input_tokens = tolong(customDimensions["input_tokens"]), - output_tokens = tolong(customDimensions["output_tokens"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| summarize - Calls = count(), - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens) - by agent_name -| order by TotalTokens desc - - -// ----------------------------------------------------------------------------- -// 3. 
Token usage by model deployment (last 24 hours) -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(24h) -| where name == "LLM_Model_Token_Usage" -| extend - model = tostring(customDimensions["model_deployment_name"]), - input_tokens = tolong(customDimensions["input_tokens"]), - output_tokens = tolong(customDimensions["output_tokens"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| summarize - Calls = count(), - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens) - by model -| order by TotalTokens desc - - -// ----------------------------------------------------------------------------- -// 4. Token usage by user (last 7 days) — top 50 -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(7d) -| where name == "LLM_Token_Usage_Summary" -| extend - user_id = tostring(customDimensions["user_id"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| where isnotempty(user_id) -| summarize - Requests = count(), - TotalTokens = sum(total_tokens) - by user_id -| top 50 by TotalTokens desc - - -// ----------------------------------------------------------------------------- -// 5. Token usage over time — hourly trend (last 24 hours) -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(24h) -| where name == "LLM_Token_Usage_Summary" -| extend total_tokens = tolong(customDimensions["total_tokens"]) -| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1h) -| order by timestamp asc -| render timechart - - -// ----------------------------------------------------------------------------- -// 6. 
Per-agent token trend (last 7 days) — daily -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(7d) -| where name == "LLM_Agent_Token_Usage" -| extend - agent_name = tostring(customDimensions["agent_name"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1d), agent_name -| order by timestamp asc -| render timechart - - -// ----------------------------------------------------------------------------- -// 7. Per-model token trend (last 7 days) — daily -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(7d) -| where name == "LLM_Model_Token_Usage" -| extend - model = tostring(customDimensions["model_deployment_name"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1d), model -| order by timestamp asc -| render timechart - - -// ----------------------------------------------------------------------------- -// 8. Token usage by request source (process_message / send_user_response / -// parse_brief / generate_content / regenerate_image / foundry_image_generation) -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(24h) -| where name == "LLM_Token_Usage_Summary" -| extend - source = tostring(customDimensions["source"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| summarize - Requests = count(), - TotalTokens = sum(total_tokens) - by source -| order by TotalTokens desc - - -// ----------------------------------------------------------------------------- -// 9. 
Top conversations by token spend (last 7 days) -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(7d) -| where name == "LLM_Token_Usage_Summary" -| extend - conversation_id = tostring(customDimensions["conversation_id"]), - total_tokens = tolong(customDimensions["total_tokens"]) -| where isnotempty(conversation_id) -| summarize - Requests = count(), - TotalTokens = sum(total_tokens) - by conversation_id -| top 25 by TotalTokens desc - - -// ----------------------------------------------------------------------------- -// 10. Avg input vs output token ratio per agent (last 7 days) -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(7d) -| where name == "LLM_Agent_Token_Usage" -| extend - agent_name = tostring(customDimensions["agent_name"]), - input_tokens = tolong(customDimensions["input_tokens"]), - output_tokens = tolong(customDimensions["output_tokens"]) -| summarize - AvgInput = avg(input_tokens), - AvgOutput = avg(output_tokens), - InputOutputRatio = round(todouble(sum(input_tokens)) / todouble(iif(sum(output_tokens) == 0, 1, sum(output_tokens))), 2) - by agent_name -| order by InputOutputRatio desc - - -// ----------------------------------------------------------------------------- -// 11. 
Heaviest individual requests (last 24 hours) — top 25 -// ----------------------------------------------------------------------------- -customEvents -| where timestamp > ago(24h) -| where name == "LLM_Token_Usage_Summary" -| extend - conversation_id = tostring(customDimensions["conversation_id"]), - source = tostring(customDimensions["source"]), - total_tokens = tolong(customDimensions["total_tokens"]), - input_tokens = tolong(customDimensions["total_input_tokens"]), - output_tokens = tolong(customDimensions["total_output_tokens"]), - agent_count = toint(customDimensions["agent_count"]), - model_count = toint(customDimensions["model_count"]) -| project timestamp, conversation_id, source, input_tokens, output_tokens, total_tokens, agent_count, model_count -| top 25 by total_tokens desc - - -// ----------------------------------------------------------------------------- -// 12. OpenTelemetry auto-instrumented dependencies (Azure OpenAI calls) -// Useful as a cross-check against our custom events. Note that auto-instrumented -// data does NOT include token counts unless GenAI semantic-conv attributes are -// enabled in the OpenAI/Azure SDK. -// ----------------------------------------------------------------------------- -dependencies -| where timestamp > ago(24h) -| where target has "openai" or name has "chat" or name has "completions" -| summarize - Calls = count(), - AvgDurMs = avg(duration), - Failures = countif(success == false) - by name, target -| order by Calls desc diff --git a/infra/main.bicep b/infra/main.bicep index 8ef058e12..221d2ef6a 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -1002,24 +1002,12 @@ module webSite 'modules/web-sites.bicep' = { // ========== Container Instance (Backend API) ========== // var containerInstanceName = 'aci-${solutionSuffix}' -// Hash that changes whenever the monitoring config (enableMonitoring + connection string) changes. 
-// Used as an ACI tag so that toggling enableMonitoring (or rotating the App Insights component) -// forces ARM to detect drift on the container group, triggering a restart and re-applying env vars -// like APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag. -var monitoringConfigHash = uniqueString( - string(enableMonitoring), - enableMonitoring ? applicationInsights!.outputs.connectionString : 'monitoring-disabled' -) - module containerInstance 'modules/container-instance.bicep' = { name: take('module.container-instance.${containerInstanceName}', 64) params: { name: containerInstanceName location: solutionLocation - tags: union(tags, { - 'monitoring-enabled': string(enableMonitoring) - 'monitoring-config-hash': monitoringConfigHash - }) + tags: tags containerImage: '${acrResourceName}.azurecr.io/content-gen-api:${imageTag}' cpu: 2 memoryInGB: 4 diff --git a/infra/main_custom.bicep b/infra/main_custom.bicep index ef6332e2a..99713345c 100644 --- a/infra/main_custom.bicep +++ b/infra/main_custom.bicep @@ -1068,20 +1068,10 @@ resource aciTelemetry 'Microsoft.Resources/deployments@2025-04-01' = if (enableT } } -// Hash that changes whenever the monitoring config is toggled. -// Used as an ACI tag so that toggling enableMonitoring forces ARM to detect drift on the -// container group, triggering a restart and re-applying env vars like -// APPLICATIONINSIGHTS_CONNECTION_STRING. ACI does not natively support forceUpdateTag, -// and tags must be calculatable at deployment-start (no runtime references allowed). 
-var monitoringConfigHash = uniqueString(string(enableMonitoring)) - resource containerInstance 'Microsoft.ContainerInstance/containerGroups@2025-09-01' = if (shouldDeployACI) { name: containerInstanceName location: solutionLocation - tags: union(tags, { - 'monitoring-enabled': string(enableMonitoring) - 'monitoring-config-hash': monitoringConfigHash - }) + tags: tags identity: { type: 'UserAssigned' userAssignedIdentities: { diff --git a/infra/monitoring/README.md b/infra/monitoring/README.md deleted file mode 100644 index 3976fa378..000000000 --- a/infra/monitoring/README.md +++ /dev/null @@ -1,82 +0,0 @@ -# Add monitoring after deployment (standalone) - -Use this when the main accelerator was deployed with -`enableMonitoring=false` and you now want to add **Log Analytics + Application -Insights** without re-running the full `azd up`. - -Resource names match `infra/main.bicep` exactly: - -- Log Analytics: `log-${solutionSuffix}` -- App Insights: `appi-${solutionSuffix}` - -…where `solutionSuffix = toLower("${solutionName}${solutionUniqueText}")` -(symbols stripped). So after this runs, the rest of the solution can find -them by name with no further changes. - -## Parameters - -| Name | Required | Description | -| --- | --- | --- | -| `solutionName` | Yes | Same value you passed to `main.bicep` / `azd` (3-15 chars). | -| `solutionUniqueText` | No | Same as main; defaults to the same `uniqueString(...)` expression. Override only if the original deployment used a custom value. | -| `location` | No | Region for the resources. Defaults to RG location. | -| `tags` | No | Tags applied to both resources. | -| `retentionInDays` | No | Defaults to 365 (matches main). 
| - -## Deploy - -```bash -RG="" -SOLUTION_NAME="" - -az deployment group create \ - --resource-group "$RG" \ - --name monitoring \ - --template-file infra/monitoring/monitoring.bicep \ - --parameters solutionName="$SOLUTION_NAME" -``` - -Capture the outputs: - -```bash -APPI_ID=$(az deployment group show -g "$RG" -n monitoring \ - --query properties.outputs.applicationInsightsResourceId.value -o tsv) -APPI_CS=$(az deployment group show -g "$RG" -n monitoring \ - --query properties.outputs.applicationInsightsConnectionString.value -o tsv) -``` - -## Wire the app to send telemetry - -Set the connection string on the running app(s): - -```bash -# App Service example -az webapp config appsettings set -g "$RG" -n \ - --settings APPLICATIONINSIGHTS_CONNECTION_STRING="$APPI_CS" - -# Container App example -az containerapp update -g "$RG" -n \ - --set-env-vars APPLICATIONINSIGHTS_CONNECTION_STRING="$APPI_CS" -``` - -Or, if managed via azd: - -```bash -azd env set APPLICATIONINSIGHTS_CONNECTION_STRING "$APPI_CS" -azd deploy # re-deploy app code only, no infra changes -``` - -## Idempotency / re-runs - -Re-running this deployment against the same RG is safe — AVM modules use -stable names so existing resources are updated in place rather than -duplicated. - -## Caveat - -This template **only** creates Log Analytics + App Insights. It does **not** -re-wire `main.bicep`'s diagnostic settings on other resources (Storage, Key -Vault, Cosmos, etc.) which are normally created by `main.bicep` when -`enableMonitoring=true`. If you need those too, the cleanest fix is still to -re-run `azd provision` with `enableMonitoring=true` — Bicep will add only -the missing diagnostic settings without recreating existing resources. 
diff --git a/infra/monitoring/monitoring.bicep b/infra/monitoring/monitoring.bicep deleted file mode 100644 index 05440ad0c..000000000 --- a/infra/monitoring/monitoring.bicep +++ /dev/null @@ -1,97 +0,0 @@ -// ============================================================================ -// Monitoring add-on (standalone deployment) -// ---------------------------------------------------------------------------- -// Deploys Log Analytics Workspace + Application Insights into an EXISTING -// resource group, using the same naming convention as infra/main.bicep -// (`log-${solutionSuffix}` and `appi-${solutionSuffix}`). -// -// Use this when the main accelerator was deployed with `enableMonitoring=false` -// and you want to add monitoring afterwards WITHOUT re-running the full -// deployment. -// -// After this deployment completes, set the App Service / Container App -// `APPLICATIONINSIGHTS_CONNECTION_STRING` setting to the value emitted by the -// `applicationInsightsConnectionString` output (or run `azd env set` and -// re-deploy the app code only). -// -// Scope: resourceGroup -// ============================================================================ - -targetScope = 'resourceGroup' - -@minLength(3) -@maxLength(15) -@description('Required. Same `solutionName` you passed to main.bicep / azd. Used to derive the resource names.') -param solutionName string - -@description('Optional. Same `solutionUniqueText` used by main.bicep. Defaults to the same expression: substring(uniqueString(subscription().id, resourceGroup().name, solutionName), 0, 5).') -param solutionUniqueText string = substring(uniqueString(subscription().id, resourceGroup().name, solutionName), 0, 5) - -@description('Optional. Azure region for the new resources. Defaults to the resource group location.') -param location string = resourceGroup().location - -@description('Optional. Tags applied to both resources.') -param tags object = {} - -@description('Optional. 
Data retention (days) for both Log Analytics and Application Insights.') -@minValue(30) -@maxValue(730) -param retentionInDays int = 365 - -// Mirror the suffix logic from main.bicep so names line up exactly. -var solutionSuffix = toLower(trim(replace( - replace( - replace(replace(replace(replace('${solutionName}${solutionUniqueText}', '-', ''), '_', ''), '.', ''), '/', ''), - ' ', - '' - ), - '*', - '' -))) - -var logAnalyticsWorkspaceResourceName = 'log-${solutionSuffix}' -var applicationInsightsResourceName = 'appi-${solutionSuffix}' - -// ========== Log Analytics Workspace ========== -module logAnalyticsWorkspace 'br/public:avm/res/operational-insights/workspace:0.15.0' = { - name: take('avm.res.operational-insights.workspace.${logAnalyticsWorkspaceResourceName}', 64) - params: { - name: logAnalyticsWorkspaceResourceName - tags: tags - location: location - skuName: 'PerGB2018' - dataRetention: retentionInDays - features: { enableLogAccessUsingOnlyResourcePermissions: true } - diagnosticSettings: [{ useThisWorkspace: true }] - } -} - -// ========== Application Insights ========== -module applicationInsights 'br/public:avm/res/insights/component:0.7.1' = { - name: take('avm.res.insights.component.${applicationInsightsResourceName}', 64) - params: { - name: applicationInsightsResourceName - tags: tags - location: location - retentionInDays: retentionInDays - kind: 'web' - disableIpMasking: false - flowType: 'Bluefield' - workspaceResourceId: logAnalyticsWorkspace.outputs.resourceId - } -} - -@description('Resource ID of the Log Analytics workspace.') -output logAnalyticsWorkspaceResourceId string = logAnalyticsWorkspace.outputs.resourceId - -@description('Name of the Log Analytics workspace.') -output logAnalyticsWorkspaceName string = logAnalyticsWorkspaceResourceName - -@description('Resource ID of the Application Insights component.') -output applicationInsightsResourceId string = applicationInsights.outputs.resourceId - -@description('Name of the Application 
Insights component.') -output applicationInsightsName string = applicationInsightsResourceName - -@description('Connection string for Application Insights. Set this on your app as APPLICATIONINSIGHTS_CONNECTION_STRING.') -output applicationInsightsConnectionString string = applicationInsights.outputs.connectionString From bfc55410b8c3fb5ae4a71379fe8267e235598d0b Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Thu, 4 Jun 2026 12:59:58 +0530 Subject: [PATCH 13/17] test: exclude token-telemetry modules from coverage Mirror the coverage exclusions added in the standard cross-accelerator integration (customer-chatbot PR #236). The new telemetry helpers are defensive scaffolding and don't need to contribute to the project-wide --cov-fail-under=20 gate. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/pytest.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pytest.ini b/src/pytest.ini index c503390e3..b66b36bda 100644 --- a/src/pytest.ini +++ b/src/pytest.ini @@ -43,6 +43,8 @@ omit = */hypercorn.conf.py */ApiApp.Dockerfile */WebApp.Dockerfile + */telemetry.py + */llm_token_telemetry.py [coverage:report] exclude_lines = From 69aba687286a9187510b727ffd8cb26a94e5b238 Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Thu, 4 Jun 2026 13:13:37 +0530 Subject: [PATCH 14/17] feat: instrument missing agent calls for token telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap two additional agent .run() call sites that were emitting tokens without telemetry, completing parity with the cross-accelerator pattern (customer-chatbot PR #236): * orchestrator.select_products() — research agent run is now metered via the existing _RequestTokenTracker pattern and flushed on the 'select_products' source. * services.title_service.generate_title() — wraps the title agent run in a TokenUsageScope; user_id and conversation_id are now threaded from app.py so events carry correlation ids. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/backend/app.py | 5 +++-- src/backend/orchestrator.py | 9 +++++++++ src/backend/services/title_service.py | 25 +++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/src/backend/app.py b/src/backend/app.py index 2bd4e509f..28a1ee8a8 100644 --- a/src/backend/app.py +++ b/src/backend/app.py @@ -337,7 +337,7 @@ async def _handle_parse_brief( if not has_existing_title: title_service = get_title_service() - generated_title = await title_service.generate_title(message) + generated_title = await title_service.generate_title(message, user_id=user_id, conversation_id=conversation_id) await cosmos_service.add_message_to_conversation( conversation_id=conversation_id, @@ -354,6 +354,7 @@ async def _handle_parse_brief( # Parse the brief brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id, conversation_id=conversation_id) + brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id, conversation_id=conversation_id) if blocked: track_event_if_configured("Error_RAI_Check_Failed", {"conversation_id": conversation_id, "user_id": user_id, "status": "Brief parse blocked by RAI"}) @@ -1134,7 +1135,7 @@ async def _handle_general_chat( if not has_existing_title: title_service = get_title_service() - generated_title = await title_service.generate_title(message) + generated_title = await title_service.generate_title(message, user_id=user_id, conversation_id=conversation_id) await cosmos_service.add_message_to_conversation( conversation_id=conversation_id, diff --git a/src/backend/orchestrator.py b/src/backend/orchestrator.py index 63b55832e..fb60d50fb 100644 --- a/src/backend/orchestrator.py +++ b/src/backend/orchestrator.py @@ -1484,7 +1484,16 @@ async def select_products( """ try: + token_acc = self._new_token_accumulator() response = await research_agent.run(select_prompt) + try: + 
token_acc.record_response(agent_name="research_agent", response=response) + except Exception as _tu_err: + logger.debug("token_usage record (research_agent) failed: %s", _tu_err) + try: + token_acc.flush(source="select_products") + except Exception: + pass response_text = str(response) # Extract JSON from response diff --git a/src/backend/services/title_service.py b/src/backend/services/title_service.py index e849ca22d..9865b6bd1 100644 --- a/src/backend/services/title_service.py +++ b/src/backend/services/title_service.py @@ -13,6 +13,8 @@ from azure.identity import DefaultAzureCredential from settings import app_settings +from telemetry import token_emitter +from llm_token_telemetry import TokenUsageScope logger = logging.getLogger(__name__) @@ -89,7 +91,13 @@ def _fallback_title(message: str) -> str: words = message.strip().split()[:4] return " ".join(words) if words else "New Conversation" - async def generate_title(self, first_user_message: str) -> str: + async def generate_title( + self, + first_user_message: str, + *, + user_id: str = "", + conversation_id: str = "", + ) -> str: """ Generate a concise conversation title from the first user message. 
@@ -116,7 +124,20 @@ async def generate_title(self, first_user_message: str) -> str: ) try: - response = await self._agent.run(prompt) + deployment = ( + app_settings.ai_foundry.model_deployment + if app_settings.ai_foundry.use_foundry + else app_settings.azure_openai.gpt_model + ) or "" + with TokenUsageScope( + token_emitter, + agent_name="title_agent", + model_deployment_name=deployment, + user_id=user_id, + session_id=conversation_id, + ) as scope: + response = await self._agent.run(prompt) + scope.add(response) # Clean up the response title = str(response).strip().splitlines()[0].strip() From 4aaf0007dc2b852f3827914a724c32cd59d72f8e Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Thu, 4 Jun 2026 16:56:47 +0530 Subject: [PATCH 15/17] test: update generate_title assertion for new correlation kwargs The generate_title() signature now accepts user_id and conversation_id keyword arguments to thread correlation ids into TokenUsageScope. Update the existing test to match the new call signature. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/tests/test_app_title_endpoints.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tests/test_app_title_endpoints.py b/src/tests/test_app_title_endpoints.py index 6e4fe0650..f8867d132 100644 --- a/src/tests/test_app_title_endpoints.py +++ b/src/tests/test_app_title_endpoints.py @@ -315,7 +315,9 @@ async def test_generates_title_for_new_conversation(self, client): assert resp.status_code == 200 mock_title_svc.generate_title.assert_called_once_with( - "I need a social media post about paint products" + "I need a social media post about paint products", + user_id="user-1", + conversation_id="conv-chat-1", ) @pytest.mark.asyncio From 194beb3b99f35625b9390e683203d915df01609b Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 15 Jun 2026 14:14:44 +0530 Subject: [PATCH 16/17] implement copilot review comments --- src/backend/app.py | 1 - src/backend/llm_token_telemetry.py | 24 +-- src/backend/orchestrator.py | 8 +- src/backend/services/title_service.py | 2 +- src/backend/telemetry.py | 6 +- src/pytest.ini | 2 - src/tests/test_llm_token_telemetry.py | 220 ++++++++++++++++++++++++++ 7 files changed, 243 insertions(+), 20 deletions(-) create mode 100644 src/tests/test_llm_token_telemetry.py diff --git a/src/backend/app.py b/src/backend/app.py index 28a1ee8a8..3cb26786f 100644 --- a/src/backend/app.py +++ b/src/backend/app.py @@ -354,7 +354,6 @@ async def _handle_parse_brief( # Parse the brief brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id, conversation_id=conversation_id) - brief, questions, blocked = await orchestrator.parse_brief(message, user_id=user_id, conversation_id=conversation_id) if blocked: track_event_if_configured("Error_RAI_Check_Failed", {"conversation_id": conversation_id, "user_id": user_id, "status": "Brief parse blocked by RAI"}) diff --git a/src/backend/llm_token_telemetry.py b/src/backend/llm_token_telemetry.py 
index 91f670c5c..745fceafd 100644 --- a/src/backend/llm_token_telemetry.py +++ b/src/backend/llm_token_telemetry.py @@ -177,6 +177,13 @@ def _to_int(value: Any, default: int = 0) -> int: return default +def _to_int_or_none(value: Any) -> Optional[int]: + """Like :func:`_to_int` but preserves ``None`` for missing/absent values.""" + if value is None: + return None + return _to_int(value) + + def _get(obj: Any, key: str, default: Any = None) -> Any: """Read an attribute or dict key uniformly.""" if obj is None: @@ -321,11 +328,11 @@ def extract_realtime_usage(response_obj: Any) -> Optional[TokenUsage]: input_tokens=inp, output_tokens=out, total_tokens=tot, - input_audio_tokens=_to_int(_get(in_details, "audio_tokens")), - input_text_tokens=_to_int(_get(in_details, "text_tokens")), - input_cached_tokens=_to_int(_get(in_details, "cached_tokens")), - output_audio_tokens=_to_int(_get(out_details, "audio_tokens")), - output_text_tokens=_to_int(_get(out_details, "text_tokens")), + input_audio_tokens=_to_int_or_none(_get(in_details, "audio_tokens")), + input_text_tokens=_to_int_or_none(_get(in_details, "text_tokens")), + input_cached_tokens=_to_int_or_none(_get(in_details, "cached_tokens")), + output_audio_tokens=_to_int_or_none(_get(out_details, "audio_tokens")), + output_text_tokens=_to_int_or_none(_get(out_details, "text_tokens")), ) # Only return if at least one non-zero count surfaced. 
if record.has_any or any( @@ -814,14 +821,13 @@ def emit_all( **dimensions, ) - self._log.info( - "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d %s", + self._log.debug( + "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d", agent_name, model_deployment_name, usage.input_tokens, usage.output_tokens, usage.total_tokens, - " ".join(f"{k}={v}" for k, v in dimensions.items() if v), ) @@ -886,7 +892,7 @@ def add(self, source: Any) -> Optional[TokenUsage]: """ start_ns = time.perf_counter_ns() try: - found = extract_usage(source) or extract_usage_from_stream_chunk(source) + found = extract_usage_from_stream_chunk(source) except Exception as exc: # belt + braces; extractors are already safe logger.debug("TokenUsageScope.add failed: %s", exc, exc_info=True) return None diff --git a/src/backend/orchestrator.py b/src/backend/orchestrator.py index b339db377..7c7e769f8 100644 --- a/src/backend/orchestrator.py +++ b/src/backend/orchestrator.py @@ -599,7 +599,7 @@ def record_image_api_response( usage = extract_usage_from_dict(response_json.get("usage")) if not usage: return False - if model and agent_name not in self._agent_model_map: + if model: self._agent_model_map[agent_name] = model self._add(agent_name, usage) return True @@ -637,12 +637,10 @@ def flush(self, *, source: str = "") -> None: primary_model=primary_model, **dims, ) - logger.info( - "[TOKEN USAGE] source=%s user=%s conv=%s total=%d (in=%d, out=%d) " + logger.debug( + "[TOKEN USAGE] source=%s total=%d (in=%d, out=%d) " "agents=%s models=%s", source, - self._user_id, - self._conversation_id, self.total.total_tokens, self.total.input_tokens, self.total.output_tokens, diff --git a/src/backend/services/title_service.py b/src/backend/services/title_service.py index a2d6e6685..c6a39e7c6 100644 --- a/src/backend/services/title_service.py +++ b/src/backend/services/title_service.py @@ -127,7 +127,7 @@ async def generate_title( agent_name="title_agent", model_deployment_name=deployment, 
user_id=user_id, - session_id=conversation_id, + conversation_id=conversation_id, ) as scope: response = await self._agent.run(prompt) scope.add(response) diff --git a/src/backend/telemetry.py b/src/backend/telemetry.py index b5af65460..0a2c34f80 100644 --- a/src/backend/telemetry.py +++ b/src/backend/telemetry.py @@ -2,8 +2,10 @@ A single :class:`TokenUsageEmitter` is constructed at import time so every router/utility shares the same App Insights connection-string resolution and -static dimensions. Importing this module has no side effects beyond reading -``APPLICATIONINSIGHTS_CONNECTION_STRING`` and the env vars documented below. +static dimensions. Beyond reading ``APPLICATIONINSIGHTS_CONNECTION_STRING`` and +the env vars documented below, constructing that emitter also resolves the +optional App Insights event sink, which may import +``azure.monitor.events.extension`` when the package is installed. Optional environment variables ------------------------------ diff --git a/src/pytest.ini b/src/pytest.ini index b66b36bda..c503390e3 100644 --- a/src/pytest.ini +++ b/src/pytest.ini @@ -43,8 +43,6 @@ omit = */hypercorn.conf.py */ApiApp.Dockerfile */WebApp.Dockerfile - */telemetry.py - */llm_token_telemetry.py [coverage:report] exclude_lines = diff --git a/src/tests/test_llm_token_telemetry.py b/src/tests/test_llm_token_telemetry.py new file mode 100644 index 000000000..f465f4c2d --- /dev/null +++ b/src/tests/test_llm_token_telemetry.py @@ -0,0 +1,220 @@ +"""Focused unit tests for the token-usage telemetry helpers. + +Covers the supported usage response shapes (framework ``usage_details``, +aggregated message ``contents`` usage, raw OpenAI ``usage`` fallback, streaming +chunk metadata, and realtime/voice sub-counts) plus ``TokenUsage`` arithmetic, +``TokenUsageScope`` accumulation, and ``TokenUsageEmitter`` behaviour +(user_id hashing, pricing, and the disabled no-op path). + +These guard against regressions as the Agent Framework / OpenAI SDK usage +shapes evolve. 
+""" +from types import SimpleNamespace + +import pytest + +from llm_token_telemetry import ( + EVENT_AGENT, + EVENT_MODEL, + EVENT_SUMMARY, + TokenUsage, + TokenUsageEmitter, + TokenUsageScope, + extract_realtime_usage, + extract_usage, + extract_usage_from_dict, + extract_usage_from_stream_chunk, +) + + +def test_extract_usage_from_usage_details_attr(): + """Framework result exposing ``usage_details`` with *_token_count keys.""" + result = SimpleNamespace( + usage_details=SimpleNamespace( + input_token_count=120, output_token_count=30, total_token_count=150 + ) + ) + usage = extract_usage(result) + assert usage == TokenUsage(input_tokens=120, output_tokens=30, total_tokens=150) + + +def test_extract_usage_from_raw_openai_usage(): + """OpenAI ChatCompletion shape via ``raw_representation.usage``.""" + result = SimpleNamespace( + raw_representation=SimpleNamespace( + usage={"prompt_tokens": 10, "completion_tokens": 5} + ) + ) + usage = extract_usage(result) + assert usage == TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15) + + +def test_extract_usage_aggregates_message_contents(): + """Usage spread across ``messages[*].contents[*].usage_details`` is summed.""" + msg = SimpleNamespace( + contents=[ + SimpleNamespace(usage_details={"input_tokens": 4, "output_tokens": 1}), + SimpleNamespace(usage_details={"input_tokens": 6, "output_tokens": 2}), + ] + ) + result = SimpleNamespace(messages=[msg]) + usage = extract_usage(result) + assert usage == TokenUsage(input_tokens=10, output_tokens=3, total_tokens=13) + + +def test_extract_usage_returns_none_for_unknown_shape(): + assert extract_usage(None) is None + assert extract_usage(SimpleNamespace(foo="bar")) is None + + +def test_extract_usage_from_dict_fallback(): + usage = extract_usage_from_dict({"prompt_tokens": 7, "completion_tokens": 3}) + assert usage == TokenUsage(input_tokens=7, output_tokens=3, total_tokens=10) + assert extract_usage_from_dict({}) is None + + +def 
test_extract_usage_from_stream_chunk_metadata(): + chunk = SimpleNamespace(metadata={"usage": {"input_tokens": 2, "output_tokens": 8}}) + usage = extract_usage_from_stream_chunk(chunk) + assert usage == TokenUsage(input_tokens=2, output_tokens=8, total_tokens=10) + + +def test_extract_realtime_usage_omits_absent_subcounts(): + """When the provider does not report sub-counts they stay ``None`` so the + event props omit them (rather than emitting a misleading ``0``).""" + response = {"usage": {"input_tokens": 100, "output_tokens": 20}} + usage = extract_realtime_usage(response) + assert usage.input_tokens == 100 + assert usage.output_tokens == 20 + assert usage.input_audio_tokens is None + assert usage.output_text_tokens is None + props = usage.to_event_props() + assert "input_audio_tokens" not in props + assert "output_text_tokens" not in props + + +def test_extract_realtime_usage_includes_present_subcounts(): + response = { + "usage": { + "input_tokens": 100, + "output_tokens": 20, + "input_token_details": {"audio_tokens": 80, "cached_tokens": 0}, + "output_token_details": {"text_tokens": 20}, + } + } + usage = extract_realtime_usage(response) + assert usage.input_audio_tokens == 80 + assert usage.input_cached_tokens == 0 + assert usage.output_text_tokens == 20 + props = usage.to_event_props() + assert props["input_audio_tokens"] == "80" + assert props["input_cached_tokens"] == "0" + + +def test_extract_realtime_usage_returns_none_when_no_usage(): + assert extract_realtime_usage({}) is None + + +def test_token_usage_add_handles_none_subcounts(): + a = TokenUsage(input_tokens=1, output_tokens=1, total_tokens=2) + b = TokenUsage(input_tokens=2, output_tokens=3, total_tokens=5, input_audio_tokens=4) + combined = a + b + assert combined.input_tokens == 3 + assert combined.output_tokens == 4 + assert combined.total_tokens == 7 + assert combined.input_audio_tokens == 4 + assert combined.output_text_tokens is None + + +def 
test_to_event_props_only_includes_set_subcounts(): + usage = TokenUsage(input_tokens=5, output_tokens=5, total_tokens=10, input_text_tokens=5) + props = usage.to_event_props() + assert props["input_text_tokens"] == "5" + assert "input_audio_tokens" not in props + + +def _emitter_with_sink(): + """Return (emitter, events) where events captures (name, props) tuples.""" + events: list[tuple[str, dict]] = [] + emitter = TokenUsageEmitter( + connection_string="InstrumentationKey=test", + event_sink=lambda name, props: events.append((name, props)), + ) + return emitter, events + + +def test_token_usage_scope_accumulates_and_emits(): + emitter, events = _emitter_with_sink() + with TokenUsageScope( + emitter, + agent_name="title_agent", + model_deployment_name="gpt-4o", + conversation_id="conv-1", + ) as scope: + scope.add(SimpleNamespace(usage={"prompt_tokens": 10, "completion_tokens": 5})) + scope.add(SimpleNamespace(usage={"prompt_tokens": 4, "completion_tokens": 1})) + + assert scope.usage == TokenUsage(input_tokens=14, output_tokens=6, total_tokens=20) + names = {name for name, _ in events} + assert EVENT_AGENT in names + assert EVENT_MODEL in names + assert EVENT_SUMMARY in names + agent_props = next(p for n, p in events if n == EVENT_AGENT) + assert agent_props["conversation_id"] == "conv-1" + + +def test_token_usage_scope_no_emit_when_no_usage(): + emitter, events = _emitter_with_sink() + with TokenUsageScope(emitter, agent_name="a", model_deployment_name="m"): + pass + assert events == [] + + +def test_emitter_hashes_user_id_before_emitting(): + events: list[tuple[str, dict]] = [] + emitter = TokenUsageEmitter( + connection_string="InstrumentationKey=test", + event_sink=lambda name, props: events.append((name, props)), + user_id_hasher=lambda v: "HASHED", + ) + emitter.emit_agent( + agent_name="a", + model_deployment_name="gpt-4o", + usage=TokenUsage(input_tokens=1, output_tokens=1, total_tokens=2), + user_id="alice@example.com", + ) + assert events + _, props = 
events[0] + assert props["user_id"] == "HASHED" + + +def test_emitter_attaches_estimated_cost_when_pricing_configured(): + events: list[tuple[str, dict]] = [] + emitter = TokenUsageEmitter( + connection_string="InstrumentationKey=test", + event_sink=lambda name, props: events.append((name, props)), + pricing={"gpt-4o": (0.0025, 0.01)}, + ) + emitter.emit_agent( + agent_name="a", + model_deployment_name="gpt-4o", + usage=TokenUsage(input_tokens=1000, output_tokens=1000, total_tokens=2000), + ) + _, props = events[0] + assert props["estimated_cost_usd"] == "0.012500" + + +def test_disabled_emitter_is_a_noop(): + """No connection string -> emitter disabled -> sink never invoked.""" + events: list[tuple[str, dict]] = [] + emitter = TokenUsageEmitter( + connection_string="", + event_sink=lambda name, props: events.append((name, props)), + ) + assert emitter.enabled is False + emitter.emit_agent( + agent_name="a", + model_deployment_name="m", + usage=TokenUsage(input_tokens=1, output_tokens=1, total_tokens=2), + ) + assert events == [] From 692c3ab38b057ff361813aa0a55b68c304c3ec1a Mon Sep 17 00:00:00 2001 From: Ayaz-Microsoft Date: Mon, 15 Jun 2026 15:24:21 +0530 Subject: [PATCH 17/17] fix: address Copilot review comments on token telemetry attribution - Drop redundant extract_usage fallback in _RequestTokenTracker.record_event - Mark agent model as 'multiple' for mixed-model agents instead of locking first-seen - Thread user_id/conversation_id into select_products token telemetry - Fall back to gpt_model for Foundry title deployment to avoid empty model dimension Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/backend/app.py | 4 +++- src/backend/orchestrator.py | 23 ++++++++++++++++------- src/backend/services/title_service.py | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/backend/app.py b/src/backend/app.py index 3cb26786f..37748f976 100644 --- a/src/backend/app.py +++ b/src/backend/app.py @@ -687,7 +687,9 @@ 
async def _handle_search_products( result = await orchestrator.select_products( request_text=message, current_products=current_products, - available_products=available_products + available_products=available_products, + user_id=user_id, + conversation_id=conversation_id ) # Save assistant response diff --git a/src/backend/orchestrator.py b/src/backend/orchestrator.py index 7c7e769f8..9e45cad49 100644 --- a/src/backend/orchestrator.py +++ b/src/backend/orchestrator.py @@ -554,7 +554,11 @@ def _add(self, agent_name: str, usage: TokenUsage) -> None: agent = agent_name or "unknown_agent" model = self._resolve_model(agent) prev_usage, prev_model = self.by_agent.get(agent, (TokenUsage(), model)) - self.by_agent[agent] = (prev_usage + usage, prev_model or model) + if prev_model and model and prev_model != model: + resolved_model = "multiple" + else: + resolved_model = prev_model or model + self.by_agent[agent] = (prev_usage + usage, resolved_model) if model: self.by_model[model] = self.by_model.get(model, TokenUsage()) + usage self.total = self.total + usage @@ -574,9 +578,10 @@ def record_response(self, *, agent_name: str, response: Any) -> bool: def record_event(self, event: Any) -> bool: """Extract usage from a workflow ``run_stream`` event and record it. - Reads ``event.executor_id`` for per-agent attribution and falls back - through ``extract_usage_from_stream_chunk`` then ``extract_usage`` to - cover both ``AgentRunUpdateEvent`` and ``AgentRunEvent`` data shapes. + Reads ``event.executor_id`` for per-agent attribution and uses + ``extract_usage_from_stream_chunk`` (which tries the top-level shape + then ``metadata.usage``) to cover both ``AgentRunUpdateEvent`` and + ``AgentRunEvent`` data shapes. 
""" if event is None: return False @@ -584,7 +589,7 @@ def record_event(self, event: Any) -> bool: data = getattr(event, "data", None) if data is None or not executor_id: return False - usage = extract_usage_from_stream_chunk(data) or extract_usage(data) + usage = extract_usage_from_stream_chunk(data) if usage: self._add(executor_id, usage) return True @@ -1442,7 +1447,9 @@ async def select_products( self, request_text: str, current_products: list = None, - available_products: list = None + available_products: list = None, + user_id: str = "", + conversation_id: str = "" ) -> dict: """ Select or modify product selection via natural language. @@ -1493,7 +1500,9 @@ async def select_products( """ try: - token_acc = self._new_token_accumulator() + token_acc = self._new_token_accumulator( + conversation_id=conversation_id, user_id=user_id + ) response = await research_agent.run(select_prompt) try: token_acc.record_response(agent_name="research_agent", response=response) diff --git a/src/backend/services/title_service.py b/src/backend/services/title_service.py index c6a39e7c6..0f0b2e161 100644 --- a/src/backend/services/title_service.py +++ b/src/backend/services/title_service.py @@ -118,7 +118,7 @@ async def generate_title( try: deployment = ( - app_settings.ai_foundry.model_deployment + (app_settings.ai_foundry.model_deployment or app_settings.azure_openai.gpt_model) if app_settings.ai_foundry.use_foundry else app_settings.azure_openai.gpt_model ) or ""