From a216f1a22f324d0f23e21895a423fec0bc05df95 Mon Sep 17 00:00:00 2001
From: git-hulk
Date: Tue, 9 Dec 2025 11:36:46 +0800
Subject: [PATCH 1/2] chore(ai-proxy): add support for recording the cached token count for Gemini

Currently, ai-proxy only records the prompt/completion token counts and
not the prompt cache tokens, which would help users observe the cache
hit ratio and improve performance. This PR introduces
`prompt_cache_tokens` to record the cached prompt token count for
Gemini and OpenAI.
---
 kong/llm/adapters/gemini.lua                |  2 ++
 kong/llm/drivers/gemini.lua                 | 33 +++++++++++--------
 kong/llm/drivers/shared.lua                 |  5 +++
 kong/llm/plugin/observability.lua           |  1 +
 .../shared-filters/parse-json-response.lua  |  4 +++
 .../shared-filters/serialize-analytics.lua  |  1 +
 .../filters/transform-request.lua           |  2 ++
 .../filters/transform-response.lua          |  2 ++
 kong/plugins/prometheus/exporter.lua        |  4 +++
 kong/reports.lua                            | 14 +++++---
 .../02-openai_integration_spec.lua          |  1 +
 .../09-streaming_integration_spec.lua       |  1 +
 .../11-gemini_integration_spec.lua          |  1 +
 .../02-integration_spec.lua                 |  1 +
 14 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/kong/llm/adapters/gemini.lua b/kong/llm/adapters/gemini.lua
index eedf66d0e28..f11cfb5f498 100644
--- a/kong/llm/adapters/gemini.lua
+++ b/kong/llm/adapters/gemini.lua
@@ -50,6 +50,8 @@ function _GeminiAdapter:extract_metadata(response_body)
   return {
     prompt_tokens = response_body.usageMetadata.promptTokenCount or 0,
     completion_tokens = response_body.usageMetadata.candidatesTokenCount or 0,
+    total_tokens = response_body.usageMetadata.totalTokenCount or 0,
+    prompt_cache_tokens = response_body.usageMetadata.cachedContentTokenCount or 0,
   }
 end
 
diff --git a/kong/llm/drivers/gemini.lua b/kong/llm/drivers/gemini.lua
index d09edb88292..22b19b1bed9 100644
--- a/kong/llm/drivers/gemini.lua
+++ b/kong/llm/drivers/gemini.lua
@@ -81,6 +81,21 @@ local function has_finish_reason(event)
       or nil
 end
 
+-- Extract usage metadata from Gemini response
+-- For Gemini usage metadata reference: https://ai.google.dev/api/generate-content#UsageMetadata
+local function extract_usage(usageMetadata)
+  if not usageMetadata then
+    return {}
+  end
+
+  return {
+    prompt_tokens = usageMetadata.promptTokenCount or 0,
+    completion_tokens = usageMetadata.candidatesTokenCount or 0,
+    total_tokens = usageMetadata.totalTokenCount or 0,
+    prompt_cache_tokens = usageMetadata.cachedContentTokenCount or 0,
+  }
+end
+
 local function handle_stream_event(event_t, model_info, route_type)
   -- discard empty frames, it should either be a random new line, or comment
   if (not event_t.data) or (#event_t.data < 1) then
@@ -100,10 +115,8 @@ local function handle_stream_event(event_t, model_info, route_type)
   local finish_reason = has_finish_reason(event) -- may be nil
 
   if is_response_content(event) then
-    local metadata = {}
-    metadata.finish_reason = finish_reason
-    metadata.completion_tokens = event.usageMetadata and event.usageMetadata.candidatesTokenCount or 0
-    metadata.prompt_tokens = event.usageMetadata and event.usageMetadata.promptTokenCount or 0
+    local metadata = extract_usage(event.usageMetadata)
+    metadata.finish_reason = finish_reason
 
     local new_event = {
       model = model_info.name,
@@ -122,10 +135,8 @@ local function handle_stream_event(event_t, model_info, route_type)
     return cjson.encode(new_event), nil, metadata
 
   elseif is_tool_content(event) then
-    local metadata = {}
-    metadata.finish_reason = finish_reason
-    metadata.completion_tokens = event.usageMetadata and event.usageMetadata.candidatesTokenCount or 0
-    metadata.prompt_tokens = event.usageMetadata and event.usageMetadata.promptTokenCount or 0
+    local metadata = extract_usage(event.usageMetadata)
+    metadata.finish_reason = finish_reason
 
     if event.candidates and #event.candidates > 0 then
       local new_event = {
@@ -445,11 +456,7 @@ local function from_gemini_chat_openai(response, model_info, route_type)
 
     -- process analytics
     if response.usageMetadata then
-      messages.usage = {
-        prompt_tokens = response.usageMetadata.promptTokenCount,
-        completion_tokens = response.usageMetadata.candidatesTokenCount,
-        total_tokens = response.usageMetadata.totalTokenCount,
-      }
+      messages.usage = extract_usage(response.usageMetadata)
     end
 
   else -- probably a server fault or other unexpected response
diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua
index 939fec3477f..ba7c78db422 100644
--- a/kong/llm/drivers/shared.lua
+++ b/kong/llm/drivers/shared.lua
@@ -43,6 +43,7 @@ local log_entry_keys = {
   -- usage keys
   PROMPT_TOKENS = "prompt_tokens",
   COMPLETION_TOKENS = "completion_tokens",
+  PROMPT_CACHE_TOKENS = "prompt_cache_tokens",
   TOTAL_TOKENS = "total_tokens",
   TIME_PER_TOKEN = "time_per_token",
   COST = "cost",
@@ -844,9 +845,13 @@ function _M.post_request(conf, response_object)
       if response_object.usage.total_tokens then
         request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens
       end
+      if response_object.usage.prompt_cache_tokens then
+        request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.PROMPT_CACHE_TOKENS] = response_object.usage.prompt_cache_tokens
+      end
 
       ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", response_object.usage.prompt_tokens)
       ai_plugin_o11y.metrics_set("llm_completion_tokens_count", response_object.usage.completion_tokens)
+      ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", response_object.usage.prompt_cache_tokens)
 
       if response_object.usage.prompt_tokens and response_object.usage.completion_tokens
           and conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
diff --git a/kong/llm/plugin/observability.lua b/kong/llm/plugin/observability.lua
index 916aeb19c6f..d7c471e57fa 100644
--- a/kong/llm/plugin/observability.lua
+++ b/kong/llm/plugin/observability.lua
@@ -12,6 +12,7 @@ local metrics_schema = {
   llm_prompt_tokens_count = true,
   llm_completion_tokens_count = true,
   llm_total_tokens_count = true,
+  llm_prompt_cache_tokens_count = true,
   llm_usage_cost = true,
 }
 
diff --git a/kong/llm/plugin/shared-filters/parse-json-response.lua b/kong/llm/plugin/shared-filters/parse-json-response.lua
index 473c616de8c..97fe8efe5c2 100644
--- a/kong/llm/plugin/shared-filters/parse-json-response.lua
+++ b/kong/llm/plugin/shared-filters/parse-json-response.lua
@@ -39,6 +39,7 @@ function _M:run(_)
     else
       ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", metadata.prompt_tokens)
      ai_plugin_o11y.metrics_set("llm_completion_tokens_count", metadata.completion_tokens)
+      ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", metadata.prompt_cache_tokens)
     end
 
   else
@@ -55,6 +56,9 @@ function _M:run(_)
       if t and t.usage and t.usage.completion_tokens then
         ai_plugin_o11y.metrics_set("llm_completion_tokens_count", t.usage.completion_tokens)
       end
+      if t and t.usage and t.usage.prompt_tokens_details then
+        ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", t.usage.prompt_tokens_details.cached_tokens or 0)
+      end
     end
   end
 
diff --git a/kong/llm/plugin/shared-filters/serialize-analytics.lua b/kong/llm/plugin/shared-filters/serialize-analytics.lua
index 390d528450f..049814d6eeb 100644
--- a/kong/llm/plugin/shared-filters/serialize-analytics.lua
+++ b/kong/llm/plugin/shared-filters/serialize-analytics.lua
@@ -75,6 +75,7 @@ function _M:run(conf)
       prompt_tokens = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count"),
       completion_tokens = ai_plugin_o11y.metrics_get("llm_completion_tokens_count"),
       total_tokens = ai_plugin_o11y.metrics_get("llm_total_tokens_count"),
+      prompt_cache_tokens = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count"),
       cost = ai_plugin_o11y.metrics_get("llm_usage_cost"),
     }
     kong.log.set_serialize_value(string.format("ai.%s.usage", ai_plugin_o11y.NAMESPACE), usage)
diff --git a/kong/plugins/ai-request-transformer/filters/transform-request.lua b/kong/plugins/ai-request-transformer/filters/transform-request.lua
index 3270132e809..f126910b376 100644
--- a/kong/plugins/ai-request-transformer/filters/transform-request.lua
+++ b/kong/plugins/ai-request-transformer/filters/transform-request.lua
@@ -16,6 +16,7 @@ local FILTER_OUTPUT_SCHEMA = {
   -- TODO: refactor this so they don't need to be duplicated
   llm_prompt_tokens_count = "number",
   llm_completion_tokens_count = "number",
+  llm_prompt_cache_tokens_count = "number",
   llm_usage_cost = "number",
 }
 
@@ -93,6 +94,7 @@ function _M:run(conf)
   set_ctx("model", conf.llm.model)
   set_ctx("llm_prompt_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_tokens_count") or 0)
   set_ctx("llm_completion_tokens_count", ai_plugin_o11y.metrics_get("llm_completion_tokens_count") or 0)
+  set_ctx("llm_prompt_cache_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count") or 0)
   set_ctx("llm_usage_cost", ai_plugin_o11y.metrics_get("llm_usage_cost") or 0)
 
   -- set the body for later plugins
diff --git a/kong/plugins/ai-response-transformer/filters/transform-response.lua b/kong/plugins/ai-response-transformer/filters/transform-response.lua
index 10b714304ec..69b4a946bed 100644
--- a/kong/plugins/ai-response-transformer/filters/transform-response.lua
+++ b/kong/plugins/ai-response-transformer/filters/transform-response.lua
@@ -17,6 +17,7 @@ local FILTER_OUTPUT_SCHEMA = {
   -- TODO: refactor this so they don't need to be duplicated
   llm_prompt_tokens_count = "number",
   llm_completion_tokens_count = "number",
+  llm_prompt_cache_tokens_count = "number",
   llm_usage_cost = "number",
 }
 
@@ -190,6 +191,7 @@ function _M:run(conf)
     set_ctx("model", conf.llm.model)
     set_ctx("llm_prompt_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_tokens_count") or 0)
     set_ctx("llm_completion_tokens_count", ai_plugin_o11y.metrics_get("llm_completion_tokens_count") or 0)
+    set_ctx("llm_prompt_cache_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count") or 0)
     set_ctx("llm_usage_cost", ai_plugin_o11y.metrics_get("llm_usage_cost") or 0)
     return kong.response.exit(status, body, headers)
   end
diff --git a/kong/plugins/prometheus/exporter.lua b/kong/plugins/prometheus/exporter.lua
index d5100070b1f..ef25e089b68 100644
--- a/kong/plugins/prometheus/exporter.lua
+++ b/kong/plugins/prometheus/exporter.lua
@@ -435,6 +435,10 @@ local function log(message, serialized)
        labels_table_ai_llm_tokens[7] = "total_tokens"
         metrics.ai_llm_tokens:inc(ai_metrics.usage.total_tokens, labels_table_ai_llm_tokens)
       end
+      if ai_metrics.usage and ai_metrics.usage.prompt_cache_tokens and ai_metrics.usage.prompt_cache_tokens > 0 then
+        labels_table_ai_llm_tokens[7] = "prompt_cache_tokens"
+        metrics.ai_llm_tokens:inc(ai_metrics.usage.prompt_cache_tokens, labels_table_ai_llm_tokens)
+      end
     end
   end
 end
diff --git a/kong/reports.lua b/kong/reports.lua
index ed6e50d826a..98e93ecef98 100644
--- a/kong/reports.lua
+++ b/kong/reports.lua
@@ -55,9 +55,10 @@ local GO_PLUGINS_REQUEST_COUNT_KEY = "events:requests:go_plugins"
 local WASM_REQUEST_COUNT_KEY = "events:requests:wasm"
 
 
-local AI_RESPONSE_TOKENS_COUNT_KEY = "events:ai:response_tokens"
-local AI_PROMPT_TOKENS_COUNT_KEY = "events:ai:prompt_tokens"
-local AI_REQUEST_COUNT_KEY = "events:ai:requests"
+local AI_RESPONSE_TOKENS_COUNT_KEY     = "events:ai:response_tokens"
+local AI_PROMPT_TOKENS_COUNT_KEY       = "events:ai:prompt_tokens"
+local AI_PROMPT_CACHE_TOKENS_COUNT_KEY = "events:ai:prompt_cache_tokens"
+local AI_REQUEST_COUNT_KEY             = "events:ai:requests"
 
 
 local ROUTE_CACHE_HITS_KEY = "route_cache_hits"
@@ -248,7 +249,7 @@
 
 
 local function incr_counter(key, hit)
-  if not hit then
+  if not hit then
     hit = 1
   end
 
@@ -539,6 +540,11 @@ return {
       incr_counter(AI_RESPONSE_TOKENS_COUNT_KEY, llm_response_tokens_count)
     end
 
+    local llm_response_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
+    if llm_response_cache_tokens_count then
+      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_response_cache_tokens_count)
+    end
+
     local suffix = get_current_suffix(ctx)
     if suffix then
       incr_counter(count_key .. ":" .. suffix)
diff --git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
index 99d4026769b..34e47a00bd3 100644
--- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -51,6 +51,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 25,
       completion_tokens = 12,
       total_tokens = 37,
diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
index bdac132d114..587785123e6 100644
--- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
@@ -18,6 +18,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 18,
       completion_tokens = 13, -- this was from estimation
       total_tokens = 31,
diff --git a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua
index 93273515bc0..6fe8e6d54de 100644
--- a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua
@@ -43,6 +43,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1,
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 2,
       completion_tokens = 11,
       total_tokens = 13,
diff --git a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
index 6f2a981cf86..9c98f3f5ced 100644
--- a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
+++ b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
@@ -92,6 +92,7 @@ local _EXPECTED_CHAT_STATS_GEMINI = {
     },
     usage = {
       prompt_tokens = 2,
+      prompt_cache_tokens = 0,
       completion_tokens = 11,
       total_tokens = 13,
       time_per_token = 1,

From 7dc88297663c6c4a6b3dc7030b1a645db5a9810b Mon Sep 17 00:00:00 2001
From: git-hulk
Date: Tue, 9 Dec 2025 15:15:40 +0800
Subject: [PATCH 2/2] Improve the variable naming style

---
 kong/reports.lua | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kong/reports.lua b/kong/reports.lua
index 98e93ecef98..df03494359b 100644
--- a/kong/reports.lua
+++ b/kong/reports.lua
@@ -540,9 +540,9 @@ return {
       incr_counter(AI_RESPONSE_TOKENS_COUNT_KEY, llm_response_tokens_count)
     end
 
-    local llm_response_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
-    if llm_response_cache_tokens_count then
-      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_response_cache_tokens_count)
+    local llm_prompt_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
+    if llm_prompt_cache_tokens_count then
+      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_prompt_cache_tokens_count)
     end
 
     local suffix = get_current_suffix(ctx)