From a216f1a22f324d0f23e21895a423fec0bc05df95 Mon Sep 17 00:00:00 2001
From: git-hulk
Date: Tue, 9 Dec 2025 11:36:46 +0800
Subject: [PATCH 1/2] chore(ai-proxy): add support for recording the cached token count for Gemini

Currently, ai-proxy only records the prompt/completion token counts and
not the prompt cache tokens, which would help users observe the cache
hit ratio and improve performance. This PR introduces
`prompt_cache_tokens` to record the cached prompt token count for
Gemini and OpenAI.
---
 kong/llm/adapters/gemini.lua                |  2 ++
 kong/llm/drivers/gemini.lua                 | 33 +++++++++++--------
 kong/llm/drivers/shared.lua                 |  5 +++
 kong/llm/plugin/observability.lua           |  1 +
 .../shared-filters/parse-json-response.lua  |  4 +++
 .../shared-filters/serialize-analytics.lua  |  1 +
 .../filters/transform-request.lua           |  2 ++
 .../filters/transform-response.lua          |  2 ++
 kong/plugins/prometheus/exporter.lua        |  4 +++
 kong/reports.lua                            | 14 +++++---
 .../02-openai_integration_spec.lua          |  1 +
 .../09-streaming_integration_spec.lua       |  1 +
 .../11-gemini_integration_spec.lua          |  1 +
 .../02-integration_spec.lua                 |  1 +
 14 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/kong/llm/adapters/gemini.lua b/kong/llm/adapters/gemini.lua
index eedf66d0e28..f11cfb5f498 100644
--- a/kong/llm/adapters/gemini.lua
+++ b/kong/llm/adapters/gemini.lua
@@ -50,6 +50,8 @@ function _GeminiAdapter:extract_metadata(response_body)
   return {
     prompt_tokens = response_body.usageMetadata.promptTokenCount or 0,
     completion_tokens = response_body.usageMetadata.candidatesTokenCount or 0,
+    total_tokens = response_body.usageMetadata.totalTokenCount or 0,
+    prompt_cache_tokens = response_body.usageMetadata.cachedContentTokenCount or 0,
   }
 end
 
diff --git a/kong/llm/drivers/gemini.lua b/kong/llm/drivers/gemini.lua
index d09edb88292..22b19b1bed9 100644
--- a/kong/llm/drivers/gemini.lua
+++ b/kong/llm/drivers/gemini.lua
@@ -81,6 +81,21 @@ local function has_finish_reason(event)
       or nil
 end
 
+-- Extract usage metadata from Gemini response
+-- For Gemini usage metadata reference: https://ai.google.dev/api/generate-content#UsageMetadata
+local function extract_usage(usageMetadata)
+  if not usageMetadata then
+    return {}
+  end
+
+  return {
+    prompt_tokens = usageMetadata.promptTokenCount or 0,
+    completion_tokens = usageMetadata.candidatesTokenCount or 0,
+    total_tokens = usageMetadata.totalTokenCount or 0,
+    prompt_cache_tokens = usageMetadata.cachedContentTokenCount or 0,
+  }
+end
+
 local function handle_stream_event(event_t, model_info, route_type)
   -- discard empty frames, it should either be a random new line, or comment
   if (not event_t.data) or (#event_t.data < 1) then
@@ -100,10 +115,8 @@ local function handle_stream_event(event_t, model_info, route_type)
   local finish_reason = has_finish_reason(event) -- may be nil
 
   if is_response_content(event) then
-    local metadata = {}
-    metadata.finish_reason = finish_reason
-    metadata.completion_tokens = event.usageMetadata and event.usageMetadata.candidatesTokenCount or 0
-    metadata.prompt_tokens = event.usageMetadata and event.usageMetadata.promptTokenCount or 0
+    local metadata = extract_usage(event.usageMetadata)
+    metadata.finish_reason = finish_reason
 
     local new_event = {
       model = model_info.name,
@@ -122,10 +135,8 @@ local function handle_stream_event(event_t, model_info, route_type)
     return cjson.encode(new_event), nil, metadata
 
   elseif is_tool_content(event) then
-    local metadata = {}
-    metadata.finish_reason = finish_reason
-    metadata.completion_tokens = event.usageMetadata and event.usageMetadata.candidatesTokenCount or 0
-    metadata.prompt_tokens = event.usageMetadata and event.usageMetadata.promptTokenCount or 0
+    local metadata = extract_usage(event.usageMetadata)
+    metadata.finish_reason = finish_reason
 
     if event.candidates and #event.candidates > 0 then
       local new_event = {
@@ -445,11 +456,7 @@ local function from_gemini_chat_openai(response, model_info, route_type)
 
     -- process analytics
     if response.usageMetadata then
-      messages.usage = {
-        prompt_tokens = response.usageMetadata.promptTokenCount,
-        completion_tokens = response.usageMetadata.candidatesTokenCount,
-        total_tokens = response.usageMetadata.totalTokenCount,
-      }
+      messages.usage = extract_usage(response.usageMetadata)
     end
 
   else -- probably a server fault or other unexpected response
diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua
index 939fec3477f..ba7c78db422 100644
--- a/kong/llm/drivers/shared.lua
+++ b/kong/llm/drivers/shared.lua
@@ -43,6 +43,7 @@ local log_entry_keys = {
   -- usage keys
   PROMPT_TOKENS = "prompt_tokens",
   COMPLETION_TOKENS = "completion_tokens",
+  PROMPT_CACHE_TOKENS = "prompt_cache_tokens",
   TOTAL_TOKENS = "total_tokens",
   TIME_PER_TOKEN = "time_per_token",
   COST = "cost",
@@ -844,9 +845,13 @@ function _M.post_request(conf, response_object)
       if response_object.usage.total_tokens then
         request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens
       end
+      if response_object.usage.prompt_cache_tokens then
+        request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.PROMPT_CACHE_TOKENS] = response_object.usage.prompt_cache_tokens
+      end
 
       ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", response_object.usage.prompt_tokens)
       ai_plugin_o11y.metrics_set("llm_completion_tokens_count", response_object.usage.completion_tokens)
+      ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", response_object.usage.prompt_cache_tokens)
 
       if response_object.usage.prompt_tokens and response_object.usage.completion_tokens
           and conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
diff --git a/kong/llm/plugin/observability.lua b/kong/llm/plugin/observability.lua
index 916aeb19c6f..d7c471e57fa 100644
--- a/kong/llm/plugin/observability.lua
+++ b/kong/llm/plugin/observability.lua
@@ -12,6 +12,7 @@ local metrics_schema = {
   llm_prompt_tokens_count = true,
   llm_completion_tokens_count = true,
   llm_total_tokens_count = true,
+  llm_prompt_cache_tokens_count = true,
   llm_usage_cost = true,
 }
 
diff --git a/kong/llm/plugin/shared-filters/parse-json-response.lua b/kong/llm/plugin/shared-filters/parse-json-response.lua
index 473c616de8c..97fe8efe5c2 100644
--- a/kong/llm/plugin/shared-filters/parse-json-response.lua
+++ b/kong/llm/plugin/shared-filters/parse-json-response.lua
@@ -39,6 +39,7 @@ function _M:run(_)
     else
       ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", metadata.prompt_tokens)
      ai_plugin_o11y.metrics_set("llm_completion_tokens_count", metadata.completion_tokens)
+      ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", metadata.prompt_cache_tokens)
     end
 
   else
@@ -55,6 +56,9 @@ function _M:run(_)
       if t and t.usage and t.usage.completion_tokens then
         ai_plugin_o11y.metrics_set("llm_completion_tokens_count", t.usage.completion_tokens)
       end
+      if t and t.usage and t.usage.prompt_tokens_details then
+        ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", t.usage.prompt_tokens_details.cached_tokens or 0)
+      end
     end
   end
 
diff --git a/kong/llm/plugin/shared-filters/serialize-analytics.lua b/kong/llm/plugin/shared-filters/serialize-analytics.lua
index 390d528450f..049814d6eeb 100644
--- a/kong/llm/plugin/shared-filters/serialize-analytics.lua
+++ b/kong/llm/plugin/shared-filters/serialize-analytics.lua
@@ -75,6 +75,7 @@ function _M:run(conf)
       prompt_tokens = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count"),
       completion_tokens = ai_plugin_o11y.metrics_get("llm_completion_tokens_count"),
       total_tokens = ai_plugin_o11y.metrics_get("llm_total_tokens_count"),
+      prompt_cache_tokens = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count"),
       cost = ai_plugin_o11y.metrics_get("llm_usage_cost"),
     }
     kong.log.set_serialize_value(string.format("ai.%s.usage", ai_plugin_o11y.NAMESPACE), usage)
diff --git a/kong/plugins/ai-request-transformer/filters/transform-request.lua b/kong/plugins/ai-request-transformer/filters/transform-request.lua
index 3270132e809..f126910b376 100644
--- a/kong/plugins/ai-request-transformer/filters/transform-request.lua
+++ b/kong/plugins/ai-request-transformer/filters/transform-request.lua
@@ -16,6 +16,7 @@ local FILTER_OUTPUT_SCHEMA = {
   -- TODO: refactor this so they don't need to be duplicated
   llm_prompt_tokens_count = "number",
   llm_completion_tokens_count = "number",
+  llm_prompt_cache_tokens_count = "number",
   llm_usage_cost = "number",
 }
 
@@ -93,6 +94,7 @@ function _M:run(conf)
   set_ctx("model", conf.llm.model)
   set_ctx("llm_prompt_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_tokens_count") or 0)
   set_ctx("llm_completion_tokens_count", ai_plugin_o11y.metrics_get("llm_completion_tokens_count") or 0)
+  set_ctx("llm_prompt_cache_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count") or 0)
   set_ctx("llm_usage_cost", ai_plugin_o11y.metrics_get("llm_usage_cost") or 0)
 
   -- set the body for later plugins
diff --git a/kong/plugins/ai-response-transformer/filters/transform-response.lua b/kong/plugins/ai-response-transformer/filters/transform-response.lua
index 10b714304ec..69b4a946bed 100644
--- a/kong/plugins/ai-response-transformer/filters/transform-response.lua
+++ b/kong/plugins/ai-response-transformer/filters/transform-response.lua
@@ -17,6 +17,7 @@ local FILTER_OUTPUT_SCHEMA = {
   -- TODO: refactor this so they don't need to be duplicated
   llm_prompt_tokens_count = "number",
   llm_completion_tokens_count = "number",
+  llm_prompt_cache_tokens_count = "number",
   llm_usage_cost = "number",
 }
 
@@ -190,6 +191,7 @@ function _M:run(conf)
     set_ctx("model", conf.llm.model)
     set_ctx("llm_prompt_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_tokens_count") or 0)
     set_ctx("llm_completion_tokens_count", ai_plugin_o11y.metrics_get("llm_completion_tokens_count") or 0)
+    set_ctx("llm_prompt_cache_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count") or 0)
     set_ctx("llm_usage_cost", ai_plugin_o11y.metrics_get("llm_usage_cost") or 0)
     return kong.response.exit(status, body, headers)
   end
diff --git a/kong/plugins/prometheus/exporter.lua b/kong/plugins/prometheus/exporter.lua
index d5100070b1f..ef25e089b68 100644
--- a/kong/plugins/prometheus/exporter.lua
+++ b/kong/plugins/prometheus/exporter.lua
@@ -435,6 +435,10 @@ local function log(message, serialized)
        labels_table_ai_llm_tokens[7] = "total_tokens"
         metrics.ai_llm_tokens:inc(ai_metrics.usage.total_tokens, labels_table_ai_llm_tokens)
       end
+      if ai_metrics.usage and ai_metrics.usage.prompt_cache_tokens and ai_metrics.usage.prompt_cache_tokens > 0 then
+        labels_table_ai_llm_tokens[7] = "prompt_cache_tokens"
+        metrics.ai_llm_tokens:inc(ai_metrics.usage.prompt_cache_tokens, labels_table_ai_llm_tokens)
+      end
     end
   end
 end
diff --git a/kong/reports.lua b/kong/reports.lua
index ed6e50d826a..98e93ecef98 100644
--- a/kong/reports.lua
+++ b/kong/reports.lua
@@ -55,9 +55,10 @@ local GO_PLUGINS_REQUEST_COUNT_KEY = "events:requests:go_plugins"
 local WASM_REQUEST_COUNT_KEY = "events:requests:wasm"
 
 
-local AI_RESPONSE_TOKENS_COUNT_KEY = "events:ai:response_tokens"
-local AI_PROMPT_TOKENS_COUNT_KEY = "events:ai:prompt_tokens"
-local AI_REQUEST_COUNT_KEY = "events:ai:requests"
+local AI_RESPONSE_TOKENS_COUNT_KEY     = "events:ai:response_tokens"
+local AI_PROMPT_TOKENS_COUNT_KEY       = "events:ai:prompt_tokens"
+local AI_PROMPT_CACHE_TOKENS_COUNT_KEY = "events:ai:prompt_cache_tokens"
+local AI_REQUEST_COUNT_KEY             = "events:ai:requests"
 
 
 local ROUTE_CACHE_HITS_KEY = "route_cache_hits"
@@ -248,7 +249,7 @@
 
 
 local function incr_counter(key, hit)
-  if not hit then
+  if not hit then
     hit = 1
   end
 
@@ -539,6 +540,11 @@ return {
       incr_counter(AI_RESPONSE_TOKENS_COUNT_KEY, llm_response_tokens_count)
     end
 
+    local llm_response_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
+    if llm_response_cache_tokens_count then
+      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_response_cache_tokens_count)
+    end
+
     local suffix = get_current_suffix(ctx)
     if suffix then
       incr_counter(count_key .. ":" .. suffix)
diff --git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
index 99d4026769b..34e47a00bd3 100644
--- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -51,6 +51,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 25,
       completion_tokens = 12,
       total_tokens = 37,
diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
index bdac132d114..587785123e6 100644
--- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
@@ -18,6 +18,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 18,
       completion_tokens = 13, -- this was from estimation
       total_tokens = 31,
diff --git a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua
index 93273515bc0..6fe8e6d54de 100644
--- a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua
@@ -43,6 +43,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1,
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 2,
       completion_tokens = 11,
       total_tokens = 13,
diff --git a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
index 6f2a981cf86..9c98f3f5ced 100644
--- a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
+++ b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
@@ -92,6 +92,7 @@ local _EXPECTED_CHAT_STATS_GEMINI = {
     },
     usage = {
       prompt_tokens = 2,
+      prompt_cache_tokens = 0,
       completion_tokens = 11,
       total_tokens = 13,
       time_per_token = 1,

From 7dc88297663c6c4a6b3dc7030b1a645db5a9810b Mon Sep 17 00:00:00 2001
From: git-hulk
Date: Tue, 9 Dec 2025 15:15:40 +0800
Subject: [PATCH 2/2] Improve the variable naming style

---
 kong/reports.lua | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kong/reports.lua b/kong/reports.lua
index 98e93ecef98..df03494359b 100644
--- a/kong/reports.lua
+++ b/kong/reports.lua
@@ -540,9 +540,9 @@ return {
       incr_counter(AI_RESPONSE_TOKENS_COUNT_KEY, llm_response_tokens_count)
     end
 
-    local llm_response_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
-    if llm_response_cache_tokens_count then
-      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_response_cache_tokens_count)
+    local llm_prompt_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
+    if llm_prompt_cache_tokens_count then
+      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_prompt_cache_tokens_count)
     end
 
     local suffix = get_current_suffix(ctx)