kong/llm/adapters/gemini.lua (2 additions, 0 deletions)
@@ -50,6 +50,8 @@ function _GeminiAdapter:extract_metadata(response_body)
   return {
     prompt_tokens = response_body.usageMetadata.promptTokenCount or 0,
     completion_tokens = response_body.usageMetadata.candidatesTokenCount or 0,
+    total_tokens = response_body.usageMetadata.totalTokenCount or 0,
+    prompt_cache_tokens = response_body.usageMetadata.cachedContentTokenCount or 0,
   }
 end

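For reference, a minimal sketch of what extract_metadata now returns, assuming an illustrative response body (field names follow Google's UsageMetadata schema; the numbers are invented):

  -- hypothetical Gemini response body
  local response_body = {
    usageMetadata = {
      promptTokenCount = 25,
      candidatesTokenCount = 12,
      totalTokenCount = 37,
      cachedContentTokenCount = 8, -- present only when cached content served part of the prompt
    },
  }

  -- extract_metadata(response_body) would yield:
  -- { prompt_tokens = 25, completion_tokens = 12, total_tokens = 37, prompt_cache_tokens = 8 }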
kong/llm/drivers/gemini.lua (20 additions, 13 deletions)
@@ -81,6 +81,21 @@ local function has_finish_reason(event)
     or nil
 end
 
+-- Extract usage metadata from a Gemini response.
+-- Gemini usage metadata reference: https://ai.google.dev/api/generate-content#UsageMetadata
+local function extract_usage(usageMetadata)
+  if not usageMetadata then
+    return {}
+  end
+
+  return {
+    prompt_tokens = usageMetadata.promptTokenCount or 0,
+    completion_tokens = usageMetadata.candidatesTokenCount or 0,
+    total_tokens = usageMetadata.totalTokenCount or 0,
+    prompt_cache_tokens = usageMetadata.cachedContentTokenCount or 0,
+  }
+end
+
 local function handle_stream_event(event_t, model_info, route_type)
   -- discard empty frames, it should either be a random new line, or comment
   if (not event_t.data) or (#event_t.data < 1) then
@@ -100,10 +115,8 @@ local function handle_stream_event(event_t, model_info, route_type)
   local finish_reason = has_finish_reason(event) -- may be nil
 
   if is_response_content(event) then
-    local metadata = {}
-    metadata.finish_reason = finish_reason
-    metadata.completion_tokens = event.usageMetadata and event.usageMetadata.candidatesTokenCount or 0
-    metadata.prompt_tokens = event.usageMetadata and event.usageMetadata.promptTokenCount or 0
+    local metadata = extract_usage(event.usageMetadata)
+    metadata.finish_reason = finish_reason
 
     local new_event = {
       model = model_info.name,
@@ -122,10 +135,8 @@ local function handle_stream_event(event_t, model_info, route_type)
     return cjson.encode(new_event), nil, metadata
 
   elseif is_tool_content(event) then
-    local metadata = {}
-    metadata.finish_reason = finish_reason
-    metadata.completion_tokens = event.usageMetadata and event.usageMetadata.candidatesTokenCount or 0
-    metadata.prompt_tokens = event.usageMetadata and event.usageMetadata.promptTokenCount or 0
+    local metadata = extract_usage(event.usageMetadata)
+    metadata.finish_reason = finish_reason
 
     if event.candidates and #event.candidates > 0 then
       local new_event = {
@@ -445,11 +456,7 @@ local function from_gemini_chat_openai(response, model_info, route_type)

     -- process analytics
     if response.usageMetadata then
-      messages.usage = {
-        prompt_tokens = response.usageMetadata.promptTokenCount,
-        completion_tokens = response.usageMetadata.candidatesTokenCount,
-        total_tokens = response.usageMetadata.totalTokenCount,
-      }
+      messages.usage = extract_usage(response.usageMetadata)
     end
 
   else -- probably a server fault or other unexpected response
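To make the refactor concrete, here is a sketch of extract_usage on a hypothetical final streaming chunk (values invented; real chunks carry more fields):

  local event = {
    candidates = { { finishReason = "STOP" } },
    usageMetadata = { promptTokenCount = 2, candidatesTokenCount = 11, totalTokenCount = 13 },
  }

  local metadata = extract_usage(event.usageMetadata)
  -- metadata = { prompt_tokens = 2, completion_tokens = 11, total_tokens = 13, prompt_cache_tokens = 0 }
  metadata.finish_reason = has_finish_reason(event) -- "STOP"

Note that when a chunk carries no usageMetadata at all, extract_usage returns an empty table, so no token fields are set on that event's metadata.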
kong/llm/drivers/shared.lua (5 additions, 0 deletions)
@@ -43,6 +43,7 @@ local log_entry_keys = {
   -- usage keys
   PROMPT_TOKENS = "prompt_tokens",
   COMPLETION_TOKENS = "completion_tokens",
+  PROMPT_CACHE_TOKENS = "prompt_cache_tokens",
   TOTAL_TOKENS = "total_tokens",
   TIME_PER_TOKEN = "time_per_token",
   COST = "cost",
@@ -844,9 +845,13 @@ function _M.post_request(conf, response_object)
     if response_object.usage.total_tokens then
       request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens
     end
+    if response_object.usage.prompt_cache_tokens then
+      request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.PROMPT_CACHE_TOKENS] = response_object.usage.prompt_cache_tokens
+    end
 
     ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", response_object.usage.prompt_tokens)
     ai_plugin_o11y.metrics_set("llm_completion_tokens_count", response_object.usage.completion_tokens)
+    ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", response_object.usage.prompt_cache_tokens)

     if response_object.usage.prompt_tokens and response_object.usage.completion_tokens and
        conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
kong/llm/plugin/observability.lua (1 addition, 0 deletions)
@@ -12,6 +12,7 @@ local metrics_schema = {
   llm_prompt_tokens_count = true,
   llm_completion_tokens_count = true,
   llm_total_tokens_count = true,
+  llm_prompt_cache_tokens_count = true,
   llm_usage_cost = true,
 }

kong/llm/plugin/shared-filters/parse-json-response.lua (4 additions, 0 deletions)
@@ -39,6 +39,7 @@ function _M:run(_)
     else
       ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", metadata.prompt_tokens)
       ai_plugin_o11y.metrics_set("llm_completion_tokens_count", metadata.completion_tokens)
+      ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", metadata.prompt_cache_tokens)
     end
 
   else
@@ -55,6 +56,9 @@
     if t and t.usage and t.usage.completion_tokens then
       ai_plugin_o11y.metrics_set("llm_completion_tokens_count", t.usage.completion_tokens)
     end
+    if t and t.usage and t.usage.prompt_tokens_details then
+      ai_plugin_o11y.metrics_set("llm_prompt_cache_tokens_count", t.usage.prompt_tokens_details.cached_tokens or 0)
+    end
   end
 end
 
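The new streaming branch assumes an OpenAI-style usage object on the final parsed chunk, per OpenAI's documented prompt_tokens_details field (values invented):

  -- tail of a parsed OpenAI-compatible response
  local t = {
    usage = {
      prompt_tokens = 25,
      completion_tokens = 12,
      total_tokens = 37,
      prompt_tokens_details = { cached_tokens = 8 },
    },
  }
  -- the filter would then record llm_prompt_cache_tokens_count = 8;
  -- upstreams that omit prompt_tokens_details leave the metric untouched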
kong/llm/plugin/shared-filters/serialize-analytics.lua (1 addition, 0 deletions)
@@ -75,6 +75,7 @@ function _M:run(conf)
     prompt_tokens = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count"),
     completion_tokens = ai_plugin_o11y.metrics_get("llm_completion_tokens_count"),
     total_tokens = ai_plugin_o11y.metrics_get("llm_total_tokens_count"),
+    prompt_cache_tokens = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count"),
     cost = ai_plugin_o11y.metrics_get("llm_usage_cost"),
   }
   kong.log.set_serialize_value(string.format("ai.%s.usage", ai_plugin_o11y.NAMESPACE), usage)
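With this change, the usage block that serialize-analytics writes into the log entry picks up the new counter. Sketched as a Lua table (values invented; the exact namespace depends on the plugin):

  -- serialized under "ai.<namespace>.usage"
  local usage = {
    prompt_tokens = 25,
    completion_tokens = 12,
    total_tokens = 37,
    prompt_cache_tokens = 8,
    cost = 0.00037,
  }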
(file name not shown)
@@ -16,6 +16,7 @@ local FILTER_OUTPUT_SCHEMA = {
   -- TODO: refactor this so they don't need to be duplicated
   llm_prompt_tokens_count = "number",
   llm_completion_tokens_count = "number",
+  llm_prompt_cache_tokens_count = "number",
   llm_usage_cost = "number",
 }
 
@@ -93,6 +94,7 @@ function _M:run(conf)
set_ctx("model", conf.llm.model)
set_ctx("llm_prompt_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_tokens_count") or 0)
set_ctx("llm_completion_tokens_count", ai_plugin_o11y.metrics_get("llm_completion_tokens_count") or 0)
set_ctx("llm_prompt_cache_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count") or 0)
set_ctx("llm_usage_cost", ai_plugin_o11y.metrics_get("llm_usage_cost") or 0)

-- set the body for later plugins
(file name not shown)
@@ -17,6 +17,7 @@ local FILTER_OUTPUT_SCHEMA = {
   -- TODO: refactor this so they don't need to be duplicated
   llm_prompt_tokens_count = "number",
   llm_completion_tokens_count = "number",
+  llm_prompt_cache_tokens_count = "number",
   llm_usage_cost = "number",
 }
 
@@ -190,6 +191,7 @@ function _M:run(conf)
set_ctx("model", conf.llm.model)
set_ctx("llm_prompt_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_tokens_count") or 0)
set_ctx("llm_completion_tokens_count", ai_plugin_o11y.metrics_get("llm_completion_tokens_count") or 0)
set_ctx("llm_prompt_cache_tokens_count", ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count") or 0)
set_ctx("llm_usage_cost", ai_plugin_o11y.metrics_get("llm_usage_cost") or 0)
return kong.response.exit(status, body, headers)
end
kong/plugins/prometheus/exporter.lua (4 additions, 0 deletions)
@@ -435,6 +435,10 @@ local function log(message, serialized)
         labels_table_ai_llm_tokens[7] = "total_tokens"
         metrics.ai_llm_tokens:inc(ai_metrics.usage.total_tokens, labels_table_ai_llm_tokens)
       end
+      if ai_metrics.usage and ai_metrics.usage.prompt_cache_tokens and ai_metrics.usage.prompt_cache_tokens > 0 then
+        labels_table_ai_llm_tokens[7] = "prompt_cache_tokens"
+        metrics.ai_llm_tokens:inc(ai_metrics.usage.prompt_cache_tokens, labels_table_ai_llm_tokens)
+      end
     end
   end
 end
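On the Prometheus side, scrapes gain a fourth token_type series next to the existing ones, and only when the counter is positive, since the new branch guards on prompt_cache_tokens > 0. A rough sketch of the exposition output (the exported metric name and the elided labels are assumptions; only the token_type values come from this diff):

  kong_ai_llm_tokens_total{...,token_type="prompt_tokens"} 25
  kong_ai_llm_tokens_total{...,token_type="completion_tokens"} 12
  kong_ai_llm_tokens_total{...,token_type="total_tokens"} 37
  kong_ai_llm_tokens_total{...,token_type="prompt_cache_tokens"} 8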
kong/reports.lua (10 additions, 4 deletions)
@@ -55,9 +55,10 @@ local GO_PLUGINS_REQUEST_COUNT_KEY = "events:requests:go_plugins"
 local WASM_REQUEST_COUNT_KEY = "events:requests:wasm"
 
 
-local AI_RESPONSE_TOKENS_COUNT_KEY = "events:ai:response_tokens"
-local AI_PROMPT_TOKENS_COUNT_KEY   = "events:ai:prompt_tokens"
-local AI_REQUEST_COUNT_KEY         = "events:ai:requests"
+local AI_RESPONSE_TOKENS_COUNT_KEY     = "events:ai:response_tokens"
+local AI_PROMPT_TOKENS_COUNT_KEY       = "events:ai:prompt_tokens"
+local AI_PROMPT_CACHE_TOKENS_COUNT_KEY = "events:ai:prompt_cache_tokens"
+local AI_REQUEST_COUNT_KEY             = "events:ai:requests"
 
 
 local ROUTE_CACHE_HITS_KEY = "route_cache_hits"
@@ -248,7 +249,7 @@ end


 local function incr_counter(key, hit)
-  if not hit then 
+  if not hit then
     hit = 1
   end
 
@@ -539,6 +540,11 @@ return {
       incr_counter(AI_RESPONSE_TOKENS_COUNT_KEY, llm_response_tokens_count)
     end
 
+    local llm_prompt_cache_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_cache_tokens_count")
+    if llm_prompt_cache_tokens_count then
+      incr_counter(AI_PROMPT_CACHE_TOKENS_COUNT_KEY, llm_prompt_cache_tokens_count)
+    end
+
     local suffix = get_current_suffix(ctx)
     if suffix then
       incr_counter(count_key .. ":" .. suffix)
spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua (1 addition, 0 deletions)
@@ -51,6 +51,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 25,
       completion_tokens = 12,
       total_tokens = 37,
(file name not shown)
@@ -18,6 +18,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 18,
       completion_tokens = 13, -- this was from estimation
       total_tokens = 31,
spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua (1 addition, 0 deletions)
@@ -43,6 +43,7 @@ local _EXPECTED_CHAT_STATS = {
       llm_latency = 1,
     },
     usage = {
+      prompt_cache_tokens = 0,
       prompt_tokens = 2,
       completion_tokens = 11,
       total_tokens = 13,
(file name not shown)
@@ -92,6 +92,7 @@ local _EXPECTED_CHAT_STATS_GEMINI = {
     },
     usage = {
       prompt_tokens = 2,
+      prompt_cache_tokens = 0,
       completion_tokens = 11,
       total_tokens = 13,
       time_per_token = 1,