25 changes: 21 additions & 4 deletions extras/chat_template_examples/chat_template_gpt_oss.jinja
@@ -351,11 +351,28 @@
{{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
{%- endif %}
{{- "<|start|>functions." + last_tool_call.name }}
{#- Original: {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} #}
{#- Actual version that works, does not escape and allows non-json: #}
{{- " to=assistant<|channel|>commentary<|message|>" + message.content + "<|end|>" -}}
{#- When content is a plain string we render it directly. #}
{#- When content is an array (OpenAI multipart format) we join the text items with newlines, #}
{#- matching what the model was trained on. JSON-serialising the array would add noise. #}
{%- if message.content is string -%}
{{- " to=assistant<|channel|>commentary<|message|>" + message.content + "<|end|>" -}}
{%- else -%}
{%- set ns = namespace(parts=[]) -%}
{%- for item in message.content if item.type == "text" -%}
{%- set ns.parts = ns.parts + [item.text] -%}
{%- endfor -%}
{{- " to=assistant<|channel|>commentary<|message|>" + ns.parts | join("\n") + "<|end|>" -}}
{%- endif -%}
{%- elif message.role == 'user' -%}
{{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
{%- if message.content is string -%}
{{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
{%- else -%}
{%- set ns = namespace(parts=[]) -%}
{%- for item in message.content if item.type == "text" -%}
{%- set ns.parts = ns.parts + [item.text] -%}
{%- endfor -%}
{{- "<|start|>user<|message|>" + ns.parts | join("\n") + "<|end|>" }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
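As a sketch of the intended behaviour (hypothetical payload, not part of this diff): a multipart user message such as

{"role": "user", "content": [
  {"type": "text", "text": "Summarise the following."},
  {"type": "text", "text": "OVMS now accepts multipart content."}
]}

now renders as

<|start|>user<|message|>Summarise the following.
OVMS now accepts multipart content.<|end|>

which the previous template, assuming content was always a plain string, could not render.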

1 change: 1 addition & 0 deletions src/llm/apis/openai_api_handler.hpp
@@ -146,6 +146,7 @@ class OpenAIApiHandler {
std::optional<int> getNumReturnSequences() const;
StreamOptions getStreamOptions() const;
const std::string& getProcessedJson() const;
// Returns the flat ordered list of image tensors (one per image_url item, in document order).
const ImageHistory& getImageHistory() const;
ov::genai::ChatHistory& getChatHistory();
std::optional<int> getMaxTokens() const;
42 changes: 25 additions & 17 deletions src/llm/apis/openai_completions.cpp
@@ -196,13 +196,14 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::stri
continue;
}
if (memberName == "content" && member->value.IsArray()) {
// Adjust content field format when it is passed as an array of objects (typically with images)
// Content passed as an array of objects (OpenAI multipart format).
// We preserve the array structure so chat templates can decide how to render it.
// image_url items are translated to {type:image} so that VLM chat templates
// (which use the OpenVINO GenAI MULTIPART_CONTENT convention) see them correctly.
// The corresponding decoded tensors are appended in document order to imageHistory.
if (member->value.GetArray().Size() == 0) {
return absl::InvalidArgumentError("Invalid message structure - content array is empty");
}
jsonChanged = true;
Value contentText(rapidjson::kStringType);
contentText.SetString("", doc.GetAllocator());
for (auto& v : member->value.GetArray()) {
if (!v.IsObject()) {
return absl::InvalidArgumentError("Invalid message structure - content array should contain objects");
@@ -211,14 +212,12 @@
if (!entry.HasMember("type") || !entry["type"].IsString()) {
return absl::InvalidArgumentError("Invalid message structure - content object type missing");
}
auto entryType = entry["type"].GetString();
if (entryType == std::string("text")) {
std::string entryType = entry["type"].GetString();
if (entryType == "text") {
if (!entry.HasMember("text") || !entry["text"].IsString()) {
return absl::InvalidArgumentError("Invalid message structure - content text missing");
}
contentText = entry["text"];
continue;
} else if (entryType == std::string("image_url")) {
} else if (entryType == "image_url") {
if (!entry.HasMember("image_url") || !entry["image_url"].IsObject()) {
return absl::InvalidArgumentError("Invalid message structure - content image_url missing");
}
@@ -231,18 +230,27 @@
if (!tensorResult.ok()) {
return tensorResult.status();
}
request.imageHistory.push_back({i, tensorResult.value()});
// Store tensor in flat image list (document order = template rendering order)
request.imageHistory.push_back(std::move(tensorResult.value()));
// Translate image_url item to {type:image} so VLM chat templates
// (which use GenAI MULTIPART_CONTENT convention) see the image in context.
while (v.MemberBegin() != v.MemberEnd()) {
v.RemoveMember(v.MemberBegin());
}
v.AddMember(rapidjson::Value("type", doc.GetAllocator()),
rapidjson::Value("image", doc.GetAllocator()),
doc.GetAllocator());
jsonChanged = true;
} else {
return absl::InvalidArgumentError("Unsupported content type");
}
}
// Pulling out text from nested structure to the "content" field for text and replace whole "content" value for image data
// with empty string, since images are stored separately in request.images
member->value = contentText;
// Add new field to the last message in history if content is text
if (member->value.IsString()) {
request.chatHistory.last()[member->name.GetString()] = member->value.GetString();
}
// Preserve the array (with any image_url translated to {type:image}) in chatHistory.
// For the Python Jinja path, processedJson is only written when jsonChanged is true
// (i.e. when image_url items were translated or tool_call arguments were injected).
// Otherwise the template falls back to payload.body and sees the original OpenAI
// format, which is equally valid; the template decides how to render content arrays.
request.chatHistory.last()[memberName] = rapidJsonValueToJsonContainer(member->value);
}
}
auto lastMessage = request.chatHistory.last();
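To illustrate the in-place translation this hunk performs (payload hypothetical): an incoming message like

{"role": "user", "content": [
  {"type": "text", "text": "What is in this picture?"},
  {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
]}

is rewritten to

{"role": "user", "content": [
  {"type": "text", "text": "What is in this picture?"},
  {"type": "image"}
]}

while the decoded tensor for the image is appended to request.imageHistory. Text-only requests are left untouched (jsonChanged stays false), so the template falls back to the original request body.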
6 changes: 5 additions & 1 deletion src/llm/apis/openai_request.hpp
@@ -32,7 +32,11 @@
#include "tool_schema_wrapper.hpp"

namespace ovms {
using ImageHistory = std::vector<std::pair<size_t, ov::Tensor>>;
// Flat ordered list of image tensors extracted from content arrays.
// Order matches the sequence of image_url items across all messages,
// which corresponds to the order of {"type":"image"} items in chatHistory
// after image_url → image translation.
using ImageHistory = std::vector<ov::Tensor>;

struct StreamOptions {
bool includeUsage = false;
32 changes: 28 additions & 4 deletions src/llm/apis/openai_responses.cpp
@@ -120,7 +120,13 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional<std::string> allow
return absl::InvalidArgumentError("Invalid message structure - content array is empty");
}

std::string contentText = "";
// Translate Responses API content array into the canonical multipart format
// used by chatHistory and VLM chat templates:
// input_text → {"type":"text", "text": "<value>"}
// input_image → {"type":"image"} (tensor appended to imageHistory in order)
// This mirrors the Chat Completions image_url → image translation so that
// both VLM chat templates (GenAI MULTIPART_CONTENT) and Python Jinja2 templates
// receive a uniform representation.
for (auto& contentItem : contentIt->value.GetArray()) {
if (!contentItem.IsObject()) {
return absl::InvalidArgumentError("input content items must be objects");
@@ -137,7 +143,17 @@
if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) {
return absl::InvalidArgumentError("input_text requires a valid text field");
}
contentText = textIt->value.GetString();
// Normalise to {"type":"text","text":"..."} in-place.
std::string textValue = textIt->value.GetString();
while (contentItem.MemberBegin() != contentItem.MemberEnd()) {
contentItem.RemoveMember(contentItem.MemberBegin());
}
contentItem.AddMember(rapidjson::Value("type", doc.GetAllocator()),
rapidjson::Value("text", doc.GetAllocator()),
doc.GetAllocator());
contentItem.AddMember(rapidjson::Value("text", doc.GetAllocator()),
rapidjson::Value(textValue.c_str(), doc.GetAllocator()),
doc.GetAllocator());
} else if (type == "input_image") {
std::string imageUrl;
auto imageUrlIt = contentObj.FindMember("image_url");
@@ -161,13 +177,21 @@
if (!tensorResult.ok()) {
return tensorResult.status();
}
request.imageHistory.push_back({i, tensorResult.value()});
request.imageHistory.push_back(std::move(tensorResult.value()));
// Translate to {"type":"image"} in-place so VLM chat templates see
// the image at the correct position in the content array.
while (contentItem.MemberBegin() != contentItem.MemberEnd()) {
contentItem.RemoveMember(contentItem.MemberBegin());
}
contentItem.AddMember(rapidjson::Value("type", doc.GetAllocator()),
rapidjson::Value("image", doc.GetAllocator()),
doc.GetAllocator());
} else {
return absl::InvalidArgumentError("Unsupported content type. Supported types are input_text and input_image.");
}
}

request.chatHistory.last()["content"] = contentText;
request.chatHistory.last()["content"] = rapidJsonValueToJsonContainer(contentIt->value);
}
} else {
return absl::InvalidArgumentError("input is not a string or array");
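For comparison (hypothetical input): a Responses API content array

[{"type": "input_text", "text": "Describe this image."},
 {"type": "input_image", "image_url": "https://example.com/cat.png"}]

is normalised in place to the same canonical multipart form used by Chat Completions:

[{"type": "text", "text": "Describe this image."},
 {"type": "image"}]

with the decoded tensor appended to request.imageHistory in document order.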
40 changes: 25 additions & 15 deletions src/llm/visual_language_model/continuous_batching/servable.cpp
@@ -19,7 +19,6 @@
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include "../../../logging.hpp"
@@ -74,26 +73,36 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer

for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
const auto& contentField = message["content"];
if (contentField.is_array()) {
for (size_t j = 0; j < contentField.size(); j++) {
const auto& item = contentField[j];
if (item["type"].as_string().value_or("") == "text" &&
item["text"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}
} else if (contentField.as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}

const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
// imageHistory is a flat ordered list of tensors matching the {type:image} items in
// chatHistory. Pass them directly to add_request; the chat template applied below will
// emit the model-specific image tokens at the correct positions.
vlmExecutionContext->inputImages = vlmExecutionContext->apiHandler->getImageHistory();

for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
#if (PYTHON_DISABLE == 0)
bool success;
if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->apiHandler->getProcessedJson(), vlmExecutionContext->inputText);
} else {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->payload.body, vlmExecutionContext->inputText);
}

if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText);
}
#else
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
Expand All @@ -106,6 +115,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
}
const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs);
#endif
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
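A minimal standalone sketch (stand-in types, not project code) of the ordering invariant the two servables now rely on: the n-th {"type":"image"} item across the flattened chat history corresponds to the n-th tensor in ImageHistory, so no per-turn tag bookkeeping is needed.

#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-ins for the real chat-history types.
struct ContentItem { std::string type; std::string text; };
struct Message { std::string role; std::vector<ContentItem> content; };

// Counts {type:image} items across all messages. Because both the content
// arrays and the flat ImageHistory are filled in document order, this count
// must equal imageHistory.size() for a well-formed request.
size_t countImageItems(const std::vector<Message>& history) {
    size_t count = 0;
    for (const auto& message : history)
        for (const auto& item : message.content)
            if (item.type == "image")
                ++count;
    return count;
}

int main() {
    std::vector<Message> history{
        {"user", {{"text", "What is in this picture?"}, {"image", ""}}},
        {"assistant", {{"text", "A cat."}}},
        {"user", {{"text", "And this one?"}, {"image", ""}}}};
    size_t imageTensorCount = 2;  // imageHistory.size() in the real code
    assert(countImageItems(history) == imageTensorCount);
    return 0;
}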
39 changes: 25 additions & 14 deletions src/llm/visual_language_model/legacy/servable.cpp
@@ -17,7 +17,6 @@
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include "../../../logging.hpp"
@@ -256,25 +255,36 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge

for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
const auto& contentField = message["content"];
if (contentField.is_array()) {
for (size_t j = 0; j < contentField.size(); j++) {
const auto& item = contentField[j];
if (item["type"].as_string().value_or("") == "text" &&
item["text"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}
} else if (contentField.as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}

const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
// imageHistory is a flat ordered list of tensors matching the {type:image} items in
// chatHistory. Pass them directly; the chat template applied below will emit the
// model-specific image tokens at the correct positions.
vlmExecutionContext->inputImages = vlmExecutionContext->apiHandler->getImageHistory();

#if (PYTHON_DISABLE == 0)
bool success;
if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->apiHandler->getProcessedJson(), vlmExecutionContext->inputText);
} else {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->payload.body, vlmExecutionContext->inputText);
}
for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText);
}

#else
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
Expand All @@ -287,6 +297,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
}
const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs);
#endif
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}